diff --git a/Cargo.lock b/Cargo.lock
index cdc28f73..40b310a8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5788,8 +5788,10 @@ version = "0.7.0"
dependencies = [
"actix-multipart",
"actix-web",
+ "base64 0.22.1",
"embed_anything",
"futures-util",
+ "image",
"serde",
"tempfile",
"tokio",
diff --git a/README.md b/README.md
index c2d34094..79768c6c 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,7 @@ EmbedAnything is a minimalist, yet highly performant, modular, lightning-fast, l
- **AWS S3 Bucket:** : Directly import AWS S3 bucket files.
- **Prebult Docker Image** : Just pull it: starlightsearch/embedanything-server
- **SearchAgent** : Example of how you can use index for Searchr1 reasoning.
+- **Video guide** : Quick start for frame sampling: https://embed-anything.com/guides/video/
## 💡What is Vector Streaming
@@ -478,7 +479,7 @@ We’re excited to share that we've expanded our platform to support multiple mo
- [x] Images
-- [ ] Videos
+- [x] Videos (frame sampling; enable the `video` feature)
- [ ] Graph
@@ -498,7 +499,7 @@ We now support both candle and Onnx backend
We had multimodality from day one for our infrastructure. We have already included it for websites, images and audios but we want to expand it further to.
➡️ Graph embedding -- build deepwalks embeddings depth first and word to vec
-➡️ Video Embedding
+➡️ Video embedding improvements (temporal + audio)
➡️ Yolo Clip
diff --git a/docs/guides/video.md b/docs/guides/video.md
new file mode 100644
index 00000000..7059fab0
--- /dev/null
+++ b/docs/guides/video.md
@@ -0,0 +1,86 @@
+# Video Embeddings (Frame Sampling)
+
+EmbedAnything supports video by sampling frames and embedding them with a vision model
+(CLIP/SigLIP). This is opt-in via the `video` feature flag and requires the `ffmpeg`
+CLI to be available on your system. If `ffmpeg` is not on `PATH`, set `FFMPEG_BIN`
+to the full path of the executable.
+
+## Recommended Config
+
+`VideoEmbedConfig` controls how frames are sampled:
+
+- `frame_step`: sample every Nth frame. Default `30`.
+- `max_frames`: maximum frames per video. Default `300`.
+- `batch_size`: frames per embedding batch. Default `32`.
+
+Suggested starting point:
+
+```python
+from embed_anything import VideoEmbedConfig
+
+config = VideoEmbedConfig(frame_step=30, max_frames=300, batch_size=16)
+```
+
+## Python Usage
+
+```python
+import embed_anything
+from embed_anything import VideoEmbedConfig
+
+model = embed_anything.EmbeddingModel.from_pretrained_hf(
+ model_id="openai/clip-vit-base-patch16"
+)
+
+config = VideoEmbedConfig(frame_step=30, max_frames=200, batch_size=16)
+
+data = embed_anything.embed_video_file("path/to/video.mp4", embedder=model, config=config)
+```
+
+## Build with Video Support
+
+You must enable the `video` feature and have the `ffmpeg` CLI installed.
+
+### macOS
+
+```bash
+brew install ffmpeg
+cargo build --features video
+# Python (maturin)
+maturin develop --features "extension-module,video"
+```
+
+### Linux (Debian/Ubuntu)
+
+```bash
+sudo apt-get update
+sudo apt-get install -y ffmpeg
+cargo build --features video
+# Python (maturin)
+maturin develop --features "extension-module,video"
+```
+
+### Windows (prebuilt FFmpeg)
+
+Install a prebuilt FFmpeg and point `FFMPEG_BIN` at the executable:
+
+1. Download a static build from https://www.gyan.dev/ffmpeg/builds/
+2. Extract it and set:
+
+```powershell
+$env:FFMPEG_BIN = "C:\path\to\ffmpeg.exe"
+```
+
+Then build:
+
+```powershell
+cargo build --features video
+# Python (maturin)
+maturin develop --features "extension-module,video"
+```
+
+## Output Metadata
+
+Each embedding includes:
+
+- `video_path`: the source video file
+- `frame_index`: the sampled frame index (0-based)
diff --git a/docs/index.md b/docs/index.md
index 9b828a13..1dc73cd1 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -74,7 +74,7 @@ EmbedAnything is a minimalist, yet highly performant, modular, lightning-fast, l
- **Candle Backend** : Supports BERT, Jina, ColPali, Splade, ModernBERT, Reranker, Qwen
- **ONNX Backend**: Supports BERT, Jina, ColPali, ColBERT Splade, Reranker, ModernBERT, Qwen
- **Cloud Embedding Models:**: Supports OpenAI, Cohere, and Gemini.
-- **MultiModality** : Works with text sources like PDFs, txt, md, Images JPG and Audio, .WAV
+- **MultiModality** : Works with text sources like PDFs, txt, md, images, audio (.WAV), and videos (frame sampling; enable the `video` feature)
- **GPU support** : Hardware acceleration on GPU as well.
- **Chunking** : In-built chunking methods like semantic, late-chunking
- **Vector Streaming:** Separate file processing, Indexing and Inferencing on different threads, reduces latency.
@@ -339,7 +339,7 @@ We’re excited to share that we've expanded our platform to support multiple mo
- [x] Images
-- [ ] Videos
+- [x] Videos (frame sampling; enable the `video` feature)
- [ ] Graph
@@ -359,7 +359,7 @@ We now support both candle and Onnx backend
We had multimodality from day one for our infrastructure. We have already included it for websites, images and audios but we want to expand it further to.
➡️ Graph embedding -- build deepwalks embeddings depth first and word to vec
-➡️ Video Embedding
+➡️ Video embedding improvements (temporal + audio)
➡️ Yolo Clip
diff --git a/docs/roadmap/roadmap.md b/docs/roadmap/roadmap.md
index d3a76d6b..11beae2a 100644
--- a/docs/roadmap/roadmap.md
+++ b/docs/roadmap/roadmap.md
@@ -17,7 +17,7 @@ We’re excited to share that we've expanded our platform to support multiple mo
- [x] Images
-- [ ] Videos
+- [x] Videos (frame sampling; enable the `video` feature)
- [ ] Graph
@@ -58,7 +58,7 @@ To address this, we’re excited to announce that we’re introducing Candle-ONN
We had multimodality from day one for our infrastructure. We have already included it for websites, images and audios but we want to expand it further to.
☑️Graph embedding -- build deepwalks embeddings depth first and word to vec
-☑️Video Embedding
+☑️Video embedding improvements (temporal + audio)
☑️ Yolo Clip
diff --git a/examples/video.py b/examples/video.py
new file mode 100644
index 00000000..cab27670
--- /dev/null
+++ b/examples/video.py
@@ -0,0 +1,37 @@
+import os
+from pathlib import Path
+
+import embed_anything
+from embed_anything import EmbedData, VideoEmbedConfig
+
+# Vision model (CLIP/SigLIP) used to embed the sampled frames.
+clip_model = embed_anything.EmbeddingModel.from_pretrained_hf(
+    model_id="openai/clip-vit-base-patch16"
+)
+
+# frame_step=30 keeps every 30th frame (~1 fps for 30 fps videos);
+# max_frames=200 caps how many frames are taken from one video.
+sampling = VideoEmbedConfig(frame_step=30, max_frames=200, batch_size=16)
+
+# The input video is taken from VIDEO_PATH; stop with a hint if it does not exist.
+video_path = os.environ.get("VIDEO_PATH", "path/to/video.mp4")
+if not Path(video_path).exists():
+    raise FileNotFoundError(
+        f"Video not found: {video_path}. Set VIDEO_PATH env var to a valid file."
+    )
+
+# Single-file mode
+frame_embeddings: list[EmbedData] = embed_anything.embed_video_file(
+    video_path, embedder=clip_model, config=sampling
+)
+print(f"Embedded {len(frame_embeddings)} frames from video.")
+
+# Directory mode runs only when VIDEO_DIR is set to a non-empty value.
+if video_dir := os.environ.get("VIDEO_DIR"):
+    dir_embeddings = embed_anything.embed_video_directory(
+        video_dir, embedder=clip_model, config=sampling
+    )
+    # The stub types the result as optional, so guard before calling len().
+    if dir_embeddings is not None:
+        print(f"Embedded {len(dir_embeddings)} total frames from directory.")
diff --git a/mkdocs.yml b/mkdocs.yml
index 9278f06e..e3118209 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -52,6 +52,7 @@ nav:
- Guides:
- guides/colpali.md
- guides/images.md
+ - guides/video.md
- guides/semantic.md
- guides/adapters.md
- guides/onnx_models.md
diff --git a/processors/Cargo.toml b/processors/Cargo.toml
index 397e655c..c23a7aa4 100644
--- a/processors/Cargo.toml
+++ b/processors/Cargo.toml
@@ -30,9 +30,11 @@ pdf2image = "0.1.3"
image = "0.25.6"
thiserror = "2.0.12"
tempfile = "3.19.1"
+# Video processing shells out to the external ffmpeg CLI, so the `video` feature adds no crates here.
[dev-dependencies]
tempdir = "0.3.7"
[features]
default = []
+video = []
\ No newline at end of file
diff --git a/processors/src/lib.rs b/processors/src/lib.rs
index 831daaea..8a91540a 100644
--- a/processors/src/lib.rs
+++ b/processors/src/lib.rs
@@ -15,3 +15,7 @@ pub mod html_processor;
/// This module contains the file processor for DOCX files.
pub mod docx_processor;
+
+/// This module contains the file processor for video files.
+#[cfg(feature = "video")]
+pub mod video_processor;
diff --git a/processors/src/video_processor.rs b/processors/src/video_processor.rs
new file mode 100644
index 00000000..6416a4ff
--- /dev/null
+++ b/processors/src/video_processor.rs
@@ -0,0 +1,145 @@
+#![cfg(feature = "video")]
+
+use anyhow::{anyhow, Result};
+use std::env;
+use std::path::{Path, PathBuf};
+use std::process::Command;
+use std::{fs, path};
+
+/// Image format in which extracted video frames are written to disk.
+#[derive(Debug, Clone, Copy)]
+pub enum VideoFrameFormat {
+    /// Frames are written as `.jpg` files.
+    Jpeg,
+    /// Frames are written as `.png` files.
+    Png,
+}
+
+impl VideoFrameFormat {
+    /// Returns the file extension (without the leading dot) for this format.
+    fn extension(self) -> &'static str {
+        match self {
+            Self::Jpeg => "jpg",
+            Self::Png => "png",
+        }
+    }
+}
+
+/// One frame extracted from a video and stored as an image file.
+#[derive(Debug, Clone)]
+pub struct VideoFrame {
+ /// Zero-based position of this frame in the sorted list of extracted frame
+ /// files (assigned with `enumerate`), not the frame number in the source video.
+ pub index: usize,
+ /// Path of the image file holding the frame.
+ pub path: PathBuf,
+}
+
+/// Extracts sampled frames from a video file by running the external `ffmpeg` CLI.
+///
+/// Build one with [`VideoProcessor::new`], optionally refine it with the
+/// `with_*` methods, then call [`VideoProcessor::extract_frames_to_dir`] or
+/// [`VideoProcessor::extract_frames_to_temp_dir`].
+#[derive(Debug, Clone)]
+pub struct VideoProcessor {
+    /// Keep every `frame_step`-th frame of the input; never 0.
+    frame_step: usize,
+    /// Cap on the number of frames ffmpeg writes (`-vframes`); `None` means no cap.
+    max_frames: Option<usize>,
+    /// Image format the frames are written in.
+    output_format: VideoFrameFormat,
+    /// Explicit ffmpeg executable; `None` defers to `FFMPEG_BIN`, then to `ffmpeg`.
+    ffmpeg_bin: Option<PathBuf>,
+}
+
+impl VideoProcessor {
+    /// File-name prefix of every frame this processor asks ffmpeg to write.
+    const FRAME_PREFIX: &'static str = "frame_";
+
+    /// Creates a processor that keeps every `frame_step`-th frame.
+    ///
+    /// A `frame_step` of 0 is raised to 1. The defaults are JPEG output, no
+    /// frame cap and no explicit ffmpeg path.
+    pub fn new(frame_step: usize) -> Self {
+        Self {
+            frame_step: frame_step.max(1),
+            max_frames: None,
+            output_format: VideoFrameFormat::Jpeg,
+            ffmpeg_bin: None,
+        }
+    }
+
+    /// Limits the number of frames ffmpeg writes to `max_frames`.
+    pub fn with_max_frames(mut self, max_frames: usize) -> Self {
+        self.max_frames = Some(max_frames);
+        self
+    }
+
+    /// Selects the image format of the extracted frames.
+    pub fn with_output_format(mut self, output_format: VideoFrameFormat) -> Self {
+        self.output_format = output_format;
+        self
+    }
+
+    /// Uses `ffmpeg_bin` as the ffmpeg executable instead of `FFMPEG_BIN` / `PATH`.
+    pub fn with_ffmpeg_bin<P: AsRef<Path>>(mut self, ffmpeg_bin: P) -> Self {
+        self.ffmpeg_bin = Some(ffmpeg_bin.as_ref().to_path_buf());
+        self
+    }
+
+    /// Picks the ffmpeg executable to run.
+    ///
+    /// Precedence: the path given to [`VideoProcessor::with_ffmpeg_bin`], then a
+    /// non-empty `FFMPEG_BIN` environment variable, then plain `ffmpeg`.
+    fn resolve_ffmpeg_bin(&self) -> Result<PathBuf> {
+        if let Some(bin) = &self.ffmpeg_bin {
+            return Ok(bin.clone());
+        }
+        // `var_os` keeps non-UTF-8 paths usable; an empty value is treated as
+        // unset so it cannot turn into an empty program name.
+        let from_env = env::var_os("FFMPEG_BIN")
+            .filter(|value| !value.is_empty())
+            .map(PathBuf::from);
+        Ok(from_env.unwrap_or_else(|| PathBuf::from("ffmpeg")))
+    }
+
+    /// Extracts every `frame_step`-th frame of `video_path` into `output_dir`.
+    ///
+    /// `output_dir` is created if missing. Frames are written as
+    /// `frame_%06d.<ext>` and returned sorted by path, with `index` counting
+    /// from 0 in that order. Only files carrying the frame prefix and the
+    /// configured extension are returned, so unrelated images in a
+    /// caller-supplied directory are ignored. Frames left behind by an earlier
+    /// extraction into the same directory still match, so use a fresh
+    /// directory per video.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the directory cannot be created or read, when ffmpeg cannot
+    /// be launched, when ffmpeg exits unsuccessfully, or when no frame file
+    /// was produced.
+    pub fn extract_frames_to_dir<P: AsRef<Path>, Q: AsRef<Path>>(
+        &self,
+        video_path: P,
+        output_dir: Q,
+    ) -> Result<Vec<VideoFrame>> {
+        let output_dir = output_dir.as_ref();
+        fs::create_dir_all(output_dir)?;
+
+        let ffmpeg_bin = self.resolve_ffmpeg_bin()?;
+        let extension = self.output_format.extension();
+        // A step of 0 would make `mod(n, 0)` meaningless, so clamp defensively.
+        let frame_step = self.frame_step.max(1);
+        // The comma is escaped because it otherwise separates filters in `-vf`.
+        let filter = format!("select=not(mod(n\\,{}))", frame_step);
+        let output_pattern =
+            output_dir.join(format!("{}%06d.{}", Self::FRAME_PREFIX, extension));
+
+        let mut command = Command::new(&ffmpeg_bin);
+        command
+            .arg("-hide_banner")
+            .arg("-loglevel")
+            .arg("error")
+            .arg("-i")
+            .arg(video_path.as_ref())
+            .arg("-vf")
+            .arg(filter)
+            // `vfr` stops ffmpeg from duplicating frames to fill the gaps left by `select`.
+            .arg("-vsync")
+            .arg("vfr");
+
+        if let Some(max_frames) = self.max_frames {
+            command.arg("-vframes").arg(max_frames.to_string());
+        }
+
+        // Keep the I/O error as the source, but say which binary was tried:
+        // a missing ffmpeg is the most likely failure here.
+        let status = command.arg(output_pattern).status().map_err(|err| {
+            anyhow::Error::new(err).context(format!(
+                "failed to launch ffmpeg at `{}`; install ffmpeg or set FFMPEG_BIN",
+                ffmpeg_bin.display()
+            ))
+        })?;
+        if !status.success() {
+            return Err(anyhow!("ffmpeg failed with exit code {:?}", status.code()));
+        }
+
+        let mut frame_paths = fs::read_dir(output_dir)?
+            .filter_map(|entry| entry.ok())
+            .filter(|entry| entry.file_type().map(|t| t.is_file()).unwrap_or(false))
+            .map(|entry| entry.path())
+            .filter(|path| {
+                let has_extension = path
+                    .extension()
+                    .and_then(|ext| ext.to_str())
+                    .map_or(false, |ext| ext.eq_ignore_ascii_case(extension));
+                let has_prefix = path
+                    .file_name()
+                    .and_then(|name| name.to_str())
+                    .map_or(false, |name| name.starts_with(Self::FRAME_PREFIX));
+                has_extension && has_prefix
+            })
+            .collect::<Vec<_>>();
+
+        // Zero-padded numbering makes lexicographic order the extraction order.
+        frame_paths.sort();
+
+        if frame_paths.is_empty() {
+            return Err(anyhow!("No frames extracted from video"));
+        }
+
+        let frames = frame_paths
+            .into_iter()
+            .enumerate()
+            .map(|(index, path)| VideoFrame { index, path })
+            .collect();
+
+        Ok(frames)
+    }
+
+    /// Extracts frames into a freshly created temporary directory.
+    ///
+    /// The returned `TempDir` owns the frame files: keep it alive for as long
+    /// as the frame paths are needed, because dropping it removes the directory.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the temporary directory cannot be created or when
+    /// [`VideoProcessor::extract_frames_to_dir`] fails.
+    pub fn extract_frames_to_temp_dir<P: AsRef<Path>>(
+        &self,
+        video_path: P,
+    ) -> Result<(tempfile::TempDir, Vec<VideoFrame>)> {
+        let temp_dir = tempfile::TempDir::new()?;
+        let frames = self.extract_frames_to_dir(video_path, temp_dir.path())?;
+        Ok((temp_dir, frames))
+    }
+}
diff --git a/python/Cargo.toml b/python/Cargo.toml
index e460a817..6f7e4a0c 100644
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -26,4 +26,4 @@ cudnn = ["embed_anything/cudnn"]
metal = ["embed_anything/metal"]
ort = ["embed_anything/ort"]
audio = ["embed_anything/audio"]
-aws = ["embed_anything/aws"]
\ No newline at end of file
+aws = ["embed_anything/aws"]
diff --git a/python/python/embed_anything/__init__.py b/python/python/embed_anything/__init__.py
index 29cd66e4..3267caac 100644
--- a/python/python/embed_anything/__init__.py
+++ b/python/python/embed_anything/__init__.py
@@ -2,7 +2,7 @@
This module provides functions and classes for embedding queries, files, and
directories using different embedding models. It supports text, images, audio,
-PDFs, and other media types with various embedding backends (Candle, ONNX, Cloud).
+videos, PDFs, and other media types with various embedding backends (Candle, ONNX, Cloud).
Main Functions:
---------------
@@ -11,6 +11,8 @@
- `embed_directory`: Embeds all files in a directory and returns a list of EmbedData objects.
- `embed_image_directory`: Embeds all images in a directory.
- `embed_audio_file`: Embeds audio files using Whisper for transcription.
+- `embed_video_file`: Embeds a video file by sampling frames.
+- `embed_video_directory`: Embeds all videos in a directory.
- `embed_webpage`: Embeds content from a webpage URL.
Main Classes:
@@ -18,6 +20,7 @@
- `EmbeddingModel`: Main class for loading and using embedding models.
- `EmbedData`: Represents embedded data with text, embedding vector, and metadata.
- `TextEmbedConfig`: Configuration for text embedding (chunking, batching, etc.).
+- `VideoEmbedConfig`: Configuration for video embedding (frame sampling, batching).
- `ColpaliModel`: Specialized model for document/image-text embedding.
- `ColbertModel`: Model for late-interaction embeddings.
- `Reranker`: Model for re-ranking search results.
diff --git a/python/python/embed_anything/_embed_anything.pyi b/python/python/embed_anything/_embed_anything.pyi
index b1a47c16..7e2ecc2b 100644
--- a/python/python/embed_anything/_embed_anything.pyi
+++ b/python/python/embed_anything/_embed_anything.pyi
@@ -268,6 +268,42 @@ def embed_audio_file(
),
) -> list[EmbedData]:
"""
+
+# NOTE(review): this hunk is inserted right after the opening """ of
+# embed_audio_file's docstring (see the context lines around it), so the two
+# definitions below land inside that docstring's span. Move them after
+# embed_audio_file's complete docstring before merging.
+def embed_video_file(
+ file_path: str,
+ embedder: EmbeddingModel,
+ config: VideoEmbedConfig | None = None,
+) -> list[EmbedData]:
+ """
+ Embeds the given video file by sampling frames and returns a list of EmbedData objects.
+
+ Args:
+ file_path: The path to the video file to embed.
+ embedder: The embedding model to use.
+ config: The configuration for video embedding.
+
+ Returns:
+ A list of EmbedData objects.
+ """
+
+def embed_video_directory(
+ file_path: str,
+ embedder: EmbeddingModel,
+ config: VideoEmbedConfig | None = None,
+ adapter: Adapter | None = None,
+) -> list[EmbedData] | None:
+ """
+ Embeds all videos in the given directory and returns a list of EmbedData objects.
+
+ Args:
+ file_path: The path to the directory containing videos to embed.
+ embedder: The embedding model to use.
+ config: The configuration for video embedding.
+ adapter: The adapter to use for storing the embeddings in a vector database.
+
+ Returns:
+ A list of EmbedData objects, or None if an adapter is used.
+ """
Embeds the given audio file and returns a list of EmbedData objects.
Args:
@@ -585,6 +621,29 @@ class ImageEmbedConfig:
buffer_size: int | None
batch_size: int | None
+class VideoEmbedConfig:
+ """
+ Represents the configuration for the Video Embedding model.
+
+ Any argument left as None is replaced by the library default for that field.
+
+ Attributes:
+ frame_step: Sample every Nth frame. Default is 30.
+ max_frames: Maximum number of frames to embed. Default is 300.
+ batch_size: The batch size for processing frames. Default is 32.
+ """
+
+ def __init__(
+ self,
+ frame_step: int | None = None,
+ max_frames: int | None = None,
+ batch_size: int | None = None,
+ ):
+ self.frame_step = frame_step
+ self.max_frames = max_frames
+ self.batch_size = batch_size
+ # Read-only views of the configured values.
+ frame_step: int | None
+ max_frames: int | None
+ batch_size: int | None
+
class EmbeddingModel:
"""
Represents an embedding model.
@@ -760,6 +819,40 @@ class EmbeddingModel:
A list of EmbedData objects.
"""
+ def embed_video_file(
+ self,
+ video_file: str,
+ config: VideoEmbedConfig | None = None,
+ ) -> list[EmbedData]:
+ """
+ Embeds the given video file by sampling frames and returns a list of EmbedData objects.
+
+ Args:
+ video_file: The path to the video file to embed.
+ config: The configuration for video embedding.
+
+ Returns:
+ A list of EmbedData objects.
+ """
+
+ def embed_video_directory(
+ self,
+ directory: str,
+ config: VideoEmbedConfig | None = None,
+ adapter: Adapter | None = None,
+ ) -> list[EmbedData] | None:
+ """
+ Embeds all videos in the given directory and returns a list of EmbedData objects.
+
+ Args:
+ directory: The path to the directory containing videos to embed.
+ config: The configuration for video embedding.
+ adapter: The adapter to use for storing the embeddings in a vector database.
+
+ Returns:
+ A list of EmbedData objects, or None if an adapter is used.
+ """
+
def embed_query(
self,
query: list[str],
diff --git a/python/src/config.rs b/python/src/config.rs
index 258c4468..f8a749d1 100644
--- a/python/src/config.rs
+++ b/python/src/config.rs
@@ -92,3 +92,44 @@ impl ImageEmbedConfig {
self.inner.batch_size
}
}
+
+// Python-facing wrapper around `embed_anything::config::VideoEmbedConfig`.
+// NOTE(review): every `Option` below appears without its type parameter in
+// this view; presumably `Option<usize>` — confirm against the core config.
+#[pyclass]
+#[derive(Clone, Default)]
+pub struct VideoEmbedConfig {
+ // The wrapped core configuration.
+ pub inner: embed_anything::config::VideoEmbedConfig,
+}
+
+#[pymethods]
+impl VideoEmbedConfig {
+ // Python constructor. Every argument is optional; an argument passed as
+ // `None` takes the corresponding field of the core config's `Default`.
+ #[new]
+ #[pyo3(signature = (frame_step=None, max_frames=None, batch_size=None))]
+ pub fn new(
+ frame_step: Option,
+ max_frames: Option,
+ batch_size: Option,
+ ) -> Self {
+ let default_config = embed_anything::config::VideoEmbedConfig::default();
+ Self {
+ inner: embed_anything::config::VideoEmbedConfig {
+ // `or` keeps the caller's value and falls back to the default only on `None`.
+ frame_step: frame_step.or(default_config.frame_step),
+ max_frames: max_frames.or(default_config.max_frames),
+ batch_size: batch_size.or(default_config.batch_size),
+ },
+ }
+ }
+
+ // Python property: the stored `frame_step`.
+ #[getter]
+ pub fn frame_step(&self) -> Option {
+ self.inner.frame_step
+ }
+
+ // Python property: the stored `max_frames`.
+ #[getter]
+ pub fn max_frames(&self) -> Option {
+ self.inner.max_frames
+ }
+
+ // Python property: the stored `batch_size`.
+ #[getter]
+ pub fn batch_size(&self) -> Option {
+ self.inner.batch_size
+ }
+}
diff --git a/python/src/lib.rs b/python/src/lib.rs
index f998750f..bef17ff7 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -7,6 +7,8 @@ use embed_anything::{
self,
config::TextEmbedConfig,
emb_audio,
+ embed_video_directory as embed_video_directory_rs,
+ embed_video_file as embed_video_file_rs,
embeddings::embed::{Embedder, EmbeddingResult},
file_processor::audio::audio_processor,
FileLoadingError,
@@ -385,6 +387,25 @@ impl EmbeddingModel {
) -> PyResult