From 92ef2e5dc0467b31f6a65466778e7e3f1447847d Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Thu, 4 Sep 2025 10:07:57 +0200
Subject: [PATCH 1/2] load a tiny video to make CI faster

---
 tests/models/glm4v/test_processor_glm4v.py          | 11 ++++++-----
 .../qwen2_5_omni/test_processing_qwen2_5_omni.py    |  6 +++---
 .../models/qwen2_5_vl/test_processing_qwen2_5_vl.py |  6 +++---
 tests/models/qwen2_vl/test_processing_qwen2_vl.py   |  6 +++---
 tests/models/smolvlm/test_processing_smolvlm.py     |  8 ++++----
 tests/test_processing_common.py                     | 13 +++++++------
 6 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/tests/models/glm4v/test_processor_glm4v.py b/tests/models/glm4v/test_processor_glm4v.py
index 69413b786d7b..c4d9240c5fd5 100644
--- a/tests/models/glm4v/test_processor_glm4v.py
+++ b/tests/models/glm4v/test_processor_glm4v.py
@@ -204,12 +204,12 @@ def test_apply_chat_template_video_frame_sampling(self):
         messages[0][0]["content"][0] = {
             "type": "video",
             "url": url_to_local_path(
-                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4"
+                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
             ),
         }
 
         # Load with `video_fps` arg
-        video_fps = 1
+        video_fps = 10
         out_dict_with_video = processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
@@ -218,17 +218,18 @@ def test_apply_chat_template_video_frame_sampling(self):
             video_fps=video_fps,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 20)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)
 
-        # Load without any arg should load the whole video
+        # Load the whole video
         out_dict_with_video = processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
             tokenize=True,
             return_dict=True,
+            do_sample_frames=False,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 40)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 24)
 
         # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
         # because we assume they come from one video
diff --git a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py
index 658c67af7cbc..ca6c2a395554 100644
--- a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py
+++ b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py
@@ -457,7 +457,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             {
                 "type": "video",
                 "url": url_to_local_path(
-                    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4"
+                    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
                 ),
             }
         )
@@ -482,7 +482,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             fps=fps,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 14400)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5760)
 
         # Load with `fps` and `num_frames` args, should raise an error
         with self.assertRaises(ValueError):
@@ -503,7 +503,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             return_dict=True,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 432000)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 17280)
 
         # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
         # because we assume they come from one video
diff --git a/tests/models/qwen2_5_vl/test_processing_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_processing_qwen2_5_vl.py
index 02ce2e4c8602..118a88bdb85c 100644
--- a/tests/models/qwen2_5_vl/test_processing_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_processing_qwen2_5_vl.py
@@ -275,7 +275,7 @@ def test_apply_chat_template_video_frame_sampling(self):
         messages[0][0]["content"][0] = {
             "type": "video",
             "url": url_to_local_path(
-                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4"
+                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
             ),
         }
         num_frames = 3
@@ -299,7 +299,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             fps=fps,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 900)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
 
         # Load with `fps` and `num_frames` args, should raise an error
         with self.assertRaises(ValueError):
@@ -320,7 +320,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             return_dict=True,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 27000)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1080)
 
         # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
         # because we assume they come from one video
diff --git a/tests/models/qwen2_vl/test_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_processing_qwen2_vl.py
index e2882c2e29df..dc801408452b 100644
--- a/tests/models/qwen2_vl/test_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_processing_qwen2_vl.py
@@ -275,7 +275,7 @@ def test_apply_chat_template_video_frame_sampling(self):
         messages[0][0]["content"][0] = {
             "type": "video",
             "url": url_to_local_path(
-                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4"
+                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
             ),
         }
         num_frames = 3
@@ -299,7 +299,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             fps=fps,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 900)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
 
         # Load with `fps` and `num_frames` args, should raise an error
         with self.assertRaises(ValueError):
@@ -320,7 +320,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             return_dict=True,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 27000)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1080)
 
         # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
         # because we assume they come from one video
diff --git a/tests/models/smolvlm/test_processing_smolvlm.py b/tests/models/smolvlm/test_processing_smolvlm.py
index b373e2bb025a..6c7ce6a0bba5 100644
--- a/tests/models/smolvlm/test_processing_smolvlm.py
+++ b/tests/models/smolvlm/test_processing_smolvlm.py
@@ -398,7 +398,7 @@ def test_apply_chat_template_video_frame_sampling(self):
                         {
                             "type": "video",
                             "url": url_to_local_path(
-                                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4"
+                                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
                             ),
                         },
                         {"type": "text", "text": "What is shown in this video?"},
@@ -419,10 +419,10 @@ def test_apply_chat_template_video_frame_sampling(self):
         self.assertTrue(self.videos_input_name in out_dict_with_video)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
         # SmolVLM doesn't sample `num_frames` exactly, by uses other sampling method
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 3)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 1)
 
         # Load with `fps` arg
-        fps = 1
+        fps = 10
         out_dict_with_video = processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
@@ -434,7 +434,7 @@ def test_apply_chat_template_video_frame_sampling(self):
         self.assertTrue(self.videos_input_name in out_dict_with_video)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
         # SmolVLM doesn't sample 1 frame per second exactly, by uses other sampling method
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), fps * 10)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 4)
 
         # NOTE: the last assert checks are removed
         # Loading video as a list of frames (i.e. images) is not supported in SmolVLM
diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py
index e1b36cd2445f..cf861188dc69 100644
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -1134,7 +1134,7 @@ def test_apply_chat_template_video_frame_sampling(self):
                     "content": [
                         {
                             "type": "video",
-                            "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4",
+                            "url": "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4",
                         },
                         {"type": "text", "text": "What is shown in this video?"},
                     ],
@@ -1156,7 +1156,7 @@ def test_apply_chat_template_video_frame_sampling(self):
         self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), num_frames)
 
         # Load with `fps` arg
-        fps = 1
+        fps = 10
         out_dict_with_video = processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
@@ -1167,10 +1167,11 @@ def test_apply_chat_template_video_frame_sampling(self):
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), fps * 10)
+        # Sampling at `fps=10` yields 3 frames for this video (fixed by its length and native FPS), so the count is hardcoded
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 3)
 
         # Whan `do_sample_frames=False` no sampling is done and whole video is loaded, even if number of frames is passed
-        fps = 1
+        fps = 10
         out_dict_with_video = processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
@@ -1182,7 +1183,7 @@ def test_apply_chat_template_video_frame_sampling(self):
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 300)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 11)
 
         # Load with `fps` and `num_frames` args, should raise an error
         with self.assertRaises(ValueError):
@@ -1204,7 +1205,7 @@ def test_apply_chat_template_video_frame_sampling(self):
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 300)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 11)
 
         # Load video as a list of frames (i.e. images).
         # NOTE: each frame should have same size because we assume they come from one video

From 38e09db195531ebeee9cf199352f28754c166eab Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Thu, 4 Sep 2025 14:38:20 +0200
Subject: [PATCH 2/2] wrap tiny video URL in url_to_local_path and add it to the CI fetch list

---
 tests/test_processing_common.py   | 4 +++-
 utils/fetch_hub_objects_for_ci.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py
index cf861188dc69..acd33b78c5f3 100644
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -1134,7 +1134,9 @@ def test_apply_chat_template_video_frame_sampling(self):
                     "content": [
                         {
                             "type": "video",
-                            "url": "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4",
+                            "url": url_to_local_path(
+                                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
+                            ),
                         },
                         {"type": "text", "text": "What is shown in this video?"},
                     ],
diff --git a/utils/fetch_hub_objects_for_ci.py b/utils/fetch_hub_objects_for_ci.py
index f1dcc5467b4f..94a9061aa568 100644
--- a/utils/fetch_hub_objects_for_ci.py
+++ b/utils/fetch_hub_objects_for_ci.py
@@ -18,6 +18,7 @@
     "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
     "https://llava-vl.github.io/static/images/view.jpg",
     "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4",
+    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4",
 ]