huggingface · zucchini-nlp · Sep 4, 2025 · Sep 4, 2025 · Sep 4, 2025 · Sep 4, 2025
diff --git a/tests/models/glm4v/test_processor_glm4v.py b/tests/models/glm4v/test_processor_glm4v.py
@@ -204,12 +204,12 @@ def test_apply_chat_template_video_frame_sampling(self):
         messages[0][0]["content"][0] = {
             "type": "video",
             "url": url_to_local_path(
-                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4"
+                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
             ),
         }
 
         # Load with `video_fps` arg
-        video_fps = 1
+        video_fps = 10
         out_dict_with_video = processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
@@ -218,17 +218,18 @@ def test_apply_chat_template_video_frame_sampling(self):
             video_fps=video_fps,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 20)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)
 
-        # Load without any arg should load the whole video
+        # Load the whole video
         out_dict_with_video = processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
             tokenize=True,
             return_dict=True,
+            do_sample_frames=False,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 40)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 24)
 
         # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
         # because we assume they come from one video

diff --git a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py
@@ -457,7 +457,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             {
                 "type": "video",
                 "url": url_to_local_path(
-                    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4"
+                    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
                 ),
             }
         )
@@ -482,7 +482,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             fps=fps,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 14400)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5760)
 
         # Load with `fps` and `num_frames` args, should raise an error
         with self.assertRaises(ValueError):
@@ -503,7 +503,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             return_dict=True,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 432000)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 17280)
 
         # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
         # because we assume they come from one video

diff --git a/tests/models/qwen2_5_vl/test_processing_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_processing_qwen2_5_vl.py
@@ -275,7 +275,7 @@ def test_apply_chat_template_video_frame_sampling(self):
         messages[0][0]["content"][0] = {
             "type": "video",
             "url": url_to_local_path(
-                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4"
+                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
             ),
         }
         num_frames = 3
@@ -299,7 +299,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             fps=fps,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 900)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
 
         # Load with `fps` and `num_frames` args, should raise an error
         with self.assertRaises(ValueError):
@@ -320,7 +320,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             return_dict=True,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 27000)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1080)
 
         # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
         # because we assume they come from one video

diff --git a/tests/models/qwen2_vl/test_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_processing_qwen2_vl.py
@@ -275,7 +275,7 @@ def test_apply_chat_template_video_frame_sampling(self):
         messages[0][0]["content"][0] = {
             "type": "video",
             "url": url_to_local_path(
-                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4"
+                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
             ),
         }
         num_frames = 3
@@ -299,7 +299,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             fps=fps,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 900)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
 
         # Load with `fps` and `num_frames` args, should raise an error
         with self.assertRaises(ValueError):
@@ -320,7 +320,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             return_dict=True,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 27000)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1080)
 
         # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
         # because we assume they come from one video

diff --git a/tests/models/smolvlm/test_processing_smolvlm.py b/tests/models/smolvlm/test_processing_smolvlm.py
@@ -393,7 +393,7 @@ def test_apply_chat_template_video_frame_sampling(self):
                         {
                             "type": "video",
                             "url": url_to_local_path(
-                                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4"
+                                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
                             ),
                         },
                         {"type": "text", "text": "What is shown in this video?"},
@@ -414,10 +414,10 @@ def test_apply_chat_template_video_frame_sampling(self):
         self.assertTrue(self.videos_input_name in out_dict_with_video)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
         # SmolVLM doesn't sample `num_frames` exactly, by uses other sampling method
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 3)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 1)
 
         # Load with `fps` arg
-        fps = 1
+        fps = 10
         out_dict_with_video = processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
@@ -429,7 +429,7 @@ def test_apply_chat_template_video_frame_sampling(self):
         self.assertTrue(self.videos_input_name in out_dict_with_video)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
         # SmolVLM doesn't sample 1 frame per second exactly, by uses other sampling method
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), fps * 10)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 4)
 
         # NOTE: the last assert checks are removed
         # Loading video as a list of frames (i.e. images) is not supported in SmolVLM

diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py
@@ -1132,7 +1132,9 @@ def test_apply_chat_template_video_frame_sampling(self):
                     "content": [
                         {
                             "type": "video",
-                            "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4",
+                            "url": url_to_local_path(
+                                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
+                            ),
                         },
                         {"type": "text", "text": "What is shown in this video?"},
                     ],
@@ -1154,7 +1156,7 @@ def test_apply_chat_template_video_frame_sampling(self):
         self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), num_frames)
 
         # Load with `fps` arg
-        fps = 1
+        fps = 10
         out_dict_with_video = processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
@@ -1165,10 +1167,11 @@ def test_apply_chat_template_video_frame_sampling(self):
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), fps * 10)
+        # 3 frames are inferred from input video's length and FPS, so can be hardcoded
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 3)
 
         # Whan `do_sample_frames=False` no sampling is done and whole video is loaded, even if number of frames is passed
-        fps = 1
+        fps = 10
         out_dict_with_video = processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
@@ -1180,7 +1183,7 @@ def test_apply_chat_template_video_frame_sampling(self):
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 300)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 11)
 
         # Load with `fps` and `num_frames` args, should raise an error
         with self.assertRaises(ValueError):
@@ -1202,7 +1205,7 @@ def test_apply_chat_template_video_frame_sampling(self):
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 300)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 11)
 
         # Load video as a list of frames (i.e. images).
         # NOTE: each frame should have same size because we assume they come from one video

diff --git a/utils/fetch_hub_objects_for_ci.py b/utils/fetch_hub_objects_for_ci.py
@@ -32,6 +32,8 @@
     "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png",
     "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/two_dogs.jpg",
     "https://llava-vl.github.io/static/images/view.jpg",
+    "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4",
+    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4",
     "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
 ]