From 92ef2e5dc0467b31f6a65466778e7e3f1447847d Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 4 Sep 2025 10:07:57 +0200 Subject: [PATCH 1/2] load a tiny video to make CI faster --- tests/models/glm4v/test_processor_glm4v.py | 11 ++++++----- .../qwen2_5_omni/test_processing_qwen2_5_omni.py | 6 +++--- .../models/qwen2_5_vl/test_processing_qwen2_5_vl.py | 6 +++--- tests/models/qwen2_vl/test_processing_qwen2_vl.py | 6 +++--- tests/models/smolvlm/test_processing_smolvlm.py | 8 ++++---- tests/test_processing_common.py | 13 +++++++------ 6 files changed, 26 insertions(+), 24 deletions(-) diff --git a/tests/models/glm4v/test_processor_glm4v.py b/tests/models/glm4v/test_processor_glm4v.py index 69413b786d7b..c4d9240c5fd5 100644 --- a/tests/models/glm4v/test_processor_glm4v.py +++ b/tests/models/glm4v/test_processor_glm4v.py @@ -204,12 +204,12 @@ def test_apply_chat_template_video_frame_sampling(self): messages[0][0]["content"][0] = { "type": "video", "url": url_to_local_path( - "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4" + "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4" ), } # Load with `video_fps` arg - video_fps = 1 + video_fps = 10 out_dict_with_video = processor.apply_chat_template( messages, add_generation_prompt=True, @@ -218,17 +218,18 @@ def test_apply_chat_template_video_frame_sampling(self): video_fps=video_fps, ) self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 20) + self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8) - # Load without any arg should load the whole video + # Load the whole video out_dict_with_video = processor.apply_chat_template( messages, add_generation_prompt=True, tokenize=True, return_dict=True, + do_sample_frames=False, ) self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 40) + self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 24) # Load video as a list of frames (i.e. images). NOTE: each frame should have same size # because we assume they come from one video diff --git a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py index 658c67af7cbc..ca6c2a395554 100644 --- a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py +++ b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py @@ -457,7 +457,7 @@ def test_apply_chat_template_video_frame_sampling(self): { "type": "video", "url": url_to_local_path( - "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4" + "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4" ), } ) @@ -482,7 +482,7 @@ def test_apply_chat_template_video_frame_sampling(self): fps=fps, ) self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 14400) + self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5760) # Load with `fps` and `num_frames` args, should raise an error with self.assertRaises(ValueError): @@ -503,7 +503,7 @@ def test_apply_chat_template_video_frame_sampling(self): return_dict=True, ) self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 432000) + self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 17280) # Load video as a list of frames (i.e. images). NOTE: each frame should have same size # because we assume they come from one video diff --git a/tests/models/qwen2_5_vl/test_processing_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_processing_qwen2_5_vl.py index 02ce2e4c8602..118a88bdb85c 100644 --- a/tests/models/qwen2_5_vl/test_processing_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_processing_qwen2_5_vl.py @@ -275,7 +275,7 @@ def test_apply_chat_template_video_frame_sampling(self): messages[0][0]["content"][0] = { "type": "video", "url": url_to_local_path( - "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4" + "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4" ), } num_frames = 3 @@ -299,7 +299,7 @@ def test_apply_chat_template_video_frame_sampling(self): fps=fps, ) self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 900) + self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360) # Load with `fps` and `num_frames` args, should raise an error with self.assertRaises(ValueError): @@ -320,7 +320,7 @@ def test_apply_chat_template_video_frame_sampling(self): return_dict=True, ) self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 27000) + self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1080) # Load video as a list of frames (i.e. images). NOTE: each frame should have same size # because we assume they come from one video diff --git a/tests/models/qwen2_vl/test_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_processing_qwen2_vl.py index e2882c2e29df..dc801408452b 100644 --- a/tests/models/qwen2_vl/test_processing_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_processing_qwen2_vl.py @@ -275,7 +275,7 @@ def test_apply_chat_template_video_frame_sampling(self): messages[0][0]["content"][0] = { "type": "video", "url": url_to_local_path( - "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4" + "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4" ), } num_frames = 3 @@ -299,7 +299,7 @@ def test_apply_chat_template_video_frame_sampling(self): fps=fps, ) self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 900) + self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360) # Load with `fps` and `num_frames` args, should raise an error with self.assertRaises(ValueError): @@ -320,7 +320,7 @@ def test_apply_chat_template_video_frame_sampling(self): return_dict=True, ) self.assertTrue(self.videos_input_name in out_dict_with_video) - self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 27000) + self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1080) # Load video as a list of frames (i.e. images). NOTE: each frame should have same size # because we assume they come from one video diff --git a/tests/models/smolvlm/test_processing_smolvlm.py b/tests/models/smolvlm/test_processing_smolvlm.py index b373e2bb025a..6c7ce6a0bba5 100644 --- a/tests/models/smolvlm/test_processing_smolvlm.py +++ b/tests/models/smolvlm/test_processing_smolvlm.py @@ -398,7 +398,7 @@ def test_apply_chat_template_video_frame_sampling(self): { "type": "video", "url": url_to_local_path( - "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4" + "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4" ), }, {"type": "text", "text": "What is shown in this video?"}, @@ -419,10 +419,10 @@ def test_apply_chat_template_video_frame_sampling(self): self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1) # SmolVLM doesn't sample `num_frames` exactly, by uses other sampling method - self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 3) + self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 1) # Load with `fps` arg - fps = 1 + fps = 10 out_dict_with_video = processor.apply_chat_template( messages, add_generation_prompt=True, @@ -434,7 +434,7 @@ def test_apply_chat_template_video_frame_sampling(self): self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1) # SmolVLM doesn't sample 1 frame per second exactly, by uses other sampling method - self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), fps * 10) + self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 4) # NOTE: the last assert checks are removed # Loading video as a list of frames (i.e. images) is not supported in SmolVLM diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index e1b36cd2445f..cf861188dc69 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -1134,7 +1134,7 @@ def test_apply_chat_template_video_frame_sampling(self): "content": [ { "type": "video", - "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4", + "url": "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4", }, {"type": "text", "text": "What is shown in this video?"}, ], @@ -1156,7 +1156,7 @@ def test_apply_chat_template_video_frame_sampling(self): self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), num_frames) # Load with `fps` arg - fps = 1 + fps = 10 out_dict_with_video = processor.apply_chat_template( messages, add_generation_prompt=True, @@ -1167,10 +1167,11 @@ def test_apply_chat_template_video_frame_sampling(self): ) self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1) - self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), fps * 10) + # 3 frames are inferred from input video's length and FPS, so can be hardcoded + self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 3) # Whan `do_sample_frames=False` no sampling is done and whole video is loaded, even if number of frames is passed - fps = 1 + fps = 10 out_dict_with_video = processor.apply_chat_template( messages, add_generation_prompt=True, @@ -1182,7 +1183,7 @@ def test_apply_chat_template_video_frame_sampling(self): ) self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1) - self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 300) + self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 11) # Load with `fps` and `num_frames` args, should raise an error with self.assertRaises(ValueError): @@ -1204,7 +1205,7 @@ def test_apply_chat_template_video_frame_sampling(self): ) self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1) - self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 300) + self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 11) # Load video as a list of frames (i.e. images). # NOTE: each frame should have same size because we assume they come from one video From 38e09db195531ebeee9cf199352f28754c166eab Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 4 Sep 2025 14:38:20 +0200 Subject: [PATCH 2/2] add video in url_to_local_path --- tests/test_processing_common.py | 4 +++- utils/fetch_hub_objects_for_ci.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index cf861188dc69..acd33b78c5f3 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -1134,7 +1134,9 @@ def test_apply_chat_template_video_frame_sampling(self): "content": [ { "type": "video", - "url": "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4", + "url": url_to_local_path( + "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4" + ), }, {"type": "text", "text": "What is shown in this video?"}, ], diff --git a/utils/fetch_hub_objects_for_ci.py b/utils/fetch_hub_objects_for_ci.py index f1dcc5467b4f..94a9061aa568 100644 --- a/utils/fetch_hub_objects_for_ci.py +++ b/utils/fetch_hub_objects_for_ci.py @@ -18,6 +18,7 @@ "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg", "https://llava-vl.github.io/static/images/view.jpg", "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4", + "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4", ]