diff --git a/docs/source/io.rst b/docs/source/io.rst index 478321a4e6d..a77dad39986 100644 --- a/docs/source/io.rst +++ b/docs/source/io.rst @@ -104,16 +104,3 @@ Video - DEPREACTED read_video read_video_timestamps write_video - - -**Fine-grained video API** - -In addition to the :mod:`read_video` function, we provide a high-performance -lower-level API for more fine-grained control compared to the :mod:`read_video` function. -It does all this whilst fully supporting torchscript. - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - VideoReader diff --git a/test/test_datasets_video_utils_opt.py b/test/test_datasets_video_utils_opt.py deleted file mode 100644 index 5e6b19bfb95..00000000000 --- a/test/test_datasets_video_utils_opt.py +++ /dev/null @@ -1,12 +0,0 @@ -import unittest - -import test_datasets_video_utils -from torchvision import set_video_backend # noqa: 401 - -# Disabling the video backend switching temporarily -# set_video_backend('video_reader') - - -if __name__ == "__main__": - suite = unittest.TestLoader().loadTestsFromModule(test_datasets_video_utils) - unittest.TextTestRunner(verbosity=1).run(suite) diff --git a/test/test_io.py b/test/test_io.py index d2950ac9595..84d30ee3297 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -7,7 +7,6 @@ import torch import torchvision.io as io from common_utils import assert_equal, cpu_and_cuda -from torchvision import get_video_backend try: @@ -45,12 +44,7 @@ def temp_video(num_frames, height, width, fps, lossless=False, video_codec=None, options = {"crf": "0"} if video_codec is None: - if get_video_backend() == "pyav": - video_codec = "libx264" - else: - # when video_codec is not set, we assume it is libx264rgb which accepts - # RGB pixel formats as input instead of YUV - video_codec = "libx264rgb" + video_codec = "libx264" if options is None: options = {} @@ -62,9 +56,6 @@ def temp_video(num_frames, height, width, fps, lossless=False, video_codec=None, os.unlink(f.name) -@pytest.mark.skipif( - get_video_backend() != "pyav" and not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend not available" -) @pytest.mark.skipif(av is None, reason="PyAV unavailable") class TestVideo: # compression adds artifacts, thus we add a tolerance of @@ -77,22 +68,6 @@ def test_write_read_video(self): assert_equal(data, lv) assert info["video_fps"] == 5 - @pytest.mark.skipif(not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend is not chosen") - def test_probe_video_from_file(self): - with temp_video(10, 300, 300, 5) as (f_name, data): - video_info = io._probe_video_from_file(f_name) - assert pytest.approx(2, rel=0.0, abs=0.1) == video_info.video_duration - assert pytest.approx(5, rel=0.0, abs=0.1) == video_info.video_fps - - @pytest.mark.skipif(not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend is not chosen") - def test_probe_video_from_memory(self): - with temp_video(10, 300, 300, 5) as (f_name, data): - with open(f_name, "rb") as fp: - filebuffer = fp.read() - video_info = io._probe_video_from_memory(filebuffer) - assert pytest.approx(2, rel=0.0, abs=0.1) == video_info.video_duration - assert pytest.approx(5, rel=0.0, abs=0.1) == video_info.video_fps - def test_read_timestamps(self): with temp_video(10, 300, 300, 5) as (f_name, data): pts, _ = io.read_video_timestamps(f_name) @@ -118,12 +93,9 @@ def test_read_partial_video(self, start, offset): assert len(lv) == offset assert_equal(s_data, lv) - if get_video_backend() == "pyav": - # for "video_reader" backend, we don't decode the closest early frame - # when the given start pts is not matching any frame pts - lv, _, _ = io.read_video(f_name, pts[4] + 1, pts[7]) - assert len(lv) == 4 - assert_equal(data[4:8], lv) + lv, _, _ = io.read_video(f_name, pts[4] + 1, pts[7]) + assert len(lv) == 4 + assert_equal(data[4:8], lv) @pytest.mark.parametrize("start", range(0, 80, 20)) @pytest.mark.parametrize("offset", range(1, 4)) @@ -139,13 +111,8 @@ def test_read_partial_video_bframes(self, start, offset): assert_equal(s_data, lv, rtol=0.0, atol=self.TOLERANCE) lv, _, _ = io.read_video(f_name, pts[4] + 1, pts[7]) - # TODO fix this - if get_video_backend() == "pyav": - assert len(lv) == 4 - assert_equal(data[4:8], lv, rtol=0.0, atol=self.TOLERANCE) - else: - assert len(lv) == 3 - assert_equal(data[5:8], lv, rtol=0.0, atol=self.TOLERANCE) + assert len(lv) == 4 + assert_equal(data[4:8], lv, rtol=0.0, atol=self.TOLERANCE) def test_read_packed_b_frames_divx_file(self): name = "hmdb51_Turnk_r_Pippi_Michel_cartwheel_f_cm_np2_le_med_6.avi" @@ -207,11 +174,8 @@ def test_read_partial_video_pts_unit_sec(self, start, offset): lv, _, _ = io.read_video( f_name, int(pts[4] * (1.0 / stream.time_base) + 1) * stream.time_base, pts[7], pts_unit="sec" ) - if get_video_backend() == "pyav": - # for "video_reader" backend, we don't decode the closest early frame - # when the given start pts is not matching any frame pts - assert len(lv) == 4 - assert_equal(data[4:8], lv) + assert len(lv) == 4 + assert_equal(data[4:8], lv) def test_read_video_corrupted_file(self): with tempfile.NamedTemporaryFile(suffix=".mp4") as f: @@ -243,11 +207,7 @@ def test_read_video_partially_corrupted_file(self): # this exercises the container.decode assertion check video, audio, info = io.read_video(f.name, pts_unit="sec") # check that size is not equal to 5, but 3 - # TODO fix this - if get_video_backend() == "pyav": - assert len(video) == 3 - else: - assert len(video) == 4 + assert len(video) == 3 # but the valid decoded content is still correct assert_equal(video[:3], data[:3]) # and the last few frames are wrong diff --git a/test/test_io_opt.py b/test/test_io_opt.py deleted file mode 100644 index f4e3d305295..00000000000 --- a/test/test_io_opt.py +++ /dev/null @@ -1,13 +0,0 @@ -import unittest - -import test_io -from torchvision import set_video_backend # noqa: 401 - - -# Disabling the video backend switching temporarily -# set_video_backend('video_reader') - - -if __name__ == "__main__": - suite = unittest.TestLoader().loadTestsFromModule(test_io) - unittest.TextTestRunner(verbosity=1).run(suite) diff --git a/test/test_video_reader.py b/test/test_video_reader.py deleted file mode 100644 index 10995424982..00000000000 --- a/test/test_video_reader.py +++ /dev/null @@ -1,1254 +0,0 @@ -import collections -import math -import os -from fractions import Fraction - -import numpy as np -import pytest -import torch -import torchvision.io as io -from common_utils import assert_equal -from numpy.random import randint -from pytest import approx -from torchvision import set_video_backend -from torchvision.io import _HAS_CPU_VIDEO_DECODER - - -try: - import av - - # Do a version test too - io.video._check_av_available() -except ImportError: - av = None - - -VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos") - -CheckerConfig = [ - "duration", - "video_fps", - "audio_sample_rate", - # We find for some videos (e.g. HMDB51 videos), the decoded audio frames and pts are - # slightly different between TorchVision decoder and PyAv decoder. So omit it during check - "check_aframes", - "check_aframe_pts", -] -GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig)) - -all_check_config = GroundTruth( - duration=0, - video_fps=0, - audio_sample_rate=0, - check_aframes=True, - check_aframe_pts=True, -) - -test_videos = { - "RATRACE_wave_f_nm_np1_fr_goo_37.avi": GroundTruth( - duration=2.0, - video_fps=30.0, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "SchoolRulesHowTheyHelpUs_wave_f_nm_np1_ba_med_0.avi": GroundTruth( - duration=2.0, - video_fps=30.0, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "TrumanShow_wave_f_nm_np1_fr_med_26.avi": GroundTruth( - duration=2.0, - video_fps=30.0, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "v_SoccerJuggling_g23_c01.avi": GroundTruth( - duration=8.0, - video_fps=29.97, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "v_SoccerJuggling_g24_c01.avi": GroundTruth( - duration=8.0, - video_fps=29.97, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "R6llTwEh07w.mp4": GroundTruth( - duration=10.0, - video_fps=30.0, - audio_sample_rate=44100, - # PyAv miss one audio frame at the beginning (pts=0) - check_aframes=False, - check_aframe_pts=False, - ), - "SOX5yA1l24A.mp4": GroundTruth( - duration=11.0, - video_fps=29.97, - audio_sample_rate=48000, - # PyAv miss one audio frame at the beginning (pts=0) - check_aframes=False, - check_aframe_pts=False, - ), - "WUzgd7C1pWA.mp4": GroundTruth( - duration=11.0, - video_fps=29.97, - audio_sample_rate=48000, - # PyAv miss one audio frame at the beginning (pts=0) - check_aframes=False, - check_aframe_pts=False, - ), -} - - -DecoderResult = collections.namedtuple("DecoderResult", "vframes vframe_pts vtimebase aframes aframe_pts atimebase") - -# av_seek_frame is imprecise so seek to a timestamp earlier by a margin -# The unit of margin is second -SEEK_FRAME_MARGIN = 0.25 - - -def _read_from_stream(container, start_pts, end_pts, stream, stream_name, buffer_size=4): - """ - Args: - container: pyav container - start_pts/end_pts: the starting/ending Presentation TimeStamp where - frames are read - stream: pyav stream - stream_name: a dictionary of streams. For example, {"video": 0} means - video stream at stream index 0 - buffer_size: pts of frames decoded by PyAv is not guaranteed to be in - ascending order. We need to decode more frames even when we meet end - pts - """ - # seeking in the stream is imprecise. Thus, seek to an earlier PTS by a margin - margin = 1 - seek_offset = max(start_pts - margin, 0) - - container.seek(seek_offset, any_frame=False, backward=True, stream=stream) - frames = {} - buffer_count = 0 - for frame in container.decode(**stream_name): - if frame.pts < start_pts: - continue - if frame.pts <= end_pts: - frames[frame.pts] = frame - else: - buffer_count += 1 - if buffer_count >= buffer_size: - break - result = [frames[pts] for pts in sorted(frames)] - - return result - - -def _get_timebase_by_av_module(full_path): - container = av.open(full_path) - video_time_base = container.streams.video[0].time_base - if container.streams.audio: - audio_time_base = container.streams.audio[0].time_base - else: - audio_time_base = None - return video_time_base, audio_time_base - - -def _fraction_to_tensor(fraction): - ret = torch.zeros([2], dtype=torch.int32) - ret[0] = fraction.numerator - ret[1] = fraction.denominator - return ret - - -def _decode_frames_by_av_module( - full_path, - video_start_pts=0, - video_end_pts=None, - audio_start_pts=0, - audio_end_pts=None, -): - """ - Use PyAv to decode video frames. This provides a reference for our decoder - to compare the decoding results. - Input arguments: - full_path: video file path - video_start_pts/video_end_pts: the starting/ending Presentation TimeStamp where - frames are read - """ - if video_end_pts is None: - video_end_pts = float("inf") - if audio_end_pts is None: - audio_end_pts = float("inf") - container = av.open(full_path) - - video_frames = [] - vtimebase = torch.zeros([0], dtype=torch.int32) - if container.streams.video: - video_frames = _read_from_stream( - container, - video_start_pts, - video_end_pts, - container.streams.video[0], - {"video": 0}, - ) - # container.streams.video[0].average_rate is not a reliable estimator of - # frame rate. It can be wrong for certain codec, such as VP80 - # So we do not return video fps here - vtimebase = _fraction_to_tensor(container.streams.video[0].time_base) - - audio_frames = [] - atimebase = torch.zeros([0], dtype=torch.int32) - if container.streams.audio: - audio_frames = _read_from_stream( - container, - audio_start_pts, - audio_end_pts, - container.streams.audio[0], - {"audio": 0}, - ) - atimebase = _fraction_to_tensor(container.streams.audio[0].time_base) - - container.close() - vframes = [frame.to_rgb().to_ndarray() for frame in video_frames] - vframes = torch.as_tensor(np.stack(vframes)) - - vframe_pts = torch.tensor([frame.pts for frame in video_frames], dtype=torch.int64) - - aframes = [frame.to_ndarray() for frame in audio_frames] - if aframes: - aframes = np.transpose(np.concatenate(aframes, axis=1)) - aframes = torch.as_tensor(aframes) - else: - aframes = torch.empty((1, 0), dtype=torch.float32) - - aframe_pts = torch.tensor([audio_frame.pts for audio_frame in audio_frames], dtype=torch.int64) - - return DecoderResult( - vframes=vframes, - vframe_pts=vframe_pts, - vtimebase=vtimebase, - aframes=aframes, - aframe_pts=aframe_pts, - atimebase=atimebase, - ) - - -def _pts_convert(pts, timebase_from, timebase_to, round_func=math.floor): - """convert pts between different time bases - Args: - pts: presentation timestamp, float - timebase_from: original timebase. Fraction - timebase_to: new timebase. Fraction - round_func: rounding function. - """ - new_pts = Fraction(pts, 1) * timebase_from / timebase_to - return int(round_func(new_pts)) - - -def _get_video_tensor(video_dir, video_file): - """open a video file, and represent the video data by a PT tensor""" - full_path = os.path.join(video_dir, video_file) - - assert os.path.exists(full_path), "File not found: %s" % full_path - - with open(full_path, "rb") as fp: - video_tensor = torch.frombuffer(fp.read(), dtype=torch.uint8) - - return full_path, video_tensor - - -@pytest.mark.skipif(av is None, reason="PyAV unavailable") -@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg") -class TestVideoReader: - def check_separate_decoding_result(self, tv_result, config): - """check the decoding results from TorchVision decoder""" - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - - video_duration = vduration.item() * Fraction(vtimebase[0].item(), vtimebase[1].item()) - assert video_duration == approx(config.duration, abs=0.5) - - assert vfps.item() == approx(config.video_fps, abs=0.5) - - if asample_rate.numel() > 0: - assert asample_rate.item() == config.audio_sample_rate - audio_duration = aduration.item() * Fraction(atimebase[0].item(), atimebase[1].item()) - assert audio_duration == approx(config.duration, abs=0.5) - - # check if pts of video frames are sorted in ascending order - for i in range(len(vframe_pts) - 1): - assert vframe_pts[i] < vframe_pts[i + 1] - - if len(aframe_pts) > 1: - # check if pts of audio frames are sorted in ascending order - for i in range(len(aframe_pts) - 1): - assert aframe_pts[i] < aframe_pts[i + 1] - - def check_probe_result(self, result, config): - vtimebase, vfps, vduration, atimebase, asample_rate, aduration = result - video_duration = vduration.item() * Fraction(vtimebase[0].item(), vtimebase[1].item()) - assert video_duration == approx(config.duration, abs=0.5) - assert vfps.item() == approx(config.video_fps, abs=0.5) - if asample_rate.numel() > 0: - assert asample_rate.item() == config.audio_sample_rate - audio_duration = aduration.item() * Fraction(atimebase[0].item(), atimebase[1].item()) - assert audio_duration == approx(config.duration, abs=0.5) - - def check_meta_result(self, result, config): - assert result.video_duration == approx(config.duration, abs=0.5) - assert result.video_fps == approx(config.video_fps, abs=0.5) - if result.has_audio > 0: - assert result.audio_sample_rate == config.audio_sample_rate - assert result.audio_duration == approx(config.duration, abs=0.5) - - def compare_decoding_result(self, tv_result, ref_result, config=all_check_config): - """ - Compare decoding results from two sources. - Args: - tv_result: decoding results from TorchVision decoder - ref_result: reference decoding results which can be from either PyAv - decoder or TorchVision decoder with getPtsOnly = 1 - config: config of decoding results checker - """ - ( - vframes, - vframe_pts, - vtimebase, - _vfps, - _vduration, - aframes, - aframe_pts, - atimebase, - _asample_rate, - _aduration, - ) = tv_result - if isinstance(ref_result, list): - # the ref_result is from new video_reader decoder - ref_result = DecoderResult( - vframes=ref_result[0], - vframe_pts=ref_result[1], - vtimebase=ref_result[2], - aframes=ref_result[5], - aframe_pts=ref_result[6], - atimebase=ref_result[7], - ) - - if vframes.numel() > 0 and ref_result.vframes.numel() > 0: - mean_delta = torch.mean(torch.abs(vframes.float() - ref_result.vframes.float())) - assert mean_delta == approx(0.0, abs=8.0) - - mean_delta = torch.mean(torch.abs(vframe_pts.float() - ref_result.vframe_pts.float())) - assert mean_delta == approx(0.0, abs=1.0) - - assert_equal(vtimebase, ref_result.vtimebase) - - if config.check_aframes and aframes.numel() > 0 and ref_result.aframes.numel() > 0: - """Audio stream is available and audio frame is required to return - from decoder""" - assert_equal(aframes, ref_result.aframes) - - if config.check_aframe_pts and aframe_pts.numel() > 0 and ref_result.aframe_pts.numel() > 0: - """Audio stream is available""" - assert_equal(aframe_pts, ref_result.aframe_pts) - - assert_equal(atimebase, ref_result.atimebase) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_stress_test_read_video_from_file(self, test_video): - pytest.skip( - "This stress test will iteratively decode the same set of videos." - "It helps to detect memory leak but it takes lots of time to run." - "By default, it is disabled" - ) - num_iter = 10000 - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - for _i in range(num_iter): - full_path = os.path.join(VIDEO_DIR, test_video) - - # pass 1: decode all frames using new decoder - torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_read_video_from_file(self, test_video, config): - """ - Test the case when decoder starts with a video file to decode frames. - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - # pass 1: decode all frames using new decoder - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - # pass 2: decode all frames using av - pyav_result = _decode_frames_by_av_module(full_path) - # check results from TorchVision decoder - self.check_separate_decoding_result(tv_result, config) - # compare decoding results - self.compare_decoding_result(tv_result, pyav_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - @pytest.mark.parametrize("read_video_stream,read_audio_stream", [(1, 0), (0, 1)]) - def test_read_video_from_file_read_single_stream_only( - self, test_video, config, read_video_stream, read_audio_stream - ): - """ - Test the case when decoder starts with a video file to decode frames, and - only reads video stream and ignores audio stream - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - # decode all frames using new decoder - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - read_video_stream, - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - read_audio_stream, - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - - assert (vframes.numel() > 0) is bool(read_video_stream) - assert (vframe_pts.numel() > 0) is bool(read_video_stream) - assert (vtimebase.numel() > 0) is bool(read_video_stream) - assert (vfps.numel() > 0) is bool(read_video_stream) - - expect_audio_data = read_audio_stream == 1 and config.audio_sample_rate is not None - assert (aframes.numel() > 0) is bool(expect_audio_data) - assert (aframe_pts.numel() > 0) is bool(expect_audio_data) - assert (atimebase.numel() > 0) is bool(expect_audio_data) - assert (asample_rate.numel() > 0) is bool(expect_audio_data) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_min_dimension(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video min dimension between height and width is set. - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 128, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2)) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_max_dimension(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video min dimension between height and width is set. - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 85 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2)) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_both_min_max_dimension(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video min dimension between height and width is set. - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 64, 85 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2)) - assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2)) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_width(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video width is set. - """ - # video related - width, height, min_dimension, max_dimension = 256, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert tv_result[0].size(2) == width - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_height(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video height is set. - """ - # video related - width, height, min_dimension, max_dimension = 0, 224, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert tv_result[0].size(1) == height - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_width_and_height(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - both video height and width are set. - """ - # video related - width, height, min_dimension, max_dimension = 320, 240, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert tv_result[0].size(1) == height - assert tv_result[0].size(2) == width - - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("samples", [9600, 96000]) - def test_read_video_from_file_audio_resampling(self, test_video, samples): - """ - Test the case when decoder starts with a video file to decode frames, and - audio waveform are resampled - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - channels = 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - if aframes.numel() > 0: - assert samples == asample_rate.item() - assert 1 == aframes.size(1) - # when audio stream is found - duration = float(aframe_pts[-1]) * float(atimebase[0]) / float(atimebase[1]) - assert aframes.size(0) == approx(int(duration * asample_rate.item()), abs=0.1 * asample_rate.item()) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_compare_read_video_from_memory_and_file(self, test_video, config): - """ - Test the case when video is already in memory, and decoder reads data in memory - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # pass 1: decode all frames using cpp decoder - tv_result_memory = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - self.check_separate_decoding_result(tv_result_memory, config) - # pass 2: decode all frames from file - tv_result_file = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - self.check_separate_decoding_result(tv_result_file, config) - # finally, compare results decoded from memory and file - self.compare_decoding_result(tv_result_memory, tv_result_file) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_read_video_from_memory(self, test_video, config): - """ - Test the case when video is already in memory, and decoder reads data in memory - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # pass 1: decode all frames using cpp decoder - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - # pass 2: decode all frames using av - pyav_result = _decode_frames_by_av_module(full_path) - - self.check_separate_decoding_result(tv_result, config) - self.compare_decoding_result(tv_result, pyav_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_read_video_from_memory_get_pts_only(self, test_video, config): - """ - Test the case when video is already in memory, and decoder reads data in memory. - Compare frame pts between decoding for pts only and full decoding - for both pts and frame data - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # pass 1: decode all frames using cpp decoder - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert abs(config.video_fps - tv_result[3].item()) < 0.01 - - # pass 2: decode all frames to get PTS only using cpp decoder - tv_result_pts_only = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 1, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - assert not tv_result_pts_only[0].numel() - assert not tv_result_pts_only[5].numel() - self.compare_decoding_result(tv_result, tv_result_pts_only) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - @pytest.mark.parametrize("num_frames", [4, 8, 16, 32, 64, 128]) - def test_read_video_in_range_from_memory(self, test_video, config, num_frames): - """ - Test the case when video is already in memory, and decoder reads data in memory. - In addition, decoder takes meaningful start- and end PTS as input, and decode - frames within that interval - """ - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - # pass 1: decode all frames using new decoder - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - assert abs(config.video_fps - vfps.item()) < 0.01 - - start_pts_ind_max = vframe_pts.size(0) - num_frames - if start_pts_ind_max <= 0: - return - # randomly pick start pts - start_pts_ind = randint(0, start_pts_ind_max) - end_pts_ind = start_pts_ind + num_frames - 1 - video_start_pts = vframe_pts[start_pts_ind] - video_end_pts = vframe_pts[end_pts_ind] - - video_timebase_num, video_timebase_den = vtimebase[0], vtimebase[1] - if len(atimebase) > 0: - # when audio stream is available - audio_timebase_num, audio_timebase_den = atimebase[0], atimebase[1] - audio_start_pts = _pts_convert( - video_start_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_num.item(), audio_timebase_den.item()), - math.floor, - ) - audio_end_pts = _pts_convert( - video_end_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_num.item(), audio_timebase_den.item()), - math.ceil, - ) - - # pass 2: decode frames in the randomly generated range - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - # pass 3: decode frames in range using PyAv - video_timebase_av, audio_timebase_av = _get_timebase_by_av_module(full_path) - - video_start_pts_av = _pts_convert( - video_start_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(video_timebase_av.numerator, video_timebase_av.denominator), - math.floor, - ) - video_end_pts_av = _pts_convert( - video_end_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(video_timebase_av.numerator, video_timebase_av.denominator), - math.ceil, - ) - if audio_timebase_av: - audio_start_pts = _pts_convert( - video_start_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator), - math.floor, - ) - audio_end_pts = _pts_convert( - video_end_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator), - math.ceil, - ) - - pyav_result = _decode_frames_by_av_module( - full_path, - video_start_pts_av, - video_end_pts_av, - audio_start_pts, - audio_end_pts, - ) - - assert tv_result[0].size(0) == num_frames - if pyav_result.vframes.size(0) == num_frames: - # if PyAv decodes a different number of video frames, skip - # comparing the decoding results between Torchvision video reader - # and PyAv - self.compare_decoding_result(tv_result, pyav_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_probe_video_from_file(self, test_video, config): - """ - Test the case when decoder probes a video file - """ - full_path = os.path.join(VIDEO_DIR, test_video) - probe_result = torch.ops.video_reader.probe_video_from_file(full_path) - self.check_probe_result(probe_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_probe_video_from_memory(self, test_video, config): - """ - Test the case when decoder probes a video in memory - """ - _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - probe_result = torch.ops.video_reader.probe_video_from_memory(video_tensor) - self.check_probe_result(probe_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_probe_video_from_memory_script(self, test_video, config): - scripted_fun = torch.jit.script(io._probe_video_from_memory) - assert scripted_fun is not None - - _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - probe_result = scripted_fun(video_tensor) - self.check_meta_result(probe_result, config) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_memory_scripted(self, test_video): - """ - Test the case when video is already in memory, and decoder reads data in memory - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - scripted_fun = torch.jit.script(io._read_video_from_memory) - assert scripted_fun is not None - - _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # decode all frames using cpp decoder - scripted_fun( - video_tensor, - SEEK_FRAME_MARGIN, - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - [video_start_pts, video_end_pts], - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - [audio_start_pts, audio_end_pts], - audio_timebase_num, - audio_timebase_den, - ) - # FUTURE: check value of video / audio frames - - def test_invalid_file(self): - set_video_backend("video_reader") - with pytest.raises(RuntimeError): - io.read_video("foo.mp4") - - set_video_backend("pyav") - with pytest.raises(RuntimeError): - io.read_video("foo.mp4") - - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("backend", ["video_reader", "pyav"]) - @pytest.mark.parametrize("start_offset", [0, 500]) - @pytest.mark.parametrize("end_offset", [3000, None]) - def test_audio_present_pts(self, test_video, backend, start_offset, end_offset): - """Test if audio frames are returned with pts unit.""" - full_path = os.path.join(VIDEO_DIR, test_video) - container = av.open(full_path) - if container.streams.audio: - set_video_backend(backend) - _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="pts") - assert all([dimension > 0 for dimension in audio.shape[:2]]) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("backend", ["video_reader", "pyav"]) - @pytest.mark.parametrize("start_offset", [0, 0.1]) - @pytest.mark.parametrize("end_offset", [0.3, None]) - def test_audio_present_sec(self, test_video, backend, start_offset, end_offset): - """Test if audio frames are returned with sec unit.""" - full_path = os.path.join(VIDEO_DIR, test_video) - container = av.open(full_path) - if container.streams.audio: - set_video_backend(backend) - _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="sec") - assert all([dimension > 0 for dimension in audio.shape[:2]]) - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/test_videoapi.py b/test/test_videoapi.py deleted file mode 100644 index aabcf6407f7..00000000000 --- a/test/test_videoapi.py +++ /dev/null @@ -1,312 +0,0 @@ -import collections -import os -import urllib - -import pytest -import torch -import torchvision -from pytest import approx -from torchvision.datasets.utils import download_url -from torchvision.io import _HAS_CPU_VIDEO_DECODER, VideoReader - - -# WARNING: these tests have been skipped forever on the CI because the video ops -# are never properly available. This is bad, but things have been in a terrible -# state for a long time already as we write this comment, and we'll hopefully be -# able to get rid of this all soon. - - -try: - import av - - # Do a version test too - torchvision.io.video._check_av_available() -except ImportError: - av = None - - -VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos") - -CheckerConfig = ["duration", "video_fps", "audio_sample_rate"] -GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig)) - - -def backends(): - backends_ = ["video_reader"] - if av is not None: - backends_.append("pyav") - return backends_ - - -def fate(name, path="."): - """Download and return a path to a sample from the FFmpeg test suite. - See the `FFmpeg Automated Test Environment `_ - """ - - file_name = name.split("/")[1] - download_url("http://fate.ffmpeg.org/fate-suite/" + name, path, file_name) - return os.path.join(path, file_name) - - -test_videos = { - "RATRACE_wave_f_nm_np1_fr_goo_37.avi": GroundTruth(duration=2.0, video_fps=30.0, audio_sample_rate=None), - "SchoolRulesHowTheyHelpUs_wave_f_nm_np1_ba_med_0.avi": GroundTruth( - duration=2.0, video_fps=30.0, audio_sample_rate=None - ), - "TrumanShow_wave_f_nm_np1_fr_med_26.avi": GroundTruth(duration=2.0, video_fps=30.0, audio_sample_rate=None), - "v_SoccerJuggling_g23_c01.avi": GroundTruth(duration=8.0, video_fps=29.97, audio_sample_rate=None), - "v_SoccerJuggling_g24_c01.avi": GroundTruth(duration=8.0, video_fps=29.97, audio_sample_rate=None), - "R6llTwEh07w.mp4": GroundTruth(duration=10.0, video_fps=30.0, audio_sample_rate=44100), - "SOX5yA1l24A.mp4": GroundTruth(duration=11.0, video_fps=29.97, audio_sample_rate=48000), - "WUzgd7C1pWA.mp4": GroundTruth(duration=11.0, video_fps=29.97, audio_sample_rate=48000), -} - - -@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg") -class TestVideoApi: - @pytest.mark.skipif(av is None, reason="PyAV unavailable") - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("backend", backends()) - def test_frame_reading(self, test_video, backend): - torchvision.set_video_backend(backend) - full_path = os.path.join(VIDEO_DIR, test_video) - with av.open(full_path) as av_reader: - if av_reader.streams.video: - av_frames, vr_frames = [], [] - av_pts, vr_pts = [], [] - # get av frames - for av_frame in av_reader.decode(av_reader.streams.video[0]): - av_frames.append(torch.tensor(av_frame.to_rgb().to_ndarray()).permute(2, 0, 1)) - av_pts.append(av_frame.pts * av_frame.time_base) - - # get vr frames - video_reader = VideoReader(full_path, "video") - for vr_frame in video_reader: - vr_frames.append(vr_frame["data"]) - vr_pts.append(vr_frame["pts"]) - - # same number of frames - assert len(vr_frames) == len(av_frames) - assert len(vr_pts) == len(av_pts) - - # compare the frames and ptss - for i in range(len(vr_frames)): - assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1) - - mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float())) - # on average the difference is very small and caused - # by decoding (around 1%) - # TODO: asses empirically how to set this? atm it's 1% - # averaged over all frames - assert mean_delta.item() < 2.55 - - del vr_frames, av_frames, vr_pts, av_pts - - # test audio reading compared to PYAV - with av.open(full_path) as av_reader: - if av_reader.streams.audio: - av_frames, vr_frames = [], [] - av_pts, vr_pts = [], [] - # get av frames - for av_frame in av_reader.decode(av_reader.streams.audio[0]): - av_frames.append(torch.tensor(av_frame.to_ndarray()).permute(1, 0)) - av_pts.append(av_frame.pts * av_frame.time_base) - av_reader.close() - - # get vr frames - video_reader = VideoReader(full_path, "audio") - for vr_frame in video_reader: - vr_frames.append(vr_frame["data"]) - vr_pts.append(vr_frame["pts"]) - - # same number of frames - assert len(vr_frames) == len(av_frames) - assert len(vr_pts) == len(av_pts) - - # compare the frames and ptss - for i in range(len(vr_frames)): - assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1) - max_delta = torch.max(torch.abs(av_frames[i].float() - vr_frames[i].float())) - # we assure that there is never more than 1% difference in signal - assert max_delta.item() < 0.001 - - @pytest.mark.parametrize("stream", ["video", "audio"]) - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("backend", backends()) - def test_frame_reading_mem_vs_file(self, test_video, stream, backend): - torchvision.set_video_backend(backend) - full_path = os.path.join(VIDEO_DIR, test_video) - - reader = VideoReader(full_path) - reader_md = reader.get_metadata() - - if stream in reader_md: - # Test video reading from file vs from memory - vr_frames, vr_frames_mem = [], [] - vr_pts, vr_pts_mem = [], [] - # get vr frames - video_reader = VideoReader(full_path, stream) - for vr_frame in video_reader: - vr_frames.append(vr_frame["data"]) - vr_pts.append(vr_frame["pts"]) - - # get vr frames = read from memory - f = open(full_path, "rb") - fbytes = f.read() - f.close() - video_reader_from_mem = VideoReader(fbytes, stream) - - for vr_frame_from_mem in video_reader_from_mem: - vr_frames_mem.append(vr_frame_from_mem["data"]) - vr_pts_mem.append(vr_frame_from_mem["pts"]) - - # same number of frames - assert len(vr_frames) == len(vr_frames_mem) - assert len(vr_pts) == len(vr_pts_mem) - - # compare the frames and ptss - for i in range(len(vr_frames)): - assert vr_pts[i] == vr_pts_mem[i] - mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float())) - # on average the difference is very small and caused - # by decoding (around 1%) - # TODO: asses empirically how to set this? atm it's 1% - # averaged over all frames - assert mean_delta.item() < 2.55 - - del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem - else: - del reader, reader_md - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - @pytest.mark.parametrize("backend", backends()) - def test_metadata(self, test_video, config, backend): - """ - Test that the metadata returned via pyav corresponds to the one returned - by the new video decoder API - """ - torchvision.set_video_backend(backend) - full_path = os.path.join(VIDEO_DIR, test_video) - reader = VideoReader(full_path, "video") - reader_md = reader.get_metadata() - assert config.video_fps == approx(reader_md["video"]["fps"][0], abs=0.0001) - assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("backend", backends()) - def test_seek_start(self, test_video, backend): - torchvision.set_video_backend(backend) - full_path = os.path.join(VIDEO_DIR, test_video) - video_reader = VideoReader(full_path, "video") - num_frames = 0 - for _ in video_reader: - num_frames += 1 - - # now seek the container to 0 and do it again - # It's often that starting seek can be inprecise - # this way and it doesn't start at 0 - video_reader.seek(0) - start_num_frames = 0 - for _ in video_reader: - start_num_frames += 1 - - assert start_num_frames == num_frames - - # now seek the container to < 0 to check for unexpected behaviour - video_reader.seek(-1) - start_num_frames = 0 - for _ in video_reader: - start_num_frames += 1 - - assert start_num_frames == num_frames - - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("backend", ["video_reader"]) - def test_accurateseek_middle(self, test_video, backend): - torchvision.set_video_backend(backend) - full_path = os.path.join(VIDEO_DIR, test_video) - stream = "video" - video_reader = VideoReader(full_path, stream) - md = video_reader.get_metadata() - duration = md[stream]["duration"][0] - if duration is not None: - num_frames = 0 - for _ in video_reader: - num_frames += 1 - - video_reader.seek(duration / 2) - middle_num_frames = 0 - for _ in video_reader: - middle_num_frames += 1 - - assert middle_num_frames < num_frames - assert middle_num_frames == approx(num_frames // 2, abs=1) - - video_reader.seek(duration / 2) - frame = next(video_reader) - lb = duration / 2 - 1 / md[stream]["fps"][0] - ub = duration / 2 + 1 / md[stream]["fps"][0] - assert (lb <= frame["pts"]) and (ub >= frame["pts"]) - - def test_fate_suite(self): - # TODO: remove the try-except statement once the connectivity issues are resolved - try: - video_path = fate("sub/MovText_capability_tester.mp4", VIDEO_DIR) - except (urllib.error.URLError, ConnectionError) as error: - pytest.skip(f"Skipping due to connectivity issues: {error}") - vr = VideoReader(video_path) - metadata = vr.get_metadata() - - assert metadata["subtitles"]["duration"] is not None - os.remove(video_path) - - @pytest.mark.skipif(av is None, reason="PyAV unavailable") - @pytest.mark.parametrize("test_video,config", test_videos.items()) - @pytest.mark.parametrize("backend", backends()) - def test_keyframe_reading(self, test_video, config, backend): - torchvision.set_video_backend(backend) - full_path = os.path.join(VIDEO_DIR, test_video) - - av_reader = av.open(full_path) - # reduce streams to only keyframes - av_stream = av_reader.streams.video[0] - av_stream.codec_context.skip_frame = "NONKEY" - - av_keyframes = [] - vr_keyframes = [] - if av_reader.streams.video: - - # get all keyframes using pyav. Then, seek randomly into video reader - # and assert that all the returned values are in AV_KEYFRAMES - - for av_frame in av_reader.decode(av_stream): - av_keyframes.append(float(av_frame.pts * av_frame.time_base)) - - if len(av_keyframes) > 1: - video_reader = VideoReader(full_path, "video") - for i in range(1, len(av_keyframes)): - seek_val = (av_keyframes[i] + av_keyframes[i - 1]) / 2 - data = next(video_reader.seek(seek_val, True)) - vr_keyframes.append(data["pts"]) - - data = next(video_reader.seek(config.duration, True)) - vr_keyframes.append(data["pts"]) - - assert len(av_keyframes) == len(vr_keyframes) - # NOTE: this video gets different keyframe with different - # loaders (0.333 pyav, 0.666 for us) - if test_video != "TrumanShow_wave_f_nm_np1_fr_med_26.avi": - for i in range(len(av_keyframes)): - assert av_keyframes[i] == approx(vr_keyframes[i], rel=0.001) - - def test_src(self): - with pytest.raises(ValueError, match="src cannot be empty"): - VideoReader(src="") - with pytest.raises(ValueError, match="src must be either string"): - VideoReader(src=2) - with pytest.raises(TypeError, match="unexpected keyword argument"): - VideoReader(path="path") - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/torchvision/__init__.py b/torchvision/__init__.py index d47c70d4074..601f786e374 100644 --- a/torchvision/__init__.py +++ b/torchvision/__init__.py @@ -45,26 +45,11 @@ def set_video_backend(backend): Specifies the package used to decode videos. Args: - backend (string): Name of the video backend. one of {'pyav', 'video_reader'}. + backend (string): Name of the video backend. Only 'pyav' is supported. The :mod:`pyav` package uses the 3rd party PyAv library. It is a Pythonic binding for the FFmpeg libraries. - The :mod:`video_reader` package includes a native C++ implementation on - top of FFMPEG libraries, and a python API of TorchScript custom operator. - It generally decodes faster than :mod:`pyav`, but is perhaps less robust. - - .. note:: - Building with FFMPEG is disabled by default in the latest `main`. If you want to use the 'video_reader' - backend, please compile torchvision from source. """ - global _video_backend - if backend not in ["pyav", "video_reader"]: - raise ValueError("Invalid video backend '%s'. Options are 'pyav' and 'video_reader'" % backend) - if backend == "video_reader" and not io._HAS_CPU_VIDEO_DECODER: - # TODO: better messages - message = "video_reader video backend is not available. Please compile torchvision from source and try again" - raise RuntimeError(message) - else: - _video_backend = backend + pass def get_video_backend(): @@ -72,7 +57,7 @@ def get_video_backend(): Returns the currently active video backend used to decode videos. Returns: - str: Name of the video backend. one of {'pyav', 'video_reader'}. + str: Name of the video backend. Currently only 'pyav' is supported. """ return _video_backend diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py index d9214beaa68..ad26299cff6 100644 --- a/torchvision/datasets/video_utils.py +++ b/torchvision/datasets/video_utils.py @@ -1,29 +1,16 @@ import bisect import math import warnings -from fractions import Fraction -from typing import Any, Callable, cast, Optional, TypeVar, Union +from typing import Any, Optional, TypeVar, Union import torch -from torchvision.io import _probe_video_from_file, _read_video_from_file, read_video, read_video_timestamps +from torchvision.io import read_video, read_video_timestamps from .utils import tqdm T = TypeVar("T") -def pts_convert(pts: int, timebase_from: Fraction, timebase_to: Fraction, round_func: Callable = math.floor) -> int: - """convert pts between different time bases - Args: - pts: presentation timestamp, float - timebase_from: original timebase. Fraction - timebase_to: new timebase. Fraction - round_func: rounding function. - """ - new_pts = Fraction(pts, 1) * timebase_from / timebase_to - return round_func(new_pts) - - def unfold(tensor: torch.Tensor, size: int, step: int, dilation: int = 1) -> torch.Tensor: """ similar to tensor.unfold, but with the dilation @@ -305,60 +292,9 @@ def get_clip(self, idx: int) -> tuple[torch.Tensor, torch.Tensor, dict[str, Any] video_path = self.video_paths[video_idx] clip_pts = self.clips[video_idx][clip_idx] - from torchvision import get_video_backend - - backend = get_video_backend() - - if backend == "pyav": - # check for invalid options - if self._video_width != 0: - raise ValueError("pyav backend doesn't support _video_width != 0") - if self._video_height != 0: - raise ValueError("pyav backend doesn't support _video_height != 0") - if self._video_min_dimension != 0: - raise ValueError("pyav backend doesn't support _video_min_dimension != 0") - if self._video_max_dimension != 0: - raise ValueError("pyav backend doesn't support _video_max_dimension != 0") - if self._audio_samples != 0: - raise ValueError("pyav backend doesn't support _audio_samples != 0") - - if backend == "pyav": - start_pts = clip_pts[0].item() - end_pts = clip_pts[-1].item() - video, audio, info = read_video(video_path, start_pts, end_pts) - else: - _info = _probe_video_from_file(video_path) - video_fps = _info.video_fps - audio_fps = None - - video_start_pts = cast(int, clip_pts[0].item()) - video_end_pts = cast(int, clip_pts[-1].item()) - - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase = Fraction(0, 1) - video_timebase = Fraction(_info.video_timebase.numerator, _info.video_timebase.denominator) - if _info.has_audio: - audio_timebase = Fraction(_info.audio_timebase.numerator, _info.audio_timebase.denominator) - audio_start_pts = pts_convert(video_start_pts, video_timebase, audio_timebase, math.floor) - audio_end_pts = pts_convert(video_end_pts, video_timebase, audio_timebase, math.ceil) - audio_fps = _info.audio_sample_rate - video, audio, _ = _read_video_from_file( - video_path, - video_width=self._video_width, - video_height=self._video_height, - video_min_dimension=self._video_min_dimension, - video_max_dimension=self._video_max_dimension, - video_pts_range=(video_start_pts, video_end_pts), - video_timebase=video_timebase, - audio_samples=self._audio_samples, - audio_channels=self._audio_channels, - audio_pts_range=(audio_start_pts, audio_end_pts), - audio_timebase=audio_timebase, - ) - - info = {"video_fps": video_fps} - if audio_fps is not None: - info["audio_fps"] = audio_fps + start_pts = clip_pts[0].item() + end_pts = clip_pts[-1].item() + video, audio, info = read_video(video_path, start_pts, end_pts) if self.frame_rate is not None: resampling_idx = self.resampling_idxs[video_idx][clip_idx] diff --git a/torchvision/io/__init__.py b/torchvision/io/__init__.py index 3c5c13482f5..a486b0275e1 100644 --- a/torchvision/io/__init__.py +++ b/torchvision/io/__init__.py @@ -16,63 +16,7 @@ VideoReader, ) except ImportError: - # OSS fallback - video_reader backend not available - _HAS_CPU_VIDEO_DECODER = False - _HAS_VIDEO_OPT = False - - def _stub_not_available(*args, **kwargs): - raise RuntimeError( - "video_reader backend is not available in open-source torchvision. " "Use PyAV or TorchCodec instead." - ) - - _probe_video_from_file = _stub_not_available - _probe_video_from_memory = _stub_not_available - _read_video_from_file = _stub_not_available - _read_video_from_memory = _stub_not_available - _read_video_timestamps_from_file = _stub_not_available - _read_video_timestamps_from_memory = _stub_not_available - - class Timebase: # type: ignore[no-redef] - __annotations__ = {"numerator": int, "denominator": int} - __slots__ = ["numerator", "denominator"] - - def __init__(self, numerator: int = 0, denominator: int = 1) -> None: - self.numerator = numerator - self.denominator = denominator - - class VideoMetaData: # type: ignore[no-redef] - pass - - class VideoReader: # type: ignore[no-redef] - def __init__(self, *args, **kwargs): - raise RuntimeError( - "VideoReader with video_reader backend is not available. " - "Use backend='pyav' or migrate to TorchCodec." - ) - - def __iter__(self): - return self - - def __next__(self): - raise StopIteration - - # Stub module for _video_opt to prevent circular import issues - # This module is imported by video.py - import types - from fractions import Fraction - - _video_opt = types.ModuleType("_video_opt") - _video_opt._HAS_VIDEO_OPT = False - _video_opt.default_timebase = Fraction(0, 1) - - def _read_video_stub(filename, start_pts, end_pts, pts_unit): - raise RuntimeError("video_reader backend is not available. Use backend='pyav'.") - - def _read_video_timestamps_stub(filename, pts_unit): - raise RuntimeError("video_reader backend is not available. Use backend='pyav'.") - - _video_opt._read_video = _read_video_stub - _video_opt._read_video_timestamps = _read_video_timestamps_stub + pass from .image import ( decode_avif, @@ -98,18 +42,6 @@ def _read_video_timestamps_stub(filename, pts_unit): "write_video", "read_video", "read_video_timestamps", - "_read_video_from_file", - "_read_video_timestamps_from_file", - "_probe_video_from_file", - "_read_video_from_memory", - "_read_video_timestamps_from_memory", - "_probe_video_from_memory", - "_HAS_CPU_VIDEO_DECODER", - "_HAS_VIDEO_OPT", - "_read_video_clip_from_memory", - "_read_video_meta_data", - "VideoMetaData", - "Timebase", "ImageReadMode", "decode_image", "decode_jpeg", @@ -125,6 +57,4 @@ def _read_video_timestamps_stub(filename, pts_unit): "write_file", "write_jpeg", "write_png", - "Video", - "VideoReader", ] diff --git a/torchvision/io/video.py b/torchvision/io/video.py index 14edcf50aaa..5331b764d27 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -1,6 +1,5 @@ import gc import math -import os import re import warnings from fractions import Fraction @@ -10,7 +9,6 @@ import torch from ..utils import _log_api_usage_once -from . import _video_opt from ._video_deprecation_warning import _raise_video_deprecation_warning try: @@ -311,79 +309,70 @@ def read_video( if output_format not in ("THWC", "TCHW"): raise ValueError(f"output_format should be either 'THWC' or 'TCHW', got {output_format}.") - from torchvision import get_video_backend + _check_av_available() + + if end_pts is None: + end_pts = float("inf") + + if end_pts < start_pts: + raise ValueError(f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}") + + info = {} + video_frames = [] + audio_frames = [] + audio_timebase = Fraction(0, 1) + + try: + with av.open(filename, metadata_errors="ignore") as container: + if container.streams.audio: + audio_timebase = container.streams.audio[0].time_base + if container.streams.video: + video_frames = _read_from_stream( + container, + start_pts, + end_pts, + pts_unit, + container.streams.video[0], + {"video": 0}, + ) + video_fps = container.streams.video[0].average_rate + # guard against potentially corrupted files + if video_fps is not None: + info["video_fps"] = float(video_fps) + + if container.streams.audio: + audio_frames = _read_from_stream( + container, + start_pts, + end_pts, + pts_unit, + container.streams.audio[0], + {"audio": 0}, + ) + info["audio_fps"] = container.streams.audio[0].rate + + except FFmpegError: + # TODO raise a warning? + pass - if get_video_backend() != "pyav": - if not os.path.exists(filename): - raise RuntimeError(f"File not found: {filename}") - vframes, aframes, info = _video_opt._read_video(filename, start_pts, end_pts, pts_unit) + vframes_list = [frame.to_rgb().to_ndarray() for frame in video_frames] + aframes_list = [frame.to_ndarray() for frame in audio_frames] + + if vframes_list: + vframes = torch.as_tensor(np.stack(vframes_list)) + else: + vframes = torch.empty((0, 1, 1, 3), dtype=torch.uint8) + + if aframes_list: + aframes = np.concatenate(aframes_list, 1) + aframes = torch.as_tensor(aframes) + if pts_unit == "sec": + start_pts = int(math.floor(start_pts * (1 / audio_timebase))) + if end_pts != float("inf"): + end_pts = int(math.ceil(end_pts * (1 / audio_timebase))) + aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts) else: - _check_av_available() - - if end_pts is None: - end_pts = float("inf") - - if end_pts < start_pts: - raise ValueError( - f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}" - ) - - info = {} - video_frames = [] - audio_frames = [] - audio_timebase = _video_opt.default_timebase - - try: - with av.open(filename, metadata_errors="ignore") as container: - if container.streams.audio: - audio_timebase = container.streams.audio[0].time_base - if container.streams.video: - video_frames = _read_from_stream( - container, - start_pts, - end_pts, - pts_unit, - container.streams.video[0], - {"video": 0}, - ) - video_fps = container.streams.video[0].average_rate - # guard against potentially corrupted files - if video_fps is not None: - info["video_fps"] = float(video_fps) - - if container.streams.audio: - audio_frames = _read_from_stream( - container, - start_pts, - end_pts, - pts_unit, - container.streams.audio[0], - {"audio": 0}, - ) - info["audio_fps"] = container.streams.audio[0].rate - - except FFmpegError: - # TODO raise a warning? - pass - - vframes_list = [frame.to_rgb().to_ndarray() for frame in video_frames] - aframes_list = [frame.to_ndarray() for frame in audio_frames] - - if vframes_list: - vframes = torch.as_tensor(np.stack(vframes_list)) - else: - vframes = torch.empty((0, 1, 1, 3), dtype=torch.uint8) - - if aframes_list: - aframes = np.concatenate(aframes_list, 1) - aframes = torch.as_tensor(aframes) - if pts_unit == "sec": - start_pts = int(math.floor(start_pts * (1 / audio_timebase))) - if end_pts != float("inf"): - end_pts = int(math.ceil(end_pts * (1 / audio_timebase))) - aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts) - else: - aframes = torch.empty((1, 0), dtype=torch.float32) + aframes = torch.empty((1, 0), dtype=torch.float32) if output_format == "TCHW": # [T,H,W,C] --> [T,C,H,W] @@ -436,10 +425,6 @@ def read_video_timestamps(filename: str, pts_unit: str = "pts") -> tuple[list[in _raise_video_deprecation_warning() if not torch.jit.is_scripting() and not torch.jit.is_tracing(): _log_api_usage_once(read_video_timestamps) - from torchvision import get_video_backend - - if get_video_backend() != "pyav": - return _video_opt._read_video_timestamps(filename, pts_unit) _check_av_available()