diff --git a/docs/source/io.rst b/docs/source/io.rst
index 478321a4e6d..a77dad39986 100644
--- a/docs/source/io.rst
+++ b/docs/source/io.rst
@@ -104,16 +104,3 @@ Video - DEPREACTED
     read_video
     read_video_timestamps
     write_video
-
-
-**Fine-grained video API**
-
-In addition to the :mod:`read_video` function, we provide a high-performance 
-lower-level API for more fine-grained control compared to the :mod:`read_video` function.
-It does all this whilst fully supporting torchscript.
-
-.. autosummary::
-    :toctree: generated/
-    :template: class.rst
-
-    VideoReader
diff --git a/test/test_datasets_video_utils_opt.py b/test/test_datasets_video_utils_opt.py
deleted file mode 100644
index 5e6b19bfb95..00000000000
--- a/test/test_datasets_video_utils_opt.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import unittest
-
-import test_datasets_video_utils
-from torchvision import set_video_backend  # noqa: 401
-
-# Disabling the video backend switching temporarily
-# set_video_backend('video_reader')
-
-
-if __name__ == "__main__":
-    suite = unittest.TestLoader().loadTestsFromModule(test_datasets_video_utils)
-    unittest.TextTestRunner(verbosity=1).run(suite)
diff --git a/test/test_io.py b/test/test_io.py
index d2950ac9595..84d30ee3297 100644
--- a/test/test_io.py
+++ b/test/test_io.py
@@ -7,7 +7,6 @@
 import torch
 import torchvision.io as io
 from common_utils import assert_equal, cpu_and_cuda
-from torchvision import get_video_backend
 
 
 try:
@@ -45,12 +44,7 @@ def temp_video(num_frames, height, width, fps, lossless=False, video_codec=None,
         options = {"crf": "0"}
 
     if video_codec is None:
-        if get_video_backend() == "pyav":
-            video_codec = "libx264"
-        else:
-            # when video_codec is not set, we assume it is libx264rgb which accepts
-            # RGB pixel formats as input instead of YUV
-            video_codec = "libx264rgb"
+        video_codec = "libx264"
     if options is None:
         options = {}
 
@@ -62,9 +56,6 @@ def temp_video(num_frames, height, width, fps, lossless=False, video_codec=None,
     os.unlink(f.name)
 
 
-@pytest.mark.skipif(
-    get_video_backend() != "pyav" and not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend not available"
-)
 @pytest.mark.skipif(av is None, reason="PyAV unavailable")
 class TestVideo:
     # compression adds artifacts, thus we add a tolerance of
@@ -77,22 +68,6 @@ def test_write_read_video(self):
             assert_equal(data, lv)
             assert info["video_fps"] == 5
 
-    @pytest.mark.skipif(not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend is not chosen")
-    def test_probe_video_from_file(self):
-        with temp_video(10, 300, 300, 5) as (f_name, data):
-            video_info = io._probe_video_from_file(f_name)
-            assert pytest.approx(2, rel=0.0, abs=0.1) == video_info.video_duration
-            assert pytest.approx(5, rel=0.0, abs=0.1) == video_info.video_fps
-
-    @pytest.mark.skipif(not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend is not chosen")
-    def test_probe_video_from_memory(self):
-        with temp_video(10, 300, 300, 5) as (f_name, data):
-            with open(f_name, "rb") as fp:
-                filebuffer = fp.read()
-            video_info = io._probe_video_from_memory(filebuffer)
-            assert pytest.approx(2, rel=0.0, abs=0.1) == video_info.video_duration
-            assert pytest.approx(5, rel=0.0, abs=0.1) == video_info.video_fps
-
     def test_read_timestamps(self):
         with temp_video(10, 300, 300, 5) as (f_name, data):
             pts, _ = io.read_video_timestamps(f_name)
@@ -118,12 +93,9 @@ def test_read_partial_video(self, start, offset):
             assert len(lv) == offset
             assert_equal(s_data, lv)
 
-            if get_video_backend() == "pyav":
-                # for "video_reader" backend, we don't decode the closest early frame
-                # when the given start pts is not matching any frame pts
-                lv, _, _ = io.read_video(f_name, pts[4] + 1, pts[7])
-                assert len(lv) == 4
-                assert_equal(data[4:8], lv)
+            lv, _, _ = io.read_video(f_name, pts[4] + 1, pts[7])
+            assert len(lv) == 4
+            assert_equal(data[4:8], lv)
 
     @pytest.mark.parametrize("start", range(0, 80, 20))
     @pytest.mark.parametrize("offset", range(1, 4))
@@ -139,13 +111,8 @@ def test_read_partial_video_bframes(self, start, offset):
             assert_equal(s_data, lv, rtol=0.0, atol=self.TOLERANCE)
 
             lv, _, _ = io.read_video(f_name, pts[4] + 1, pts[7])
-            # TODO fix this
-            if get_video_backend() == "pyav":
-                assert len(lv) == 4
-                assert_equal(data[4:8], lv, rtol=0.0, atol=self.TOLERANCE)
-            else:
-                assert len(lv) == 3
-                assert_equal(data[5:8], lv, rtol=0.0, atol=self.TOLERANCE)
+            assert len(lv) == 4
+            assert_equal(data[4:8], lv, rtol=0.0, atol=self.TOLERANCE)
 
     def test_read_packed_b_frames_divx_file(self):
         name = "hmdb51_Turnk_r_Pippi_Michel_cartwheel_f_cm_np2_le_med_6.avi"
@@ -207,11 +174,8 @@ def test_read_partial_video_pts_unit_sec(self, start, offset):
                 lv, _, _ = io.read_video(
                     f_name, int(pts[4] * (1.0 / stream.time_base) + 1) * stream.time_base, pts[7], pts_unit="sec"
                 )
-            if get_video_backend() == "pyav":
-                # for "video_reader" backend, we don't decode the closest early frame
-                # when the given start pts is not matching any frame pts
-                assert len(lv) == 4
-                assert_equal(data[4:8], lv)
+            assert len(lv) == 4
+            assert_equal(data[4:8], lv)
 
     def test_read_video_corrupted_file(self):
         with tempfile.NamedTemporaryFile(suffix=".mp4") as f:
@@ -243,11 +207,7 @@ def test_read_video_partially_corrupted_file(self):
             # this exercises the container.decode assertion check
             video, audio, info = io.read_video(f.name, pts_unit="sec")
             # check that size is not equal to 5, but 3
-            # TODO fix this
-            if get_video_backend() == "pyav":
-                assert len(video) == 3
-            else:
-                assert len(video) == 4
+            assert len(video) == 3
             # but the valid decoded content is still correct
             assert_equal(video[:3], data[:3])
             # and the last few frames are wrong
diff --git a/test/test_io_opt.py b/test/test_io_opt.py
deleted file mode 100644
index f4e3d305295..00000000000
--- a/test/test_io_opt.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import unittest
-
-import test_io
-from torchvision import set_video_backend  # noqa: 401
-
-
-# Disabling the video backend switching temporarily
-# set_video_backend('video_reader')
-
-
-if __name__ == "__main__":
-    suite = unittest.TestLoader().loadTestsFromModule(test_io)
-    unittest.TextTestRunner(verbosity=1).run(suite)
diff --git a/test/test_video_reader.py b/test/test_video_reader.py
deleted file mode 100644
index 10995424982..00000000000
--- a/test/test_video_reader.py
+++ /dev/null
@@ -1,1254 +0,0 @@
-import collections
-import math
-import os
-from fractions import Fraction
-
-import numpy as np
-import pytest
-import torch
-import torchvision.io as io
-from common_utils import assert_equal
-from numpy.random import randint
-from pytest import approx
-from torchvision import set_video_backend
-from torchvision.io import _HAS_CPU_VIDEO_DECODER
-
-
-try:
-    import av
-
-    # Do a version test too
-    io.video._check_av_available()
-except ImportError:
-    av = None
-
-
-VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos")
-
-CheckerConfig = [
-    "duration",
-    "video_fps",
-    "audio_sample_rate",
-    # We find for some videos (e.g. HMDB51 videos), the decoded audio frames and pts are
-    # slightly different between TorchVision decoder and PyAv decoder. So omit it during check
-    "check_aframes",
-    "check_aframe_pts",
-]
-GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig))
-
-all_check_config = GroundTruth(
-    duration=0,
-    video_fps=0,
-    audio_sample_rate=0,
-    check_aframes=True,
-    check_aframe_pts=True,
-)
-
-test_videos = {
-    "RATRACE_wave_f_nm_np1_fr_goo_37.avi": GroundTruth(
-        duration=2.0,
-        video_fps=30.0,
-        audio_sample_rate=None,
-        check_aframes=True,
-        check_aframe_pts=True,
-    ),
-    "SchoolRulesHowTheyHelpUs_wave_f_nm_np1_ba_med_0.avi": GroundTruth(
-        duration=2.0,
-        video_fps=30.0,
-        audio_sample_rate=None,
-        check_aframes=True,
-        check_aframe_pts=True,
-    ),
-    "TrumanShow_wave_f_nm_np1_fr_med_26.avi": GroundTruth(
-        duration=2.0,
-        video_fps=30.0,
-        audio_sample_rate=None,
-        check_aframes=True,
-        check_aframe_pts=True,
-    ),
-    "v_SoccerJuggling_g23_c01.avi": GroundTruth(
-        duration=8.0,
-        video_fps=29.97,
-        audio_sample_rate=None,
-        check_aframes=True,
-        check_aframe_pts=True,
-    ),
-    "v_SoccerJuggling_g24_c01.avi": GroundTruth(
-        duration=8.0,
-        video_fps=29.97,
-        audio_sample_rate=None,
-        check_aframes=True,
-        check_aframe_pts=True,
-    ),
-    "R6llTwEh07w.mp4": GroundTruth(
-        duration=10.0,
-        video_fps=30.0,
-        audio_sample_rate=44100,
-        # PyAv miss one audio frame at the beginning (pts=0)
-        check_aframes=False,
-        check_aframe_pts=False,
-    ),
-    "SOX5yA1l24A.mp4": GroundTruth(
-        duration=11.0,
-        video_fps=29.97,
-        audio_sample_rate=48000,
-        # PyAv miss one audio frame at the beginning (pts=0)
-        check_aframes=False,
-        check_aframe_pts=False,
-    ),
-    "WUzgd7C1pWA.mp4": GroundTruth(
-        duration=11.0,
-        video_fps=29.97,
-        audio_sample_rate=48000,
-        # PyAv miss one audio frame at the beginning (pts=0)
-        check_aframes=False,
-        check_aframe_pts=False,
-    ),
-}
-
-
-DecoderResult = collections.namedtuple("DecoderResult", "vframes vframe_pts vtimebase aframes aframe_pts atimebase")
-
-# av_seek_frame is imprecise so seek to a timestamp earlier by a margin
-# The unit of margin is second
-SEEK_FRAME_MARGIN = 0.25
-
-
-def _read_from_stream(container, start_pts, end_pts, stream, stream_name, buffer_size=4):
-    """
-    Args:
-        container: pyav container
-        start_pts/end_pts: the starting/ending Presentation TimeStamp where
-            frames are read
-        stream: pyav stream
-        stream_name: a dictionary of streams. For example, {"video": 0} means
-            video stream at stream index 0
-        buffer_size: pts of frames decoded by PyAv is not guaranteed to be in
-            ascending order. We need to decode more frames even when we meet end
-            pts
-    """
-    # seeking in the stream is imprecise. Thus, seek to an earlier PTS by a margin
-    margin = 1
-    seek_offset = max(start_pts - margin, 0)
-
-    container.seek(seek_offset, any_frame=False, backward=True, stream=stream)
-    frames = {}
-    buffer_count = 0
-    for frame in container.decode(**stream_name):
-        if frame.pts < start_pts:
-            continue
-        if frame.pts <= end_pts:
-            frames[frame.pts] = frame
-        else:
-            buffer_count += 1
-            if buffer_count >= buffer_size:
-                break
-    result = [frames[pts] for pts in sorted(frames)]
-
-    return result
-
-
-def _get_timebase_by_av_module(full_path):
-    container = av.open(full_path)
-    video_time_base = container.streams.video[0].time_base
-    if container.streams.audio:
-        audio_time_base = container.streams.audio[0].time_base
-    else:
-        audio_time_base = None
-    return video_time_base, audio_time_base
-
-
-def _fraction_to_tensor(fraction):
-    ret = torch.zeros([2], dtype=torch.int32)
-    ret[0] = fraction.numerator
-    ret[1] = fraction.denominator
-    return ret
-
-
-def _decode_frames_by_av_module(
-    full_path,
-    video_start_pts=0,
-    video_end_pts=None,
-    audio_start_pts=0,
-    audio_end_pts=None,
-):
-    """
-    Use PyAv to decode video frames. This provides a reference for our decoder
-    to compare the decoding results.
-    Input arguments:
-        full_path: video file path
-        video_start_pts/video_end_pts: the starting/ending Presentation TimeStamp where
-            frames are read
-    """
-    if video_end_pts is None:
-        video_end_pts = float("inf")
-    if audio_end_pts is None:
-        audio_end_pts = float("inf")
-    container = av.open(full_path)
-
-    video_frames = []
-    vtimebase = torch.zeros([0], dtype=torch.int32)
-    if container.streams.video:
-        video_frames = _read_from_stream(
-            container,
-            video_start_pts,
-            video_end_pts,
-            container.streams.video[0],
-            {"video": 0},
-        )
-        # container.streams.video[0].average_rate is not a reliable estimator of
-        # frame rate. It can be wrong for certain codec, such as VP80
-        # So we do not return video fps here
-        vtimebase = _fraction_to_tensor(container.streams.video[0].time_base)
-
-    audio_frames = []
-    atimebase = torch.zeros([0], dtype=torch.int32)
-    if container.streams.audio:
-        audio_frames = _read_from_stream(
-            container,
-            audio_start_pts,
-            audio_end_pts,
-            container.streams.audio[0],
-            {"audio": 0},
-        )
-        atimebase = _fraction_to_tensor(container.streams.audio[0].time_base)
-
-    container.close()
-    vframes = [frame.to_rgb().to_ndarray() for frame in video_frames]
-    vframes = torch.as_tensor(np.stack(vframes))
-
-    vframe_pts = torch.tensor([frame.pts for frame in video_frames], dtype=torch.int64)
-
-    aframes = [frame.to_ndarray() for frame in audio_frames]
-    if aframes:
-        aframes = np.transpose(np.concatenate(aframes, axis=1))
-        aframes = torch.as_tensor(aframes)
-    else:
-        aframes = torch.empty((1, 0), dtype=torch.float32)
-
-    aframe_pts = torch.tensor([audio_frame.pts for audio_frame in audio_frames], dtype=torch.int64)
-
-    return DecoderResult(
-        vframes=vframes,
-        vframe_pts=vframe_pts,
-        vtimebase=vtimebase,
-        aframes=aframes,
-        aframe_pts=aframe_pts,
-        atimebase=atimebase,
-    )
-
-
-def _pts_convert(pts, timebase_from, timebase_to, round_func=math.floor):
-    """convert pts between different time bases
-    Args:
-        pts: presentation timestamp, float
-        timebase_from: original timebase. Fraction
-        timebase_to: new timebase. Fraction
-        round_func: rounding function.
-    """
-    new_pts = Fraction(pts, 1) * timebase_from / timebase_to
-    return int(round_func(new_pts))
-
-
-def _get_video_tensor(video_dir, video_file):
-    """open a video file, and represent the video data by a PT tensor"""
-    full_path = os.path.join(video_dir, video_file)
-
-    assert os.path.exists(full_path), "File not found: %s" % full_path
-
-    with open(full_path, "rb") as fp:
-        video_tensor = torch.frombuffer(fp.read(), dtype=torch.uint8)
-
-    return full_path, video_tensor
-
-
-@pytest.mark.skipif(av is None, reason="PyAV unavailable")
-@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg")
-class TestVideoReader:
-    def check_separate_decoding_result(self, tv_result, config):
-        """check the decoding results from TorchVision decoder"""
-        (
-            vframes,
-            vframe_pts,
-            vtimebase,
-            vfps,
-            vduration,
-            aframes,
-            aframe_pts,
-            atimebase,
-            asample_rate,
-            aduration,
-        ) = tv_result
-
-        video_duration = vduration.item() * Fraction(vtimebase[0].item(), vtimebase[1].item())
-        assert video_duration == approx(config.duration, abs=0.5)
-
-        assert vfps.item() == approx(config.video_fps, abs=0.5)
-
-        if asample_rate.numel() > 0:
-            assert asample_rate.item() == config.audio_sample_rate
-            audio_duration = aduration.item() * Fraction(atimebase[0].item(), atimebase[1].item())
-            assert audio_duration == approx(config.duration, abs=0.5)
-
-        # check if pts of video frames are sorted in ascending order
-        for i in range(len(vframe_pts) - 1):
-            assert vframe_pts[i] < vframe_pts[i + 1]
-
-        if len(aframe_pts) > 1:
-            # check if pts of audio frames are sorted in ascending order
-            for i in range(len(aframe_pts) - 1):
-                assert aframe_pts[i] < aframe_pts[i + 1]
-
-    def check_probe_result(self, result, config):
-        vtimebase, vfps, vduration, atimebase, asample_rate, aduration = result
-        video_duration = vduration.item() * Fraction(vtimebase[0].item(), vtimebase[1].item())
-        assert video_duration == approx(config.duration, abs=0.5)
-        assert vfps.item() == approx(config.video_fps, abs=0.5)
-        if asample_rate.numel() > 0:
-            assert asample_rate.item() == config.audio_sample_rate
-            audio_duration = aduration.item() * Fraction(atimebase[0].item(), atimebase[1].item())
-            assert audio_duration == approx(config.duration, abs=0.5)
-
-    def check_meta_result(self, result, config):
-        assert result.video_duration == approx(config.duration, abs=0.5)
-        assert result.video_fps == approx(config.video_fps, abs=0.5)
-        if result.has_audio > 0:
-            assert result.audio_sample_rate == config.audio_sample_rate
-            assert result.audio_duration == approx(config.duration, abs=0.5)
-
-    def compare_decoding_result(self, tv_result, ref_result, config=all_check_config):
-        """
-        Compare decoding results from two sources.
-        Args:
-            tv_result: decoding results from TorchVision decoder
-            ref_result: reference decoding results which can be from either PyAv
-                        decoder or TorchVision decoder with getPtsOnly = 1
-            config: config of decoding results checker
-        """
-        (
-            vframes,
-            vframe_pts,
-            vtimebase,
-            _vfps,
-            _vduration,
-            aframes,
-            aframe_pts,
-            atimebase,
-            _asample_rate,
-            _aduration,
-        ) = tv_result
-        if isinstance(ref_result, list):
-            # the ref_result is from new video_reader decoder
-            ref_result = DecoderResult(
-                vframes=ref_result[0],
-                vframe_pts=ref_result[1],
-                vtimebase=ref_result[2],
-                aframes=ref_result[5],
-                aframe_pts=ref_result[6],
-                atimebase=ref_result[7],
-            )
-
-        if vframes.numel() > 0 and ref_result.vframes.numel() > 0:
-            mean_delta = torch.mean(torch.abs(vframes.float() - ref_result.vframes.float()))
-            assert mean_delta == approx(0.0, abs=8.0)
-
-        mean_delta = torch.mean(torch.abs(vframe_pts.float() - ref_result.vframe_pts.float()))
-        assert mean_delta == approx(0.0, abs=1.0)
-
-        assert_equal(vtimebase, ref_result.vtimebase)
-
-        if config.check_aframes and aframes.numel() > 0 and ref_result.aframes.numel() > 0:
-            """Audio stream is available and audio frame is required to return
-            from decoder"""
-            assert_equal(aframes, ref_result.aframes)
-
-        if config.check_aframe_pts and aframe_pts.numel() > 0 and ref_result.aframe_pts.numel() > 0:
-            """Audio stream is available"""
-            assert_equal(aframe_pts, ref_result.aframe_pts)
-
-            assert_equal(atimebase, ref_result.atimebase)
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_stress_test_read_video_from_file(self, test_video):
-        pytest.skip(
-            "This stress test will iteratively decode the same set of videos."
-            "It helps to detect memory leak but it takes lots of time to run."
-            "By default, it is disabled"
-        )
-        num_iter = 10000
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        for _i in range(num_iter):
-            full_path = os.path.join(VIDEO_DIR, test_video)
-
-            # pass 1: decode all frames using new decoder
-            torch.ops.video_reader.read_video_from_file(
-                full_path,
-                SEEK_FRAME_MARGIN,
-                0,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    def test_read_video_from_file(self, test_video, config):
-        """
-        Test the case when decoder starts with a video file to decode frames.
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        # pass 1: decode all frames using new decoder
-        tv_result = torch.ops.video_reader.read_video_from_file(
-            full_path,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        # pass 2: decode all frames using av
-        pyav_result = _decode_frames_by_av_module(full_path)
-        # check results from TorchVision decoder
-        self.check_separate_decoding_result(tv_result, config)
-        # compare decoding results
-        self.compare_decoding_result(tv_result, pyav_result, config)
-
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    @pytest.mark.parametrize("read_video_stream,read_audio_stream", [(1, 0), (0, 1)])
-    def test_read_video_from_file_read_single_stream_only(
-        self, test_video, config, read_video_stream, read_audio_stream
-    ):
-        """
-        Test the case when decoder starts with a video file to decode frames, and
-        only reads video stream and ignores audio stream
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        # decode all frames using new decoder
-        tv_result = torch.ops.video_reader.read_video_from_file(
-            full_path,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            read_video_stream,
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            read_audio_stream,
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-
-        (
-            vframes,
-            vframe_pts,
-            vtimebase,
-            vfps,
-            vduration,
-            aframes,
-            aframe_pts,
-            atimebase,
-            asample_rate,
-            aduration,
-        ) = tv_result
-
-        assert (vframes.numel() > 0) is bool(read_video_stream)
-        assert (vframe_pts.numel() > 0) is bool(read_video_stream)
-        assert (vtimebase.numel() > 0) is bool(read_video_stream)
-        assert (vfps.numel() > 0) is bool(read_video_stream)
-
-        expect_audio_data = read_audio_stream == 1 and config.audio_sample_rate is not None
-        assert (aframes.numel() > 0) is bool(expect_audio_data)
-        assert (aframe_pts.numel() > 0) is bool(expect_audio_data)
-        assert (atimebase.numel() > 0) is bool(expect_audio_data)
-        assert (asample_rate.numel() > 0) is bool(expect_audio_data)
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_read_video_from_file_rescale_min_dimension(self, test_video):
-        """
-        Test the case when decoder starts with a video file to decode frames, and
-        video min dimension between height and width is set.
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 128, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        tv_result = torch.ops.video_reader.read_video_from_file(
-            full_path,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2))
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_read_video_from_file_rescale_max_dimension(self, test_video):
-        """
-        Test the case when decoder starts with a video file to decode frames, and
-        video min dimension between height and width is set.
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 0, 85
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        tv_result = torch.ops.video_reader.read_video_from_file(
-            full_path,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2))
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_read_video_from_file_rescale_both_min_max_dimension(self, test_video):
-        """
-        Test the case when decoder starts with a video file to decode frames, and
-        video min dimension between height and width is set.
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 64, 85
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        tv_result = torch.ops.video_reader.read_video_from_file(
-            full_path,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2))
-        assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2))
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_read_video_from_file_rescale_width(self, test_video):
-        """
-        Test the case when decoder starts with a video file to decode frames, and
-        video width is set.
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 256, 0, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        tv_result = torch.ops.video_reader.read_video_from_file(
-            full_path,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        assert tv_result[0].size(2) == width
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_read_video_from_file_rescale_height(self, test_video):
-        """
-        Test the case when decoder starts with a video file to decode frames, and
-        video height is set.
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 0, 224, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        tv_result = torch.ops.video_reader.read_video_from_file(
-            full_path,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        assert tv_result[0].size(1) == height
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_read_video_from_file_rescale_width_and_height(self, test_video):
-        """
-        Test the case when decoder starts with a video file to decode frames, and
-        both video height and width are set.
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 320, 240, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        tv_result = torch.ops.video_reader.read_video_from_file(
-            full_path,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        assert tv_result[0].size(1) == height
-        assert tv_result[0].size(2) == width
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("samples", [9600, 96000])
-    def test_read_video_from_file_audio_resampling(self, test_video, samples):
-        """
-        Test the case when decoder starts with a video file to decode frames, and
-        audio waveform are resampled
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        channels = 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        tv_result = torch.ops.video_reader.read_video_from_file(
-            full_path,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        (
-            vframes,
-            vframe_pts,
-            vtimebase,
-            vfps,
-            vduration,
-            aframes,
-            aframe_pts,
-            atimebase,
-            asample_rate,
-            aduration,
-        ) = tv_result
-        if aframes.numel() > 0:
-            assert samples == asample_rate.item()
-            assert 1 == aframes.size(1)
-            # when audio stream is found
-            duration = float(aframe_pts[-1]) * float(atimebase[0]) / float(atimebase[1])
-            assert aframes.size(0) == approx(int(duration * asample_rate.item()), abs=0.1 * asample_rate.item())
-
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    def test_compare_read_video_from_memory_and_file(self, test_video, config):
-        """
-        Test the case when video is already in memory, and decoder reads data in memory
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-
-        # pass 1: decode all frames using cpp decoder
-        tv_result_memory = torch.ops.video_reader.read_video_from_memory(
-            video_tensor,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        self.check_separate_decoding_result(tv_result_memory, config)
-        # pass 2: decode all frames from file
-        tv_result_file = torch.ops.video_reader.read_video_from_file(
-            full_path,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-
-        self.check_separate_decoding_result(tv_result_file, config)
-        # finally, compare results decoded from memory and file
-        self.compare_decoding_result(tv_result_memory, tv_result_file)
-
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    def test_read_video_from_memory(self, test_video, config):
-        """
-        Test the case when video is already in memory, and decoder reads data in memory
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-
-        # pass 1: decode all frames using cpp decoder
-        tv_result = torch.ops.video_reader.read_video_from_memory(
-            video_tensor,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        # pass 2: decode all frames using av
-        pyav_result = _decode_frames_by_av_module(full_path)
-
-        self.check_separate_decoding_result(tv_result, config)
-        self.compare_decoding_result(tv_result, pyav_result, config)
-
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    def test_read_video_from_memory_get_pts_only(self, test_video, config):
-        """
-        Test the case when video is already in memory, and decoder reads data in memory.
-        Compare frame pts between decoding for pts only and full decoding
-        for both pts and frame data
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-
-        # pass 1: decode all frames using cpp decoder
-        tv_result = torch.ops.video_reader.read_video_from_memory(
-            video_tensor,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        assert abs(config.video_fps - tv_result[3].item()) < 0.01
-
-        # pass 2: decode all frames to get PTS only using cpp decoder
-        tv_result_pts_only = torch.ops.video_reader.read_video_from_memory(
-            video_tensor,
-            SEEK_FRAME_MARGIN,
-            1,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-
-        assert not tv_result_pts_only[0].numel()
-        assert not tv_result_pts_only[5].numel()
-        self.compare_decoding_result(tv_result, tv_result_pts_only)
-
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    @pytest.mark.parametrize("num_frames", [4, 8, 16, 32, 64, 128])
-    def test_read_video_in_range_from_memory(self, test_video, config, num_frames):
-        """
-        Test the case when video is already in memory, and decoder reads data in memory.
-        In addition, decoder takes meaningful start- and end PTS as input, and decode
-        frames within that interval
-        """
-        full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-        # pass 1: decode all frames using new decoder
-        tv_result = torch.ops.video_reader.read_video_from_memory(
-            video_tensor,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        (
-            vframes,
-            vframe_pts,
-            vtimebase,
-            vfps,
-            vduration,
-            aframes,
-            aframe_pts,
-            atimebase,
-            asample_rate,
-            aduration,
-        ) = tv_result
-        assert abs(config.video_fps - vfps.item()) < 0.01
-
-        start_pts_ind_max = vframe_pts.size(0) - num_frames
-        if start_pts_ind_max <= 0:
-            return
-        # randomly pick start pts
-        start_pts_ind = randint(0, start_pts_ind_max)
-        end_pts_ind = start_pts_ind + num_frames - 1
-        video_start_pts = vframe_pts[start_pts_ind]
-        video_end_pts = vframe_pts[end_pts_ind]
-
-        video_timebase_num, video_timebase_den = vtimebase[0], vtimebase[1]
-        if len(atimebase) > 0:
-            # when audio stream is available
-            audio_timebase_num, audio_timebase_den = atimebase[0], atimebase[1]
-            audio_start_pts = _pts_convert(
-                video_start_pts.item(),
-                Fraction(video_timebase_num.item(), video_timebase_den.item()),
-                Fraction(audio_timebase_num.item(), audio_timebase_den.item()),
-                math.floor,
-            )
-            audio_end_pts = _pts_convert(
-                video_end_pts.item(),
-                Fraction(video_timebase_num.item(), video_timebase_den.item()),
-                Fraction(audio_timebase_num.item(), audio_timebase_den.item()),
-                math.ceil,
-            )
-
-        # pass 2: decode frames in the randomly generated range
-        tv_result = torch.ops.video_reader.read_video_from_memory(
-            video_tensor,
-            SEEK_FRAME_MARGIN,
-            0,  # getPtsOnly
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            video_start_pts,
-            video_end_pts,
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            audio_start_pts,
-            audio_end_pts,
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-
-        # pass 3: decode frames in range using PyAv
-        video_timebase_av, audio_timebase_av = _get_timebase_by_av_module(full_path)
-
-        video_start_pts_av = _pts_convert(
-            video_start_pts.item(),
-            Fraction(video_timebase_num.item(), video_timebase_den.item()),
-            Fraction(video_timebase_av.numerator, video_timebase_av.denominator),
-            math.floor,
-        )
-        video_end_pts_av = _pts_convert(
-            video_end_pts.item(),
-            Fraction(video_timebase_num.item(), video_timebase_den.item()),
-            Fraction(video_timebase_av.numerator, video_timebase_av.denominator),
-            math.ceil,
-        )
-        if audio_timebase_av:
-            audio_start_pts = _pts_convert(
-                video_start_pts.item(),
-                Fraction(video_timebase_num.item(), video_timebase_den.item()),
-                Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator),
-                math.floor,
-            )
-            audio_end_pts = _pts_convert(
-                video_end_pts.item(),
-                Fraction(video_timebase_num.item(), video_timebase_den.item()),
-                Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator),
-                math.ceil,
-            )
-
-        pyav_result = _decode_frames_by_av_module(
-            full_path,
-            video_start_pts_av,
-            video_end_pts_av,
-            audio_start_pts,
-            audio_end_pts,
-        )
-
-        assert tv_result[0].size(0) == num_frames
-        if pyav_result.vframes.size(0) == num_frames:
-            # if PyAv decodes a different number of video frames, skip
-            # comparing the decoding results between Torchvision video reader
-            # and PyAv
-            self.compare_decoding_result(tv_result, pyav_result, config)
-
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    def test_probe_video_from_file(self, test_video, config):
-        """
-        Test the case when decoder probes a video file
-        """
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        probe_result = torch.ops.video_reader.probe_video_from_file(full_path)
-        self.check_probe_result(probe_result, config)
-
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    def test_probe_video_from_memory(self, test_video, config):
-        """
-        Test the case when decoder probes a video in memory
-        """
-        _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-        probe_result = torch.ops.video_reader.probe_video_from_memory(video_tensor)
-        self.check_probe_result(probe_result, config)
-
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    def test_probe_video_from_memory_script(self, test_video, config):
-        scripted_fun = torch.jit.script(io._probe_video_from_memory)
-        assert scripted_fun is not None
-
-        _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-        probe_result = scripted_fun(video_tensor)
-        self.check_meta_result(probe_result, config)
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_read_video_from_memory_scripted(self, test_video):
-        """
-        Test the case when video is already in memory, and decoder reads data in memory
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-
-        scripted_fun = torch.jit.script(io._read_video_from_memory)
-        assert scripted_fun is not None
-
-        _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-
-        # decode all frames using cpp decoder
-        scripted_fun(
-            video_tensor,
-            SEEK_FRAME_MARGIN,
-            1,  # readVideoStream
-            width,
-            height,
-            min_dimension,
-            max_dimension,
-            [video_start_pts, video_end_pts],
-            video_timebase_num,
-            video_timebase_den,
-            1,  # readAudioStream
-            samples,
-            channels,
-            [audio_start_pts, audio_end_pts],
-            audio_timebase_num,
-            audio_timebase_den,
-        )
-        # FUTURE: check value of video / audio frames
-
-    def test_invalid_file(self):
-        set_video_backend("video_reader")
-        with pytest.raises(RuntimeError):
-            io.read_video("foo.mp4")
-
-        set_video_backend("pyav")
-        with pytest.raises(RuntimeError):
-            io.read_video("foo.mp4")
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
-    @pytest.mark.parametrize("start_offset", [0, 500])
-    @pytest.mark.parametrize("end_offset", [3000, None])
-    def test_audio_present_pts(self, test_video, backend, start_offset, end_offset):
-        """Test if audio frames are returned with pts unit."""
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        container = av.open(full_path)
-        if container.streams.audio:
-            set_video_backend(backend)
-            _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="pts")
-            assert all([dimension > 0 for dimension in audio.shape[:2]])
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
-    @pytest.mark.parametrize("start_offset", [0, 0.1])
-    @pytest.mark.parametrize("end_offset", [0.3, None])
-    def test_audio_present_sec(self, test_video, backend, start_offset, end_offset):
-        """Test if audio frames are returned with sec unit."""
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        container = av.open(full_path)
-        if container.streams.audio:
-            set_video_backend(backend)
-            _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="sec")
-            assert all([dimension > 0 for dimension in audio.shape[:2]])
-
-
-if __name__ == "__main__":
-    pytest.main([__file__])
diff --git a/test/test_videoapi.py b/test/test_videoapi.py
deleted file mode 100644
index aabcf6407f7..00000000000
--- a/test/test_videoapi.py
+++ /dev/null
@@ -1,312 +0,0 @@
-import collections
-import os
-import urllib
-
-import pytest
-import torch
-import torchvision
-from pytest import approx
-from torchvision.datasets.utils import download_url
-from torchvision.io import _HAS_CPU_VIDEO_DECODER, VideoReader
-
-
-# WARNING: these tests have been skipped forever on the CI because the video ops
-# are never properly available. This is bad, but things have been in a terrible
-# state for a long time already as we write this comment, and we'll hopefully be
-# able to get rid of this all soon.
-
-
-try:
-    import av
-
-    # Do a version test too
-    torchvision.io.video._check_av_available()
-except ImportError:
-    av = None
-
-
-VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos")
-
-CheckerConfig = ["duration", "video_fps", "audio_sample_rate"]
-GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig))
-
-
-def backends():
-    backends_ = ["video_reader"]
-    if av is not None:
-        backends_.append("pyav")
-    return backends_
-
-
-def fate(name, path="."):
-    """Download and return a path to a sample from the FFmpeg test suite.
-    See the `FFmpeg Automated Test Environment <https://www.ffmpeg.org/fate.html>`_
-    """
-
-    file_name = name.split("/")[1]
-    download_url("http://fate.ffmpeg.org/fate-suite/" + name, path, file_name)
-    return os.path.join(path, file_name)
-
-
-test_videos = {
-    "RATRACE_wave_f_nm_np1_fr_goo_37.avi": GroundTruth(duration=2.0, video_fps=30.0, audio_sample_rate=None),
-    "SchoolRulesHowTheyHelpUs_wave_f_nm_np1_ba_med_0.avi": GroundTruth(
-        duration=2.0, video_fps=30.0, audio_sample_rate=None
-    ),
-    "TrumanShow_wave_f_nm_np1_fr_med_26.avi": GroundTruth(duration=2.0, video_fps=30.0, audio_sample_rate=None),
-    "v_SoccerJuggling_g23_c01.avi": GroundTruth(duration=8.0, video_fps=29.97, audio_sample_rate=None),
-    "v_SoccerJuggling_g24_c01.avi": GroundTruth(duration=8.0, video_fps=29.97, audio_sample_rate=None),
-    "R6llTwEh07w.mp4": GroundTruth(duration=10.0, video_fps=30.0, audio_sample_rate=44100),
-    "SOX5yA1l24A.mp4": GroundTruth(duration=11.0, video_fps=29.97, audio_sample_rate=48000),
-    "WUzgd7C1pWA.mp4": GroundTruth(duration=11.0, video_fps=29.97, audio_sample_rate=48000),
-}
-
-
-@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg")
-class TestVideoApi:
-    @pytest.mark.skipif(av is None, reason="PyAV unavailable")
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", backends())
-    def test_frame_reading(self, test_video, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        with av.open(full_path) as av_reader:
-            if av_reader.streams.video:
-                av_frames, vr_frames = [], []
-                av_pts, vr_pts = [], []
-                # get av frames
-                for av_frame in av_reader.decode(av_reader.streams.video[0]):
-                    av_frames.append(torch.tensor(av_frame.to_rgb().to_ndarray()).permute(2, 0, 1))
-                    av_pts.append(av_frame.pts * av_frame.time_base)
-
-                # get vr frames
-                video_reader = VideoReader(full_path, "video")
-                for vr_frame in video_reader:
-                    vr_frames.append(vr_frame["data"])
-                    vr_pts.append(vr_frame["pts"])
-
-                # same number of frames
-                assert len(vr_frames) == len(av_frames)
-                assert len(vr_pts) == len(av_pts)
-
-                # compare the frames and ptss
-                for i in range(len(vr_frames)):
-                    assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)
-
-                    mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float()))
-                    # on average the difference is very small and caused
-                    # by decoding (around 1%)
-                    # TODO: asses empirically how to set this? atm it's 1%
-                    # averaged over all frames
-                    assert mean_delta.item() < 2.55
-
-                del vr_frames, av_frames, vr_pts, av_pts
-
-        # test audio reading compared to PYAV
-        with av.open(full_path) as av_reader:
-            if av_reader.streams.audio:
-                av_frames, vr_frames = [], []
-                av_pts, vr_pts = [], []
-                # get av frames
-                for av_frame in av_reader.decode(av_reader.streams.audio[0]):
-                    av_frames.append(torch.tensor(av_frame.to_ndarray()).permute(1, 0))
-                    av_pts.append(av_frame.pts * av_frame.time_base)
-                av_reader.close()
-
-                # get vr frames
-                video_reader = VideoReader(full_path, "audio")
-                for vr_frame in video_reader:
-                    vr_frames.append(vr_frame["data"])
-                    vr_pts.append(vr_frame["pts"])
-
-                # same number of frames
-                assert len(vr_frames) == len(av_frames)
-                assert len(vr_pts) == len(av_pts)
-
-                # compare the frames and ptss
-                for i in range(len(vr_frames)):
-                    assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)
-                    max_delta = torch.max(torch.abs(av_frames[i].float() - vr_frames[i].float()))
-                    # we assure that there is never more than 1% difference in signal
-                    assert max_delta.item() < 0.001
-
-    @pytest.mark.parametrize("stream", ["video", "audio"])
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", backends())
-    def test_frame_reading_mem_vs_file(self, test_video, stream, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        reader = VideoReader(full_path)
-        reader_md = reader.get_metadata()
-
-        if stream in reader_md:
-            # Test video reading from file vs from memory
-            vr_frames, vr_frames_mem = [], []
-            vr_pts, vr_pts_mem = [], []
-            # get vr frames
-            video_reader = VideoReader(full_path, stream)
-            for vr_frame in video_reader:
-                vr_frames.append(vr_frame["data"])
-                vr_pts.append(vr_frame["pts"])
-
-            # get vr frames = read from memory
-            f = open(full_path, "rb")
-            fbytes = f.read()
-            f.close()
-            video_reader_from_mem = VideoReader(fbytes, stream)
-
-            for vr_frame_from_mem in video_reader_from_mem:
-                vr_frames_mem.append(vr_frame_from_mem["data"])
-                vr_pts_mem.append(vr_frame_from_mem["pts"])
-
-            # same number of frames
-            assert len(vr_frames) == len(vr_frames_mem)
-            assert len(vr_pts) == len(vr_pts_mem)
-
-            # compare the frames and ptss
-            for i in range(len(vr_frames)):
-                assert vr_pts[i] == vr_pts_mem[i]
-                mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float()))
-                # on average the difference is very small and caused
-                # by decoding (around 1%)
-                # TODO: asses empirically how to set this? atm it's 1%
-                # averaged over all frames
-                assert mean_delta.item() < 2.55
-
-            del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem
-        else:
-            del reader, reader_md
-
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    @pytest.mark.parametrize("backend", backends())
-    def test_metadata(self, test_video, config, backend):
-        """
-        Test that the metadata returned via pyav corresponds to the one returned
-        by the new video decoder API
-        """
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        reader = VideoReader(full_path, "video")
-        reader_md = reader.get_metadata()
-        assert config.video_fps == approx(reader_md["video"]["fps"][0], abs=0.0001)
-        assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5)
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", backends())
-    def test_seek_start(self, test_video, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        video_reader = VideoReader(full_path, "video")
-        num_frames = 0
-        for _ in video_reader:
-            num_frames += 1
-
-        # now seek the container to 0 and do it again
-        # It's often that starting seek can be inprecise
-        # this way and it doesn't start at 0
-        video_reader.seek(0)
-        start_num_frames = 0
-        for _ in video_reader:
-            start_num_frames += 1
-
-        assert start_num_frames == num_frames
-
-        # now seek the container to < 0 to check for unexpected behaviour
-        video_reader.seek(-1)
-        start_num_frames = 0
-        for _ in video_reader:
-            start_num_frames += 1
-
-        assert start_num_frames == num_frames
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", ["video_reader"])
-    def test_accurateseek_middle(self, test_video, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        stream = "video"
-        video_reader = VideoReader(full_path, stream)
-        md = video_reader.get_metadata()
-        duration = md[stream]["duration"][0]
-        if duration is not None:
-            num_frames = 0
-            for _ in video_reader:
-                num_frames += 1
-
-            video_reader.seek(duration / 2)
-            middle_num_frames = 0
-            for _ in video_reader:
-                middle_num_frames += 1
-
-            assert middle_num_frames < num_frames
-            assert middle_num_frames == approx(num_frames // 2, abs=1)
-
-            video_reader.seek(duration / 2)
-            frame = next(video_reader)
-            lb = duration / 2 - 1 / md[stream]["fps"][0]
-            ub = duration / 2 + 1 / md[stream]["fps"][0]
-            assert (lb <= frame["pts"]) and (ub >= frame["pts"])
-
-    def test_fate_suite(self):
-        # TODO: remove the try-except statement once the connectivity issues are resolved
-        try:
-            video_path = fate("sub/MovText_capability_tester.mp4", VIDEO_DIR)
-        except (urllib.error.URLError, ConnectionError) as error:
-            pytest.skip(f"Skipping due to connectivity issues: {error}")
-        vr = VideoReader(video_path)
-        metadata = vr.get_metadata()
-
-        assert metadata["subtitles"]["duration"] is not None
-        os.remove(video_path)
-
-    @pytest.mark.skipif(av is None, reason="PyAV unavailable")
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    @pytest.mark.parametrize("backend", backends())
-    def test_keyframe_reading(self, test_video, config, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        av_reader = av.open(full_path)
-        # reduce streams to only keyframes
-        av_stream = av_reader.streams.video[0]
-        av_stream.codec_context.skip_frame = "NONKEY"
-
-        av_keyframes = []
-        vr_keyframes = []
-        if av_reader.streams.video:
-
-            # get all keyframes using pyav. Then, seek randomly into video reader
-            # and assert that all the returned values are in AV_KEYFRAMES
-
-            for av_frame in av_reader.decode(av_stream):
-                av_keyframes.append(float(av_frame.pts * av_frame.time_base))
-
-        if len(av_keyframes) > 1:
-            video_reader = VideoReader(full_path, "video")
-            for i in range(1, len(av_keyframes)):
-                seek_val = (av_keyframes[i] + av_keyframes[i - 1]) / 2
-                data = next(video_reader.seek(seek_val, True))
-                vr_keyframes.append(data["pts"])
-
-            data = next(video_reader.seek(config.duration, True))
-            vr_keyframes.append(data["pts"])
-
-            assert len(av_keyframes) == len(vr_keyframes)
-            # NOTE: this video gets different keyframe with different
-            # loaders (0.333 pyav, 0.666 for us)
-            if test_video != "TrumanShow_wave_f_nm_np1_fr_med_26.avi":
-                for i in range(len(av_keyframes)):
-                    assert av_keyframes[i] == approx(vr_keyframes[i], rel=0.001)
-
-    def test_src(self):
-        with pytest.raises(ValueError, match="src cannot be empty"):
-            VideoReader(src="")
-        with pytest.raises(ValueError, match="src must be either string"):
-            VideoReader(src=2)
-        with pytest.raises(TypeError, match="unexpected keyword argument"):
-            VideoReader(path="path")
-
-
-if __name__ == "__main__":
-    pytest.main([__file__])
diff --git a/torchvision/__init__.py b/torchvision/__init__.py
index d47c70d4074..601f786e374 100644
--- a/torchvision/__init__.py
+++ b/torchvision/__init__.py
@@ -45,26 +45,11 @@ def set_video_backend(backend):
     Specifies the package used to decode videos.
 
     Args:
-        backend (string): Name of the video backend. one of {'pyav', 'video_reader'}.
+        backend (string): Name of the video backend. Only 'pyav' is supported; the argument is ignored.
             The :mod:`pyav` package uses the 3rd party PyAv library. It is a Pythonic
             binding for the FFmpeg libraries.
-            The :mod:`video_reader` package includes a native C++ implementation on
-            top of FFMPEG libraries, and a python API of TorchScript custom operator.
-            It generally decodes faster than :mod:`pyav`, but is perhaps less robust.
-
-    .. note::
-        Building with FFMPEG is disabled by default in the latest `main`. If you want to use the 'video_reader'
-        backend, please compile torchvision from source.
     """
-    global _video_backend
-    if backend not in ["pyav", "video_reader"]:
-        raise ValueError("Invalid video backend '%s'. Options are 'pyav' and 'video_reader'" % backend)
-    if backend == "video_reader" and not io._HAS_CPU_VIDEO_DECODER:
-        # TODO: better messages
-        message = "video_reader video backend is not available. Please compile torchvision from source and try again"
-        raise RuntimeError(message)
-    else:
-        _video_backend = backend
+    pass
 
 
 def get_video_backend():
@@ -72,7 +57,7 @@ def get_video_backend():
     Returns the currently active video backend used to decode videos.
 
     Returns:
-        str: Name of the video backend. one of {'pyav', 'video_reader'}.
+        str: Name of the video backend. Currently only 'pyav' is supported.
     """
 
     return _video_backend
diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py
index d9214beaa68..ad26299cff6 100644
--- a/torchvision/datasets/video_utils.py
+++ b/torchvision/datasets/video_utils.py
@@ -1,29 +1,16 @@
 import bisect
 import math
 import warnings
-from fractions import Fraction
-from typing import Any, Callable, cast, Optional, TypeVar, Union
+from typing import Any, Optional, TypeVar, Union
 
 import torch
-from torchvision.io import _probe_video_from_file, _read_video_from_file, read_video, read_video_timestamps
+from torchvision.io import read_video, read_video_timestamps
 
 from .utils import tqdm
 
 T = TypeVar("T")
 
 
-def pts_convert(pts: int, timebase_from: Fraction, timebase_to: Fraction, round_func: Callable = math.floor) -> int:
-    """convert pts between different time bases
-    Args:
-        pts: presentation timestamp, float
-        timebase_from: original timebase. Fraction
-        timebase_to: new timebase. Fraction
-        round_func: rounding function.
-    """
-    new_pts = Fraction(pts, 1) * timebase_from / timebase_to
-    return round_func(new_pts)
-
-
 def unfold(tensor: torch.Tensor, size: int, step: int, dilation: int = 1) -> torch.Tensor:
     """
     similar to tensor.unfold, but with the dilation
@@ -305,60 +292,9 @@ def get_clip(self, idx: int) -> tuple[torch.Tensor, torch.Tensor, dict[str, Any]
         video_path = self.video_paths[video_idx]
         clip_pts = self.clips[video_idx][clip_idx]
 
-        from torchvision import get_video_backend
-
-        backend = get_video_backend()
-
-        if backend == "pyav":
-            # check for invalid options
-            if self._video_width != 0:
-                raise ValueError("pyav backend doesn't support _video_width != 0")
-            if self._video_height != 0:
-                raise ValueError("pyav backend doesn't support _video_height != 0")
-            if self._video_min_dimension != 0:
-                raise ValueError("pyav backend doesn't support _video_min_dimension != 0")
-            if self._video_max_dimension != 0:
-                raise ValueError("pyav backend doesn't support _video_max_dimension != 0")
-            if self._audio_samples != 0:
-                raise ValueError("pyav backend doesn't support _audio_samples != 0")
-
-        if backend == "pyav":
-            start_pts = clip_pts[0].item()
-            end_pts = clip_pts[-1].item()
-            video, audio, info = read_video(video_path, start_pts, end_pts)
-        else:
-            _info = _probe_video_from_file(video_path)
-            video_fps = _info.video_fps
-            audio_fps = None
-
-            video_start_pts = cast(int, clip_pts[0].item())
-            video_end_pts = cast(int, clip_pts[-1].item())
-
-            audio_start_pts, audio_end_pts = 0, -1
-            audio_timebase = Fraction(0, 1)
-            video_timebase = Fraction(_info.video_timebase.numerator, _info.video_timebase.denominator)
-            if _info.has_audio:
-                audio_timebase = Fraction(_info.audio_timebase.numerator, _info.audio_timebase.denominator)
-                audio_start_pts = pts_convert(video_start_pts, video_timebase, audio_timebase, math.floor)
-                audio_end_pts = pts_convert(video_end_pts, video_timebase, audio_timebase, math.ceil)
-                audio_fps = _info.audio_sample_rate
-            video, audio, _ = _read_video_from_file(
-                video_path,
-                video_width=self._video_width,
-                video_height=self._video_height,
-                video_min_dimension=self._video_min_dimension,
-                video_max_dimension=self._video_max_dimension,
-                video_pts_range=(video_start_pts, video_end_pts),
-                video_timebase=video_timebase,
-                audio_samples=self._audio_samples,
-                audio_channels=self._audio_channels,
-                audio_pts_range=(audio_start_pts, audio_end_pts),
-                audio_timebase=audio_timebase,
-            )
-
-            info = {"video_fps": video_fps}
-            if audio_fps is not None:
-                info["audio_fps"] = audio_fps
+        start_pts = clip_pts[0].item()
+        end_pts = clip_pts[-1].item()
+        video, audio, info = read_video(video_path, start_pts, end_pts)
 
         if self.frame_rate is not None:
             resampling_idx = self.resampling_idxs[video_idx][clip_idx]
diff --git a/torchvision/io/__init__.py b/torchvision/io/__init__.py
index 3c5c13482f5..a486b0275e1 100644
--- a/torchvision/io/__init__.py
+++ b/torchvision/io/__init__.py
@@ -16,63 +16,7 @@
         VideoReader,
     )
 except ImportError:
-    # OSS fallback - video_reader backend not available
-    _HAS_CPU_VIDEO_DECODER = False
-    _HAS_VIDEO_OPT = False
-
-    def _stub_not_available(*args, **kwargs):
-        raise RuntimeError(
-            "video_reader backend is not available in open-source torchvision. " "Use PyAV or TorchCodec instead."
-        )
-
-    _probe_video_from_file = _stub_not_available
-    _probe_video_from_memory = _stub_not_available
-    _read_video_from_file = _stub_not_available
-    _read_video_from_memory = _stub_not_available
-    _read_video_timestamps_from_file = _stub_not_available
-    _read_video_timestamps_from_memory = _stub_not_available
-
-    class Timebase:  # type: ignore[no-redef]
-        __annotations__ = {"numerator": int, "denominator": int}
-        __slots__ = ["numerator", "denominator"]
-
-        def __init__(self, numerator: int = 0, denominator: int = 1) -> None:
-            self.numerator = numerator
-            self.denominator = denominator
-
-    class VideoMetaData:  # type: ignore[no-redef]
-        pass
-
-    class VideoReader:  # type: ignore[no-redef]
-        def __init__(self, *args, **kwargs):
-            raise RuntimeError(
-                "VideoReader with video_reader backend is not available. "
-                "Use backend='pyav' or migrate to TorchCodec."
-            )
-
-        def __iter__(self):
-            return self
-
-        def __next__(self):
-            raise StopIteration
-
-    # Stub module for _video_opt to prevent circular import issues
-    # This module is imported by video.py
-    import types
-    from fractions import Fraction
-
-    _video_opt = types.ModuleType("_video_opt")
-    _video_opt._HAS_VIDEO_OPT = False
-    _video_opt.default_timebase = Fraction(0, 1)
-
-    def _read_video_stub(filename, start_pts, end_pts, pts_unit):
-        raise RuntimeError("video_reader backend is not available. Use backend='pyav'.")
-
-    def _read_video_timestamps_stub(filename, pts_unit):
-        raise RuntimeError("video_reader backend is not available. Use backend='pyav'.")
-
-    _video_opt._read_video = _read_video_stub
-    _video_opt._read_video_timestamps = _read_video_timestamps_stub
+    pass
 
 from .image import (
     decode_avif,
@@ -98,18 +42,6 @@ def _read_video_timestamps_stub(filename, pts_unit):
     "write_video",
     "read_video",
     "read_video_timestamps",
-    "_read_video_from_file",
-    "_read_video_timestamps_from_file",
-    "_probe_video_from_file",
-    "_read_video_from_memory",
-    "_read_video_timestamps_from_memory",
-    "_probe_video_from_memory",
-    "_HAS_CPU_VIDEO_DECODER",
-    "_HAS_VIDEO_OPT",
-    "_read_video_clip_from_memory",
-    "_read_video_meta_data",
-    "VideoMetaData",
-    "Timebase",
     "ImageReadMode",
     "decode_image",
     "decode_jpeg",
@@ -125,6 +57,4 @@ def _read_video_timestamps_stub(filename, pts_unit):
     "write_file",
     "write_jpeg",
     "write_png",
-    "Video",
-    "VideoReader",
 ]
diff --git a/torchvision/io/video.py b/torchvision/io/video.py
index 14edcf50aaa..5331b764d27 100644
--- a/torchvision/io/video.py
+++ b/torchvision/io/video.py
@@ -1,6 +1,5 @@
 import gc
 import math
-import os
 import re
 import warnings
 from fractions import Fraction
@@ -10,7 +9,6 @@
 import torch
 
 from ..utils import _log_api_usage_once
-from . import _video_opt
 from ._video_deprecation_warning import _raise_video_deprecation_warning
 
 try:
@@ -311,79 +309,70 @@ def read_video(
     if output_format not in ("THWC", "TCHW"):
         raise ValueError(f"output_format should be either 'THWC' or 'TCHW', got {output_format}.")
 
-    from torchvision import get_video_backend
+    _check_av_available()
+
+    if end_pts is None:
+        end_pts = float("inf")
+
+    if end_pts < start_pts:
+        raise ValueError(f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}")
+
+    info = {}
+    video_frames = []
+    audio_frames = []
+    audio_timebase = Fraction(0, 1)
+
+    try:
+        with av.open(filename, metadata_errors="ignore") as container:
+            if container.streams.audio:
+                audio_timebase = container.streams.audio[0].time_base
+            if container.streams.video:
+                video_frames = _read_from_stream(
+                    container,
+                    start_pts,
+                    end_pts,
+                    pts_unit,
+                    container.streams.video[0],
+                    {"video": 0},
+                )
+                video_fps = container.streams.video[0].average_rate
+                # guard against potentially corrupted files
+                if video_fps is not None:
+                    info["video_fps"] = float(video_fps)
+
+            if container.streams.audio:
+                audio_frames = _read_from_stream(
+                    container,
+                    start_pts,
+                    end_pts,
+                    pts_unit,
+                    container.streams.audio[0],
+                    {"audio": 0},
+                )
+                info["audio_fps"] = container.streams.audio[0].rate
+
+    except FFmpegError:
+        # TODO: warn instead of silently swallowing FFmpeg errors?
+        pass
 
-    if get_video_backend() != "pyav":
-        if not os.path.exists(filename):
-            raise RuntimeError(f"File not found: {filename}")
-        vframes, aframes, info = _video_opt._read_video(filename, start_pts, end_pts, pts_unit)
+    vframes_list = [frame.to_rgb().to_ndarray() for frame in video_frames]
+    aframes_list = [frame.to_ndarray() for frame in audio_frames]
+
+    if vframes_list:
+        vframes = torch.as_tensor(np.stack(vframes_list))
+    else:
+        vframes = torch.empty((0, 1, 1, 3), dtype=torch.uint8)
+
+    if aframes_list:
+        aframes = np.concatenate(aframes_list, 1)
+        aframes = torch.as_tensor(aframes)
+        if pts_unit == "sec":
+            start_pts = int(math.floor(start_pts * (1 / audio_timebase)))
+            if end_pts != float("inf"):
+                end_pts = int(math.ceil(end_pts * (1 / audio_timebase)))
+        aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts)
     else:
-        _check_av_available()
-
-        if end_pts is None:
-            end_pts = float("inf")
-
-        if end_pts < start_pts:
-            raise ValueError(
-                f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}"
-            )
-
-        info = {}
-        video_frames = []
-        audio_frames = []
-        audio_timebase = _video_opt.default_timebase
-
-        try:
-            with av.open(filename, metadata_errors="ignore") as container:
-                if container.streams.audio:
-                    audio_timebase = container.streams.audio[0].time_base
-                if container.streams.video:
-                    video_frames = _read_from_stream(
-                        container,
-                        start_pts,
-                        end_pts,
-                        pts_unit,
-                        container.streams.video[0],
-                        {"video": 0},
-                    )
-                    video_fps = container.streams.video[0].average_rate
-                    # guard against potentially corrupted files
-                    if video_fps is not None:
-                        info["video_fps"] = float(video_fps)
-
-                if container.streams.audio:
-                    audio_frames = _read_from_stream(
-                        container,
-                        start_pts,
-                        end_pts,
-                        pts_unit,
-                        container.streams.audio[0],
-                        {"audio": 0},
-                    )
-                    info["audio_fps"] = container.streams.audio[0].rate
-
-        except FFmpegError:
-            # TODO raise a warning?
-            pass
-
-        vframes_list = [frame.to_rgb().to_ndarray() for frame in video_frames]
-        aframes_list = [frame.to_ndarray() for frame in audio_frames]
-
-        if vframes_list:
-            vframes = torch.as_tensor(np.stack(vframes_list))
-        else:
-            vframes = torch.empty((0, 1, 1, 3), dtype=torch.uint8)
-
-        if aframes_list:
-            aframes = np.concatenate(aframes_list, 1)
-            aframes = torch.as_tensor(aframes)
-            if pts_unit == "sec":
-                start_pts = int(math.floor(start_pts * (1 / audio_timebase)))
-                if end_pts != float("inf"):
-                    end_pts = int(math.ceil(end_pts * (1 / audio_timebase)))
-            aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts)
-        else:
-            aframes = torch.empty((1, 0), dtype=torch.float32)
+        aframes = torch.empty((1, 0), dtype=torch.float32)
 
     if output_format == "TCHW":
         # [T,H,W,C] --> [T,C,H,W]
@@ -436,10 +425,6 @@ def read_video_timestamps(filename: str, pts_unit: str = "pts") -> tuple[list[in
     _raise_video_deprecation_warning()
     if not torch.jit.is_scripting() and not torch.jit.is_tracing():
         _log_api_usage_once(read_video_timestamps)
-    from torchvision import get_video_backend
-
-    if get_video_backend() != "pyav":
-        return _video_opt._read_video_timestamps(filename, pts_unit)
 
     _check_av_available()