diff --git a/VIDEO_READER_MOVE_PLAN.md b/VIDEO_READER_MOVE_PLAN.md
new file mode 100644
index 00000000000..98109a2b47f
--- /dev/null
+++ b/VIDEO_READER_MOVE_PLAN.md
@@ -0,0 +1,517 @@
+# Video Reader Move Plan: torchvision → fb/ (Internal Only)
+
+## Overview
+
+Move the `video_reader` backend from the open-source `torchvision/` folder to the internal-only `fb/` folder. This allows:
+- ✅ Removal from GitHub/open source
+- ✅ Internal Meta users continue to have access
+- ✅ Existing `fb/datasets/video_clip_sampler.py` keeps working
+
+---
+
+## Part 1: C++ Files to Move
+
+### 1.1 Decoder Core (`decoder/`)
+
+**From:** `torchvision/csrc/io/decoder/`
+**To:** `fb/csrc/io/decoder/`
+
+| File | Description |
+|------|-------------|
+| `audio_sampler.cpp` | Audio frame sampling |
+| `audio_sampler.h` | |
+| `audio_stream.cpp` | Audio stream handling |
+| `audio_stream.h` | |
+| `cc_stream.cpp` | Closed caption stream |
+| `cc_stream.h` | |
+| `decoder.cpp` | Main FFmpeg decoder class |
+| `decoder.h` | |
+| `defs.h` | Common definitions |
+| `memory_buffer.cpp` | Memory buffer utils |
+| `memory_buffer.h` | |
+| `seekable_buffer.cpp` | Seekable buffer for streaming |
+| `seekable_buffer.h` | |
+| `stream.cpp` | Base stream class |
+| `stream.h` | |
+| `subtitle_sampler.cpp` | Subtitle sampling |
+| `subtitle_sampler.h` | |
+| `subtitle_stream.cpp` | Subtitle stream handling |
+| `subtitle_stream.h` | |
+| `sync_decoder.cpp` | Synchronous decoder wrapper |
+| `sync_decoder.h` | |
+| `time_keeper.cpp` | Timestamp management |
+| `time_keeper.h` | |
+| `util.cpp` | Utility functions |
+| `util.h` | |
+| `video_sampler.cpp` | Video frame sampling |
+| `video_sampler.h` | |
+| `video_stream.cpp` | Video stream handling |
+| `video_stream.h` | |
+
+**Test files (move to `fb/csrc/io/decoder/` alongside the sources; the Part 3.2 test targets and the Part 8 move commands assume this location):**
+| File | Description |
+|------|-------------|
+| `sync_decoder_test.cpp` | Unit tests for sync_decoder |
+| `util_test.cpp` | Unit tests for utilities |
+
+### 1.2 Video Utils (`video/`)
+
+**From:** `torchvision/csrc/io/video/`
+**To:** `fb/csrc/io/video/`
+
+| File | Description |
+|------|-------------|
+| `video.cpp` | Video class implementation |
+| `video.h` | Video class header |
+
+### 1.3 Video Reader Ops (`video_reader/`)
+
+**From:** `torchvision/csrc/io/video_reader/`
+**To:** `fb/csrc/io/video_reader/`
+
+| File | Description |
+|------|-------------|
+| `video_reader.cpp` | torch.ops.video_reader registration |
+| `video_reader.h` | |
+
+---
+
+## Part 2: Python Files to Move
+
+**From:** `torchvision/io/`
+**To:** `fb/io/`
+
+| File | Description |
+|------|-------------|
+| `_video_opt.py` | Core video_reader Python API (`_read_video_from_memory`, etc.) |
+| `video_reader.py` | `VideoReader` class |
+| `_video_deprecation_warning.py` | Deprecation warning helper — not moved: it stays in `torchvision/io/`, and `fb/io/video_reader.py` imports it from there (see Part 5.3) |
+
+---
+
+## Part 3: BUCK Target Changes
+
+### 3.1 Current Targets (in `pytorch/vision/BUCK`)
+
+```python
+# Lines 501-539: decoder_streaming
+fbcode_target(
+    _kind = cpp_library,
+    name = "decoder_streaming",
+    srcs = glob(["torchvision/csrc/io/decoder/*.cpp"], exclude = [...]),
+    ...
+)
+
+# Lines 541-585: Tests
+fbcode_target(_kind = cpp_unittest, name = "sync_decoder_test", ...)
+fbcode_target(_kind = cpp_unittest, name = "sync_decoder_test_ffmpeg_7_1", ...)
+fbcode_target(_kind = cpp_unittest, name = "util_test", ...)
+fbcode_target(_kind = cpp_unittest, name = "util_test_ffmpeg_7_1", ...)
+
+# Lines 587-613: video_reader
+fbcode_target(
+    _kind = cpp_library,
+    name = "video_reader",
+    srcs = glob([
+        "torchvision/csrc/io/video/*.cpp",
+        "torchvision/csrc/io/video_reader/*.cpp",
+    ]),
+    ...
+)
+
+# Lines 615-640: video_reader_cpu
+fbcode_target(
+    _kind = cpp_library,
+    name = "video_reader_cpu",
+    ...
+)
+```
+
+### 3.2 New Targets (create `pytorch/vision/fb/BUCK` or add to existing)
+
+```python
+# fb/BUCK - New or updated file
+
+load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library")
+load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest")
+load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
+
+# C++ decoder library
+cpp_library(
+    name = "decoder_streaming",
+    srcs = glob(
+        ["csrc/io/decoder/*.cpp"],
+        exclude = [
+            "csrc/io/decoder/sync_decoder_test.cpp",
+            "csrc/io/decoder/util_test.cpp",
+        ],
+    ),
+    headers = glob(["csrc/io/decoder/*.h"]),
+    propagated_pp_flags = [
+        "-Ipytorch/vision/fb/csrc/io/decoder",
+    ],
+    exported_deps = [
+        "//caffe2/c10:c10",
+    ] + select({
+        "DEFAULT": [],
+        "ovr_config//third-party/ffmpeg/constraints:7.1": [
+            "fbsource//third-party/ffmpeg/ffmpeg_7_1:avcodec-network",
+            "fbsource//third-party/ffmpeg/ffmpeg_7_1:avfilter-network",
+        ],
+    }),
+    exported_external_deps = select({
+        "DEFAULT": [
+            ("ffmpeg-ref", None, "avfilter"),
+            ("ffmpeg-ref", None, "avcodec"),
+        ],
+        "ovr_config//third-party/ffmpeg/constraints:7.1": [],
+    }),
+)
+
+# C++ video_reader library
+cpp_library(
+    name = "video_reader",
+    srcs = glob([
+        "csrc/io/video/*.cpp",
+        "csrc/io/video_reader/*.cpp",
+    ]),
+    headers = glob([
+        "csrc/io/video/*.h",
+        "csrc/io/video_reader/*.h",
+    ]),
+    link_whole = True,
+    preprocessor_flags = [
+        "-Ipytorch/vision/fb/csrc/io/video",
+        "-Ipytorch/vision/fb/csrc/io/video_reader",
+        "-DTORCH_EXTENSION_NAME=video_reader",
+    ],
+    propagated_pp_flags = [
+        "-Ipytorch/vision/fb/csrc/io/video",
+        "-Ipytorch/vision/fb/csrc/io/video_reader",
+    ],
+    supports_python_dlopen = True,
+    exported_deps = [
+        ":decoder_streaming",
+        "//caffe2:torch-cpp",
+    ],
+)
+
+# CPU-only variant
+cpp_library(
+    name = "video_reader_cpu",
+    srcs = glob([
+        "csrc/io/video/*.cpp",
+        "csrc/io/video_reader/*.cpp",
+    ]),
+    headers = glob([
+        "csrc/io/video/*.h",
+        "csrc/io/video_reader/*.h",
+    ]),
+    link_whole = False,
+    preprocessor_flags = [
+        "-Ipytorch/vision/fb/csrc/io/video",
+        "-Ipytorch/vision/fb/csrc/io/video_reader",
+        "-DTORCH_EXTENSION_NAME=video_reader",
+    ],
+    propagated_pp_flags = [
+        "-Ipytorch/vision/fb/csrc/io/video",
+        "-Ipytorch/vision/fb/csrc/io/video_reader",
+    ],
+    exported_deps = [
+        ":decoder_streaming",
+        "//caffe2:torch-cpp-cpu",
+    ],
+)
+
+# Tests
+cpp_unittest(
+    name = "sync_decoder_test",
+    srcs = ["csrc/io/decoder/sync_decoder_test.cpp"],
+    deps = [":decoder_streaming"],
+)
+
+cpp_unittest(
+    name = "sync_decoder_test_ffmpeg_7_1",
+    srcs = ["csrc/io/decoder/sync_decoder_test.cpp"],
+    modifiers = ["ovr_config//third-party/ffmpeg/constraints:7.1"],
+    deps = [":decoder_streaming"],
+)
+
+cpp_unittest(
+    name = "util_test",
+    srcs = ["csrc/io/decoder/util_test.cpp"],
+    deps = [":decoder_streaming"],
+)
+
+cpp_unittest(
+    name = "util_test_ffmpeg_7_1",
+    srcs = ["csrc/io/decoder/util_test.cpp"],
+    modifiers = ["ovr_config//third-party/ffmpeg/constraints:7.1"],
+    deps = [":decoder_streaming"],
+)
+
+# Python library for video_reader API
+python_library(
+    name = "video_reader_py",
+    srcs = [
+        "io/_video_opt.py",
+        "io/video_reader.py",
+    ],
+    deps = [
+        "//pytorch/vision:torchvision",  # For extension loading
+    ],
+    cpp_deps = [
+        ":video_reader",
+    ],
+)
+```
+
+### 3.3 Remove from `pytorch/vision/BUCK`
+
+Delete these targets from the main BUCK file:
+- `:decoder_streaming` (lines 501-539)
+- `:sync_decoder_test` (lines 541-550)
+- `:sync_decoder_test_ffmpeg_7_1` (lines 552-562)
+- `:util_test` (lines 564-573)
+- `:util_test_ffmpeg_7_1` (lines 575-585)
+- `:video_reader` (lines 587-613)
+- `:video_reader_cpu` (lines 615-640)
+
+---
+
+## Part 4: Update Include Paths in C++ Files
+
+After moving, update `#include` statements in moved files:
+
+### In `fb/csrc/io/decoder/*.cpp` files:
+```cpp
+// Before
+#include "sync_decoder.h"
+
+// After (if using full paths)
+#include "pytorch/vision/fb/csrc/io/decoder/sync_decoder.h"
+// Or keep relative if propagated_pp_flags handles it
+```
+
+### In `fb/csrc/io/video/*.cpp` and `fb/csrc/io/video_reader/*.cpp`:
+```cpp
+// Before
+#include "pytorch/vision/torchvision/csrc/io/decoder/sync_decoder.h"
+
+// After
+#include "pytorch/vision/fb/csrc/io/decoder/sync_decoder.h"
+```
+
+---
+
+## Part 5: Update Python Imports
+
+### 5.1 Create `fb/io/__init__.py`
+
+```python
+# fb/io/__init__.py
+from ._video_opt import (
+    _HAS_CPU_VIDEO_DECODER,
+    _HAS_VIDEO_OPT,
+    _probe_video_from_file,
+    _probe_video_from_memory,
+    _read_video_from_file,
+    _read_video_from_memory,
+    _read_video_timestamps_from_file,
+    _read_video_timestamps_from_memory,
+    Timebase,
+    VideoMetaData,
+)
+from .video_reader import VideoReader
+
+__all__ = [
+    "_read_video_from_file",
+    "_read_video_timestamps_from_file",
+    "_probe_video_from_file",
+    "_read_video_from_memory",
+    "_read_video_timestamps_from_memory",
+    "_probe_video_from_memory",
+    "_HAS_CPU_VIDEO_DECODER",
+    "_HAS_VIDEO_OPT",
+    "VideoMetaData",
+    "Timebase",
+    "VideoReader",
+]
+```
+
+### 5.2 Update `fb/io/_video_opt.py`
+
+```python
+# Change this line:
+from ..extension import _load_library
+
+# To:
+from torchvision.extension import _load_library
+
+# OR create fb/extension.py that loads from fb/BUCK target
+```
+
+### 5.3 Update `fb/io/video_reader.py`
+
+```python
+# Change:
+from ..utils import _log_api_usage_once
+from ._video_deprecation_warning import _raise_video_deprecation_warning
+from ._video_opt import _HAS_CPU_VIDEO_DECODER
+
+# To:
+from torchvision.utils import _log_api_usage_once
+from torchvision.io._video_deprecation_warning import _raise_video_deprecation_warning
+from ._video_opt import _HAS_CPU_VIDEO_DECODER
+```
+
+### 5.4 Update `fb/datasets/video_clip_sampler.py`
+
+```python
+# Change line 8:
+from torchvision.io import _probe_video_from_memory, _read_video_from_memory, Timebase
+
+# To:
+from pytorch.vision.fb.io import _probe_video_from_memory, _read_video_from_memory, Timebase
+
+# OR if using package structure:
+from ..io import _probe_video_from_memory, _read_video_from_memory, Timebase
+```
+
+---
+
+## Part 6: Update torchvision's Public API
+
+### 6.1 Update `torchvision/io/__init__.py`
+
+Remove video_reader exports (or make them conditional):
+
+```python
+# Remove these lines:
+from ._video_opt import (
+    _HAS_CPU_VIDEO_DECODER,
+    _HAS_VIDEO_OPT,
+    _probe_video_from_file,
+    _probe_video_from_memory,
+    _read_video_from_file,
+    _read_video_from_memory,
+    _read_video_timestamps_from_file,
+    _read_video_timestamps_from_memory,
+    Timebase,
+    VideoMetaData,
+)
+from .video_reader import VideoReader
+
+# Replace with stubs that raise RuntimeError in OSS (the backend is no longer available):
+_HAS_CPU_VIDEO_DECODER = False
+_HAS_VIDEO_OPT = False
+
+def _stub_not_available(*args, **kwargs):
+    raise RuntimeError(
+        "video_reader backend is not available in open-source torchvision. "
+        "Use PyAV or TorchCodec instead."
+    )
+
+_probe_video_from_file = _stub_not_available
+_probe_video_from_memory = _stub_not_available
+_read_video_from_file = _stub_not_available
+_read_video_from_memory = _stub_not_available
+_read_video_timestamps_from_file = _stub_not_available
+_read_video_timestamps_from_memory = _stub_not_available
+
+class Timebase:
+    pass  # Keep for compatibility
+
+class VideoMetaData:
+    pass  # Keep for compatibility
+
+class VideoReader:
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "VideoReader with video_reader backend is not available. "
+            "Use backend='pyav' or migrate to TorchCodec."
+        )
+```
+
+### 6.2 Update `torchvision/__init__.py`
+
+```python
+# In set_video_backend(), remove "video_reader" as valid option for OSS:
+def set_video_backend(backend: str) -> None:
+    # OSS version: only pyav
+    if backend not in ("pyav",):
+        raise ValueError(f"Invalid video backend: {backend}. Use 'pyav'.")
+    ...
+```
+
+---
+
+## Part 7: Update External Dependencies
+
+Update these files (BUCK files and one Python helper) to point to the new targets:
+
+| File | Change |
+|------|--------|
+| `cu_tdm/dps/worker/udf/BUCK` | `//pytorch/vision:video_reader` → `//pytorch/vision/fb:video_reader` |
+| `cu_tdm/dps/worker/udf/BUCK` | `//pytorch/vision:decoder_streaming` → `//pytorch/vision/fb:decoder_streaming` |
+| `fblearner/predictor/model/BUCK` | `//pytorch/vision:video_reader_cpu` → `//pytorch/vision/fb:video_reader_cpu` |
+| `mitra/projects/xray_video_integrity/transforms/BUCK` | `//pytorch/vision:video_reader` → `//pytorch/vision/fb:video_reader` |
+| `fblearner/flow/.../video_transformers.py` | Update `get_torch_custom_op_targets()` return value |
+
+---
+
+## Part 8: File Move Commands
+
+```bash
+# Run every command below from fbcode/pytorch/vision. First, create the directory structure:
+mkdir -p fb/csrc/io/decoder
+mkdir -p fb/csrc/io/video
+mkdir -p fb/csrc/io/video_reader
+mkdir -p fb/io
+
+# Move C++ decoder files
+sl mv torchvision/csrc/io/decoder/*.cpp fb/csrc/io/decoder/
+sl mv torchvision/csrc/io/decoder/*.h fb/csrc/io/decoder/
+
+# Move C++ video files
+sl mv torchvision/csrc/io/video/*.cpp fb/csrc/io/video/
+sl mv torchvision/csrc/io/video/*.h fb/csrc/io/video/
+
+# Move C++ video_reader files
+sl mv torchvision/csrc/io/video_reader/*.cpp fb/csrc/io/video_reader/
+sl mv torchvision/csrc/io/video_reader/*.h fb/csrc/io/video_reader/
+
+# Move Python files
+sl mv torchvision/io/_video_opt.py fb/io/
+sl mv torchvision/io/video_reader.py fb/io/
+```
+
+---
+
+## Part 9: Testing Checklist
+
+After migration:
+
+- [ ] `buck build //pytorch/vision/fb:decoder_streaming`
+- [ ] `buck build //pytorch/vision/fb:video_reader`
+- [ ] `buck build //pytorch/vision/fb:video_reader_cpu`
+- [ ] `buck test //pytorch/vision/fb:sync_decoder_test //pytorch/vision/fb:sync_decoder_test_ffmpeg_7_1`
+- [ ] `buck test //pytorch/vision/fb:util_test //pytorch/vision/fb:util_test_ffmpeg_7_1`
+- [ ] `buck build //cu_tdm/dps/worker/udf:udf`
+- [ ] `buck build //fblearner/predictor/model:pytorch_predictor_container`
+- [ ] Test `fb/datasets/video_clip_sampler.py` still works
+- [ ] Verify OSS build no longer includes video_reader code
+
+---
+
+## Summary
+
+| Category | Count |
+|----------|-------|
+| C++ files to move | 35 files (29 decoder + 2 decoder tests + 2 video + 2 video_reader) |
+| Python files to move | 2-3 files |
+| BUCK targets to move | 7 targets |
+| External BUCK deps to update | 4-5 files |
+| New files to create | `fb/io/__init__.py`, update `fb/BUCK` |
+
+**Estimated effort:** 1-2 days for migration + testing
diff --git a/torchvision/csrc/io/decoder/audio_sampler.cpp b/torchvision/csrc/io/decoder/audio_sampler.cpp
deleted file mode 100644
index b158d3438b8..00000000000
--- a/torchvision/csrc/io/decoder/audio_sampler.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-#include "audio_sampler.h"
-#include <c10/util/Logging.h>
-#include "util.h"
-
-#define AVRESAMPLE_MAX_CHANNELS 32
-
-// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24
-namespace ffmpeg {
-
-namespace {
-int preparePlanes(
-    const AudioFormat& fmt,
-    const uint8_t* buffer,
-    int numSamples,
-    uint8_t** planes) {
-  int result;
-  if ((result = av_samples_fill_arrays(
-           planes,
-           nullptr, // linesize is not needed
-           buffer,
-           fmt.channels,
-           numSamples,
-           (AVSampleFormat)fmt.format,
-           1)) < 0) {
-    LOG(ERROR) << "av_samples_fill_arrays failed, err: "
-               << Util::generateErrorDesc(result)
-               << ", numSamples: " << numSamples << ", fmt: " << fmt.format;
-  }
-  return result;
-}
-} // namespace
-
-AudioSampler::AudioSampler(void* logCtx) : logCtx_(logCtx) {}
-
-AudioSampler::~AudioSampler() {
-  cleanUp();
-}
-
-void AudioSampler::shutdown() {
-  cleanUp();
-}
-
-bool AudioSampler::init(const SamplerParameters& params) {
-  cleanUp();
-
-  if (params.type != MediaType::TYPE_AUDIO) {
-    LOG(ERROR) << "Invalid media type, expected MediaType::TYPE_AUDIO";
-    return false;
-  }
-
-#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)
-  AVChannelLayout channel_out;
-  AVChannelLayout channel_in;
-  av_channel_layout_default(&channel_out, params.out.audio.channels);
-  av_channel_layout_default(&channel_in, params.in.audio.channels);
-  int ret = swr_alloc_set_opts2(
-      &swrContext_,
-      &channel_out,
-      (AVSampleFormat)params.out.audio.format,
-      params.out.audio.samples,
-      &channel_in,
-      (AVSampleFormat)params.in.audio.format,
-      params.in.audio.samples,
-      0,
-      logCtx_);
-  if (ret < 0 || swrContext_ == nullptr) {
-    LOG(ERROR) << "Cannot allocate SwrContext";
-    return false;
-  }
-#else
-  swrContext_ = swr_alloc_set_opts(
-      nullptr,
-      av_get_default_channel_layout(params.out.audio.channels),
-      (AVSampleFormat)params.out.audio.format,
-      params.out.audio.samples,
-      av_get_default_channel_layout(params.in.audio.channels),
-      (AVSampleFormat)params.in.audio.format,
-      params.in.audio.samples,
-      0,
-      logCtx_);
-  if (swrContext_ == nullptr) {
-    LOG(ERROR) << "Cannot allocate SwrContext";
-    return false;
-  }
-#endif
-
-  int result;
-  if ((result = swr_init(swrContext_)) < 0) {
-    LOG(ERROR) << "swr_init failed, err: " << Util::generateErrorDesc(result)
-               << ", in -> format: " << params.in.audio.format
-               << ", channels: " << params.in.audio.channels
-               << ", samples: " << params.in.audio.samples
-               << ", out -> format: " << params.out.audio.format
-               << ", channels: " << params.out.audio.channels
-               << ", samples: " << params.out.audio.samples;
-    return false;
-  }
-
-  // set formats
-  params_ = params;
-  return true;
-}
-
-int AudioSampler::numOutputSamples(int inSamples) const {
-  return swr_get_out_samples(swrContext_, inSamples);
-}
-
-int AudioSampler::sample(
-    const uint8_t* inPlanes[],
-    int inNumSamples,
-    ByteStorage* out,
-    int outNumSamples) {
-  int result;
-  int outBufferBytes = av_samples_get_buffer_size(
-      nullptr,
-      params_.out.audio.channels,
-      outNumSamples,
-      (AVSampleFormat)params_.out.audio.format,
-      1);
-
-  if (out) {
-    out->ensure(outBufferBytes);
-
-    uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr};
-
-    if ((result = preparePlanes(
-             params_.out.audio,
-             out->writableTail(),
-             outNumSamples,
-             outPlanes)) < 0) {
-      return result;
-    }
-
-    if ((result = swr_convert(
-             swrContext_,
-             &outPlanes[0],
-             outNumSamples,
-             inPlanes,
-             inNumSamples)) < 0) {
-      LOG(ERROR) << "swr_convert failed, err: "
-                 << Util::generateErrorDesc(result);
-      return result;
-    }
-
-    TORCH_CHECK_LE(result, outNumSamples);
-
-    if (result) {
-      if ((result = av_samples_get_buffer_size(
-               nullptr,
-               params_.out.audio.channels,
-               result,
-               (AVSampleFormat)params_.out.audio.format,
-               1)) >= 0) {
-        out->append(result);
-      } else {
-        LOG(ERROR) << "av_samples_get_buffer_size failed, err: "
-                   << Util::generateErrorDesc(result);
-      }
-    }
-  } else {
-    // allocate a temporary buffer
-    auto* tmpBuffer = static_cast<uint8_t*>(av_malloc(outBufferBytes));
-    if (!tmpBuffer) {
-      LOG(ERROR) << "av_alloc failed, for size: " << outBufferBytes;
-      return -1;
-    }
-
-    uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr};
-
-    if ((result = preparePlanes(
-             params_.out.audio, tmpBuffer, outNumSamples, outPlanes)) < 0) {
-      av_free(tmpBuffer);
-      return result;
-    }
-
-    if ((result = swr_convert(
-             swrContext_,
-             &outPlanes[0],
-             outNumSamples,
-             inPlanes,
-             inNumSamples)) < 0) {
-      LOG(ERROR) << "swr_convert failed, err: "
-                 << Util::generateErrorDesc(result);
-      av_free(tmpBuffer);
-      return result;
-    }
-
-    av_free(tmpBuffer);
-
-    TORCH_CHECK_LE(result, outNumSamples);
-
-    if (result) {
-      result = av_samples_get_buffer_size(
-          nullptr,
-          params_.out.audio.channels,
-          result,
-          (AVSampleFormat)params_.out.audio.format,
-          1);
-    }
-  }
-
-  return result;
-}
-
-int AudioSampler::sample(AVFrame* frame, ByteStorage* out) {
-  const auto outNumSamples = numOutputSamples(frame ? frame->nb_samples : 0);
-
-  if (!outNumSamples) {
-    return 0;
-  }
-
-  return sample(
-      frame ? (const uint8_t**)&frame->data[0] : nullptr,
-      frame ? frame->nb_samples : 0,
-      out,
-      outNumSamples);
-}
-
-int AudioSampler::sample(const ByteStorage* in, ByteStorage* out) {
-  const auto inSampleSize =
-      av_get_bytes_per_sample((AVSampleFormat)params_.in.audio.format);
-
-  const auto inNumSamples =
-      !in ? 0 : in->length() / inSampleSize / params_.in.audio.channels;
-
-  const auto outNumSamples = numOutputSamples(inNumSamples);
-
-  if (!outNumSamples) {
-    return 0;
-  }
-
-  uint8_t* inPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr};
-  int result;
-  if (in &&
-      (result = preparePlanes(
-           params_.in.audio, in->data(), inNumSamples, inPlanes)) < 0) {
-    return result;
-  }
-
-  return sample(
-      in ? (const uint8_t**)inPlanes : nullptr,
-      inNumSamples,
-      out,
-      outNumSamples);
-}
-
-void AudioSampler::cleanUp() {
-  if (swrContext_) {
-    swr_free(&swrContext_);
-    swrContext_ = nullptr;
-  }
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/audio_sampler.h b/torchvision/csrc/io/decoder/audio_sampler.h
deleted file mode 100644
index e105bbe4de2..00000000000
--- a/torchvision/csrc/io/decoder/audio_sampler.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class transcode audio frames from one format into another
- */
-
-class AudioSampler : public MediaSampler {
- public:
-  explicit AudioSampler(void* logCtx);
-  ~AudioSampler() override;
-
-  // MediaSampler overrides
-  bool init(const SamplerParameters& params) override;
-  int sample(const ByteStorage* in, ByteStorage* out) override;
-  void shutdown() override;
-
-  int sample(AVFrame* frame, ByteStorage* out);
-
- private:
-  // close resources
-  void cleanUp();
-  // helper functions for rescaling, cropping, etc.
-  int numOutputSamples(int inSamples) const;
-  int sample(
-      const uint8_t* inPlanes[],
-      int inNumSamples,
-      ByteStorage* out,
-      int outNumSamples);
-
- private:
-  SwrContext* swrContext_{nullptr};
-  void* logCtx_{nullptr};
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/audio_stream.cpp b/torchvision/csrc/io/decoder/audio_stream.cpp
deleted file mode 100644
index c3a003434b8..00000000000
--- a/torchvision/csrc/io/decoder/audio_stream.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-#include "audio_stream.h"
-#include <c10/util/Logging.h>
-#include "util.h"
-
-namespace ffmpeg {
-
-namespace {
-static int get_nb_channels(const AVFrame* frame, const AVCodecContext* codec) {
-#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)
-  return frame ? frame->ch_layout.nb_channels : codec->ch_layout.nb_channels;
-#else
-  return frame ? frame->channels : codec->channels;
-#endif
-}
-
-bool operator==(const AudioFormat& x, const AVFrame& y) {
-  return x.samples == static_cast<size_t>(y.sample_rate) &&
-      x.channels == static_cast<size_t>(get_nb_channels(&y, nullptr)) &&
-      x.format == y.format;
-}
-
-bool operator==(const AudioFormat& x, const AVCodecContext& y) {
-  return x.samples == static_cast<size_t>(y.sample_rate) &&
-      x.channels == static_cast<size_t>(get_nb_channels(nullptr, &y)) &&
-      x.format == y.sample_fmt;
-}
-
-AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) {
-  x.samples = y.sample_rate;
-  x.channels = get_nb_channels(&y, nullptr);
-  x.format = y.format;
-  return x;
-}
-
-AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) {
-  x.samples = y.sample_rate;
-  x.channels = get_nb_channels(nullptr, &y);
-  x.format = y.sample_fmt;
-  return x;
-}
-} // namespace
-
-AudioStream::AudioStream(
-    AVFormatContext* inputCtx,
-    int index,
-    bool convertPtsToWallTime,
-    const AudioFormat& format)
-    : Stream(
-          inputCtx,
-          MediaFormat::makeMediaFormat(format, index),
-          convertPtsToWallTime,
-          0) {}
-
-AudioStream::~AudioStream() {
-  if (sampler_) {
-    sampler_->shutdown();
-    sampler_.reset();
-  }
-}
-
-int AudioStream::initFormat() {
-  // set output format
-  if (format_.format.audio.samples == 0) {
-    format_.format.audio.samples = codecCtx_->sample_rate;
-  }
-#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)
-  if (format_.format.audio.channels == 0) {
-    format_.format.audio.channels = codecCtx_->ch_layout.nb_channels;
-  }
-#else
-  if (format_.format.audio.channels == 0) {
-    format_.format.audio.channels = codecCtx_->channels;
-  }
-#endif
-  if (format_.format.audio.format == AV_SAMPLE_FMT_NONE) {
-    format_.format.audio.format = codecCtx_->sample_fmt;
-  }
-
-  return format_.format.audio.samples != 0 &&
-          format_.format.audio.channels != 0 &&
-          format_.format.audio.format != AV_SAMPLE_FMT_NONE
-      ? 0
-      : -1;
-}
-
-// copies audio sample bytes via swr_convert call in audio_sampler.cpp
-int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
-  if (!sampler_) {
-    sampler_ = std::make_unique<AudioSampler>(codecCtx_);
-  }
-  // check if input format gets changed
-  if (flush ? !(sampler_->getInputFormat().audio == *codecCtx_)
-            : !(sampler_->getInputFormat().audio == *frame_)) {
-    // - reinit sampler
-    SamplerParameters params;
-    params.type = format_.type;
-    params.out = format_.format;
-    params.in = FormatUnion();
-    flush ? toAudioFormat(params.in.audio, *codecCtx_)
-          : toAudioFormat(params.in.audio, *frame_);
-    if (!sampler_->init(params)) {
-      return -1;
-    }
-
-    VLOG(1) << "Set input audio sampler format"
-            << ", samples: " << params.in.audio.samples
-            << ", channels: " << params.in.audio.channels
-            << ", format: " << params.in.audio.format
-            << " : output audio sampler format"
-            << ", samples: " << format_.format.audio.samples
-            << ", channels: " << format_.format.audio.channels
-            << ", format: " << format_.format.audio.format;
-  }
-  // calls to a sampler that converts the audio samples and copies them to the
-  // out buffer via ffmpeg::swr_convert
-  return sampler_->sample(flush ? nullptr : frame_, out);
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/audio_stream.h b/torchvision/csrc/io/decoder/audio_stream.h
deleted file mode 100644
index 2d6457b68f5..00000000000
--- a/torchvision/csrc/io/decoder/audio_stream.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#pragma once
-
-#include "audio_sampler.h"
-#include "stream.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode one audio stream.
- */
-
-class AudioStream : public Stream {
- public:
-  AudioStream(
-      AVFormatContext* inputCtx,
-      int index,
-      bool convertPtsToWallTime,
-      const AudioFormat& format);
-  ~AudioStream() override;
-
- private:
-  int initFormat() override;
-  int copyFrameBytes(ByteStorage* out, bool flush) override;
-
- private:
-  std::unique_ptr<AudioSampler> sampler_;
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/cc_stream.cpp b/torchvision/csrc/io/decoder/cc_stream.cpp
deleted file mode 100644
index 89174c396fd..00000000000
--- a/torchvision/csrc/io/decoder/cc_stream.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "cc_stream.h"
-
-namespace ffmpeg {
-
-CCStream::CCStream(
-    AVFormatContext* inputCtx,
-    int index,
-    bool convertPtsToWallTime,
-    const SubtitleFormat& format)
-    : SubtitleStream(inputCtx, index, convertPtsToWallTime, format) {
-  format_.type = TYPE_CC;
-}
-
-AVCodec* CCStream::findCodec(AVCodecParameters* params) {
-  if (params->codec_id == AV_CODEC_ID_BIN_DATA &&
-      params->codec_type == AVMEDIA_TYPE_DATA) {
-    // obtain subtitles codec
-    params->codec_id = AV_CODEC_ID_MOV_TEXT;
-    params->codec_type = AVMEDIA_TYPE_SUBTITLE;
-  }
-  return Stream::findCodec(params);
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/cc_stream.h b/torchvision/csrc/io/decoder/cc_stream.h
deleted file mode 100644
index 3a1d169f014..00000000000
--- a/torchvision/csrc/io/decoder/cc_stream.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-
-#include "subtitle_stream.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode one closed captions stream.
- */
-class CCStream : public SubtitleStream {
- public:
-  CCStream(
-      AVFormatContext* inputCtx,
-      int index,
-      bool convertPtsToWallTime,
-      const SubtitleFormat& format);
-
- private:
-  AVCodec* findCodec(AVCodecParameters* params) override;
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp
deleted file mode 100644
index 7221445840e..00000000000
--- a/torchvision/csrc/io/decoder/decoder.cpp
+++ /dev/null
@@ -1,764 +0,0 @@
-#include "decoder.h"
-#include <c10/util/Logging.h>
-#include <libavutil/avutil.h>
-#include <future>
-#include <iostream>
-#include <mutex>
-#include "audio_stream.h"
-#include "cc_stream.h"
-#include "subtitle_stream.h"
-#include "util.h"
-#include "video_stream.h"
-
-namespace ffmpeg {
-
-namespace {
-
-constexpr size_t kIoBufferSize = 96 * 1024;
-constexpr size_t kIoPaddingSize = AV_INPUT_BUFFER_PADDING_SIZE;
-constexpr size_t kLogBufferSize = 1024;
-
-bool mapFfmpegType(AVMediaType media, MediaType* type) {
-  switch (media) {
-    case AVMEDIA_TYPE_AUDIO:
-      *type = TYPE_AUDIO;
-      return true;
-    case AVMEDIA_TYPE_VIDEO:
-      *type = TYPE_VIDEO;
-      return true;
-    case AVMEDIA_TYPE_SUBTITLE:
-      *type = TYPE_SUBTITLE;
-      return true;
-    case AVMEDIA_TYPE_DATA:
-      *type = TYPE_CC;
-      return true;
-    default:
-      return false;
-  }
-}
-
-std::unique_ptr<Stream> createStream(
-    MediaType type,
-    AVFormatContext* ctx,
-    int idx,
-    bool convertPtsToWallTime,
-    const FormatUnion& format,
-    int64_t loggingUuid) {
-  switch (type) {
-    case TYPE_AUDIO:
-      return std::make_unique<AudioStream>(
-          ctx, idx, convertPtsToWallTime, format.audio);
-    case TYPE_VIDEO:
-      return std::make_unique<VideoStream>(
-          // negative loggingUuid indicates video streams.
-          ctx,
-          idx,
-          convertPtsToWallTime,
-          format.video,
-          -loggingUuid);
-    case TYPE_SUBTITLE:
-      return std::make_unique<SubtitleStream>(
-          ctx, idx, convertPtsToWallTime, format.subtitle);
-    case TYPE_CC:
-      return std::make_unique<CCStream>(
-          ctx, idx, convertPtsToWallTime, format.subtitle);
-    default:
-      return nullptr;
-  }
-}
-
-} // Namespace
-
-/* static */
-void Decoder::logFunction(void* avcl, int level, const char* cfmt, va_list vl) {
-  if (!avcl) {
-    // Nothing can be done here
-    return;
-  }
-
-  AVClass* avclass = *reinterpret_cast<AVClass**>(avcl);
-  if (!avclass) {
-    // Nothing can be done here
-    return;
-  }
-  Decoder* decoder = nullptr;
-  if (strcmp(avclass->class_name, "AVFormatContext") == 0) {
-    AVFormatContext* context = reinterpret_cast<AVFormatContext*>(avcl);
-    if (context) {
-      decoder = reinterpret_cast<Decoder*>(context->opaque);
-    }
-  } else if (strcmp(avclass->class_name, "AVCodecContext") == 0) {
-    AVCodecContext* context = reinterpret_cast<AVCodecContext*>(avcl);
-    if (context) {
-      decoder = reinterpret_cast<Decoder*>(context->opaque);
-    }
-  } else if (strcmp(avclass->class_name, "AVIOContext") == 0) {
-    AVIOContext* context = reinterpret_cast<AVIOContext*>(avcl);
-    // only if opaque was assigned to Decoder pointer
-    if (context && context->read_packet == Decoder::readFunction) {
-      decoder = reinterpret_cast<Decoder*>(context->opaque);
-    }
-  } else if (strcmp(avclass->class_name, "SWResampler") == 0) {
-    // expect AVCodecContext as parent
-    if (avclass->parent_log_context_offset) {
-      AVClass** parent =
-          *(AVClass***)(((uint8_t*)avcl) + avclass->parent_log_context_offset);
-      AVCodecContext* context = reinterpret_cast<AVCodecContext*>(parent);
-      if (context) {
-        decoder = reinterpret_cast<Decoder*>(context->opaque);
-      }
-    }
-  } else if (strcmp(avclass->class_name, "SWScaler") == 0) {
-    // cannot find a way to pass context pointer through SwsContext struct
-  } else {
-    VLOG(2) << "Unknown context class: " << avclass->class_name;
-  }
-
-  if (decoder != nullptr && decoder->enableLogLevel(level)) {
-    char buf[kLogBufferSize] = {0};
-    // Format the line
-    int* prefix = decoder->getPrintPrefix();
-    *prefix = 1;
-    av_log_format_line(avcl, level, cfmt, vl, buf, sizeof(buf) - 1, prefix);
-    // pass message to the decoder instance
-    std::string msg(buf);
-    decoder->logCallback(level, msg);
-  }
-}
-
-bool Decoder::enableLogLevel(int level) const {
-  return ssize_t(level) <= params_.logLevel;
-}
-
-void Decoder::logCallback(int level, const std::string& message) {
-  LOG(INFO) << "Msg, uuid=" << params_.loggingUuid << " level=" << level
-            << " msg=" << message;
-}
-
-/* static */
-int Decoder::shutdownFunction(void* ctx) {
-  Decoder* decoder = (Decoder*)ctx;
-  if (decoder == nullptr) {
-    return 1;
-  }
-  return decoder->shutdownCallback();
-}
-
-int Decoder::shutdownCallback() {
-  return interrupted_ ? 1 : 0;
-}
-
-/* static */
-int Decoder::readFunction(void* opaque, uint8_t* buf, int size) {
-  Decoder* decoder = reinterpret_cast<Decoder*>(opaque);
-  if (decoder == nullptr) {
-    return 0;
-  }
-  int bytesRead = decoder->readCallback(buf, size);
-  return bytesRead == 0 ? AVERROR_EOF : bytesRead;
-}
-
-/* static */
-int64_t Decoder::seekFunction(void* opaque, int64_t offset, int whence) {
-  Decoder* decoder = reinterpret_cast<Decoder*>(opaque);
-  if (decoder == nullptr) {
-    return -1;
-  }
-  return decoder->seekCallback(offset, whence);
-}
-
-int Decoder::readCallback(uint8_t* buf, int size) {
-  return seekableBuffer_.read(buf, size, params_.timeoutMs);
-}
-
-int64_t Decoder::seekCallback(int64_t offset, int whence) {
-  return seekableBuffer_.seek(offset, whence, params_.timeoutMs);
-}
-
-/* static */
-void Decoder::initOnce() {
-  static std::once_flag flagInit;
-  std::call_once(flagInit, []() {
-#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0
-    av_register_all();
-    avcodec_register_all();
-#endif
-    avformat_network_init();
-    av_log_set_callback(Decoder::logFunction);
-    av_log_set_level(AV_LOG_ERROR);
-    VLOG(1) << "Registered ffmpeg libs";
-  });
-}
-
-Decoder::Decoder() {
-  initOnce();
-}
-
-Decoder::~Decoder() {
-  cleanUp();
-}
-
-// Initialise the format context that holds information about the container and
-// fill it with minimal information about the format (codecs are not opened
-// here). Function reads in information about the streams from the container
-// into inputCtx and then passes it to decoder::openStreams. Finally, if seek is
-// specified within the decoder parameters, it seeks into the correct frame
-// (note, the seek defined here is "precise" seek).
-bool Decoder::init(
-    const DecoderParameters& params,
-    DecoderInCallback&& in,
-    std::vector<DecoderMetadata>* metadata) {
-  cleanUp();
-
-  if ((params.uri.empty() || in) && (!params.uri.empty() || !in)) {
-    LOG(ERROR)
-        << "uuid=" << params_.loggingUuid
-        << " either external URI gets provided or explicit input callback";
-    return false;
-  }
-
-  // set callback and params
-  params_ = params;
-
-  if (!(inputCtx_ = avformat_alloc_context())) {
-    LOG(ERROR) << "uuid=" << params_.loggingUuid
-               << " cannot allocate format context";
-    return false;
-  }
-
-  AVInputFormat* fmt = nullptr;
-  int result = 0;
-  if (in) {
-    ImageType type = ImageType::UNKNOWN;
-    if ((result = seekableBuffer_.init(
-             std::forward<DecoderInCallback>(in),
-             params_.timeoutMs,
-             params_.maxSeekableBytes,
-             params_.isImage ? &type : nullptr)) < 0) {
-      LOG(ERROR) << "uuid=" << params_.loggingUuid
-                 << " can't initiate seekable buffer";
-      cleanUp();
-      return false;
-    }
-
-    if (params_.isImage) {
-      const char* fmtName = "image2";
-      switch (type) {
-        case ImageType::JPEG:
-          fmtName = "jpeg_pipe";
-          break;
-        case ImageType::PNG:
-          fmtName = "png_pipe";
-          break;
-        case ImageType::TIFF:
-          fmtName = "tiff_pipe";
-          break;
-        default:
-          break;
-      }
-
-      fmt = (AVInputFormat*)av_find_input_format(fmtName);
-    }
-
-    const size_t avioCtxBufferSize = kIoBufferSize;
-    uint8_t* avioCtxBuffer =
-        (uint8_t*)av_malloc(avioCtxBufferSize + kIoPaddingSize);
-    if (!avioCtxBuffer) {
-      LOG(ERROR) << "uuid=" << params_.loggingUuid
-                 << " av_malloc cannot allocate " << avioCtxBufferSize
-                 << " bytes";
-      cleanUp();
-      return false;
-    }
-
-    if (!(avioCtx_ = avio_alloc_context(
-              avioCtxBuffer,
-              avioCtxBufferSize,
-              0,
-              reinterpret_cast<void*>(this),
-              &Decoder::readFunction,
-              nullptr,
-              result == 1 ? &Decoder::seekFunction : nullptr))) {
-      LOG(ERROR) << "uuid=" << params_.loggingUuid
-                 << " avio_alloc_context failed";
-      av_free(avioCtxBuffer);
-      cleanUp();
-      return false;
-    }
-
-    avioCtx_->max_packet_size = params.maxEncodedBufferSize;
-
-    inputCtx_->pb = avioCtx_;
-    inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO;
-  }
-
-  inputCtx_->opaque = reinterpret_cast<void*>(this);
-  inputCtx_->interrupt_callback.callback = Decoder::shutdownFunction;
-  inputCtx_->interrupt_callback.opaque = reinterpret_cast<void*>(this);
-
-  // add network timeout
-  inputCtx_->flags |= AVFMT_FLAG_NONBLOCK;
-
-  AVDictionary* options = nullptr;
-  if (params_.listen) {
-    av_dict_set_int(&options, "listen", 1, 0);
-  }
-  if (params_.timeoutMs > 0) {
-    av_dict_set_int(&options, "analyzeduration", params_.timeoutMs * 1000, 0);
-    av_dict_set_int(&options, "stimeout", params_.timeoutMs * 1000, 0);
-    av_dict_set_int(&options, "rw_timeout", params_.timeoutMs * 1000, 0);
-    if (!params_.tlsCertFile.empty()) {
-      av_dict_set(&options, "cert_file", params_.tlsCertFile.data(), 0);
-    }
-    if (!params_.tlsKeyFile.empty()) {
-      av_dict_set(&options, "key_file", params_.tlsKeyFile.data(), 0);
-    }
-  }
-
-  av_dict_set_int(&options, "probesize", params_.probeSize, 0);
-
-  interrupted_ = false;
-
-  // ffmpeg avformat_open_input call can hang if media source doesn't respond
-  // set a guard for handle such situations, if requested
-  std::promise<bool> p;
-  std::future<bool> f = p.get_future();
-  std::unique_ptr<std::thread> guard;
-  if (params_.preventStaleness) {
-    guard = std::make_unique<std::thread>([&f, this]() {
-      auto timeout = std::chrono::milliseconds(params_.timeoutMs);
-      if (std::future_status::timeout == f.wait_for(timeout)) {
-        LOG(ERROR) << "uuid=" << params_.loggingUuid
-                   << " cannot open stream within " << params_.timeoutMs
-                   << " ms";
-        interrupted_ = true;
-      }
-    });
-  }
-
-  if (fmt) {
-    result = avformat_open_input(&inputCtx_, nullptr, fmt, &options);
-  } else {
-    result =
-        avformat_open_input(&inputCtx_, params_.uri.c_str(), nullptr, &options);
-  }
-
-  av_dict_free(&options);
-
-  if (guard) {
-    p.set_value(true);
-    guard->join();
-    guard.reset();
-  }
-
-  if (result < 0 || interrupted_) {
-    LOG(ERROR) << "uuid=" << params_.loggingUuid
-               << " avformat_open_input failed, error="
-               << Util::generateErrorDesc(result);
-    cleanUp();
-    return false;
-  }
-
-  result = avformat_find_stream_info(inputCtx_, nullptr);
-
-  if (result < 0) {
-    LOG(ERROR) << "uuid=" << params_.loggingUuid
-               << " avformat_find_stream_info failed, error="
-               << Util::generateErrorDesc(result);
-    cleanUp();
-    return false;
-  }
-
-  if (!openStreams(metadata)) {
-    LOG(ERROR) << "uuid=" << params_.loggingUuid << " cannot activate streams";
-    cleanUp();
-    return false;
-  }
-  // SyncDecoder inherits Decoder which would override onInit.
-  onInit();
-
-  if (params.startOffset != 0) {
-    auto offset = params.startOffset <= params.seekAccuracy
-        ? 0
-        : params.startOffset - params.seekAccuracy;
-
-    av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD);
-  }
-
-  for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) {
-    if (
-#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0
-        inputCtx_->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO
-#else // FFMPEG 4.0+
-        inputCtx_->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO
-#endif
-        && inputCtx_->streams[i]->duration > 0) {
-      // There is at least two 1/r_frame_rates from the frame before the last
-      // one until the video duration, let's prefer to set duration after the
-      // frame before the last one, but as early as possible
-      double correction = 2 * inputCtx_->streams[i]->r_frame_rate.den /
-              (double)inputCtx_->streams[i]->r_frame_rate.num -
-          1 / (double)AV_TIME_BASE;
-      videoDurationMs_ = 1000 * inputCtx_->streams[i]->duration *
-              inputCtx_->streams[i]->time_base.num /
-              (double)inputCtx_->streams[i]->time_base.den -
-          1000 * correction;
-      break;
-    }
-  }
-
-  VLOG(1) << "Decoder initialized, log level: " << params_.logLevel;
-  VLOG(1) << "Video duration: " << videoDurationMs_;
-  return true;
-}
-
-// open appropriate CODEC for every type of stream and move it to the class
-// variable `streams_` and make sure it is in range for decoding
-bool Decoder::openStreams(std::vector<DecoderMetadata>* metadata) {
-  for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) {
-    // - find the corespondent format at params_.formats set
-    MediaFormat format;
-#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0
-    const auto media = inputCtx_->streams[i]->codec->codec_type;
-#else // FFMPEG 4.0+
-    const auto media = inputCtx_->streams[i]->codecpar->codec_type;
-#endif
-    if (!mapFfmpegType(media, &format.type)) {
-      VLOG(1) << "Stream media: " << media << " at index " << i
-              << " gets ignored, unknown type";
-
-      continue; // unsupported type
-    }
-
-    // check format
-    auto it = params_.formats.find(format);
-    if (it == params_.formats.end()) {
-      VLOG(1) << "Stream type: " << format.type << " at index: " << i
-              << " gets ignored, caller is not interested";
-      continue; // clients don't care about this media format
-    }
-
-    // do we have stream of this type?
-    auto stream = findByType(format);
-
-    // should we process this stream?
-
-    if (it->stream == -2 || // all streams of this type are welcome
-        (!stream && (it->stream == -1 || it->stream == i))) { // new stream
-      VLOG(1) << "Stream type: " << format.type << " found, at index: " << i;
-      auto stream_2 = createStream(
-          format.type,
-          inputCtx_,
-          i,
-          params_.convertPtsToWallTime,
-          it->format,
-          params_.loggingUuid);
-      CHECK(stream_2);
-      if (stream_2->openCodec(metadata, params_.numThreads) < 0) {
-        LOG(ERROR) << "uuid=" << params_.loggingUuid
-                   << " open codec failed, stream_idx=" << i;
-        return false;
-      }
-      streams_.emplace(i, std::move(stream_2));
-      inRange_.set(i, true);
-    }
-  }
-
-  return true;
-}
-
-void Decoder::shutdown() {
-  cleanUp();
-}
-
-void Decoder::interrupt() {
-  interrupted_ = true;
-}
-
-void Decoder::cleanUp() {
-  if (!interrupted_) {
-    interrupted_ = true;
-  }
-
-  if (inputCtx_) {
-    for (auto& stream : streams_) {
-      // Drain stream buffers.
-      DecoderOutputMessage msg;
-      while (msg.payload = nullptr, stream.second->flush(&msg, true) > 0) {
-      }
-      stream.second.reset();
-    }
-    streams_.clear();
-    avformat_close_input(&inputCtx_);
-  }
-  if (avioCtx_) {
-    av_freep(&avioCtx_->buffer);
-    av_freep(&avioCtx_);
-  }
-
-  // reset callback
-  seekableBuffer_.shutdown();
-}
-
-// function does actual work, derived class calls it in working thread
-// periodically. On success method returns 0, ENODATA on EOF, ETIMEDOUT if
-// no frames got decoded in the specified timeout time, AVERROR_BUFFER_TOO_SMALL
-// when unable to allocate packet and error on unrecoverable error
-int Decoder::getFrame(size_t workingTimeInMs) {
-  if (inRange_.none()) {
-    return ENODATA;
-  }
-  // decode frames until cache is full and leave thread
-  // once decode() method gets called and grab some bytes
-  // run this method again
-  // init package
-  // update 03/22: moving memory management to ffmpeg
-  AVPacket* avPacket;
-  avPacket = av_packet_alloc();
-  if (avPacket == nullptr) {
-    LOG(ERROR) << "uuid=" << params_.loggingUuid
-               << " decoder as not able to allocate the packet.";
-    return AVERROR_BUFFER_TOO_SMALL;
-  }
-  avPacket->data = nullptr;
-  avPacket->size = 0;
-
-  auto end = std::chrono::steady_clock::now() +
-      std::chrono::milliseconds(workingTimeInMs);
-  // return true if elapsed time less than timeout
-  auto watcher = [end]() -> bool {
-    return std::chrono::steady_clock::now() <= end;
-  };
-
-  int result = 0;
-  size_t decodingErrors = 0;
-  bool decodedFrame = false;
-  while (!interrupted_ && inRange_.any() && !decodedFrame) {
-    if (watcher() == false) {
-      LOG(ERROR) << "uuid=" << params_.loggingUuid << " hit ETIMEDOUT";
-      result = ETIMEDOUT;
-      break;
-    }
-    result = av_read_frame(inputCtx_, avPacket);
-    if (result == AVERROR(EAGAIN)) {
-      VLOG(4) << "Decoder is busy...";
-      std::this_thread::yield();
-      result = 0; // reset error, EAGAIN is not an error at all
-      // reset the packet to default settings
-      av_packet_unref(avPacket);
-      continue;
-    } else if (result == AVERROR_EOF) {
-      flushStreams();
-      VLOG(1) << "End of stream";
-      result = ENODATA;
-      break;
-    } else if (
-        result == AVERROR(EPERM) && params_.skipOperationNotPermittedPackets) {
-      // reset error, lets skip packets with EPERM
-      result = 0;
-      // reset the packet to default settings
-      av_packet_unref(avPacket);
-      continue;
-    } else if (result < 0) {
-      flushStreams();
-      LOG(ERROR) << "uuid=" << params_.loggingUuid
-                 << " error detected: " << Util::generateErrorDesc(result);
-      break;
-    }
-
-    // get stream; if stream cannot be found reset the packet to
-    // default settings
-    auto stream = findByIndex(avPacket->stream_index);
-    if (stream == nullptr || !inRange_.test(stream->getIndex())) {
-      av_packet_unref(avPacket);
-      continue;
-    }
-
-    size_t numConsecutiveNoBytes = 0;
-    // it can be only partial decoding of the package bytes
-    do {
-      // decode package
-      bool gotFrame = false;
-      bool hasMsg = false;
-      // packet either got consumed completely or not at all
-      if ((result = processPacket(
-               stream, avPacket, &gotFrame, &hasMsg, params_.fastSeek)) < 0) {
-        LOG(ERROR) << "uuid=" << params_.loggingUuid
-                   << " processPacket failed with code: " << result;
-        break;
-      }
-
-      if (!gotFrame && params_.maxProcessNoBytes != 0 &&
-          ++numConsecutiveNoBytes > params_.maxProcessNoBytes) {
-        LOG(ERROR) << "uuid=" << params_.loggingUuid
-                   << " exceeding max amount of consecutive no bytes";
-        break;
-      }
-      if (result > 0) {
-        numConsecutiveNoBytes = 0;
-      }
-
-      decodedFrame |= hasMsg;
-    } while (result == 0);
-
-    // post loop check
-    if (result < 0) {
-      if (params_.maxPackageErrors != 0 && // check errors
-          ++decodingErrors >= params_.maxPackageErrors) { // reached the limit
-        LOG(ERROR) << "uuid=" << params_.loggingUuid
-                   << " exceeding max amount of consecutive package errors";
-        break;
-      }
-    } else {
-      decodingErrors = 0; // reset on success
-    }
-
-    result = 0;
-
-    av_packet_unref(avPacket);
-
-    if (params_.uniformSampling > 1) {
-      if (doSeek_) {
-        double duration =
-            videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration;
-        double step =
-            (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1));
-        avformat_seek_file(
-            inputCtx_,
-            -1,
-            static_cast<int64_t>(step * kFramesDecoded_) + 1,
-            static_cast<int64_t>(step * (kFramesDecoded_ + 1)),
-            static_cast<int64_t>(step * (kFramesDecoded_ + 1)),
-            0);
-        ++kFramesDecoded_;
-        doSeek_ = false;
-      }
-    }
-  }
-
-  av_packet_free(&avPacket);
-  VLOG(2) << "Interrupted loop" << ", interrupted_ " << interrupted_
-          << ", inRange_.any() " << inRange_.any() << ", decodedFrame "
-          << decodedFrame << ", result " << result;
-
-  // loop can be terminated, either by:
-  // 1. explicitly interrupted
-  // 3. unrecoverable error or ENODATA (end of stream) or ETIMEDOUT (timeout)
-  // 4. decoded frames pts are out of the specified range
-  // 5. success decoded frame
-  if (interrupted_) {
-    return EINTR;
-  }
-  if (result != 0) {
-    return result;
-  }
-  if (inRange_.none()) {
-    return ENODATA;
-  }
-  return 0;
-}
-
-// find stream by stream index
-Stream* Decoder::findByIndex(int streamIndex) const {
-  auto it = streams_.find(streamIndex);
-  return it != streams_.end() ? it->second.get() : nullptr;
-}
-
-// find stream by type; note finds only the first stream of a given type
-Stream* Decoder::findByType(const MediaFormat& format) const {
-  for (auto& stream : streams_) {
-    if (stream.second->getMediaFormat().type == format.type) {
-      return stream.second.get();
-    }
-  }
-  return nullptr;
-}
-
-// given the stream and packet, decode the frame buffers into the
-// DecoderOutputMessage data structure via stream::decodePacket function.
-int Decoder::processPacket(
-    Stream* stream,
-    AVPacket* packet,
-    bool* gotFrame,
-    bool* hasMsg,
-    bool fastSeek) {
-  // decode package
-  int result;
-  DecoderOutputMessage msg;
-  msg.payload = params_.headerOnly ? nullptr : createByteStorage(0);
-  *hasMsg = false;
-  if ((result = stream->decodePacket(
-           packet, &msg, params_.headerOnly, gotFrame)) >= 0 &&
-      *gotFrame) {
-    // check end offset
-    bool endInRange =
-        params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
-    inRange_.set(stream->getIndex(), endInRange);
-    // if fastseek is enabled, we're returning the first
-    // frame that we decode after (potential) seek.
-    // By default, we perform accurate seek to the closest
-    // following frame
-    bool startCondition = true;
-    if (!fastSeek) {
-      startCondition = msg.header.pts >= params_.startOffset;
-    }
-    if (endInRange && startCondition) {
-      *hasMsg = pushMsg(std::move(msg));
-    }
-  }
-  return result;
-}
-
-bool Decoder::pushMsg(DecoderOutputMessage&& msg) {
-  pastDecodedPTS_ = currentDecodedPTS_;
-  currentDecodedPTS_ = msg.header.pts;
-
-  if (params_.uniformSampling <= 1) {
-    push(std::move(msg));
-    return true;
-  }
-
-  double duration =
-      videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration;
-  double step =
-      (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1));
-  if (pastDecodedPTS_ < step * kFramesDecoded_ &&
-      step * kFramesDecoded_ <= currentDecodedPTS_) {
-    push(std::move(msg));
-    doSeek_ = true;
-    return true;
-  }
-
-  return false;
-}
-
-void Decoder::flushStreams() {
-  VLOG(1) << "Flushing streams...";
-  for (auto& stream : streams_) {
-    DecoderOutputMessage msg;
-    while (msg.payload = (params_.headerOnly ? nullptr : createByteStorage(0)),
-           stream.second->flush(&msg, params_.headerOnly) > 0) {
-      // check end offset
-      bool endInRange =
-          params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
-      inRange_.set(stream.second->getIndex(), endInRange);
-      if (endInRange && msg.header.pts >= params_.startOffset) {
-        pushMsg(std::move(msg));
-      } else {
-        msg.payload.reset();
-      }
-    }
-  }
-}
-
-int Decoder::decode_all(const DecoderOutCallback& callback) {
-  int result;
-  do {
-    DecoderOutputMessage out;
-    if (0 == (result = decode(&out, params_.timeoutMs))) {
-      callback(std::move(out));
-    }
-  } while (result == 0);
-  return result;
-}
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/decoder.h b/torchvision/csrc/io/decoder/decoder.h
deleted file mode 100644
index 172a011f93e..00000000000
--- a/torchvision/csrc/io/decoder/decoder.h
+++ /dev/null
@@ -1,100 +0,0 @@
-#pragma once
-
-#include <bitset>
-#include <unordered_map>
-#include "seekable_buffer.h"
-#include "stream.h"
-
-#if defined(_MSC_VER)
-#include <BaseTsd.h>
-using ssize_t = SSIZE_T;
-#endif
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode media streams.
- * Media bytes can be explicitly provided through read-callback
- * or fetched internally by FFMPEG library
- */
-class Decoder : public MediaDecoder {
- public:
-  Decoder();
-  ~Decoder() override;
-
-  // MediaDecoder overrides
-  bool init(
-      const DecoderParameters& params,
-      DecoderInCallback&& in,
-      std::vector<DecoderMetadata>* metadata) override;
-  int decode_all(const DecoderOutCallback& callback) override;
-  void shutdown() override;
-  void interrupt() override;
-
- protected:
-  // function does actual work, derived class calls it in working thread
-  // periodically. On success method returns 0, ENOADATA on EOF, ETIMEDOUT if
-  // no frames got decoded in the specified timeout time, and error on
-  // unrecoverable error.
-  int getFrame(size_t workingTimeInMs = 100);
-
-  // Derived class must override method and consume the provided message
-  virtual void push(DecoderOutputMessage&& buffer) = 0;
-
-  // Fires on init call
-  virtual void onInit() {}
-
- public:
-  // C-style FFMPEG API requires C/static methods for callbacks
-  static void logFunction(void* avcl, int level, const char* cfmt, va_list vl);
-  static int shutdownFunction(void* ctx);
-  static int readFunction(void* opaque, uint8_t* buf, int size);
-  static int64_t seekFunction(void* opaque, int64_t offset, int whence);
-  // can be called by any classes or API
-  static void initOnce();
-
-  int* getPrintPrefix() {
-    return &printPrefix;
-  }
-  double videoDurationMs_ = -1;
-
- private:
-  // mark below function for a proper invocation
-  bool enableLogLevel(int level) const;
-  void logCallback(int level, const std::string& message);
-  int readCallback(uint8_t* buf, int size);
-  int64_t seekCallback(int64_t offset, int whence);
-  int shutdownCallback();
-
-  bool openStreams(std::vector<DecoderMetadata>* metadata);
-  Stream* findByIndex(int streamIndex) const;
-  Stream* findByType(const MediaFormat& format) const;
-  int processPacket(
-      Stream* stream,
-      AVPacket* packet,
-      bool* gotFrame,
-      bool* hasMsg,
-      bool fastSeek = false);
-  void flushStreams();
-  void cleanUp();
-  bool pushMsg(DecoderOutputMessage&&
-                   msg); // returns whether frame is passed to downstream
-
- protected:
-  DecoderParameters params_;
-
- private:
-  SeekableBuffer seekableBuffer_;
-  int printPrefix{1};
-
-  std::atomic<bool> interrupted_{false};
-  AVFormatContext* inputCtx_{nullptr};
-  AVIOContext* avioCtx_{nullptr};
-  std::unordered_map<ssize_t, std::unique_ptr<Stream>> streams_;
-  std::bitset<64> inRange_;
-  int kFramesDecoded_{0};
-  int64_t pastDecodedPTS_{-1};
-  int64_t currentDecodedPTS_{-1};
-  bool doSeek_{false};
-};
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h
deleted file mode 100644
index d2dc5c7935b..00000000000
--- a/torchvision/csrc/io/decoder/defs.h
+++ /dev/null
@@ -1,415 +0,0 @@
-#pragma once
-
-#include <array>
-#include <functional>
-#include <memory>
-#include <set>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-extern "C" {
-#include <libavcodec/avcodec.h>
-#include <libavformat/avformat.h>
-#include <libavformat/avio.h>
-#include <libavutil/avutil.h>
-#include <libavutil/imgutils.h>
-#include <libswresample/swresample.h>
-#include "libswscale/swscale.h"
-}
-
-namespace ffmpeg {
-
-// bit mask of formats, keep them in form 2^n
-enum MediaType : size_t {
-  TYPE_AUDIO = 1,
-  TYPE_VIDEO = 2,
-  TYPE_SUBTITLE = 4,
-  TYPE_CC = 8, // closed captions from transport streams
-};
-
-// audio
-struct AudioFormat {
-  // fields are initialized for the auto detection
-  // caller can specify some/all of field values if specific output is desirable
-  bool operator==(const AudioFormat& x) const {
-    return x.format == format && x.samples == samples && x.channels == channels;
-  }
-
-  size_t samples{0}; // number samples per second (frequency)
-  size_t channels{0}; // number of channels
-  long format{-1}; // AVSampleFormat, auto AV_SAMPLE_FMT_NONE
-  size_t padding[2];
-  // -- alignment 40 bytes
-};
-
-// video
-struct VideoFormat {
-  // fields are initialized for the auto detection
-  // caller can specify some/all of field values if specific output is desirable
-  bool operator==(const VideoFormat& x) const {
-    return x.format == format && x.width == width && x.height == height;
-  }
-  /*
-  When width = 0, height = 0, minDimension = 0, and maxDimension = 0,
-    keep the original frame resolution
-  When width = 0, height = 0, minDimension != 0, and maxDimension = 0,
-    keep the aspect ratio and resize the frame so that shorter edge size is
-  minDimension
-  When width = 0, height = 0, minDimension = 0, and maxDimension != 0,
-    keep the aspect ratio and resize the frame so that longer edge size is
-  maxDimension
-  When width = 0, height = 0, minDimension != 0, and maxDimension != 0,
-    resize the frame so that shorter edge size is minDimension, and
-    longer edge size is maxDimension. The aspect ratio may not be preserved
-  When width = 0, height != 0, minDimension = 0, and maxDimension = 0,
-    keep the aspect ratio and resize the frame so that frame height is $height
-  When width != 0, height = 0, minDimension = 0, and maxDimension = 0,
-    keep the aspect ratio and resize the frame so that frame width is $width
-  When width != 0, height != 0, minDimension = 0, and maxDimension = 0,
-    resize the frame so that frame width and  height are set to $width and
-  $height,
-    respectively
-  */
-  size_t width{0}; // width in pixels
-  size_t height{0}; // height in pixels
-  long format{-1}; // AVPixelFormat, auto AV_PIX_FMT_NONE
-  size_t minDimension{0}; // choose min dimension and rescale accordingly
-  size_t maxDimension{0}; // choose max dimension and rescale accordingly
-  size_t cropImage{0}; // request image crop
-  // -- alignment 40 bytes
-};
-
-// subtitle/cc
-struct SubtitleFormat {
-  long type{0}; // AVSubtitleType, auto SUBTITLE_NONE
-  size_t padding[4];
-  // -- alignment 40 bytes
-};
-
-union FormatUnion {
-  FormatUnion() : audio() {}
-  explicit FormatUnion(int) : video() {}
-  explicit FormatUnion(char) : subtitle() {}
-  explicit FormatUnion(double) : subtitle() {}
-  AudioFormat audio;
-  VideoFormat video;
-  SubtitleFormat subtitle;
-  // -- alignment 40 bytes
-};
-
-/*
-  MediaFormat data structure serves as input/output parameter.
-  Caller assigns values for input formats
-  or leave default values for auto detection
-  For output formats all fields will be set to the specific values
-*/
-struct MediaFormat {
-  // for using map/set data structures
-  bool operator<(const MediaFormat& x) const {
-    return type < x.type;
-  }
-  bool operator==(const MediaFormat& x) const {
-    if (type != x.type) {
-      return false;
-    }
-    switch (type) {
-      case TYPE_AUDIO:
-        return format.audio == x.format.audio;
-      case TYPE_VIDEO:
-        return format.video == x.format.video;
-      case TYPE_SUBTITLE:
-      case TYPE_CC:
-        return true;
-      default:
-        return false;
-    }
-  }
-
-  explicit MediaFormat(long s = -1) : type(TYPE_AUDIO), stream(s), format() {}
-  explicit MediaFormat(int x, long s = -1)
-      : type(TYPE_VIDEO), stream(s), format(x) {}
-  explicit MediaFormat(char x, long s = -1)
-      : type(TYPE_SUBTITLE), stream(s), format(x) {}
-  explicit MediaFormat(double x, long s = -1)
-      : type(TYPE_CC), stream(s), format(x) {}
-
-  static MediaFormat makeMediaFormat(AudioFormat format, long stream) {
-    MediaFormat result(stream);
-    result.format.audio = format;
-    return result;
-  }
-
-  static MediaFormat makeMediaFormat(VideoFormat format, long stream) {
-    MediaFormat result(0, stream);
-    result.format.video = format;
-    return result;
-  }
-
-  static MediaFormat makeMediaFormat(SubtitleFormat format, long stream) {
-    MediaFormat result('0', stream);
-    result.format.subtitle = format;
-    return result;
-  }
-
-  // format type
-  MediaType type;
-  // stream index:
-  // set -1 for one stream auto detection, -2 for all streams auto detection,
-  // >= 0, specified stream, if caller knows the stream index (unlikely)
-  long stream;
-  // union keeps one of the possible formats, defined by MediaType
-  FormatUnion format;
-};
-
-struct DecoderParameters {
-  // local file, remote file, http url, rtmp stream uri, etc. anything that
-  // ffmpeg can recognize
-  std::string uri{std::string()};
-  // timeout on getting bytes for decoding
-  size_t timeoutMs{1000};
-  // logging level, default AV_LOG_PANIC
-  long logLevel{0};
-  // when decoder would give up, 0 means never
-  size_t maxPackageErrors{0};
-  // max allowed consecutive times no bytes are processed. 0 means for infinite.
-  size_t maxProcessNoBytes{0};
-  // start offset (us)
-  long startOffset{0};
-  // end offset (us)
-  long endOffset{-1};
-  // logging id
-  int64_t loggingUuid{0};
-  // internal max seekable buffer size
-  size_t maxSeekableBytes{0};
-  // adjust header pts to the epoch time
-  bool convertPtsToWallTime{false};
-  // indicate if input stream is an encoded image
-  bool isImage{false};
-  // listen and wait for new rtmp stream
-  bool listen{false};
-  // don't copy frame body, only header
-  bool headerOnly{false};
-  // enable fast seek (seek only to keyframes)
-  bool fastSeek{false};
-  // interrupt init method on timeout
-  bool preventStaleness{true};
-  // seek tolerated accuracy (us)
-  double seekAccuracy{1000000.0};
-  // Allow multithreaded decoding for numThreads > 1;
-  // 0 numThreads=0 sets up sensible defaults
-  int numThreads{1};
-  // what media types should be processed, default none
-  std::set<MediaFormat> formats;
-
-  // can be used for asynchronous decoders
-  size_t cacheSize{8192}; // mow many bytes to cache before stop reading bytes
-  size_t cacheTimeoutMs{1000}; // timeout on bytes writing
-  bool enforceCacheSize{false}; // drop output frames if cache is full
-  bool mergeAudioMessages{false}; // combine collocated audio messages together
-
-  std::string tlsCertFile;
-  std::string tlsKeyFile;
-
-  // Skip packets that fail with EPERM errors and continue decoding.
-  bool skipOperationNotPermittedPackets{false};
-
-  // probing size in bytes, i.e. the size of the data to analyze to get stream
-  // information. A higher value will enable detecting more information in case
-  // it is dispersed into the stream, but will increase latency. Must be an
-  // integer not lesser than 32. It is 5000000 by default.
-  int64_t probeSize{5000000};
-
-  // Expected duration of the video to be decoded, mainly used with uniform
-  // sampling
-  float expectedDuration{0.0f};
-
-  // Sample N key-frames from the video roughly uniformly across the timeline
-  int uniformSampling{0};
-
-  // with 0, ffmpeg allocates buffers of size 32768 bytes for encoded frames.
-  // Override this with bigger buffer size if needed.
-  int64_t maxEncodedBufferSize{0};
-};
-
-struct DecoderHeader {
-  // message id, from 0 till ...
-  size_t seqno{0};
-  // decoded timestamp in microseconds from either beginning of the stream or
-  // from epoch time, see DecoderParameters::convertPtsToWallTime
-  long pts{0};
-  // decoded key frame
-  size_t keyFrame{0};
-  // frames per second, valid only for video streams
-  double fps{0};
-  // format specifies what kind frame is in a payload
-  MediaFormat format;
-};
-
-// Abstract interface ByteStorage class
-class ByteStorage {
- public:
-  virtual ~ByteStorage() = default;
-  // makes sure that buffer has at least n bytes available for writing, if not
-  // storage must reallocate memory.
-  virtual void ensure(size_t n) = 0;
-  // caller must not to write more than available bytes
-  virtual uint8_t* writableTail() = 0;
-  // caller confirms that n bytes were written to the writable tail
-  virtual void append(size_t n) = 0;
-  // caller confirms that n bytes were read from the read buffer
-  virtual void trim(size_t n) = 0;
-  // gives an access to the beginning of the read buffer
-  virtual const uint8_t* data() const = 0;
-  // returns the stored size in bytes
-  virtual size_t length() const = 0;
-  // returns available capacity for writable tail
-  virtual size_t tail() const = 0;
-  // clears content, keeps capacity
-  virtual void clear() = 0;
-};
-
-struct DecoderOutputMessage {
-  DecoderHeader header;
-  std::unique_ptr<ByteStorage> payload;
-};
-
-/*
- * External provider of the ecnoded bytes, specific implementation is left for
- * different use cases, like file, memory, external network end-points, etc.
- * Normally input/output parameter @out set to valid, not null buffer pointer,
- * which indicates "read" call, however there are "seek" modes as well.
-
- * @out != nullptr => read from the current offset, @whence got ignored,
- * @size bytes to read => return number bytes got read, 0 if no more bytes
- * available, < 0 on error.
-
- * @out == nullptr, @timeoutMs == 0 => does provider support "seek"
- * capability in a first place? @size & @whence got ignored, return 0 on
- * success, < 0 if "seek" mode is not supported.
-
- * @out == nullptr, @timeoutMs != 0 => normal seek call
- * offset == @size, i.e. @whence = [SEEK_SET, SEEK_CUR, SEEK_END, AVSEEK_SIZE)
- * return < 0 on error, position if @whence = [SEEK_SET, SEEK_CUR, SEEK_END],
- * length of buffer if @whence = [AVSEEK_SIZE].
- */
-using DecoderInCallback =
-    std::function<int(uint8_t* out, int size, int whence, uint64_t timeoutMs)>;
-
-using DecoderOutCallback = std::function<void(DecoderOutputMessage&&)>;
-
-struct DecoderMetadata {
-  // time base numerator
-  long num{0};
-  // time base denominator
-  long den{1};
-  // duration of the stream, in miscroseconds, if available
-  long duration{-1};
-  // frames per second, valid only for video streams
-  double fps{0};
-  // format specifies what kind frame is in a payload
-  MediaFormat format;
-};
-/**
- * Abstract class for decoding media bytes
- * It has two different modes. Internal media bytes retrieval for given uri and
- * external media bytes provider in case of memory streams
- */
-class MediaDecoder {
- public:
-  virtual ~MediaDecoder() = default;
-
-  /**
-   * Initializes media decoder with parameters,
-   * calls callback when media bytes are available.
-   * Media bytes get fetched internally from provided URI
-   * or invokes provided input callback to get media bytes.
-   * Input callback must be empty for the internal media provider
-   * Caller can provide non-null pointer for the input container
-   * if headers to obtain the streams metadata (optional)
-   */
-  virtual bool init(
-      const DecoderParameters& params,
-      DecoderInCallback&& in,
-      std::vector<DecoderMetadata>* metadata) = 0;
-
-  /**
-   * Polls available decoded one frame from decoder
-   * Returns error code, 0 - for success
-   */
-  virtual int decode(DecoderOutputMessage* out, uint64_t timeoutMs) = 0;
-
-  /**
-   * Polls available decoded bytes from decoder, till EOF or error
-   */
-  virtual int decode_all(const DecoderOutCallback& callback) = 0;
-
-  /**
-   * Stops calling callback, releases resources
-   */
-  virtual void shutdown() = 0;
-
-  /**
-   * Interrupts whatever decoder is doing at any time
-   */
-  virtual void interrupt() = 0;
-
-  /**
-   * Factory to create ByteStorage class instances, particular implementation is
-   * left to the derived class. Caller provides the initially allocated size
-   */
-  virtual std::unique_ptr<ByteStorage> createByteStorage(size_t n) = 0;
-};
-
-struct SamplerParameters {
-  MediaType type{TYPE_AUDIO};
-  FormatUnion in;
-  FormatUnion out;
-  int64_t loggingUuid{0};
-};
-
-/**
- * Abstract class for sampling media bytes
- */
-class MediaSampler {
- public:
-  virtual ~MediaSampler() = default;
-
-  /**
-   * Initializes media sampler with parameters
-   */
-  virtual bool init(const SamplerParameters& params) = 0;
-
-  /**
-   * Samples media bytes
-   * Returns error code < 0, or >=0 - for success, indicating number of bytes
-   * processed.
-   * set @in to null for flushing data
-   */
-  virtual int sample(const ByteStorage* in, ByteStorage* out) = 0;
-
-  /**
-   * Releases resources
-   */
-  virtual void shutdown() = 0;
-
-  /*
-   * Returns media type
-   */
-  MediaType getMediaType() const {
-    return params_.type;
-  }
-  /*
-   * Returns formats
-   */
-  FormatUnion getInputFormat() const {
-    return params_.in;
-  }
-  FormatUnion getOutFormat() const {
-    return params_.out;
-  }
-
- protected:
-  SamplerParameters params_;
-};
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/memory_buffer.cpp b/torchvision/csrc/io/decoder/memory_buffer.cpp
deleted file mode 100644
index 4e420c3b3cd..00000000000
--- a/torchvision/csrc/io/decoder/memory_buffer.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-#include "memory_buffer.h"
-#include <c10/util/Logging.h>
-
-namespace ffmpeg {
-
-MemoryBuffer::MemoryBuffer(const uint8_t* buffer, size_t size)
-    : buffer_(buffer), len_(size) {}
-
-int MemoryBuffer::read(uint8_t* buf, int size) {
-  if (pos_ < len_) {
-    auto available = std::min(int(len_ - pos_), size);
-    memcpy(buf, buffer_ + pos_, available);
-    pos_ += available;
-    return available;
-  }
-
-  return 0;
-}
-
-int64_t MemoryBuffer::seek(int64_t offset, int whence) {
-  if (whence & AVSEEK_SIZE) {
-    return len_;
-  }
-
-  // remove force flag
-  whence &= ~AVSEEK_FORCE;
-
-  switch (whence) {
-    case SEEK_SET:
-      if (offset >= 0 && offset <= len_) {
-        pos_ = offset;
-      }
-      break;
-    case SEEK_END:
-      if (len_ + offset >= 0 && len_ + offset <= len_) {
-        pos_ = len_ + offset;
-      }
-      break;
-    case SEEK_CUR:
-      if (pos_ + offset > 0 && pos_ + offset <= len_) {
-        pos_ += offset;
-      }
-      break;
-    default:
-      LOG(ERROR) << "Unknown whence flag gets provided: " << whence;
-  }
-  return pos_;
-}
-
-/* static */
-DecoderInCallback MemoryBuffer::getCallback(
-    const uint8_t* buffer,
-    size_t size) {
-  MemoryBuffer object(buffer, size);
-  return
-      [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
-      -> int {
-        if (out) { // see defs.h file
-          // read mode
-          return object.read(out, size);
-        }
-        // seek mode
-        if (!timeoutMs) {
-          // seek capability, yes - supported
-          return 0;
-        }
-        return object.seek(size, whence);
-      };
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/memory_buffer.h b/torchvision/csrc/io/decoder/memory_buffer.h
deleted file mode 100644
index 909626d3cae..00000000000
--- a/torchvision/csrc/io/decoder/memory_buffer.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses external memory buffer and implements a seekable interface.
- */
-class MemoryBuffer {
- public:
-  explicit MemoryBuffer(const uint8_t* buffer, size_t size);
-  int64_t seek(int64_t offset, int whence);
-  int read(uint8_t* buf, int size);
-
-  // static constructor for decoder callback.
-  static DecoderInCallback getCallback(const uint8_t* buffer, size_t size);
-
- private:
-  const uint8_t* buffer_; // set at construction time
-  long pos_{0}; // current position
-  long len_{0}; // bytes in buffer
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/seekable_buffer.cpp b/torchvision/csrc/io/decoder/seekable_buffer.cpp
deleted file mode 100644
index 41e3e689c7b..00000000000
--- a/torchvision/csrc/io/decoder/seekable_buffer.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-#include "seekable_buffer.h"
-#include <c10/util/Logging.h>
-#include <chrono>
-#include "memory_buffer.h"
-
-namespace ffmpeg {
-
-int SeekableBuffer::init(
-    DecoderInCallback&& in,
-    uint64_t timeoutMs,
-    size_t maxSeekableBytes,
-    ImageType* type) {
-  shutdown();
-  isSeekable_ = in(nullptr, 0, 0, 0) == 0;
-  if (isSeekable_) { // seekable
-    if (type) {
-      if (!readBytes(in, 8, timeoutMs)) {
-        return -1;
-      }
-      setImageType(type);
-      end_ = 0;
-      eof_ = false;
-      std::vector<uint8_t>().swap(buffer_);
-      // reset callback
-      if (in(nullptr, 0, SEEK_SET, timeoutMs)) {
-        return -1;
-      }
-    }
-    inCallback_ = std::forward<DecoderInCallback>(in);
-    return 1;
-  }
-
-  if (!readBytes(in, maxSeekableBytes + (type ? 8 : 0), timeoutMs)) {
-    return -1;
-  }
-
-  if (type) {
-    setImageType(type);
-  }
-
-  if (eof_) {
-    end_ = 0;
-    eof_ = false;
-    // reuse MemoryBuffer functionality
-    inCallback_ = MemoryBuffer::getCallback(buffer_.data(), buffer_.size());
-    isSeekable_ = true;
-    return 1;
-  }
-  inCallback_ = std::forward<DecoderInCallback>(in);
-  return 0;
-}
-
-bool SeekableBuffer::readBytes(
-    DecoderInCallback& in,
-    size_t maxBytes,
-    uint64_t timeoutMs) {
-  // Resize to th minimum 4K page or less
-  buffer_.resize(std::min(maxBytes, size_t(4 * 1024UL)));
-  end_ = 0;
-  eof_ = false;
-
-  auto end =
-      std::chrono::steady_clock::now() + std::chrono::milliseconds(timeoutMs);
-  auto watcher = [end]() -> bool {
-    return std::chrono::steady_clock::now() <= end;
-  };
-
-  bool hasTime = true;
-  while (!eof_ && end_ < maxBytes && (hasTime = watcher())) {
-    // lets read all bytes into available buffer
-    auto res = in(buffer_.data() + end_, buffer_.size() - end_, 0, timeoutMs);
-    if (res > 0) {
-      end_ += res;
-      if (end_ == buffer_.size()) {
-        buffer_.resize(std::min(size_t(end_ * 4UL), maxBytes));
-      }
-    } else if (res == 0) {
-      eof_ = true;
-    } else {
-      // error
-      return false;
-    }
-  }
-
-  buffer_.resize(end_);
-
-  return hasTime;
-}
-
-void SeekableBuffer::setImageType(ImageType* type) {
-  if (buffer_.size() > 2 && buffer_[0] == 0xFF && buffer_[1] == 0xD8 &&
-      buffer_[2] == 0xFF) {
-    *type = ImageType::JPEG;
-  } else if (
-      buffer_.size() > 3 && buffer_[1] == 'P' && buffer_[2] == 'N' &&
-      buffer_[3] == 'G') {
-    *type = ImageType::PNG;
-  } else if (
-      buffer_.size() > 1 &&
-      ((buffer_[0] == 0x49 && buffer_[1] == 0x49) ||
-       (buffer_[0] == 0x4D && buffer_[1] == 0x4D))) {
-    *type = ImageType::TIFF;
-  } else {
-    *type = ImageType::UNKNOWN;
-  }
-}
-
-int SeekableBuffer::read(uint8_t* buf, int size, uint64_t timeoutMs) {
-  if (isSeekable_) {
-    return inCallback_(buf, size, 0, timeoutMs);
-  }
-  if (pos_ < end_) {
-    // read cached bytes for non-seekable callback
-    auto available = std::min(int(end_ - pos_), size);
-    memcpy(buf, buffer_.data() + pos_, available);
-    pos_ += available;
-    return available;
-  } else if (!eof_) {
-    // normal sequential read (see defs.h file), i.e. @buf != null
-    auto res = inCallback_(buf, size, 0, timeoutMs); // read through
-    eof_ = res == 0;
-    return res;
-  } else {
-    return 0;
-  }
-}
-
-int64_t SeekableBuffer::seek(int64_t offset, int whence, uint64_t timeoutMs) {
-  return inCallback_(nullptr, offset, whence, timeoutMs);
-}
-
-void SeekableBuffer::shutdown() {
-  pos_ = end_ = 0;
-  eof_ = false;
-  std::vector<uint8_t>().swap(buffer_);
-  inCallback_ = nullptr;
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/seekable_buffer.h b/torchvision/csrc/io/decoder/seekable_buffer.h
deleted file mode 100644
index 9d5729f5306..00000000000
--- a/torchvision/csrc/io/decoder/seekable_buffer.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses internal buffer to store initial size bytes as a seekable cache
- * from Media provider and let ffmpeg to seek and read bytes from cache
- * and beyond - reading bytes directly from Media provider
- */
-enum class ImageType {
-  UNKNOWN = 0,
-  JPEG = 1,
-  PNG = 2,
-  TIFF = 3,
-};
-
-class SeekableBuffer {
- public:
-  // @type is optional, not nullptr only is image detection required
-  // \returns 1 is buffer seekable, 0 - if not seekable, < 0 on error
-  int init(
-      DecoderInCallback&& in,
-      uint64_t timeoutMs,
-      size_t maxSeekableBytes,
-      ImageType* type);
-  int read(uint8_t* buf, int size, uint64_t timeoutMs);
-  int64_t seek(int64_t offset, int whence, uint64_t timeoutMs);
-  void shutdown();
-
- private:
-  bool readBytes(DecoderInCallback& in, size_t maxBytes, uint64_t timeoutMs);
-  void setImageType(ImageType* type);
-
- private:
-  DecoderInCallback inCallback_;
-  std::vector<uint8_t> buffer_; // resized at init time
-  long pos_{0}; // current position (SEEK_CUR iff pos_ < end_)
-  long end_{0}; // current buffer size
-  bool eof_{0}; // indicates the EOF
-  bool isSeekable_{false}; // is callback seekable
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/stream.cpp b/torchvision/csrc/io/decoder/stream.cpp
deleted file mode 100644
index 7969741e72c..00000000000
--- a/torchvision/csrc/io/decoder/stream.cpp
+++ /dev/null
@@ -1,288 +0,0 @@
-#include "stream.h"
-#include <c10/util/Logging.h>
-#include <string.h>
-#include "util.h"
-
-namespace ffmpeg {
-const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
-
-Stream::Stream(
-    AVFormatContext* inputCtx,
-    MediaFormat format,
-    bool convertPtsToWallTime,
-    int64_t loggingUuid)
-    : inputCtx_(inputCtx),
-      format_(format),
-      convertPtsToWallTime_(convertPtsToWallTime),
-      loggingUuid_(loggingUuid) {}
-
-Stream::~Stream() {
-  if (frame_) {
-    av_free(frame_);
-  }
-  if (codecCtx_) {
-    avcodec_free_context(&codecCtx_);
-  }
-}
-
-// look up the proper CODEC querying the function
-AVCodec* Stream::findCodec(AVCodecParameters* params) {
-  return (AVCodec*)avcodec_find_decoder(params->codec_id);
-}
-
-// Allocate memory for the AVCodecContext, which will hold the context for
-// decode/encode process. Then fill this codec context with CODEC parameters
-// defined in stream parameters. Open the codec, and allocate the global frame
-// defined in the header file
-int Stream::openCodec(std::vector<DecoderMetadata>* metadata, int num_threads) {
-  AVStream* steam = inputCtx_->streams[format_.stream];
-
-  AVCodec* codec = findCodec(steam->codecpar);
-  if (!codec) {
-    LOG(ERROR) << "LoggingUuid #" << loggingUuid_
-               << ", avcodec_find_decoder failed for codec_id: "
-               << int(steam->codecpar->codec_id);
-    return AVERROR(EINVAL);
-  }
-
-  if (!(codecCtx_ = avcodec_alloc_context3(codec))) {
-    LOG(ERROR) << "LoggingUuid #" << loggingUuid_
-               << ", avcodec_alloc_context3 failed";
-    return AVERROR(ENOMEM);
-  }
-  // multithreading heuristics
-  // if user defined,
-  if (num_threads > max_threads) {
-    num_threads = max_threads;
-  }
-
-  if (num_threads > 0) {
-    // if user defined, respect that
-    // note that default thread_type will be used
-    codecCtx_->thread_count = num_threads;
-  } else {
-    // otherwise set sensible defaults
-    codecCtx_->thread_count = 8;
-    codecCtx_->thread_type = FF_THREAD_SLICE;
-  }
-
-  int ret;
-  // Copy codec parameters from input stream to output codec context
-  if ((ret = avcodec_parameters_to_context(codecCtx_, steam->codecpar)) < 0) {
-    LOG(ERROR) << "LoggingUuid #" << loggingUuid_
-               << ", avcodec_parameters_to_context failed";
-    return ret;
-  }
-
-  // after avcodec_open2, value of codecCtx_->time_base is NOT meaningful
-  if ((ret = avcodec_open2(codecCtx_, codec, nullptr)) < 0) {
-    LOG(ERROR) << "LoggingUuid #" << loggingUuid_
-               << ", avcodec_open2 failed: " << Util::generateErrorDesc(ret);
-    avcodec_free_context(&codecCtx_);
-    codecCtx_ = nullptr;
-    return ret;
-  }
-
-  frame_ = av_frame_alloc();
-
-  switch (format_.type) {
-    case TYPE_VIDEO:
-      fps_ = av_q2d(av_guess_frame_rate(inputCtx_, steam, nullptr));
-      break;
-    case TYPE_AUDIO:
-      fps_ = codecCtx_->sample_rate;
-      break;
-    default:
-      fps_ = 30.0;
-  }
-
-  if ((ret = initFormat())) {
-    LOG(ERROR) << "initFormat failed, type: " << format_.type;
-  }
-
-  if (metadata) {
-    DecoderMetadata header;
-    header.format = format_;
-    header.fps = fps_;
-    header.num = steam->time_base.num;
-    header.den = steam->time_base.den;
-    header.duration =
-        av_rescale_q(steam->duration, steam->time_base, timeBaseQ);
-    metadata->push_back(header);
-  }
-
-  return ret;
-}
-
-// send the raw data packet (compressed frame) to the decoder, through the codec
-// context and receive the raw data frame (uncompressed frame) from the
-// decoder, through the same codec context
-int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
-  int consumed = 0;
-  int result = avcodec_send_packet(codecCtx_, packet);
-  if (result == AVERROR(EAGAIN)) {
-    *gotFrame = false; // no bytes get consumed, fetch frame
-  } else if (result == AVERROR_EOF) {
-    *gotFrame = false; // more than one flush packet
-    if (packet) {
-      // got packet after flush, this is an error
-      return result;
-    }
-  } else if (result < 0) {
-    LOG(ERROR) << "avcodec_send_packet failed, err: "
-               << Util::generateErrorDesc(result);
-    return result; // error
-  } else {
-    consumed = packet ? packet->size : 0; // all bytes get consumed
-  }
-
-  result = avcodec_receive_frame(codecCtx_, frame_);
-
-  if (result >= 0) {
-    *gotFrame = true; // frame is available
-  } else if (result == AVERROR(EAGAIN)) {
-    *gotFrame = false; // no frames at this time, needs more packets
-    if (!consumed) {
-      // precaution, if no packages got consumed and no frames are available
-      return result;
-    }
-  } else if (result == AVERROR_EOF) {
-    *gotFrame = false; // the last frame has been flushed
-    // precaution, if no more frames are available assume we consume all bytes
-    consumed = 0;
-  } else { // error
-    LOG(ERROR) << "avcodec_receive_frame failed, err: "
-               << Util::generateErrorDesc(result);
-    return result;
-  }
-  return consumed;
-}
-
-// General decoding function:
-// given the packet, analyse the metadata, and write the
-// metadata and the buffer to the DecoderOutputImage.
-int Stream::decodePacket(
-    const AVPacket* packet,
-    DecoderOutputMessage* out,
-    bool headerOnly,
-    bool* hasMsg) {
-  int consumed;
-  bool gotFrame = false;
-  *hasMsg = false;
-  if ((consumed = analyzePacket(packet, &gotFrame)) >= 0 &&
-      (packet == nullptr || gotFrame)) {
-    int result;
-    if ((result = getMessage(out, !gotFrame, headerOnly)) < 0) {
-      return result; // report error
-    }
-    *hasMsg = result > 0;
-  }
-  return consumed;
-}
-
-int Stream::flush(DecoderOutputMessage* out, bool headerOnly) {
-  bool hasMsg = false;
-  int result = decodePacket(nullptr, out, headerOnly, &hasMsg);
-  if (result < 0) {
-    avcodec_flush_buffers(codecCtx_);
-    return result;
-  }
-  if (!hasMsg) {
-    avcodec_flush_buffers(codecCtx_);
-    return 0;
-  }
-  return 1;
-}
-
-// Sets the header and payload via stream::setHeader and copyFrameBytes
-// functions that are defined in type stream subclass (VideoStream, AudioStream,
-// ...)
-int Stream::getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly) {
-  if (flush) {
-    // only flush of audio frames makes sense
-    if (format_.type == TYPE_AUDIO) {
-      int processed = 0;
-      size_t total = 0;
-      // grab all audio bytes by chunks
-      do {
-        if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) {
-          return processed;
-        }
-        total += processed;
-      } while (processed);
-
-      if (total) {
-        // set header if message bytes are available
-        setHeader(&out->header, flush);
-        return 1;
-      }
-    }
-    return 0;
-  } else {
-    if (format_.type == TYPE_AUDIO) {
-      int processed = 0;
-      if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) {
-        return processed;
-      }
-      if (processed) {
-        // set header if message bytes are available
-        setHeader(&out->header, flush);
-        return 1;
-      }
-      return 0;
-    } else {
-      // set header
-      setHeader(&out->header, flush);
-
-      if (headerOnly) {
-        // Only header is requisted
-        return 1;
-      }
-
-      return copyFrameBytes(out->payload.get(), flush);
-    }
-  }
-}
-
-void Stream::setHeader(DecoderHeader* header, bool flush) {
-  header->seqno = numGenerator_++;
-
-  setFramePts(header, flush);
-
-  if (convertPtsToWallTime_) {
-    keeper_.adjust(header->pts);
-  }
-
-  header->format = format_;
-  header->keyFrame = 0;
-  header->fps = std::numeric_limits<double>::quiet_NaN();
-}
-
-void Stream::setFramePts(DecoderHeader* header, bool flush) {
-  if (flush) {
-    header->pts = nextPts_; // already in us
-  } else {
-    header->pts = frame_->best_effort_timestamp;
-    if (header->pts == AV_NOPTS_VALUE) {
-      header->pts = nextPts_;
-    } else {
-      header->pts = av_rescale_q(
-          header->pts,
-          inputCtx_->streams[format_.stream]->time_base,
-          timeBaseQ);
-    }
-
-    switch (format_.type) {
-      case TYPE_AUDIO:
-        nextPts_ = header->pts + frame_->nb_samples * AV_TIME_BASE / fps_;
-        break;
-      case TYPE_VIDEO:
-        nextPts_ = header->pts + AV_TIME_BASE / fps_;
-        break;
-      default:
-        nextPts_ = header->pts;
-    }
-  }
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/stream.h b/torchvision/csrc/io/decoder/stream.h
deleted file mode 100644
index 6250dd9ecd2..00000000000
--- a/torchvision/csrc/io/decoder/stream.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#pragma once
-
-#include <atomic>
-#include "defs.h"
-#include "time_keeper.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode one media stream (audio or video).
- */
-
-class Stream {
- public:
-  Stream(
-      AVFormatContext* inputCtx,
-      MediaFormat format,
-      bool convertPtsToWallTime,
-      int64_t loggingUuid);
-  virtual ~Stream();
-
-  // returns 0 - on success or negative error
-  // num_threads sets up the codec context for multithreading if needed
-  // default is set to single thread in order to not break BC
-  int openCodec(std::vector<DecoderMetadata>* metadata, int num_threads = 1);
-  // returns 1 - if packet got consumed, 0 - if it's not, and < 0 on error
-  int decodePacket(
-      const AVPacket* packet,
-      DecoderOutputMessage* out,
-      bool headerOnly,
-      bool* hasMsg);
-  // returns stream index
-  int getIndex() const {
-    return format_.stream;
-  }
-  // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error
-  int flush(DecoderOutputMessage* out, bool headerOnly);
-  // return media format
-  MediaFormat getMediaFormat() const {
-    return format_;
-  }
-
- protected:
-  virtual int initFormat() = 0;
-  // returns number processed bytes from packet, or negative error
-  virtual int analyzePacket(const AVPacket* packet, bool* gotFrame);
-  // returns number processed bytes from packet, or negative error
-  virtual int copyFrameBytes(ByteStorage* out, bool flush) = 0;
-  // sets output format
-  virtual void setHeader(DecoderHeader* header, bool flush);
-  // set frame pts
-  virtual void setFramePts(DecoderHeader* header, bool flush);
-  // finds codec
-  virtual AVCodec* findCodec(AVCodecParameters* params);
-
- private:
-  // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error
-  int getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly);
-
- protected:
-  AVFormatContext* const inputCtx_;
-  MediaFormat format_;
-  const bool convertPtsToWallTime_;
-  int64_t loggingUuid_;
-
-  AVCodecContext* codecCtx_{nullptr};
-  AVFrame* frame_{nullptr};
-
-  std::atomic<size_t> numGenerator_{0};
-  TimeKeeper keeper_;
-  // estimated next frame pts for flushing the last frame
-  int64_t nextPts_{0};
-  double fps_{30.};
-  // this is a dumb conservative limit; ideally we'd use
-  // int max_threads = at::get_num_threads(); but this would cause
-  // fb sync to fail as it would add dependency to ATen to the decoder API
-  const int max_threads = 12;
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/subtitle_sampler.cpp b/torchvision/csrc/io/decoder/subtitle_sampler.cpp
deleted file mode 100644
index d0df24d3e35..00000000000
--- a/torchvision/csrc/io/decoder/subtitle_sampler.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "subtitle_sampler.h"
-#include <c10/util/Logging.h>
-#include "util.h"
-
-namespace ffmpeg {
-
-SubtitleSampler::~SubtitleSampler() {
-  cleanUp();
-}
-
-void SubtitleSampler::shutdown() {
-  cleanUp();
-}
-
-bool SubtitleSampler::init(const SamplerParameters& params) {
-  cleanUp();
-  // set formats
-  params_ = params;
-  return true;
-}
-
-int SubtitleSampler::sample(AVSubtitle* sub, ByteStorage* out) {
-  if (!sub || !out) {
-    return 0; // flush
-  }
-
-  out->ensure(Util::size(*sub));
-
-  return Util::serialize(*sub, out);
-}
-
-int SubtitleSampler::sample(const ByteStorage* in, ByteStorage* out) {
-  if (in && out) {
-    // Get a writable copy
-    if (size_t len = in->length()) {
-      out->ensure(len);
-      memcpy(out->writableTail(), in->data(), len);
-    }
-    return out->length();
-  }
-  return 0;
-}
-
-void SubtitleSampler::cleanUp() {}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/subtitle_sampler.h b/torchvision/csrc/io/decoder/subtitle_sampler.h
deleted file mode 100644
index 4aee811ed56..00000000000
--- a/torchvision/csrc/io/decoder/subtitle_sampler.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class transcode audio frames from one format into another
- */
-
-class SubtitleSampler : public MediaSampler {
- public:
-  SubtitleSampler() = default;
-  ~SubtitleSampler() override;
-
-  bool init(const SamplerParameters& params) override;
-  int sample(const ByteStorage* in, ByteStorage* out) override;
-  void shutdown() override;
-
-  // returns number processed/scaling bytes
-  int sample(AVSubtitle* sub, ByteStorage* out);
-
-  // helper serialization/deserialization methods
-  static void serialize(const AVSubtitle& sub, ByteStorage* out);
-  static bool deserialize(const ByteStorage& buf, AVSubtitle* sub);
-
- private:
-  // close resources
-  void cleanUp();
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/subtitle_stream.cpp b/torchvision/csrc/io/decoder/subtitle_stream.cpp
deleted file mode 100644
index 3416f702d7e..00000000000
--- a/torchvision/csrc/io/decoder/subtitle_stream.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "subtitle_stream.h"
-#include <c10/util/Logging.h>
-#include "util.h"
-
-namespace ffmpeg {
-const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
-
-SubtitleStream::SubtitleStream(
-    AVFormatContext* inputCtx,
-    int index,
-    bool convertPtsToWallTime,
-    const SubtitleFormat& format)
-    : Stream(
-          inputCtx,
-          MediaFormat::makeMediaFormat(format, index),
-          convertPtsToWallTime,
-          0) {
-  memset(&sub_, 0, sizeof(sub_));
-}
-
-void SubtitleStream::releaseSubtitle() {
-  if (sub_.release) {
-    avsubtitle_free(&sub_);
-    memset(&sub_, 0, sizeof(sub_));
-  }
-}
-
-SubtitleStream::~SubtitleStream() {
-  releaseSubtitle();
-  sampler_.shutdown();
-}
-
-int SubtitleStream::initFormat() {
-  if (!codecCtx_->subtitle_header) {
-    LOG(ERROR) << "No subtitle header found";
-  } else {
-    VLOG(1) << "Subtitle header found!";
-  }
-  return 0;
-}
-
-int SubtitleStream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
-  // clean-up
-  releaseSubtitle();
-
-  // FIXME: should this even be created?
-  AVPacket* avPacket;
-  avPacket = av_packet_alloc();
-  if (avPacket == nullptr) {
-    LOG(ERROR)
-        << "decoder as not able to allocate the subtitle-specific packet.";
-    // alternative to ENOMEM
-    return AVERROR_BUFFER_TOO_SMALL;
-  }
-  avPacket->data = nullptr;
-  avPacket->size = 0;
-  // check flush packet
-  auto pkt = packet ? packet : avPacket;
-
-  int gotFramePtr = 0;
-  // is these a better way than cast from const?
-  int result =
-      avcodec_decode_subtitle2(codecCtx_, &sub_, &gotFramePtr, (AVPacket*)pkt);
-
-  if (result < 0) {
-    LOG(ERROR) << "avcodec_decode_subtitle2 failed, err: "
-               << Util::generateErrorDesc(result);
-    // free the packet we've created
-    av_packet_free(&avPacket);
-    return result;
-  } else if (result == 0) {
-    result = pkt->size; // discard the rest of the package
-  }
-
-  sub_.release = gotFramePtr;
-  *gotFrame = gotFramePtr > 0;
-
-  // set proper pts in us
-  if (gotFramePtr) {
-    sub_.pts = av_rescale_q(
-        pkt->pts, inputCtx_->streams[format_.stream]->time_base, timeBaseQ);
-  }
-
-  av_packet_free(&avPacket);
-  return result;
-}
-
-int SubtitleStream::copyFrameBytes(ByteStorage* out, bool flush) {
-  return sampler_.sample(flush ? nullptr : &sub_, out);
-}
-
-void SubtitleStream::setFramePts(DecoderHeader* header, bool) {
-  header->pts = sub_.pts; // already in us
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/subtitle_stream.h b/torchvision/csrc/io/decoder/subtitle_stream.h
deleted file mode 100644
index 6c366e11f50..00000000000
--- a/torchvision/csrc/io/decoder/subtitle_stream.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#pragma once
-
-#include "stream.h"
-#include "subtitle_sampler.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode one subtitle stream.
- */
-struct AVSubtitleKeeper : AVSubtitle {
-  int64_t release{0};
-};
-
-class SubtitleStream : public Stream {
- public:
-  SubtitleStream(
-      AVFormatContext* inputCtx,
-      int index,
-      bool convertPtsToWallTime,
-      const SubtitleFormat& format);
-  ~SubtitleStream() override;
-
- protected:
-  void setFramePts(DecoderHeader* header, bool flush) override;
-
- private:
-  int initFormat() override;
-  int analyzePacket(const AVPacket* packet, bool* gotFrame) override;
-  int copyFrameBytes(ByteStorage* out, bool flush) override;
-  void releaseSubtitle();
-
- private:
-  SubtitleSampler sampler_;
-  AVSubtitleKeeper sub_;
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/sync_decoder.cpp b/torchvision/csrc/io/decoder/sync_decoder.cpp
deleted file mode 100644
index 1f03ef8eb95..00000000000
--- a/torchvision/csrc/io/decoder/sync_decoder.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-#include "sync_decoder.h"
-#include <c10/util/Logging.h>
-
-namespace ffmpeg {
-
-SyncDecoder::AVByteStorage::AVByteStorage(size_t n) {
-  ensure(n);
-}
-
-SyncDecoder::AVByteStorage::~AVByteStorage() {
-  av_free(buffer_);
-}
-
-void SyncDecoder::AVByteStorage::ensure(size_t n) {
-  if (tail() < n) {
-    capacity_ = offset_ + length_ + n;
-    buffer_ = static_cast<uint8_t*>(av_realloc(buffer_, capacity_));
-  }
-}
-
-uint8_t* SyncDecoder::AVByteStorage::writableTail() {
-  TORCH_CHECK_LE(offset_ + length_, capacity_);
-  return buffer_ + offset_ + length_;
-}
-
-void SyncDecoder::AVByteStorage::append(size_t n) {
-  TORCH_CHECK_LE(n, tail());
-  length_ += n;
-}
-
-void SyncDecoder::AVByteStorage::trim(size_t n) {
-  TORCH_CHECK_LE(n, length_);
-  offset_ += n;
-  length_ -= n;
-}
-
-const uint8_t* SyncDecoder::AVByteStorage::data() const {
-  return buffer_ + offset_;
-}
-
-size_t SyncDecoder::AVByteStorage::length() const {
-  return length_;
-}
-
-size_t SyncDecoder::AVByteStorage::tail() const {
-  TORCH_CHECK_LE(offset_ + length_, capacity_);
-  return capacity_ - offset_ - length_;
-}
-
-void SyncDecoder::AVByteStorage::clear() {
-  offset_ = 0;
-  length_ = 0;
-}
-
-std::unique_ptr<ByteStorage> SyncDecoder::createByteStorage(size_t n) {
-  return std::make_unique<AVByteStorage>(n);
-}
-
-void SyncDecoder::onInit() {
-  eof_ = false;
-  queue_.clear();
-}
-
-int SyncDecoder::decode(DecoderOutputMessage* out, uint64_t timeoutMs) {
-  if (eof_ && queue_.empty()) {
-    return ENODATA;
-  }
-
-  if (queue_.empty()) {
-    int result = getFrame(timeoutMs);
-    // assign EOF
-    eof_ = result == ENODATA;
-    // check unrecoverable error, any error but ENODATA
-    if (result && result != ENODATA) {
-      return result;
-    }
-
-    // still empty
-    if (queue_.empty()) {
-      if (eof_) {
-        return ENODATA;
-      } else {
-        LOG(INFO) << "Queue is empty";
-        return ETIMEDOUT;
-      }
-    }
-  }
-
-  *out = std::move(queue_.front());
-  queue_.pop_front();
-  return 0;
-}
-
-void SyncDecoder::push(DecoderOutputMessage&& buffer) {
-  queue_.push_back(std::move(buffer));
-}
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/sync_decoder.h b/torchvision/csrc/io/decoder/sync_decoder.h
deleted file mode 100644
index b7cf7b625ac..00000000000
--- a/torchvision/csrc/io/decoder/sync_decoder.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#pragma once
-
-#include <list>
-#include "decoder.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode media streams.
- * Media bytes can be explicitly provided through read-callback
- * or fetched internally by FFMPEG library
- */
-class SyncDecoder : public Decoder {
- public:
-  // Allocation of memory must be done with a proper alignment.
-  class AVByteStorage : public ByteStorage {
-   public:
-    explicit AVByteStorage(size_t n);
-    ~AVByteStorage() override;
-    void ensure(size_t n) override;
-    uint8_t* writableTail() override;
-    void append(size_t n) override;
-    void trim(size_t n) override;
-    const uint8_t* data() const override;
-    size_t length() const override;
-    size_t tail() const override;
-    void clear() override;
-
-   private:
-    size_t offset_{0};
-    size_t length_{0};
-    size_t capacity_{0};
-    uint8_t* buffer_{nullptr};
-  };
-
- public:
-  int decode(DecoderOutputMessage* out, uint64_t timeoutMs) override;
-
- private:
-  void push(DecoderOutputMessage&& buffer) override;
-  void onInit() override;
-  std::unique_ptr<ByteStorage> createByteStorage(size_t n) override;
-
- private:
-  std::list<DecoderOutputMessage> queue_;
-  bool eof_{false};
-};
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/sync_decoder_test.cpp b/torchvision/csrc/io/decoder/sync_decoder_test.cpp
deleted file mode 100644
index 085966ce687..00000000000
--- a/torchvision/csrc/io/decoder/sync_decoder_test.cpp
+++ /dev/null
@@ -1,416 +0,0 @@
-#include <c10/util/Logging.h>
-#include <dirent.h>
-#include <gtest/gtest.h>
-#include "memory_buffer.h"
-#include "sync_decoder.h"
-#include "util.h"
-
-using namespace ffmpeg;
-
-namespace {
-struct VideoFileStats {
-  std::string name;
-  size_t durationPts{0};
-  int num{0};
-  int den{0};
-  int fps{0};
-};
-
-void gotAllTestFiles(
-    const std::string& folder,
-    std::vector<VideoFileStats>* stats) {
-  DIR* d = opendir(folder.c_str());
-  CHECK(d);
-  struct dirent* dir;
-  while ((dir = readdir(d))) {
-    if (dir->d_type != DT_DIR && 0 != strcmp(dir->d_name, "README")) {
-      VideoFileStats item;
-      item.name = folder + '/' + dir->d_name;
-      LOG(INFO) << "Found video file: " << item.name;
-      stats->push_back(std::move(item));
-    }
-  }
-  closedir(d);
-}
-
-void gotFilesStats(std::vector<VideoFileStats>& stats) {
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.seekAccuracy = 100000;
-  params.formats = {MediaFormat(0)};
-  params.headerOnly = true;
-  params.preventStaleness = false;
-  size_t avgProvUs = 0;
-  const size_t rounds = 100;
-  for (auto& item : stats) {
-    LOG(INFO) << "Decoding video file in memory: " << item.name;
-    FILE* f = fopen(item.name.c_str(), "rb");
-    CHECK(f != nullptr);
-    fseek(f, 0, SEEK_END);
-    std::vector<uint8_t> buffer(ftell(f));
-    rewind(f);
-    size_t s = fread(buffer.data(), 1, buffer.size(), f);
-    TORCH_CHECK_EQ(buffer.size(), s);
-    fclose(f);
-
-    for (size_t i = 0; i < rounds; ++i) {
-      SyncDecoder decoder;
-      std::vector<DecoderMetadata> metadata;
-      const auto now = std::chrono::steady_clock::now();
-      CHECK(decoder.init(
-          params,
-          MemoryBuffer::getCallback(buffer.data(), buffer.size()),
-          &metadata));
-      const auto then = std::chrono::steady_clock::now();
-      decoder.shutdown();
-      avgProvUs +=
-          std::chrono::duration_cast<std::chrono::microseconds>(then - now)
-              .count();
-      TORCH_CHECK_EQ(metadata.size(), 1);
-      item.num = metadata[0].num;
-      item.den = metadata[0].den;
-      item.fps = metadata[0].fps;
-      item.durationPts =
-          av_rescale_q(metadata[0].duration, AV_TIME_BASE_Q, {1, item.fps});
-    }
-  }
-  LOG(INFO) << "Probing (us) " << avgProvUs / stats.size() / rounds;
-}
-
-size_t measurePerformanceUs(
-    const std::vector<VideoFileStats>& stats,
-    size_t rounds,
-    size_t num,
-    size_t stride) {
-  size_t avgClipDecodingUs = 0;
-  std::srand(time(nullptr));
-  for (const auto& item : stats) {
-    FILE* f = fopen(item.name.c_str(), "rb");
-    CHECK(f != nullptr);
-    fseek(f, 0, SEEK_END);
-    std::vector<uint8_t> buffer(ftell(f));
-    rewind(f);
-    size_t s = fread(buffer.data(), 1, buffer.size(), f);
-    TORCH_CHECK_EQ(buffer.size(), s);
-    fclose(f);
-
-    for (size_t i = 0; i < rounds; ++i) {
-      // randomy select clip
-      size_t rOffset = std::rand();
-      size_t fOffset = rOffset % item.durationPts;
-      size_t clipFrames = num + (num - 1) * stride;
-      if (fOffset + clipFrames > item.durationPts) {
-        fOffset = item.durationPts - clipFrames;
-      }
-
-      DecoderParameters params;
-      params.timeoutMs = 10000;
-      params.startOffset = 1000000;
-      params.seekAccuracy = 100000;
-      params.preventStaleness = false;
-
-      for (size_t n = 0; n < num; ++n) {
-        std::list<DecoderOutputMessage> msgs;
-
-        params.startOffset =
-            av_rescale_q(fOffset, {1, item.fps}, AV_TIME_BASE_Q);
-        params.endOffset = params.startOffset + 100;
-
-        auto now = std::chrono::steady_clock::now();
-        SyncDecoder decoder;
-        CHECK(decoder.init(
-            params,
-            MemoryBuffer::getCallback(buffer.data(), buffer.size()),
-            nullptr));
-        DecoderOutputMessage out;
-        while (0 == decoder.decode(&out, params.timeoutMs)) {
-          msgs.push_back(std::move(out));
-        }
-
-        decoder.shutdown();
-
-        const auto then = std::chrono::steady_clock::now();
-
-        fOffset += 1 + stride;
-
-        avgClipDecodingUs +=
-            std::chrono::duration_cast<std::chrono::microseconds>(then - now)
-                .count();
-      }
-    }
-  }
-
-  return avgClipDecodingUs / rounds / num / stats.size();
-}
-
-void runDecoder(SyncDecoder& decoder) {
-  DecoderOutputMessage out;
-  size_t audioFrames = 0, videoFrames = 0, totalBytes = 0;
-  while (0 == decoder.decode(&out, 10000)) {
-    if (out.header.format.type == TYPE_AUDIO) {
-      ++audioFrames;
-    } else if (out.header.format.type == TYPE_VIDEO) {
-      ++videoFrames;
-    } else if (out.header.format.type == TYPE_SUBTITLE && out.payload) {
-      // deserialize
-      LOG(INFO) << "Deserializing subtitle";
-      AVSubtitle sub;
-      memset(&sub, 0, sizeof(sub));
-      EXPECT_TRUE(Util::deserialize(*out.payload, &sub));
-      LOG(INFO) << "Found subtitles" << ", num rects: " << sub.num_rects;
-      for (int i = 0; i < sub.num_rects; ++i) {
-        std::string text = "picture";
-        if (sub.rects[i]->type == SUBTITLE_TEXT) {
-          text = sub.rects[i]->text;
-        } else if (sub.rects[i]->type == SUBTITLE_ASS) {
-          text = sub.rects[i]->ass;
-        }
-
-        LOG(INFO) << "Rect num: " << i << ", type:" << sub.rects[i]->type
-                  << ", text: " << text;
-      }
-
-      avsubtitle_free(&sub);
-    }
-    if (out.payload) {
-      totalBytes += out.payload->length();
-    }
-  }
-  LOG(INFO) << "Decoded audio frames: " << audioFrames
-            << ", video frames: " << videoFrames
-            << ", total bytes: " << totalBytes;
-}
-} // namespace
-
-TEST(SyncDecoder, TestSyncDecoderPerformance) {
-  // Measure the average time of decoding per clip
-  // 1. list of the videos in testing directory
-  // 2. for each video got number of frames with timestamps
-  // 3. randomly select frame offset
-  // 4. adjust offset for number frames and strides,
-  //    if it's out out upper boundary
-  // 5. repeat multiple times, measuring and accumulating decoding time
-  //    per clip.
-  /*
-  1) 4 x 2
-  2) 8 x 8
-  3) 16 x 8
-  4) 32 x 4
-  */
-  const std::string kFolder = "pytorch/vision/test/assets/videos";
-  std::vector<VideoFileStats> stats;
-  gotAllTestFiles(kFolder, &stats);
-  gotFilesStats(stats);
-
-  const size_t kRounds = 10;
-
-  auto new4x2 = measurePerformanceUs(stats, kRounds, 4, 2);
-  auto new8x8 = measurePerformanceUs(stats, kRounds, 8, 8);
-  auto new16x8 = measurePerformanceUs(stats, kRounds, 16, 8);
-  auto new32x4 = measurePerformanceUs(stats, kRounds, 32, 4);
-  LOG(INFO) << "Clip decoding (us)" << ", new(4x2): " << new4x2
-            << ", new(8x8): " << new8x8 << ", new(16x8): " << new16x8
-            << ", new(32x4): " << new32x4;
-}
-
-TEST(SyncDecoder, Test) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.seekAccuracy = 100000;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-  params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-}
-
-TEST(SyncDecoder, TestSubtitles) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-  params.uri = "vue/synergy/data/robotsub.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-}
-
-TEST(SyncDecoder, TestHeadersOnly) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.seekAccuracy = 100000;
-  params.headerOnly = true;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-
-  params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-
-  params.uri = "pytorch/vision/test/assets/videos/SOX5yA1l24A.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-
-  params.uri = "pytorch/vision/test/assets/videos/WUzgd7C1pWA.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-}
-
-TEST(SyncDecoder, TestHeadersOnlyDownSampling) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.seekAccuracy = 100000;
-  params.headerOnly = true;
-  MediaFormat format;
-  format.type = TYPE_AUDIO;
-  format.format.audio.samples = 8000;
-  params.formats.insert(format);
-
-  format.type = TYPE_VIDEO;
-  format.format.video.width = 224;
-  format.format.video.height = 224;
-  params.formats.insert(format);
-
-  params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-
-  params.uri = "pytorch/vision/test/assets/videos/SOX5yA1l24A.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-
-  params.uri = "pytorch/vision/test/assets/videos/WUzgd7C1pWA.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-}
-
-TEST(SyncDecoder, TestInitOnlyNoShutdown) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.seekAccuracy = 100000;
-  params.headerOnly = false;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-  params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";
-  std::vector<DecoderMetadata> metadata;
-  CHECK(decoder.init(params, nullptr, &metadata));
-}
-
-TEST(SyncDecoder, TestMemoryBuffer) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.endOffset = 9000000;
-  params.seekAccuracy = 10000;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-
-  FILE* f = fopen(
-      "pytorch/vision/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi",
-      "rb");
-  CHECK(f != nullptr);
-  fseek(f, 0, SEEK_END);
-  std::vector<uint8_t> buffer(ftell(f));
-  rewind(f);
-  size_t s = fread(buffer.data(), 1, buffer.size(), f);
-  TORCH_CHECK_EQ(buffer.size(), s);
-  fclose(f);
-  CHECK(decoder.init(
-      params,
-      MemoryBuffer::getCallback(buffer.data(), buffer.size()),
-      nullptr));
-  LOG(INFO) << "Decoding from memory bytes: " << buffer.size();
-  runDecoder(decoder);
-  decoder.shutdown();
-}
-
-TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.endOffset = 9000000;
-  params.seekAccuracy = 10000;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-
-  FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb");
-  CHECK(f != nullptr);
-  fseek(f, 0, SEEK_END);
-  std::vector<uint8_t> buffer(ftell(f));
-  rewind(f);
-  size_t s = fread(buffer.data(), 1, buffer.size(), f);
-  TORCH_CHECK_EQ(buffer.size(), s);
-  fclose(f);
-
-  params.maxSeekableBytes = buffer.size() + 1;
-  MemoryBuffer object(buffer.data(), buffer.size());
-  CHECK(decoder.init(
-      params,
-      [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
-      -> int {
-        if (out) { // see defs.h file
-          // read mode
-          return object.read(out, size);
-        }
-        // seek mode
-        if (!timeoutMs) {
-          // seek capability, yes - no
-          return -1;
-        }
-        return object.seek(size, whence);
-      },
-      nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-}
-
-TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.endOffset = 9000000;
-  params.seekAccuracy = 10000;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-
-  FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb");
-  CHECK(f != nullptr);
-  fseek(f, 0, SEEK_END);
-  std::vector<uint8_t> buffer(ftell(f));
-  rewind(f);
-  size_t s = fread(buffer.data(), 1, buffer.size(), f);
-  TORCH_CHECK_EQ(buffer.size(), s);
-  fclose(f);
-
-  params.maxSeekableBytes = buffer.size() / 2;
-  MemoryBuffer object(buffer.data(), buffer.size());
-  CHECK(!decoder.init(
-      params,
-      [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
-      -> int {
-        if (out) { // see defs.h file
-          // read mode
-          return object.read(out, size);
-        }
-        // seek mode
-        if (!timeoutMs) {
-          // seek capability, yes - no
-          return -1;
-        }
-        return object.seek(size, whence);
-      },
-      nullptr));
-}
diff --git a/torchvision/csrc/io/decoder/time_keeper.cpp b/torchvision/csrc/io/decoder/time_keeper.cpp
deleted file mode 100644
index 845c76cddc8..00000000000
--- a/torchvision/csrc/io/decoder/time_keeper.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-#include "time_keeper.h"
-#include "defs.h"
-
-namespace ffmpeg {
-
-namespace {
-const long kMaxTimeBaseDiference = 10;
-}
-
-long TimeKeeper::adjust(long& decoderTimestamp) {
-  const long now = std::chrono::duration_cast<std::chrono::microseconds>(
-                       std::chrono::system_clock::now().time_since_epoch())
-                       .count();
-
-  if (startTime_ == 0) {
-    startTime_ = now;
-  }
-  if (streamTimestamp_ == 0) {
-    streamTimestamp_ = decoderTimestamp;
-  }
-
-  const auto runOut = startTime_ + decoderTimestamp - streamTimestamp_;
-
-  if (std::labs((now - runOut) / AV_TIME_BASE) > kMaxTimeBaseDiference) {
-    streamTimestamp_ = startTime_ - now + decoderTimestamp;
-  }
-
-  const auto sleepAdvised = runOut - now;
-
-  decoderTimestamp += startTime_ - streamTimestamp_;
-
-  return sleepAdvised > 0 ? sleepAdvised : 0;
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/time_keeper.h b/torchvision/csrc/io/decoder/time_keeper.h
deleted file mode 100644
index e4d4718c705..00000000000
--- a/torchvision/csrc/io/decoder/time_keeper.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#pragma once
-
-#include <stdlib.h>
-#include <chrono>
-
-namespace ffmpeg {
-
-/**
- * Class keeps the track of the decoded timestamps (us) for media streams.
- */
-
-class TimeKeeper {
- public:
-  TimeKeeper() = default;
-
-  // adjust provided @timestamp to the corrected value
-  // return advised sleep time before next frame processing in (us)
-  long adjust(long& decoderTimestamp);
-
- private:
-  long startTime_{0};
-  long streamTimestamp_{0};
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/util.cpp b/torchvision/csrc/io/decoder/util.cpp
deleted file mode 100644
index 7198d2174ed..00000000000
--- a/torchvision/csrc/io/decoder/util.cpp
+++ /dev/null
@@ -1,401 +0,0 @@
-#include "util.h"
-#include <c10/util/Logging.h>
-
-namespace ffmpeg {
-
-namespace Serializer {
-
-// fixed size types
-template <typename T>
-inline size_t getSize(const T& x) {
-  return sizeof(x);
-}
-
-template <typename T>
-inline bool serializeItem(
-    uint8_t* dest,
-    size_t len,
-    size_t& pos,
-    const T& src) {
-  VLOG(6) << "Generic serializeItem";
-  const auto required = sizeof(src);
-  if (len < pos + required) {
-    return false;
-  }
-  memcpy(dest + pos, &src, required);
-  pos += required;
-  return true;
-}
-
-template <typename T>
-inline bool deserializeItem(
-    const uint8_t* src,
-    size_t len,
-    size_t& pos,
-    T& dest) {
-  const auto required = sizeof(dest);
-  if (len < pos + required) {
-    return false;
-  }
-  memcpy(&dest, src + pos, required);
-  pos += required;
-  return true;
-}
-
-// AVSubtitleRect specialization
-inline size_t getSize(const AVSubtitleRect& x) {
-  auto rectBytes = [](const AVSubtitleRect& y) -> size_t {
-    size_t s = 0;
-    switch (y.type) {
-      case SUBTITLE_BITMAP:
-        for (int i = 0; i < y.nb_colors; ++i) {
-          s += sizeof(y.linesize[i]);
-          s += y.linesize[i];
-        }
-        break;
-      case SUBTITLE_TEXT:
-        s += sizeof(size_t);
-        s += strlen(y.text);
-        break;
-      case SUBTITLE_ASS:
-        s += sizeof(size_t);
-        s += strlen(y.ass);
-        break;
-      default:
-        break;
-    }
-    return s;
-  };
-  return getSize(x.x) + getSize(x.y) + getSize(x.w) + getSize(x.h) +
-      getSize(x.nb_colors) + getSize(x.type) + getSize(x.flags) + rectBytes(x);
-}
-
-// AVSubtitle specialization
-inline size_t getSize(const AVSubtitle& x) {
-  auto rectBytes = [](const AVSubtitle& y) -> size_t {
-    size_t s = getSize(y.num_rects);
-    for (unsigned i = 0; i < y.num_rects; ++i) {
-      s += getSize(*y.rects[i]);
-    }
-    return s;
-  };
-  return getSize(x.format) + getSize(x.start_display_time) +
-      getSize(x.end_display_time) + getSize(x.pts) + rectBytes(x);
-}
-
-inline bool serializeItem(
-    uint8_t* dest,
-    size_t len,
-    size_t& pos,
-    const AVSubtitleRect& src) {
-  auto rectSerialize =
-      [](uint8_t* d, size_t l, size_t& p, const AVSubtitleRect& x) -> size_t {
-    switch (x.type) {
-      case SUBTITLE_BITMAP:
-        for (int i = 0; i < x.nb_colors; ++i) {
-          if (!serializeItem(d, l, p, x.linesize[i])) {
-            return false;
-          }
-          if (p + x.linesize[i] > l) {
-            return false;
-          }
-          memcpy(d + p, x.data[i], x.linesize[i]);
-          p += x.linesize[i];
-        }
-        return true;
-      case SUBTITLE_TEXT: {
-        const size_t s = strlen(x.text);
-        if (!serializeItem(d, l, p, s)) {
-          return false;
-        }
-        if (p + s > l) {
-          return false;
-        }
-        memcpy(d + p, x.text, s);
-        p += s;
-        return true;
-      }
-      case SUBTITLE_ASS: {
-        const size_t s = strlen(x.ass);
-        if (!serializeItem(d, l, p, s)) {
-          return false;
-        }
-        if (p + s > l) {
-          return false;
-        }
-        memcpy(d + p, x.ass, s);
-        p += s;
-        return true;
-      }
-      default:
-        return true;
-    }
-  };
-  return serializeItem(dest, len, pos, src.x) &&
-      serializeItem(dest, len, pos, src.y) &&
-      serializeItem(dest, len, pos, src.w) &&
-      serializeItem(dest, len, pos, src.h) &&
-      serializeItem(dest, len, pos, src.nb_colors) &&
-      serializeItem(dest, len, pos, src.type) &&
-      serializeItem(dest, len, pos, src.flags) &&
-      rectSerialize(dest, len, pos, src);
-}
-
-inline bool serializeItem(
-    uint8_t* dest,
-    size_t len,
-    size_t& pos,
-    const AVSubtitle& src) {
-  auto rectSerialize =
-      [](uint8_t* d, size_t l, size_t& p, const AVSubtitle& x) -> bool {
-    bool res = serializeItem(d, l, p, x.num_rects);
-    for (unsigned i = 0; res && i < x.num_rects; ++i) {
-      res = serializeItem(d, l, p, *(x.rects[i]));
-    }
-    return res;
-  };
-  VLOG(6) << "AVSubtitle serializeItem";
-  return serializeItem(dest, len, pos, src.format) &&
-      serializeItem(dest, len, pos, src.start_display_time) &&
-      serializeItem(dest, len, pos, src.end_display_time) &&
-      serializeItem(dest, len, pos, src.pts) &&
-      rectSerialize(dest, len, pos, src);
-}
-
-inline bool deserializeItem(
-    const uint8_t* src,
-    size_t len,
-    size_t& pos,
-    AVSubtitleRect& dest) {
-  auto rectDeserialize =
-      [](const uint8_t* y, size_t l, size_t& p, AVSubtitleRect& x) -> bool {
-    switch (x.type) {
-      case SUBTITLE_BITMAP:
-        for (int i = 0; i < x.nb_colors; ++i) {
-          if (!deserializeItem(y, l, p, x.linesize[i])) {
-            return false;
-          }
-          if (p + x.linesize[i] > l) {
-            return false;
-          }
-          x.data[i] = (uint8_t*)av_malloc(x.linesize[i]);
-          memcpy(x.data[i], y + p, x.linesize[i]);
-          p += x.linesize[i];
-        }
-        return true;
-      case SUBTITLE_TEXT: {
-        size_t s = 0;
-        if (!deserializeItem(y, l, p, s)) {
-          return false;
-        }
-        if (p + s > l) {
-          return false;
-        }
-        x.text = (char*)av_malloc(s + 1);
-        memcpy(x.text, y + p, s);
-        x.text[s] = 0;
-        p += s;
-        return true;
-      }
-      case SUBTITLE_ASS: {
-        size_t s = 0;
-        if (!deserializeItem(y, l, p, s)) {
-          return false;
-        }
-        if (p + s > l) {
-          return false;
-        }
-        x.ass = (char*)av_malloc(s + 1);
-        memcpy(x.ass, y + p, s);
-        x.ass[s] = 0;
-        p += s;
-        return true;
-      }
-      default:
-        return true;
-    }
-  };
-
-  return deserializeItem(src, len, pos, dest.x) &&
-      deserializeItem(src, len, pos, dest.y) &&
-      deserializeItem(src, len, pos, dest.w) &&
-      deserializeItem(src, len, pos, dest.h) &&
-      deserializeItem(src, len, pos, dest.nb_colors) &&
-      deserializeItem(src, len, pos, dest.type) &&
-      deserializeItem(src, len, pos, dest.flags) &&
-      rectDeserialize(src, len, pos, dest);
-}
-
-inline bool deserializeItem(
-    const uint8_t* src,
-    size_t len,
-    size_t& pos,
-    AVSubtitle& dest) {
-  auto rectDeserialize =
-      [](const uint8_t* y, size_t l, size_t& p, AVSubtitle& x) -> bool {
-    bool res = deserializeItem(y, l, p, x.num_rects);
-    if (res && x.num_rects) {
-      x.rects =
-          (AVSubtitleRect**)av_malloc(x.num_rects * sizeof(AVSubtitleRect*));
-    }
-    for (unsigned i = 0; res && i < x.num_rects; ++i) {
-      x.rects[i] = (AVSubtitleRect*)av_malloc(sizeof(AVSubtitleRect));
-      memset(x.rects[i], 0, sizeof(AVSubtitleRect));
-      res = deserializeItem(y, l, p, *x.rects[i]);
-    }
-    return res;
-  };
-  return deserializeItem(src, len, pos, dest.format) &&
-      deserializeItem(src, len, pos, dest.start_display_time) &&
-      deserializeItem(src, len, pos, dest.end_display_time) &&
-      deserializeItem(src, len, pos, dest.pts) &&
-      rectDeserialize(src, len, pos, dest);
-}
-} // namespace Serializer
-
-namespace Util {
-std::string generateErrorDesc(int errorCode) {
-  std::array<char, 1024> buffer;
-  if (av_strerror(errorCode, buffer.data(), buffer.size()) < 0) {
-    return std::string("Unknown error code: ") + std::to_string(errorCode);
-  }
-  buffer.back() = 0;
-  return std::string(buffer.data());
-}
-
-size_t serialize(const AVSubtitle& sub, ByteStorage* out) {
-  const auto len = size(sub);
-  size_t pos = 0;
-  if (!Serializer::serializeItem(out->writableTail(), len, pos, sub)) {
-    return 0;
-  }
-  out->append(len);
-  return len;
-}
-
-bool deserialize(const ByteStorage& buf, AVSubtitle* sub) {
-  size_t pos = 0;
-  return Serializer::deserializeItem(buf.data(), buf.length(), pos, *sub);
-}
-
-size_t size(const AVSubtitle& sub) {
-  return Serializer::getSize(sub);
-}
-
-bool validateVideoFormat(const VideoFormat& f) {
-  // clang-format off
-  /*
-  Valid parameters values for decoder
-  ____________________________________________________________________________________
-  |  W  |  H  | minDimension | maxDimension | cropImage |  algorithm                 |
-  |__________________________________________________________________________________|
-  |  0  |  0  |     0        |  0           |  N/A      |   original                 |
-  |__________________________________________________________________________________|
-  |  >0 |  0  |     N/A      |  N/A         |  N/A      |   scale keeping W          |
-  |__________________________________________________________________________________|
-  |  0  |  >0 |     N/A      |  N/A         |  N/A      |   scale keeping H          |
-  |__________________________________________________________________________________|
-  |  >0 |  >0 |     N/A      |  N/A         |  0        |   stretch/scale            |
-  |__________________________________________________________________________________|
-  |  >0 |  >0 |     N/A      |  N/A         |  >0       |   scale/crop               |
-  |__________________________________________________________________________________|
-  |  0  |  0  |     >0       |  0           |  N/A      |scale to min dimension      |
-  |__________________________________________________________________________________|
-  |  0  |  0  |     0        |  >0          |  N/A      |scale to max dimension      |
-  |__________________________________________________________________________________|
-  |  0  |  0  |     >0       |  >0          |  N/A      |stretch to min/max dimension|
-  |_____|_____|______________|______________|___________|____________________________|
-
-  */
-  // clang-format on
-  return (f.width == 0 && // #1, #6, #7 and #8
-          f.height == 0 && f.cropImage == 0) ||
-      (f.width != 0 && // #4 and #5
-       f.height != 0 && f.minDimension == 0 && f.maxDimension == 0) ||
-      (((f.width != 0 && // #2
-         f.height == 0) ||
-        (f.width == 0 && // #3
-         f.height != 0)) &&
-       f.minDimension == 0 && f.maxDimension == 0 && f.cropImage == 0);
-}
-
-void setFormatDimensions(
-    size_t& destW,
-    size_t& destH,
-    size_t userW,
-    size_t userH,
-    size_t srcW,
-    size_t srcH,
-    size_t minDimension,
-    size_t maxDimension,
-    size_t cropImage) {
-  // rounding rules
-  // int -> double -> round up
-  // if fraction is >= 0.5 or round down if fraction is < 0.5
-  // int result = double(value) + 0.5
-  // here we rounding double to int according to the above rule
-
-  // #1, #6, #7 and #8
-  if (userW == 0 && userH == 0) {
-    if (minDimension > 0 && maxDimension == 0) { // #6
-      if (srcW > srcH) {
-        // landscape
-        destH = minDimension;
-        destW = round(double(srcW * minDimension) / srcH);
-      } else {
-        // portrait
-        destW = minDimension;
-        destH = round(double(srcH * minDimension) / srcW);
-      }
-    } else if (minDimension == 0 && maxDimension > 0) { // #7
-      if (srcW > srcH) {
-        // landscape
-        destW = maxDimension;
-        destH = round(double(srcH * maxDimension) / srcW);
-      } else {
-        // portrait
-        destH = maxDimension;
-        destW = round(double(srcW * maxDimension) / srcH);
-      }
-    } else if (minDimension > 0 && maxDimension > 0) { // #8
-      if (srcW > srcH) {
-        // landscape
-        destW = maxDimension;
-        destH = minDimension;
-      } else {
-        // portrait
-        destW = minDimension;
-        destH = maxDimension;
-      }
-    } else { // #1
-      destW = srcW;
-      destH = srcH;
-    }
-  } else if (userW != 0 && userH == 0) { // #2
-    destW = userW;
-    destH = round(double(srcH * userW) / srcW);
-  } else if (userW == 0 && userH != 0) { // #3
-    destW = round(double(srcW * userH) / srcH);
-    destH = userH;
-  } else { // userW != 0 && userH != 0
-    if (cropImage == 0) { // #4
-      destW = userW;
-      destH = userH;
-    } else { // #5
-      double userSlope = double(userH) / userW;
-      double srcSlope = double(srcH) / srcW;
-      if (srcSlope < userSlope) {
-        destW = round(double(srcW * userH) / srcH);
-        destH = userH;
-      } else {
-        destW = userW;
-        destH = round(double(srcH * userW) / srcW);
-      }
-    }
-  }
-  // prevent zeros
-  destW = std::max(destW, size_t(1UL));
-  destH = std::max(destH, size_t(1UL));
-}
-} // namespace Util
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/util.h b/torchvision/csrc/io/decoder/util.h
deleted file mode 100644
index 01b550e5bbc..00000000000
--- a/torchvision/csrc/io/decoder/util.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * FFMPEG library utility functions.
- */
-
-namespace Util {
-std::string generateErrorDesc(int errorCode);
-size_t serialize(const AVSubtitle& sub, ByteStorage* out);
-bool deserialize(const ByteStorage& buf, AVSubtitle* sub);
-size_t size(const AVSubtitle& sub);
-void setFormatDimensions(
-    size_t& destW,
-    size_t& destH,
-    size_t userW,
-    size_t userH,
-    size_t srcW,
-    size_t srcH,
-    size_t minDimension,
-    size_t maxDimension,
-    size_t cropImage);
-bool validateVideoFormat(const VideoFormat& format);
-} // namespace Util
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/util_test.cpp b/torchvision/csrc/io/decoder/util_test.cpp
deleted file mode 100644
index 0a093d9561b..00000000000
--- a/torchvision/csrc/io/decoder/util_test.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-#include <c10/util/Logging.h>
-#include <gtest/gtest.h>
-#include "util.h"
-
-TEST(Util, TestSetFormatDimensions) {
-  // clang-format off
-  const size_t test_cases[][9] = {
-      // (userW, userH, srcW, srcH, minDimension, maxDimension, cropImage, destW, destH)
-      {0, 0, 172, 128, 0, 0, 0, 172, 128},    // #1
-      {86, 0, 172, 128, 0, 0, 0, 86, 64},     // #2
-      {64, 0, 128, 172, 0, 0, 0, 64, 86},     // #2
-      {0, 32, 172, 128, 0, 0, 0, 43, 32},     // #3
-      {32, 0, 128, 172, 0, 0, 0, 32, 43},     // #3
-      {60, 50, 172, 128, 0, 0, 0, 60, 50},    // #4
-      {50, 60, 128, 172, 0, 0, 0, 50, 60},    // #4
-      {86, 40, 172, 128, 0, 0, 1, 86, 64},    // #5
-      {86, 92, 172, 128, 0, 0, 1, 124, 92},   // #5
-      {0, 0, 172, 128, 256, 0, 0, 344, 256},  // #6
-      {0, 0, 128, 172, 256, 0, 0, 256, 344},  // #6
-      {0, 0, 128, 172, 0, 344, 0, 256, 344},  // #7
-      {0, 0, 172, 128, 0, 344, 0, 344, 256},  // #7
-      {0, 0, 172, 128, 100, 344, 0, 344, 100},// #8
-      {0, 0, 128, 172, 100, 344, 0, 100, 344} // #8
-  };
-  // clang-format onn
-
-  for (const auto& tc : test_cases) {
-      size_t destW = 0;
-      size_t destH = 0;
-      ffmpeg::Util::setFormatDimensions(destW, destH, tc[0], tc[1], tc[2], tc[3], tc[4], tc[5], tc[6]);
-      CHECK(destW == tc[7]);
-      CHECK(destH == tc[8]);
-  }
-}
diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp
deleted file mode 100644
index 8b712609e34..00000000000
--- a/torchvision/csrc/io/decoder/video_sampler.cpp
+++ /dev/null
@@ -1,337 +0,0 @@
-#include "video_sampler.h"
-#include <c10/util/Logging.h>
-#include "util.h"
-
-// www.ffmpeg.org/doxygen/0.5/swscale-example_8c-source.html
-
-namespace ffmpeg {
-
-namespace {
-
-// Setup the data pointers and linesizes based on the specified image
-// parameters and the provided array. This sets up "planes" to point to a
-// "buffer"
-// NOTE: this is most likely culprit behind #3534
-//
-// Args:
-// fmt: desired output video format
-// buffer: source constant image buffer (in different format) that will contain
-// the final image after SWScale planes: destination data pointer to be filled
-// lineSize: target destination linesize (always {0})
-int preparePlanes(
-    const VideoFormat& fmt,
-    const uint8_t* buffer,
-    uint8_t** planes,
-    int* lineSize) {
-  int result;
-
-  // NOTE: 1 at the end of av_fill_arrays is the value used for alignment
-  if ((result = av_image_fill_arrays(
-           planes,
-           lineSize,
-           buffer,
-           (AVPixelFormat)fmt.format,
-           fmt.width,
-           fmt.height,
-           1)) < 0) {
-    LOG(ERROR) << "av_image_fill_arrays failed, err: "
-               << Util::generateErrorDesc(result);
-  }
-  return result;
-}
-
-// Scale (and crop) the image slice in srcSlice and put the resulting scaled
-// slice to `planes` buffer, which is mapped to be `out` via preparePlanes as
-// `sws_scale` cannot access buffers directly.
-//
-// Args:
-// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if
-// scale) srcSlice: frame data in YUV420P srcStride: the array containing the
-// strides for each plane of the source
-//            image (from AVFrame->linesize[0])
-// out: destination buffer
-// planes: indirect destination buffer (mapped to "out" via preparePlanes)
-// lines: destination linesize; constant {0}
-int transformImage(
-    SwsContext* context,
-    const uint8_t* const srcSlice[],
-    int srcStride[],
-    VideoFormat inFormat,
-    VideoFormat outFormat,
-    uint8_t* out,
-    uint8_t* planes[],
-    int lines[]) {
-  int result;
-  if ((result = preparePlanes(outFormat, out, planes, lines)) < 0) {
-    return result;
-  }
-  if (context) {
-    // NOTE: srcY stride always 0: this is a parameter of YUV format
-    if ((result = sws_scale(
-             context, srcSlice, srcStride, 0, inFormat.height, planes, lines)) <
-        0) {
-      LOG(ERROR) << "sws_scale failed, err: "
-                 << Util::generateErrorDesc(result);
-      return result;
-    }
-  } else if (
-      inFormat.width == outFormat.width &&
-      inFormat.height == outFormat.height &&
-      inFormat.format == outFormat.format) {
-    // Copy planes without using sws_scale if sws_getContext failed.
-    av_image_copy(
-        planes,
-        lines,
-        (const uint8_t**)srcSlice,
-        srcStride,
-        (AVPixelFormat)inFormat.format,
-        inFormat.width,
-        inFormat.height);
-  } else {
-    LOG(ERROR) << "Invalid scale context format " << inFormat.format;
-    return AVERROR(EINVAL);
-  }
-  return 0;
-}
-} // namespace
-
-VideoSampler::VideoSampler(int swsFlags, int64_t loggingUuid)
-    : swsFlags_(swsFlags), loggingUuid_(loggingUuid) {}
-
-VideoSampler::~VideoSampler() {
-  cleanUp();
-}
-
-void VideoSampler::shutdown() {
-  cleanUp();
-}
-
-bool VideoSampler::init(const SamplerParameters& params) {
-  cleanUp();
-
-  if (params.out.video.cropImage != 0) {
-    if (!Util::validateVideoFormat(params.out.video)) {
-      LOG(ERROR) << "Invalid video format"
-                 << ", width: " << params.out.video.width
-                 << ", height: " << params.out.video.height
-                 << ", format: " << params.out.video.format
-                 << ", minDimension: " << params.out.video.minDimension
-                 << ", crop: " << params.out.video.cropImage;
-
-      return false;
-    }
-
-    scaleFormat_.format = params.out.video.format;
-    Util::setFormatDimensions(
-        scaleFormat_.width,
-        scaleFormat_.height,
-        params.out.video.width,
-        params.out.video.height,
-        params.in.video.width,
-        params.in.video.height,
-        0,
-        0,
-        1);
-
-    if (!(scaleFormat_ == params_.out.video)) { // crop required
-      cropContext_ = sws_getContext(
-          params.out.video.width,
-          params.out.video.height,
-          (AVPixelFormat)params.out.video.format,
-          params.out.video.width,
-          params.out.video.height,
-          (AVPixelFormat)params.out.video.format,
-          swsFlags_,
-          nullptr,
-          nullptr,
-          nullptr);
-
-      if (!cropContext_) {
-        LOG(ERROR) << "sws_getContext failed for crop context";
-        return false;
-      }
-
-      const auto scaleImageSize = av_image_get_buffer_size(
-          (AVPixelFormat)scaleFormat_.format,
-          scaleFormat_.width,
-          scaleFormat_.height,
-          1);
-      scaleBuffer_.resize(scaleImageSize);
-    }
-  } else {
-    scaleFormat_ = params.out.video;
-  }
-
-  VLOG(1) << "Input format #" << loggingUuid_ << ", width "
-          << params.in.video.width << ", height " << params.in.video.height
-          << ", format " << params.in.video.format << ", minDimension "
-          << params.in.video.minDimension << ", cropImage "
-          << params.in.video.cropImage;
-  VLOG(1) << "Scale format #" << loggingUuid_ << ", width "
-          << scaleFormat_.width << ", height " << scaleFormat_.height
-          << ", format " << scaleFormat_.format << ", minDimension "
-          << scaleFormat_.minDimension << ", cropImage "
-          << scaleFormat_.cropImage;
-  VLOG(1) << "Crop format #" << loggingUuid_ << ", width "
-          << params.out.video.width << ", height " << params.out.video.height
-          << ", format " << params.out.video.format << ", minDimension "
-          << params.out.video.minDimension << ", cropImage "
-          << params.out.video.cropImage;
-
-  // set output format
-  params_ = params;
-
-  if (params.in.video.format == AV_PIX_FMT_YUV420P) {
-    /* When the video width and height are not multiples of 8,
-     * and there is no size change in the conversion,
-     * a blurry screen will appear on the right side
-     * This problem was discovered in 2012 and
-     * continues to exist in version 4.1.3 in 2019
-     * This problem can be avoided by increasing SWS_ACCURATE_RND
-     * details https://trac.ffmpeg.org/ticket/1582
-     */
-    if ((params.in.video.width & 0x7) || (params.in.video.height & 0x7)) {
-      VLOG(1) << "The width " << params.in.video.width << " and height "
-              << params.in.video.height << " the image is not a multiple of 8, "
-              << "the decoding speed may be reduced";
-      swsFlags_ |= SWS_ACCURATE_RND;
-    }
-  }
-
-  scaleContext_ = sws_getContext(
-      params.in.video.width,
-      params.in.video.height,
-      (AVPixelFormat)params.in.video.format,
-      scaleFormat_.width,
-      scaleFormat_.height,
-      (AVPixelFormat)scaleFormat_.format,
-      swsFlags_,
-      nullptr,
-      nullptr,
-      nullptr);
-  // sws_getContext might fail if in/out format == AV_PIX_FMT_PAL8 (png format)
-  // Return true if input and output formats/width/height are identical
-  // Check scaleContext_ for nullptr in transformImage to copy planes directly
-
-  if (params.in.video.width == scaleFormat_.width &&
-      params.in.video.height == scaleFormat_.height &&
-      params.in.video.format == scaleFormat_.format) {
-    return true;
-  }
-  return scaleContext_ != nullptr;
-}
-
-// Main body of the sample function called from one of the overloads below
-//
-// Args:
-// srcSlice: decoded AVFrame->data perpared buffer
-// srcStride: linesize (usually obtained from AVFrame->linesize)
-// out: return buffer (ByteStorage*)
-int VideoSampler::sample(
-    const uint8_t* const srcSlice[],
-    int srcStride[],
-    ByteStorage* out) {
-  int result;
-  // scaled and cropped image
-  int outImageSize = av_image_get_buffer_size(
-      (AVPixelFormat)params_.out.video.format,
-      params_.out.video.width,
-      params_.out.video.height,
-      1);
-
-  out->ensure(outImageSize);
-
-  uint8_t* scalePlanes[4] = {nullptr};
-  int scaleLines[4] = {0};
-  // perform scale first
-  if ((result = transformImage(
-           scaleContext_,
-           srcSlice,
-           srcStride,
-           params_.in.video,
-           scaleFormat_,
-           // for crop use internal buffer
-           cropContext_ ? scaleBuffer_.data() : out->writableTail(),
-           scalePlanes,
-           scaleLines))) {
-    return result;
-  }
-
-  // is crop required?
-  if (cropContext_) {
-    uint8_t* cropPlanes[4] = {nullptr};
-    int cropLines[4] = {0};
-
-    if (params_.out.video.height < scaleFormat_.height) {
-      // Destination image is wider of source image: cut top and bottom
-      for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) {
-        scalePlanes[i] += scaleLines[i] *
-            (scaleFormat_.height - params_.out.video.height) / 2;
-      }
-    } else {
-      // Source image is wider of destination image: cut sides
-      for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) {
-        scalePlanes[i] += scaleLines[i] *
-            (scaleFormat_.width - params_.out.video.width) / 2 /
-            scaleFormat_.width;
-      }
-    }
-
-    // crop image
-    if ((result = transformImage(
-             cropContext_,
-             scalePlanes,
-             scaleLines,
-             params_.out.video,
-             params_.out.video,
-             out->writableTail(),
-             cropPlanes,
-             cropLines))) {
-      return result;
-    }
-  }
-
-  out->append(outImageSize);
-  return outImageSize;
-}
-
-// Call from `video_stream.cpp::114` - occurs during file reads
-int VideoSampler::sample(AVFrame* frame, ByteStorage* out) {
-  if (!frame) {
-    return 0; // no flush for videos
-  }
-
-  return sample(frame->data, frame->linesize, out);
-}
-
-// Call from `video_stream.cpp::114` - not sure when this occurs
-int VideoSampler::sample(const ByteStorage* in, ByteStorage* out) {
-  if (!in) {
-    return 0; // no flush for videos
-  }
-
-  int result;
-  uint8_t* inPlanes[4] = {nullptr};
-  int inLineSize[4] = {0};
-
-  if ((result = preparePlanes(
-           params_.in.video, in->data(), inPlanes, inLineSize)) < 0) {
-    return result;
-  }
-
-  return sample(inPlanes, inLineSize, out);
-}
-
-void VideoSampler::cleanUp() {
-  if (scaleContext_) {
-    sws_freeContext(scaleContext_);
-    scaleContext_ = nullptr;
-  }
-  if (cropContext_) {
-    sws_freeContext(cropContext_);
-    cropContext_ = nullptr;
-    scaleBuffer_.clear();
-  }
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/video_sampler.h b/torchvision/csrc/io/decoder/video_sampler.h
deleted file mode 100644
index 47247f2c0c5..00000000000
--- a/torchvision/csrc/io/decoder/video_sampler.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class transcode video frames from one format into another
- */
-
-class VideoSampler : public MediaSampler {
- public:
-  VideoSampler(int swsFlags = SWS_AREA, int64_t loggingUuid = 0);
-
-  ~VideoSampler() override;
-
-  // MediaSampler overrides
-  bool init(const SamplerParameters& params) override;
-  int sample(const ByteStorage* in, ByteStorage* out) override;
-  void shutdown() override;
-
-  // returns number processed/scaling bytes
-  int sample(AVFrame* frame, ByteStorage* out);
-  int getImageBytes() const;
-
- private:
-  // close resources
-  void cleanUp();
-  // helper functions for rescaling, cropping, etc.
-  int sample(
-      const uint8_t* const srcSlice[],
-      int srcStride[],
-      ByteStorage* out);
-
- private:
-  VideoFormat scaleFormat_;
-  SwsContext* scaleContext_{nullptr};
-  SwsContext* cropContext_{nullptr};
-  int swsFlags_{SWS_AREA};
-  std::vector<uint8_t> scaleBuffer_;
-  int64_t loggingUuid_{0};
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/video_stream.cpp b/torchvision/csrc/io/decoder/video_stream.cpp
deleted file mode 100644
index fa08c65cac1..00000000000
--- a/torchvision/csrc/io/decoder/video_stream.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-#include "video_stream.h"
-#include <c10/util/Logging.h>
-#include "util.h"
-
-namespace ffmpeg {
-
-namespace {
-bool operator==(const VideoFormat& x, const AVFrame& y) {
-  return x.width == static_cast<size_t>(y.width) &&
-      x.height == static_cast<size_t>(y.height) && x.format == y.format;
-}
-
-bool operator==(const VideoFormat& x, const AVCodecContext& y) {
-  return x.width == static_cast<size_t>(y.width) &&
-      x.height == static_cast<size_t>(y.height) && x.format == y.pix_fmt;
-}
-
-VideoFormat& toVideoFormat(VideoFormat& x, const AVFrame& y) {
-  x.width = y.width;
-  x.height = y.height;
-  x.format = y.format;
-  return x;
-}
-
-VideoFormat& toVideoFormat(VideoFormat& x, const AVCodecContext& y) {
-  x.width = y.width;
-  x.height = y.height;
-  x.format = y.pix_fmt;
-  return x;
-}
-} // namespace
-
-VideoStream::VideoStream(
-    AVFormatContext* inputCtx,
-    int index,
-    bool convertPtsToWallTime,
-    const VideoFormat& format,
-    int64_t loggingUuid)
-    : Stream(
-          inputCtx,
-          MediaFormat::makeMediaFormat(format, index),
-          convertPtsToWallTime,
-          loggingUuid) {}
-
-VideoStream::~VideoStream() {
-  if (sampler_) {
-    sampler_->shutdown();
-    sampler_.reset();
-  }
-}
-
-int VideoStream::initFormat() {
-  // set output format
-  if (!Util::validateVideoFormat(format_.format.video)) {
-    LOG(ERROR) << "Invalid video format"
-               << ", width: " << format_.format.video.width
-               << ", height: " << format_.format.video.height
-               << ", format: " << format_.format.video.format
-               << ", minDimension: " << format_.format.video.minDimension
-               << ", crop: " << format_.format.video.cropImage;
-    return -1;
-  }
-
-  // keep aspect ratio
-  Util::setFormatDimensions(
-      format_.format.video.width,
-      format_.format.video.height,
-      format_.format.video.width,
-      format_.format.video.height,
-      codecCtx_->width,
-      codecCtx_->height,
-      format_.format.video.minDimension,
-      format_.format.video.maxDimension,
-      0);
-
-  if (format_.format.video.format == AV_PIX_FMT_NONE) {
-    format_.format.video.format = codecCtx_->pix_fmt;
-  }
-  return format_.format.video.width != 0 && format_.format.video.height != 0 &&
-          format_.format.video.format != AV_PIX_FMT_NONE
-      ? 0
-      : -1;
-}
-
-// copies frame bytes via sws_scale call in video_sampler.cpp
-int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) {
-  if (!sampler_) {
-    sampler_ = std::make_unique<VideoSampler>(SWS_AREA, loggingUuid_);
-  }
-
-  // check if input format gets changed
-  if (flush ? !(sampler_->getInputFormat().video == *codecCtx_)
-            : !(sampler_->getInputFormat().video == *frame_)) {
-    // - reinit sampler
-    SamplerParameters params;
-    params.type = format_.type;
-    params.out = format_.format;
-    params.in = FormatUnion(0);
-    flush ? toVideoFormat(params.in.video, *codecCtx_)
-          : toVideoFormat(params.in.video, *frame_);
-    if (!sampler_->init(params)) {
-      return -1;
-    }
-
-    VLOG(1) << "Set input video sampler format"
-            << ", width: " << params.in.video.width
-            << ", height: " << params.in.video.height
-            << ", format: " << params.in.video.format
-            << " : output video sampler format"
-            << ", width: " << format_.format.video.width
-            << ", height: " << format_.format.video.height
-            << ", format: " << format_.format.video.format
-            << ", minDimension: " << format_.format.video.minDimension
-            << ", crop: " << format_.format.video.cropImage;
-  }
-  // calls to a sampler that converts the frame from YUV422 to RGB24, and
-  // optionally crops and resizes the frame. Frame bytes are copied from
-  // frame_->data to out buffer
-  return sampler_->sample(flush ? nullptr : frame_, out);
-}
-
-void VideoStream::setHeader(DecoderHeader* header, bool flush) {
-  Stream::setHeader(header, flush);
-  if (!flush) { // no frames for video flush
-    header->keyFrame = frame_->key_frame;
-    header->fps = av_q2d(av_guess_frame_rate(
-        inputCtx_, inputCtx_->streams[format_.stream], nullptr));
-  }
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/video_stream.h b/torchvision/csrc/io/decoder/video_stream.h
deleted file mode 100644
index e6a8bf02b65..00000000000
--- a/torchvision/csrc/io/decoder/video_stream.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#pragma once
-
-#include "stream.h"
-#include "video_sampler.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode one video stream.
- */
-
-class VideoStream : public Stream {
- public:
-  VideoStream(
-      AVFormatContext* inputCtx,
-      int index,
-      bool convertPtsToWallTime,
-      const VideoFormat& format,
-      int64_t loggingUuid);
-  ~VideoStream() override;
-
- private:
-  int initFormat() override;
-  int copyFrameBytes(ByteStorage* out, bool flush) override;
-  void setHeader(DecoderHeader* header, bool flush) override;
-
- private:
-  std::unique_ptr<VideoSampler> sampler_;
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp
deleted file mode 100644
index 8f1fb3fb5b9..00000000000
--- a/torchvision/csrc/io/video/video.cpp
+++ /dev/null
@@ -1,387 +0,0 @@
-#include "video.h"
-
-#include <regex>
-
-using namespace ffmpeg;
-
-namespace vision {
-namespace video {
-
-namespace {
-
-const size_t decoderTimeoutMs = 600000;
-const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
-
-// returns number of written bytes
-template <typename T>
-size_t fillTensorList(DecoderOutputMessage& msgs, torch::Tensor& frame) {
-  const auto& msg = msgs;
-  T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
-  if (frameData) {
-    auto sizeInBytes = msg.payload->length();
-    memcpy(frameData, msg.payload->data(), sizeInBytes);
-  }
-  return sizeof(T);
-}
-
-size_t fillVideoTensor(DecoderOutputMessage& msgs, torch::Tensor& videoFrame) {
-  return fillTensorList<uint8_t>(msgs, videoFrame);
-}
-
-size_t fillAudioTensor(DecoderOutputMessage& msgs, torch::Tensor& audioFrame) {
-  return fillTensorList<float>(msgs, audioFrame);
-}
-
-std::array<std::pair<std::string, ffmpeg::MediaType>, 4>::const_iterator
-_parse_type(const std::string& stream_string) {
-  static const std::array<std::pair<std::string, MediaType>, 4> types = {{
-      {"video", TYPE_VIDEO},
-      {"audio", TYPE_AUDIO},
-      {"subtitle", TYPE_SUBTITLE},
-      {"cc", TYPE_CC},
-  }};
-  auto device = std::find_if(
-      types.begin(),
-      types.end(),
-      [stream_string](const std::pair<std::string, MediaType>& p) {
-        return p.first == stream_string;
-      });
-  if (device != types.end()) {
-    return device;
-  }
-  TORCH_CHECK(
-      false, "Expected one of [audio, video, subtitle, cc] ", stream_string);
-}
-
-std::string parse_type_to_string(const std::string& stream_string) {
-  auto device = _parse_type(stream_string);
-  return device->first;
-}
-
-MediaType parse_type_to_mt(const std::string& stream_string) {
-  auto device = _parse_type(stream_string);
-  return device->second;
-}
-
-std::tuple<std::string, long> _parseStream(const std::string& streamString) {
-  TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
-  static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
-  std::smatch match;
-
-  TORCH_CHECK(
-      std::regex_match(streamString, match, regex),
-      "Invalid stream string: '",
-      streamString,
-      "'");
-
-  std::string type_ = "video";
-  type_ = parse_type_to_string(match[1].str());
-  long index_ = -1;
-  if (match[2].matched) {
-    try {
-      index_ = std::stoi(match[2].str());
-    } catch (const std::exception&) {
-      TORCH_CHECK(
-          false,
-          "Could not parse device index '",
-          match[2].str(),
-          "' in device string '",
-          streamString,
-          "'");
-    }
-  }
-  return std::make_tuple(type_, index_);
-}
-
-} // namespace
-
-void Video::_getDecoderParams(
-    double videoStartS,
-    int64_t getPtsOnly,
-    std::string stream,
-    long stream_id = -1,
-    bool fastSeek = true,
-    bool all_streams = false,
-    int64_t num_threads = 1,
-    double seekFrameMarginUs = 10) {
-  int64_t videoStartUs = int64_t(videoStartS * 1e6);
-
-  params.timeoutMs = decoderTimeoutMs;
-  params.startOffset = videoStartUs;
-  params.seekAccuracy = seekFrameMarginUs;
-  params.fastSeek = fastSeek;
-  params.headerOnly = false;
-  params.numThreads = num_threads;
-
-  params.preventStaleness = false; // not sure what this is about
-
-  if (all_streams == true) {
-    MediaFormat format;
-    format.stream = -2;
-    format.type = TYPE_AUDIO;
-    params.formats.insert(format);
-
-    format.type = TYPE_VIDEO;
-    format.stream = -2;
-    format.format.video.width = 0;
-    format.format.video.height = 0;
-    format.format.video.cropImage = 0;
-    format.format.video.format = defaultVideoPixelFormat;
-    params.formats.insert(format);
-
-    format.type = TYPE_SUBTITLE;
-    format.stream = -2;
-    params.formats.insert(format);
-
-    format.type = TYPE_CC;
-    format.stream = -2;
-    params.formats.insert(format);
-  } else {
-    // parse stream type
-    MediaType stream_type = parse_type_to_mt(stream);
-
-    // TODO: reset params.formats
-    std::set<MediaFormat> formats;
-    params.formats = formats;
-    // Define new format
-    MediaFormat format;
-    format.type = stream_type;
-    format.stream = stream_id;
-    if (stream_type == TYPE_VIDEO) {
-      format.format.video.width = 0;
-      format.format.video.height = 0;
-      format.format.video.cropImage = 0;
-      format.format.video.format = defaultVideoPixelFormat;
-    }
-    params.formats.insert(format);
-  }
-
-} // _get decoder params
-
-void Video::initFromFile(
-    std::string videoPath,
-    std::string stream,
-    int64_t numThreads) {
-  TORCH_CHECK(!initialized, "Video object can only be initialized once");
-  initialized = true;
-  params.uri = videoPath;
-  _init(stream, numThreads);
-}
-
-void Video::initFromMemory(
-    torch::Tensor videoTensor,
-    std::string stream,
-    int64_t numThreads) {
-  TORCH_CHECK(!initialized, "Video object can only be initialized once");
-  initialized = true;
-  callback = MemoryBuffer::getCallback(
-      videoTensor.data_ptr<uint8_t>(), videoTensor.size(0));
-  _init(stream, numThreads);
-}
-
-void Video::_init(std::string stream, int64_t numThreads) {
-  // set number of threads global
-  numThreads_ = numThreads;
-  // parse stream information
-  current_stream = _parseStream(stream);
-  // note that in the initial call we want to get all streams
-  _getDecoderParams(
-      0, // video start
-      0, // headerOnly
-      std::get<0>(current_stream), // stream info - remove that
-      long(-1), // stream_id parsed from info above change to -2
-      false, // fastseek: we're using the default param here
-      true, // read all streams
-      numThreads_ // global number of Threads for decoding
-  );
-
-  std::string logMessage, logType;
-
-  // locals
-  std::vector<double> audioFPS, videoFPS;
-  std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
-  std::vector<double> audioTB, videoTB, ccTB, subsTB;
-  c10::Dict<std::string, std::vector<double>> audioMetadata;
-  c10::Dict<std::string, std::vector<double>> videoMetadata;
-  c10::Dict<std::string, std::vector<double>> ccMetadata;
-  c10::Dict<std::string, std::vector<double>> subsMetadata;
-
-  // callback and metadata defined in struct
-  DecoderInCallback tmp_callback = callback;
-  succeeded = decoder.init(params, std::move(tmp_callback), &metadata);
-  if (succeeded) {
-    for (const auto& header : metadata) {
-      double fps = double(header.fps);
-      double duration = double(header.duration) * 1e-6; // * timeBase;
-
-      if (header.format.type == TYPE_VIDEO) {
-        videoFPS.push_back(fps);
-        videoDuration.push_back(duration);
-      } else if (header.format.type == TYPE_AUDIO) {
-        audioFPS.push_back(fps);
-        audioDuration.push_back(duration);
-      } else if (header.format.type == TYPE_CC) {
-        ccDuration.push_back(duration);
-      } else if (header.format.type == TYPE_SUBTITLE) {
-        subsDuration.push_back(duration);
-      };
-    }
-  }
-  // audio
-  audioMetadata.insert("duration", audioDuration);
-  audioMetadata.insert("framerate", audioFPS);
-  // video
-  videoMetadata.insert("duration", videoDuration);
-  videoMetadata.insert("fps", videoFPS);
-  // subs
-  subsMetadata.insert("duration", subsDuration);
-  // cc
-  ccMetadata.insert("duration", ccDuration);
-  // put all to a data
-  streamsMetadata.insert("video", videoMetadata);
-  streamsMetadata.insert("audio", audioMetadata);
-  streamsMetadata.insert("subtitles", subsMetadata);
-  streamsMetadata.insert("cc", ccMetadata);
-
-  succeeded = setCurrentStream(stream);
-  if (std::get<1>(current_stream) != -1) {
-    LOG(INFO)
-        << "Stream index set to " << std::get<1>(current_stream)
-        << ". If you encounter trouble, consider switching it to automatic stream discovery. \n";
-  }
-}
-
-Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
-  C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video");
-  if (!videoPath.empty()) {
-    initFromFile(videoPath, stream, numThreads);
-  }
-} // video
-
-bool Video::setCurrentStream(std::string stream = "video") {
-  TORCH_CHECK(initialized, "Video object has to be initialized first");
-  if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
-    current_stream = _parseStream(stream);
-  }
-
-  double ts = 0;
-  if (seekTS > 0) {
-    ts = seekTS;
-  }
-
-  _getDecoderParams(
-      ts, // video start
-      0, // headerOnly
-      std::get<0>(current_stream), // stream
-      long(std::get<1>(
-          current_stream)), // stream_id parsed from info above change to -2
-      false, // fastseek param set to 0 false by default (changed in seek)
-      false, // read all streams
-      numThreads_ // global number of threads
-  );
-
-  // callback and metadata defined in Video.h
-  DecoderInCallback tmp_callback = callback;
-  return (decoder.init(params, std::move(tmp_callback), &metadata));
-}
-
-std::tuple<std::string, int64_t> Video::getCurrentStream() const {
-  TORCH_CHECK(initialized, "Video object has to be initialized first");
-  return current_stream;
-}
-
-c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video::
-    getStreamMetadata() const {
-  TORCH_CHECK(initialized, "Video object has to be initialized first");
-  return streamsMetadata;
-}
-
-void Video::Seek(double ts, bool fastSeek = false) {
-  TORCH_CHECK(initialized, "Video object has to be initialized first");
-  // initialize the class variables used for seeking and retrurn
-  _getDecoderParams(
-      ts, // video start
-      0, // headerOnly
-      std::get<0>(current_stream), // stream
-      long(std::get<1>(
-          current_stream)), // stream_id parsed from info above change to -2
-      fastSeek, // fastseek
-      false, // read all streams
-      numThreads_ // global number of threads
-  );
-
-  // callback and metadata defined in Video.h
-  DecoderInCallback tmp_callback = callback;
-  succeeded = decoder.init(params, std::move(tmp_callback), &metadata);
-}
-
-std::tuple<torch::Tensor, double> Video::Next() {
-  TORCH_CHECK(initialized, "Video object has to be initialized first");
-  // if failing to decode simply return a null tensor (note, should we
-  // raise an exception?)
-  double frame_pts_s;
-  torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
-
-  // decode single frame
-  DecoderOutputMessage out;
-  int64_t res = decoder.decode(&out, decoderTimeoutMs);
-  // if successful
-  if (res == 0) {
-    frame_pts_s = double(double(out.header.pts) * 1e-6);
-
-    auto header = out.header;
-    const auto& format = header.format;
-
-    // initialize the output variables based on type
-
-    if (format.type == TYPE_VIDEO) {
-      // note: this can potentially be optimized
-      // by having the global tensor that we fill at decode time
-      // (would avoid allocations)
-      int outHeight = format.format.video.height;
-      int outWidth = format.format.video.width;
-      int numChannels = 3;
-      outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
-      fillVideoTensor(out, outFrame);
-      outFrame = outFrame.permute({2, 0, 1});
-
-    } else if (format.type == TYPE_AUDIO) {
-      int outAudioChannels = format.format.audio.channels;
-      int bytesPerSample = av_get_bytes_per_sample(
-          static_cast<AVSampleFormat>(format.format.audio.format));
-      int frameSizeTotal = out.payload->length();
-
-      TORCH_CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
-      int numAudioSamples =
-          frameSizeTotal / (outAudioChannels * bytesPerSample);
-
-      outFrame =
-          torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
-
-      fillAudioTensor(out, outFrame);
-    }
-    // currently not supporting other formats (will do soon)
-
-    out.payload.reset();
-  } else if (res == ENODATA) {
-    LOG(INFO) << "Decoder ran out of frames (ENODATA)\n";
-  } else {
-    LOG(ERROR) << "Decoder failed with ERROR_CODE " << res;
-  }
-
-  return std::make_tuple(outFrame, frame_pts_s);
-}
-
-static auto registerVideo =
-    torch::class_<Video>("torchvision", "Video")
-        .def(torch::init<std::string, std::string, int64_t>())
-        .def("init_from_file", &Video::initFromFile)
-        .def("init_from_memory", &Video::initFromMemory)
-        .def("get_current_stream", &Video::getCurrentStream)
-        .def("set_current_stream", &Video::setCurrentStream)
-        .def("get_metadata", &Video::getStreamMetadata)
-        .def("seek", &Video::Seek)
-        .def("next", &Video::Next);
-
-} // namespace video
-} // namespace vision
diff --git a/torchvision/csrc/io/video/video.h b/torchvision/csrc/io/video/video.h
deleted file mode 100644
index e57fc3ae6b7..00000000000
--- a/torchvision/csrc/io/video/video.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#pragma once
-
-#include <torch/types.h>
-
-#include "../decoder/defs.h"
-#include "../decoder/memory_buffer.h"
-#include "../decoder/sync_decoder.h"
-
-namespace vision {
-namespace video {
-
-struct Video : torch::CustomClassHolder {
-  std::tuple<std::string, long> current_stream; // stream type, id
-  // global video metadata
-  c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>>
-      streamsMetadata;
-  int64_t numThreads_{0};
-
- public:
-  Video(
-      std::string videoPath = std::string(),
-      std::string stream = std::string("video"),
-      int64_t numThreads = 0);
-  void initFromFile(
-      std::string videoPath,
-      std::string stream,
-      int64_t numThreads);
-  void initFromMemory(
-      torch::Tensor videoTensor,
-      std::string stream,
-      int64_t numThreads);
-
-  std::tuple<std::string, int64_t> getCurrentStream() const;
-  c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>>
-  getStreamMetadata() const;
-  void Seek(double ts, bool fastSeek);
-  bool setCurrentStream(std::string stream);
-  std::tuple<torch::Tensor, double> Next();
-
- private:
-  bool succeeded = false; // decoder init flag
-  // seekTS and doSeek act as a flag - if it's not set, next function simply
-  // returns the next frame. If it's set, we look at the global seek
-  // time in combination with any_frame settings
-  double seekTS = -1;
-
-  bool initialized = false;
-
-  void _init(
-      std::string stream,
-      int64_t numThreads); // expects params.uri OR callback to be set
-
-  void _getDecoderParams(
-      double videoStartS,
-      int64_t getPtsOnly,
-      std::string stream,
-      long stream_id,
-      bool fastSeek,
-      bool all_streams,
-      int64_t num_threads,
-      double seekFrameMarginUs); // this needs to be improved
-
-  std::map<std::string, std::vector<double>> streamTimeBase; // not used
-
-  ffmpeg::DecoderInCallback callback = nullptr;
-  std::vector<ffmpeg::DecoderMetadata> metadata;
-
- protected:
-  ffmpeg::SyncDecoder decoder;
-  ffmpeg::DecoderParameters params;
-
-}; // struct Video
-
-} // namespace video
-} // namespace vision
diff --git a/torchvision/csrc/io/video_reader/video_reader.cpp b/torchvision/csrc/io/video_reader/video_reader.cpp
deleted file mode 100644
index f9a5e9085d8..00000000000
--- a/torchvision/csrc/io/video_reader/video_reader.cpp
+++ /dev/null
@@ -1,677 +0,0 @@
-#include "video_reader.h"
-
-#include "../decoder/memory_buffer.h"
-#include "../decoder/sync_decoder.h"
-
-// If we are in a Windows environment, we need to define
-// initialization functions for the _custom_ops extension
-#ifdef _WIN32
-void* PyInit_video_reader(void) {
-  return nullptr;
-}
-#endif
-
-using namespace ffmpeg;
-
-namespace vision {
-namespace video_reader {
-
-namespace {
-
-const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
-const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
-const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
-const size_t decoderTimeoutMs = 600000;
-// A jitter can be added to the end of the range to avoid conversion/rounding
-// error, small value 100us won't be enough to select the next frame, but enough
-// to compensate rounding error due to the multiple conversions.
-const size_t timeBaseJitterUs = 100;
-
-DecoderParameters getDecoderParams(
-    int64_t videoStartUs,
-    int64_t videoEndUs,
-    double seekFrameMarginUs,
-    int64_t getPtsOnly,
-    int64_t readVideoStream,
-    int videoWidth,
-    int videoHeight,
-    int videoMinDimension,
-    int videoMaxDimension,
-    int64_t readAudioStream,
-    int audioSamples,
-    int audioChannels) {
-  DecoderParameters params;
-  params.headerOnly = getPtsOnly != 0;
-  params.seekAccuracy = seekFrameMarginUs;
-  params.startOffset = videoStartUs;
-  params.endOffset = videoEndUs;
-  params.timeoutMs = decoderTimeoutMs;
-  params.preventStaleness = false;
-
-  if (readVideoStream == 1) {
-    MediaFormat videoFormat(0);
-    videoFormat.type = TYPE_VIDEO;
-    videoFormat.format.video.format = defaultVideoPixelFormat;
-    videoFormat.format.video.width = videoWidth;
-    videoFormat.format.video.height = videoHeight;
-    videoFormat.format.video.minDimension = videoMinDimension;
-    videoFormat.format.video.maxDimension = videoMaxDimension;
-    params.formats.insert(videoFormat);
-  }
-
-  if (readAudioStream == 1) {
-    MediaFormat audioFormat;
-    audioFormat.type = TYPE_AUDIO;
-    audioFormat.format.audio.format = defaultAudioSampleFormat;
-    audioFormat.format.audio.samples = audioSamples;
-    audioFormat.format.audio.channels = audioChannels;
-    params.formats.insert(audioFormat);
-  }
-
-  return params;
-}
-
-// returns number of written bytes
-template <typename T>
-size_t fillTensor(
-    std::vector<DecoderOutputMessage>& msgs,
-    torch::Tensor& frame,
-    torch::Tensor& framePts,
-    int64_t num,
-    int64_t den) {
-  if (msgs.empty()) {
-    return 0;
-  }
-  T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
-  int64_t* framePtsData = framePts.data_ptr<int64_t>();
-  TORCH_CHECK_EQ(framePts.size(0), (int64_t)msgs.size());
-  size_t avgElementsInFrame = frame.numel() / msgs.size();
-
-  size_t offset = 0;
-  for (size_t i = 0; i < msgs.size(); ++i) {
-    const auto& msg = msgs[i];
-    // convert pts into original time_base
-    AVRational avr = AVRational{(int)num, (int)den};
-    framePtsData[i] = av_rescale_q(msg.header.pts, timeBaseQ, avr);
-    VLOG(2) << "PTS type: " << sizeof(T) << ", us: " << msg.header.pts
-            << ", original: " << framePtsData[i];
-
-    if (frameData) {
-      auto sizeInBytes = msg.payload->length();
-      memcpy(frameData + offset, msg.payload->data(), sizeInBytes);
-      if (sizeof(T) == sizeof(uint8_t)) {
-        // Video - move by allocated frame size
-        offset += avgElementsInFrame / sizeof(T);
-      } else {
-        // Audio - move by number of samples
-        offset += sizeInBytes / sizeof(T);
-      }
-    }
-  }
-  return offset * sizeof(T);
-}
-
-size_t fillVideoTensor(
-    std::vector<DecoderOutputMessage>& msgs,
-    torch::Tensor& videoFrame,
-    torch::Tensor& videoFramePts,
-    int64_t num,
-    int64_t den) {
-  return fillTensor<uint8_t>(msgs, videoFrame, videoFramePts, num, den);
-}
-
-size_t fillAudioTensor(
-    std::vector<DecoderOutputMessage>& msgs,
-    torch::Tensor& audioFrame,
-    torch::Tensor& audioFramePts,
-    int64_t num,
-    int64_t den) {
-  return fillTensor<float>(msgs, audioFrame, audioFramePts, num, den);
-}
-
-void offsetsToUs(
-    double& seekFrameMargin,
-    int64_t readVideoStream,
-    int64_t videoStartPts,
-    int64_t videoEndPts,
-    int64_t videoTimeBaseNum,
-    int64_t videoTimeBaseDen,
-    int64_t readAudioStream,
-    int64_t audioStartPts,
-    int64_t audioEndPts,
-    int64_t audioTimeBaseNum,
-    int64_t audioTimeBaseDen,
-    int64_t& videoStartUs,
-    int64_t& videoEndUs) {
-  seekFrameMargin *= AV_TIME_BASE;
-  videoStartUs = 0;
-  videoEndUs = -1;
-
-  if (readVideoStream) {
-    AVRational vr = AVRational{(int)videoTimeBaseNum, (int)videoTimeBaseDen};
-    if (videoStartPts > 0) {
-      videoStartUs = av_rescale_q(videoStartPts, vr, timeBaseQ);
-    }
-    if (videoEndPts > 0) {
-      // Add jitter to the end of the range to avoid conversion/rounding error.
-      // Small value 100us won't be enough to select the next frame, but enough
-      // to compensate rounding error due to the multiple conversions.
-      videoEndUs = timeBaseJitterUs + av_rescale_q(videoEndPts, vr, timeBaseQ);
-    }
-  } else if (readAudioStream) {
-    AVRational ar = AVRational{(int)audioTimeBaseNum, (int)audioTimeBaseDen};
-    if (audioStartPts > 0) {
-      videoStartUs = av_rescale_q(audioStartPts, ar, timeBaseQ);
-    }
-    if (audioEndPts > 0) {
-      // Add jitter to the end of the range to avoid conversion/rounding error.
-      // Small value 100us won't be enough to select the next frame, but enough
-      // to compensate rounding error due to the multiple conversions.
-      videoEndUs = timeBaseJitterUs + av_rescale_q(audioEndPts, ar, timeBaseQ);
-    }
-  }
-}
-
-torch::List<torch::Tensor> readVideo(
-    bool isReadFile,
-    const torch::Tensor& input_video,
-    std::string videoPath,
-    double seekFrameMargin,
-    int64_t getPtsOnly,
-    int64_t readVideoStream,
-    int64_t width,
-    int64_t height,
-    int64_t minDimension,
-    int64_t maxDimension,
-    int64_t videoStartPts,
-    int64_t videoEndPts,
-    int64_t videoTimeBaseNum,
-    int64_t videoTimeBaseDen,
-    int64_t readAudioStream,
-    int64_t audioSamples,
-    int64_t audioChannels,
-    int64_t audioStartPts,
-    int64_t audioEndPts,
-    int64_t audioTimeBaseNum,
-    int64_t audioTimeBaseDen) {
-  int64_t videoStartUs, videoEndUs;
-
-  offsetsToUs(
-      seekFrameMargin,
-      readVideoStream,
-      videoStartPts,
-      videoEndPts,
-      videoTimeBaseNum,
-      videoTimeBaseDen,
-      readAudioStream,
-      audioStartPts,
-      audioEndPts,
-      audioTimeBaseNum,
-      audioTimeBaseDen,
-      videoStartUs,
-      videoEndUs);
-
-  DecoderParameters params = getDecoderParams(
-      videoStartUs, // videoStartPts
-      videoEndUs, // videoEndPts
-      seekFrameMargin, // seekFrameMargin
-      getPtsOnly, // getPtsOnly
-      readVideoStream, // readVideoStream
-      width, // width
-      height, // height
-      minDimension, // minDimension
-      maxDimension, // maxDimension
-      readAudioStream, // readAudioStream
-      audioSamples, // audioSamples
-      audioChannels // audioChannels
-  );
-
-  SyncDecoder decoder;
-  std::vector<DecoderOutputMessage> audioMessages, videoMessages;
-  DecoderInCallback callback = nullptr;
-  std::string logMessage, logType;
-  if (isReadFile) {
-    params.uri = videoPath;
-    logType = "file";
-    logMessage = videoPath;
-  } else {
-    callback = MemoryBuffer::getCallback(
-        input_video.data_ptr<uint8_t>(), input_video.size(0));
-    logType = "memory";
-    logMessage = std::to_string(input_video.size(0));
-  }
-
-  VLOG(1) << "Video decoding from " << logType << " [" << logMessage
-          << "] has started";
-
-  const auto now = std::chrono::system_clock::now();
-
-  bool succeeded;
-  DecoderMetadata audioMetadata, videoMetadata;
-  std::vector<DecoderMetadata> metadata;
-  if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
-    for (const auto& header : metadata) {
-      if (header.format.type == TYPE_VIDEO) {
-        videoMetadata = header;
-      } else if (header.format.type == TYPE_AUDIO) {
-        audioMetadata = header;
-      }
-    }
-    int res;
-    DecoderOutputMessage msg;
-    while (0 == (res = decoder.decode(&msg, decoderTimeoutMs))) {
-      if (msg.header.format.type == TYPE_VIDEO) {
-        videoMessages.push_back(std::move(msg));
-      }
-      if (msg.header.format.type == TYPE_AUDIO) {
-        audioMessages.push_back(std::move(msg));
-      }
-      msg.payload.reset();
-    }
-  } else {
-    LOG(ERROR) << "Decoder initialization has failed";
-  }
-  const auto then = std::chrono::system_clock::now();
-  VLOG(1) << "Video decoding from " << logType << " [" << logMessage
-          << "] has finished, "
-          << std::chrono::duration_cast<std::chrono::microseconds>(then - now)
-                 .count()
-          << " us";
-
-  decoder.shutdown();
-
-  // video section
-  torch::Tensor videoFrame = torch::zeros({0}, torch::kByte);
-  torch::Tensor videoFramePts = torch::zeros({0}, torch::kLong);
-  torch::Tensor videoTimeBase = torch::zeros({0}, torch::kInt);
-  torch::Tensor videoFps = torch::zeros({0}, torch::kFloat);
-  torch::Tensor videoDuration = torch::zeros({0}, torch::kLong);
-
-  if (succeeded && readVideoStream == 1) {
-    if (!videoMessages.empty()) {
-      const auto& header = videoMetadata;
-      const auto& format = header.format.format.video;
-      int numVideoFrames = videoMessages.size();
-      int outHeight = format.height;
-      int outWidth = format.width;
-      int numChannels = 3; // decoder guarantees the default AV_PIX_FMT_RGB24
-
-      size_t expectedWrittenBytes = 0;
-      if (getPtsOnly == 0) {
-        videoFrame = torch::zeros(
-            {numVideoFrames, outHeight, outWidth, numChannels}, torch::kByte);
-        expectedWrittenBytes =
-            (size_t)numVideoFrames * outHeight * outWidth * numChannels;
-      }
-
-      videoFramePts = torch::zeros({numVideoFrames}, torch::kLong);
-
-      VLOG(2) << "video duration: " << header.duration
-              << ", fps: " << header.fps << ", num: " << header.num
-              << ", den: " << header.den << ", num frames: " << numVideoFrames;
-
-      auto numberWrittenBytes = fillVideoTensor(
-          videoMessages, videoFrame, videoFramePts, header.num, header.den);
-
-      TORCH_CHECK_EQ(numberWrittenBytes, expectedWrittenBytes);
-
-      videoTimeBase = torch::zeros({2}, torch::kInt);
-      int* videoTimeBaseData = videoTimeBase.data_ptr<int>();
-      videoTimeBaseData[0] = header.num;
-      videoTimeBaseData[1] = header.den;
-
-      videoFps = torch::zeros({1}, torch::kFloat);
-      float* videoFpsData = videoFps.data_ptr<float>();
-      videoFpsData[0] = header.fps;
-
-      videoDuration = torch::zeros({1}, torch::kLong);
-      int64_t* videoDurationData = videoDuration.data_ptr<int64_t>();
-      AVRational vr = AVRational{(int)header.num, (int)header.den};
-      videoDurationData[0] = av_rescale_q(header.duration, timeBaseQ, vr);
-      VLOG(1) << "Video decoding from " << logType << " [" << logMessage
-              << "] filled video tensors";
-    } else {
-      VLOG(1) << "Miss video stream";
-    }
-  }
-
-  // audio section
-  torch::Tensor audioFrame = torch::zeros({0}, torch::kFloat);
-  torch::Tensor audioFramePts = torch::zeros({0}, torch::kLong);
-  torch::Tensor audioTimeBase = torch::zeros({0}, torch::kInt);
-  torch::Tensor audioSampleRate = torch::zeros({0}, torch::kInt);
-  torch::Tensor audioDuration = torch::zeros({0}, torch::kLong);
-  if (succeeded && readAudioStream == 1) {
-    if (!audioMessages.empty()) {
-      const auto& header = audioMetadata;
-      const auto& format = header.format.format.audio;
-
-      int64_t outAudioChannels = format.channels;
-      int bytesPerSample =
-          av_get_bytes_per_sample(static_cast<AVSampleFormat>(format.format));
-
-      int numAudioFrames = audioMessages.size();
-      int64_t numAudioSamples = 0;
-      if (getPtsOnly == 0) {
-        int64_t frameSizeTotal = 0;
-        for (auto const& audioMessage : audioMessages) {
-          frameSizeTotal += audioMessage.payload->length();
-        }
-
-        TORCH_CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
-        numAudioSamples = frameSizeTotal / (outAudioChannels * bytesPerSample);
-
-        audioFrame =
-            torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
-      }
-      audioFramePts = torch::zeros({numAudioFrames}, torch::kLong);
-
-      VLOG(2) << "audio duration: " << header.duration
-              << ", channels: " << format.channels
-              << ", sample rate: " << format.samples << ", num: " << header.num
-              << ", den: " << header.den;
-
-      auto numberWrittenBytes = fillAudioTensor(
-          audioMessages, audioFrame, audioFramePts, header.num, header.den);
-      TORCH_CHECK_EQ(
-          numberWrittenBytes,
-          numAudioSamples * outAudioChannels * sizeof(float));
-
-      audioTimeBase = torch::zeros({2}, torch::kInt);
-      int* audioTimeBaseData = audioTimeBase.data_ptr<int>();
-      audioTimeBaseData[0] = header.num;
-      audioTimeBaseData[1] = header.den;
-
-      audioSampleRate = torch::zeros({1}, torch::kInt);
-      int* audioSampleRateData = audioSampleRate.data_ptr<int>();
-      audioSampleRateData[0] = format.samples;
-
-      audioDuration = torch::zeros({1}, torch::kLong);
-      int64_t* audioDurationData = audioDuration.data_ptr<int64_t>();
-      AVRational ar = AVRational{(int)header.num, (int)header.den};
-      audioDurationData[0] = av_rescale_q(header.duration, timeBaseQ, ar);
-      VLOG(1) << "Video decoding from " << logType << " [" << logMessage
-              << "] filled audio tensors";
-    } else {
-      VLOG(1) << "Miss audio stream";
-    }
-  }
-
-  torch::List<torch::Tensor> result;
-  result.push_back(std::move(videoFrame));
-  result.push_back(std::move(videoFramePts));
-  result.push_back(std::move(videoTimeBase));
-  result.push_back(std::move(videoFps));
-  result.push_back(std::move(videoDuration));
-  result.push_back(std::move(audioFrame));
-  result.push_back(std::move(audioFramePts));
-  result.push_back(std::move(audioTimeBase));
-  result.push_back(std::move(audioSampleRate));
-  result.push_back(std::move(audioDuration));
-
-  VLOG(1) << "Video decoding from " << logType << " [" << logMessage
-          << "] about to return";
-
-  return result;
-}
-
-torch::List<torch::Tensor> probeVideo(
-    bool isReadFile,
-    const torch::Tensor& input_video,
-    std::string videoPath) {
-  DecoderParameters params = getDecoderParams(
-      0, // videoStartUs
-      -1, // videoEndUs
-      0, // seekFrameMargin
-      1, // getPtsOnly
-      1, // readVideoStream
-      0, // width
-      0, // height
-      0, // minDimension
-      0, // maxDimension
-      1, // readAudioStream
-      0, // audioSamples
-      0 // audioChannels
-  );
-
-  SyncDecoder decoder;
-  DecoderInCallback callback = nullptr;
-  std::string logMessage, logType;
-  if (isReadFile) {
-    params.uri = videoPath;
-    logType = "file";
-    logMessage = videoPath;
-  } else {
-    callback = MemoryBuffer::getCallback(
-        input_video.data_ptr<uint8_t>(), input_video.size(0));
-    logType = "memory";
-    logMessage = std::to_string(input_video.size(0));
-  }
-
-  VLOG(1) << "Video probing from " << logType << " [" << logMessage
-          << "] has started";
-
-  const auto now = std::chrono::system_clock::now();
-
-  bool succeeded;
-  bool gotAudio = false, gotVideo = false;
-  DecoderMetadata audioMetadata, videoMetadata;
-  std::vector<DecoderMetadata> metadata;
-  if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
-    for (const auto& header : metadata) {
-      if (header.format.type == TYPE_VIDEO) {
-        gotVideo = true;
-        videoMetadata = header;
-      } else if (header.format.type == TYPE_AUDIO) {
-        gotAudio = true;
-        audioMetadata = header;
-      }
-    }
-    const auto then = std::chrono::system_clock::now();
-    VLOG(1) << "Video probing from " << logType << " [" << logMessage
-            << "] has finished, "
-            << std::chrono::duration_cast<std::chrono::microseconds>(then - now)
-                   .count()
-            << " us";
-  } else {
-    LOG(ERROR) << "Decoder initialization has failed";
-  }
-
-  decoder.shutdown();
-
-  // video section
-  torch::Tensor videoTimeBase = torch::zeros({0}, torch::kInt);
-  torch::Tensor videoFps = torch::zeros({0}, torch::kFloat);
-  torch::Tensor videoDuration = torch::zeros({0}, torch::kLong);
-
-  if (succeeded && gotVideo) {
-    videoTimeBase = torch::zeros({2}, torch::kInt);
-    int* videoTimeBaseData = videoTimeBase.data_ptr<int>();
-    const auto& header = videoMetadata;
-
-    videoTimeBaseData[0] = header.num;
-    videoTimeBaseData[1] = header.den;
-
-    videoFps = torch::zeros({1}, torch::kFloat);
-    float* videoFpsData = videoFps.data_ptr<float>();
-    videoFpsData[0] = header.fps;
-
-    videoDuration = torch::zeros({1}, torch::kLong);
-    int64_t* videoDurationData = videoDuration.data_ptr<int64_t>();
-    AVRational avr = AVRational{(int)header.num, (int)header.den};
-    videoDurationData[0] = av_rescale_q(header.duration, timeBaseQ, avr);
-
-    VLOG(2) << "Prob fps: " << header.fps << ", duration: " << header.duration
-            << ", num: " << header.num << ", den: " << header.den;
-
-    VLOG(1) << "Video probing from " << logType << " [" << logMessage
-            << "] filled video tensors";
-  } else {
-    LOG(ERROR) << "Miss video stream";
-  }
-
-  // audio section
-  torch::Tensor audioTimeBase = torch::zeros({0}, torch::kInt);
-  torch::Tensor audioSampleRate = torch::zeros({0}, torch::kInt);
-  torch::Tensor audioDuration = torch::zeros({0}, torch::kLong);
-
-  if (succeeded && gotAudio) {
-    audioTimeBase = torch::zeros({2}, torch::kInt);
-    int* audioTimeBaseData = audioTimeBase.data_ptr<int>();
-    const auto& header = audioMetadata;
-    const auto& media = header.format;
-    const auto& format = media.format.audio;
-
-    audioTimeBaseData[0] = header.num;
-    audioTimeBaseData[1] = header.den;
-
-    audioSampleRate = torch::zeros({1}, torch::kInt);
-    int* audioSampleRateData = audioSampleRate.data_ptr<int>();
-    audioSampleRateData[0] = format.samples;
-
-    audioDuration = torch::zeros({1}, torch::kLong);
-    int64_t* audioDurationData = audioDuration.data_ptr<int64_t>();
-    AVRational avr = AVRational{(int)header.num, (int)header.den};
-    audioDurationData[0] = av_rescale_q(header.duration, timeBaseQ, avr);
-
-    VLOG(2) << "Prob sample rate: " << format.samples
-            << ", duration: " << header.duration << ", num: " << header.num
-            << ", den: " << header.den;
-
-    VLOG(1) << "Video probing from " << logType << " [" << logMessage
-            << "] filled audio tensors";
-  } else {
-    VLOG(1) << "Miss audio stream";
-  }
-
-  torch::List<torch::Tensor> result;
-  result.push_back(std::move(videoTimeBase));
-  result.push_back(std::move(videoFps));
-  result.push_back(std::move(videoDuration));
-  result.push_back(std::move(audioTimeBase));
-  result.push_back(std::move(audioSampleRate));
-  result.push_back(std::move(audioDuration));
-
-  VLOG(1) << "Video probing from " << logType << " [" << logMessage
-          << "] is about to return";
-
-  return result;
-}
-
-} // namespace
-
-torch::List<torch::Tensor> read_video_from_memory(
-    torch::Tensor input_video,
-    double seekFrameMargin,
-    int64_t getPtsOnly,
-    int64_t readVideoStream,
-    int64_t width,
-    int64_t height,
-    int64_t minDimension,
-    int64_t maxDimension,
-    int64_t videoStartPts,
-    int64_t videoEndPts,
-    int64_t videoTimeBaseNum,
-    int64_t videoTimeBaseDen,
-    int64_t readAudioStream,
-    int64_t audioSamples,
-    int64_t audioChannels,
-    int64_t audioStartPts,
-    int64_t audioEndPts,
-    int64_t audioTimeBaseNum,
-    int64_t audioTimeBaseDen) {
-  C10_LOG_API_USAGE_ONCE(
-      "torchvision.csrc.io.video_reader.video_reader.read_video_from_memory");
-  return readVideo(
-      false,
-      input_video,
-      "", // videoPath
-      seekFrameMargin,
-      getPtsOnly,
-      readVideoStream,
-      width,
-      height,
-      minDimension,
-      maxDimension,
-      videoStartPts,
-      videoEndPts,
-      videoTimeBaseNum,
-      videoTimeBaseDen,
-      readAudioStream,
-      audioSamples,
-      audioChannels,
-      audioStartPts,
-      audioEndPts,
-      audioTimeBaseNum,
-      audioTimeBaseDen);
-}
-
-torch::List<torch::Tensor> read_video_from_file(
-    std::string videoPath,
-    double seekFrameMargin,
-    int64_t getPtsOnly,
-    int64_t readVideoStream,
-    int64_t width,
-    int64_t height,
-    int64_t minDimension,
-    int64_t maxDimension,
-    int64_t videoStartPts,
-    int64_t videoEndPts,
-    int64_t videoTimeBaseNum,
-    int64_t videoTimeBaseDen,
-    int64_t readAudioStream,
-    int64_t audioSamples,
-    int64_t audioChannels,
-    int64_t audioStartPts,
-    int64_t audioEndPts,
-    int64_t audioTimeBaseNum,
-    int64_t audioTimeBaseDen) {
-  C10_LOG_API_USAGE_ONCE(
-      "torchvision.csrc.io.video_reader.video_reader.read_video_from_file");
-  torch::Tensor dummy_input_video = torch::ones({0});
-  return readVideo(
-      true,
-      dummy_input_video,
-      videoPath,
-      seekFrameMargin,
-      getPtsOnly,
-      readVideoStream,
-      width,
-      height,
-      minDimension,
-      maxDimension,
-      videoStartPts,
-      videoEndPts,
-      videoTimeBaseNum,
-      videoTimeBaseDen,
-      readAudioStream,
-      audioSamples,
-      audioChannels,
-      audioStartPts,
-      audioEndPts,
-      audioTimeBaseNum,
-      audioTimeBaseDen);
-}
-
-torch::List<torch::Tensor> probe_video_from_memory(torch::Tensor input_video) {
-  C10_LOG_API_USAGE_ONCE(
-      "torchvision.csrc.io.video_reader.video_reader.probe_video_from_memory");
-  return probeVideo(false, input_video, "");
-}
-
-torch::List<torch::Tensor> probe_video_from_file(std::string videoPath) {
-  C10_LOG_API_USAGE_ONCE(
-      "torchvision.csrc.io.video_reader.video_reader.probe_video_from_file");
-  torch::Tensor dummy_input_video = torch::ones({0});
-  return probeVideo(true, dummy_input_video, videoPath);
-}
-
-TORCH_LIBRARY_FRAGMENT(video_reader, m) {
-  m.def("read_video_from_memory", read_video_from_memory);
-  m.def("read_video_from_file", read_video_from_file);
-  m.def("probe_video_from_memory", probe_video_from_memory);
-  m.def("probe_video_from_file", probe_video_from_file);
-}
-
-} // namespace video_reader
-} // namespace vision
diff --git a/torchvision/csrc/io/video_reader/video_reader.h b/torchvision/csrc/io/video_reader/video_reader.h
deleted file mode 100644
index 48c4c841219..00000000000
--- a/torchvision/csrc/io/video_reader/video_reader.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#pragma once
-
-#include <torch/types.h>
-
-namespace vision {
-namespace video_reader {
-
-torch::List<torch::Tensor> read_video_from_memory(
-    torch::Tensor input_video,
-    double seekFrameMargin,
-    int64_t getPtsOnly,
-    int64_t readVideoStream,
-    int64_t width,
-    int64_t height,
-    int64_t minDimension,
-    int64_t maxDimension,
-    int64_t videoStartPts,
-    int64_t videoEndPts,
-    int64_t videoTimeBaseNum,
-    int64_t videoTimeBaseDen,
-    int64_t readAudioStream,
-    int64_t audioSamples,
-    int64_t audioChannels,
-    int64_t audioStartPts,
-    int64_t audioEndPts,
-    int64_t audioTimeBaseNum,
-    int64_t audioTimeBaseDen);
-
-torch::List<torch::Tensor> read_video_from_file(
-    std::string videoPath,
-    double seekFrameMargin,
-    int64_t getPtsOnly,
-    int64_t readVideoStream,
-    int64_t width,
-    int64_t height,
-    int64_t minDimension,
-    int64_t maxDimension,
-    int64_t videoStartPts,
-    int64_t videoEndPts,
-    int64_t videoTimeBaseNum,
-    int64_t videoTimeBaseDen,
-    int64_t readAudioStream,
-    int64_t audioSamples,
-    int64_t audioChannels,
-    int64_t audioStartPts,
-    int64_t audioEndPts,
-    int64_t audioTimeBaseNum,
-    int64_t audioTimeBaseDen);
-
-torch::List<torch::Tensor> probe_video_from_memory(torch::Tensor input_video);
-
-torch::List<torch::Tensor> probe_video_from_file(std::string videoPath);
-
-} // namespace video_reader
-} // namespace vision
diff --git a/torchvision/io/__init__.py b/torchvision/io/__init__.py
index 6627ac975f3..dc652d435e8 100644
--- a/torchvision/io/__init__.py
+++ b/torchvision/io/__init__.py
@@ -1,15 +1,78 @@
-from ._video_opt import (
-    _HAS_CPU_VIDEO_DECODER,
-    _HAS_VIDEO_OPT,
-    _probe_video_from_file,
-    _probe_video_from_memory,
-    _read_video_from_file,
-    _read_video_from_memory,
-    _read_video_timestamps_from_file,
-    _read_video_timestamps_from_memory,
-    Timebase,
-    VideoMetaData,
-)
+# In fbcode, import the video_reader backend from the fb-only location.
+# In OSS this import fails (video_reader is not available) and the stubs below are used.
+try:
+    from pytorch.vision.fb.io import (
+        _HAS_CPU_VIDEO_DECODER,
+        _HAS_VIDEO_OPT,
+        _probe_video_from_file,
+        _probe_video_from_memory,
+        _read_video_from_file,
+        _read_video_from_memory,
+        _read_video_timestamps_from_file,
+        _read_video_timestamps_from_memory,
+        _video_opt,
+        Timebase,
+        VideoMetaData,
+        VideoReader,
+    )
+except ImportError:
+    # OSS fallback - video_reader backend not available
+    _HAS_CPU_VIDEO_DECODER = False
+    _HAS_VIDEO_OPT = False
+
+    # Shared placeholder for the low-level helpers: accepts any arguments and always raises.
+    def _stub_not_available(*args, **kwargs):
+        raise RuntimeError(
+            "video_reader backend is not available in open-source torchvision. "
+            "Use PyAV or TorchCodec instead."
+        )
+
+    _probe_video_from_file = _stub_not_available
+    _probe_video_from_memory = _stub_not_available
+    _read_video_from_file = _stub_not_available
+    _read_video_from_memory = _stub_not_available
+    _read_video_timestamps_from_file = _stub_not_available
+    _read_video_timestamps_from_memory = _stub_not_available
+
+    # Same two slots as the removed _video_opt.Timebase; the defaults here are an addition.
+    class Timebase:
+        __annotations__ = {"numerator": int, "denominator": int}
+        __slots__ = ["numerator", "denominator"]
+
+        def __init__(self, numerator: int = 0, denominator: int = 1) -> None:
+            self.numerator = numerator
+            self.denominator = denominator
+
+    # Empty placeholder that only keeps the name importable; it has none of the metadata fields.
+    class VideoMetaData:
+        pass
+
+    class VideoReader:
+        def __init__(self, *args, **kwargs):
+            raise RuntimeError(
+                "VideoReader with video_reader backend is not available. "
+                "Use backend='pyav' or migrate to TorchCodec."
+            )
+
+    # Stand-in for the deleted _video_opt.py so that the video.py import of _video_opt keeps resolving.
+    import types
+    from fractions import Fraction
+
+    _video_opt = types.ModuleType("_video_opt")
+    _video_opt._HAS_CPU_VIDEO_DECODER = False
+    _video_opt._HAS_VIDEO_OPT = False
+    _video_opt.default_timebase = Fraction(0, 1)
+
+    # Defaults mirror the removed _video_opt functions so legacy call shapes raise RuntimeError, not TypeError.
+    def _read_video_stub(filename, start_pts=0, end_pts=None, pts_unit="pts"):
+        raise RuntimeError("video_reader backend is not available. Use backend='pyav'.")
+
+    def _read_video_timestamps_stub(filename, pts_unit="pts"):
+        raise RuntimeError("video_reader backend is not available. Use backend='pyav'.")
+
+    _video_opt._read_video = _read_video_stub
+    _video_opt._read_video_timestamps = _read_video_timestamps_stub
+
 from .image import (
     decode_avif,
     decode_gif,
@@ -28,7 +91,6 @@
     write_png,
 )
 from .video import read_video, read_video_timestamps, write_video
-from .video_reader import VideoReader
 
 
 __all__ = [
diff --git a/torchvision/io/_video_opt.py b/torchvision/io/_video_opt.py
deleted file mode 100644
index 8be3a3c94b9..00000000000
--- a/torchvision/io/_video_opt.py
+++ /dev/null
@@ -1,516 +0,0 @@
-import math
-import warnings
-from fractions import Fraction
-from typing import Optional, Union
-
-import torch
-
-from ..extension import _load_library
-from ._video_deprecation_warning import _raise_video_deprecation_warning
-
-
-_HAS_CPU_VIDEO_DECODER = _load_library("video_reader")
-_HAS_VIDEO_OPT = _HAS_CPU_VIDEO_DECODER  # For BC
-default_timebase = Fraction(0, 1)
-
-
-# simple class for torch scripting
-# the complex Fraction class from fractions module is not scriptable
-class Timebase:
-    __annotations__ = {"numerator": int, "denominator": int}
-    __slots__ = ["numerator", "denominator"]
-
-    def __init__(
-        self,
-        numerator: int,
-        denominator: int,
-    ) -> None:
-        self.numerator = numerator
-        self.denominator = denominator
-
-
-class VideoMetaData:
-    __annotations__ = {
-        "has_video": bool,
-        "video_timebase": Timebase,
-        "video_duration": float,
-        "video_fps": float,
-        "has_audio": bool,
-        "audio_timebase": Timebase,
-        "audio_duration": float,
-        "audio_sample_rate": float,
-    }
-    __slots__ = [
-        "has_video",
-        "video_timebase",
-        "video_duration",
-        "video_fps",
-        "has_audio",
-        "audio_timebase",
-        "audio_duration",
-        "audio_sample_rate",
-    ]
-
-    def __init__(self) -> None:
-        self.has_video = False
-        self.video_timebase = Timebase(0, 1)
-        self.video_duration = 0.0
-        self.video_fps = 0.0
-        self.has_audio = False
-        self.audio_timebase = Timebase(0, 1)
-        self.audio_duration = 0.0
-        self.audio_sample_rate = 0.0
-
-
-def _validate_pts(pts_range: tuple[int, int]) -> None:
-
-    if pts_range[0] > pts_range[1] > 0:
-        raise ValueError(
-            f"Start pts should not be smaller than end pts, got start pts: {pts_range[0]} and end pts: {pts_range[1]}"
-        )
-
-
-def _fill_info(
-    vtimebase: torch.Tensor,
-    vfps: torch.Tensor,
-    vduration: torch.Tensor,
-    atimebase: torch.Tensor,
-    asample_rate: torch.Tensor,
-    aduration: torch.Tensor,
-) -> VideoMetaData:
-    """
-    Build update VideoMetaData struct with info about the video
-    """
-    meta = VideoMetaData()
-    if vtimebase.numel() > 0:
-        meta.video_timebase = Timebase(int(vtimebase[0].item()), int(vtimebase[1].item()))
-        timebase = vtimebase[0].item() / float(vtimebase[1].item())
-        if vduration.numel() > 0:
-            meta.has_video = True
-            meta.video_duration = float(vduration.item()) * timebase
-    if vfps.numel() > 0:
-        meta.video_fps = float(vfps.item())
-    if atimebase.numel() > 0:
-        meta.audio_timebase = Timebase(int(atimebase[0].item()), int(atimebase[1].item()))
-        timebase = atimebase[0].item() / float(atimebase[1].item())
-        if aduration.numel() > 0:
-            meta.has_audio = True
-            meta.audio_duration = float(aduration.item()) * timebase
-    if asample_rate.numel() > 0:
-        meta.audio_sample_rate = float(asample_rate.item())
-
-    return meta
-
-
-def _align_audio_frames(
-    aframes: torch.Tensor, aframe_pts: torch.Tensor, audio_pts_range: tuple[int, int]
-) -> torch.Tensor:
-    start, end = aframe_pts[0], aframe_pts[-1]
-    num_samples = aframes.size(0)
-    step_per_aframe = float(end - start + 1) / float(num_samples)
-    s_idx = 0
-    e_idx = num_samples
-    if start < audio_pts_range[0]:
-        s_idx = int((audio_pts_range[0] - start) / step_per_aframe)
-    if audio_pts_range[1] != -1 and end > audio_pts_range[1]:
-        e_idx = int((audio_pts_range[1] - end) / step_per_aframe)
-    return aframes[s_idx:e_idx, :]
-
-
-def _read_video_from_file(
-    filename: str,
-    seek_frame_margin: float = 0.25,
-    read_video_stream: bool = True,
-    video_width: int = 0,
-    video_height: int = 0,
-    video_min_dimension: int = 0,
-    video_max_dimension: int = 0,
-    video_pts_range: tuple[int, int] = (0, -1),
-    video_timebase: Fraction = default_timebase,
-    read_audio_stream: bool = True,
-    audio_samples: int = 0,
-    audio_channels: int = 0,
-    audio_pts_range: tuple[int, int] = (0, -1),
-    audio_timebase: Fraction = default_timebase,
-) -> tuple[torch.Tensor, torch.Tensor, VideoMetaData]:
-    """
-    Reads a video from a file, returning both the video frames and the audio frames
-
-    Args:
-    filename (str): path to the video file
-    seek_frame_margin (double, optional): seeking frame in the stream is imprecise. Thus,
-        when video_start_pts is specified, we seek the pts earlier by seek_frame_margin seconds
-    read_video_stream (int, optional): whether read video stream. If yes, set to 1. Otherwise, 0
-    video_width/video_height/video_min_dimension/video_max_dimension (int): together decide
-        the size of decoded frames:
-
-            - When video_width = 0, video_height = 0, video_min_dimension = 0,
-                and video_max_dimension = 0, keep the original frame resolution
-            - When video_width = 0, video_height = 0, video_min_dimension != 0,
-                and video_max_dimension = 0, keep the aspect ratio and resize the
-                frame so that shorter edge size is video_min_dimension
-            - When video_width = 0, video_height = 0, video_min_dimension = 0,
-                and video_max_dimension != 0, keep the aspect ratio and resize
-                the frame so that longer edge size is video_max_dimension
-            - When video_width = 0, video_height = 0, video_min_dimension != 0,
-                and video_max_dimension != 0, resize the frame so that shorter
-                edge size is video_min_dimension, and longer edge size is
-                video_max_dimension. The aspect ratio may not be preserved
-            - When video_width = 0, video_height != 0, video_min_dimension = 0,
-                and video_max_dimension = 0, keep the aspect ratio and resize
-                the frame so that frame video_height is $video_height
-            - When video_width != 0, video_height == 0, video_min_dimension = 0,
-                and video_max_dimension = 0, keep the aspect ratio and resize
-                the frame so that frame video_width is $video_width
-            - When video_width != 0, video_height != 0, video_min_dimension = 0,
-                and video_max_dimension = 0, resize the frame so that frame
-                video_width and  video_height are set to $video_width and
-                $video_height, respectively
-    video_pts_range (list(int), optional): the start and end presentation timestamp of video stream
-    video_timebase (Fraction, optional): a Fraction rational number which denotes timebase in video stream
-    read_audio_stream (int, optional): whether read audio stream. If yes, set to 1. Otherwise, 0
-    audio_samples (int, optional): audio sampling rate
-    audio_channels (int optional): audio channels
-    audio_pts_range (list(int), optional): the start and end presentation timestamp of audio stream
-    audio_timebase (Fraction, optional): a Fraction rational number which denotes time base in audio stream
-
-    Returns
-        vframes (Tensor[T, H, W, C]): the `T` video frames
-        aframes (Tensor[L, K]): the audio frames, where `L` is the number of points and
-            `K` is the number of audio_channels
-        info (Dict): metadata for the video and audio. Can contain the fields video_fps (float)
-            and audio_fps (int)
-    """
-    _raise_video_deprecation_warning()
-    _validate_pts(video_pts_range)
-    _validate_pts(audio_pts_range)
-
-    result = torch.ops.video_reader.read_video_from_file(
-        filename,
-        seek_frame_margin,
-        0,  # getPtsOnly
-        read_video_stream,
-        video_width,
-        video_height,
-        video_min_dimension,
-        video_max_dimension,
-        video_pts_range[0],
-        video_pts_range[1],
-        video_timebase.numerator,
-        video_timebase.denominator,
-        read_audio_stream,
-        audio_samples,
-        audio_channels,
-        audio_pts_range[0],
-        audio_pts_range[1],
-        audio_timebase.numerator,
-        audio_timebase.denominator,
-    )
-    vframes, _vframe_pts, vtimebase, vfps, vduration, aframes, aframe_pts, atimebase, asample_rate, aduration = result
-    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
-    if aframes.numel() > 0:
-        # when audio stream is found
-        aframes = _align_audio_frames(aframes, aframe_pts, audio_pts_range)
-    return vframes, aframes, info
-
-
-def _read_video_timestamps_from_file(filename: str) -> tuple[list[int], list[int], VideoMetaData]:
-    """
-    Decode all video- and audio frames in the video. Only pts
-    (presentation timestamp) is returned. The actual frame pixel data is not
-    copied. Thus, it is much faster than read_video(...)
-    """
-    result = torch.ops.video_reader.read_video_from_file(
-        filename,
-        0,  # seek_frame_margin
-        1,  # getPtsOnly
-        1,  # read_video_stream
-        0,  # video_width
-        0,  # video_height
-        0,  # video_min_dimension
-        0,  # video_max_dimension
-        0,  # video_start_pts
-        -1,  # video_end_pts
-        0,  # video_timebase_num
-        1,  # video_timebase_den
-        1,  # read_audio_stream
-        0,  # audio_samples
-        0,  # audio_channels
-        0,  # audio_start_pts
-        -1,  # audio_end_pts
-        0,  # audio_timebase_num
-        1,  # audio_timebase_den
-    )
-    _vframes, vframe_pts, vtimebase, vfps, vduration, _aframes, aframe_pts, atimebase, asample_rate, aduration = result
-    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
-
-    vframe_pts = vframe_pts.numpy().tolist()
-    aframe_pts = aframe_pts.numpy().tolist()
-    return vframe_pts, aframe_pts, info
-
-
-def _probe_video_from_file(filename: str) -> VideoMetaData:
-    """
-    Probe a video file and return VideoMetaData with info about the video
-    """
-    _raise_video_deprecation_warning()
-    result = torch.ops.video_reader.probe_video_from_file(filename)
-    vtimebase, vfps, vduration, atimebase, asample_rate, aduration = result
-    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
-    return info
-
-
-def _read_video_from_memory(
-    video_data: torch.Tensor,
-    seek_frame_margin: float = 0.25,
-    read_video_stream: int = 1,
-    video_width: int = 0,
-    video_height: int = 0,
-    video_min_dimension: int = 0,
-    video_max_dimension: int = 0,
-    video_pts_range: tuple[int, int] = (0, -1),
-    video_timebase_numerator: int = 0,
-    video_timebase_denominator: int = 1,
-    read_audio_stream: int = 1,
-    audio_samples: int = 0,
-    audio_channels: int = 0,
-    audio_pts_range: tuple[int, int] = (0, -1),
-    audio_timebase_numerator: int = 0,
-    audio_timebase_denominator: int = 1,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    """
-    Reads a video from memory, returning both the video frames as the audio frames
-    This function is torchscriptable.
-
-    Args:
-    video_data (data type could be 1) torch.Tensor, dtype=torch.int8 or 2) python bytes):
-        compressed video content stored in either 1) torch.Tensor 2) python bytes
-    seek_frame_margin (double, optional): seeking frame in the stream is imprecise.
-        Thus, when video_start_pts is specified, we seek the pts earlier by seek_frame_margin seconds
-    read_video_stream (int, optional): whether read video stream. If yes, set to 1. Otherwise, 0
-    video_width/video_height/video_min_dimension/video_max_dimension (int): together decide
-        the size of decoded frames:
-
-            - When video_width = 0, video_height = 0, video_min_dimension = 0,
-                and video_max_dimension = 0, keep the original frame resolution
-            - When video_width = 0, video_height = 0, video_min_dimension != 0,
-                and video_max_dimension = 0, keep the aspect ratio and resize the
-                frame so that shorter edge size is video_min_dimension
-            - When video_width = 0, video_height = 0, video_min_dimension = 0,
-                and video_max_dimension != 0, keep the aspect ratio and resize
-                the frame so that longer edge size is video_max_dimension
-            - When video_width = 0, video_height = 0, video_min_dimension != 0,
-                and video_max_dimension != 0, resize the frame so that shorter
-                edge size is video_min_dimension, and longer edge size is
-                video_max_dimension. The aspect ratio may not be preserved
-            - When video_width = 0, video_height != 0, video_min_dimension = 0,
-                and video_max_dimension = 0, keep the aspect ratio and resize
-                the frame so that frame video_height is $video_height
-            - When video_width != 0, video_height == 0, video_min_dimension = 0,
-                and video_max_dimension = 0, keep the aspect ratio and resize
-                the frame so that frame video_width is $video_width
-            - When video_width != 0, video_height != 0, video_min_dimension = 0,
-                and video_max_dimension = 0, resize the frame so that frame
-                video_width and  video_height are set to $video_width and
-                $video_height, respectively
-    video_pts_range (list(int), optional): the start and end presentation timestamp of video stream
-    video_timebase_numerator / video_timebase_denominator (float, optional): a rational
-        number which denotes timebase in video stream
-    read_audio_stream (int, optional): whether read audio stream. If yes, set to 1. Otherwise, 0
-    audio_samples (int, optional): audio sampling rate
-    audio_channels (int optional): audio audio_channels
-    audio_pts_range (list(int), optional): the start and end presentation timestamp of audio stream
-    audio_timebase_numerator / audio_timebase_denominator (float, optional):
-        a rational number which denotes time base in audio stream
-
-    Returns:
-        vframes (Tensor[T, H, W, C]): the `T` video frames
-        aframes (Tensor[L, K]): the audio frames, where `L` is the number of points and
-            `K` is the number of channels
-    """
-
-    _raise_video_deprecation_warning()
-    _validate_pts(video_pts_range)
-    _validate_pts(audio_pts_range)
-
-    if not isinstance(video_data, torch.Tensor):
-        with warnings.catch_warnings():
-            # Ignore the warning because we actually don't modify the buffer in this function
-            warnings.filterwarnings("ignore", message="The given buffer is not writable")
-            video_data = torch.frombuffer(video_data, dtype=torch.uint8)
-
-    result = torch.ops.video_reader.read_video_from_memory(
-        video_data,
-        seek_frame_margin,
-        0,  # getPtsOnly
-        read_video_stream,
-        video_width,
-        video_height,
-        video_min_dimension,
-        video_max_dimension,
-        video_pts_range[0],
-        video_pts_range[1],
-        video_timebase_numerator,
-        video_timebase_denominator,
-        read_audio_stream,
-        audio_samples,
-        audio_channels,
-        audio_pts_range[0],
-        audio_pts_range[1],
-        audio_timebase_numerator,
-        audio_timebase_denominator,
-    )
-
-    vframes, _vframe_pts, vtimebase, vfps, vduration, aframes, aframe_pts, atimebase, asample_rate, aduration = result
-
-    if aframes.numel() > 0:
-        # when audio stream is found
-        aframes = _align_audio_frames(aframes, aframe_pts, audio_pts_range)
-
-    return vframes, aframes
-
-
-def _read_video_timestamps_from_memory(
-    video_data: torch.Tensor,
-) -> tuple[list[int], list[int], VideoMetaData]:
-    """
-    Decode all frames in the video. Only pts (presentation timestamp) is returned.
-    The actual frame pixel data is not copied. Thus, read_video_timestamps(...)
-    is much faster than read_video(...)
-    """
-    if not isinstance(video_data, torch.Tensor):
-        with warnings.catch_warnings():
-            # Ignore the warning because we actually don't modify the buffer in this function
-            warnings.filterwarnings("ignore", message="The given buffer is not writable")
-            video_data = torch.frombuffer(video_data, dtype=torch.uint8)
-    result = torch.ops.video_reader.read_video_from_memory(
-        video_data,
-        0,  # seek_frame_margin
-        1,  # getPtsOnly
-        1,  # read_video_stream
-        0,  # video_width
-        0,  # video_height
-        0,  # video_min_dimension
-        0,  # video_max_dimension
-        0,  # video_start_pts
-        -1,  # video_end_pts
-        0,  # video_timebase_num
-        1,  # video_timebase_den
-        1,  # read_audio_stream
-        0,  # audio_samples
-        0,  # audio_channels
-        0,  # audio_start_pts
-        -1,  # audio_end_pts
-        0,  # audio_timebase_num
-        1,  # audio_timebase_den
-    )
-    _raise_video_deprecation_warning()
-    _vframes, vframe_pts, vtimebase, vfps, vduration, _aframes, aframe_pts, atimebase, asample_rate, aduration = result
-    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
-
-    vframe_pts = vframe_pts.numpy().tolist()
-    aframe_pts = aframe_pts.numpy().tolist()
-    return vframe_pts, aframe_pts, info
-
-
-def _probe_video_from_memory(
-    video_data: torch.Tensor,
-) -> VideoMetaData:
-    """
-    Probe a video in memory and return VideoMetaData with info about the video
-    This function is torchscriptable
-    """
-    _raise_video_deprecation_warning()
-    if not isinstance(video_data, torch.Tensor):
-        with warnings.catch_warnings():
-            # Ignore the warning because we actually don't modify the buffer in this function
-            warnings.filterwarnings("ignore", message="The given buffer is not writable")
-            video_data = torch.frombuffer(video_data, dtype=torch.uint8)
-    result = torch.ops.video_reader.probe_video_from_memory(video_data)
-    vtimebase, vfps, vduration, atimebase, asample_rate, aduration = result
-    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
-    return info
-
-
-def _read_video(
-    filename: str,
-    start_pts: Union[float, Fraction] = 0,
-    end_pts: Optional[Union[float, Fraction]] = None,
-    pts_unit: str = "pts",
-) -> tuple[torch.Tensor, torch.Tensor, dict[str, float]]:
-    _raise_video_deprecation_warning()
-    if end_pts is None:
-        end_pts = float("inf")
-
-    if pts_unit == "pts":
-        warnings.warn(
-            "The pts_unit 'pts' gives wrong results and will be removed in a "
-            + "follow-up version. Please use pts_unit 'sec'."
-        )
-
-    info = _probe_video_from_file(filename)
-
-    has_video = info.has_video
-    has_audio = info.has_audio
-
-    def get_pts(time_base):
-        start_offset = start_pts
-        end_offset = end_pts
-        if pts_unit == "sec":
-            start_offset = int(math.floor(start_pts * (1 / time_base)))
-            if end_offset != float("inf"):
-                end_offset = int(math.ceil(end_pts * (1 / time_base)))
-        if end_offset == float("inf"):
-            end_offset = -1
-        return start_offset, end_offset
-
-    video_pts_range = (0, -1)
-    video_timebase = default_timebase
-    if has_video:
-        video_timebase = Fraction(info.video_timebase.numerator, info.video_timebase.denominator)
-        video_pts_range = get_pts(video_timebase)
-
-    audio_pts_range = (0, -1)
-    audio_timebase = default_timebase
-    if has_audio:
-        audio_timebase = Fraction(info.audio_timebase.numerator, info.audio_timebase.denominator)
-        audio_pts_range = get_pts(audio_timebase)
-
-    vframes, aframes, info = _read_video_from_file(
-        filename,
-        read_video_stream=True,
-        video_pts_range=video_pts_range,
-        video_timebase=video_timebase,
-        read_audio_stream=True,
-        audio_pts_range=audio_pts_range,
-        audio_timebase=audio_timebase,
-    )
-    _info = {}
-    if has_video:
-        _info["video_fps"] = info.video_fps
-    if has_audio:
-        _info["audio_fps"] = info.audio_sample_rate
-
-    return vframes, aframes, _info
-
-
-def _read_video_timestamps(
-    filename: str, pts_unit: str = "pts"
-) -> tuple[Union[list[int], list[Fraction]], Optional[float]]:
-    _raise_video_deprecation_warning()
-    if pts_unit == "pts":
-        warnings.warn(
-            "The pts_unit 'pts' gives wrong results and will be removed in a "
-            + "follow-up version. Please use pts_unit 'sec'."
-        )
-
-    pts: Union[list[int], list[Fraction]]
-    pts, _, info = _read_video_timestamps_from_file(filename)
-
-    if pts_unit == "sec":
-        video_time_base = Fraction(info.video_timebase.numerator, info.video_timebase.denominator)
-        pts = [x * video_time_base for x in pts]
-
-    video_fps = info.video_fps if info.has_video else None
-
-    return pts, video_fps
diff --git a/torchvision/io/video_reader.py b/torchvision/io/video_reader.py
deleted file mode 100644
index 253c76376f7..00000000000
--- a/torchvision/io/video_reader.py
+++ /dev/null
@@ -1,279 +0,0 @@
-import io
-import warnings
-from collections.abc import Iterator
-
-from typing import Any
-
-import torch
-
-from ..utils import _log_api_usage_once
-from ._video_deprecation_warning import _raise_video_deprecation_warning
-
-from ._video_opt import _HAS_CPU_VIDEO_DECODER
-
-if _HAS_CPU_VIDEO_DECODER:
-
-    def _has_video_opt() -> bool:
-        return True
-
-else:
-
-    def _has_video_opt() -> bool:
-        return False
-
-
-try:
-    import av
-
-    av.logging.set_level(av.logging.ERROR)
-    if not hasattr(av.video.frame.VideoFrame, "pict_type"):
-        av = ImportError(
-            """\
-Your version of PyAV is too old for the necessary video operations in torchvision.
-If you are on Python 3.5, you will have to build from source (the conda-forge
-packages are not up-to-date).  See
-https://github.com/mikeboers/PyAV#installation for instructions on how to
-install PyAV on your system.
-"""
-        )
-except ImportError:
-    av = ImportError(
-        """\
-PyAV is not installed, and is necessary for the video operations in torchvision.
-See https://github.com/mikeboers/PyAV#installation for instructions on how to
-install PyAV on your system.
-"""
-    )
-
-
-class VideoReader:
-    """[DEPRECATED] Fine-grained video-reading API.
-    Supports frame-by-frame reading of various streams from a single video
-    container. Much like previous video_reader API it supports the following
-    backends: video_reader and pyav.
-    Backends can be set via `torchvision.set_video_backend` function.
-
-    .. warning::
-
-        DEPRECATED: All the video decoding and encoding capabilities of torchvision
-        are deprecated from version 0.22 and will be removed in version 0.24.  We
-        recommend that you migrate to
-        `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
-        consolidate the future decoding/encoding capabilities of PyTorch
-
-    .. betastatus:: VideoReader class
-
-    Example:
-        The following examples creates a :mod:`VideoReader` object, seeks into 2s
-        point, and returns a single frame::
-
-            import torchvision
-            video_path = "path_to_a_test_video"
-            reader = torchvision.io.VideoReader(video_path, "video")
-            reader.seek(2.0)
-            frame = next(reader)
-
-        :mod:`VideoReader` implements the iterable API, which makes it suitable to
-        using it in conjunction with :mod:`itertools` for more advanced reading.
-        As such, we can use a :mod:`VideoReader` instance inside for loops::
-
-            reader.seek(2)
-            for frame in reader:
-                frames.append(frame['data'])
-            # additionally, `seek` implements a fluent API, so we can do
-            for frame in reader.seek(2):
-                frames.append(frame['data'])
-
-        With :mod:`itertools`, we can read all frames between 2 and 5 seconds with the
-        following code::
-
-            for frame in itertools.takewhile(lambda x: x['pts'] <= 5, reader.seek(2)):
-                frames.append(frame['data'])
-
-        and similarly, reading 10 frames after the 2s timestamp can be achieved
-        as follows::
-
-            for frame in itertools.islice(reader.seek(2), 10):
-                frames.append(frame['data'])
-
-    .. note::
-
-        Each stream descriptor consists of two parts: stream type (e.g. 'video') and
-        a unique stream id (which are determined by the video encoding).
-        In this way, if the video container contains multiple
-        streams of the same type, users can access the one they want.
-        If only stream type is passed, the decoder auto-detects first stream of that type.
-
-    Args:
-        src (string, bytes object, or tensor): The media source.
-            If string-type, it must be a file path supported by FFMPEG.
-            If bytes, should be an in-memory representation of a file supported by FFMPEG.
-            If Tensor, it is interpreted internally as byte buffer.
-            It must be one-dimensional, of type ``torch.uint8``.
-
-        stream (string, optional): descriptor of the required stream, followed by the stream id,
-            in the format ``{stream_type}:{stream_id}``. Defaults to ``"video:0"``.
-            Currently available options include ``['video', 'audio']``
-
-        num_threads (int, optional): number of threads used by the codec to decode video.
-            Default value (0) enables multithreading with codec-dependent heuristic. The performance
-            will depend on the version of FFMPEG codecs supported.
-    """
-
-    def __init__(
-        self,
-        src: str,
-        stream: str = "video",
-        num_threads: int = 0,
-    ) -> None:
-        _raise_video_deprecation_warning()
-        _log_api_usage_once(self)
-        from .. import get_video_backend
-
-        self.backend = get_video_backend()
-        if isinstance(src, str):
-            if not src:
-                raise ValueError("src cannot be empty")
-        elif isinstance(src, bytes):
-            if self.backend == "pyav":
-                src = io.BytesIO(src)
-            else:
-                with warnings.catch_warnings():
-                    # Ignore the warning because we actually don't modify the buffer in this function
-                    warnings.filterwarnings("ignore", message="The given buffer is not writable")
-                    src = torch.frombuffer(src, dtype=torch.uint8)
-        elif isinstance(src, torch.Tensor):
-            if self.backend == "pyav":
-                raise RuntimeError("VideoReader cannot be initialized from Tensor object when using pyav backend.")
-        else:
-            raise ValueError(f"src must be either string, Tensor or bytes object. Got {type(src)}")
-
-        if self.backend == "video_reader":
-            if isinstance(src, str):
-                self._c = torch.classes.torchvision.Video(src, stream, num_threads)
-            elif isinstance(src, torch.Tensor):
-                self._c = torch.classes.torchvision.Video("", "", 0)
-                self._c.init_from_memory(src, stream, num_threads)
-
-        elif self.backend == "pyav":
-            self.container = av.open(src, metadata_errors="ignore")
-            # TODO: load metadata
-            stream_type = stream.split(":")[0]
-            stream_id = 0 if len(stream.split(":")) == 1 else int(stream.split(":")[1])
-            self.pyav_stream = {stream_type: stream_id}
-            self._c = self.container.decode(**self.pyav_stream)
-
-            # TODO: add extradata exception
-
-        else:
-            raise RuntimeError(f"Unknown video backend: {self.backend}")
-
-    def __next__(self) -> dict[str, Any]:
-        """Decodes and returns the next frame of the current stream.
-        Frames are encoded as a dict with mandatory
-        data and pts fields, where data is a tensor, and pts is a
-        presentation timestamp of the frame expressed in seconds
-        as a float.
-
-        Returns:
-            (dict): a dictionary and containing decoded frame (``data``)
-            and corresponding timestamp (``pts``) in seconds
-
-        """
-        if self.backend == "video_reader":
-            frame, pts = self._c.next()
-        else:
-            try:
-                frame = next(self._c)
-                pts = float(frame.pts * frame.time_base)
-                if "video" in self.pyav_stream:
-                    frame = torch.as_tensor(frame.to_rgb().to_ndarray()).permute(2, 0, 1)
-                elif "audio" in self.pyav_stream:
-                    frame = torch.as_tensor(frame.to_ndarray()).permute(1, 0)
-                else:
-                    frame = None
-            except av.error.EOFError:
-                raise StopIteration
-
-        if frame.numel() == 0:
-            raise StopIteration
-
-        return {"data": frame, "pts": pts}
-
-    def __iter__(self) -> Iterator[dict[str, Any]]:
-        return self
-
-    def seek(self, time_s: float, keyframes_only: bool = False) -> "VideoReader":
-        """Seek within current stream.
-
-        Args:
-            time_s (float): seek time in seconds
-            keyframes_only (bool): allow to seek only to keyframes
-
-        .. note::
-            Current implementation is the so-called precise seek. This
-            means following seek, call to :mod:`next()` will return the
-            frame with the exact timestamp if it exists or
-            the first frame with timestamp larger than ``time_s``.
-        """
-        if self.backend == "video_reader":
-            self._c.seek(time_s, keyframes_only)
-        else:
-            # handle special case as pyav doesn't catch it
-            if time_s < 0:
-                time_s = 0
-            temp_str = self.container.streams.get(**self.pyav_stream)[0]
-            offset = int(round(time_s / temp_str.time_base))
-            if not keyframes_only:
-                warnings.warn("Accurate seek is not implemented for pyav backend")
-            self.container.seek(offset, backward=True, any_frame=False, stream=temp_str)
-            self._c = self.container.decode(**self.pyav_stream)
-        return self
-
-    def get_metadata(self) -> dict[str, Any]:
-        """Returns video metadata
-
-        Returns:
-            (dict): dictionary containing duration and frame rate for every stream
-        """
-        if self.backend == "pyav":
-            metadata = {}  # type:  Dict[str, Any]
-            for stream in self.container.streams:
-                if stream.type not in metadata:
-                    if stream.type == "video":
-                        rate_n = "fps"
-                    else:
-                        rate_n = "framerate"
-                    metadata[stream.type] = {rate_n: [], "duration": []}
-
-                rate = getattr(stream, "average_rate", None) or stream.sample_rate
-
-                metadata[stream.type]["duration"].append(float(stream.duration * stream.time_base))
-                metadata[stream.type][rate_n].append(float(rate))
-            return metadata
-        return self._c.get_metadata()
-
-    def set_current_stream(self, stream: str) -> bool:
-        """Set current stream.
-        Explicitly define the stream we are operating on.
-
-        Args:
-            stream (string): descriptor of the required stream. Defaults to ``"video:0"``
-                Currently available stream types include ``['video', 'audio']``.
-                Each descriptor consists of two parts: stream type (e.g. 'video') and
-                a unique stream id (which are determined by video encoding).
-                In this way, if the video container contains multiple
-                streams of the same type, users can access the one they want.
-                If only stream type is passed, the decoder auto-detects first stream
-                of that type and returns it.
-
-        Returns:
-            (bool): True on success, False otherwise
-        """
-        if self.backend == "pyav":
-            stream_type = stream.split(":")[0]
-            stream_id = 0 if len(stream.split(":")) == 1 else int(stream.split(":")[1])
-            self.pyav_stream = {stream_type: stream_id}
-            self._c = self.container.decode(**self.pyav_stream)
-            return True
-        return self._c.set_current_stream(stream)