From e79e0bbc8f9c567f4699380b16f139168516a75b Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Fri, 30 Jan 2026 01:46:05 -0800
Subject: [PATCH] [torchvision] Move video_reader backend to fb/ for
 internal-only use (#9370)

Summary:
Pull Request resolved: https://github.com/pytorch/vision/pull/9370

Pull Request resolved: https://github.com/pytorch/vision/pull/9369

Move the video_reader backend (C++ decoder and Python API) from the open-source torchvision/ folder to the internal-only fb/ folder. This prepares for removing the deprecated video_reader backend from the open-source GitHub repository while maintaining backward compatibility for internal Meta users.

The move preserves all existing BUCK target paths via aliases, so existing internal consumers continue to work without modification. Python imports from torchvision.io are redirected to the fb/ location, with stubs provided as the fallback for OSS.

Differential Revision: D91702027
---
 torchvision/csrc/io/decoder/audio_sampler.cpp | 254 ------
 torchvision/csrc/io/decoder/audio_sampler.h   |  39 -
 torchvision/csrc/io/decoder/audio_stream.cpp  | 119 ---
 torchvision/csrc/io/decoder/audio_stream.h    |  29 -
 torchvision/csrc/io/decoder/cc_stream.cpp     |  24 -
 torchvision/csrc/io/decoder/cc_stream.h       |  22 -
 torchvision/csrc/io/decoder/decoder.cpp       | 764 ------------------
 torchvision/csrc/io/decoder/decoder.h         | 100 ---
 torchvision/csrc/io/decoder/defs.h            | 415 ----------
 torchvision/csrc/io/decoder/memory_buffer.cpp |  71 --
 torchvision/csrc/io/decoder/memory_buffer.h   |  25 -
 .../csrc/io/decoder/seekable_buffer.cpp       | 139 ----
 torchvision/csrc/io/decoder/seekable_buffer.h |  45 --
 torchvision/csrc/io/decoder/stream.cpp        | 288 -------
 torchvision/csrc/io/decoder/stream.h          |  80 --
 .../csrc/io/decoder/subtitle_sampler.cpp      |  46 --
 .../csrc/io/decoder/subtitle_sampler.h        |  32 -
 .../csrc/io/decoder/subtitle_stream.cpp       |  96 ---
 torchvision/csrc/io/decoder/subtitle_stream.h |  38 -
 torchvision/csrc/io/decoder/sync_decoder.cpp  |  97 ---
 torchvision/csrc/io/decoder/sync_decoder.h    |  48 --
 .../csrc/io/decoder/sync_decoder_test.cpp     | 416 ----------
 torchvision/csrc/io/decoder/time_keeper.cpp   |  35 -
 torchvision/csrc/io/decoder/time_keeper.h     |  25 -
 torchvision/csrc/io/decoder/util.cpp          | 401 ---------
 torchvision/csrc/io/decoder/util.h            |  28 -
 torchvision/csrc/io/decoder/util_test.cpp     |  34 -
 torchvision/csrc/io/decoder/video_sampler.cpp | 337 --------
 torchvision/csrc/io/decoder/video_sampler.h   |  44 -
 torchvision/csrc/io/decoder/video_stream.cpp  | 131 ---
 torchvision/csrc/io/decoder/video_stream.h    |  31 -
 torchvision/csrc/io/video/video.cpp           | 387 ---------
 torchvision/csrc/io/video/video.h             |  75 --
 .../csrc/io/video_reader/video_reader.cpp     | 677 ----------------
 .../csrc/io/video_reader/video_reader.h       |  55 --
 torchvision/io/__init__.py                    |  89 +-
 torchvision/io/_video_opt.py                  | 516 ------------
 torchvision/io/video_reader.py                | 279 -------
 38 files changed, 76 insertions(+), 6255 deletions(-)
 delete mode 100644 torchvision/csrc/io/decoder/audio_sampler.cpp
 delete mode 100644 torchvision/csrc/io/decoder/audio_sampler.h
 delete mode 100644 torchvision/csrc/io/decoder/audio_stream.cpp
 delete mode 100644 torchvision/csrc/io/decoder/audio_stream.h
 delete mode 100644 torchvision/csrc/io/decoder/cc_stream.cpp
 delete mode 100644 torchvision/csrc/io/decoder/cc_stream.h
 delete mode 100644 torchvision/csrc/io/decoder/decoder.cpp
 delete mode 100644 torchvision/csrc/io/decoder/decoder.h
 delete mode 100644 torchvision/csrc/io/decoder/defs.h
 delete mode 100644 torchvision/csrc/io/decoder/memory_buffer.cpp
 delete mode 100644 torchvision/csrc/io/decoder/memory_buffer.h
 delete mode 100644 torchvision/csrc/io/decoder/seekable_buffer.cpp
 delete mode 100644 torchvision/csrc/io/decoder/seekable_buffer.h
 delete mode 100644 torchvision/csrc/io/decoder/stream.cpp
 delete mode 100644 torchvision/csrc/io/decoder/stream.h
 delete mode 100644 torchvision/csrc/io/decoder/subtitle_sampler.cpp
 delete mode 100644 torchvision/csrc/io/decoder/subtitle_sampler.h
 delete mode 100644 torchvision/csrc/io/decoder/subtitle_stream.cpp
 delete mode 100644 torchvision/csrc/io/decoder/subtitle_stream.h
 delete mode 100644 torchvision/csrc/io/decoder/sync_decoder.cpp
 delete mode 100644 torchvision/csrc/io/decoder/sync_decoder.h
 delete mode 100644 torchvision/csrc/io/decoder/sync_decoder_test.cpp
 delete mode 100644 torchvision/csrc/io/decoder/time_keeper.cpp
 delete mode 100644 torchvision/csrc/io/decoder/time_keeper.h
 delete mode 100644 torchvision/csrc/io/decoder/util.cpp
 delete mode 100644 torchvision/csrc/io/decoder/util.h
 delete mode 100644 torchvision/csrc/io/decoder/util_test.cpp
 delete mode 100644 torchvision/csrc/io/decoder/video_sampler.cpp
 delete mode 100644 torchvision/csrc/io/decoder/video_sampler.h
 delete mode 100644 torchvision/csrc/io/decoder/video_stream.cpp
 delete mode 100644 torchvision/csrc/io/decoder/video_stream.h
 delete mode 100644 torchvision/csrc/io/video/video.cpp
 delete mode 100644 torchvision/csrc/io/video/video.h
 delete mode 100644 torchvision/csrc/io/video_reader/video_reader.cpp
 delete mode 100644 torchvision/csrc/io/video_reader/video_reader.h
 delete mode 100644 torchvision/io/_video_opt.py
 delete mode 100644 torchvision/io/video_reader.py

diff --git a/torchvision/csrc/io/decoder/audio_sampler.cpp b/torchvision/csrc/io/decoder/audio_sampler.cpp
deleted file mode 100644
index b158d3438b8..00000000000
--- a/torchvision/csrc/io/decoder/audio_sampler.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-#include "audio_sampler.h"
-#include <c10/util/Logging.h>
-#include "util.h"
-
-#define AVRESAMPLE_MAX_CHANNELS 32
-
-// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24
-namespace ffmpeg {
-
-namespace {
-int preparePlanes(
-    const AudioFormat& fmt,
-    const uint8_t* buffer,
-    int numSamples,
-    uint8_t** planes) {
-  int result;
-  if ((result = av_samples_fill_arrays(
-           planes,
-           nullptr, // linesize is not needed
-           buffer,
-           fmt.channels,
-           numSamples,
-           (AVSampleFormat)fmt.format,
-           1)) < 0) {
-    LOG(ERROR) << "av_samples_fill_arrays failed, err: "
-               << Util::generateErrorDesc(result)
-               << ", numSamples: " << numSamples << ", fmt: " << fmt.format;
-  }
-  return result;
-}
-} // namespace
-
-AudioSampler::AudioSampler(void* logCtx) : logCtx_(logCtx) {}
-
-AudioSampler::~AudioSampler() {
-  cleanUp();
-}
-
-void AudioSampler::shutdown() {
-  cleanUp();
-}
-
-bool AudioSampler::init(const SamplerParameters& params) {
-  cleanUp();
-
-  if (params.type != MediaType::TYPE_AUDIO) {
-    LOG(ERROR) << "Invalid media type, expected MediaType::TYPE_AUDIO";
-    return false;
-  }
-
-#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)
-  AVChannelLayout channel_out;
-  AVChannelLayout channel_in;
-  av_channel_layout_default(&channel_out, params.out.audio.channels);
-  av_channel_layout_default(&channel_in, params.in.audio.channels);
-  int ret = swr_alloc_set_opts2(
-      &swrContext_,
-      &channel_out,
-      (AVSampleFormat)params.out.audio.format,
-      params.out.audio.samples,
-      &channel_in,
-      (AVSampleFormat)params.in.audio.format,
-      params.in.audio.samples,
-      0,
-      logCtx_);
-  if (ret < 0 || swrContext_ == nullptr) {
-    LOG(ERROR) << "Cannot allocate SwrContext";
-    return false;
-  }
-#else
-  swrContext_ = swr_alloc_set_opts(
-      nullptr,
-      av_get_default_channel_layout(params.out.audio.channels),
-      (AVSampleFormat)params.out.audio.format,
-      params.out.audio.samples,
-      av_get_default_channel_layout(params.in.audio.channels),
-      (AVSampleFormat)params.in.audio.format,
-      params.in.audio.samples,
-      0,
-      logCtx_);
-  if (swrContext_ == nullptr) {
-    LOG(ERROR) << "Cannot allocate SwrContext";
-    return false;
-  }
-#endif
-
-  int result;
-  if ((result = swr_init(swrContext_)) < 0) {
-    LOG(ERROR) << "swr_init failed, err: " << Util::generateErrorDesc(result)
-               << ", in -> format: " << params.in.audio.format
-               << ", channels: " << params.in.audio.channels
-               << ", samples: " << params.in.audio.samples
-               << ", out -> format: " << params.out.audio.format
-               << ", channels: " << params.out.audio.channels
-               << ", samples: " << params.out.audio.samples;
-    return false;
-  }
-
-  // set formats
-  params_ = params;
-  return true;
-}
-
-int AudioSampler::numOutputSamples(int inSamples) const {
-  return swr_get_out_samples(swrContext_, inSamples);
-}
-
-int AudioSampler::sample(
-    const uint8_t* inPlanes[],
-    int inNumSamples,
-    ByteStorage* out,
-    int outNumSamples) {
-  int result;
-  int outBufferBytes = av_samples_get_buffer_size(
-      nullptr,
-      params_.out.audio.channels,
-      outNumSamples,
-      (AVSampleFormat)params_.out.audio.format,
-      1);
-
-  if (out) {
-    out->ensure(outBufferBytes);
-
-    uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr};
-
-    if ((result = preparePlanes(
-             params_.out.audio,
-             out->writableTail(),
-             outNumSamples,
-             outPlanes)) < 0) {
-      return result;
-    }
-
-    if ((result = swr_convert(
-             swrContext_,
-             &outPlanes[0],
-             outNumSamples,
-             inPlanes,
-             inNumSamples)) < 0) {
-      LOG(ERROR) << "swr_convert failed, err: "
-                 << Util::generateErrorDesc(result);
-      return result;
-    }
-
-    TORCH_CHECK_LE(result, outNumSamples);
-
-    if (result) {
-      if ((result = av_samples_get_buffer_size(
-               nullptr,
-               params_.out.audio.channels,
-               result,
-               (AVSampleFormat)params_.out.audio.format,
-               1)) >= 0) {
-        out->append(result);
-      } else {
-        LOG(ERROR) << "av_samples_get_buffer_size failed, err: "
-                   << Util::generateErrorDesc(result);
-      }
-    }
-  } else {
-    // allocate a temporary buffer
-    auto* tmpBuffer = static_cast<uint8_t*>(av_malloc(outBufferBytes));
-    if (!tmpBuffer) {
-      LOG(ERROR) << "av_alloc failed, for size: " << outBufferBytes;
-      return -1;
-    }
-
-    uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr};
-
-    if ((result = preparePlanes(
-             params_.out.audio, tmpBuffer, outNumSamples, outPlanes)) < 0) {
-      av_free(tmpBuffer);
-      return result;
-    }
-
-    if ((result = swr_convert(
-             swrContext_,
-             &outPlanes[0],
-             outNumSamples,
-             inPlanes,
-             inNumSamples)) < 0) {
-      LOG(ERROR) << "swr_convert failed, err: "
-                 << Util::generateErrorDesc(result);
-      av_free(tmpBuffer);
-      return result;
-    }
-
-    av_free(tmpBuffer);
-
-    TORCH_CHECK_LE(result, outNumSamples);
-
-    if (result) {
-      result = av_samples_get_buffer_size(
-          nullptr,
-          params_.out.audio.channels,
-          result,
-          (AVSampleFormat)params_.out.audio.format,
-          1);
-    }
-  }
-
-  return result;
-}
-
-int AudioSampler::sample(AVFrame* frame, ByteStorage* out) {
-  const auto outNumSamples = numOutputSamples(frame ? frame->nb_samples : 0);
-
-  if (!outNumSamples) {
-    return 0;
-  }
-
-  return sample(
-      frame ? (const uint8_t**)&frame->data[0] : nullptr,
-      frame ? frame->nb_samples : 0,
-      out,
-      outNumSamples);
-}
-
-int AudioSampler::sample(const ByteStorage* in, ByteStorage* out) {
-  const auto inSampleSize =
-      av_get_bytes_per_sample((AVSampleFormat)params_.in.audio.format);
-
-  const auto inNumSamples =
-      !in ? 0 : in->length() / inSampleSize / params_.in.audio.channels;
-
-  const auto outNumSamples = numOutputSamples(inNumSamples);
-
-  if (!outNumSamples) {
-    return 0;
-  }
-
-  uint8_t* inPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr};
-  int result;
-  if (in &&
-      (result = preparePlanes(
-           params_.in.audio, in->data(), inNumSamples, inPlanes)) < 0) {
-    return result;
-  }
-
-  return sample(
-      in ? (const uint8_t**)inPlanes : nullptr,
-      inNumSamples,
-      out,
-      outNumSamples);
-}
-
-void AudioSampler::cleanUp() {
-  if (swrContext_) {
-    swr_free(&swrContext_);
-    swrContext_ = nullptr;
-  }
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/audio_sampler.h b/torchvision/csrc/io/decoder/audio_sampler.h
deleted file mode 100644
index e105bbe4de2..00000000000
--- a/torchvision/csrc/io/decoder/audio_sampler.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class transcode audio frames from one format into another
- */
-
-class AudioSampler : public MediaSampler {
- public:
-  explicit AudioSampler(void* logCtx);
-  ~AudioSampler() override;
-
-  // MediaSampler overrides
-  bool init(const SamplerParameters& params) override;
-  int sample(const ByteStorage* in, ByteStorage* out) override;
-  void shutdown() override;
-
-  int sample(AVFrame* frame, ByteStorage* out);
-
- private:
-  // close resources
-  void cleanUp();
-  // helper functions for rescaling, cropping, etc.
-  int numOutputSamples(int inSamples) const;
-  int sample(
-      const uint8_t* inPlanes[],
-      int inNumSamples,
-      ByteStorage* out,
-      int outNumSamples);
-
- private:
-  SwrContext* swrContext_{nullptr};
-  void* logCtx_{nullptr};
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/audio_stream.cpp b/torchvision/csrc/io/decoder/audio_stream.cpp
deleted file mode 100644
index c3a003434b8..00000000000
--- a/torchvision/csrc/io/decoder/audio_stream.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-#include "audio_stream.h"
-#include <c10/util/Logging.h>
-#include "util.h"
-
-namespace ffmpeg {
-
-namespace {
-static int get_nb_channels(const AVFrame* frame, const AVCodecContext* codec) {
-#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)
-  return frame ? frame->ch_layout.nb_channels : codec->ch_layout.nb_channels;
-#else
-  return frame ? frame->channels : codec->channels;
-#endif
-}
-
-bool operator==(const AudioFormat& x, const AVFrame& y) {
-  return x.samples == static_cast<size_t>(y.sample_rate) &&
-      x.channels == static_cast<size_t>(get_nb_channels(&y, nullptr)) &&
-      x.format == y.format;
-}
-
-bool operator==(const AudioFormat& x, const AVCodecContext& y) {
-  return x.samples == static_cast<size_t>(y.sample_rate) &&
-      x.channels == static_cast<size_t>(get_nb_channels(nullptr, &y)) &&
-      x.format == y.sample_fmt;
-}
-
-AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) {
-  x.samples = y.sample_rate;
-  x.channels = get_nb_channels(&y, nullptr);
-  x.format = y.format;
-  return x;
-}
-
-AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) {
-  x.samples = y.sample_rate;
-  x.channels = get_nb_channels(nullptr, &y);
-  x.format = y.sample_fmt;
-  return x;
-}
-} // namespace
-
-AudioStream::AudioStream(
-    AVFormatContext* inputCtx,
-    int index,
-    bool convertPtsToWallTime,
-    const AudioFormat& format)
-    : Stream(
-          inputCtx,
-          MediaFormat::makeMediaFormat(format, index),
-          convertPtsToWallTime,
-          0) {}
-
-AudioStream::~AudioStream() {
-  if (sampler_) {
-    sampler_->shutdown();
-    sampler_.reset();
-  }
-}
-
-int AudioStream::initFormat() {
-  // set output format
-  if (format_.format.audio.samples == 0) {
-    format_.format.audio.samples = codecCtx_->sample_rate;
-  }
-#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)
-  if (format_.format.audio.channels == 0) {
-    format_.format.audio.channels = codecCtx_->ch_layout.nb_channels;
-  }
-#else
-  if (format_.format.audio.channels == 0) {
-    format_.format.audio.channels = codecCtx_->channels;
-  }
-#endif
-  if (format_.format.audio.format == AV_SAMPLE_FMT_NONE) {
-    format_.format.audio.format = codecCtx_->sample_fmt;
-  }
-
-  return format_.format.audio.samples != 0 &&
-          format_.format.audio.channels != 0 &&
-          format_.format.audio.format != AV_SAMPLE_FMT_NONE
-      ? 0
-      : -1;
-}
-
-// copies audio sample bytes via swr_convert call in audio_sampler.cpp
-int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
-  if (!sampler_) {
-    sampler_ = std::make_unique<AudioSampler>(codecCtx_);
-  }
-  // check if input format gets changed
-  if (flush ? !(sampler_->getInputFormat().audio == *codecCtx_)
-            : !(sampler_->getInputFormat().audio == *frame_)) {
-    // - reinit sampler
-    SamplerParameters params;
-    params.type = format_.type;
-    params.out = format_.format;
-    params.in = FormatUnion();
-    flush ? toAudioFormat(params.in.audio, *codecCtx_)
-          : toAudioFormat(params.in.audio, *frame_);
-    if (!sampler_->init(params)) {
-      return -1;
-    }
-
-    VLOG(1) << "Set input audio sampler format"
-            << ", samples: " << params.in.audio.samples
-            << ", channels: " << params.in.audio.channels
-            << ", format: " << params.in.audio.format
-            << " : output audio sampler format"
-            << ", samples: " << format_.format.audio.samples
-            << ", channels: " << format_.format.audio.channels
-            << ", format: " << format_.format.audio.format;
-  }
-  // calls to a sampler that converts the audio samples and copies them to the
-  // out buffer via ffmpeg::swr_convert
-  return sampler_->sample(flush ? nullptr : frame_, out);
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/audio_stream.h b/torchvision/csrc/io/decoder/audio_stream.h
deleted file mode 100644
index 2d6457b68f5..00000000000
--- a/torchvision/csrc/io/decoder/audio_stream.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#pragma once
-
-#include "audio_sampler.h"
-#include "stream.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode one audio stream.
- */
-
-class AudioStream : public Stream {
- public:
-  AudioStream(
-      AVFormatContext* inputCtx,
-      int index,
-      bool convertPtsToWallTime,
-      const AudioFormat& format);
-  ~AudioStream() override;
-
- private:
-  int initFormat() override;
-  int copyFrameBytes(ByteStorage* out, bool flush) override;
-
- private:
-  std::unique_ptr<AudioSampler> sampler_;
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/cc_stream.cpp b/torchvision/csrc/io/decoder/cc_stream.cpp
deleted file mode 100644
index 89174c396fd..00000000000
--- a/torchvision/csrc/io/decoder/cc_stream.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "cc_stream.h"
-
-namespace ffmpeg {
-
-CCStream::CCStream(
-    AVFormatContext* inputCtx,
-    int index,
-    bool convertPtsToWallTime,
-    const SubtitleFormat& format)
-    : SubtitleStream(inputCtx, index, convertPtsToWallTime, format) {
-  format_.type = TYPE_CC;
-}
-
-AVCodec* CCStream::findCodec(AVCodecParameters* params) {
-  if (params->codec_id == AV_CODEC_ID_BIN_DATA &&
-      params->codec_type == AVMEDIA_TYPE_DATA) {
-    // obtain subtitles codec
-    params->codec_id = AV_CODEC_ID_MOV_TEXT;
-    params->codec_type = AVMEDIA_TYPE_SUBTITLE;
-  }
-  return Stream::findCodec(params);
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/cc_stream.h b/torchvision/csrc/io/decoder/cc_stream.h
deleted file mode 100644
index 3a1d169f014..00000000000
--- a/torchvision/csrc/io/decoder/cc_stream.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-
-#include "subtitle_stream.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode one closed captions stream.
- */
-class CCStream : public SubtitleStream {
- public:
-  CCStream(
-      AVFormatContext* inputCtx,
-      int index,
-      bool convertPtsToWallTime,
-      const SubtitleFormat& format);
-
- private:
-  AVCodec* findCodec(AVCodecParameters* params) override;
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp
deleted file mode 100644
index 7221445840e..00000000000
--- a/torchvision/csrc/io/decoder/decoder.cpp
+++ /dev/null
@@ -1,764 +0,0 @@
-#include "decoder.h"
-#include <c10/util/Logging.h>
-#include <libavutil/avutil.h>
-#include <future>
-#include <iostream>
-#include <mutex>
-#include "audio_stream.h"
-#include "cc_stream.h"
-#include "subtitle_stream.h"
-#include "util.h"
-#include "video_stream.h"
-
-namespace ffmpeg {
-
-namespace {
-
-constexpr size_t kIoBufferSize = 96 * 1024;
-constexpr size_t kIoPaddingSize = AV_INPUT_BUFFER_PADDING_SIZE;
-constexpr size_t kLogBufferSize = 1024;
-
-bool mapFfmpegType(AVMediaType media, MediaType* type) {
-  switch (media) {
-    case AVMEDIA_TYPE_AUDIO:
-      *type = TYPE_AUDIO;
-      return true;
-    case AVMEDIA_TYPE_VIDEO:
-      *type = TYPE_VIDEO;
-      return true;
-    case AVMEDIA_TYPE_SUBTITLE:
-      *type = TYPE_SUBTITLE;
-      return true;
-    case AVMEDIA_TYPE_DATA:
-      *type = TYPE_CC;
-      return true;
-    default:
-      return false;
-  }
-}
-
-std::unique_ptr<Stream> createStream(
-    MediaType type,
-    AVFormatContext* ctx,
-    int idx,
-    bool convertPtsToWallTime,
-    const FormatUnion& format,
-    int64_t loggingUuid) {
-  switch (type) {
-    case TYPE_AUDIO:
-      return std::make_unique<AudioStream>(
-          ctx, idx, convertPtsToWallTime, format.audio);
-    case TYPE_VIDEO:
-      return std::make_unique<VideoStream>(
-          // negative loggingUuid indicates video streams.
-          ctx,
-          idx,
-          convertPtsToWallTime,
-          format.video,
-          -loggingUuid);
-    case TYPE_SUBTITLE:
-      return std::make_unique<SubtitleStream>(
-          ctx, idx, convertPtsToWallTime, format.subtitle);
-    case TYPE_CC:
-      return std::make_unique<CCStream>(
-          ctx, idx, convertPtsToWallTime, format.subtitle);
-    default:
-      return nullptr;
-  }
-}
-
-} // Namespace
-
-/* static */
-void Decoder::logFunction(void* avcl, int level, const char* cfmt, va_list vl) {
-  if (!avcl) {
-    // Nothing can be done here
-    return;
-  }
-
-  AVClass* avclass = *reinterpret_cast<AVClass**>(avcl);
-  if (!avclass) {
-    // Nothing can be done here
-    return;
-  }
-  Decoder* decoder = nullptr;
-  if (strcmp(avclass->class_name, "AVFormatContext") == 0) {
-    AVFormatContext* context = reinterpret_cast<AVFormatContext*>(avcl);
-    if (context) {
-      decoder = reinterpret_cast<Decoder*>(context->opaque);
-    }
-  } else if (strcmp(avclass->class_name, "AVCodecContext") == 0) {
-    AVCodecContext* context = reinterpret_cast<AVCodecContext*>(avcl);
-    if (context) {
-      decoder = reinterpret_cast<Decoder*>(context->opaque);
-    }
-  } else if (strcmp(avclass->class_name, "AVIOContext") == 0) {
-    AVIOContext* context = reinterpret_cast<AVIOContext*>(avcl);
-    // only if opaque was assigned to Decoder pointer
-    if (context && context->read_packet == Decoder::readFunction) {
-      decoder = reinterpret_cast<Decoder*>(context->opaque);
-    }
-  } else if (strcmp(avclass->class_name, "SWResampler") == 0) {
-    // expect AVCodecContext as parent
-    if (avclass->parent_log_context_offset) {
-      AVClass** parent =
-          *(AVClass***)(((uint8_t*)avcl) + avclass->parent_log_context_offset);
-      AVCodecContext* context = reinterpret_cast<AVCodecContext*>(parent);
-      if (context) {
-        decoder = reinterpret_cast<Decoder*>(context->opaque);
-      }
-    }
-  } else if (strcmp(avclass->class_name, "SWScaler") == 0) {
-    // cannot find a way to pass context pointer through SwsContext struct
-  } else {
-    VLOG(2) << "Unknown context class: " << avclass->class_name;
-  }
-
-  if (decoder != nullptr && decoder->enableLogLevel(level)) {
-    char buf[kLogBufferSize] = {0};
-    // Format the line
-    int* prefix = decoder->getPrintPrefix();
-    *prefix = 1;
-    av_log_format_line(avcl, level, cfmt, vl, buf, sizeof(buf) - 1, prefix);
-    // pass message to the decoder instance
-    std::string msg(buf);
-    decoder->logCallback(level, msg);
-  }
-}
-
-bool Decoder::enableLogLevel(int level) const {
-  return ssize_t(level) <= params_.logLevel;
-}
-
-void Decoder::logCallback(int level, const std::string& message) {
-  LOG(INFO) << "Msg, uuid=" << params_.loggingUuid << " level=" << level
-            << " msg=" << message;
-}
-
-/* static */
-int Decoder::shutdownFunction(void* ctx) {
-  Decoder* decoder = (Decoder*)ctx;
-  if (decoder == nullptr) {
-    return 1;
-  }
-  return decoder->shutdownCallback();
-}
-
-int Decoder::shutdownCallback() {
-  return interrupted_ ? 1 : 0;
-}
-
-/* static */
-int Decoder::readFunction(void* opaque, uint8_t* buf, int size) {
-  Decoder* decoder = reinterpret_cast<Decoder*>(opaque);
-  if (decoder == nullptr) {
-    return 0;
-  }
-  int bytesRead = decoder->readCallback(buf, size);
-  return bytesRead == 0 ? AVERROR_EOF : bytesRead;
-}
-
-/* static */
-int64_t Decoder::seekFunction(void* opaque, int64_t offset, int whence) {
-  Decoder* decoder = reinterpret_cast<Decoder*>(opaque);
-  if (decoder == nullptr) {
-    return -1;
-  }
-  return decoder->seekCallback(offset, whence);
-}
-
-int Decoder::readCallback(uint8_t* buf, int size) {
-  return seekableBuffer_.read(buf, size, params_.timeoutMs);
-}
-
-int64_t Decoder::seekCallback(int64_t offset, int whence) {
-  return seekableBuffer_.seek(offset, whence, params_.timeoutMs);
-}
-
-/* static */
-void Decoder::initOnce() {
-  static std::once_flag flagInit;
-  std::call_once(flagInit, []() {
-#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0
-    av_register_all();
-    avcodec_register_all();
-#endif
-    avformat_network_init();
-    av_log_set_callback(Decoder::logFunction);
-    av_log_set_level(AV_LOG_ERROR);
-    VLOG(1) << "Registered ffmpeg libs";
-  });
-}
-
-Decoder::Decoder() {
-  initOnce();
-}
-
-Decoder::~Decoder() {
-  cleanUp();
-}
-
-// Initialise the format context that holds information about the container and
-// fill it with minimal information about the format (codecs are not opened
-// here). Function reads in information about the streams from the container
-// into inputCtx and then passes it to decoder::openStreams. Finally, if seek is
-// specified within the decoder parameters, it seeks into the correct frame
-// (note, the seek defined here is "precise" seek).
-bool Decoder::init(
-    const DecoderParameters& params,
-    DecoderInCallback&& in,
-    std::vector<DecoderMetadata>* metadata) {
-  cleanUp();
-
-  if ((params.uri.empty() || in) && (!params.uri.empty() || !in)) {
-    LOG(ERROR)
-        << "uuid=" << params_.loggingUuid
-        << " either external URI gets provided or explicit input callback";
-    return false;
-  }
-
-  // set callback and params
-  params_ = params;
-
-  if (!(inputCtx_ = avformat_alloc_context())) {
-    LOG(ERROR) << "uuid=" << params_.loggingUuid
-               << " cannot allocate format context";
-    return false;
-  }
-
-  AVInputFormat* fmt = nullptr;
-  int result = 0;
-  if (in) {
-    ImageType type = ImageType::UNKNOWN;
-    if ((result = seekableBuffer_.init(
-             std::forward<DecoderInCallback>(in),
-             params_.timeoutMs,
-             params_.maxSeekableBytes,
-             params_.isImage ? &type : nullptr)) < 0) {
-      LOG(ERROR) << "uuid=" << params_.loggingUuid
-                 << " can't initiate seekable buffer";
-      cleanUp();
-      return false;
-    }
-
-    if (params_.isImage) {
-      const char* fmtName = "image2";
-      switch (type) {
-        case ImageType::JPEG:
-          fmtName = "jpeg_pipe";
-          break;
-        case ImageType::PNG:
-          fmtName = "png_pipe";
-          break;
-        case ImageType::TIFF:
-          fmtName = "tiff_pipe";
-          break;
-        default:
-          break;
-      }
-
-      fmt = (AVInputFormat*)av_find_input_format(fmtName);
-    }
-
-    const size_t avioCtxBufferSize = kIoBufferSize;
-    uint8_t* avioCtxBuffer =
-        (uint8_t*)av_malloc(avioCtxBufferSize + kIoPaddingSize);
-    if (!avioCtxBuffer) {
-      LOG(ERROR) << "uuid=" << params_.loggingUuid
-                 << " av_malloc cannot allocate " << avioCtxBufferSize
-                 << " bytes";
-      cleanUp();
-      return false;
-    }
-
-    if (!(avioCtx_ = avio_alloc_context(
-              avioCtxBuffer,
-              avioCtxBufferSize,
-              0,
-              reinterpret_cast<void*>(this),
-              &Decoder::readFunction,
-              nullptr,
-              result == 1 ? &Decoder::seekFunction : nullptr))) {
-      LOG(ERROR) << "uuid=" << params_.loggingUuid
-                 << " avio_alloc_context failed";
-      av_free(avioCtxBuffer);
-      cleanUp();
-      return false;
-    }
-
-    avioCtx_->max_packet_size = params.maxEncodedBufferSize;
-
-    inputCtx_->pb = avioCtx_;
-    inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO;
-  }
-
-  inputCtx_->opaque = reinterpret_cast<void*>(this);
-  inputCtx_->interrupt_callback.callback = Decoder::shutdownFunction;
-  inputCtx_->interrupt_callback.opaque = reinterpret_cast<void*>(this);
-
-  // add network timeout
-  inputCtx_->flags |= AVFMT_FLAG_NONBLOCK;
-
-  AVDictionary* options = nullptr;
-  if (params_.listen) {
-    av_dict_set_int(&options, "listen", 1, 0);
-  }
-  if (params_.timeoutMs > 0) {
-    av_dict_set_int(&options, "analyzeduration", params_.timeoutMs * 1000, 0);
-    av_dict_set_int(&options, "stimeout", params_.timeoutMs * 1000, 0);
-    av_dict_set_int(&options, "rw_timeout", params_.timeoutMs * 1000, 0);
-    if (!params_.tlsCertFile.empty()) {
-      av_dict_set(&options, "cert_file", params_.tlsCertFile.data(), 0);
-    }
-    if (!params_.tlsKeyFile.empty()) {
-      av_dict_set(&options, "key_file", params_.tlsKeyFile.data(), 0);
-    }
-  }
-
-  av_dict_set_int(&options, "probesize", params_.probeSize, 0);
-
-  interrupted_ = false;
-
-  // ffmpeg avformat_open_input call can hang if media source doesn't respond
-  // set a guard for handle such situations, if requested
-  std::promise<bool> p;
-  std::future<bool> f = p.get_future();
-  std::unique_ptr<std::thread> guard;
-  if (params_.preventStaleness) {
-    guard = std::make_unique<std::thread>([&f, this]() {
-      auto timeout = std::chrono::milliseconds(params_.timeoutMs);
-      if (std::future_status::timeout == f.wait_for(timeout)) {
-        LOG(ERROR) << "uuid=" << params_.loggingUuid
-                   << " cannot open stream within " << params_.timeoutMs
-                   << " ms";
-        interrupted_ = true;
-      }
-    });
-  }
-
-  if (fmt) {
-    result = avformat_open_input(&inputCtx_, nullptr, fmt, &options);
-  } else {
-    result =
-        avformat_open_input(&inputCtx_, params_.uri.c_str(), nullptr, &options);
-  }
-
-  av_dict_free(&options);
-
-  if (guard) {
-    p.set_value(true);
-    guard->join();
-    guard.reset();
-  }
-
-  if (result < 0 || interrupted_) {
-    LOG(ERROR) << "uuid=" << params_.loggingUuid
-               << " avformat_open_input failed, error="
-               << Util::generateErrorDesc(result);
-    cleanUp();
-    return false;
-  }
-
-  result = avformat_find_stream_info(inputCtx_, nullptr);
-
-  if (result < 0) {
-    LOG(ERROR) << "uuid=" << params_.loggingUuid
-               << " avformat_find_stream_info failed, error="
-               << Util::generateErrorDesc(result);
-    cleanUp();
-    return false;
-  }
-
-  if (!openStreams(metadata)) {
-    LOG(ERROR) << "uuid=" << params_.loggingUuid << " cannot activate streams";
-    cleanUp();
-    return false;
-  }
-  // SyncDecoder inherits Decoder which would override onInit.
-  onInit();
-
-  if (params.startOffset != 0) {
-    auto offset = params.startOffset <= params.seekAccuracy
-        ? 0
-        : params.startOffset - params.seekAccuracy;
-
-    av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD);
-  }
-
-  for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) {
-    if (
-#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0
-        inputCtx_->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO
-#else // FFMPEG 4.0+
-        inputCtx_->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO
-#endif
-        && inputCtx_->streams[i]->duration > 0) {
-      // There is at least two 1/r_frame_rates from the frame before the last
-      // one until the video duration, let's prefer to set duration after the
-      // frame before the last one, but as early as possible
-      double correction = 2 * inputCtx_->streams[i]->r_frame_rate.den /
-              (double)inputCtx_->streams[i]->r_frame_rate.num -
-          1 / (double)AV_TIME_BASE;
-      videoDurationMs_ = 1000 * inputCtx_->streams[i]->duration *
-              inputCtx_->streams[i]->time_base.num /
-              (double)inputCtx_->streams[i]->time_base.den -
-          1000 * correction;
-      break;
-    }
-  }
-
-  VLOG(1) << "Decoder initialized, log level: " << params_.logLevel;
-  VLOG(1) << "Video duration: " << videoDurationMs_;
-  return true;
-}
-
-// Opens a codec for every stream the caller is interested in (see
-// params_.formats), moves the resulting stream into the class variable
-// `streams_` and marks it as in range for decoding.
-//
-// @param metadata forwarded to each opened stream's openCodec() call.
-// @return false as soon as one codec fails to open, true otherwise.
-bool Decoder::openStreams(std::vector<DecoderMetadata>* metadata) {
-  for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) {
-    // - find the corresponding format in the params_.formats set
-    MediaFormat format;
-#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0
-    const auto media = inputCtx_->streams[i]->codec->codec_type;
-#else // FFMPEG 4.0+
-    const auto media = inputCtx_->streams[i]->codecpar->codec_type;
-#endif
-    if (!mapFfmpegType(media, &format.type)) {
-      VLOG(1) << "Stream media: " << media << " at index " << i
-              << " gets ignored, unknown type";
-
-      continue; // unsupported type
-    }
-
-    // check format
-    auto it = params_.formats.find(format);
-    if (it == params_.formats.end()) {
-      VLOG(1) << "Stream type: " << format.type << " at index: " << i
-              << " gets ignored, caller is not interested";
-      continue; // clients don't care about this media format
-    }
-
-    // do we have stream of this type?
-    auto stream = findByType(format);
-
-    // should we process this stream?
-
-    if (it->stream == -2 || // all streams of this type are welcome
-        (!stream && (it->stream == -1 || it->stream == i))) { // new stream
-      // inRange_ is a fixed-size bitset indexed by stream index and its
-      // set()/test() throw std::out_of_range past that size, so a stream
-      // that cannot be tracked is skipped before any codec gets opened.
-      if (i >= inRange_.size()) {
-        LOG(ERROR) << "uuid=" << params_.loggingUuid << " stream_idx=" << i
-                   << " exceeds the number of trackable streams ("
-                   << inRange_.size() << "), stream gets ignored";
-        continue;
-      }
-      VLOG(1) << "Stream type: " << format.type << " found, at index: " << i;
-      auto stream_2 = createStream(
-          format.type,
-          inputCtx_,
-          i,
-          params_.convertPtsToWallTime,
-          it->format,
-          params_.loggingUuid);
-      CHECK(stream_2);
-      if (stream_2->openCodec(metadata, params_.numThreads) < 0) {
-        LOG(ERROR) << "uuid=" << params_.loggingUuid
-                   << " open codec failed, stream_idx=" << i;
-        return false;
-      }
-      streams_.emplace(i, std::move(stream_2));
-      inRange_.set(i, true);
-    }
-  }
-
-  return true;
-}
-
-// MediaDecoder override: releases everything the decoder holds by delegating
-// to cleanUp().
-void Decoder::shutdown() {
-  cleanUp();
-}
-
-// MediaDecoder override: raises the interrupted_ flag, which the getFrame()
-// loop checks on every iteration.
-void Decoder::interrupt() {
-  interrupted_ = true;
-}
-
-// Marks the decoder as interrupted, drains and destroys all streams, closes
-// the input context, frees the custom AVIO context (if any) and shuts the
-// seekable buffer down.
-void Decoder::cleanUp() {
-  interrupted_ = true;
-
-  if (inputCtx_) {
-    for (auto& entry : streams_) {
-      // Drain stream buffers; payload is cleared before every flush call.
-      DecoderOutputMessage scratch;
-      do {
-        scratch.payload = nullptr;
-      } while (entry.second->flush(&scratch, true) > 0);
-      entry.second.reset();
-    }
-    streams_.clear();
-    avformat_close_input(&inputCtx_);
-  }
-
-  if (avioCtx_) {
-    // the buffer is released first, then the context itself
-    av_freep(&avioCtx_->buffer);
-    av_freep(&avioCtx_);
-  }
-
-  // reset callback
-  seekableBuffer_.shutdown();
-}
-
-// Function does the actual work; derived class calls it in a working thread
-// periodically. Reads packets and decodes them until one message has been
-// pushed downstream, the working time is exhausted, or an exit condition is
-// hit.
-//
-// @param workingTimeInMs time budget for this call, in milliseconds.
-// @return 0 on success, ENODATA on EOF or when no stream is in range,
-//   ETIMEDOUT if no frame got decoded within the budget, EINTR when the
-//   decoder was interrupted, AVERROR_BUFFER_TOO_SMALL when unable to allocate
-//   the packet, and a negative FFMPEG error code on unrecoverable error.
-int Decoder::getFrame(size_t workingTimeInMs) {
-  if (inRange_.none()) {
-    return ENODATA;
-  }
-  // decode frames until cache is full and leave thread
-  // once decode() method gets called and grab some bytes
-  // run this method again
-  // init package
-  // update 03/22: moving memory management to ffmpeg
-  AVPacket* avPacket;
-  avPacket = av_packet_alloc();
-  if (avPacket == nullptr) {
-    LOG(ERROR) << "uuid=" << params_.loggingUuid
-               << " decoder as not able to allocate the packet.";
-    return AVERROR_BUFFER_TOO_SMALL;
-  }
-  avPacket->data = nullptr;
-  avPacket->size = 0;
-
-  // deadline for this call
-  auto end = std::chrono::steady_clock::now() +
-      std::chrono::milliseconds(workingTimeInMs);
-  // return true if elapsed time less than timeout
-  auto watcher = [end]() -> bool {
-    return std::chrono::steady_clock::now() <= end;
-  };
-
-  int result = 0;
-  size_t decodingErrors = 0;
-  bool decodedFrame = false;
-  while (!interrupted_ && inRange_.any() && !decodedFrame) {
-    if (watcher() == false) {
-      LOG(ERROR) << "uuid=" << params_.loggingUuid << " hit ETIMEDOUT";
-      result = ETIMEDOUT;
-      break;
-    }
-    result = av_read_frame(inputCtx_, avPacket);
-    if (result == AVERROR(EAGAIN)) {
-      VLOG(4) << "Decoder is busy...";
-      std::this_thread::yield();
-      result = 0; // reset error, EAGAIN is not an error at all
-      // reset the packet to default settings
-      av_packet_unref(avPacket);
-      continue;
-    } else if (result == AVERROR_EOF) {
-      // end of input: emit whatever the streams still buffer, then stop
-      flushStreams();
-      VLOG(1) << "End of stream";
-      result = ENODATA;
-      break;
-    } else if (
-        result == AVERROR(EPERM) && params_.skipOperationNotPermittedPackets) {
-      // reset error, lets skip packets with EPERM
-      result = 0;
-      // reset the packet to default settings
-      av_packet_unref(avPacket);
-      continue;
-    } else if (result < 0) {
-      // any other read error is unrecoverable; result keeps the error code
-      flushStreams();
-      LOG(ERROR) << "uuid=" << params_.loggingUuid
-                 << " error detected: " << Util::generateErrorDesc(result);
-      break;
-    }
-
-    // get stream; if stream cannot be found reset the packet to
-    // default settings
-    auto stream = findByIndex(avPacket->stream_index);
-    if (stream == nullptr || !inRange_.test(stream->getIndex())) {
-      av_packet_unref(avPacket);
-      continue;
-    }
-
-    size_t numConsecutiveNoBytes = 0;
-    // it can be only partial decoding of the package bytes
-    do {
-      // decode package
-      bool gotFrame = false;
-      bool hasMsg = false;
-      // packet either got consumed completely or not at all
-      if ((result = processPacket(
-               stream, avPacket, &gotFrame, &hasMsg, params_.fastSeek)) < 0) {
-        LOG(ERROR) << "uuid=" << params_.loggingUuid
-                   << " processPacket failed with code: " << result;
-        break;
-      }
-
-      // give up on this packet after too many rounds without a frame
-      if (!gotFrame && params_.maxProcessNoBytes != 0 &&
-          ++numConsecutiveNoBytes > params_.maxProcessNoBytes) {
-        LOG(ERROR) << "uuid=" << params_.loggingUuid
-                   << " exceeding max amount of consecutive no bytes";
-        break;
-      }
-      if (result > 0) {
-        numConsecutiveNoBytes = 0;
-      }
-
-      decodedFrame |= hasMsg;
-    } while (result == 0);
-
-    // post loop check
-    if (result < 0) {
-      if (params_.maxPackageErrors != 0 && // check errors
-          ++decodingErrors >= params_.maxPackageErrors) { // reached the limit
-        LOG(ERROR) << "uuid=" << params_.loggingUuid
-                   << " exceeding max amount of consecutive package errors";
-        break;
-      }
-    } else {
-      decodingErrors = 0; // reset on success
-    }
-
-    // a failed packet below the error limit is not reported to the caller
-    result = 0;
-
-    av_packet_unref(avPacket);
-
-    // uniform sampling: pushMsg() raises doSeek_ after it emitted a sample,
-    // the seek below then targets the range of the next sampling step
-    if (params_.uniformSampling > 1) {
-      if (doSeek_) {
-        double duration =
-            videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration;
-        double step =
-            (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1));
-        avformat_seek_file(
-            inputCtx_,
-            -1,
-            static_cast<int64_t>(step * kFramesDecoded_) + 1,
-            static_cast<int64_t>(step * (kFramesDecoded_ + 1)),
-            static_cast<int64_t>(step * (kFramesDecoded_ + 1)),
-            0);
-        ++kFramesDecoded_;
-        doSeek_ = false;
-      }
-    }
-  }
-
-  av_packet_free(&avPacket);
-  VLOG(2) << "Interrupted loop" << ", interrupted_ " << interrupted_
-          << ", inRange_.any() " << inRange_.any() << ", decodedFrame "
-          << decodedFrame << ", result " << result;
-
-  // loop can be terminated, either by:
-  // 1. explicitly interrupted
-  // 2. unrecoverable error or ENODATA (end of stream) or ETIMEDOUT (timeout)
-  // 3. decoded frames pts are out of the specified range
-  // 4. success decoded frame
-  if (interrupted_) {
-    return EINTR;
-  }
-  if (result != 0) {
-    return result;
-  }
-  if (inRange_.none()) {
-    return ENODATA;
-  }
-  return 0;
-}
-
-// Looks a stream up by its stream index; returns nullptr when no stream with
-// that index has been opened.
-Stream* Decoder::findByIndex(int streamIndex) const {
-  const auto found = streams_.find(streamIndex);
-  if (found == streams_.end()) {
-    return nullptr;
-  }
-  return found->second.get();
-}
-
-// Looks a stream up by media type. Only the first match met while iterating
-// `streams_` is returned; nullptr when no stream has that type.
-Stream* Decoder::findByType(const MediaFormat& format) const {
-  for (auto it = streams_.begin(); it != streams_.end(); ++it) {
-    Stream* candidate = it->second.get();
-    if (candidate->getMediaFormat().type == format.type) {
-      return candidate;
-    }
-  }
-  return nullptr;
-}
-
-// Decodes the packet through stream->decodePacket() into a
-// DecoderOutputMessage and, when a frame was produced inside the requested
-// [startOffset, endOffset] window, hands the message to pushMsg().
-//
-// @param gotFrame set by decodePacket(), tells whether a frame was decoded.
-// @param hasMsg set to true only if the message was passed downstream.
-// @param fastSeek when true the start offset check is skipped, i.e. the
-//   first frame decoded after a (potential) seek is returned; otherwise
-//   frames before params_.startOffset are dropped.
-// @return the value returned by decodePacket().
-int Decoder::processPacket(
-    Stream* stream,
-    AVPacket* packet,
-    bool* gotFrame,
-    bool* hasMsg,
-    bool fastSeek) {
-  DecoderOutputMessage msg;
-  if (!params_.headerOnly) {
-    msg.payload = createByteStorage(0);
-  }
-  *hasMsg = false;
-
-  const int status =
-      stream->decodePacket(packet, &msg, params_.headerOnly, gotFrame);
-  if (status < 0 || !*gotFrame) {
-    return status;
-  }
-
-  // check end offset and record whether the stream is still in range
-  const bool beforeEnd =
-      params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
-  inRange_.set(stream->getIndex(), beforeEnd);
-
-  const bool afterStart = fastSeek || msg.header.pts >= params_.startOffset;
-  if (beforeEnd && afterStart) {
-    *hasMsg = pushMsg(std::move(msg));
-  }
-  return status;
-}
-
-// Records the pts of the message and forwards it to push(). With uniform
-// sampling enabled (params_.uniformSampling > 1) the message is forwarded
-// only when its pts crosses the next sampling point, and a seek is requested
-// through doSeek_.
-//
-// @return whether the message was passed downstream.
-bool Decoder::pushMsg(DecoderOutputMessage&& msg) {
-  pastDecodedPTS_ = currentDecodedPTS_;
-  currentDecodedPTS_ = msg.header.pts;
-
-  const bool sampling = params_.uniformSampling > 1;
-  if (sampling) {
-    const double duration =
-        videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration;
-    const double step =
-        (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1));
-    const bool crossed = pastDecodedPTS_ < step * kFramesDecoded_ &&
-        step * kFramesDecoded_ <= currentDecodedPTS_;
-    if (!crossed) {
-      return false;
-    }
-  }
-
-  push(std::move(msg));
-  if (sampling) {
-    doSeek_ = true;
-  }
-  return true;
-}
-
-// Flushes every stream until it has nothing left to emit; each flushed
-// message inside the [startOffset, endOffset] window goes to pushMsg().
-void Decoder::flushStreams() {
-  VLOG(1) << "Flushing streams...";
-  for (auto& entry : streams_) {
-    auto& stream = entry.second;
-    DecoderOutputMessage msg;
-    for (;;) {
-      // a fresh payload is set up before every flush call
-      msg.payload = params_.headerOnly ? nullptr : createByteStorage(0);
-      if (stream->flush(&msg, params_.headerOnly) <= 0) {
-        break;
-      }
-      // check end offset
-      const bool beforeEnd =
-          params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
-      inRange_.set(stream->getIndex(), beforeEnd);
-      if (beforeEnd && msg.header.pts >= params_.startOffset) {
-        pushMsg(std::move(msg));
-      } else {
-        msg.payload.reset();
-      }
-    }
-  }
-}
-
-// Keeps calling decode() with params_.timeoutMs and hands every decoded
-// message to the callback; returns the first non-zero code from decode().
-int Decoder::decode_all(const DecoderOutCallback& callback) {
-  for (;;) {
-    DecoderOutputMessage out;
-    const int status = decode(&out, params_.timeoutMs);
-    if (status != 0) {
-      return status;
-    }
-    callback(std::move(out));
-  }
-}
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/decoder.h b/torchvision/csrc/io/decoder/decoder.h
deleted file mode 100644
index 172a011f93e..00000000000
--- a/torchvision/csrc/io/decoder/decoder.h
+++ /dev/null
@@ -1,100 +0,0 @@
-#pragma once
-
-#include <bitset>
-#include <unordered_map>
-#include "seekable_buffer.h"
-#include "stream.h"
-
-#if defined(_MSC_VER)
-#include <BaseTsd.h>
-using ssize_t = SSIZE_T;
-#endif
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode media streams.
- * Media bytes can be explicitly provided through read-callback
- * or fetched internally by FFMPEG library.
- * decode() and createByteStorage() of MediaDecoder are not overridden here,
- * and push() is pure virtual, so the class stays abstract.
- */
-class Decoder : public MediaDecoder {
- public:
-  Decoder();
-  ~Decoder() override;
-
-  // MediaDecoder overrides
-  bool init(
-      const DecoderParameters& params,
-      DecoderInCallback&& in,
-      std::vector<DecoderMetadata>* metadata) override;
-  int decode_all(const DecoderOutCallback& callback) override;
-  void shutdown() override;
-  void interrupt() override;
-
- protected:
-  // function does actual work, derived class calls it in working thread
-  // periodically. On success method returns 0, ENODATA on EOF, ETIMEDOUT if
-  // no frames got decoded in the specified timeout time, and error on
-  // unrecoverable error.
-  int getFrame(size_t workingTimeInMs = 100);
-
-  // Derived class must override method and consume the provided message
-  virtual void push(DecoderOutputMessage&& buffer) = 0;
-
-  // Fires on init call
-  virtual void onInit() {}
-
- public:
-  // C-style FFMPEG API requires C/static methods for callbacks
-  static void logFunction(void* avcl, int level, const char* cfmt, va_list vl);
-  static int shutdownFunction(void* ctx);
-  static int readFunction(void* opaque, uint8_t* buf, int size);
-  static int64_t seekFunction(void* opaque, int64_t offset, int whence);
-  // can be called by any classes or API
-  static void initOnce();
-
-  // exposes the address of the printPrefix member
-  int* getPrintPrefix() {
-    return &printPrefix;
-  }
-  // video duration in milliseconds, stays -1 until init() computes it
-  double videoDurationMs_ = -1;
-
- private:
-  // mark below function for a proper invocation
-  bool enableLogLevel(int level) const;
-  void logCallback(int level, const std::string& message);
-  int readCallback(uint8_t* buf, int size);
-  int64_t seekCallback(int64_t offset, int whence);
-  int shutdownCallback();
-
-  bool openStreams(std::vector<DecoderMetadata>* metadata);
-  Stream* findByIndex(int streamIndex) const;
-  Stream* findByType(const MediaFormat& format) const;
-  int processPacket(
-      Stream* stream,
-      AVPacket* packet,
-      bool* gotFrame,
-      bool* hasMsg,
-      bool fastSeek = false);
-  void flushStreams();
-  void cleanUp();
-  bool pushMsg(DecoderOutputMessage&&
-                   msg); // returns whether frame is passed to downstream
-
- protected:
-  DecoderParameters params_;
-
- private:
-  SeekableBuffer seekableBuffer_;
-  int printPrefix{1};
-
-  // set by interrupt(), cleanUp() and the init() staleness guard
-  std::atomic<bool> interrupted_{false};
-  AVFormatContext* inputCtx_{nullptr};
-  AVIOContext* avioCtx_{nullptr};
-  // opened streams, keyed by stream index
-  std::unordered_map<ssize_t, std::unique_ptr<Stream>> streams_;
-  // one bit per stream index: set while the stream is within the end offset
-  std::bitset<64> inRange_;
-  // uniform sampling state: number of sampling steps already sought past
-  int kFramesDecoded_{0};
-  // pts of the previous and of the latest message seen by pushMsg()
-  int64_t pastDecodedPTS_{-1};
-  int64_t currentDecodedPTS_{-1};
-  // raised by pushMsg() to request the next uniform sampling seek
-  bool doSeek_{false};
-};
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h
deleted file mode 100644
index d2dc5c7935b..00000000000
--- a/torchvision/csrc/io/decoder/defs.h
+++ /dev/null
@@ -1,415 +0,0 @@
-#pragma once
-
-#include <array>
-#include <functional>
-#include <memory>
-#include <set>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-extern "C" {
-#include <libavcodec/avcodec.h>
-#include <libavformat/avformat.h>
-#include <libavformat/avio.h>
-#include <libavutil/avutil.h>
-#include <libavutil/imgutils.h>
-#include <libswresample/swresample.h>
-#include "libswscale/swscale.h"
-}
-
-namespace ffmpeg {
-
-// Media types handled by the decoder.
-// bit mask of formats, keep them in form 2^n
-enum MediaType : size_t {
-  TYPE_AUDIO = 1,
-  TYPE_VIDEO = 2,
-  TYPE_SUBTITLE = 4,
-  TYPE_CC = 8, // closed captions from transport streams
-};
-
-// audio
-struct AudioFormat {
-  // fields are initialized for the auto detection
-  // caller can specify some/all of field values if specific output is desirable
-  // equality compares format, samples and channels; padding is ignored
-  bool operator==(const AudioFormat& x) const {
-    return x.format == format && x.samples == samples && x.channels == channels;
-  }
-
-  size_t samples{0}; // number samples per second (frequency)
-  size_t channels{0}; // number of channels
-  long format{-1}; // AVSampleFormat, auto AV_SAMPLE_FMT_NONE
-  size_t padding[2]; // filler only, has no initializer
-  // -- alignment 40 bytes
-};
-
-// video
-struct VideoFormat {
-  // fields are initialized for the auto detection
-  // caller can specify some/all of field values if specific output is desirable
-  // equality compares format, width and height only; minDimension,
-  // maxDimension and cropImage do not take part in the comparison
-  bool operator==(const VideoFormat& x) const {
-    return x.format == format && x.width == width && x.height == height;
-  }
-  /*
-  When width = 0, height = 0, minDimension = 0, and maxDimension = 0,
-    keep the original frame resolution
-  When width = 0, height = 0, minDimension != 0, and maxDimension = 0,
-    keep the aspect ratio and resize the frame so that shorter edge size is
-  minDimension
-  When width = 0, height = 0, minDimension = 0, and maxDimension != 0,
-    keep the aspect ratio and resize the frame so that longer edge size is
-  maxDimension
-  When width = 0, height = 0, minDimension != 0, and maxDimension != 0,
-    resize the frame so that shorter edge size is minDimension, and
-    longer edge size is maxDimension. The aspect ratio may not be preserved
-  When width = 0, height != 0, minDimension = 0, and maxDimension = 0,
-    keep the aspect ratio and resize the frame so that frame height is $height
-  When width != 0, height = 0, minDimension = 0, and maxDimension = 0,
-    keep the aspect ratio and resize the frame so that frame width is $width
-  When width != 0, height != 0, minDimension = 0, and maxDimension = 0,
-    resize the frame so that frame width and height are set to $width and
-  $height,
-    respectively
-  */
-  size_t width{0}; // width in pixels
-  size_t height{0}; // height in pixels
-  long format{-1}; // AVPixelFormat, auto AV_PIX_FMT_NONE
-  size_t minDimension{0}; // choose min dimension and rescale accordingly
-  size_t maxDimension{0}; // choose max dimension and rescale accordingly
-  size_t cropImage{0}; // request image crop
-  // -- alignment 40 bytes
-  // NOTE(review): six members are declared here, the 40 bytes figure looks
-  // stale - confirm before relying on it
-};
-
-// subtitle/cc
-struct SubtitleFormat {
-  long type{0}; // AVSubtitleType, auto SUBTITLE_NONE
-  size_t padding[4]; // filler only, has no initializer
-  // -- alignment 40 bytes
-};
-
-// Storage for exactly one of the per-media formats. The type of the
-// constructor argument only selects which member gets initialized, its value
-// is not used: no argument -> audio, int -> video, char or double -> subtitle.
-union FormatUnion {
-  FormatUnion() : audio() {}
-  explicit FormatUnion(int) : video() {}
-  explicit FormatUnion(char) : subtitle() {}
-  explicit FormatUnion(double) : subtitle() {}
-  AudioFormat audio;
-  VideoFormat video;
-  SubtitleFormat subtitle;
-  // -- alignment 40 bytes
-};
-
-/*
-  MediaFormat data structure serves as input/output parameter.
-  Caller assigns values for input formats
-  or leave default values for auto detection
-  For output formats all fields will be set to the specific values
-*/
-struct MediaFormat {
-  // for using map/set data structures
-  // ordering looks at the media type only, so an ordered set of MediaFormat
-  // holds at most one entry per type
-  bool operator<(const MediaFormat& x) const {
-    return type < x.type;
-  }
-  // equality: same type and, for audio/video, equal format; subtitle and cc
-  // formats of the same type always compare equal. `stream` is not compared.
-  bool operator==(const MediaFormat& x) const {
-    if (type != x.type) {
-      return false;
-    }
-    switch (type) {
-      case TYPE_AUDIO:
-        return format.audio == x.format.audio;
-      case TYPE_VIDEO:
-        return format.video == x.format.video;
-      case TYPE_SUBTITLE:
-      case TYPE_CC:
-        return true;
-      default:
-        return false;
-    }
-  }
-
-  // The type of the first argument selects the media type, its value is
-  // ignored: (none) -> audio, int -> video, char -> subtitle, double -> cc.
-  // `s` is the stream index, see `stream` below.
-  explicit MediaFormat(long s = -1) : type(TYPE_AUDIO), stream(s), format() {}
-  explicit MediaFormat(int x, long s = -1)
-      : type(TYPE_VIDEO), stream(s), format(x) {}
-  explicit MediaFormat(char x, long s = -1)
-      : type(TYPE_SUBTITLE), stream(s), format(x) {}
-  explicit MediaFormat(double x, long s = -1)
-      : type(TYPE_CC), stream(s), format(x) {}
-
-  // builds an audio MediaFormat carrying the given format and stream index
-  static MediaFormat makeMediaFormat(AudioFormat format, long stream) {
-    MediaFormat result(stream);
-    result.format.audio = format;
-    return result;
-  }
-
-  // builds a video MediaFormat carrying the given format and stream index
-  static MediaFormat makeMediaFormat(VideoFormat format, long stream) {
-    MediaFormat result(0, stream);
-    result.format.video = format;
-    return result;
-  }
-
-  // builds a subtitle MediaFormat carrying the given format and stream index
-  static MediaFormat makeMediaFormat(SubtitleFormat format, long stream) {
-    MediaFormat result('0', stream);
-    result.format.subtitle = format;
-    return result;
-  }
-
-  // format type
-  MediaType type;
-  // stream index:
-  // set -1 for one stream auto detection, -2 for all streams auto detection,
-  // >= 0, specified stream, if caller knows the stream index (unlikely)
-  long stream;
-  // union keeps one of the possible formats, defined by MediaType
-  FormatUnion format;
-};
-
-// Settings passed to MediaDecoder::init(); every field has a default.
-struct DecoderParameters {
-  // local file, remote file, http url, rtmp stream uri, etc. anything that
-  // ffmpeg can recognize
-  std::string uri{std::string()};
-  // timeout on getting bytes for decoding
-  size_t timeoutMs{1000};
-  // logging level, default AV_LOG_PANIC
-  long logLevel{0};
-  // when decoder would give up, 0 means never
-  size_t maxPackageErrors{0};
-  // max allowed consecutive times no bytes are processed. 0 means no limit.
-  size_t maxProcessNoBytes{0};
-  // start offset (us)
-  long startOffset{0};
-  // end offset (us)
-  long endOffset{-1};
-  // logging id
-  int64_t loggingUuid{0};
-  // internal max seekable buffer size
-  size_t maxSeekableBytes{0};
-  // adjust header pts to the epoch time
-  bool convertPtsToWallTime{false};
-  // indicate if input stream is an encoded image
-  bool isImage{false};
-  // listen and wait for new rtmp stream
-  bool listen{false};
-  // don't copy frame body, only header
-  bool headerOnly{false};
-  // enable fast seek (seek only to keyframes)
-  bool fastSeek{false};
-  // interrupt init method on timeout
-  bool preventStaleness{true};
-  // seek tolerated accuracy (us)
-  double seekAccuracy{1000000.0};
-  // Allow multithreaded decoding for numThreads > 1;
-  // numThreads=0 sets up sensible defaults
-  int numThreads{1};
-  // what media types should be processed, default none
-  std::set<MediaFormat> formats;
-
-  // can be used for asynchronous decoders
-  size_t cacheSize{8192}; // how many bytes to cache before reading stops
-  size_t cacheTimeoutMs{1000}; // timeout on bytes writing
-  bool enforceCacheSize{false}; // drop output frames if cache is full
-  bool mergeAudioMessages{false}; // combine collocated audio messages together
-
-  // passed to ffmpeg as the "key_file" option when set (tlsKeyFile)
-  std::string tlsCertFile;
-  std::string tlsKeyFile;
-
-  // Skip packets that fail with EPERM errors and continue decoding.
-  bool skipOperationNotPermittedPackets{false};
-
-  // probing size in bytes, i.e. the size of the data to analyze to get stream
-  // information. A higher value will enable detecting more information in case
-  // it is dispersed into the stream, but will increase latency. Must be an
-  // integer not less than 32. It is 5000000 by default.
-  int64_t probeSize{5000000};
-
-  // Expected duration of the video to be decoded, mainly used with uniform
-  // sampling (fallback when the stream duration is not known)
-  float expectedDuration{0.0f};
-
-  // Sample N key-frames from the video roughly uniformly across the timeline
-  int uniformSampling{0};
-
-  // with 0, ffmpeg allocates buffers of size 32768 bytes for encoded frames.
-  // Override this with bigger buffer size if needed.
-  int64_t maxEncodedBufferSize{0};
-};
-
-// Header describing one decoded message, see DecoderOutputMessage.
-struct DecoderHeader {
-  // message id, from 0 till ...
-  size_t seqno{0};
-  // decoded timestamp in microseconds from either beginning of the stream or
-  // from epoch time, see DecoderParameters::convertPtsToWallTime
-  long pts{0};
-  // decoded key frame
-  size_t keyFrame{0};
-  // frames per second, valid only for video streams
-  double fps{0};
-  // format specifies what kind of frame is in a payload
-  MediaFormat format;
-};
-
-// Abstract interface ByteStorage class: a byte buffer with a readable part
-// (data()/length()) and a writable tail (writableTail()/tail()).
-class ByteStorage {
- public:
-  virtual ~ByteStorage() = default;
-  // makes sure that buffer has at least n bytes available for writing, if not
-  // storage must reallocate memory.
-  virtual void ensure(size_t n) = 0;
-  // caller must not write more than the available bytes
-  virtual uint8_t* writableTail() = 0;
-  // caller confirms that n bytes were written to the writable tail
-  virtual void append(size_t n) = 0;
-  // caller confirms that n bytes were read from the read buffer
-  virtual void trim(size_t n) = 0;
-  // gives access to the beginning of the read buffer
-  virtual const uint8_t* data() const = 0;
-  // returns the stored size in bytes
-  virtual size_t length() const = 0;
-  // returns available capacity for writable tail
-  virtual size_t tail() const = 0;
-  // clears content, keeps capacity
-  virtual void clear() = 0;
-};
-
-// One decoded message: its header plus the owned payload bytes. The payload
-// may be null, e.g. the decoder leaves it null in header-only mode.
-struct DecoderOutputMessage {
-  DecoderHeader header;
-  std::unique_ptr<ByteStorage> payload;
-};
-
-/*
- * External provider of the encoded bytes, specific implementation is left for
- * different use cases, like file, memory, external network end-points, etc.
- * Normally input/output parameter @out set to valid, not null buffer pointer,
- * which indicates "read" call, however there are "seek" modes as well.
-
- * @out != nullptr => read from the current offset, @whence got ignored,
- * @size bytes to read => return number bytes got read, 0 if no more bytes
- * available, < 0 on error.
-
- * @out == nullptr, @timeoutMs == 0 => does provider support "seek"
- * capability in a first place? @size & @whence got ignored, return 0 on
- * success, < 0 if "seek" mode is not supported.
-
- * @out == nullptr, @timeoutMs != 0 => normal seek call
- * offset == @size, i.e. @whence = [SEEK_SET, SEEK_CUR, SEEK_END, AVSEEK_SIZE)
- * return < 0 on error, position if @whence = [SEEK_SET, SEEK_CUR, SEEK_END],
- * length of buffer if @whence = [AVSEEK_SIZE].
- */
-using DecoderInCallback =
-    std::function<int(uint8_t* out, int size, int whence, uint64_t timeoutMs)>;
-
-using DecoderOutCallback = std::function<void(DecoderOutputMessage&&)>;
-
-// Per-stream metadata, collected into the vector passed to
-// MediaDecoder::init().
-struct DecoderMetadata {
-  // time base numerator
-  long num{0};
-  // time base denominator
-  long den{1};
-  // duration of the stream, in microseconds, if available
-  long duration{-1};
-  // frames per second, valid only for video streams
-  double fps{0};
-  // format specifies what kind of frame is in a payload
-  MediaFormat format;
-};
-/**
- * Abstract class for decoding media bytes
- * It has two different modes. Internal media bytes retrieval for given uri and
- * external media bytes provider in case of memory streams
- */
-class MediaDecoder {
- public:
-  virtual ~MediaDecoder() = default;
-
-  /**
-   * Initializes media decoder with parameters,
-   * calls callback when media bytes are available.
-   * Media bytes get fetched internally from provided URI
-   * or invokes provided input callback to get media bytes.
-   * Input callback must be empty for the internal media provider.
-   * Caller can provide a non-null pointer to a container
-   * to obtain the streams metadata (optional)
-   */
-  virtual bool init(
-      const DecoderParameters& params,
-      DecoderInCallback&& in,
-      std::vector<DecoderMetadata>* metadata) = 0;
-
-  /**
-   * Polls one available decoded frame from decoder
-   * Returns error code, 0 - for success
-   */
-  virtual int decode(DecoderOutputMessage* out, uint64_t timeoutMs) = 0;
-
-  /**
-   * Polls available decoded bytes from decoder, till EOF or error
-   */
-  virtual int decode_all(const DecoderOutCallback& callback) = 0;
-
-  /**
-   * Stops calling callback, releases resources
-   */
-  virtual void shutdown() = 0;
-
-  /**
-   * Interrupts whatever decoder is doing at any time
-   */
-  virtual void interrupt() = 0;
-
-  /**
-   * Factory to create ByteStorage class instances, particular implementation is
-   * left to the derived class. Caller provides the initially allocated size
-   */
-  virtual std::unique_ptr<ByteStorage> createByteStorage(size_t n) = 0;
-};
-
-// Settings passed to MediaSampler::init().
-struct SamplerParameters {
-  // media type, audio by default
-  MediaType type{TYPE_AUDIO};
-  // input and output formats
-  FormatUnion in;
-  FormatUnion out;
-  // logging id
-  int64_t loggingUuid{0};
-};
-
-/**
- * Abstract class for sampling media bytes.
- * The accessors below read from params_, a protected member of this class.
- */
-class MediaSampler {
- public:
-  virtual ~MediaSampler() = default;
-
-  /**
-   * Initializes media sampler with parameters
-   */
-  virtual bool init(const SamplerParameters& params) = 0;
-
-  /**
-   * Samples media bytes
-   * Returns error code < 0, or >=0 - for success, indicating number of bytes
-   * processed.
-   * set @in to null for flushing data
-   */
-  virtual int sample(const ByteStorage* in, ByteStorage* out) = 0;
-
-  /**
-   * Releases resources
-   */
-  virtual void shutdown() = 0;
-
-  /*
-   * Returns media type
-   */
-  MediaType getMediaType() const {
-    return params_.type;
-  }
-  /*
-   * Returns formats (copies of the input and output format unions)
-   */
-  FormatUnion getInputFormat() const {
-    return params_.in;
-  }
-  FormatUnion getOutFormat() const {
-    return params_.out;
-  }
-
- protected:
-  SamplerParameters params_;
-};
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/memory_buffer.cpp b/torchvision/csrc/io/decoder/memory_buffer.cpp
deleted file mode 100644
index 4e420c3b3cd..00000000000
--- a/torchvision/csrc/io/decoder/memory_buffer.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-#include "memory_buffer.h"
-#include <c10/util/Logging.h>
-
-namespace ffmpeg {
-
-// Wraps an external buffer of `size` bytes. Only the pointer is stored, the
-// bytes are not copied.
-MemoryBuffer::MemoryBuffer(const uint8_t* buffer, size_t size)
-    : buffer_(buffer), len_(size) {}
-
-// Copies up to `size` bytes from the current position into `buf` and
-// advances the position.
-//
-// @param buf destination, must have room for `size` bytes.
-// @param size max number of bytes to copy; non-positive values read nothing.
-// @return number of bytes copied, 0 when the end of the buffer is reached.
-int MemoryBuffer::read(uint8_t* buf, int size) {
-  if (size <= 0 || pos_ >= len_) {
-    return 0;
-  }
-
-  // compare in the wide type first: narrowing the remaining byte count to
-  // int up front would wrap for buffers with more than INT_MAX bytes left
-  const long remaining = len_ - pos_;
-  const int available = remaining < size ? static_cast<int>(remaining) : size;
-  memcpy(buf, buffer_ + pos_, available);
-  pos_ += available;
-  return available;
-}
-
-// Seek callback for the in-memory buffer.
-//
-// @param offset seek offset, interpreted according to `whence`.
-// @param whence SEEK_SET, SEEK_CUR or SEEK_END, optionally combined with
-//   AVSEEK_FORCE (ignored); AVSEEK_SIZE queries the buffer length.
-// @return the buffer length for AVSEEK_SIZE, otherwise the position after
-//   the call. A target outside of [0, len_] leaves the position unchanged.
-int64_t MemoryBuffer::seek(int64_t offset, int whence) {
-  if (whence & AVSEEK_SIZE) {
-    return len_;
-  }
-
-  // remove force flag
-  whence &= ~AVSEEK_FORCE;
-
-  switch (whence) {
-    case SEEK_SET:
-      if (offset >= 0 && offset <= len_) {
-        pos_ = offset;
-      }
-      break;
-    case SEEK_END:
-      if (len_ + offset >= 0 && len_ + offset <= len_) {
-        pos_ = len_ + offset;
-      }
-      break;
-    case SEEK_CUR:
-      // position 0 is a valid target, same as for SEEK_SET and SEEK_END
-      if (pos_ + offset >= 0 && pos_ + offset <= len_) {
-        pos_ += offset;
-      }
-      break;
-    default:
-      LOG(ERROR) << "Unknown whence flag gets provided: " << whence;
-  }
-  return pos_;
-}
-
-/* static */
-// Builds a decoder input callback serving reads and seeks from the given
-// memory. The callback owns its own MemoryBuffer copy (position included);
-// the bytes themselves are only referenced through the pointer.
-DecoderInCallback MemoryBuffer::getCallback(
-    const uint8_t* buffer,
-    size_t size) {
-  MemoryBuffer source(buffer, size);
-  auto callback =
-      [source](uint8_t* out, int count, int whence, uint64_t timeoutMs) mutable
-      -> int {
-    // see defs.h file: non-null out means read mode
-    if (out) {
-      return source.read(out, count);
-    }
-    // seek capability query, yes - supported
-    if (!timeoutMs) {
-      return 0;
-    }
-    // seek mode, the offset arrives through the size argument
-    return source.seek(count, whence);
-  };
-  return callback;
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/memory_buffer.h b/torchvision/csrc/io/decoder/memory_buffer.h
deleted file mode 100644
index 909626d3cae..00000000000
--- a/torchvision/csrc/io/decoder/memory_buffer.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses external memory buffer and implements a seekable interface.
- */
-class MemoryBuffer {
- public:
-  explicit MemoryBuffer(const uint8_t* buffer, size_t size);
-  int64_t seek(int64_t offset, int whence);
-  int read(uint8_t* buf, int size);
-
-  // static constructor for decoder callback.
-  static DecoderInCallback getCallback(const uint8_t* buffer, size_t size);
-
- private:
-  const uint8_t* buffer_; // set at construction time
-  long pos_{0}; // current position
-  long len_{0}; // bytes in buffer
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/seekable_buffer.cpp b/torchvision/csrc/io/decoder/seekable_buffer.cpp
deleted file mode 100644
index 41e3e689c7b..00000000000
--- a/torchvision/csrc/io/decoder/seekable_buffer.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-#include "seekable_buffer.h"
-#include <c10/util/Logging.h>
-#include <chrono>
-#include "memory_buffer.h"
-
-namespace ffmpeg {
-
-int SeekableBuffer::init(
-    DecoderInCallback&& in,
-    uint64_t timeoutMs,
-    size_t maxSeekableBytes,
-    ImageType* type) {
-  shutdown();
-  isSeekable_ = in(nullptr, 0, 0, 0) == 0;
-  if (isSeekable_) { // seekable
-    if (type) {
-      if (!readBytes(in, 8, timeoutMs)) {
-        return -1;
-      }
-      setImageType(type);
-      end_ = 0;
-      eof_ = false;
-      std::vector<uint8_t>().swap(buffer_);
-      // reset callback
-      if (in(nullptr, 0, SEEK_SET, timeoutMs)) {
-        return -1;
-      }
-    }
-    inCallback_ = std::forward<DecoderInCallback>(in);
-    return 1;
-  }
-
-  if (!readBytes(in, maxSeekableBytes + (type ? 8 : 0), timeoutMs)) {
-    return -1;
-  }
-
-  if (type) {
-    setImageType(type);
-  }
-
-  if (eof_) {
-    end_ = 0;
-    eof_ = false;
-    // reuse MemoryBuffer functionality
-    inCallback_ = MemoryBuffer::getCallback(buffer_.data(), buffer_.size());
-    isSeekable_ = true;
-    return 1;
-  }
-  inCallback_ = std::forward<DecoderInCallback>(in);
-  return 0;
-}
-
-bool SeekableBuffer::readBytes(
-    DecoderInCallback& in,
-    size_t maxBytes,
-    uint64_t timeoutMs) {
-  // Resize to th minimum 4K page or less
-  buffer_.resize(std::min(maxBytes, size_t(4 * 1024UL)));
-  end_ = 0;
-  eof_ = false;
-
-  auto end =
-      std::chrono::steady_clock::now() + std::chrono::milliseconds(timeoutMs);
-  auto watcher = [end]() -> bool {
-    return std::chrono::steady_clock::now() <= end;
-  };
-
-  bool hasTime = true;
-  while (!eof_ && end_ < maxBytes && (hasTime = watcher())) {
-    // lets read all bytes into available buffer
-    auto res = in(buffer_.data() + end_, buffer_.size() - end_, 0, timeoutMs);
-    if (res > 0) {
-      end_ += res;
-      if (end_ == buffer_.size()) {
-        buffer_.resize(std::min(size_t(end_ * 4UL), maxBytes));
-      }
-    } else if (res == 0) {
-      eof_ = true;
-    } else {
-      // error
-      return false;
-    }
-  }
-
-  buffer_.resize(end_);
-
-  return hasTime;
-}
-
-void SeekableBuffer::setImageType(ImageType* type) {
-  if (buffer_.size() > 2 && buffer_[0] == 0xFF && buffer_[1] == 0xD8 &&
-      buffer_[2] == 0xFF) {
-    *type = ImageType::JPEG;
-  } else if (
-      buffer_.size() > 3 && buffer_[1] == 'P' && buffer_[2] == 'N' &&
-      buffer_[3] == 'G') {
-    *type = ImageType::PNG;
-  } else if (
-      buffer_.size() > 1 &&
-      ((buffer_[0] == 0x49 && buffer_[1] == 0x49) ||
-       (buffer_[0] == 0x4D && buffer_[1] == 0x4D))) {
-    *type = ImageType::TIFF;
-  } else {
-    *type = ImageType::UNKNOWN;
-  }
-}
-
-int SeekableBuffer::read(uint8_t* buf, int size, uint64_t timeoutMs) {
-  if (isSeekable_) {
-    return inCallback_(buf, size, 0, timeoutMs);
-  }
-  if (pos_ < end_) {
-    // read cached bytes for non-seekable callback
-    auto available = std::min(int(end_ - pos_), size);
-    memcpy(buf, buffer_.data() + pos_, available);
-    pos_ += available;
-    return available;
-  } else if (!eof_) {
-    // normal sequential read (see defs.h file), i.e. @buf != null
-    auto res = inCallback_(buf, size, 0, timeoutMs); // read through
-    eof_ = res == 0;
-    return res;
-  } else {
-    return 0;
-  }
-}
-
-int64_t SeekableBuffer::seek(int64_t offset, int whence, uint64_t timeoutMs) {
-  return inCallback_(nullptr, offset, whence, timeoutMs);
-}
-
-void SeekableBuffer::shutdown() {
-  pos_ = end_ = 0;
-  eof_ = false;
-  std::vector<uint8_t>().swap(buffer_);
-  inCallback_ = nullptr;
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/seekable_buffer.h b/torchvision/csrc/io/decoder/seekable_buffer.h
deleted file mode 100644
index 9d5729f5306..00000000000
--- a/torchvision/csrc/io/decoder/seekable_buffer.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses internal buffer to store initial size bytes as a seekable cache
- * from Media provider and let ffmpeg to seek and read bytes from cache
- * and beyond - reading bytes directly from Media provider
- */
-enum class ImageType {
-  UNKNOWN = 0,
-  JPEG = 1,
-  PNG = 2,
-  TIFF = 3,
-};
-
-class SeekableBuffer {
- public:
-  // @type is optional, not nullptr only is image detection required
-  // \returns 1 is buffer seekable, 0 - if not seekable, < 0 on error
-  int init(
-      DecoderInCallback&& in,
-      uint64_t timeoutMs,
-      size_t maxSeekableBytes,
-      ImageType* type);
-  int read(uint8_t* buf, int size, uint64_t timeoutMs);
-  int64_t seek(int64_t offset, int whence, uint64_t timeoutMs);
-  void shutdown();
-
- private:
-  bool readBytes(DecoderInCallback& in, size_t maxBytes, uint64_t timeoutMs);
-  void setImageType(ImageType* type);
-
- private:
-  DecoderInCallback inCallback_;
-  std::vector<uint8_t> buffer_; // resized at init time
-  long pos_{0}; // current position (SEEK_CUR iff pos_ < end_)
-  long end_{0}; // current buffer size
-  bool eof_{0}; // indicates the EOF
-  bool isSeekable_{false}; // is callback seekable
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/stream.cpp b/torchvision/csrc/io/decoder/stream.cpp
deleted file mode 100644
index 7969741e72c..00000000000
--- a/torchvision/csrc/io/decoder/stream.cpp
+++ /dev/null
@@ -1,288 +0,0 @@
-#include "stream.h"
-#include <c10/util/Logging.h>
-#include <string.h>
-#include "util.h"
-
-namespace ffmpeg {
-const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
-
-Stream::Stream(
-    AVFormatContext* inputCtx,
-    MediaFormat format,
-    bool convertPtsToWallTime,
-    int64_t loggingUuid)
-    : inputCtx_(inputCtx),
-      format_(format),
-      convertPtsToWallTime_(convertPtsToWallTime),
-      loggingUuid_(loggingUuid) {}
-
-Stream::~Stream() {
-  if (frame_) {
-    av_free(frame_);
-  }
-  if (codecCtx_) {
-    avcodec_free_context(&codecCtx_);
-  }
-}
-
-// look up the proper CODEC querying the function
-AVCodec* Stream::findCodec(AVCodecParameters* params) {
-  return (AVCodec*)avcodec_find_decoder(params->codec_id);
-}
-
-// Allocate memory for the AVCodecContext, which will hold the context for
-// decode/encode process. Then fill this codec context with CODEC parameters
-// defined in stream parameters. Open the codec, and allocate the global frame
-// defined in the header file
-int Stream::openCodec(std::vector<DecoderMetadata>* metadata, int num_threads) {
-  AVStream* steam = inputCtx_->streams[format_.stream];
-
-  AVCodec* codec = findCodec(steam->codecpar);
-  if (!codec) {
-    LOG(ERROR) << "LoggingUuid #" << loggingUuid_
-               << ", avcodec_find_decoder failed for codec_id: "
-               << int(steam->codecpar->codec_id);
-    return AVERROR(EINVAL);
-  }
-
-  if (!(codecCtx_ = avcodec_alloc_context3(codec))) {
-    LOG(ERROR) << "LoggingUuid #" << loggingUuid_
-               << ", avcodec_alloc_context3 failed";
-    return AVERROR(ENOMEM);
-  }
-  // multithreading heuristics
-  // if user defined,
-  if (num_threads > max_threads) {
-    num_threads = max_threads;
-  }
-
-  if (num_threads > 0) {
-    // if user defined, respect that
-    // note that default thread_type will be used
-    codecCtx_->thread_count = num_threads;
-  } else {
-    // otherwise set sensible defaults
-    codecCtx_->thread_count = 8;
-    codecCtx_->thread_type = FF_THREAD_SLICE;
-  }
-
-  int ret;
-  // Copy codec parameters from input stream to output codec context
-  if ((ret = avcodec_parameters_to_context(codecCtx_, steam->codecpar)) < 0) {
-    LOG(ERROR) << "LoggingUuid #" << loggingUuid_
-               << ", avcodec_parameters_to_context failed";
-    return ret;
-  }
-
-  // after avcodec_open2, value of codecCtx_->time_base is NOT meaningful
-  if ((ret = avcodec_open2(codecCtx_, codec, nullptr)) < 0) {
-    LOG(ERROR) << "LoggingUuid #" << loggingUuid_
-               << ", avcodec_open2 failed: " << Util::generateErrorDesc(ret);
-    avcodec_free_context(&codecCtx_);
-    codecCtx_ = nullptr;
-    return ret;
-  }
-
-  frame_ = av_frame_alloc();
-
-  switch (format_.type) {
-    case TYPE_VIDEO:
-      fps_ = av_q2d(av_guess_frame_rate(inputCtx_, steam, nullptr));
-      break;
-    case TYPE_AUDIO:
-      fps_ = codecCtx_->sample_rate;
-      break;
-    default:
-      fps_ = 30.0;
-  }
-
-  if ((ret = initFormat())) {
-    LOG(ERROR) << "initFormat failed, type: " << format_.type;
-  }
-
-  if (metadata) {
-    DecoderMetadata header;
-    header.format = format_;
-    header.fps = fps_;
-    header.num = steam->time_base.num;
-    header.den = steam->time_base.den;
-    header.duration =
-        av_rescale_q(steam->duration, steam->time_base, timeBaseQ);
-    metadata->push_back(header);
-  }
-
-  return ret;
-}
-
-// send the raw data packet (compressed frame) to the decoder, through the codec
-// context and receive the raw data frame (uncompressed frame) from the
-// decoder, through the same codec context
-int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
-  int consumed = 0;
-  int result = avcodec_send_packet(codecCtx_, packet);
-  if (result == AVERROR(EAGAIN)) {
-    *gotFrame = false; // no bytes get consumed, fetch frame
-  } else if (result == AVERROR_EOF) {
-    *gotFrame = false; // more than one flush packet
-    if (packet) {
-      // got packet after flush, this is an error
-      return result;
-    }
-  } else if (result < 0) {
-    LOG(ERROR) << "avcodec_send_packet failed, err: "
-               << Util::generateErrorDesc(result);
-    return result; // error
-  } else {
-    consumed = packet ? packet->size : 0; // all bytes get consumed
-  }
-
-  result = avcodec_receive_frame(codecCtx_, frame_);
-
-  if (result >= 0) {
-    *gotFrame = true; // frame is available
-  } else if (result == AVERROR(EAGAIN)) {
-    *gotFrame = false; // no frames at this time, needs more packets
-    if (!consumed) {
-      // precaution, if no packages got consumed and no frames are available
-      return result;
-    }
-  } else if (result == AVERROR_EOF) {
-    *gotFrame = false; // the last frame has been flushed
-    // precaution, if no more frames are available assume we consume all bytes
-    consumed = 0;
-  } else { // error
-    LOG(ERROR) << "avcodec_receive_frame failed, err: "
-               << Util::generateErrorDesc(result);
-    return result;
-  }
-  return consumed;
-}
-
-// General decoding function:
-// given the packet, analyse the metadata, and write the
-// metadata and the buffer to the DecoderOutputImage.
-int Stream::decodePacket(
-    const AVPacket* packet,
-    DecoderOutputMessage* out,
-    bool headerOnly,
-    bool* hasMsg) {
-  int consumed;
-  bool gotFrame = false;
-  *hasMsg = false;
-  if ((consumed = analyzePacket(packet, &gotFrame)) >= 0 &&
-      (packet == nullptr || gotFrame)) {
-    int result;
-    if ((result = getMessage(out, !gotFrame, headerOnly)) < 0) {
-      return result; // report error
-    }
-    *hasMsg = result > 0;
-  }
-  return consumed;
-}
-
-int Stream::flush(DecoderOutputMessage* out, bool headerOnly) {
-  bool hasMsg = false;
-  int result = decodePacket(nullptr, out, headerOnly, &hasMsg);
-  if (result < 0) {
-    avcodec_flush_buffers(codecCtx_);
-    return result;
-  }
-  if (!hasMsg) {
-    avcodec_flush_buffers(codecCtx_);
-    return 0;
-  }
-  return 1;
-}
-
-// Sets the header and payload via stream::setHeader and copyFrameBytes
-// functions that are defined in type stream subclass (VideoStream, AudioStream,
-// ...)
-int Stream::getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly) {
-  if (flush) {
-    // only flush of audio frames makes sense
-    if (format_.type == TYPE_AUDIO) {
-      int processed = 0;
-      size_t total = 0;
-      // grab all audio bytes by chunks
-      do {
-        if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) {
-          return processed;
-        }
-        total += processed;
-      } while (processed);
-
-      if (total) {
-        // set header if message bytes are available
-        setHeader(&out->header, flush);
-        return 1;
-      }
-    }
-    return 0;
-  } else {
-    if (format_.type == TYPE_AUDIO) {
-      int processed = 0;
-      if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) {
-        return processed;
-      }
-      if (processed) {
-        // set header if message bytes are available
-        setHeader(&out->header, flush);
-        return 1;
-      }
-      return 0;
-    } else {
-      // set header
-      setHeader(&out->header, flush);
-
-      if (headerOnly) {
-        // Only header is requisted
-        return 1;
-      }
-
-      return copyFrameBytes(out->payload.get(), flush);
-    }
-  }
-}
-
-void Stream::setHeader(DecoderHeader* header, bool flush) {
-  header->seqno = numGenerator_++;
-
-  setFramePts(header, flush);
-
-  if (convertPtsToWallTime_) {
-    keeper_.adjust(header->pts);
-  }
-
-  header->format = format_;
-  header->keyFrame = 0;
-  header->fps = std::numeric_limits<double>::quiet_NaN();
-}
-
-void Stream::setFramePts(DecoderHeader* header, bool flush) {
-  if (flush) {
-    header->pts = nextPts_; // already in us
-  } else {
-    header->pts = frame_->best_effort_timestamp;
-    if (header->pts == AV_NOPTS_VALUE) {
-      header->pts = nextPts_;
-    } else {
-      header->pts = av_rescale_q(
-          header->pts,
-          inputCtx_->streams[format_.stream]->time_base,
-          timeBaseQ);
-    }
-
-    switch (format_.type) {
-      case TYPE_AUDIO:
-        nextPts_ = header->pts + frame_->nb_samples * AV_TIME_BASE / fps_;
-        break;
-      case TYPE_VIDEO:
-        nextPts_ = header->pts + AV_TIME_BASE / fps_;
-        break;
-      default:
-        nextPts_ = header->pts;
-    }
-  }
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/stream.h b/torchvision/csrc/io/decoder/stream.h
deleted file mode 100644
index 6250dd9ecd2..00000000000
--- a/torchvision/csrc/io/decoder/stream.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#pragma once
-
-#include <atomic>
-#include "defs.h"
-#include "time_keeper.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode one media stream (audio or video).
- */
-
-class Stream {
- public:
-  Stream(
-      AVFormatContext* inputCtx,
-      MediaFormat format,
-      bool convertPtsToWallTime,
-      int64_t loggingUuid);
-  virtual ~Stream();
-
-  // returns 0 - on success or negative error
-  // num_threads sets up the codec context for multithreading if needed
-  // default is set to single thread in order to not break BC
-  int openCodec(std::vector<DecoderMetadata>* metadata, int num_threads = 1);
-  // returns 1 - if packet got consumed, 0 - if it's not, and < 0 on error
-  int decodePacket(
-      const AVPacket* packet,
-      DecoderOutputMessage* out,
-      bool headerOnly,
-      bool* hasMsg);
-  // returns stream index
-  int getIndex() const {
-    return format_.stream;
-  }
-  // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error
-  int flush(DecoderOutputMessage* out, bool headerOnly);
-  // return media format
-  MediaFormat getMediaFormat() const {
-    return format_;
-  }
-
- protected:
-  virtual int initFormat() = 0;
-  // returns number processed bytes from packet, or negative error
-  virtual int analyzePacket(const AVPacket* packet, bool* gotFrame);
-  // returns number processed bytes from packet, or negative error
-  virtual int copyFrameBytes(ByteStorage* out, bool flush) = 0;
-  // sets output format
-  virtual void setHeader(DecoderHeader* header, bool flush);
-  // set frame pts
-  virtual void setFramePts(DecoderHeader* header, bool flush);
-  // finds codec
-  virtual AVCodec* findCodec(AVCodecParameters* params);
-
- private:
-  // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error
-  int getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly);
-
- protected:
-  AVFormatContext* const inputCtx_;
-  MediaFormat format_;
-  const bool convertPtsToWallTime_;
-  int64_t loggingUuid_;
-
-  AVCodecContext* codecCtx_{nullptr};
-  AVFrame* frame_{nullptr};
-
-  std::atomic<size_t> numGenerator_{0};
-  TimeKeeper keeper_;
-  // estimated next frame pts for flushing the last frame
-  int64_t nextPts_{0};
-  double fps_{30.};
-  // this is a dumb conservative limit; ideally we'd use
-  // int max_threads = at::get_num_threads(); but this would cause
-  // fb sync to fail as it would add dependency to ATen to the decoder API
-  const int max_threads = 12;
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/subtitle_sampler.cpp b/torchvision/csrc/io/decoder/subtitle_sampler.cpp
deleted file mode 100644
index d0df24d3e35..00000000000
--- a/torchvision/csrc/io/decoder/subtitle_sampler.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "subtitle_sampler.h"
-#include <c10/util/Logging.h>
-#include "util.h"
-
-namespace ffmpeg {
-
-SubtitleSampler::~SubtitleSampler() {
-  cleanUp();
-}
-
-void SubtitleSampler::shutdown() {
-  cleanUp();
-}
-
-bool SubtitleSampler::init(const SamplerParameters& params) {
-  cleanUp();
-  // set formats
-  params_ = params;
-  return true;
-}
-
-int SubtitleSampler::sample(AVSubtitle* sub, ByteStorage* out) {
-  if (!sub || !out) {
-    return 0; // flush
-  }
-
-  out->ensure(Util::size(*sub));
-
-  return Util::serialize(*sub, out);
-}
-
-int SubtitleSampler::sample(const ByteStorage* in, ByteStorage* out) {
-  if (in && out) {
-    // Get a writable copy
-    if (size_t len = in->length()) {
-      out->ensure(len);
-      memcpy(out->writableTail(), in->data(), len);
-    }
-    return out->length();
-  }
-  return 0;
-}
-
-void SubtitleSampler::cleanUp() {}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/subtitle_sampler.h b/torchvision/csrc/io/decoder/subtitle_sampler.h
deleted file mode 100644
index 4aee811ed56..00000000000
--- a/torchvision/csrc/io/decoder/subtitle_sampler.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class transcode audio frames from one format into another
- */
-
-class SubtitleSampler : public MediaSampler {
- public:
-  SubtitleSampler() = default;
-  ~SubtitleSampler() override;
-
-  bool init(const SamplerParameters& params) override;
-  int sample(const ByteStorage* in, ByteStorage* out) override;
-  void shutdown() override;
-
-  // returns number processed/scaling bytes
-  int sample(AVSubtitle* sub, ByteStorage* out);
-
-  // helper serialization/deserialization methods
-  static void serialize(const AVSubtitle& sub, ByteStorage* out);
-  static bool deserialize(const ByteStorage& buf, AVSubtitle* sub);
-
- private:
-  // close resources
-  void cleanUp();
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/subtitle_stream.cpp b/torchvision/csrc/io/decoder/subtitle_stream.cpp
deleted file mode 100644
index 3416f702d7e..00000000000
--- a/torchvision/csrc/io/decoder/subtitle_stream.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "subtitle_stream.h"
-#include <c10/util/Logging.h>
-#include "util.h"
-
-namespace ffmpeg {
-const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
-
-SubtitleStream::SubtitleStream(
-    AVFormatContext* inputCtx,
-    int index,
-    bool convertPtsToWallTime,
-    const SubtitleFormat& format)
-    : Stream(
-          inputCtx,
-          MediaFormat::makeMediaFormat(format, index),
-          convertPtsToWallTime,
-          0) {
-  memset(&sub_, 0, sizeof(sub_));
-}
-
-void SubtitleStream::releaseSubtitle() {
-  if (sub_.release) {
-    avsubtitle_free(&sub_);
-    memset(&sub_, 0, sizeof(sub_));
-  }
-}
-
-SubtitleStream::~SubtitleStream() {
-  releaseSubtitle();
-  sampler_.shutdown();
-}
-
-int SubtitleStream::initFormat() {
-  if (!codecCtx_->subtitle_header) {
-    LOG(ERROR) << "No subtitle header found";
-  } else {
-    VLOG(1) << "Subtitle header found!";
-  }
-  return 0;
-}
-
-int SubtitleStream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
-  // clean-up
-  releaseSubtitle();
-
-  // FIXME: should this even be created?
-  AVPacket* avPacket;
-  avPacket = av_packet_alloc();
-  if (avPacket == nullptr) {
-    LOG(ERROR)
-        << "decoder as not able to allocate the subtitle-specific packet.";
-    // alternative to ENOMEM
-    return AVERROR_BUFFER_TOO_SMALL;
-  }
-  avPacket->data = nullptr;
-  avPacket->size = 0;
-  // check flush packet
-  auto pkt = packet ? packet : avPacket;
-
-  int gotFramePtr = 0;
-  // is these a better way than cast from const?
-  int result =
-      avcodec_decode_subtitle2(codecCtx_, &sub_, &gotFramePtr, (AVPacket*)pkt);
-
-  if (result < 0) {
-    LOG(ERROR) << "avcodec_decode_subtitle2 failed, err: "
-               << Util::generateErrorDesc(result);
-    // free the packet we've created
-    av_packet_free(&avPacket);
-    return result;
-  } else if (result == 0) {
-    result = pkt->size; // discard the rest of the package
-  }
-
-  sub_.release = gotFramePtr;
-  *gotFrame = gotFramePtr > 0;
-
-  // set proper pts in us
-  if (gotFramePtr) {
-    sub_.pts = av_rescale_q(
-        pkt->pts, inputCtx_->streams[format_.stream]->time_base, timeBaseQ);
-  }
-
-  av_packet_free(&avPacket);
-  return result;
-}
-
-int SubtitleStream::copyFrameBytes(ByteStorage* out, bool flush) {
-  return sampler_.sample(flush ? nullptr : &sub_, out);
-}
-
-void SubtitleStream::setFramePts(DecoderHeader* header, bool) {
-  header->pts = sub_.pts; // already in us
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/subtitle_stream.h b/torchvision/csrc/io/decoder/subtitle_stream.h
deleted file mode 100644
index 6c366e11f50..00000000000
--- a/torchvision/csrc/io/decoder/subtitle_stream.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#pragma once
-
-#include "stream.h"
-#include "subtitle_sampler.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode one subtitle stream.
- */
-struct AVSubtitleKeeper : AVSubtitle {
-  int64_t release{0};
-};
-
-class SubtitleStream : public Stream {
- public:
-  SubtitleStream(
-      AVFormatContext* inputCtx,
-      int index,
-      bool convertPtsToWallTime,
-      const SubtitleFormat& format);
-  ~SubtitleStream() override;
-
- protected:
-  void setFramePts(DecoderHeader* header, bool flush) override;
-
- private:
-  int initFormat() override;
-  int analyzePacket(const AVPacket* packet, bool* gotFrame) override;
-  int copyFrameBytes(ByteStorage* out, bool flush) override;
-  void releaseSubtitle();
-
- private:
-  SubtitleSampler sampler_;
-  AVSubtitleKeeper sub_;
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/sync_decoder.cpp b/torchvision/csrc/io/decoder/sync_decoder.cpp
deleted file mode 100644
index 1f03ef8eb95..00000000000
--- a/torchvision/csrc/io/decoder/sync_decoder.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-#include "sync_decoder.h"
-#include <c10/util/Logging.h>
-
-namespace ffmpeg {
-
-SyncDecoder::AVByteStorage::AVByteStorage(size_t n) {
-  ensure(n);
-}
-
-SyncDecoder::AVByteStorage::~AVByteStorage() {
-  av_free(buffer_);
-}
-
-void SyncDecoder::AVByteStorage::ensure(size_t n) {
-  if (tail() < n) {
-    capacity_ = offset_ + length_ + n;
-    buffer_ = static_cast<uint8_t*>(av_realloc(buffer_, capacity_));
-  }
-}
-
-uint8_t* SyncDecoder::AVByteStorage::writableTail() {
-  TORCH_CHECK_LE(offset_ + length_, capacity_);
-  return buffer_ + offset_ + length_;
-}
-
-void SyncDecoder::AVByteStorage::append(size_t n) {
-  TORCH_CHECK_LE(n, tail());
-  length_ += n;
-}
-
-void SyncDecoder::AVByteStorage::trim(size_t n) {
-  TORCH_CHECK_LE(n, length_);
-  offset_ += n;
-  length_ -= n;
-}
-
-const uint8_t* SyncDecoder::AVByteStorage::data() const {
-  return buffer_ + offset_;
-}
-
-size_t SyncDecoder::AVByteStorage::length() const {
-  return length_;
-}
-
-size_t SyncDecoder::AVByteStorage::tail() const {
-  TORCH_CHECK_LE(offset_ + length_, capacity_);
-  return capacity_ - offset_ - length_;
-}
-
-void SyncDecoder::AVByteStorage::clear() {
-  offset_ = 0;
-  length_ = 0;
-}
-
-std::unique_ptr<ByteStorage> SyncDecoder::createByteStorage(size_t n) {
-  return std::make_unique<AVByteStorage>(n);
-}
-
-void SyncDecoder::onInit() {
-  eof_ = false;
-  queue_.clear();
-}
-
-int SyncDecoder::decode(DecoderOutputMessage* out, uint64_t timeoutMs) {
-  if (eof_ && queue_.empty()) {
-    return ENODATA;
-  }
-
-  if (queue_.empty()) {
-    int result = getFrame(timeoutMs);
-    // assign EOF
-    eof_ = result == ENODATA;
-    // check unrecoverable error, any error but ENODATA
-    if (result && result != ENODATA) {
-      return result;
-    }
-
-    // still empty
-    if (queue_.empty()) {
-      if (eof_) {
-        return ENODATA;
-      } else {
-        LOG(INFO) << "Queue is empty";
-        return ETIMEDOUT;
-      }
-    }
-  }
-
-  *out = std::move(queue_.front());
-  queue_.pop_front();
-  return 0;
-}
-
-void SyncDecoder::push(DecoderOutputMessage&& buffer) {
-  queue_.push_back(std::move(buffer));
-}
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/sync_decoder.h b/torchvision/csrc/io/decoder/sync_decoder.h
deleted file mode 100644
index b7cf7b625ac..00000000000
--- a/torchvision/csrc/io/decoder/sync_decoder.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#pragma once
-
-#include <list>
-#include "decoder.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode media streams.
- * Media bytes can be explicitly provided through read-callback
- * or fetched internally by FFMPEG library
- */
-class SyncDecoder : public Decoder {
- public:
-  // Allocation of memory must be done with a proper alignment.
-  class AVByteStorage : public ByteStorage {
-   public:
-    explicit AVByteStorage(size_t n);
-    ~AVByteStorage() override;
-    void ensure(size_t n) override;
-    uint8_t* writableTail() override;
-    void append(size_t n) override;
-    void trim(size_t n) override;
-    const uint8_t* data() const override;
-    size_t length() const override;
-    size_t tail() const override;
-    void clear() override;
-
-   private:
-    size_t offset_{0};
-    size_t length_{0};
-    size_t capacity_{0};
-    uint8_t* buffer_{nullptr};
-  };
-
- public:
-  int decode(DecoderOutputMessage* out, uint64_t timeoutMs) override;
-
- private:
-  void push(DecoderOutputMessage&& buffer) override;
-  void onInit() override;
-  std::unique_ptr<ByteStorage> createByteStorage(size_t n) override;
-
- private:
-  std::list<DecoderOutputMessage> queue_;
-  bool eof_{false};
-};
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/sync_decoder_test.cpp b/torchvision/csrc/io/decoder/sync_decoder_test.cpp
deleted file mode 100644
index 085966ce687..00000000000
--- a/torchvision/csrc/io/decoder/sync_decoder_test.cpp
+++ /dev/null
@@ -1,416 +0,0 @@
-#include <c10/util/Logging.h>
-#include <dirent.h>
-#include <gtest/gtest.h>
-#include "memory_buffer.h"
-#include "sync_decoder.h"
-#include "util.h"
-
-using namespace ffmpeg;
-
-namespace {
-struct VideoFileStats {
-  std::string name;
-  size_t durationPts{0};
-  int num{0};
-  int den{0};
-  int fps{0};
-};
-
-void gotAllTestFiles(
-    const std::string& folder,
-    std::vector<VideoFileStats>* stats) {
-  DIR* d = opendir(folder.c_str());
-  CHECK(d);
-  struct dirent* dir;
-  while ((dir = readdir(d))) {
-    if (dir->d_type != DT_DIR && 0 != strcmp(dir->d_name, "README")) {
-      VideoFileStats item;
-      item.name = folder + '/' + dir->d_name;
-      LOG(INFO) << "Found video file: " << item.name;
-      stats->push_back(std::move(item));
-    }
-  }
-  closedir(d);
-}
-
-void gotFilesStats(std::vector<VideoFileStats>& stats) {
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.seekAccuracy = 100000;
-  params.formats = {MediaFormat(0)};
-  params.headerOnly = true;
-  params.preventStaleness = false;
-  size_t avgProvUs = 0;
-  const size_t rounds = 100;
-  for (auto& item : stats) {
-    LOG(INFO) << "Decoding video file in memory: " << item.name;
-    FILE* f = fopen(item.name.c_str(), "rb");
-    CHECK(f != nullptr);
-    fseek(f, 0, SEEK_END);
-    std::vector<uint8_t> buffer(ftell(f));
-    rewind(f);
-    size_t s = fread(buffer.data(), 1, buffer.size(), f);
-    TORCH_CHECK_EQ(buffer.size(), s);
-    fclose(f);
-
-    for (size_t i = 0; i < rounds; ++i) {
-      SyncDecoder decoder;
-      std::vector<DecoderMetadata> metadata;
-      const auto now = std::chrono::steady_clock::now();
-      CHECK(decoder.init(
-          params,
-          MemoryBuffer::getCallback(buffer.data(), buffer.size()),
-          &metadata));
-      const auto then = std::chrono::steady_clock::now();
-      decoder.shutdown();
-      avgProvUs +=
-          std::chrono::duration_cast<std::chrono::microseconds>(then - now)
-              .count();
-      TORCH_CHECK_EQ(metadata.size(), 1);
-      item.num = metadata[0].num;
-      item.den = metadata[0].den;
-      item.fps = metadata[0].fps;
-      item.durationPts =
-          av_rescale_q(metadata[0].duration, AV_TIME_BASE_Q, {1, item.fps});
-    }
-  }
-  LOG(INFO) << "Probing (us) " << avgProvUs / stats.size() / rounds;
-}
-
-size_t measurePerformanceUs(
-    const std::vector<VideoFileStats>& stats,
-    size_t rounds,
-    size_t num,
-    size_t stride) {
-  size_t avgClipDecodingUs = 0;
-  std::srand(time(nullptr));
-  for (const auto& item : stats) {
-    FILE* f = fopen(item.name.c_str(), "rb");
-    CHECK(f != nullptr);
-    fseek(f, 0, SEEK_END);
-    std::vector<uint8_t> buffer(ftell(f));
-    rewind(f);
-    size_t s = fread(buffer.data(), 1, buffer.size(), f);
-    TORCH_CHECK_EQ(buffer.size(), s);
-    fclose(f);
-
-    for (size_t i = 0; i < rounds; ++i) {
-      // randomy select clip
-      size_t rOffset = std::rand();
-      size_t fOffset = rOffset % item.durationPts;
-      size_t clipFrames = num + (num - 1) * stride;
-      if (fOffset + clipFrames > item.durationPts) {
-        fOffset = item.durationPts - clipFrames;
-      }
-
-      DecoderParameters params;
-      params.timeoutMs = 10000;
-      params.startOffset = 1000000;
-      params.seekAccuracy = 100000;
-      params.preventStaleness = false;
-
-      for (size_t n = 0; n < num; ++n) {
-        std::list<DecoderOutputMessage> msgs;
-
-        params.startOffset =
-            av_rescale_q(fOffset, {1, item.fps}, AV_TIME_BASE_Q);
-        params.endOffset = params.startOffset + 100;
-
-        auto now = std::chrono::steady_clock::now();
-        SyncDecoder decoder;
-        CHECK(decoder.init(
-            params,
-            MemoryBuffer::getCallback(buffer.data(), buffer.size()),
-            nullptr));
-        DecoderOutputMessage out;
-        while (0 == decoder.decode(&out, params.timeoutMs)) {
-          msgs.push_back(std::move(out));
-        }
-
-        decoder.shutdown();
-
-        const auto then = std::chrono::steady_clock::now();
-
-        fOffset += 1 + stride;
-
-        avgClipDecodingUs +=
-            std::chrono::duration_cast<std::chrono::microseconds>(then - now)
-                .count();
-      }
-    }
-  }
-
-  return avgClipDecodingUs / rounds / num / stats.size();
-}
-
-void runDecoder(SyncDecoder& decoder) {
-  DecoderOutputMessage out;
-  size_t audioFrames = 0, videoFrames = 0, totalBytes = 0;
-  while (0 == decoder.decode(&out, 10000)) {
-    if (out.header.format.type == TYPE_AUDIO) {
-      ++audioFrames;
-    } else if (out.header.format.type == TYPE_VIDEO) {
-      ++videoFrames;
-    } else if (out.header.format.type == TYPE_SUBTITLE && out.payload) {
-      // deserialize
-      LOG(INFO) << "Deserializing subtitle";
-      AVSubtitle sub;
-      memset(&sub, 0, sizeof(sub));
-      EXPECT_TRUE(Util::deserialize(*out.payload, &sub));
-      LOG(INFO) << "Found subtitles" << ", num rects: " << sub.num_rects;
-      for (int i = 0; i < sub.num_rects; ++i) {
-        std::string text = "picture";
-        if (sub.rects[i]->type == SUBTITLE_TEXT) {
-          text = sub.rects[i]->text;
-        } else if (sub.rects[i]->type == SUBTITLE_ASS) {
-          text = sub.rects[i]->ass;
-        }
-
-        LOG(INFO) << "Rect num: " << i << ", type:" << sub.rects[i]->type
-                  << ", text: " << text;
-      }
-
-      avsubtitle_free(&sub);
-    }
-    if (out.payload) {
-      totalBytes += out.payload->length();
-    }
-  }
-  LOG(INFO) << "Decoded audio frames: " << audioFrames
-            << ", video frames: " << videoFrames
-            << ", total bytes: " << totalBytes;
-}
-} // namespace
-
-TEST(SyncDecoder, TestSyncDecoderPerformance) {
-  // Measure the average time of decoding per clip
-  // 1. list of the videos in testing directory
-  // 2. for each video got number of frames with timestamps
-  // 3. randomly select frame offset
-  // 4. adjust offset for number frames and strides,
-  //    if it's out out upper boundary
-  // 5. repeat multiple times, measuring and accumulating decoding time
-  //    per clip.
-  /*
-  1) 4 x 2
-  2) 8 x 8
-  3) 16 x 8
-  4) 32 x 4
-  */
-  const std::string kFolder = "pytorch/vision/test/assets/videos";
-  std::vector<VideoFileStats> stats;
-  gotAllTestFiles(kFolder, &stats);
-  gotFilesStats(stats);
-
-  const size_t kRounds = 10;
-
-  auto new4x2 = measurePerformanceUs(stats, kRounds, 4, 2);
-  auto new8x8 = measurePerformanceUs(stats, kRounds, 8, 8);
-  auto new16x8 = measurePerformanceUs(stats, kRounds, 16, 8);
-  auto new32x4 = measurePerformanceUs(stats, kRounds, 32, 4);
-  LOG(INFO) << "Clip decoding (us)" << ", new(4x2): " << new4x2
-            << ", new(8x8): " << new8x8 << ", new(16x8): " << new16x8
-            << ", new(32x4): " << new32x4;
-}
-
-TEST(SyncDecoder, Test) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.seekAccuracy = 100000;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-  params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-}
-
-TEST(SyncDecoder, TestSubtitles) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-  params.uri = "vue/synergy/data/robotsub.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-}
-
-TEST(SyncDecoder, TestHeadersOnly) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.seekAccuracy = 100000;
-  params.headerOnly = true;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-
-  params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-
-  params.uri = "pytorch/vision/test/assets/videos/SOX5yA1l24A.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-
-  params.uri = "pytorch/vision/test/assets/videos/WUzgd7C1pWA.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-}
-
-TEST(SyncDecoder, TestHeadersOnlyDownSampling) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.seekAccuracy = 100000;
-  params.headerOnly = true;
-  MediaFormat format;
-  format.type = TYPE_AUDIO;
-  format.format.audio.samples = 8000;
-  params.formats.insert(format);
-
-  format.type = TYPE_VIDEO;
-  format.format.video.width = 224;
-  format.format.video.height = 224;
-  params.formats.insert(format);
-
-  params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-
-  params.uri = "pytorch/vision/test/assets/videos/SOX5yA1l24A.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-
-  params.uri = "pytorch/vision/test/assets/videos/WUzgd7C1pWA.mp4";
-  CHECK(decoder.init(params, nullptr, nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-}
-
-TEST(SyncDecoder, TestInitOnlyNoShutdown) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.seekAccuracy = 100000;
-  params.headerOnly = false;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-  params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";
-  std::vector<DecoderMetadata> metadata;
-  CHECK(decoder.init(params, nullptr, &metadata));
-}
-
-TEST(SyncDecoder, TestMemoryBuffer) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.endOffset = 9000000;
-  params.seekAccuracy = 10000;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-
-  FILE* f = fopen(
-      "pytorch/vision/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi",
-      "rb");
-  CHECK(f != nullptr);
-  fseek(f, 0, SEEK_END);
-  std::vector<uint8_t> buffer(ftell(f));
-  rewind(f);
-  size_t s = fread(buffer.data(), 1, buffer.size(), f);
-  TORCH_CHECK_EQ(buffer.size(), s);
-  fclose(f);
-  CHECK(decoder.init(
-      params,
-      MemoryBuffer::getCallback(buffer.data(), buffer.size()),
-      nullptr));
-  LOG(INFO) << "Decoding from memory bytes: " << buffer.size();
-  runDecoder(decoder);
-  decoder.shutdown();
-}
-
-TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.endOffset = 9000000;
-  params.seekAccuracy = 10000;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-
-  FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb");
-  CHECK(f != nullptr);
-  fseek(f, 0, SEEK_END);
-  std::vector<uint8_t> buffer(ftell(f));
-  rewind(f);
-  size_t s = fread(buffer.data(), 1, buffer.size(), f);
-  TORCH_CHECK_EQ(buffer.size(), s);
-  fclose(f);
-
-  params.maxSeekableBytes = buffer.size() + 1;
-  MemoryBuffer object(buffer.data(), buffer.size());
-  CHECK(decoder.init(
-      params,
-      [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
-      -> int {
-        if (out) { // see defs.h file
-          // read mode
-          return object.read(out, size);
-        }
-        // seek mode
-        if (!timeoutMs) {
-          // seek capability, yes - no
-          return -1;
-        }
-        return object.seek(size, whence);
-      },
-      nullptr));
-  runDecoder(decoder);
-  decoder.shutdown();
-}
-
-TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) {
-  SyncDecoder decoder;
-  DecoderParameters params;
-  params.timeoutMs = 10000;
-  params.startOffset = 1000000;
-  params.endOffset = 9000000;
-  params.seekAccuracy = 10000;
-  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
-
-  FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb");
-  CHECK(f != nullptr);
-  fseek(f, 0, SEEK_END);
-  std::vector<uint8_t> buffer(ftell(f));
-  rewind(f);
-  size_t s = fread(buffer.data(), 1, buffer.size(), f);
-  TORCH_CHECK_EQ(buffer.size(), s);
-  fclose(f);
-
-  params.maxSeekableBytes = buffer.size() / 2;
-  MemoryBuffer object(buffer.data(), buffer.size());
-  CHECK(!decoder.init(
-      params,
-      [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
-      -> int {
-        if (out) { // see defs.h file
-          // read mode
-          return object.read(out, size);
-        }
-        // seek mode
-        if (!timeoutMs) {
-          // seek capability, yes - no
-          return -1;
-        }
-        return object.seek(size, whence);
-      },
-      nullptr));
-}
diff --git a/torchvision/csrc/io/decoder/time_keeper.cpp b/torchvision/csrc/io/decoder/time_keeper.cpp
deleted file mode 100644
index 845c76cddc8..00000000000
--- a/torchvision/csrc/io/decoder/time_keeper.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-#include "time_keeper.h"
-#include "defs.h"
-
-namespace ffmpeg {
-
-namespace {
-const long kMaxTimeBaseDiference = 10;
-}
-
-long TimeKeeper::adjust(long& decoderTimestamp) {
-  const long now = std::chrono::duration_cast<std::chrono::microseconds>(
-                       std::chrono::system_clock::now().time_since_epoch())
-                       .count();
-
-  if (startTime_ == 0) {
-    startTime_ = now;
-  }
-  if (streamTimestamp_ == 0) {
-    streamTimestamp_ = decoderTimestamp;
-  }
-
-  const auto runOut = startTime_ + decoderTimestamp - streamTimestamp_;
-
-  if (std::labs((now - runOut) / AV_TIME_BASE) > kMaxTimeBaseDiference) {
-    streamTimestamp_ = startTime_ - now + decoderTimestamp;
-  }
-
-  const auto sleepAdvised = runOut - now;
-
-  decoderTimestamp += startTime_ - streamTimestamp_;
-
-  return sleepAdvised > 0 ? sleepAdvised : 0;
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/time_keeper.h b/torchvision/csrc/io/decoder/time_keeper.h
deleted file mode 100644
index e4d4718c705..00000000000
--- a/torchvision/csrc/io/decoder/time_keeper.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#pragma once
-
-#include <stdlib.h>
-#include <chrono>
-
-namespace ffmpeg {
-
-/**
- * Class keeps the track of the decoded timestamps (us) for media streams.
- */
-
-class TimeKeeper {
- public:
-  TimeKeeper() = default;
-
-  // adjust provided @timestamp to the corrected value
-  // return advised sleep time before next frame processing in (us)
-  long adjust(long& decoderTimestamp);
-
- private:
-  long startTime_{0};
-  long streamTimestamp_{0};
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/util.cpp b/torchvision/csrc/io/decoder/util.cpp
deleted file mode 100644
index 7198d2174ed..00000000000
--- a/torchvision/csrc/io/decoder/util.cpp
+++ /dev/null
@@ -1,401 +0,0 @@
-#include "util.h"
-#include <c10/util/Logging.h>
-
-namespace ffmpeg {
-
-namespace Serializer {
-
-// fixed size types
-template <typename T>
-inline size_t getSize(const T& x) {
-  return sizeof(x);
-}
-
-template <typename T>
-inline bool serializeItem(
-    uint8_t* dest,
-    size_t len,
-    size_t& pos,
-    const T& src) {
-  VLOG(6) << "Generic serializeItem";
-  const auto required = sizeof(src);
-  if (len < pos + required) {
-    return false;
-  }
-  memcpy(dest + pos, &src, required);
-  pos += required;
-  return true;
-}
-
-template <typename T>
-inline bool deserializeItem(
-    const uint8_t* src,
-    size_t len,
-    size_t& pos,
-    T& dest) {
-  const auto required = sizeof(dest);
-  if (len < pos + required) {
-    return false;
-  }
-  memcpy(&dest, src + pos, required);
-  pos += required;
-  return true;
-}
-
-// AVSubtitleRect specialization
-inline size_t getSize(const AVSubtitleRect& x) {
-  auto rectBytes = [](const AVSubtitleRect& y) -> size_t {
-    size_t s = 0;
-    switch (y.type) {
-      case SUBTITLE_BITMAP:
-        for (int i = 0; i < y.nb_colors; ++i) {
-          s += sizeof(y.linesize[i]);
-          s += y.linesize[i];
-        }
-        break;
-      case SUBTITLE_TEXT:
-        s += sizeof(size_t);
-        s += strlen(y.text);
-        break;
-      case SUBTITLE_ASS:
-        s += sizeof(size_t);
-        s += strlen(y.ass);
-        break;
-      default:
-        break;
-    }
-    return s;
-  };
-  return getSize(x.x) + getSize(x.y) + getSize(x.w) + getSize(x.h) +
-      getSize(x.nb_colors) + getSize(x.type) + getSize(x.flags) + rectBytes(x);
-}
-
-// AVSubtitle specialization
-inline size_t getSize(const AVSubtitle& x) {
-  auto rectBytes = [](const AVSubtitle& y) -> size_t {
-    size_t s = getSize(y.num_rects);
-    for (unsigned i = 0; i < y.num_rects; ++i) {
-      s += getSize(*y.rects[i]);
-    }
-    return s;
-  };
-  return getSize(x.format) + getSize(x.start_display_time) +
-      getSize(x.end_display_time) + getSize(x.pts) + rectBytes(x);
-}
-
-inline bool serializeItem(
-    uint8_t* dest,
-    size_t len,
-    size_t& pos,
-    const AVSubtitleRect& src) {
-  auto rectSerialize =
-      [](uint8_t* d, size_t l, size_t& p, const AVSubtitleRect& x) -> size_t {
-    switch (x.type) {
-      case SUBTITLE_BITMAP:
-        for (int i = 0; i < x.nb_colors; ++i) {
-          if (!serializeItem(d, l, p, x.linesize[i])) {
-            return false;
-          }
-          if (p + x.linesize[i] > l) {
-            return false;
-          }
-          memcpy(d + p, x.data[i], x.linesize[i]);
-          p += x.linesize[i];
-        }
-        return true;
-      case SUBTITLE_TEXT: {
-        const size_t s = strlen(x.text);
-        if (!serializeItem(d, l, p, s)) {
-          return false;
-        }
-        if (p + s > l) {
-          return false;
-        }
-        memcpy(d + p, x.text, s);
-        p += s;
-        return true;
-      }
-      case SUBTITLE_ASS: {
-        const size_t s = strlen(x.ass);
-        if (!serializeItem(d, l, p, s)) {
-          return false;
-        }
-        if (p + s > l) {
-          return false;
-        }
-        memcpy(d + p, x.ass, s);
-        p += s;
-        return true;
-      }
-      default:
-        return true;
-    }
-  };
-  return serializeItem(dest, len, pos, src.x) &&
-      serializeItem(dest, len, pos, src.y) &&
-      serializeItem(dest, len, pos, src.w) &&
-      serializeItem(dest, len, pos, src.h) &&
-      serializeItem(dest, len, pos, src.nb_colors) &&
-      serializeItem(dest, len, pos, src.type) &&
-      serializeItem(dest, len, pos, src.flags) &&
-      rectSerialize(dest, len, pos, src);
-}
-
-inline bool serializeItem(
-    uint8_t* dest,
-    size_t len,
-    size_t& pos,
-    const AVSubtitle& src) {
-  auto rectSerialize =
-      [](uint8_t* d, size_t l, size_t& p, const AVSubtitle& x) -> bool {
-    bool res = serializeItem(d, l, p, x.num_rects);
-    for (unsigned i = 0; res && i < x.num_rects; ++i) {
-      res = serializeItem(d, l, p, *(x.rects[i]));
-    }
-    return res;
-  };
-  VLOG(6) << "AVSubtitle serializeItem";
-  return serializeItem(dest, len, pos, src.format) &&
-      serializeItem(dest, len, pos, src.start_display_time) &&
-      serializeItem(dest, len, pos, src.end_display_time) &&
-      serializeItem(dest, len, pos, src.pts) &&
-      rectSerialize(dest, len, pos, src);
-}
-
-inline bool deserializeItem(
-    const uint8_t* src,
-    size_t len,
-    size_t& pos,
-    AVSubtitleRect& dest) {
-  auto rectDeserialize =
-      [](const uint8_t* y, size_t l, size_t& p, AVSubtitleRect& x) -> bool {
-    switch (x.type) {
-      case SUBTITLE_BITMAP:
-        for (int i = 0; i < x.nb_colors; ++i) {
-          if (!deserializeItem(y, l, p, x.linesize[i])) {
-            return false;
-          }
-          if (p + x.linesize[i] > l) {
-            return false;
-          }
-          x.data[i] = (uint8_t*)av_malloc(x.linesize[i]);
-          memcpy(x.data[i], y + p, x.linesize[i]);
-          p += x.linesize[i];
-        }
-        return true;
-      case SUBTITLE_TEXT: {
-        size_t s = 0;
-        if (!deserializeItem(y, l, p, s)) {
-          return false;
-        }
-        if (p + s > l) {
-          return false;
-        }
-        x.text = (char*)av_malloc(s + 1);
-        memcpy(x.text, y + p, s);
-        x.text[s] = 0;
-        p += s;
-        return true;
-      }
-      case SUBTITLE_ASS: {
-        size_t s = 0;
-        if (!deserializeItem(y, l, p, s)) {
-          return false;
-        }
-        if (p + s > l) {
-          return false;
-        }
-        x.ass = (char*)av_malloc(s + 1);
-        memcpy(x.ass, y + p, s);
-        x.ass[s] = 0;
-        p += s;
-        return true;
-      }
-      default:
-        return true;
-    }
-  };
-
-  return deserializeItem(src, len, pos, dest.x) &&
-      deserializeItem(src, len, pos, dest.y) &&
-      deserializeItem(src, len, pos, dest.w) &&
-      deserializeItem(src, len, pos, dest.h) &&
-      deserializeItem(src, len, pos, dest.nb_colors) &&
-      deserializeItem(src, len, pos, dest.type) &&
-      deserializeItem(src, len, pos, dest.flags) &&
-      rectDeserialize(src, len, pos, dest);
-}
-
-inline bool deserializeItem(
-    const uint8_t* src,
-    size_t len,
-    size_t& pos,
-    AVSubtitle& dest) {
-  auto rectDeserialize =
-      [](const uint8_t* y, size_t l, size_t& p, AVSubtitle& x) -> bool {
-    bool res = deserializeItem(y, l, p, x.num_rects);
-    if (res && x.num_rects) {
-      x.rects =
-          (AVSubtitleRect**)av_malloc(x.num_rects * sizeof(AVSubtitleRect*));
-    }
-    for (unsigned i = 0; res && i < x.num_rects; ++i) {
-      x.rects[i] = (AVSubtitleRect*)av_malloc(sizeof(AVSubtitleRect));
-      memset(x.rects[i], 0, sizeof(AVSubtitleRect));
-      res = deserializeItem(y, l, p, *x.rects[i]);
-    }
-    return res;
-  };
-  return deserializeItem(src, len, pos, dest.format) &&
-      deserializeItem(src, len, pos, dest.start_display_time) &&
-      deserializeItem(src, len, pos, dest.end_display_time) &&
-      deserializeItem(src, len, pos, dest.pts) &&
-      rectDeserialize(src, len, pos, dest);
-}
-} // namespace Serializer
-
-namespace Util {
-std::string generateErrorDesc(int errorCode) {
-  std::array<char, 1024> buffer;
-  if (av_strerror(errorCode, buffer.data(), buffer.size()) < 0) {
-    return std::string("Unknown error code: ") + std::to_string(errorCode);
-  }
-  buffer.back() = 0;
-  return std::string(buffer.data());
-}
-
-size_t serialize(const AVSubtitle& sub, ByteStorage* out) {
-  const auto len = size(sub);
-  size_t pos = 0;
-  if (!Serializer::serializeItem(out->writableTail(), len, pos, sub)) {
-    return 0;
-  }
-  out->append(len);
-  return len;
-}
-
-bool deserialize(const ByteStorage& buf, AVSubtitle* sub) {
-  size_t pos = 0;
-  return Serializer::deserializeItem(buf.data(), buf.length(), pos, *sub);
-}
-
-size_t size(const AVSubtitle& sub) {
-  return Serializer::getSize(sub);
-}
-
-bool validateVideoFormat(const VideoFormat& f) {
-  // clang-format off
-  /*
-  Valid parameters values for decoder
-  ____________________________________________________________________________________
-  |  W  |  H  | minDimension | maxDimension | cropImage |  algorithm                 |
-  |__________________________________________________________________________________|
-  |  0  |  0  |     0        |  0           |  N/A      |   original                 |
-  |__________________________________________________________________________________|
-  |  >0 |  0  |     N/A      |  N/A         |  N/A      |   scale keeping W          |
-  |__________________________________________________________________________________|
-  |  0  |  >0 |     N/A      |  N/A         |  N/A      |   scale keeping H          |
-  |__________________________________________________________________________________|
-  |  >0 |  >0 |     N/A      |  N/A         |  0        |   stretch/scale            |
-  |__________________________________________________________________________________|
-  |  >0 |  >0 |     N/A      |  N/A         |  >0       |   scale/crop               |
-  |__________________________________________________________________________________|
-  |  0  |  0  |     >0       |  0           |  N/A      |scale to min dimension      |
-  |__________________________________________________________________________________|
-  |  0  |  0  |     0        |  >0          |  N/A      |scale to max dimension      |
-  |__________________________________________________________________________________|
-  |  0  |  0  |     >0       |  >0          |  N/A      |stretch to min/max dimension|
-  |_____|_____|______________|______________|___________|____________________________|
-
-  */
-  // clang-format on
-  return (f.width == 0 && // #1, #6, #7 and #8
-          f.height == 0 && f.cropImage == 0) ||
-      (f.width != 0 && // #4 and #5
-       f.height != 0 && f.minDimension == 0 && f.maxDimension == 0) ||
-      (((f.width != 0 && // #2
-         f.height == 0) ||
-        (f.width == 0 && // #3
-         f.height != 0)) &&
-       f.minDimension == 0 && f.maxDimension == 0 && f.cropImage == 0);
-}
-
-void setFormatDimensions(
-    size_t& destW,
-    size_t& destH,
-    size_t userW,
-    size_t userH,
-    size_t srcW,
-    size_t srcH,
-    size_t minDimension,
-    size_t maxDimension,
-    size_t cropImage) {
-  // rounding rules
-  // int -> double -> round up
-  // if fraction is >= 0.5 or round down if fraction is < 0.5
-  // int result = double(value) + 0.5
-  // here we rounding double to int according to the above rule
-
-  // #1, #6, #7 and #8
-  if (userW == 0 && userH == 0) {
-    if (minDimension > 0 && maxDimension == 0) { // #6
-      if (srcW > srcH) {
-        // landscape
-        destH = minDimension;
-        destW = round(double(srcW * minDimension) / srcH);
-      } else {
-        // portrait
-        destW = minDimension;
-        destH = round(double(srcH * minDimension) / srcW);
-      }
-    } else if (minDimension == 0 && maxDimension > 0) { // #7
-      if (srcW > srcH) {
-        // landscape
-        destW = maxDimension;
-        destH = round(double(srcH * maxDimension) / srcW);
-      } else {
-        // portrait
-        destH = maxDimension;
-        destW = round(double(srcW * maxDimension) / srcH);
-      }
-    } else if (minDimension > 0 && maxDimension > 0) { // #8
-      if (srcW > srcH) {
-        // landscape
-        destW = maxDimension;
-        destH = minDimension;
-      } else {
-        // portrait
-        destW = minDimension;
-        destH = maxDimension;
-      }
-    } else { // #1
-      destW = srcW;
-      destH = srcH;
-    }
-  } else if (userW != 0 && userH == 0) { // #2
-    destW = userW;
-    destH = round(double(srcH * userW) / srcW);
-  } else if (userW == 0 && userH != 0) { // #3
-    destW = round(double(srcW * userH) / srcH);
-    destH = userH;
-  } else { // userW != 0 && userH != 0
-    if (cropImage == 0) { // #4
-      destW = userW;
-      destH = userH;
-    } else { // #5
-      double userSlope = double(userH) / userW;
-      double srcSlope = double(srcH) / srcW;
-      if (srcSlope < userSlope) {
-        destW = round(double(srcW * userH) / srcH);
-        destH = userH;
-      } else {
-        destW = userW;
-        destH = round(double(srcH * userW) / srcW);
-      }
-    }
-  }
-  // prevent zeros
-  destW = std::max(destW, size_t(1UL));
-  destH = std::max(destH, size_t(1UL));
-}
-} // namespace Util
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/util.h b/torchvision/csrc/io/decoder/util.h
deleted file mode 100644
index 01b550e5bbc..00000000000
--- a/torchvision/csrc/io/decoder/util.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * FFMPEG library utility functions.
- */
-
-namespace Util {
-std::string generateErrorDesc(int errorCode);
-size_t serialize(const AVSubtitle& sub, ByteStorage* out);
-bool deserialize(const ByteStorage& buf, AVSubtitle* sub);
-size_t size(const AVSubtitle& sub);
-void setFormatDimensions(
-    size_t& destW,
-    size_t& destH,
-    size_t userW,
-    size_t userH,
-    size_t srcW,
-    size_t srcH,
-    size_t minDimension,
-    size_t maxDimension,
-    size_t cropImage);
-bool validateVideoFormat(const VideoFormat& format);
-} // namespace Util
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/util_test.cpp b/torchvision/csrc/io/decoder/util_test.cpp
deleted file mode 100644
index 0a093d9561b..00000000000
--- a/torchvision/csrc/io/decoder/util_test.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-#include <c10/util/Logging.h>
-#include <gtest/gtest.h>
-#include "util.h"
-
-TEST(Util, TestSetFormatDimensions) {
-  // clang-format off
-  const size_t test_cases[][9] = {
-      // (userW, userH, srcW, srcH, minDimension, maxDimension, cropImage, destW, destH)
-      {0, 0, 172, 128, 0, 0, 0, 172, 128},    // #1
-      {86, 0, 172, 128, 0, 0, 0, 86, 64},     // #2
-      {64, 0, 128, 172, 0, 0, 0, 64, 86},     // #2
-      {0, 32, 172, 128, 0, 0, 0, 43, 32},     // #3
-      {32, 0, 128, 172, 0, 0, 0, 32, 43},     // #3
-      {60, 50, 172, 128, 0, 0, 0, 60, 50},    // #4
-      {50, 60, 128, 172, 0, 0, 0, 50, 60},    // #4
-      {86, 40, 172, 128, 0, 0, 1, 86, 64},    // #5
-      {86, 92, 172, 128, 0, 0, 1, 124, 92},   // #5
-      {0, 0, 172, 128, 256, 0, 0, 344, 256},  // #6
-      {0, 0, 128, 172, 256, 0, 0, 256, 344},  // #6
-      {0, 0, 128, 172, 0, 344, 0, 256, 344},  // #7
-      {0, 0, 172, 128, 0, 344, 0, 344, 256},  // #7
-      {0, 0, 172, 128, 100, 344, 0, 344, 100},// #8
-      {0, 0, 128, 172, 100, 344, 0, 100, 344} // #8
-  };
-  // clang-format onn
-
-  for (const auto& tc : test_cases) {
-      size_t destW = 0;
-      size_t destH = 0;
-      ffmpeg::Util::setFormatDimensions(destW, destH, tc[0], tc[1], tc[2], tc[3], tc[4], tc[5], tc[6]);
-      CHECK(destW == tc[7]);
-      CHECK(destH == tc[8]);
-  }
-}
diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp
deleted file mode 100644
index 8b712609e34..00000000000
--- a/torchvision/csrc/io/decoder/video_sampler.cpp
+++ /dev/null
@@ -1,337 +0,0 @@
-#include "video_sampler.h"
-#include <c10/util/Logging.h>
-#include "util.h"
-
-// www.ffmpeg.org/doxygen/0.5/swscale-example_8c-source.html
-
-namespace ffmpeg {
-
-namespace {
-
-// Setup the data pointers and linesizes based on the specified image
-// parameters and the provided array. This sets up "planes" to point to a
-// "buffer"
-// NOTE: this is most likely culprit behind #3534
-//
-// Args:
-// fmt: desired output video format
-// buffer: source constant image buffer (in different format) that will contain
-// the final image after SWScale planes: destination data pointer to be filled
-// lineSize: target destination linesize (always {0})
-int preparePlanes(
-    const VideoFormat& fmt,
-    const uint8_t* buffer,
-    uint8_t** planes,
-    int* lineSize) {
-  int result;
-
-  // NOTE: 1 at the end of av_fill_arrays is the value used for alignment
-  if ((result = av_image_fill_arrays(
-           planes,
-           lineSize,
-           buffer,
-           (AVPixelFormat)fmt.format,
-           fmt.width,
-           fmt.height,
-           1)) < 0) {
-    LOG(ERROR) << "av_image_fill_arrays failed, err: "
-               << Util::generateErrorDesc(result);
-  }
-  return result;
-}
-
-// Scale (and crop) the image slice in srcSlice and put the resulting scaled
-// slice to `planes` buffer, which is mapped to be `out` via preparePlanes as
-// `sws_scale` cannot access buffers directly.
-//
-// Args:
-// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if
-// scale) srcSlice: frame data in YUV420P srcStride: the array containing the
-// strides for each plane of the source
-//            image (from AVFrame->linesize[0])
-// out: destination buffer
-// planes: indirect destination buffer (mapped to "out" via preparePlanes)
-// lines: destination linesize; constant {0}
-int transformImage(
-    SwsContext* context,
-    const uint8_t* const srcSlice[],
-    int srcStride[],
-    VideoFormat inFormat,
-    VideoFormat outFormat,
-    uint8_t* out,
-    uint8_t* planes[],
-    int lines[]) {
-  int result;
-  if ((result = preparePlanes(outFormat, out, planes, lines)) < 0) {
-    return result;
-  }
-  if (context) {
-    // NOTE: srcY stride always 0: this is a parameter of YUV format
-    if ((result = sws_scale(
-             context, srcSlice, srcStride, 0, inFormat.height, planes, lines)) <
-        0) {
-      LOG(ERROR) << "sws_scale failed, err: "
-                 << Util::generateErrorDesc(result);
-      return result;
-    }
-  } else if (
-      inFormat.width == outFormat.width &&
-      inFormat.height == outFormat.height &&
-      inFormat.format == outFormat.format) {
-    // Copy planes without using sws_scale if sws_getContext failed.
-    av_image_copy(
-        planes,
-        lines,
-        (const uint8_t**)srcSlice,
-        srcStride,
-        (AVPixelFormat)inFormat.format,
-        inFormat.width,
-        inFormat.height);
-  } else {
-    LOG(ERROR) << "Invalid scale context format " << inFormat.format;
-    return AVERROR(EINVAL);
-  }
-  return 0;
-}
-} // namespace
-
-VideoSampler::VideoSampler(int swsFlags, int64_t loggingUuid)
-    : swsFlags_(swsFlags), loggingUuid_(loggingUuid) {}
-
-VideoSampler::~VideoSampler() {
-  cleanUp();
-}
-
-void VideoSampler::shutdown() {
-  cleanUp();
-}
-
-bool VideoSampler::init(const SamplerParameters& params) {
-  cleanUp();
-
-  if (params.out.video.cropImage != 0) {
-    if (!Util::validateVideoFormat(params.out.video)) {
-      LOG(ERROR) << "Invalid video format"
-                 << ", width: " << params.out.video.width
-                 << ", height: " << params.out.video.height
-                 << ", format: " << params.out.video.format
-                 << ", minDimension: " << params.out.video.minDimension
-                 << ", crop: " << params.out.video.cropImage;
-
-      return false;
-    }
-
-    scaleFormat_.format = params.out.video.format;
-    Util::setFormatDimensions(
-        scaleFormat_.width,
-        scaleFormat_.height,
-        params.out.video.width,
-        params.out.video.height,
-        params.in.video.width,
-        params.in.video.height,
-        0,
-        0,
-        1);
-
-    if (!(scaleFormat_ == params_.out.video)) { // crop required
-      cropContext_ = sws_getContext(
-          params.out.video.width,
-          params.out.video.height,
-          (AVPixelFormat)params.out.video.format,
-          params.out.video.width,
-          params.out.video.height,
-          (AVPixelFormat)params.out.video.format,
-          swsFlags_,
-          nullptr,
-          nullptr,
-          nullptr);
-
-      if (!cropContext_) {
-        LOG(ERROR) << "sws_getContext failed for crop context";
-        return false;
-      }
-
-      const auto scaleImageSize = av_image_get_buffer_size(
-          (AVPixelFormat)scaleFormat_.format,
-          scaleFormat_.width,
-          scaleFormat_.height,
-          1);
-      scaleBuffer_.resize(scaleImageSize);
-    }
-  } else {
-    scaleFormat_ = params.out.video;
-  }
-
-  VLOG(1) << "Input format #" << loggingUuid_ << ", width "
-          << params.in.video.width << ", height " << params.in.video.height
-          << ", format " << params.in.video.format << ", minDimension "
-          << params.in.video.minDimension << ", cropImage "
-          << params.in.video.cropImage;
-  VLOG(1) << "Scale format #" << loggingUuid_ << ", width "
-          << scaleFormat_.width << ", height " << scaleFormat_.height
-          << ", format " << scaleFormat_.format << ", minDimension "
-          << scaleFormat_.minDimension << ", cropImage "
-          << scaleFormat_.cropImage;
-  VLOG(1) << "Crop format #" << loggingUuid_ << ", width "
-          << params.out.video.width << ", height " << params.out.video.height
-          << ", format " << params.out.video.format << ", minDimension "
-          << params.out.video.minDimension << ", cropImage "
-          << params.out.video.cropImage;
-
-  // set output format
-  params_ = params;
-
-  if (params.in.video.format == AV_PIX_FMT_YUV420P) {
-    /* When the video width and height are not multiples of 8,
-     * and there is no size change in the conversion,
-     * a blurry screen will appear on the right side
-     * This problem was discovered in 2012 and
-     * continues to exist in version 4.1.3 in 2019
-     * This problem can be avoided by increasing SWS_ACCURATE_RND
-     * details https://trac.ffmpeg.org/ticket/1582
-     */
-    if ((params.in.video.width & 0x7) || (params.in.video.height & 0x7)) {
-      VLOG(1) << "The width " << params.in.video.width << " and height "
-              << params.in.video.height << " the image is not a multiple of 8, "
-              << "the decoding speed may be reduced";
-      swsFlags_ |= SWS_ACCURATE_RND;
-    }
-  }
-
-  scaleContext_ = sws_getContext(
-      params.in.video.width,
-      params.in.video.height,
-      (AVPixelFormat)params.in.video.format,
-      scaleFormat_.width,
-      scaleFormat_.height,
-      (AVPixelFormat)scaleFormat_.format,
-      swsFlags_,
-      nullptr,
-      nullptr,
-      nullptr);
-  // sws_getContext might fail if in/out format == AV_PIX_FMT_PAL8 (png format)
-  // Return true if input and output formats/width/height are identical
-  // Check scaleContext_ for nullptr in transformImage to copy planes directly
-
-  if (params.in.video.width == scaleFormat_.width &&
-      params.in.video.height == scaleFormat_.height &&
-      params.in.video.format == scaleFormat_.format) {
-    return true;
-  }
-  return scaleContext_ != nullptr;
-}
-
-// Main body of the sample function called from one of the overloads below
-//
-// Args:
-// srcSlice: decoded AVFrame->data perpared buffer
-// srcStride: linesize (usually obtained from AVFrame->linesize)
-// out: return buffer (ByteStorage*)
-int VideoSampler::sample(
-    const uint8_t* const srcSlice[],
-    int srcStride[],
-    ByteStorage* out) {
-  int result;
-  // scaled and cropped image
-  int outImageSize = av_image_get_buffer_size(
-      (AVPixelFormat)params_.out.video.format,
-      params_.out.video.width,
-      params_.out.video.height,
-      1);
-
-  out->ensure(outImageSize);
-
-  uint8_t* scalePlanes[4] = {nullptr};
-  int scaleLines[4] = {0};
-  // perform scale first
-  if ((result = transformImage(
-           scaleContext_,
-           srcSlice,
-           srcStride,
-           params_.in.video,
-           scaleFormat_,
-           // for crop use internal buffer
-           cropContext_ ? scaleBuffer_.data() : out->writableTail(),
-           scalePlanes,
-           scaleLines))) {
-    return result;
-  }
-
-  // is crop required?
-  if (cropContext_) {
-    uint8_t* cropPlanes[4] = {nullptr};
-    int cropLines[4] = {0};
-
-    if (params_.out.video.height < scaleFormat_.height) {
-      // Destination image is wider of source image: cut top and bottom
-      for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) {
-        scalePlanes[i] += scaleLines[i] *
-            (scaleFormat_.height - params_.out.video.height) / 2;
-      }
-    } else {
-      // Source image is wider of destination image: cut sides
-      for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) {
-        scalePlanes[i] += scaleLines[i] *
-            (scaleFormat_.width - params_.out.video.width) / 2 /
-            scaleFormat_.width;
-      }
-    }
-
-    // crop image
-    if ((result = transformImage(
-             cropContext_,
-             scalePlanes,
-             scaleLines,
-             params_.out.video,
-             params_.out.video,
-             out->writableTail(),
-             cropPlanes,
-             cropLines))) {
-      return result;
-    }
-  }
-
-  out->append(outImageSize);
-  return outImageSize;
-}
-
-// Call from `video_stream.cpp::114` - occurs during file reads
-int VideoSampler::sample(AVFrame* frame, ByteStorage* out) {
-  if (!frame) {
-    return 0; // no flush for videos
-  }
-
-  return sample(frame->data, frame->linesize, out);
-}
-
-// Call from `video_stream.cpp::114` - not sure when this occurs
-int VideoSampler::sample(const ByteStorage* in, ByteStorage* out) {
-  if (!in) {
-    return 0; // no flush for videos
-  }
-
-  int result;
-  uint8_t* inPlanes[4] = {nullptr};
-  int inLineSize[4] = {0};
-
-  if ((result = preparePlanes(
-           params_.in.video, in->data(), inPlanes, inLineSize)) < 0) {
-    return result;
-  }
-
-  return sample(inPlanes, inLineSize, out);
-}
-
-void VideoSampler::cleanUp() {
-  if (scaleContext_) {
-    sws_freeContext(scaleContext_);
-    scaleContext_ = nullptr;
-  }
-  if (cropContext_) {
-    sws_freeContext(cropContext_);
-    cropContext_ = nullptr;
-    scaleBuffer_.clear();
-  }
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/video_sampler.h b/torchvision/csrc/io/decoder/video_sampler.h
deleted file mode 100644
index 47247f2c0c5..00000000000
--- a/torchvision/csrc/io/decoder/video_sampler.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class transcode video frames from one format into another
- */
-
-class VideoSampler : public MediaSampler {
- public:
-  VideoSampler(int swsFlags = SWS_AREA, int64_t loggingUuid = 0);
-
-  ~VideoSampler() override;
-
-  // MediaSampler overrides
-  bool init(const SamplerParameters& params) override;
-  int sample(const ByteStorage* in, ByteStorage* out) override;
-  void shutdown() override;
-
-  // returns number processed/scaling bytes
-  int sample(AVFrame* frame, ByteStorage* out);
-  int getImageBytes() const;
-
- private:
-  // close resources
-  void cleanUp();
-  // helper functions for rescaling, cropping, etc.
-  int sample(
-      const uint8_t* const srcSlice[],
-      int srcStride[],
-      ByteStorage* out);
-
- private:
-  VideoFormat scaleFormat_;
-  SwsContext* scaleContext_{nullptr};
-  SwsContext* cropContext_{nullptr};
-  int swsFlags_{SWS_AREA};
-  std::vector<uint8_t> scaleBuffer_;
-  int64_t loggingUuid_{0};
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/video_stream.cpp b/torchvision/csrc/io/decoder/video_stream.cpp
deleted file mode 100644
index fa08c65cac1..00000000000
--- a/torchvision/csrc/io/decoder/video_stream.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-#include "video_stream.h"
-#include <c10/util/Logging.h>
-#include "util.h"
-
-namespace ffmpeg {
-
-namespace {
-bool operator==(const VideoFormat& x, const AVFrame& y) {
-  return x.width == static_cast<size_t>(y.width) &&
-      x.height == static_cast<size_t>(y.height) && x.format == y.format;
-}
-
-bool operator==(const VideoFormat& x, const AVCodecContext& y) {
-  return x.width == static_cast<size_t>(y.width) &&
-      x.height == static_cast<size_t>(y.height) && x.format == y.pix_fmt;
-}
-
-VideoFormat& toVideoFormat(VideoFormat& x, const AVFrame& y) {
-  x.width = y.width;
-  x.height = y.height;
-  x.format = y.format;
-  return x;
-}
-
-VideoFormat& toVideoFormat(VideoFormat& x, const AVCodecContext& y) {
-  x.width = y.width;
-  x.height = y.height;
-  x.format = y.pix_fmt;
-  return x;
-}
-} // namespace
-
-VideoStream::VideoStream(
-    AVFormatContext* inputCtx,
-    int index,
-    bool convertPtsToWallTime,
-    const VideoFormat& format,
-    int64_t loggingUuid)
-    : Stream(
-          inputCtx,
-          MediaFormat::makeMediaFormat(format, index),
-          convertPtsToWallTime,
-          loggingUuid) {}
-
-VideoStream::~VideoStream() {
-  if (sampler_) {
-    sampler_->shutdown();
-    sampler_.reset();
-  }
-}
-
-int VideoStream::initFormat() {
-  // set output format
-  if (!Util::validateVideoFormat(format_.format.video)) {
-    LOG(ERROR) << "Invalid video format"
-               << ", width: " << format_.format.video.width
-               << ", height: " << format_.format.video.height
-               << ", format: " << format_.format.video.format
-               << ", minDimension: " << format_.format.video.minDimension
-               << ", crop: " << format_.format.video.cropImage;
-    return -1;
-  }
-
-  // keep aspect ratio
-  Util::setFormatDimensions(
-      format_.format.video.width,
-      format_.format.video.height,
-      format_.format.video.width,
-      format_.format.video.height,
-      codecCtx_->width,
-      codecCtx_->height,
-      format_.format.video.minDimension,
-      format_.format.video.maxDimension,
-      0);
-
-  if (format_.format.video.format == AV_PIX_FMT_NONE) {
-    format_.format.video.format = codecCtx_->pix_fmt;
-  }
-  return format_.format.video.width != 0 && format_.format.video.height != 0 &&
-          format_.format.video.format != AV_PIX_FMT_NONE
-      ? 0
-      : -1;
-}
-
-// copies frame bytes via sws_scale call in video_sampler.cpp
-int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) {
-  if (!sampler_) {
-    sampler_ = std::make_unique<VideoSampler>(SWS_AREA, loggingUuid_);
-  }
-
-  // check if input format gets changed
-  if (flush ? !(sampler_->getInputFormat().video == *codecCtx_)
-            : !(sampler_->getInputFormat().video == *frame_)) {
-    // - reinit sampler
-    SamplerParameters params;
-    params.type = format_.type;
-    params.out = format_.format;
-    params.in = FormatUnion(0);
-    flush ? toVideoFormat(params.in.video, *codecCtx_)
-          : toVideoFormat(params.in.video, *frame_);
-    if (!sampler_->init(params)) {
-      return -1;
-    }
-
-    VLOG(1) << "Set input video sampler format"
-            << ", width: " << params.in.video.width
-            << ", height: " << params.in.video.height
-            << ", format: " << params.in.video.format
-            << " : output video sampler format"
-            << ", width: " << format_.format.video.width
-            << ", height: " << format_.format.video.height
-            << ", format: " << format_.format.video.format
-            << ", minDimension: " << format_.format.video.minDimension
-            << ", crop: " << format_.format.video.cropImage;
-  }
-  // calls to a sampler that converts the frame from YUV422 to RGB24, and
-  // optionally crops and resizes the frame. Frame bytes are copied from
-  // frame_->data to out buffer
-  return sampler_->sample(flush ? nullptr : frame_, out);
-}
-
-void VideoStream::setHeader(DecoderHeader* header, bool flush) {
-  Stream::setHeader(header, flush);
-  if (!flush) { // no frames for video flush
-    header->keyFrame = frame_->key_frame;
-    header->fps = av_q2d(av_guess_frame_rate(
-        inputCtx_, inputCtx_->streams[format_.stream], nullptr));
-  }
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/video_stream.h b/torchvision/csrc/io/decoder/video_stream.h
deleted file mode 100644
index e6a8bf02b65..00000000000
--- a/torchvision/csrc/io/decoder/video_stream.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#pragma once
-
-#include "stream.h"
-#include "video_sampler.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode one video stream.
- */
-
-class VideoStream : public Stream {
- public:
-  VideoStream(
-      AVFormatContext* inputCtx,
-      int index,
-      bool convertPtsToWallTime,
-      const VideoFormat& format,
-      int64_t loggingUuid);
-  ~VideoStream() override;
-
- private:
-  int initFormat() override;
-  int copyFrameBytes(ByteStorage* out, bool flush) override;
-  void setHeader(DecoderHeader* header, bool flush) override;
-
- private:
-  std::unique_ptr<VideoSampler> sampler_;
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp
deleted file mode 100644
index 8f1fb3fb5b9..00000000000
--- a/torchvision/csrc/io/video/video.cpp
+++ /dev/null
@@ -1,387 +0,0 @@
-#include "video.h"
-
-#include <regex>
-
-using namespace ffmpeg;
-
-namespace vision {
-namespace video {
-
-namespace {
-
-const size_t decoderTimeoutMs = 600000;
-const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
-
-// returns number of written bytes
-template <typename T>
-size_t fillTensorList(DecoderOutputMessage& msgs, torch::Tensor& frame) {
-  const auto& msg = msgs;
-  T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
-  if (frameData) {
-    auto sizeInBytes = msg.payload->length();
-    memcpy(frameData, msg.payload->data(), sizeInBytes);
-  }
-  return sizeof(T);
-}
-
-size_t fillVideoTensor(DecoderOutputMessage& msgs, torch::Tensor& videoFrame) {
-  return fillTensorList<uint8_t>(msgs, videoFrame);
-}
-
-size_t fillAudioTensor(DecoderOutputMessage& msgs, torch::Tensor& audioFrame) {
-  return fillTensorList<float>(msgs, audioFrame);
-}
-
-std::array<std::pair<std::string, ffmpeg::MediaType>, 4>::const_iterator
-_parse_type(const std::string& stream_string) {
-  static const std::array<std::pair<std::string, MediaType>, 4> types = {{
-      {"video", TYPE_VIDEO},
-      {"audio", TYPE_AUDIO},
-      {"subtitle", TYPE_SUBTITLE},
-      {"cc", TYPE_CC},
-  }};
-  auto device = std::find_if(
-      types.begin(),
-      types.end(),
-      [stream_string](const std::pair<std::string, MediaType>& p) {
-        return p.first == stream_string;
-      });
-  if (device != types.end()) {
-    return device;
-  }
-  TORCH_CHECK(
-      false, "Expected one of [audio, video, subtitle, cc] ", stream_string);
-}
-
-std::string parse_type_to_string(const std::string& stream_string) {
-  auto device = _parse_type(stream_string);
-  return device->first;
-}
-
-MediaType parse_type_to_mt(const std::string& stream_string) {
-  auto device = _parse_type(stream_string);
-  return device->second;
-}
-
-std::tuple<std::string, long> _parseStream(const std::string& streamString) {
-  TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
-  static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
-  std::smatch match;
-
-  TORCH_CHECK(
-      std::regex_match(streamString, match, regex),
-      "Invalid stream string: '",
-      streamString,
-      "'");
-
-  std::string type_ = "video";
-  type_ = parse_type_to_string(match[1].str());
-  long index_ = -1;
-  if (match[2].matched) {
-    try {
-      index_ = std::stoi(match[2].str());
-    } catch (const std::exception&) {
-      TORCH_CHECK(
-          false,
-          "Could not parse device index '",
-          match[2].str(),
-          "' in device string '",
-          streamString,
-          "'");
-    }
-  }
-  return std::make_tuple(type_, index_);
-}
-
-} // namespace
-
-void Video::_getDecoderParams(
-    double videoStartS,
-    int64_t getPtsOnly,
-    std::string stream,
-    long stream_id = -1,
-    bool fastSeek = true,
-    bool all_streams = false,
-    int64_t num_threads = 1,
-    double seekFrameMarginUs = 10) {
-  int64_t videoStartUs = int64_t(videoStartS * 1e6);
-
-  params.timeoutMs = decoderTimeoutMs;
-  params.startOffset = videoStartUs;
-  params.seekAccuracy = seekFrameMarginUs;
-  params.fastSeek = fastSeek;
-  params.headerOnly = false;
-  params.numThreads = num_threads;
-
-  params.preventStaleness = false; // not sure what this is about
-
-  if (all_streams == true) {
-    MediaFormat format;
-    format.stream = -2;
-    format.type = TYPE_AUDIO;
-    params.formats.insert(format);
-
-    format.type = TYPE_VIDEO;
-    format.stream = -2;
-    format.format.video.width = 0;
-    format.format.video.height = 0;
-    format.format.video.cropImage = 0;
-    format.format.video.format = defaultVideoPixelFormat;
-    params.formats.insert(format);
-
-    format.type = TYPE_SUBTITLE;
-    format.stream = -2;
-    params.formats.insert(format);
-
-    format.type = TYPE_CC;
-    format.stream = -2;
-    params.formats.insert(format);
-  } else {
-    // parse stream type
-    MediaType stream_type = parse_type_to_mt(stream);
-
-    // TODO: reset params.formats
-    std::set<MediaFormat> formats;
-    params.formats = formats;
-    // Define new format
-    MediaFormat format;
-    format.type = stream_type;
-    format.stream = stream_id;
-    if (stream_type == TYPE_VIDEO) {
-      format.format.video.width = 0;
-      format.format.video.height = 0;
-      format.format.video.cropImage = 0;
-      format.format.video.format = defaultVideoPixelFormat;
-    }
-    params.formats.insert(format);
-  }
-
-} // _get decoder params
-
-void Video::initFromFile(
-    std::string videoPath,
-    std::string stream,
-    int64_t numThreads) {
-  TORCH_CHECK(!initialized, "Video object can only be initialized once");
-  initialized = true;
-  params.uri = videoPath;
-  _init(stream, numThreads);
-}
-
-void Video::initFromMemory(
-    torch::Tensor videoTensor,
-    std::string stream,
-    int64_t numThreads) {
-  TORCH_CHECK(!initialized, "Video object can only be initialized once");
-  initialized = true;
-  callback = MemoryBuffer::getCallback(
-      videoTensor.data_ptr<uint8_t>(), videoTensor.size(0));
-  _init(stream, numThreads);
-}
-
-void Video::_init(std::string stream, int64_t numThreads) {
-  // set number of threads global
-  numThreads_ = numThreads;
-  // parse stream information
-  current_stream = _parseStream(stream);
-  // note that in the initial call we want to get all streams
-  _getDecoderParams(
-      0, // video start
-      0, // headerOnly
-      std::get<0>(current_stream), // stream info - remove that
-      long(-1), // stream_id parsed from info above change to -2
-      false, // fastseek: we're using the default param here
-      true, // read all streams
-      numThreads_ // global number of Threads for decoding
-  );
-
-  std::string logMessage, logType;
-
-  // locals
-  std::vector<double> audioFPS, videoFPS;
-  std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
-  std::vector<double> audioTB, videoTB, ccTB, subsTB;
-  c10::Dict<std::string, std::vector<double>> audioMetadata;
-  c10::Dict<std::string, std::vector<double>> videoMetadata;
-  c10::Dict<std::string, std::vector<double>> ccMetadata;
-  c10::Dict<std::string, std::vector<double>> subsMetadata;
-
-  // callback and metadata defined in struct
-  DecoderInCallback tmp_callback = callback;
-  succeeded = decoder.init(params, std::move(tmp_callback), &metadata);
-  if (succeeded) {
-    for (const auto& header : metadata) {
-      double fps = double(header.fps);
-      double duration = double(header.duration) * 1e-6; // * timeBase;
-
-      if (header.format.type == TYPE_VIDEO) {
-        videoFPS.push_back(fps);
-        videoDuration.push_back(duration);
-      } else if (header.format.type == TYPE_AUDIO) {
-        audioFPS.push_back(fps);
-        audioDuration.push_back(duration);
-      } else if (header.format.type == TYPE_CC) {
-        ccDuration.push_back(duration);
-      } else if (header.format.type == TYPE_SUBTITLE) {
-        subsDuration.push_back(duration);
-      };
-    }
-  }
-  // audio
-  audioMetadata.insert("duration", audioDuration);
-  audioMetadata.insert("framerate", audioFPS);
-  // video
-  videoMetadata.insert("duration", videoDuration);
-  videoMetadata.insert("fps", videoFPS);
-  // subs
-  subsMetadata.insert("duration", subsDuration);
-  // cc
-  ccMetadata.insert("duration", ccDuration);
-  // put all to a data
-  streamsMetadata.insert("video", videoMetadata);
-  streamsMetadata.insert("audio", audioMetadata);
-  streamsMetadata.insert("subtitles", subsMetadata);
-  streamsMetadata.insert("cc", ccMetadata);
-
-  succeeded = setCurrentStream(stream);
-  if (std::get<1>(current_stream) != -1) {
-    LOG(INFO)
-        << "Stream index set to " << std::get<1>(current_stream)
-        << ". If you encounter trouble, consider switching it to automatic stream discovery. \n";
-  }
-}
-
-Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
-  C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video");
-  if (!videoPath.empty()) {
-    initFromFile(videoPath, stream, numThreads);
-  }
-} // video
-
-bool Video::setCurrentStream(std::string stream = "video") {
-  TORCH_CHECK(initialized, "Video object has to be initialized first");
-  if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
-    current_stream = _parseStream(stream);
-  }
-
-  double ts = 0;
-  if (seekTS > 0) {
-    ts = seekTS;
-  }
-
-  _getDecoderParams(
-      ts, // video start
-      0, // headerOnly
-      std::get<0>(current_stream), // stream
-      long(std::get<1>(
-          current_stream)), // stream_id parsed from info above change to -2
-      false, // fastseek param set to 0 false by default (changed in seek)
-      false, // read all streams
-      numThreads_ // global number of threads
-  );
-
-  // callback and metadata defined in Video.h
-  DecoderInCallback tmp_callback = callback;
-  return (decoder.init(params, std::move(tmp_callback), &metadata));
-}
-
-std::tuple<std::string, int64_t> Video::getCurrentStream() const {
-  TORCH_CHECK(initialized, "Video object has to be initialized first");
-  return current_stream;
-}
-
-c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video::
-    getStreamMetadata() const {
-  TORCH_CHECK(initialized, "Video object has to be initialized first");
-  return streamsMetadata;
-}
-
-void Video::Seek(double ts, bool fastSeek = false) {
-  TORCH_CHECK(initialized, "Video object has to be initialized first");
-  // initialize the class variables used for seeking and retrurn
-  _getDecoderParams(
-      ts, // video start
-      0, // headerOnly
-      std::get<0>(current_stream), // stream
-      long(std::get<1>(
-          current_stream)), // stream_id parsed from info above change to -2
-      fastSeek, // fastseek
-      false, // read all streams
-      numThreads_ // global number of threads
-  );
-
-  // callback and metadata defined in Video.h
-  DecoderInCallback tmp_callback = callback;
-  succeeded = decoder.init(params, std::move(tmp_callback), &metadata);
-}
-
-std::tuple<torch::Tensor, double> Video::Next() {
-  TORCH_CHECK(initialized, "Video object has to be initialized first");
-  // if failing to decode simply return a null tensor (note, should we
-  // raise an exception?)
-  double frame_pts_s;
-  torch::Tensor outFrame = torch::zeros({0}, torch::kByte);
-
-  // decode single frame
-  DecoderOutputMessage out;
-  int64_t res = decoder.decode(&out, decoderTimeoutMs);
-  // if successful
-  if (res == 0) {
-    frame_pts_s = double(double(out.header.pts) * 1e-6);
-
-    auto header = out.header;
-    const auto& format = header.format;
-
-    // initialize the output variables based on type
-
-    if (format.type == TYPE_VIDEO) {
-      // note: this can potentially be optimized
-      // by having the global tensor that we fill at decode time
-      // (would avoid allocations)
-      int outHeight = format.format.video.height;
-      int outWidth = format.format.video.width;
-      int numChannels = 3;
-      outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
-      fillVideoTensor(out, outFrame);
-      outFrame = outFrame.permute({2, 0, 1});
-
-    } else if (format.type == TYPE_AUDIO) {
-      int outAudioChannels = format.format.audio.channels;
-      int bytesPerSample = av_get_bytes_per_sample(
-          static_cast<AVSampleFormat>(format.format.audio.format));
-      int frameSizeTotal = out.payload->length();
-
-      TORCH_CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
-      int numAudioSamples =
-          frameSizeTotal / (outAudioChannels * bytesPerSample);
-
-      outFrame =
-          torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
-
-      fillAudioTensor(out, outFrame);
-    }
-    // currently not supporting other formats (will do soon)
-
-    out.payload.reset();
-  } else if (res == ENODATA) {
-    LOG(INFO) << "Decoder ran out of frames (ENODATA)\n";
-  } else {
-    LOG(ERROR) << "Decoder failed with ERROR_CODE " << res;
-  }
-
-  return std::make_tuple(outFrame, frame_pts_s);
-}
-
-static auto registerVideo =
-    torch::class_<Video>("torchvision", "Video")
-        .def(torch::init<std::string, std::string, int64_t>())
-        .def("init_from_file", &Video::initFromFile)
-        .def("init_from_memory", &Video::initFromMemory)
-        .def("get_current_stream", &Video::getCurrentStream)
-        .def("set_current_stream", &Video::setCurrentStream)
-        .def("get_metadata", &Video::getStreamMetadata)
-        .def("seek", &Video::Seek)
-        .def("next", &Video::Next);
-
-} // namespace video
-} // namespace vision
diff --git a/torchvision/csrc/io/video/video.h b/torchvision/csrc/io/video/video.h
deleted file mode 100644
index e57fc3ae6b7..00000000000
--- a/torchvision/csrc/io/video/video.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#pragma once
-
-#include <torch/types.h>
-
-#include "../decoder/defs.h"
-#include "../decoder/memory_buffer.h"
-#include "../decoder/sync_decoder.h"
-
-namespace vision {
-namespace video {
-
-struct Video : torch::CustomClassHolder {
-  std::tuple<std::string, long> current_stream; // stream type, id
-  // global video metadata
-  c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>>
-      streamsMetadata;
-  int64_t numThreads_{0};
-
- public:
-  Video(
-      std::string videoPath = std::string(),
-      std::string stream = std::string("video"),
-      int64_t numThreads = 0);
-  void initFromFile(
-      std::string videoPath,
-      std::string stream,
-      int64_t numThreads);
-  void initFromMemory(
-      torch::Tensor videoTensor,
-      std::string stream,
-      int64_t numThreads);
-
-  std::tuple<std::string, int64_t> getCurrentStream() const;
-  c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>>
-  getStreamMetadata() const;
-  void Seek(double ts, bool fastSeek);
-  bool setCurrentStream(std::string stream);
-  std::tuple<torch::Tensor, double> Next();
-
- private:
-  bool succeeded = false; // decoder init flag
-  // seekTS and doSeek act as a flag - if it's not set, next function simply
-  // returns the next frame. If it's set, we look at the global seek
-  // time in combination with any_frame settings
-  double seekTS = -1;
-
-  bool initialized = false;
-
-  void _init(
-      std::string stream,
-      int64_t numThreads); // expects params.uri OR callback to be set
-
-  void _getDecoderParams(
-      double videoStartS,
-      int64_t getPtsOnly,
-      std::string stream,
-      long stream_id,
-      bool fastSeek,
-      bool all_streams,
-      int64_t num_threads,
-      double seekFrameMarginUs); // this needs to be improved
-
-  std::map<std::string, std::vector<double>> streamTimeBase; // not used
-
-  ffmpeg::DecoderInCallback callback = nullptr;
-  std::vector<ffmpeg::DecoderMetadata> metadata;
-
- protected:
-  ffmpeg::SyncDecoder decoder;
-  ffmpeg::DecoderParameters params;
-
-}; // struct Video
-
-} // namespace video
-} // namespace vision
diff --git a/torchvision/csrc/io/video_reader/video_reader.cpp b/torchvision/csrc/io/video_reader/video_reader.cpp
deleted file mode 100644
index f9a5e9085d8..00000000000
--- a/torchvision/csrc/io/video_reader/video_reader.cpp
+++ /dev/null
@@ -1,677 +0,0 @@
-#include "video_reader.h"
-
-#include "../decoder/memory_buffer.h"
-#include "../decoder/sync_decoder.h"
-
-// If we are in a Windows environment, we need to define
-// initialization functions for the _custom_ops extension
-#ifdef _WIN32
-void* PyInit_video_reader(void) {
-  return nullptr;
-}
-#endif
-
-using namespace ffmpeg;
-
-namespace vision {
-namespace video_reader {
-
-namespace {
-
-const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
-const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
-const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
-const size_t decoderTimeoutMs = 600000;
-// A jitter can be added to the end of the range to avoid conversion/rounding
-// error, small value 100us won't be enough to select the next frame, but enough
-// to compensate rounding error due to the multiple conversions.
-const size_t timeBaseJitterUs = 100;
-
-DecoderParameters getDecoderParams(
-    int64_t videoStartUs,
-    int64_t videoEndUs,
-    double seekFrameMarginUs,
-    int64_t getPtsOnly,
-    int64_t readVideoStream,
-    int videoWidth,
-    int videoHeight,
-    int videoMinDimension,
-    int videoMaxDimension,
-    int64_t readAudioStream,
-    int audioSamples,
-    int audioChannels) {
-  DecoderParameters params;
-  params.headerOnly = getPtsOnly != 0;
-  params.seekAccuracy = seekFrameMarginUs;
-  params.startOffset = videoStartUs;
-  params.endOffset = videoEndUs;
-  params.timeoutMs = decoderTimeoutMs;
-  params.preventStaleness = false;
-
-  if (readVideoStream == 1) {
-    MediaFormat videoFormat(0);
-    videoFormat.type = TYPE_VIDEO;
-    videoFormat.format.video.format = defaultVideoPixelFormat;
-    videoFormat.format.video.width = videoWidth;
-    videoFormat.format.video.height = videoHeight;
-    videoFormat.format.video.minDimension = videoMinDimension;
-    videoFormat.format.video.maxDimension = videoMaxDimension;
-    params.formats.insert(videoFormat);
-  }
-
-  if (readAudioStream == 1) {
-    MediaFormat audioFormat;
-    audioFormat.type = TYPE_AUDIO;
-    audioFormat.format.audio.format = defaultAudioSampleFormat;
-    audioFormat.format.audio.samples = audioSamples;
-    audioFormat.format.audio.channels = audioChannels;
-    params.formats.insert(audioFormat);
-  }
-
-  return params;
-}
-
-// returns number of written bytes
-template <typename T>
-size_t fillTensor(
-    std::vector<DecoderOutputMessage>& msgs,
-    torch::Tensor& frame,
-    torch::Tensor& framePts,
-    int64_t num,
-    int64_t den) {
-  if (msgs.empty()) {
-    return 0;
-  }
-  T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
-  int64_t* framePtsData = framePts.data_ptr<int64_t>();
-  TORCH_CHECK_EQ(framePts.size(0), (int64_t)msgs.size());
-  size_t avgElementsInFrame = frame.numel() / msgs.size();
-
-  size_t offset = 0;
-  for (size_t i = 0; i < msgs.size(); ++i) {
-    const auto& msg = msgs[i];
-    // convert pts into original time_base
-    AVRational avr = AVRational{(int)num, (int)den};
-    framePtsData[i] = av_rescale_q(msg.header.pts, timeBaseQ, avr);
-    VLOG(2) << "PTS type: " << sizeof(T) << ", us: " << msg.header.pts
-            << ", original: " << framePtsData[i];
-
-    if (frameData) {
-      auto sizeInBytes = msg.payload->length();
-      memcpy(frameData + offset, msg.payload->data(), sizeInBytes);
-      if (sizeof(T) == sizeof(uint8_t)) {
-        // Video - move by allocated frame size
-        offset += avgElementsInFrame / sizeof(T);
-      } else {
-        // Audio - move by number of samples
-        offset += sizeInBytes / sizeof(T);
-      }
-    }
-  }
-  return offset * sizeof(T);
-}
-
-size_t fillVideoTensor(
-    std::vector<DecoderOutputMessage>& msgs,
-    torch::Tensor& videoFrame,
-    torch::Tensor& videoFramePts,
-    int64_t num,
-    int64_t den) {
-  return fillTensor<uint8_t>(msgs, videoFrame, videoFramePts, num, den);
-}
-
-size_t fillAudioTensor(
-    std::vector<DecoderOutputMessage>& msgs,
-    torch::Tensor& audioFrame,
-    torch::Tensor& audioFramePts,
-    int64_t num,
-    int64_t den) {
-  return fillTensor<float>(msgs, audioFrame, audioFramePts, num, den);
-}
-
-void offsetsToUs(
-    double& seekFrameMargin,
-    int64_t readVideoStream,
-    int64_t videoStartPts,
-    int64_t videoEndPts,
-    int64_t videoTimeBaseNum,
-    int64_t videoTimeBaseDen,
-    int64_t readAudioStream,
-    int64_t audioStartPts,
-    int64_t audioEndPts,
-    int64_t audioTimeBaseNum,
-    int64_t audioTimeBaseDen,
-    int64_t& videoStartUs,
-    int64_t& videoEndUs) {
-  seekFrameMargin *= AV_TIME_BASE;
-  videoStartUs = 0;
-  videoEndUs = -1;
-
-  if (readVideoStream) {
-    AVRational vr = AVRational{(int)videoTimeBaseNum, (int)videoTimeBaseDen};
-    if (videoStartPts > 0) {
-      videoStartUs = av_rescale_q(videoStartPts, vr, timeBaseQ);
-    }
-    if (videoEndPts > 0) {
-      // Add jitter to the end of the range to avoid conversion/rounding error.
-      // Small value 100us won't be enough to select the next frame, but enough
-      // to compensate rounding error due to the multiple conversions.
-      videoEndUs = timeBaseJitterUs + av_rescale_q(videoEndPts, vr, timeBaseQ);
-    }
-  } else if (readAudioStream) {
-    AVRational ar = AVRational{(int)audioTimeBaseNum, (int)audioTimeBaseDen};
-    if (audioStartPts > 0) {
-      videoStartUs = av_rescale_q(audioStartPts, ar, timeBaseQ);
-    }
-    if (audioEndPts > 0) {
-      // Add jitter to the end of the range to avoid conversion/rounding error.
-      // Small value 100us won't be enough to select the next frame, but enough
-      // to compensate rounding error due to the multiple conversions.
-      videoEndUs = timeBaseJitterUs + av_rescale_q(audioEndPts, ar, timeBaseQ);
-    }
-  }
-}
-
-torch::List<torch::Tensor> readVideo(
-    bool isReadFile,
-    const torch::Tensor& input_video,
-    std::string videoPath,
-    double seekFrameMargin,
-    int64_t getPtsOnly,
-    int64_t readVideoStream,
-    int64_t width,
-    int64_t height,
-    int64_t minDimension,
-    int64_t maxDimension,
-    int64_t videoStartPts,
-    int64_t videoEndPts,
-    int64_t videoTimeBaseNum,
-    int64_t videoTimeBaseDen,
-    int64_t readAudioStream,
-    int64_t audioSamples,
-    int64_t audioChannels,
-    int64_t audioStartPts,
-    int64_t audioEndPts,
-    int64_t audioTimeBaseNum,
-    int64_t audioTimeBaseDen) {
-  int64_t videoStartUs, videoEndUs;
-
-  offsetsToUs(
-      seekFrameMargin,
-      readVideoStream,
-      videoStartPts,
-      videoEndPts,
-      videoTimeBaseNum,
-      videoTimeBaseDen,
-      readAudioStream,
-      audioStartPts,
-      audioEndPts,
-      audioTimeBaseNum,
-      audioTimeBaseDen,
-      videoStartUs,
-      videoEndUs);
-
-  DecoderParameters params = getDecoderParams(
-      videoStartUs, // videoStartPts
-      videoEndUs, // videoEndPts
-      seekFrameMargin, // seekFrameMargin
-      getPtsOnly, // getPtsOnly
-      readVideoStream, // readVideoStream
-      width, // width
-      height, // height
-      minDimension, // minDimension
-      maxDimension, // maxDimension
-      readAudioStream, // readAudioStream
-      audioSamples, // audioSamples
-      audioChannels // audioChannels
-  );
-
-  SyncDecoder decoder;
-  std::vector<DecoderOutputMessage> audioMessages, videoMessages;
-  DecoderInCallback callback = nullptr;
-  std::string logMessage, logType;
-  if (isReadFile) {
-    params.uri = videoPath;
-    logType = "file";
-    logMessage = videoPath;
-  } else {
-    callback = MemoryBuffer::getCallback(
-        input_video.data_ptr<uint8_t>(), input_video.size(0));
-    logType = "memory";
-    logMessage = std::to_string(input_video.size(0));
-  }
-
-  VLOG(1) << "Video decoding from " << logType << " [" << logMessage
-          << "] has started";
-
-  const auto now = std::chrono::system_clock::now();
-
-  bool succeeded;
-  DecoderMetadata audioMetadata, videoMetadata;
-  std::vector<DecoderMetadata> metadata;
-  if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
-    for (const auto& header : metadata) {
-      if (header.format.type == TYPE_VIDEO) {
-        videoMetadata = header;
-      } else if (header.format.type == TYPE_AUDIO) {
-        audioMetadata = header;
-      }
-    }
-    int res;
-    DecoderOutputMessage msg;
-    while (0 == (res = decoder.decode(&msg, decoderTimeoutMs))) {
-      if (msg.header.format.type == TYPE_VIDEO) {
-        videoMessages.push_back(std::move(msg));
-      }
-      if (msg.header.format.type == TYPE_AUDIO) {
-        audioMessages.push_back(std::move(msg));
-      }
-      msg.payload.reset();
-    }
-  } else {
-    LOG(ERROR) << "Decoder initialization has failed";
-  }
-  const auto then = std::chrono::system_clock::now();
-  VLOG(1) << "Video decoding from " << logType << " [" << logMessage
-          << "] has finished, "
-          << std::chrono::duration_cast<std::chrono::microseconds>(then - now)
-                 .count()
-          << " us";
-
-  decoder.shutdown();
-
-  // video section
-  torch::Tensor videoFrame = torch::zeros({0}, torch::kByte);
-  torch::Tensor videoFramePts = torch::zeros({0}, torch::kLong);
-  torch::Tensor videoTimeBase = torch::zeros({0}, torch::kInt);
-  torch::Tensor videoFps = torch::zeros({0}, torch::kFloat);
-  torch::Tensor videoDuration = torch::zeros({0}, torch::kLong);
-
-  if (succeeded && readVideoStream == 1) {
-    if (!videoMessages.empty()) {
-      const auto& header = videoMetadata;
-      const auto& format = header.format.format.video;
-      int numVideoFrames = videoMessages.size();
-      int outHeight = format.height;
-      int outWidth = format.width;
-      int numChannels = 3; // decoder guarantees the default AV_PIX_FMT_RGB24
-
-      size_t expectedWrittenBytes = 0;
-      if (getPtsOnly == 0) {
-        videoFrame = torch::zeros(
-            {numVideoFrames, outHeight, outWidth, numChannels}, torch::kByte);
-        expectedWrittenBytes =
-            (size_t)numVideoFrames * outHeight * outWidth * numChannels;
-      }
-
-      videoFramePts = torch::zeros({numVideoFrames}, torch::kLong);
-
-      VLOG(2) << "video duration: " << header.duration
-              << ", fps: " << header.fps << ", num: " << header.num
-              << ", den: " << header.den << ", num frames: " << numVideoFrames;
-
-      auto numberWrittenBytes = fillVideoTensor(
-          videoMessages, videoFrame, videoFramePts, header.num, header.den);
-
-      TORCH_CHECK_EQ(numberWrittenBytes, expectedWrittenBytes);
-
-      videoTimeBase = torch::zeros({2}, torch::kInt);
-      int* videoTimeBaseData = videoTimeBase.data_ptr<int>();
-      videoTimeBaseData[0] = header.num;
-      videoTimeBaseData[1] = header.den;
-
-      videoFps = torch::zeros({1}, torch::kFloat);
-      float* videoFpsData = videoFps.data_ptr<float>();
-      videoFpsData[0] = header.fps;
-
-      videoDuration = torch::zeros({1}, torch::kLong);
-      int64_t* videoDurationData = videoDuration.data_ptr<int64_t>();
-      AVRational vr = AVRational{(int)header.num, (int)header.den};
-      videoDurationData[0] = av_rescale_q(header.duration, timeBaseQ, vr);
-      VLOG(1) << "Video decoding from " << logType << " [" << logMessage
-              << "] filled video tensors";
-    } else {
-      VLOG(1) << "Miss video stream";
-    }
-  }
-
-  // audio section
-  torch::Tensor audioFrame = torch::zeros({0}, torch::kFloat);
-  torch::Tensor audioFramePts = torch::zeros({0}, torch::kLong);
-  torch::Tensor audioTimeBase = torch::zeros({0}, torch::kInt);
-  torch::Tensor audioSampleRate = torch::zeros({0}, torch::kInt);
-  torch::Tensor audioDuration = torch::zeros({0}, torch::kLong);
-  if (succeeded && readAudioStream == 1) {
-    if (!audioMessages.empty()) {
-      const auto& header = audioMetadata;
-      const auto& format = header.format.format.audio;
-
-      int64_t outAudioChannels = format.channels;
-      int bytesPerSample =
-          av_get_bytes_per_sample(static_cast<AVSampleFormat>(format.format));
-
-      int numAudioFrames = audioMessages.size();
-      int64_t numAudioSamples = 0;
-      if (getPtsOnly == 0) {
-        int64_t frameSizeTotal = 0;
-        for (auto const& audioMessage : audioMessages) {
-          frameSizeTotal += audioMessage.payload->length();
-        }
-
-        TORCH_CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
-        numAudioSamples = frameSizeTotal / (outAudioChannels * bytesPerSample);
-
-        audioFrame =
-            torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
-      }
-      audioFramePts = torch::zeros({numAudioFrames}, torch::kLong);
-
-      VLOG(2) << "audio duration: " << header.duration
-              << ", channels: " << format.channels
-              << ", sample rate: " << format.samples << ", num: " << header.num
-              << ", den: " << header.den;
-
-      auto numberWrittenBytes = fillAudioTensor(
-          audioMessages, audioFrame, audioFramePts, header.num, header.den);
-      TORCH_CHECK_EQ(
-          numberWrittenBytes,
-          numAudioSamples * outAudioChannels * sizeof(float));
-
-      audioTimeBase = torch::zeros({2}, torch::kInt);
-      int* audioTimeBaseData = audioTimeBase.data_ptr<int>();
-      audioTimeBaseData[0] = header.num;
-      audioTimeBaseData[1] = header.den;
-
-      audioSampleRate = torch::zeros({1}, torch::kInt);
-      int* audioSampleRateData = audioSampleRate.data_ptr<int>();
-      audioSampleRateData[0] = format.samples;
-
-      audioDuration = torch::zeros({1}, torch::kLong);
-      int64_t* audioDurationData = audioDuration.data_ptr<int64_t>();
-      AVRational ar = AVRational{(int)header.num, (int)header.den};
-      audioDurationData[0] = av_rescale_q(header.duration, timeBaseQ, ar);
-      VLOG(1) << "Video decoding from " << logType << " [" << logMessage
-              << "] filled audio tensors";
-    } else {
-      VLOG(1) << "Miss audio stream";
-    }
-  }
-
-  torch::List<torch::Tensor> result;
-  result.push_back(std::move(videoFrame));
-  result.push_back(std::move(videoFramePts));
-  result.push_back(std::move(videoTimeBase));
-  result.push_back(std::move(videoFps));
-  result.push_back(std::move(videoDuration));
-  result.push_back(std::move(audioFrame));
-  result.push_back(std::move(audioFramePts));
-  result.push_back(std::move(audioTimeBase));
-  result.push_back(std::move(audioSampleRate));
-  result.push_back(std::move(audioDuration));
-
-  VLOG(1) << "Video decoding from " << logType << " [" << logMessage
-          << "] about to return";
-
-  return result;
-}
-
-torch::List<torch::Tensor> probeVideo(
-    bool isReadFile,
-    const torch::Tensor& input_video,
-    std::string videoPath) {
-  DecoderParameters params = getDecoderParams(
-      0, // videoStartUs
-      -1, // videoEndUs
-      0, // seekFrameMargin
-      1, // getPtsOnly
-      1, // readVideoStream
-      0, // width
-      0, // height
-      0, // minDimension
-      0, // maxDimension
-      1, // readAudioStream
-      0, // audioSamples
-      0 // audioChannels
-  );
-
-  SyncDecoder decoder;
-  DecoderInCallback callback = nullptr;
-  std::string logMessage, logType;
-  if (isReadFile) {
-    params.uri = videoPath;
-    logType = "file";
-    logMessage = videoPath;
-  } else {
-    callback = MemoryBuffer::getCallback(
-        input_video.data_ptr<uint8_t>(), input_video.size(0));
-    logType = "memory";
-    logMessage = std::to_string(input_video.size(0));
-  }
-
-  VLOG(1) << "Video probing from " << logType << " [" << logMessage
-          << "] has started";
-
-  const auto now = std::chrono::system_clock::now();
-
-  bool succeeded;
-  bool gotAudio = false, gotVideo = false;
-  DecoderMetadata audioMetadata, videoMetadata;
-  std::vector<DecoderMetadata> metadata;
-  if ((succeeded = decoder.init(params, std::move(callback), &metadata))) {
-    for (const auto& header : metadata) {
-      if (header.format.type == TYPE_VIDEO) {
-        gotVideo = true;
-        videoMetadata = header;
-      } else if (header.format.type == TYPE_AUDIO) {
-        gotAudio = true;
-        audioMetadata = header;
-      }
-    }
-    const auto then = std::chrono::system_clock::now();
-    VLOG(1) << "Video probing from " << logType << " [" << logMessage
-            << "] has finished, "
-            << std::chrono::duration_cast<std::chrono::microseconds>(then - now)
-                   .count()
-            << " us";
-  } else {
-    LOG(ERROR) << "Decoder initialization has failed";
-  }
-
-  decoder.shutdown();
-
-  // video section
-  torch::Tensor videoTimeBase = torch::zeros({0}, torch::kInt);
-  torch::Tensor videoFps = torch::zeros({0}, torch::kFloat);
-  torch::Tensor videoDuration = torch::zeros({0}, torch::kLong);
-
-  if (succeeded && gotVideo) {
-    videoTimeBase = torch::zeros({2}, torch::kInt);
-    int* videoTimeBaseData = videoTimeBase.data_ptr<int>();
-    const auto& header = videoMetadata;
-
-    videoTimeBaseData[0] = header.num;
-    videoTimeBaseData[1] = header.den;
-
-    videoFps = torch::zeros({1}, torch::kFloat);
-    float* videoFpsData = videoFps.data_ptr<float>();
-    videoFpsData[0] = header.fps;
-
-    videoDuration = torch::zeros({1}, torch::kLong);
-    int64_t* videoDurationData = videoDuration.data_ptr<int64_t>();
-    AVRational avr = AVRational{(int)header.num, (int)header.den};
-    videoDurationData[0] = av_rescale_q(header.duration, timeBaseQ, avr);
-
-    VLOG(2) << "Prob fps: " << header.fps << ", duration: " << header.duration
-            << ", num: " << header.num << ", den: " << header.den;
-
-    VLOG(1) << "Video probing from " << logType << " [" << logMessage
-            << "] filled video tensors";
-  } else {
-    LOG(ERROR) << "Miss video stream";
-  }
-
-  // audio section
-  torch::Tensor audioTimeBase = torch::zeros({0}, torch::kInt);
-  torch::Tensor audioSampleRate = torch::zeros({0}, torch::kInt);
-  torch::Tensor audioDuration = torch::zeros({0}, torch::kLong);
-
-  if (succeeded && gotAudio) {
-    audioTimeBase = torch::zeros({2}, torch::kInt);
-    int* audioTimeBaseData = audioTimeBase.data_ptr<int>();
-    const auto& header = audioMetadata;
-    const auto& media = header.format;
-    const auto& format = media.format.audio;
-
-    audioTimeBaseData[0] = header.num;
-    audioTimeBaseData[1] = header.den;
-
-    audioSampleRate = torch::zeros({1}, torch::kInt);
-    int* audioSampleRateData = audioSampleRate.data_ptr<int>();
-    audioSampleRateData[0] = format.samples;
-
-    audioDuration = torch::zeros({1}, torch::kLong);
-    int64_t* audioDurationData = audioDuration.data_ptr<int64_t>();
-    AVRational avr = AVRational{(int)header.num, (int)header.den};
-    audioDurationData[0] = av_rescale_q(header.duration, timeBaseQ, avr);
-
-    VLOG(2) << "Prob sample rate: " << format.samples
-            << ", duration: " << header.duration << ", num: " << header.num
-            << ", den: " << header.den;
-
-    VLOG(1) << "Video probing from " << logType << " [" << logMessage
-            << "] filled audio tensors";
-  } else {
-    VLOG(1) << "Miss audio stream";
-  }
-
-  torch::List<torch::Tensor> result;
-  result.push_back(std::move(videoTimeBase));
-  result.push_back(std::move(videoFps));
-  result.push_back(std::move(videoDuration));
-  result.push_back(std::move(audioTimeBase));
-  result.push_back(std::move(audioSampleRate));
-  result.push_back(std::move(audioDuration));
-
-  VLOG(1) << "Video probing from " << logType << " [" << logMessage
-          << "] is about to return";
-
-  return result;
-}
-
-} // namespace
-
-torch::List<torch::Tensor> read_video_from_memory(
-    torch::Tensor input_video,
-    double seekFrameMargin,
-    int64_t getPtsOnly,
-    int64_t readVideoStream,
-    int64_t width,
-    int64_t height,
-    int64_t minDimension,
-    int64_t maxDimension,
-    int64_t videoStartPts,
-    int64_t videoEndPts,
-    int64_t videoTimeBaseNum,
-    int64_t videoTimeBaseDen,
-    int64_t readAudioStream,
-    int64_t audioSamples,
-    int64_t audioChannels,
-    int64_t audioStartPts,
-    int64_t audioEndPts,
-    int64_t audioTimeBaseNum,
-    int64_t audioTimeBaseDen) {
-  C10_LOG_API_USAGE_ONCE(
-      "torchvision.csrc.io.video_reader.video_reader.read_video_from_memory");
-  return readVideo(
-      false,
-      input_video,
-      "", // videoPath
-      seekFrameMargin,
-      getPtsOnly,
-      readVideoStream,
-      width,
-      height,
-      minDimension,
-      maxDimension,
-      videoStartPts,
-      videoEndPts,
-      videoTimeBaseNum,
-      videoTimeBaseDen,
-      readAudioStream,
-      audioSamples,
-      audioChannels,
-      audioStartPts,
-      audioEndPts,
-      audioTimeBaseNum,
-      audioTimeBaseDen);
-}
-
-torch::List<torch::Tensor> read_video_from_file(
-    std::string videoPath,
-    double seekFrameMargin,
-    int64_t getPtsOnly,
-    int64_t readVideoStream,
-    int64_t width,
-    int64_t height,
-    int64_t minDimension,
-    int64_t maxDimension,
-    int64_t videoStartPts,
-    int64_t videoEndPts,
-    int64_t videoTimeBaseNum,
-    int64_t videoTimeBaseDen,
-    int64_t readAudioStream,
-    int64_t audioSamples,
-    int64_t audioChannels,
-    int64_t audioStartPts,
-    int64_t audioEndPts,
-    int64_t audioTimeBaseNum,
-    int64_t audioTimeBaseDen) {
-  C10_LOG_API_USAGE_ONCE(
-      "torchvision.csrc.io.video_reader.video_reader.read_video_from_file");
-  torch::Tensor dummy_input_video = torch::ones({0});
-  return readVideo(
-      true,
-      dummy_input_video,
-      videoPath,
-      seekFrameMargin,
-      getPtsOnly,
-      readVideoStream,
-      width,
-      height,
-      minDimension,
-      maxDimension,
-      videoStartPts,
-      videoEndPts,
-      videoTimeBaseNum,
-      videoTimeBaseDen,
-      readAudioStream,
-      audioSamples,
-      audioChannels,
-      audioStartPts,
-      audioEndPts,
-      audioTimeBaseNum,
-      audioTimeBaseDen);
-}
-
-torch::List<torch::Tensor> probe_video_from_memory(torch::Tensor input_video) {
-  C10_LOG_API_USAGE_ONCE(
-      "torchvision.csrc.io.video_reader.video_reader.probe_video_from_memory");
-  return probeVideo(false, input_video, "");
-}
-
-torch::List<torch::Tensor> probe_video_from_file(std::string videoPath) {
-  C10_LOG_API_USAGE_ONCE(
-      "torchvision.csrc.io.video_reader.video_reader.probe_video_from_file");
-  torch::Tensor dummy_input_video = torch::ones({0});
-  return probeVideo(true, dummy_input_video, videoPath);
-}
-
-TORCH_LIBRARY_FRAGMENT(video_reader, m) {
-  m.def("read_video_from_memory", read_video_from_memory);
-  m.def("read_video_from_file", read_video_from_file);
-  m.def("probe_video_from_memory", probe_video_from_memory);
-  m.def("probe_video_from_file", probe_video_from_file);
-}
-
-} // namespace video_reader
-} // namespace vision
diff --git a/torchvision/csrc/io/video_reader/video_reader.h b/torchvision/csrc/io/video_reader/video_reader.h
deleted file mode 100644
index 48c4c841219..00000000000
--- a/torchvision/csrc/io/video_reader/video_reader.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#pragma once
-
-#include <torch/types.h>
-
-namespace vision {
-namespace video_reader {
-
-torch::List<torch::Tensor> read_video_from_memory(
-    torch::Tensor input_video,
-    double seekFrameMargin,
-    int64_t getPtsOnly,
-    int64_t readVideoStream,
-    int64_t width,
-    int64_t height,
-    int64_t minDimension,
-    int64_t maxDimension,
-    int64_t videoStartPts,
-    int64_t videoEndPts,
-    int64_t videoTimeBaseNum,
-    int64_t videoTimeBaseDen,
-    int64_t readAudioStream,
-    int64_t audioSamples,
-    int64_t audioChannels,
-    int64_t audioStartPts,
-    int64_t audioEndPts,
-    int64_t audioTimeBaseNum,
-    int64_t audioTimeBaseDen);
-
-torch::List<torch::Tensor> read_video_from_file(
-    std::string videoPath,
-    double seekFrameMargin,
-    int64_t getPtsOnly,
-    int64_t readVideoStream,
-    int64_t width,
-    int64_t height,
-    int64_t minDimension,
-    int64_t maxDimension,
-    int64_t videoStartPts,
-    int64_t videoEndPts,
-    int64_t videoTimeBaseNum,
-    int64_t videoTimeBaseDen,
-    int64_t readAudioStream,
-    int64_t audioSamples,
-    int64_t audioChannels,
-    int64_t audioStartPts,
-    int64_t audioEndPts,
-    int64_t audioTimeBaseNum,
-    int64_t audioTimeBaseDen);
-
-torch::List<torch::Tensor> probe_video_from_memory(torch::Tensor input_video);
-
-torch::List<torch::Tensor> probe_video_from_file(std::string videoPath);
-
-} // namespace video_reader
-} // namespace vision
diff --git a/torchvision/io/__init__.py b/torchvision/io/__init__.py
index 6627ac975f3..3c5c13482f5 100644
--- a/torchvision/io/__init__.py
+++ b/torchvision/io/__init__.py
@@ -1,15 +1,114 @@
-from ._video_opt import (
-    _HAS_CPU_VIDEO_DECODER,
-    _HAS_VIDEO_OPT,
-    _probe_video_from_file,
-    _probe_video_from_memory,
-    _read_video_from_file,
-    _read_video_from_memory,
-    _read_video_timestamps_from_file,
-    _read_video_timestamps_from_memory,
-    Timebase,
-    VideoMetaData,
-)
+# In fbcode, import from the fb-only location
+# For OSS, these imports would fail (video_reader not available)
+try:
+    from pytorch.vision.fb.io import (  # type: ignore[import-not-found]
+        _HAS_CPU_VIDEO_DECODER,
+        _HAS_VIDEO_OPT,
+        _probe_video_from_file,
+        _probe_video_from_memory,
+        _read_video_from_file,
+        _read_video_from_memory,
+        _read_video_timestamps_from_file,
+        _read_video_timestamps_from_memory,
+        _video_opt,
+        Timebase,
+        VideoMetaData,
+        VideoReader,
+    )
+except ImportError:
+    # OSS fallback - video_reader backend not available
+    _HAS_CPU_VIDEO_DECODER = False
+    _HAS_VIDEO_OPT = False
+
+    def _stub_not_available(*args, **kwargs):
+        """Stand-in for every video_reader entry point; always raises RuntimeError."""
+        raise RuntimeError(
+            "video_reader backend is not available in open-source torchvision. Use PyAV or TorchCodec instead."
+        )
+
+    _probe_video_from_file = _stub_not_available
+    _probe_video_from_memory = _stub_not_available
+    _read_video_from_file = _stub_not_available
+    _read_video_from_memory = _stub_not_available
+    _read_video_timestamps_from_file = _stub_not_available
+    _read_video_timestamps_from_memory = _stub_not_available
+
+    # Plain-data classes: they keep the field layout of the video_reader
+    # backend's Timebase / VideoMetaData so that code constructing them or
+    # reading their attributes keeps working when the backend is absent.
+    class Timebase:  # type: ignore[no-redef]
+        __annotations__ = {"numerator": int, "denominator": int}
+        __slots__ = ["numerator", "denominator"]
+
+        def __init__(self, numerator: int = 0, denominator: int = 1) -> None:
+            self.numerator = numerator
+            self.denominator = denominator
+
+    class VideoMetaData:  # type: ignore[no-redef]
+        __annotations__ = {
+            "has_video": bool,
+            "video_timebase": Timebase,
+            "video_duration": float,
+            "video_fps": float,
+            "has_audio": bool,
+            "audio_timebase": Timebase,
+            "audio_duration": float,
+            "audio_sample_rate": float,
+        }
+        __slots__ = [
+            "has_video",
+            "video_timebase",
+            "video_duration",
+            "video_fps",
+            "has_audio",
+            "audio_timebase",
+            "audio_duration",
+            "audio_sample_rate",
+        ]
+
+        def __init__(self) -> None:
+            self.has_video = False
+            self.video_timebase = Timebase(0, 1)
+            self.video_duration = 0.0
+            self.video_fps = 0.0
+            self.has_audio = False
+            self.audio_timebase = Timebase(0, 1)
+            self.audio_duration = 0.0
+            self.audio_sample_rate = 0.0
+
+    # Placeholder for the fb-only VideoReader: instantiation always raises RuntimeError.
+    class VideoReader:  # type: ignore[no-redef]
+        def __init__(self, *args, **kwargs):
+            raise RuntimeError(
+                "VideoReader with video_reader backend is not available. "
+                "Use backend='pyav' or migrate to TorchCodec."
+            )
+
+        def __iter__(self):
+            return self
+
+        def __next__(self):
+            raise StopIteration
+
+    # Stub module for _video_opt to prevent circular import issues
+    # This module is imported by video.py
+    import types
+    from fractions import Fraction
+
+    _video_opt = types.ModuleType("_video_opt")
+    _video_opt._HAS_CPU_VIDEO_DECODER = False
+    _video_opt._HAS_VIDEO_OPT = False
+    _video_opt.default_timebase = Fraction(0, 1)
+
+    def _read_video_stub(filename, start_pts, end_pts, pts_unit):
+        raise RuntimeError("video_reader backend is not available. Use backend='pyav'.")
+
+    def _read_video_timestamps_stub(filename, pts_unit):
+        raise RuntimeError("video_reader backend is not available. Use backend='pyav'.")
+
+    _video_opt._read_video = _read_video_stub
+    _video_opt._read_video_timestamps = _read_video_timestamps_stub
+
 from .image import (
     decode_avif,
     decode_gif,
@@ -28,7 +127,6 @@
     write_png,
 )
 from .video import read_video, read_video_timestamps, write_video
-from .video_reader import VideoReader
 
 
 __all__ = [
diff --git a/torchvision/io/_video_opt.py b/torchvision/io/_video_opt.py
deleted file mode 100644
index 8be3a3c94b9..00000000000
--- a/torchvision/io/_video_opt.py
+++ /dev/null
@@ -1,516 +0,0 @@
-import math
-import warnings
-from fractions import Fraction
-from typing import Optional, Union
-
-import torch
-
-from ..extension import _load_library
-from ._video_deprecation_warning import _raise_video_deprecation_warning
-
-
-_HAS_CPU_VIDEO_DECODER = _load_library("video_reader")
-_HAS_VIDEO_OPT = _HAS_CPU_VIDEO_DECODER  # For BC
-default_timebase = Fraction(0, 1)
-
-
-# simple class for torch scripting
-# the complex Fraction class from fractions module is not scriptable
-class Timebase:
-    __annotations__ = {"numerator": int, "denominator": int}
-    __slots__ = ["numerator", "denominator"]
-
-    def __init__(
-        self,
-        numerator: int,
-        denominator: int,
-    ) -> None:
-        self.numerator = numerator
-        self.denominator = denominator
-
-
-class VideoMetaData:
-    __annotations__ = {
-        "has_video": bool,
-        "video_timebase": Timebase,
-        "video_duration": float,
-        "video_fps": float,
-        "has_audio": bool,
-        "audio_timebase": Timebase,
-        "audio_duration": float,
-        "audio_sample_rate": float,
-    }
-    __slots__ = [
-        "has_video",
-        "video_timebase",
-        "video_duration",
-        "video_fps",
-        "has_audio",
-        "audio_timebase",
-        "audio_duration",
-        "audio_sample_rate",
-    ]
-
-    def __init__(self) -> None:
-        self.has_video = False
-        self.video_timebase = Timebase(0, 1)
-        self.video_duration = 0.0
-        self.video_fps = 0.0
-        self.has_audio = False
-        self.audio_timebase = Timebase(0, 1)
-        self.audio_duration = 0.0
-        self.audio_sample_rate = 0.0
-
-
-def _validate_pts(pts_range: tuple[int, int]) -> None:
-
-    if pts_range[0] > pts_range[1] > 0:
-        raise ValueError(
-            f"Start pts should not be smaller than end pts, got start pts: {pts_range[0]} and end pts: {pts_range[1]}"
-        )
-
-
-def _fill_info(
-    vtimebase: torch.Tensor,
-    vfps: torch.Tensor,
-    vduration: torch.Tensor,
-    atimebase: torch.Tensor,
-    asample_rate: torch.Tensor,
-    aduration: torch.Tensor,
-) -> VideoMetaData:
-    """
-    Build update VideoMetaData struct with info about the video
-    """
-    meta = VideoMetaData()
-    if vtimebase.numel() > 0:
-        meta.video_timebase = Timebase(int(vtimebase[0].item()), int(vtimebase[1].item()))
-        timebase = vtimebase[0].item() / float(vtimebase[1].item())
-        if vduration.numel() > 0:
-            meta.has_video = True
-            meta.video_duration = float(vduration.item()) * timebase
-    if vfps.numel() > 0:
-        meta.video_fps = float(vfps.item())
-    if atimebase.numel() > 0:
-        meta.audio_timebase = Timebase(int(atimebase[0].item()), int(atimebase[1].item()))
-        timebase = atimebase[0].item() / float(atimebase[1].item())
-        if aduration.numel() > 0:
-            meta.has_audio = True
-            meta.audio_duration = float(aduration.item()) * timebase
-    if asample_rate.numel() > 0:
-        meta.audio_sample_rate = float(asample_rate.item())
-
-    return meta
-
-
-def _align_audio_frames(
-    aframes: torch.Tensor, aframe_pts: torch.Tensor, audio_pts_range: tuple[int, int]
-) -> torch.Tensor:
-    start, end = aframe_pts[0], aframe_pts[-1]
-    num_samples = aframes.size(0)
-    step_per_aframe = float(end - start + 1) / float(num_samples)
-    s_idx = 0
-    e_idx = num_samples
-    if start < audio_pts_range[0]:
-        s_idx = int((audio_pts_range[0] - start) / step_per_aframe)
-    if audio_pts_range[1] != -1 and end > audio_pts_range[1]:
-        e_idx = int((audio_pts_range[1] - end) / step_per_aframe)
-    return aframes[s_idx:e_idx, :]
-
-
-def _read_video_from_file(
-    filename: str,
-    seek_frame_margin: float = 0.25,
-    read_video_stream: bool = True,
-    video_width: int = 0,
-    video_height: int = 0,
-    video_min_dimension: int = 0,
-    video_max_dimension: int = 0,
-    video_pts_range: tuple[int, int] = (0, -1),
-    video_timebase: Fraction = default_timebase,
-    read_audio_stream: bool = True,
-    audio_samples: int = 0,
-    audio_channels: int = 0,
-    audio_pts_range: tuple[int, int] = (0, -1),
-    audio_timebase: Fraction = default_timebase,
-) -> tuple[torch.Tensor, torch.Tensor, VideoMetaData]:
-    """
-    Reads a video from a file, returning both the video frames and the audio frames
-
-    Args:
-    filename (str): path to the video file
-    seek_frame_margin (double, optional): seeking frame in the stream is imprecise. Thus,
-        when video_start_pts is specified, we seek the pts earlier by seek_frame_margin seconds
-    read_video_stream (int, optional): whether read video stream. If yes, set to 1. Otherwise, 0
-    video_width/video_height/video_min_dimension/video_max_dimension (int): together decide
-        the size of decoded frames:
-
-            - When video_width = 0, video_height = 0, video_min_dimension = 0,
-                and video_max_dimension = 0, keep the original frame resolution
-            - When video_width = 0, video_height = 0, video_min_dimension != 0,
-                and video_max_dimension = 0, keep the aspect ratio and resize the
-                frame so that shorter edge size is video_min_dimension
-            - When video_width = 0, video_height = 0, video_min_dimension = 0,
-                and video_max_dimension != 0, keep the aspect ratio and resize
-                the frame so that longer edge size is video_max_dimension
-            - When video_width = 0, video_height = 0, video_min_dimension != 0,
-                and video_max_dimension != 0, resize the frame so that shorter
-                edge size is video_min_dimension, and longer edge size is
-                video_max_dimension. The aspect ratio may not be preserved
-            - When video_width = 0, video_height != 0, video_min_dimension = 0,
-                and video_max_dimension = 0, keep the aspect ratio and resize
-                the frame so that frame video_height is $video_height
-            - When video_width != 0, video_height == 0, video_min_dimension = 0,
-                and video_max_dimension = 0, keep the aspect ratio and resize
-                the frame so that frame video_width is $video_width
-            - When video_width != 0, video_height != 0, video_min_dimension = 0,
-                and video_max_dimension = 0, resize the frame so that frame
-                video_width and  video_height are set to $video_width and
-                $video_height, respectively
-    video_pts_range (list(int), optional): the start and end presentation timestamp of video stream
-    video_timebase (Fraction, optional): a Fraction rational number which denotes timebase in video stream
-    read_audio_stream (int, optional): whether read audio stream. If yes, set to 1. Otherwise, 0
-    audio_samples (int, optional): audio sampling rate
-    audio_channels (int optional): audio channels
-    audio_pts_range (list(int), optional): the start and end presentation timestamp of audio stream
-    audio_timebase (Fraction, optional): a Fraction rational number which denotes time base in audio stream
-
-    Returns
-        vframes (Tensor[T, H, W, C]): the `T` video frames
-        aframes (Tensor[L, K]): the audio frames, where `L` is the number of points and
-            `K` is the number of audio_channels
-        info (Dict): metadata for the video and audio. Can contain the fields video_fps (float)
-            and audio_fps (int)
-    """
-    _raise_video_deprecation_warning()
-    _validate_pts(video_pts_range)
-    _validate_pts(audio_pts_range)
-
-    result = torch.ops.video_reader.read_video_from_file(
-        filename,
-        seek_frame_margin,
-        0,  # getPtsOnly
-        read_video_stream,
-        video_width,
-        video_height,
-        video_min_dimension,
-        video_max_dimension,
-        video_pts_range[0],
-        video_pts_range[1],
-        video_timebase.numerator,
-        video_timebase.denominator,
-        read_audio_stream,
-        audio_samples,
-        audio_channels,
-        audio_pts_range[0],
-        audio_pts_range[1],
-        audio_timebase.numerator,
-        audio_timebase.denominator,
-    )
-    vframes, _vframe_pts, vtimebase, vfps, vduration, aframes, aframe_pts, atimebase, asample_rate, aduration = result
-    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
-    if aframes.numel() > 0:
-        # when audio stream is found
-        aframes = _align_audio_frames(aframes, aframe_pts, audio_pts_range)
-    return vframes, aframes, info
-
-
-def _read_video_timestamps_from_file(filename: str) -> tuple[list[int], list[int], VideoMetaData]:
-    """
-    Decode all video- and audio frames in the video. Only pts
-    (presentation timestamp) is returned. The actual frame pixel data is not
-    copied. Thus, it is much faster than read_video(...)
-    """
-    result = torch.ops.video_reader.read_video_from_file(
-        filename,
-        0,  # seek_frame_margin
-        1,  # getPtsOnly
-        1,  # read_video_stream
-        0,  # video_width
-        0,  # video_height
-        0,  # video_min_dimension
-        0,  # video_max_dimension
-        0,  # video_start_pts
-        -1,  # video_end_pts
-        0,  # video_timebase_num
-        1,  # video_timebase_den
-        1,  # read_audio_stream
-        0,  # audio_samples
-        0,  # audio_channels
-        0,  # audio_start_pts
-        -1,  # audio_end_pts
-        0,  # audio_timebase_num
-        1,  # audio_timebase_den
-    )
-    _vframes, vframe_pts, vtimebase, vfps, vduration, _aframes, aframe_pts, atimebase, asample_rate, aduration = result
-    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
-
-    vframe_pts = vframe_pts.numpy().tolist()
-    aframe_pts = aframe_pts.numpy().tolist()
-    return vframe_pts, aframe_pts, info
-
-
-def _probe_video_from_file(filename: str) -> VideoMetaData:
-    """
-    Probe a video file and return VideoMetaData with info about the video
-    """
-    _raise_video_deprecation_warning()
-    result = torch.ops.video_reader.probe_video_from_file(filename)
-    vtimebase, vfps, vduration, atimebase, asample_rate, aduration = result
-    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
-    return info
-
-
-def _read_video_from_memory(
-    video_data: torch.Tensor,
-    seek_frame_margin: float = 0.25,
-    read_video_stream: int = 1,
-    video_width: int = 0,
-    video_height: int = 0,
-    video_min_dimension: int = 0,
-    video_max_dimension: int = 0,
-    video_pts_range: tuple[int, int] = (0, -1),
-    video_timebase_numerator: int = 0,
-    video_timebase_denominator: int = 1,
-    read_audio_stream: int = 1,
-    audio_samples: int = 0,
-    audio_channels: int = 0,
-    audio_pts_range: tuple[int, int] = (0, -1),
-    audio_timebase_numerator: int = 0,
-    audio_timebase_denominator: int = 1,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    """
-    Reads a video from memory, returning both the video frames as the audio frames
-    This function is torchscriptable.
-
-    Args:
-    video_data (data type could be 1) torch.Tensor, dtype=torch.int8 or 2) python bytes):
-        compressed video content stored in either 1) torch.Tensor 2) python bytes
-    seek_frame_margin (double, optional): seeking frame in the stream is imprecise.
-        Thus, when video_start_pts is specified, we seek the pts earlier by seek_frame_margin seconds
-    read_video_stream (int, optional): whether read video stream. If yes, set to 1. Otherwise, 0
-    video_width/video_height/video_min_dimension/video_max_dimension (int): together decide
-        the size of decoded frames:
-
-            - When video_width = 0, video_height = 0, video_min_dimension = 0,
-                and video_max_dimension = 0, keep the original frame resolution
-            - When video_width = 0, video_height = 0, video_min_dimension != 0,
-                and video_max_dimension = 0, keep the aspect ratio and resize the
-                frame so that shorter edge size is video_min_dimension
-            - When video_width = 0, video_height = 0, video_min_dimension = 0,
-                and video_max_dimension != 0, keep the aspect ratio and resize
-                the frame so that longer edge size is video_max_dimension
-            - When video_width = 0, video_height = 0, video_min_dimension != 0,
-                and video_max_dimension != 0, resize the frame so that shorter
-                edge size is video_min_dimension, and longer edge size is
-                video_max_dimension. The aspect ratio may not be preserved
-            - When video_width = 0, video_height != 0, video_min_dimension = 0,
-                and video_max_dimension = 0, keep the aspect ratio and resize
-                the frame so that frame video_height is $video_height
-            - When video_width != 0, video_height == 0, video_min_dimension = 0,
-                and video_max_dimension = 0, keep the aspect ratio and resize
-                the frame so that frame video_width is $video_width
-            - When video_width != 0, video_height != 0, video_min_dimension = 0,
-                and video_max_dimension = 0, resize the frame so that frame
-                video_width and  video_height are set to $video_width and
-                $video_height, respectively
-    video_pts_range (list(int), optional): the start and end presentation timestamp of video stream
-    video_timebase_numerator / video_timebase_denominator (float, optional): a rational
-        number which denotes timebase in video stream
-    read_audio_stream (int, optional): whether read audio stream. If yes, set to 1. Otherwise, 0
-    audio_samples (int, optional): audio sampling rate
-    audio_channels (int optional): audio audio_channels
-    audio_pts_range (list(int), optional): the start and end presentation timestamp of audio stream
-    audio_timebase_numerator / audio_timebase_denominator (float, optional):
-        a rational number which denotes time base in audio stream
-
-    Returns:
-        vframes (Tensor[T, H, W, C]): the `T` video frames
-        aframes (Tensor[L, K]): the audio frames, where `L` is the number of points and
-            `K` is the number of channels
-    """
-
-    _raise_video_deprecation_warning()
-    _validate_pts(video_pts_range)
-    _validate_pts(audio_pts_range)
-
-    if not isinstance(video_data, torch.Tensor):
-        with warnings.catch_warnings():
-            # Ignore the warning because we actually don't modify the buffer in this function
-            warnings.filterwarnings("ignore", message="The given buffer is not writable")
-            video_data = torch.frombuffer(video_data, dtype=torch.uint8)
-
-    result = torch.ops.video_reader.read_video_from_memory(
-        video_data,
-        seek_frame_margin,
-        0,  # getPtsOnly
-        read_video_stream,
-        video_width,
-        video_height,
-        video_min_dimension,
-        video_max_dimension,
-        video_pts_range[0],
-        video_pts_range[1],
-        video_timebase_numerator,
-        video_timebase_denominator,
-        read_audio_stream,
-        audio_samples,
-        audio_channels,
-        audio_pts_range[0],
-        audio_pts_range[1],
-        audio_timebase_numerator,
-        audio_timebase_denominator,
-    )
-
-    vframes, _vframe_pts, vtimebase, vfps, vduration, aframes, aframe_pts, atimebase, asample_rate, aduration = result
-
-    if aframes.numel() > 0:
-        # when audio stream is found
-        aframes = _align_audio_frames(aframes, aframe_pts, audio_pts_range)
-
-    return vframes, aframes
-
-
-def _read_video_timestamps_from_memory(
-    video_data: torch.Tensor,
-) -> tuple[list[int], list[int], VideoMetaData]:
-    """
-    Decode all frames in the video. Only pts (presentation timestamp) is returned.
-    The actual frame pixel data is not copied. Thus, read_video_timestamps(...)
-    is much faster than read_video(...)
-    """
-    if not isinstance(video_data, torch.Tensor):
-        with warnings.catch_warnings():
-            # Ignore the warning because we actually don't modify the buffer in this function
-            warnings.filterwarnings("ignore", message="The given buffer is not writable")
-            video_data = torch.frombuffer(video_data, dtype=torch.uint8)
-    result = torch.ops.video_reader.read_video_from_memory(
-        video_data,
-        0,  # seek_frame_margin
-        1,  # getPtsOnly
-        1,  # read_video_stream
-        0,  # video_width
-        0,  # video_height
-        0,  # video_min_dimension
-        0,  # video_max_dimension
-        0,  # video_start_pts
-        -1,  # video_end_pts
-        0,  # video_timebase_num
-        1,  # video_timebase_den
-        1,  # read_audio_stream
-        0,  # audio_samples
-        0,  # audio_channels
-        0,  # audio_start_pts
-        -1,  # audio_end_pts
-        0,  # audio_timebase_num
-        1,  # audio_timebase_den
-    )
-    _raise_video_deprecation_warning()
-    _vframes, vframe_pts, vtimebase, vfps, vduration, _aframes, aframe_pts, atimebase, asample_rate, aduration = result
-    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
-
-    vframe_pts = vframe_pts.numpy().tolist()
-    aframe_pts = aframe_pts.numpy().tolist()
-    return vframe_pts, aframe_pts, info
-
-
-def _probe_video_from_memory(
-    video_data: torch.Tensor,
-) -> VideoMetaData:
-    """
-    Probe a video in memory and return VideoMetaData with info about the video
-    This function is torchscriptable
-    """
-    _raise_video_deprecation_warning()
-    if not isinstance(video_data, torch.Tensor):
-        with warnings.catch_warnings():
-            # Ignore the warning because we actually don't modify the buffer in this function
-            warnings.filterwarnings("ignore", message="The given buffer is not writable")
-            video_data = torch.frombuffer(video_data, dtype=torch.uint8)
-    result = torch.ops.video_reader.probe_video_from_memory(video_data)
-    vtimebase, vfps, vduration, atimebase, asample_rate, aduration = result
-    info = _fill_info(vtimebase, vfps, vduration, atimebase, asample_rate, aduration)
-    return info
-
-
-def _read_video(
-    filename: str,
-    start_pts: Union[float, Fraction] = 0,
-    end_pts: Optional[Union[float, Fraction]] = None,
-    pts_unit: str = "pts",
-) -> tuple[torch.Tensor, torch.Tensor, dict[str, float]]:
-    _raise_video_deprecation_warning()
-    if end_pts is None:
-        end_pts = float("inf")
-
-    if pts_unit == "pts":
-        warnings.warn(
-            "The pts_unit 'pts' gives wrong results and will be removed in a "
-            + "follow-up version. Please use pts_unit 'sec'."
-        )
-
-    info = _probe_video_from_file(filename)
-
-    has_video = info.has_video
-    has_audio = info.has_audio
-
-    def get_pts(time_base):
-        start_offset = start_pts
-        end_offset = end_pts
-        if pts_unit == "sec":
-            start_offset = int(math.floor(start_pts * (1 / time_base)))
-            if end_offset != float("inf"):
-                end_offset = int(math.ceil(end_pts * (1 / time_base)))
-        if end_offset == float("inf"):
-            end_offset = -1
-        return start_offset, end_offset
-
-    video_pts_range = (0, -1)
-    video_timebase = default_timebase
-    if has_video:
-        video_timebase = Fraction(info.video_timebase.numerator, info.video_timebase.denominator)
-        video_pts_range = get_pts(video_timebase)
-
-    audio_pts_range = (0, -1)
-    audio_timebase = default_timebase
-    if has_audio:
-        audio_timebase = Fraction(info.audio_timebase.numerator, info.audio_timebase.denominator)
-        audio_pts_range = get_pts(audio_timebase)
-
-    vframes, aframes, info = _read_video_from_file(
-        filename,
-        read_video_stream=True,
-        video_pts_range=video_pts_range,
-        video_timebase=video_timebase,
-        read_audio_stream=True,
-        audio_pts_range=audio_pts_range,
-        audio_timebase=audio_timebase,
-    )
-    _info = {}
-    if has_video:
-        _info["video_fps"] = info.video_fps
-    if has_audio:
-        _info["audio_fps"] = info.audio_sample_rate
-
-    return vframes, aframes, _info
-
-
-def _read_video_timestamps(
-    filename: str, pts_unit: str = "pts"
-) -> tuple[Union[list[int], list[Fraction]], Optional[float]]:
-    _raise_video_deprecation_warning()
-    if pts_unit == "pts":
-        warnings.warn(
-            "The pts_unit 'pts' gives wrong results and will be removed in a "
-            + "follow-up version. Please use pts_unit 'sec'."
-        )
-
-    pts: Union[list[int], list[Fraction]]
-    pts, _, info = _read_video_timestamps_from_file(filename)
-
-    if pts_unit == "sec":
-        video_time_base = Fraction(info.video_timebase.numerator, info.video_timebase.denominator)
-        pts = [x * video_time_base for x in pts]
-
-    video_fps = info.video_fps if info.has_video else None
-
-    return pts, video_fps
diff --git a/torchvision/io/video_reader.py b/torchvision/io/video_reader.py
deleted file mode 100644
index 253c76376f7..00000000000
--- a/torchvision/io/video_reader.py
+++ /dev/null
@@ -1,279 +0,0 @@
-import io
-import warnings
-from collections.abc import Iterator
-
-from typing import Any
-
-import torch
-
-from ..utils import _log_api_usage_once
-from ._video_deprecation_warning import _raise_video_deprecation_warning
-
-from ._video_opt import _HAS_CPU_VIDEO_DECODER
-
-if _HAS_CPU_VIDEO_DECODER:
-
-    def _has_video_opt() -> bool:
-        return True
-
-else:
-
-    def _has_video_opt() -> bool:
-        return False
-
-
-try:
-    import av
-
-    av.logging.set_level(av.logging.ERROR)
-    if not hasattr(av.video.frame.VideoFrame, "pict_type"):
-        av = ImportError(
-            """\
-Your version of PyAV is too old for the necessary video operations in torchvision.
-If you are on Python 3.5, you will have to build from source (the conda-forge
-packages are not up-to-date).  See
-https://github.com/mikeboers/PyAV#installation for instructions on how to
-install PyAV on your system.
-"""
-        )
-except ImportError:
-    av = ImportError(
-        """\
-PyAV is not installed, and is necessary for the video operations in torchvision.
-See https://github.com/mikeboers/PyAV#installation for instructions on how to
-install PyAV on your system.
-"""
-    )
-
-
-class VideoReader:
-    """[DEPRECATED] Fine-grained video-reading API.
-    Supports frame-by-frame reading of various streams from a single video
-    container. Much like previous video_reader API it supports the following
-    backends: video_reader and pyav.
-    Backends can be set via `torchvision.set_video_backend` function.
-
-    .. warning::
-
-        DEPRECATED: All the video decoding and encoding capabilities of torchvision
-        are deprecated from version 0.22 and will be removed in version 0.24.  We
-        recommend that you migrate to
-        `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
-        consolidate the future decoding/encoding capabilities of PyTorch
-
-    .. betastatus:: VideoReader class
-
-    Example:
-        The following examples creates a :mod:`VideoReader` object, seeks into 2s
-        point, and returns a single frame::
-
-            import torchvision
-            video_path = "path_to_a_test_video"
-            reader = torchvision.io.VideoReader(video_path, "video")
-            reader.seek(2.0)
-            frame = next(reader)
-
-        :mod:`VideoReader` implements the iterable API, which makes it suitable to
-        using it in conjunction with :mod:`itertools` for more advanced reading.
-        As such, we can use a :mod:`VideoReader` instance inside for loops::
-
-            reader.seek(2)
-            for frame in reader:
-                frames.append(frame['data'])
-            # additionally, `seek` implements a fluent API, so we can do
-            for frame in reader.seek(2):
-                frames.append(frame['data'])
-
-        With :mod:`itertools`, we can read all frames between 2 and 5 seconds with the
-        following code::
-
-            for frame in itertools.takewhile(lambda x: x['pts'] <= 5, reader.seek(2)):
-                frames.append(frame['data'])
-
-        and similarly, reading 10 frames after the 2s timestamp can be achieved
-        as follows::
-
-            for frame in itertools.islice(reader.seek(2), 10):
-                frames.append(frame['data'])
-
-    .. note::
-
-        Each stream descriptor consists of two parts: stream type (e.g. 'video') and
-        a unique stream id (which are determined by the video encoding).
-        In this way, if the video container contains multiple
-        streams of the same type, users can access the one they want.
-        If only stream type is passed, the decoder auto-detects first stream of that type.
-
-    Args:
-        src (string, bytes object, or tensor): The media source.
-            If string-type, it must be a file path supported by FFMPEG.
-            If bytes, should be an in-memory representation of a file supported by FFMPEG.
-            If Tensor, it is interpreted internally as byte buffer.
-            It must be one-dimensional, of type ``torch.uint8``.
-
-        stream (string, optional): descriptor of the required stream, followed by the stream id,
-            in the format ``{stream_type}:{stream_id}``. Defaults to ``"video:0"``.
-            Currently available options include ``['video', 'audio']``
-
-        num_threads (int, optional): number of threads used by the codec to decode video.
-            Default value (0) enables multithreading with codec-dependent heuristic. The performance
-            will depend on the version of FFMPEG codecs supported.
-    """
-
-    def __init__(
-        self,
-        src: str,
-        stream: str = "video",
-        num_threads: int = 0,
-    ) -> None:
-        _raise_video_deprecation_warning()
-        _log_api_usage_once(self)
-        from .. import get_video_backend
-
-        self.backend = get_video_backend()
-        if isinstance(src, str):
-            if not src:
-                raise ValueError("src cannot be empty")
-        elif isinstance(src, bytes):
-            if self.backend == "pyav":
-                src = io.BytesIO(src)
-            else:
-                with warnings.catch_warnings():
-                    # Ignore the warning because we actually don't modify the buffer in this function
-                    warnings.filterwarnings("ignore", message="The given buffer is not writable")
-                    src = torch.frombuffer(src, dtype=torch.uint8)
-        elif isinstance(src, torch.Tensor):
-            if self.backend == "pyav":
-                raise RuntimeError("VideoReader cannot be initialized from Tensor object when using pyav backend.")
-        else:
-            raise ValueError(f"src must be either string, Tensor or bytes object. Got {type(src)}")
-
-        if self.backend == "video_reader":
-            if isinstance(src, str):
-                self._c = torch.classes.torchvision.Video(src, stream, num_threads)
-            elif isinstance(src, torch.Tensor):
-                self._c = torch.classes.torchvision.Video("", "", 0)
-                self._c.init_from_memory(src, stream, num_threads)
-
-        elif self.backend == "pyav":
-            self.container = av.open(src, metadata_errors="ignore")
-            # TODO: load metadata
-            stream_type = stream.split(":")[0]
-            stream_id = 0 if len(stream.split(":")) == 1 else int(stream.split(":")[1])
-            self.pyav_stream = {stream_type: stream_id}
-            self._c = self.container.decode(**self.pyav_stream)
-
-            # TODO: add extradata exception
-
-        else:
-            raise RuntimeError(f"Unknown video backend: {self.backend}")
-
-    def __next__(self) -> dict[str, Any]:
-        """Decodes and returns the next frame of the current stream.
-        Frames are encoded as a dict with mandatory
-        data and pts fields, where data is a tensor, and pts is a
-        presentation timestamp of the frame expressed in seconds
-        as a float.
-
-        Returns:
-            (dict): a dictionary and containing decoded frame (``data``)
-            and corresponding timestamp (``pts``) in seconds
-
-        """
-        if self.backend == "video_reader":
-            frame, pts = self._c.next()
-        else:
-            try:
-                frame = next(self._c)
-                pts = float(frame.pts * frame.time_base)
-                if "video" in self.pyav_stream:
-                    frame = torch.as_tensor(frame.to_rgb().to_ndarray()).permute(2, 0, 1)
-                elif "audio" in self.pyav_stream:
-                    frame = torch.as_tensor(frame.to_ndarray()).permute(1, 0)
-                else:
-                    frame = None
-            except av.error.EOFError:
-                raise StopIteration
-
-        if frame.numel() == 0:
-            raise StopIteration
-
-        return {"data": frame, "pts": pts}
-
-    def __iter__(self) -> Iterator[dict[str, Any]]:
-        return self
-
-    def seek(self, time_s: float, keyframes_only: bool = False) -> "VideoReader":
-        """Seek within current stream.
-
-        Args:
-            time_s (float): seek time in seconds
-            keyframes_only (bool): allow to seek only to keyframes
-
-        .. note::
-            Current implementation is the so-called precise seek. This
-            means following seek, call to :mod:`next()` will return the
-            frame with the exact timestamp if it exists or
-            the first frame with timestamp larger than ``time_s``.
-        """
-        if self.backend == "video_reader":
-            self._c.seek(time_s, keyframes_only)
-        else:
-            # handle special case as pyav doesn't catch it
-            if time_s < 0:
-                time_s = 0
-            temp_str = self.container.streams.get(**self.pyav_stream)[0]
-            offset = int(round(time_s / temp_str.time_base))
-            if not keyframes_only:
-                warnings.warn("Accurate seek is not implemented for pyav backend")
-            self.container.seek(offset, backward=True, any_frame=False, stream=temp_str)
-            self._c = self.container.decode(**self.pyav_stream)
-        return self
-
-    def get_metadata(self) -> dict[str, Any]:
-        """Returns video metadata
-
-        Returns:
-            (dict): dictionary containing duration and frame rate for every stream
-        """
-        if self.backend == "pyav":
-            metadata = {}  # type:  Dict[str, Any]
-            for stream in self.container.streams:
-                if stream.type not in metadata:
-                    if stream.type == "video":
-                        rate_n = "fps"
-                    else:
-                        rate_n = "framerate"
-                    metadata[stream.type] = {rate_n: [], "duration": []}
-
-                rate = getattr(stream, "average_rate", None) or stream.sample_rate
-
-                metadata[stream.type]["duration"].append(float(stream.duration * stream.time_base))
-                metadata[stream.type][rate_n].append(float(rate))
-            return metadata
-        return self._c.get_metadata()
-
-    def set_current_stream(self, stream: str) -> bool:
-        """Set current stream.
-        Explicitly define the stream we are operating on.
-
-        Args:
-            stream (string): descriptor of the required stream. Defaults to ``"video:0"``
-                Currently available stream types include ``['video', 'audio']``.
-                Each descriptor consists of two parts: stream type (e.g. 'video') and
-                a unique stream id (which are determined by video encoding).
-                In this way, if the video container contains multiple
-                streams of the same type, users can access the one they want.
-                If only stream type is passed, the decoder auto-detects first stream
-                of that type and returns it.
-
-        Returns:
-            (bool): True on success, False otherwise
-        """
-        if self.backend == "pyav":
-            stream_type = stream.split(":")[0]
-            stream_id = 0 if len(stream.split(":")) == 1 else int(stream.split(":")[1])
-            self.pyav_stream = {stream_type: stream_id}
-            self._c = self.container.decode(**self.pyav_stream)
-            return True
-        return self._c.set_current_stream(stream)