From e79e0bbc8f9c567f4699380b16f139168516a75b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 30 Jan 2026 01:46:05 -0800 Subject: [PATCH] [torchvision] Move video_reader backend to fb/ for internal-only use (#9370) Summary: Pull Request resolved: https://github.com/pytorch/vision/pull/9370 Pull Request resolved: https://github.com/pytorch/vision/pull/9369 Move the video_reader backend (C++ decoder and Python API) from the open-source torchvision/ folder to the internal-only fb/ folder. This prepares for removal of the deprecated video_reader from open-source GitHub while maintaining backward compatibility for internal Meta users. The move preserves all existing BUCK target paths via aliases, so existing internal consumers continue to work without modification. Python imports from torchvision.io are redirected to the fb/ location with OSS fallback stubs. Differential Revision: D91702027 --- torchvision/csrc/io/decoder/audio_sampler.cpp | 254 ------ torchvision/csrc/io/decoder/audio_sampler.h | 39 - torchvision/csrc/io/decoder/audio_stream.cpp | 119 --- torchvision/csrc/io/decoder/audio_stream.h | 29 - torchvision/csrc/io/decoder/cc_stream.cpp | 24 - torchvision/csrc/io/decoder/cc_stream.h | 22 - torchvision/csrc/io/decoder/decoder.cpp | 764 ------------------ torchvision/csrc/io/decoder/decoder.h | 100 --- torchvision/csrc/io/decoder/defs.h | 415 ---------- torchvision/csrc/io/decoder/memory_buffer.cpp | 71 -- torchvision/csrc/io/decoder/memory_buffer.h | 25 - .../csrc/io/decoder/seekable_buffer.cpp | 139 ---- torchvision/csrc/io/decoder/seekable_buffer.h | 45 -- torchvision/csrc/io/decoder/stream.cpp | 288 ------- torchvision/csrc/io/decoder/stream.h | 80 -- .../csrc/io/decoder/subtitle_sampler.cpp | 46 -- .../csrc/io/decoder/subtitle_sampler.h | 32 - .../csrc/io/decoder/subtitle_stream.cpp | 96 --- torchvision/csrc/io/decoder/subtitle_stream.h | 38 - torchvision/csrc/io/decoder/sync_decoder.cpp | 97 --- torchvision/csrc/io/decoder/sync_decoder.h | 48 -- .../csrc/io/decoder/sync_decoder_test.cpp | 416 ---------- torchvision/csrc/io/decoder/time_keeper.cpp | 35 - torchvision/csrc/io/decoder/time_keeper.h | 25 - torchvision/csrc/io/decoder/util.cpp | 401 --------- torchvision/csrc/io/decoder/util.h | 28 - torchvision/csrc/io/decoder/util_test.cpp | 34 - torchvision/csrc/io/decoder/video_sampler.cpp | 337 -------- torchvision/csrc/io/decoder/video_sampler.h | 44 - torchvision/csrc/io/decoder/video_stream.cpp | 131 --- torchvision/csrc/io/decoder/video_stream.h | 31 - torchvision/csrc/io/video/video.cpp | 387 --------- torchvision/csrc/io/video/video.h | 75 -- .../csrc/io/video_reader/video_reader.cpp | 677 ---------------- .../csrc/io/video_reader/video_reader.h | 55 -- torchvision/io/__init__.py | 89 +- torchvision/io/_video_opt.py | 516 ------------ torchvision/io/video_reader.py | 279 ------- 38 files changed, 76 insertions(+), 6255 deletions(-) delete mode 100644 torchvision/csrc/io/decoder/audio_sampler.cpp delete mode 100644 torchvision/csrc/io/decoder/audio_sampler.h delete mode 100644 torchvision/csrc/io/decoder/audio_stream.cpp delete mode 100644 torchvision/csrc/io/decoder/audio_stream.h delete mode 100644 torchvision/csrc/io/decoder/cc_stream.cpp delete mode 100644 torchvision/csrc/io/decoder/cc_stream.h delete mode 100644 torchvision/csrc/io/decoder/decoder.cpp delete mode 100644 torchvision/csrc/io/decoder/decoder.h delete mode 100644 torchvision/csrc/io/decoder/defs.h delete mode 100644 torchvision/csrc/io/decoder/memory_buffer.cpp delete mode 100644 torchvision/csrc/io/decoder/memory_buffer.h delete mode 100644 torchvision/csrc/io/decoder/seekable_buffer.cpp delete mode 100644 torchvision/csrc/io/decoder/seekable_buffer.h delete mode 100644 torchvision/csrc/io/decoder/stream.cpp delete mode 100644 torchvision/csrc/io/decoder/stream.h delete mode 100644 torchvision/csrc/io/decoder/subtitle_sampler.cpp delete mode 100644 torchvision/csrc/io/decoder/subtitle_sampler.h delete mode 100644 torchvision/csrc/io/decoder/subtitle_stream.cpp delete mode 100644 torchvision/csrc/io/decoder/subtitle_stream.h delete mode 100644 torchvision/csrc/io/decoder/sync_decoder.cpp delete mode 100644 torchvision/csrc/io/decoder/sync_decoder.h delete mode 100644 torchvision/csrc/io/decoder/sync_decoder_test.cpp delete mode 100644 torchvision/csrc/io/decoder/time_keeper.cpp delete mode 100644 torchvision/csrc/io/decoder/time_keeper.h delete mode 100644 torchvision/csrc/io/decoder/util.cpp delete mode 100644 torchvision/csrc/io/decoder/util.h delete mode 100644 torchvision/csrc/io/decoder/util_test.cpp delete mode 100644 torchvision/csrc/io/decoder/video_sampler.cpp delete mode 100644 torchvision/csrc/io/decoder/video_sampler.h delete mode 100644 torchvision/csrc/io/decoder/video_stream.cpp delete mode 100644 torchvision/csrc/io/decoder/video_stream.h delete mode 100644 torchvision/csrc/io/video/video.cpp delete mode 100644 torchvision/csrc/io/video/video.h delete mode 100644 torchvision/csrc/io/video_reader/video_reader.cpp delete mode 100644 torchvision/csrc/io/video_reader/video_reader.h delete mode 100644 torchvision/io/_video_opt.py delete mode 100644 torchvision/io/video_reader.py diff --git a/torchvision/csrc/io/decoder/audio_sampler.cpp b/torchvision/csrc/io/decoder/audio_sampler.cpp deleted file mode 100644 index b158d3438b8..00000000000 --- a/torchvision/csrc/io/decoder/audio_sampler.cpp +++ /dev/null @@ -1,254 +0,0 @@ -#include "audio_sampler.h" -#include -#include "util.h" - -#define AVRESAMPLE_MAX_CHANNELS 32 - -// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24 -namespace ffmpeg { - -namespace { -int preparePlanes( - const AudioFormat& fmt, - const uint8_t* buffer, - int numSamples, - uint8_t** planes) { - int result; - if ((result = av_samples_fill_arrays( - planes, - nullptr, // linesize is not needed - buffer, - fmt.channels, - numSamples, - (AVSampleFormat)fmt.format, - 1)) < 0) { - LOG(ERROR) << "av_samples_fill_arrays failed, err: " - << Util::generateErrorDesc(result) - << ", numSamples: " << numSamples << ", fmt: " << fmt.format; - } - return result; -} -} // namespace - -AudioSampler::AudioSampler(void* logCtx) : logCtx_(logCtx) {} - -AudioSampler::~AudioSampler() { - cleanUp(); -} - -void AudioSampler::shutdown() { - cleanUp(); -} - -bool AudioSampler::init(const SamplerParameters& params) { - cleanUp(); - - if (params.type != MediaType::TYPE_AUDIO) { - LOG(ERROR) << "Invalid media type, expected MediaType::TYPE_AUDIO"; - return false; - } - -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) - AVChannelLayout channel_out; - AVChannelLayout channel_in; - av_channel_layout_default(&channel_out, params.out.audio.channels); - av_channel_layout_default(&channel_in, params.in.audio.channels); - int ret = swr_alloc_set_opts2( - &swrContext_, - &channel_out, - (AVSampleFormat)params.out.audio.format, - params.out.audio.samples, - &channel_in, - (AVSampleFormat)params.in.audio.format, - params.in.audio.samples, - 0, - logCtx_); - if (ret < 0 || swrContext_ == nullptr) { - LOG(ERROR) << "Cannot allocate SwrContext"; - return false; - } -#else - swrContext_ = swr_alloc_set_opts( - nullptr, - av_get_default_channel_layout(params.out.audio.channels), - (AVSampleFormat)params.out.audio.format, - params.out.audio.samples, - av_get_default_channel_layout(params.in.audio.channels), - (AVSampleFormat)params.in.audio.format, - params.in.audio.samples, - 0, - logCtx_); - if (swrContext_ == nullptr) { - LOG(ERROR) << "Cannot allocate SwrContext"; - return false; - } -#endif - - int result; - if ((result = swr_init(swrContext_)) < 0) { - LOG(ERROR) << "swr_init failed, err: " << Util::generateErrorDesc(result) - << ", in -> format: " << params.in.audio.format - << ", channels: " << params.in.audio.channels - << ", samples: " << params.in.audio.samples - << ", out -> format: " << params.out.audio.format - << ", channels: " << params.out.audio.channels - << ", samples: " << params.out.audio.samples; - return false; - } - - // set formats - params_ = params; - return true; -} - -int AudioSampler::numOutputSamples(int inSamples) const { - return swr_get_out_samples(swrContext_, inSamples); -} - -int AudioSampler::sample( - const uint8_t* inPlanes[], - int inNumSamples, - ByteStorage* out, - int outNumSamples) { - int result; - int outBufferBytes = av_samples_get_buffer_size( - nullptr, - params_.out.audio.channels, - outNumSamples, - (AVSampleFormat)params_.out.audio.format, - 1); - - if (out) { - out->ensure(outBufferBytes); - - uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; - - if ((result = preparePlanes( - params_.out.audio, - out->writableTail(), - outNumSamples, - outPlanes)) < 0) { - return result; - } - - if ((result = swr_convert( - swrContext_, - &outPlanes[0], - outNumSamples, - inPlanes, - inNumSamples)) < 0) { - LOG(ERROR) << "swr_convert failed, err: " - << Util::generateErrorDesc(result); - return result; - } - - TORCH_CHECK_LE(result, outNumSamples); - - if (result) { - if ((result = av_samples_get_buffer_size( - nullptr, - params_.out.audio.channels, - result, - (AVSampleFormat)params_.out.audio.format, - 1)) >= 0) { - out->append(result); - } else { - LOG(ERROR) << "av_samples_get_buffer_size failed, err: " - << Util::generateErrorDesc(result); - } - } - } else { - // allocate a temporary buffer - auto* tmpBuffer = static_cast(av_malloc(outBufferBytes)); - if (!tmpBuffer) { - LOG(ERROR) << "av_alloc failed, for size: " << outBufferBytes; - return -1; - } - - uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; - - if ((result = preparePlanes( - params_.out.audio, tmpBuffer, outNumSamples, outPlanes)) < 0) { - av_free(tmpBuffer); - return result; - } - - if ((result = swr_convert( - swrContext_, - &outPlanes[0], - outNumSamples, - inPlanes, - inNumSamples)) < 0) { - LOG(ERROR) << "swr_convert failed, err: " - << Util::generateErrorDesc(result); - av_free(tmpBuffer); - return result; - } - - av_free(tmpBuffer); - - TORCH_CHECK_LE(result, outNumSamples); - - if (result) { - result = av_samples_get_buffer_size( - nullptr, - params_.out.audio.channels, - result, - (AVSampleFormat)params_.out.audio.format, - 1); - } - } - - return result; -} - -int AudioSampler::sample(AVFrame* frame, ByteStorage* out) { - const auto outNumSamples = numOutputSamples(frame ? frame->nb_samples : 0); - - if (!outNumSamples) { - return 0; - } - - return sample( - frame ? (const uint8_t**)&frame->data[0] : nullptr, - frame ? frame->nb_samples : 0, - out, - outNumSamples); -} - -int AudioSampler::sample(const ByteStorage* in, ByteStorage* out) { - const auto inSampleSize = - av_get_bytes_per_sample((AVSampleFormat)params_.in.audio.format); - - const auto inNumSamples = - !in ? 0 : in->length() / inSampleSize / params_.in.audio.channels; - - const auto outNumSamples = numOutputSamples(inNumSamples); - - if (!outNumSamples) { - return 0; - } - - uint8_t* inPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; - int result; - if (in && - (result = preparePlanes( - params_.in.audio, in->data(), inNumSamples, inPlanes)) < 0) { - return result; - } - - return sample( - in ? (const uint8_t**)inPlanes : nullptr, - inNumSamples, - out, - outNumSamples); -} - -void AudioSampler::cleanUp() { - if (swrContext_) { - swr_free(&swrContext_); - swrContext_ = nullptr; - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/audio_sampler.h b/torchvision/csrc/io/decoder/audio_sampler.h deleted file mode 100644 index e105bbe4de2..00000000000 --- a/torchvision/csrc/io/decoder/audio_sampler.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class transcode audio frames from one format into another - */ - -class AudioSampler : public MediaSampler { - public: - explicit AudioSampler(void* logCtx); - ~AudioSampler() override; - - // MediaSampler overrides - bool init(const SamplerParameters& params) override; - int sample(const ByteStorage* in, ByteStorage* out) override; - void shutdown() override; - - int sample(AVFrame* frame, ByteStorage* out); - - private: - // close resources - void cleanUp(); - // helper functions for rescaling, cropping, etc. - int numOutputSamples(int inSamples) const; - int sample( - const uint8_t* inPlanes[], - int inNumSamples, - ByteStorage* out, - int outNumSamples); - - private: - SwrContext* swrContext_{nullptr}; - void* logCtx_{nullptr}; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/audio_stream.cpp b/torchvision/csrc/io/decoder/audio_stream.cpp deleted file mode 100644 index c3a003434b8..00000000000 --- a/torchvision/csrc/io/decoder/audio_stream.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include "audio_stream.h" -#include -#include "util.h" - -namespace ffmpeg { - -namespace { -static int get_nb_channels(const AVFrame* frame, const AVCodecContext* codec) { -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) - return frame ? frame->ch_layout.nb_channels : codec->ch_layout.nb_channels; -#else - return frame ? frame->channels : codec->channels; -#endif -} - -bool operator==(const AudioFormat& x, const AVFrame& y) { - return x.samples == static_cast(y.sample_rate) && - x.channels == static_cast(get_nb_channels(&y, nullptr)) && - x.format == y.format; -} - -bool operator==(const AudioFormat& x, const AVCodecContext& y) { - return x.samples == static_cast(y.sample_rate) && - x.channels == static_cast(get_nb_channels(nullptr, &y)) && - x.format == y.sample_fmt; -} - -AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) { - x.samples = y.sample_rate; - x.channels = get_nb_channels(&y, nullptr); - x.format = y.format; - return x; -} - -AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) { - x.samples = y.sample_rate; - x.channels = get_nb_channels(nullptr, &y); - x.format = y.sample_fmt; - return x; -} -} // namespace - -AudioStream::AudioStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const AudioFormat& format) - : Stream( - inputCtx, - MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime, - 0) {} - -AudioStream::~AudioStream() { - if (sampler_) { - sampler_->shutdown(); - sampler_.reset(); - } -} - -int AudioStream::initFormat() { - // set output format - if (format_.format.audio.samples == 0) { - format_.format.audio.samples = codecCtx_->sample_rate; - } -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) - if (format_.format.audio.channels == 0) { - format_.format.audio.channels = codecCtx_->ch_layout.nb_channels; - } -#else - if (format_.format.audio.channels == 0) { - format_.format.audio.channels = codecCtx_->channels; - } -#endif - if (format_.format.audio.format == AV_SAMPLE_FMT_NONE) { - format_.format.audio.format = codecCtx_->sample_fmt; - } - - return format_.format.audio.samples != 0 && - format_.format.audio.channels != 0 && - format_.format.audio.format != AV_SAMPLE_FMT_NONE - ? 0 - : -1; -} - -// copies audio sample bytes via swr_convert call in audio_sampler.cpp -int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) { - if (!sampler_) { - sampler_ = std::make_unique(codecCtx_); - } - // check if input format gets changed - if (flush ? !(sampler_->getInputFormat().audio == *codecCtx_) - : !(sampler_->getInputFormat().audio == *frame_)) { - // - reinit sampler - SamplerParameters params; - params.type = format_.type; - params.out = format_.format; - params.in = FormatUnion(); - flush ? toAudioFormat(params.in.audio, *codecCtx_) - : toAudioFormat(params.in.audio, *frame_); - if (!sampler_->init(params)) { - return -1; - } - - VLOG(1) << "Set input audio sampler format" - << ", samples: " << params.in.audio.samples - << ", channels: " << params.in.audio.channels - << ", format: " << params.in.audio.format - << " : output audio sampler format" - << ", samples: " << format_.format.audio.samples - << ", channels: " << format_.format.audio.channels - << ", format: " << format_.format.audio.format; - } - // calls to a sampler that converts the audio samples and copies them to the - // out buffer via ffmpeg::swr_convert - return sampler_->sample(flush ? nullptr : frame_, out); -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/audio_stream.h b/torchvision/csrc/io/decoder/audio_stream.h deleted file mode 100644 index 2d6457b68f5..00000000000 --- a/torchvision/csrc/io/decoder/audio_stream.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include "audio_sampler.h" -#include "stream.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one audio stream. - */ - -class AudioStream : public Stream { - public: - AudioStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const AudioFormat& format); - ~AudioStream() override; - - private: - int initFormat() override; - int copyFrameBytes(ByteStorage* out, bool flush) override; - - private: - std::unique_ptr sampler_; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/cc_stream.cpp b/torchvision/csrc/io/decoder/cc_stream.cpp deleted file mode 100644 index 89174c396fd..00000000000 --- a/torchvision/csrc/io/decoder/cc_stream.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include "cc_stream.h" - -namespace ffmpeg { - -CCStream::CCStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format) - : SubtitleStream(inputCtx, index, convertPtsToWallTime, format) { - format_.type = TYPE_CC; -} - -AVCodec* CCStream::findCodec(AVCodecParameters* params) { - if (params->codec_id == AV_CODEC_ID_BIN_DATA && - params->codec_type == AVMEDIA_TYPE_DATA) { - // obtain subtitles codec - params->codec_id = AV_CODEC_ID_MOV_TEXT; - params->codec_type = AVMEDIA_TYPE_SUBTITLE; - } - return Stream::findCodec(params); -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/cc_stream.h b/torchvision/csrc/io/decoder/cc_stream.h deleted file mode 100644 index 3a1d169f014..00000000000 --- a/torchvision/csrc/io/decoder/cc_stream.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include "subtitle_stream.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one closed captions stream. - */ -class CCStream : public SubtitleStream { - public: - CCStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format); - - private: - AVCodec* findCodec(AVCodecParameters* params) override; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp deleted file mode 100644 index 7221445840e..00000000000 --- a/torchvision/csrc/io/decoder/decoder.cpp +++ /dev/null @@ -1,764 +0,0 @@ -#include "decoder.h" -#include -#include -#include -#include -#include -#include "audio_stream.h" -#include "cc_stream.h" -#include "subtitle_stream.h" -#include "util.h" -#include "video_stream.h" - -namespace ffmpeg { - -namespace { - -constexpr size_t kIoBufferSize = 96 * 1024; -constexpr size_t kIoPaddingSize = AV_INPUT_BUFFER_PADDING_SIZE; -constexpr size_t kLogBufferSize = 1024; - -bool mapFfmpegType(AVMediaType media, MediaType* type) { - switch (media) { - case AVMEDIA_TYPE_AUDIO: - *type = TYPE_AUDIO; - return true; - case AVMEDIA_TYPE_VIDEO: - *type = TYPE_VIDEO; - return true; - case AVMEDIA_TYPE_SUBTITLE: - *type = TYPE_SUBTITLE; - return true; - case AVMEDIA_TYPE_DATA: - *type = TYPE_CC; - return true; - default: - return false; - } -} - -std::unique_ptr createStream( - MediaType type, - AVFormatContext* ctx, - int idx, - bool convertPtsToWallTime, - const FormatUnion& format, - int64_t loggingUuid) { - switch (type) { - case TYPE_AUDIO: - return std::make_unique( - ctx, idx, convertPtsToWallTime, format.audio); - case TYPE_VIDEO: - return std::make_unique( - // negative loggingUuid indicates video streams. - ctx, - idx, - convertPtsToWallTime, - format.video, - -loggingUuid); - case TYPE_SUBTITLE: - return std::make_unique( - ctx, idx, convertPtsToWallTime, format.subtitle); - case TYPE_CC: - return std::make_unique( - ctx, idx, convertPtsToWallTime, format.subtitle); - default: - return nullptr; - } -} - -} // Namespace - -/* static */ -void Decoder::logFunction(void* avcl, int level, const char* cfmt, va_list vl) { - if (!avcl) { - // Nothing can be done here - return; - } - - AVClass* avclass = *reinterpret_cast(avcl); - if (!avclass) { - // Nothing can be done here - return; - } - Decoder* decoder = nullptr; - if (strcmp(avclass->class_name, "AVFormatContext") == 0) { - AVFormatContext* context = reinterpret_cast(avcl); - if (context) { - decoder = reinterpret_cast(context->opaque); - } - } else if (strcmp(avclass->class_name, "AVCodecContext") == 0) { - AVCodecContext* context = reinterpret_cast(avcl); - if (context) { - decoder = reinterpret_cast(context->opaque); - } - } else if (strcmp(avclass->class_name, "AVIOContext") == 0) { - AVIOContext* context = reinterpret_cast(avcl); - // only if opaque was assigned to Decoder pointer - if (context && context->read_packet == Decoder::readFunction) { - decoder = reinterpret_cast(context->opaque); - } - } else if (strcmp(avclass->class_name, "SWResampler") == 0) { - // expect AVCodecContext as parent - if (avclass->parent_log_context_offset) { - AVClass** parent = - *(AVClass***)(((uint8_t*)avcl) + avclass->parent_log_context_offset); - AVCodecContext* context = reinterpret_cast(parent); - if (context) { - decoder = reinterpret_cast(context->opaque); - } - } - } else if (strcmp(avclass->class_name, "SWScaler") == 0) { - // cannot find a way to pass context pointer through SwsContext struct - } else { - VLOG(2) << "Unknown context class: " << avclass->class_name; - } - - if (decoder != nullptr && decoder->enableLogLevel(level)) { - char buf[kLogBufferSize] = {0}; - // Format the line - int* prefix = decoder->getPrintPrefix(); - *prefix = 1; - av_log_format_line(avcl, level, cfmt, vl, buf, sizeof(buf) - 1, prefix); - // pass message to the decoder instance - std::string msg(buf); - decoder->logCallback(level, msg); - } -} - -bool Decoder::enableLogLevel(int level) const { - return ssize_t(level) <= params_.logLevel; -} - -void Decoder::logCallback(int level, const std::string& message) { - LOG(INFO) << "Msg, uuid=" << params_.loggingUuid << " level=" << level - << " msg=" << message; -} - -/* static */ -int Decoder::shutdownFunction(void* ctx) { - Decoder* decoder = (Decoder*)ctx; - if (decoder == nullptr) { - return 1; - } - return decoder->shutdownCallback(); -} - -int Decoder::shutdownCallback() { - return interrupted_ ? 1 : 0; -} - -/* static */ -int Decoder::readFunction(void* opaque, uint8_t* buf, int size) { - Decoder* decoder = reinterpret_cast(opaque); - if (decoder == nullptr) { - return 0; - } - int bytesRead = decoder->readCallback(buf, size); - return bytesRead == 0 ? AVERROR_EOF : bytesRead; -} - -/* static */ -int64_t Decoder::seekFunction(void* opaque, int64_t offset, int whence) { - Decoder* decoder = reinterpret_cast(opaque); - if (decoder == nullptr) { - return -1; - } - return decoder->seekCallback(offset, whence); -} - -int Decoder::readCallback(uint8_t* buf, int size) { - return seekableBuffer_.read(buf, size, params_.timeoutMs); -} - -int64_t Decoder::seekCallback(int64_t offset, int whence) { - return seekableBuffer_.seek(offset, whence, params_.timeoutMs); -} - -/* static */ -void Decoder::initOnce() { - static std::once_flag flagInit; - std::call_once(flagInit, []() { -#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 - av_register_all(); - avcodec_register_all(); -#endif - avformat_network_init(); - av_log_set_callback(Decoder::logFunction); - av_log_set_level(AV_LOG_ERROR); - VLOG(1) << "Registered ffmpeg libs"; - }); -} - -Decoder::Decoder() { - initOnce(); -} - -Decoder::~Decoder() { - cleanUp(); -} - -// Initialise the format context that holds information about the container and -// fill it with minimal information about the format (codecs are not opened -// here). Function reads in information about the streams from the container -// into inputCtx and then passes it to decoder::openStreams. Finally, if seek is -// specified within the decoder parameters, it seeks into the correct frame -// (note, the seek defined here is "precise" seek). -bool Decoder::init( - const DecoderParameters& params, - DecoderInCallback&& in, - std::vector* metadata) { - cleanUp(); - - if ((params.uri.empty() || in) && (!params.uri.empty() || !in)) { - LOG(ERROR) - << "uuid=" << params_.loggingUuid - << " either external URI gets provided or explicit input callback"; - return false; - } - - // set callback and params - params_ = params; - - if (!(inputCtx_ = avformat_alloc_context())) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " cannot allocate format context"; - return false; - } - - AVInputFormat* fmt = nullptr; - int result = 0; - if (in) { - ImageType type = ImageType::UNKNOWN; - if ((result = seekableBuffer_.init( - std::forward(in), - params_.timeoutMs, - params_.maxSeekableBytes, - params_.isImage ? &type : nullptr)) < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " can't initiate seekable buffer"; - cleanUp(); - return false; - } - - if (params_.isImage) { - const char* fmtName = "image2"; - switch (type) { - case ImageType::JPEG: - fmtName = "jpeg_pipe"; - break; - case ImageType::PNG: - fmtName = "png_pipe"; - break; - case ImageType::TIFF: - fmtName = "tiff_pipe"; - break; - default: - break; - } - - fmt = (AVInputFormat*)av_find_input_format(fmtName); - } - - const size_t avioCtxBufferSize = kIoBufferSize; - uint8_t* avioCtxBuffer = - (uint8_t*)av_malloc(avioCtxBufferSize + kIoPaddingSize); - if (!avioCtxBuffer) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " av_malloc cannot allocate " << avioCtxBufferSize - << " bytes"; - cleanUp(); - return false; - } - - if (!(avioCtx_ = avio_alloc_context( - avioCtxBuffer, - avioCtxBufferSize, - 0, - reinterpret_cast(this), - &Decoder::readFunction, - nullptr, - result == 1 ? &Decoder::seekFunction : nullptr))) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " avio_alloc_context failed"; - av_free(avioCtxBuffer); - cleanUp(); - return false; - } - - avioCtx_->max_packet_size = params.maxEncodedBufferSize; - - inputCtx_->pb = avioCtx_; - inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO; - } - - inputCtx_->opaque = reinterpret_cast(this); - inputCtx_->interrupt_callback.callback = Decoder::shutdownFunction; - inputCtx_->interrupt_callback.opaque = reinterpret_cast(this); - - // add network timeout - inputCtx_->flags |= AVFMT_FLAG_NONBLOCK; - - AVDictionary* options = nullptr; - if (params_.listen) { - av_dict_set_int(&options, "listen", 1, 0); - } - if (params_.timeoutMs > 0) { - av_dict_set_int(&options, "analyzeduration", params_.timeoutMs * 1000, 0); - av_dict_set_int(&options, "stimeout", params_.timeoutMs * 1000, 0); - av_dict_set_int(&options, "rw_timeout", params_.timeoutMs * 1000, 0); - if (!params_.tlsCertFile.empty()) { - av_dict_set(&options, "cert_file", params_.tlsCertFile.data(), 0); - } - if (!params_.tlsKeyFile.empty()) { - av_dict_set(&options, "key_file", params_.tlsKeyFile.data(), 0); - } - } - - av_dict_set_int(&options, "probesize", params_.probeSize, 0); - - interrupted_ = false; - - // ffmpeg avformat_open_input call can hang if media source doesn't respond - // set a guard for handle such situations, if requested - std::promise p; - std::future f = p.get_future(); - std::unique_ptr guard; - if (params_.preventStaleness) { - guard = std::make_unique([&f, this]() { - auto timeout = std::chrono::milliseconds(params_.timeoutMs); - if (std::future_status::timeout == f.wait_for(timeout)) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " cannot open stream within " << params_.timeoutMs - << " ms"; - interrupted_ = true; - } - }); - } - - if (fmt) { - result = avformat_open_input(&inputCtx_, nullptr, fmt, &options); - } else { - result = - avformat_open_input(&inputCtx_, params_.uri.c_str(), nullptr, &options); - } - - av_dict_free(&options); - - if (guard) { - p.set_value(true); - guard->join(); - guard.reset(); - } - - if (result < 0 || interrupted_) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " avformat_open_input failed, error=" - << Util::generateErrorDesc(result); - cleanUp(); - return false; - } - - result = avformat_find_stream_info(inputCtx_, nullptr); - - if (result < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " avformat_find_stream_info failed, error=" - << Util::generateErrorDesc(result); - cleanUp(); - return false; - } - - if (!openStreams(metadata)) { - LOG(ERROR) << "uuid=" << params_.loggingUuid << " cannot activate streams"; - cleanUp(); - return false; - } - // SyncDecoder inherits Decoder which would override onInit. - onInit(); - - if (params.startOffset != 0) { - auto offset = params.startOffset <= params.seekAccuracy - ? 0 - : params.startOffset - params.seekAccuracy; - - av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD); - } - - for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) { - if ( -#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 - inputCtx_->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO -#else // FFMPEG 4.0+ - inputCtx_->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO -#endif - && inputCtx_->streams[i]->duration > 0) { - // There is at least two 1/r_frame_rates from the frame before the last - // one until the video duration, let's prefer to set duration after the - // frame before the last one, but as early as possible - double correction = 2 * inputCtx_->streams[i]->r_frame_rate.den / - (double)inputCtx_->streams[i]->r_frame_rate.num - - 1 / (double)AV_TIME_BASE; - videoDurationMs_ = 1000 * inputCtx_->streams[i]->duration * - inputCtx_->streams[i]->time_base.num / - (double)inputCtx_->streams[i]->time_base.den - - 1000 * correction; - break; - } - } - - VLOG(1) << "Decoder initialized, log level: " << params_.logLevel; - VLOG(1) << "Video duration: " << videoDurationMs_; - return true; -} - -// open appropriate CODEC for every type of stream and move it to the class -// variable `streams_` and make sure it is in range for decoding -bool Decoder::openStreams(std::vector* metadata) { - for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) { - // - find the corespondent format at params_.formats set - MediaFormat format; -#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 - const auto media = inputCtx_->streams[i]->codec->codec_type; -#else // FFMPEG 4.0+ - const auto media = inputCtx_->streams[i]->codecpar->codec_type; -#endif - if (!mapFfmpegType(media, &format.type)) { - VLOG(1) << "Stream media: " << media << " at index " << i - << " gets ignored, unknown type"; - - continue; // unsupported type - } - - // check format - auto it = params_.formats.find(format); - if (it == params_.formats.end()) { - VLOG(1) << "Stream type: " << format.type << " at index: " << i - << " gets ignored, caller is not interested"; - continue; // clients don't care about this media format - } - - // do we have stream of this type? - auto stream = findByType(format); - - // should we process this stream? - - if (it->stream == -2 || // all streams of this type are welcome - (!stream && (it->stream == -1 || it->stream == i))) { // new stream - VLOG(1) << "Stream type: " << format.type << " found, at index: " << i; - auto stream_2 = createStream( - format.type, - inputCtx_, - i, - params_.convertPtsToWallTime, - it->format, - params_.loggingUuid); - CHECK(stream_2); - if (stream_2->openCodec(metadata, params_.numThreads) < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " open codec failed, stream_idx=" << i; - return false; - } - streams_.emplace(i, std::move(stream_2)); - inRange_.set(i, true); - } - } - - return true; -} - -void Decoder::shutdown() { - cleanUp(); -} - -void Decoder::interrupt() { - interrupted_ = true; -} - -void Decoder::cleanUp() { - if (!interrupted_) { - interrupted_ = true; - } - - if (inputCtx_) { - for (auto& stream : streams_) { - // Drain stream buffers. - DecoderOutputMessage msg; - while (msg.payload = nullptr, stream.second->flush(&msg, true) > 0) { - } - stream.second.reset(); - } - streams_.clear(); - avformat_close_input(&inputCtx_); - } - if (avioCtx_) { - av_freep(&avioCtx_->buffer); - av_freep(&avioCtx_); - } - - // reset callback - seekableBuffer_.shutdown(); -} - -// function does actual work, derived class calls it in working thread -// periodically. On success method returns 0, ENODATA on EOF, ETIMEDOUT if -// no frames got decoded in the specified timeout time, AVERROR_BUFFER_TOO_SMALL -// when unable to allocate packet and error on unrecoverable error -int Decoder::getFrame(size_t workingTimeInMs) { - if (inRange_.none()) { - return ENODATA; - } - // decode frames until cache is full and leave thread - // once decode() method gets called and grab some bytes - // run this method again - // init package - // update 03/22: moving memory management to ffmpeg - AVPacket* avPacket; - avPacket = av_packet_alloc(); - if (avPacket == nullptr) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " decoder as not able to allocate the packet."; - return AVERROR_BUFFER_TOO_SMALL; - } - avPacket->data = nullptr; - avPacket->size = 0; - - auto end = std::chrono::steady_clock::now() + - std::chrono::milliseconds(workingTimeInMs); - // return true if elapsed time less than timeout - auto watcher = [end]() -> bool { - return std::chrono::steady_clock::now() <= end; - }; - - int result = 0; - size_t decodingErrors = 0; - bool decodedFrame = false; - while (!interrupted_ && inRange_.any() && !decodedFrame) { - if (watcher() == false) { - LOG(ERROR) << "uuid=" << params_.loggingUuid << " hit ETIMEDOUT"; - result = ETIMEDOUT; - break; - } - result = av_read_frame(inputCtx_, avPacket); - if (result == AVERROR(EAGAIN)) { - VLOG(4) << "Decoder is busy..."; - std::this_thread::yield(); - result = 0; // reset error, EAGAIN is not an error at all - // reset the packet to default settings - av_packet_unref(avPacket); - continue; - } else if (result == AVERROR_EOF) { - flushStreams(); - VLOG(1) << "End of stream"; - result = ENODATA; - break; - } else if ( - result == AVERROR(EPERM) && params_.skipOperationNotPermittedPackets) { - // reset error, lets skip packets with EPERM - result = 0; - // reset the packet to default settings - av_packet_unref(avPacket); - continue; - } else if (result < 0) { - flushStreams(); - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " error detected: " << Util::generateErrorDesc(result); - break; - } - - // get stream; if stream cannot be found reset the packet to - // default settings - auto stream = findByIndex(avPacket->stream_index); - if (stream == nullptr || !inRange_.test(stream->getIndex())) { - av_packet_unref(avPacket); - continue; - } - - size_t numConsecutiveNoBytes = 0; - // it can be only partial decoding of the package bytes - do { - // decode package - bool gotFrame = false; - bool hasMsg = false; - // packet either got consumed completely or not at all - if ((result = processPacket( - stream, avPacket, &gotFrame, &hasMsg, params_.fastSeek)) < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " processPacket failed with code: " << result; - break; - } - - if (!gotFrame && params_.maxProcessNoBytes != 0 && - ++numConsecutiveNoBytes > params_.maxProcessNoBytes) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " exceeding max amount of consecutive no bytes"; - break; - } - if (result > 0) { - numConsecutiveNoBytes = 0; - } - - decodedFrame |= hasMsg; - } while (result == 0); - - // post loop check - if (result < 0) { - if (params_.maxPackageErrors != 0 && // check errors - ++decodingErrors >= params_.maxPackageErrors) { // reached the limit - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " exceeding max amount of consecutive package errors"; - break; - } - } else { - decodingErrors = 0; // reset on success - } - - result = 0; - - av_packet_unref(avPacket); - - if (params_.uniformSampling > 1) { - if (doSeek_) { - double duration = - videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration; - double step = - (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1)); - avformat_seek_file( - inputCtx_, - -1, - static_cast(step * kFramesDecoded_) + 1, - static_cast(step * (kFramesDecoded_ + 1)), - static_cast(step * (kFramesDecoded_ + 1)), - 0); - ++kFramesDecoded_; - doSeek_ = false; - } - } - } - - av_packet_free(&avPacket); - VLOG(2) << "Interrupted loop" << ", interrupted_ " << interrupted_ - << ", inRange_.any() " << inRange_.any() << ", decodedFrame " - << decodedFrame << ", result " << result; - - // loop can be terminated, either by: - // 1. explicitly interrupted - // 3. unrecoverable error or ENODATA (end of stream) or ETIMEDOUT (timeout) - // 4. decoded frames pts are out of the specified range - // 5. success decoded frame - if (interrupted_) { - return EINTR; - } - if (result != 0) { - return result; - } - if (inRange_.none()) { - return ENODATA; - } - return 0; -} - -// find stream by stream index -Stream* Decoder::findByIndex(int streamIndex) const { - auto it = streams_.find(streamIndex); - return it != streams_.end() ? it->second.get() : nullptr; -} - -// find stream by type; note finds only the first stream of a given type -Stream* Decoder::findByType(const MediaFormat& format) const { - for (auto& stream : streams_) { - if (stream.second->getMediaFormat().type == format.type) { - return stream.second.get(); - } - } - return nullptr; -} - -// given the stream and packet, decode the frame buffers into the -// DecoderOutputMessage data structure via stream::decodePacket function. -int Decoder::processPacket( - Stream* stream, - AVPacket* packet, - bool* gotFrame, - bool* hasMsg, - bool fastSeek) { - // decode package - int result; - DecoderOutputMessage msg; - msg.payload = params_.headerOnly ? nullptr : createByteStorage(0); - *hasMsg = false; - if ((result = stream->decodePacket( - packet, &msg, params_.headerOnly, gotFrame)) >= 0 && - *gotFrame) { - // check end offset - bool endInRange = - params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; - inRange_.set(stream->getIndex(), endInRange); - // if fastseek is enabled, we're returning the first - // frame that we decode after (potential) seek. - // By default, we perform accurate seek to the closest - // following frame - bool startCondition = true; - if (!fastSeek) { - startCondition = msg.header.pts >= params_.startOffset; - } - if (endInRange && startCondition) { - *hasMsg = pushMsg(std::move(msg)); - } - } - return result; -} - -bool Decoder::pushMsg(DecoderOutputMessage&& msg) { - pastDecodedPTS_ = currentDecodedPTS_; - currentDecodedPTS_ = msg.header.pts; - - if (params_.uniformSampling <= 1) { - push(std::move(msg)); - return true; - } - - double duration = - videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration; - double step = - (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1)); - if (pastDecodedPTS_ < step * kFramesDecoded_ && - step * kFramesDecoded_ <= currentDecodedPTS_) { - push(std::move(msg)); - doSeek_ = true; - return true; - } - - return false; -} - -void Decoder::flushStreams() { - VLOG(1) << "Flushing streams..."; - for (auto& stream : streams_) { - DecoderOutputMessage msg; - while (msg.payload = (params_.headerOnly ? nullptr : createByteStorage(0)), - stream.second->flush(&msg, params_.headerOnly) > 0) { - // check end offset - bool endInRange = - params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; - inRange_.set(stream.second->getIndex(), endInRange); - if (endInRange && msg.header.pts >= params_.startOffset) { - pushMsg(std::move(msg)); - } else { - msg.payload.reset(); - } - } - } -} - -int Decoder::decode_all(const DecoderOutCallback& callback) { - int result; - do { - DecoderOutputMessage out; - if (0 == (result = decode(&out, params_.timeoutMs))) { - callback(std::move(out)); - } - } while (result == 0); - return result; -} -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/decoder.h b/torchvision/csrc/io/decoder/decoder.h deleted file mode 100644 index 172a011f93e..00000000000 --- a/torchvision/csrc/io/decoder/decoder.h +++ /dev/null @@ -1,100 +0,0 @@ -#pragma once - -#include -#include -#include "seekable_buffer.h" -#include "stream.h" - -#if defined(_MSC_VER) -#include -using ssize_t = SSIZE_T; -#endif - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode media streams. - * Media bytes can be explicitly provided through read-callback - * or fetched internally by FFMPEG library - */ -class Decoder : public MediaDecoder { - public: - Decoder(); - ~Decoder() override; - - // MediaDecoder overrides - bool init( - const DecoderParameters& params, - DecoderInCallback&& in, - std::vector* metadata) override; - int decode_all(const DecoderOutCallback& callback) override; - void shutdown() override; - void interrupt() override; - - protected: - // function does actual work, derived class calls it in working thread - // periodically. On success method returns 0, ENOADATA on EOF, ETIMEDOUT if - // no frames got decoded in the specified timeout time, and error on - // unrecoverable error. - int getFrame(size_t workingTimeInMs = 100); - - // Derived class must override method and consume the provided message - virtual void push(DecoderOutputMessage&& buffer) = 0; - - // Fires on init call - virtual void onInit() {} - - public: - // C-style FFMPEG API requires C/static methods for callbacks - static void logFunction(void* avcl, int level, const char* cfmt, va_list vl); - static int shutdownFunction(void* ctx); - static int readFunction(void* opaque, uint8_t* buf, int size); - static int64_t seekFunction(void* opaque, int64_t offset, int whence); - // can be called by any classes or API - static void initOnce(); - - int* getPrintPrefix() { - return &printPrefix; - } - double videoDurationMs_ = -1; - - private: - // mark below function for a proper invocation - bool enableLogLevel(int level) const; - void logCallback(int level, const std::string& message); - int readCallback(uint8_t* buf, int size); - int64_t seekCallback(int64_t offset, int whence); - int shutdownCallback(); - - bool openStreams(std::vector* metadata); - Stream* findByIndex(int streamIndex) const; - Stream* findByType(const MediaFormat& format) const; - int processPacket( - Stream* stream, - AVPacket* packet, - bool* gotFrame, - bool* hasMsg, - bool fastSeek = false); - void flushStreams(); - void cleanUp(); - bool pushMsg(DecoderOutputMessage&& - msg); // returns whether frame is passed to downstream - - protected: - DecoderParameters params_; - - private: - SeekableBuffer seekableBuffer_; - int printPrefix{1}; - - std::atomic interrupted_{false}; - AVFormatContext* inputCtx_{nullptr}; - AVIOContext* avioCtx_{nullptr}; - std::unordered_map> streams_; - std::bitset<64> inRange_; - int kFramesDecoded_{0}; - int64_t pastDecodedPTS_{-1}; - int64_t currentDecodedPTS_{-1}; - bool doSeek_{false}; -}; -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h deleted file mode 100644 index d2dc5c7935b..00000000000 --- a/torchvision/csrc/io/decoder/defs.h +++ /dev/null @@ -1,415 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -extern "C" { -#include -#include -#include -#include -#include -#include -#include "libswscale/swscale.h" -} - -namespace ffmpeg { - -// bit mask of formats, keep them in form 2^n -enum MediaType : size_t { - TYPE_AUDIO = 1, - TYPE_VIDEO = 2, - TYPE_SUBTITLE = 4, - TYPE_CC = 8, // closed captions from transport streams -}; - -// audio -struct AudioFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - bool operator==(const AudioFormat& x) const { - return x.format == format && x.samples == samples && x.channels == channels; - } - - size_t samples{0}; // number samples per second (frequency) - size_t channels{0}; // number of channels - long format{-1}; // AVSampleFormat, auto AV_SAMPLE_FMT_NONE - size_t padding[2]; - // -- alignment 40 bytes -}; - -// video -struct VideoFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - bool operator==(const VideoFormat& x) const { - return x.format == format && x.width == width && x.height == height; - } - /* - When width = 0, height = 0, minDimension = 0, and maxDimension = 0, - keep the original frame resolution - When width = 0, height = 0, minDimension != 0, and maxDimension = 0, - keep the aspect ratio and resize the frame so that shorter edge size is - minDimension - When width = 0, height = 0, minDimension = 0, and maxDimension != 0, - keep the aspect ratio and resize the frame so that longer edge size is - maxDimension - When width = 0, height = 0, minDimension != 0, and maxDimension != 0, - resize the frame so that shorter edge size is minDimension, and - longer edge size is maxDimension. The aspect ratio may not be preserved - When width = 0, height != 0, minDimension = 0, and maxDimension = 0, - keep the aspect ratio and resize the frame so that frame height is $height - When width != 0, height = 0, minDimension = 0, and maxDimension = 0, - keep the aspect ratio and resize the frame so that frame width is $width - When width != 0, height != 0, minDimension = 0, and maxDimension = 0, - resize the frame so that frame width and height are set to $width and - $height, - respectively - */ - size_t width{0}; // width in pixels - size_t height{0}; // height in pixels - long format{-1}; // AVPixelFormat, auto AV_PIX_FMT_NONE - size_t minDimension{0}; // choose min dimension and rescale accordingly - size_t maxDimension{0}; // choose max dimension and rescale accordingly - size_t cropImage{0}; // request image crop - // -- alignment 40 bytes -}; - -// subtitle/cc -struct SubtitleFormat { - long type{0}; // AVSubtitleType, auto SUBTITLE_NONE - size_t padding[4]; - // -- alignment 40 bytes -}; - -union FormatUnion { - FormatUnion() : audio() {} - explicit FormatUnion(int) : video() {} - explicit FormatUnion(char) : subtitle() {} - explicit FormatUnion(double) : subtitle() {} - AudioFormat audio; - VideoFormat video; - SubtitleFormat subtitle; - // -- alignment 40 bytes -}; - -/* - MediaFormat data structure serves as input/output parameter. - Caller assigns values for input formats - or leave default values for auto detection - For output formats all fields will be set to the specific values -*/ -struct MediaFormat { - // for using map/set data structures - bool operator<(const MediaFormat& x) const { - return type < x.type; - } - bool operator==(const MediaFormat& x) const { - if (type != x.type) { - return false; - } - switch (type) { - case TYPE_AUDIO: - return format.audio == x.format.audio; - case TYPE_VIDEO: - return format.video == x.format.video; - case TYPE_SUBTITLE: - case TYPE_CC: - return true; - default: - return false; - } - } - - explicit MediaFormat(long s = -1) : type(TYPE_AUDIO), stream(s), format() {} - explicit MediaFormat(int x, long s = -1) - : type(TYPE_VIDEO), stream(s), format(x) {} - explicit MediaFormat(char x, long s = -1) - : type(TYPE_SUBTITLE), stream(s), format(x) {} - explicit MediaFormat(double x, long s = -1) - : type(TYPE_CC), stream(s), format(x) {} - - static MediaFormat makeMediaFormat(AudioFormat format, long stream) { - MediaFormat result(stream); - result.format.audio = format; - return result; - } - - static MediaFormat makeMediaFormat(VideoFormat format, long stream) { - MediaFormat result(0, stream); - result.format.video = format; - return result; - } - - static MediaFormat makeMediaFormat(SubtitleFormat format, long stream) { - MediaFormat result('0', stream); - result.format.subtitle = format; - return result; - } - - // format type - MediaType type; - // stream index: - // set -1 for one stream auto detection, -2 for all streams auto detection, - // >= 0, specified stream, if caller knows the stream index (unlikely) - long stream; - // union keeps one of the possible formats, defined by MediaType - FormatUnion format; -}; - -struct DecoderParameters { - // local file, remote file, http url, rtmp stream uri, etc. anything that - // ffmpeg can recognize - std::string uri{std::string()}; - // timeout on getting bytes for decoding - size_t timeoutMs{1000}; - // logging level, default AV_LOG_PANIC - long logLevel{0}; - // when decoder would give up, 0 means never - size_t maxPackageErrors{0}; - // max allowed consecutive times no bytes are processed. 0 means for infinite. - size_t maxProcessNoBytes{0}; - // start offset (us) - long startOffset{0}; - // end offset (us) - long endOffset{-1}; - // logging id - int64_t loggingUuid{0}; - // internal max seekable buffer size - size_t maxSeekableBytes{0}; - // adjust header pts to the epoch time - bool convertPtsToWallTime{false}; - // indicate if input stream is an encoded image - bool isImage{false}; - // listen and wait for new rtmp stream - bool listen{false}; - // don't copy frame body, only header - bool headerOnly{false}; - // enable fast seek (seek only to keyframes) - bool fastSeek{false}; - // interrupt init method on timeout - bool preventStaleness{true}; - // seek tolerated accuracy (us) - double seekAccuracy{1000000.0}; - // Allow multithreaded decoding for numThreads > 1; - // 0 numThreads=0 sets up sensible defaults - int numThreads{1}; - // what media types should be processed, default none - std::set formats; - - // can be used for asynchronous decoders - size_t cacheSize{8192}; // mow many bytes to cache before stop reading bytes - size_t cacheTimeoutMs{1000}; // timeout on bytes writing - bool enforceCacheSize{false}; // drop output frames if cache is full - bool mergeAudioMessages{false}; // combine collocated audio messages together - - std::string tlsCertFile; - std::string tlsKeyFile; - - // Skip packets that fail with EPERM errors and continue decoding. - bool skipOperationNotPermittedPackets{false}; - - // probing size in bytes, i.e. the size of the data to analyze to get stream - // information. A higher value will enable detecting more information in case - // it is dispersed into the stream, but will increase latency. Must be an - // integer not lesser than 32. It is 5000000 by default. - int64_t probeSize{5000000}; - - // Expected duration of the video to be decoded, mainly used with uniform - // sampling - float expectedDuration{0.0f}; - - // Sample N key-frames from the video roughly uniformly across the timeline - int uniformSampling{0}; - - // with 0, ffmpeg allocates buffers of size 32768 bytes for encoded frames. - // Override this with bigger buffer size if needed. - int64_t maxEncodedBufferSize{0}; -}; - -struct DecoderHeader { - // message id, from 0 till ... - size_t seqno{0}; - // decoded timestamp in microseconds from either beginning of the stream or - // from epoch time, see DecoderParameters::convertPtsToWallTime - long pts{0}; - // decoded key frame - size_t keyFrame{0}; - // frames per second, valid only for video streams - double fps{0}; - // format specifies what kind frame is in a payload - MediaFormat format; -}; - -// Abstract interface ByteStorage class -class ByteStorage { - public: - virtual ~ByteStorage() = default; - // makes sure that buffer has at least n bytes available for writing, if not - // storage must reallocate memory. - virtual void ensure(size_t n) = 0; - // caller must not to write more than available bytes - virtual uint8_t* writableTail() = 0; - // caller confirms that n bytes were written to the writable tail - virtual void append(size_t n) = 0; - // caller confirms that n bytes were read from the read buffer - virtual void trim(size_t n) = 0; - // gives an access to the beginning of the read buffer - virtual const uint8_t* data() const = 0; - // returns the stored size in bytes - virtual size_t length() const = 0; - // returns available capacity for writable tail - virtual size_t tail() const = 0; - // clears content, keeps capacity - virtual void clear() = 0; -}; - -struct DecoderOutputMessage { - DecoderHeader header; - std::unique_ptr payload; -}; - -/* - * External provider of the ecnoded bytes, specific implementation is left for - * different use cases, like file, memory, external network end-points, etc. - * Normally input/output parameter @out set to valid, not null buffer pointer, - * which indicates "read" call, however there are "seek" modes as well. - - * @out != nullptr => read from the current offset, @whence got ignored, - * @size bytes to read => return number bytes got read, 0 if no more bytes - * available, < 0 on error. - - * @out == nullptr, @timeoutMs == 0 => does provider support "seek" - * capability in a first place? @size & @whence got ignored, return 0 on - * success, < 0 if "seek" mode is not supported. - - * @out == nullptr, @timeoutMs != 0 => normal seek call - * offset == @size, i.e. @whence = [SEEK_SET, SEEK_CUR, SEEK_END, AVSEEK_SIZE) - * return < 0 on error, position if @whence = [SEEK_SET, SEEK_CUR, SEEK_END], - * length of buffer if @whence = [AVSEEK_SIZE]. - */ -using DecoderInCallback = - std::function; - -using DecoderOutCallback = std::function; - -struct DecoderMetadata { - // time base numerator - long num{0}; - // time base denominator - long den{1}; - // duration of the stream, in miscroseconds, if available - long duration{-1}; - // frames per second, valid only for video streams - double fps{0}; - // format specifies what kind frame is in a payload - MediaFormat format; -}; -/** - * Abstract class for decoding media bytes - * It has two different modes. Internal media bytes retrieval for given uri and - * external media bytes provider in case of memory streams - */ -class MediaDecoder { - public: - virtual ~MediaDecoder() = default; - - /** - * Initializes media decoder with parameters, - * calls callback when media bytes are available. - * Media bytes get fetched internally from provided URI - * or invokes provided input callback to get media bytes. - * Input callback must be empty for the internal media provider - * Caller can provide non-null pointer for the input container - * if headers to obtain the streams metadata (optional) - */ - virtual bool init( - const DecoderParameters& params, - DecoderInCallback&& in, - std::vector* metadata) = 0; - - /** - * Polls available decoded one frame from decoder - * Returns error code, 0 - for success - */ - virtual int decode(DecoderOutputMessage* out, uint64_t timeoutMs) = 0; - - /** - * Polls available decoded bytes from decoder, till EOF or error - */ - virtual int decode_all(const DecoderOutCallback& callback) = 0; - - /** - * Stops calling callback, releases resources - */ - virtual void shutdown() = 0; - - /** - * Interrupts whatever decoder is doing at any time - */ - virtual void interrupt() = 0; - - /** - * Factory to create ByteStorage class instances, particular implementation is - * left to the derived class. Caller provides the initially allocated size - */ - virtual std::unique_ptr createByteStorage(size_t n) = 0; -}; - -struct SamplerParameters { - MediaType type{TYPE_AUDIO}; - FormatUnion in; - FormatUnion out; - int64_t loggingUuid{0}; -}; - -/** - * Abstract class for sampling media bytes - */ -class MediaSampler { - public: - virtual ~MediaSampler() = default; - - /** - * Initializes media sampler with parameters - */ - virtual bool init(const SamplerParameters& params) = 0; - - /** - * Samples media bytes - * Returns error code < 0, or >=0 - for success, indicating number of bytes - * processed. - * set @in to null for flushing data - */ - virtual int sample(const ByteStorage* in, ByteStorage* out) = 0; - - /** - * Releases resources - */ - virtual void shutdown() = 0; - - /* - * Returns media type - */ - MediaType getMediaType() const { - return params_.type; - } - /* - * Returns formats - */ - FormatUnion getInputFormat() const { - return params_.in; - } - FormatUnion getOutFormat() const { - return params_.out; - } - - protected: - SamplerParameters params_; -}; -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/memory_buffer.cpp b/torchvision/csrc/io/decoder/memory_buffer.cpp deleted file mode 100644 index 4e420c3b3cd..00000000000 --- a/torchvision/csrc/io/decoder/memory_buffer.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include "memory_buffer.h" -#include - -namespace ffmpeg { - -MemoryBuffer::MemoryBuffer(const uint8_t* buffer, size_t size) - : buffer_(buffer), len_(size) {} - -int MemoryBuffer::read(uint8_t* buf, int size) { - if (pos_ < len_) { - auto available = std::min(int(len_ - pos_), size); - memcpy(buf, buffer_ + pos_, available); - pos_ += available; - return available; - } - - return 0; -} - -int64_t MemoryBuffer::seek(int64_t offset, int whence) { - if (whence & AVSEEK_SIZE) { - return len_; - } - - // remove force flag - whence &= ~AVSEEK_FORCE; - - switch (whence) { - case SEEK_SET: - if (offset >= 0 && offset <= len_) { - pos_ = offset; - } - break; - case SEEK_END: - if (len_ + offset >= 0 && len_ + offset <= len_) { - pos_ = len_ + offset; - } - break; - case SEEK_CUR: - if (pos_ + offset > 0 && pos_ + offset <= len_) { - pos_ += offset; - } - break; - default: - LOG(ERROR) << "Unknown whence flag gets provided: " << whence; - } - return pos_; -} - -/* static */ -DecoderInCallback MemoryBuffer::getCallback( - const uint8_t* buffer, - size_t size) { - MemoryBuffer object(buffer, size); - return - [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable - -> int { - if (out) { // see defs.h file - // read mode - return object.read(out, size); - } - // seek mode - if (!timeoutMs) { - // seek capability, yes - supported - return 0; - } - return object.seek(size, whence); - }; -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/memory_buffer.h b/torchvision/csrc/io/decoder/memory_buffer.h deleted file mode 100644 index 909626d3cae..00000000000 --- a/torchvision/csrc/io/decoder/memory_buffer.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class uses external memory buffer and implements a seekable interface. - */ -class MemoryBuffer { - public: - explicit MemoryBuffer(const uint8_t* buffer, size_t size); - int64_t seek(int64_t offset, int whence); - int read(uint8_t* buf, int size); - - // static constructor for decoder callback. - static DecoderInCallback getCallback(const uint8_t* buffer, size_t size); - - private: - const uint8_t* buffer_; // set at construction time - long pos_{0}; // current position - long len_{0}; // bytes in buffer -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/seekable_buffer.cpp b/torchvision/csrc/io/decoder/seekable_buffer.cpp deleted file mode 100644 index 41e3e689c7b..00000000000 --- a/torchvision/csrc/io/decoder/seekable_buffer.cpp +++ /dev/null @@ -1,139 +0,0 @@ -#include "seekable_buffer.h" -#include -#include -#include "memory_buffer.h" - -namespace ffmpeg { - -int SeekableBuffer::init( - DecoderInCallback&& in, - uint64_t timeoutMs, - size_t maxSeekableBytes, - ImageType* type) { - shutdown(); - isSeekable_ = in(nullptr, 0, 0, 0) == 0; - if (isSeekable_) { // seekable - if (type) { - if (!readBytes(in, 8, timeoutMs)) { - return -1; - } - setImageType(type); - end_ = 0; - eof_ = false; - std::vector().swap(buffer_); - // reset callback - if (in(nullptr, 0, SEEK_SET, timeoutMs)) { - return -1; - } - } - inCallback_ = std::forward(in); - return 1; - } - - if (!readBytes(in, maxSeekableBytes + (type ? 8 : 0), timeoutMs)) { - return -1; - } - - if (type) { - setImageType(type); - } - - if (eof_) { - end_ = 0; - eof_ = false; - // reuse MemoryBuffer functionality - inCallback_ = MemoryBuffer::getCallback(buffer_.data(), buffer_.size()); - isSeekable_ = true; - return 1; - } - inCallback_ = std::forward(in); - return 0; -} - -bool SeekableBuffer::readBytes( - DecoderInCallback& in, - size_t maxBytes, - uint64_t timeoutMs) { - // Resize to th minimum 4K page or less - buffer_.resize(std::min(maxBytes, size_t(4 * 1024UL))); - end_ = 0; - eof_ = false; - - auto end = - std::chrono::steady_clock::now() + std::chrono::milliseconds(timeoutMs); - auto watcher = [end]() -> bool { - return std::chrono::steady_clock::now() <= end; - }; - - bool hasTime = true; - while (!eof_ && end_ < maxBytes && (hasTime = watcher())) { - // lets read all bytes into available buffer - auto res = in(buffer_.data() + end_, buffer_.size() - end_, 0, timeoutMs); - if (res > 0) { - end_ += res; - if (end_ == buffer_.size()) { - buffer_.resize(std::min(size_t(end_ * 4UL), maxBytes)); - } - } else if (res == 0) { - eof_ = true; - } else { - // error - return false; - } - } - - buffer_.resize(end_); - - return hasTime; -} - -void SeekableBuffer::setImageType(ImageType* type) { - if (buffer_.size() > 2 && buffer_[0] == 0xFF && buffer_[1] == 0xD8 && - buffer_[2] == 0xFF) { - *type = ImageType::JPEG; - } else if ( - buffer_.size() > 3 && buffer_[1] == 'P' && buffer_[2] == 'N' && - buffer_[3] == 'G') { - *type = ImageType::PNG; - } else if ( - buffer_.size() > 1 && - ((buffer_[0] == 0x49 && buffer_[1] == 0x49) || - (buffer_[0] == 0x4D && buffer_[1] == 0x4D))) { - *type = ImageType::TIFF; - } else { - *type = ImageType::UNKNOWN; - } -} - -int SeekableBuffer::read(uint8_t* buf, int size, uint64_t timeoutMs) { - if (isSeekable_) { - return inCallback_(buf, size, 0, timeoutMs); - } - if (pos_ < end_) { - // read cached bytes for non-seekable callback - auto available = std::min(int(end_ - pos_), size); - memcpy(buf, buffer_.data() + pos_, available); - pos_ += available; - return available; - } else if (!eof_) { - // normal sequential read (see defs.h file), i.e. @buf != null - auto res = inCallback_(buf, size, 0, timeoutMs); // read through - eof_ = res == 0; - return res; - } else { - return 0; - } -} - -int64_t SeekableBuffer::seek(int64_t offset, int whence, uint64_t timeoutMs) { - return inCallback_(nullptr, offset, whence, timeoutMs); -} - -void SeekableBuffer::shutdown() { - pos_ = end_ = 0; - eof_ = false; - std::vector().swap(buffer_); - inCallback_ = nullptr; -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/seekable_buffer.h b/torchvision/csrc/io/decoder/seekable_buffer.h deleted file mode 100644 index 9d5729f5306..00000000000 --- a/torchvision/csrc/io/decoder/seekable_buffer.h +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class uses internal buffer to store initial size bytes as a seekable cache - * from Media provider and let ffmpeg to seek and read bytes from cache - * and beyond - reading bytes directly from Media provider - */ -enum class ImageType { - UNKNOWN = 0, - JPEG = 1, - PNG = 2, - TIFF = 3, -}; - -class SeekableBuffer { - public: - // @type is optional, not nullptr only is image detection required - // \returns 1 is buffer seekable, 0 - if not seekable, < 0 on error - int init( - DecoderInCallback&& in, - uint64_t timeoutMs, - size_t maxSeekableBytes, - ImageType* type); - int read(uint8_t* buf, int size, uint64_t timeoutMs); - int64_t seek(int64_t offset, int whence, uint64_t timeoutMs); - void shutdown(); - - private: - bool readBytes(DecoderInCallback& in, size_t maxBytes, uint64_t timeoutMs); - void setImageType(ImageType* type); - - private: - DecoderInCallback inCallback_; - std::vector buffer_; // resized at init time - long pos_{0}; // current position (SEEK_CUR iff pos_ < end_) - long end_{0}; // current buffer size - bool eof_{0}; // indicates the EOF - bool isSeekable_{false}; // is callback seekable -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/stream.cpp b/torchvision/csrc/io/decoder/stream.cpp deleted file mode 100644 index 7969741e72c..00000000000 --- a/torchvision/csrc/io/decoder/stream.cpp +++ /dev/null @@ -1,288 +0,0 @@ -#include "stream.h" -#include -#include -#include "util.h" - -namespace ffmpeg { -const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE}; - -Stream::Stream( - AVFormatContext* inputCtx, - MediaFormat format, - bool convertPtsToWallTime, - int64_t loggingUuid) - : inputCtx_(inputCtx), - format_(format), - convertPtsToWallTime_(convertPtsToWallTime), - loggingUuid_(loggingUuid) {} - -Stream::~Stream() { - if (frame_) { - av_free(frame_); - } - if (codecCtx_) { - avcodec_free_context(&codecCtx_); - } -} - -// look up the proper CODEC querying the function -AVCodec* Stream::findCodec(AVCodecParameters* params) { - return (AVCodec*)avcodec_find_decoder(params->codec_id); -} - -// Allocate memory for the AVCodecContext, which will hold the context for -// decode/encode process. Then fill this codec context with CODEC parameters -// defined in stream parameters. Open the codec, and allocate the global frame -// defined in the header file -int Stream::openCodec(std::vector* metadata, int num_threads) { - AVStream* steam = inputCtx_->streams[format_.stream]; - - AVCodec* codec = findCodec(steam->codecpar); - if (!codec) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_find_decoder failed for codec_id: " - << int(steam->codecpar->codec_id); - return AVERROR(EINVAL); - } - - if (!(codecCtx_ = avcodec_alloc_context3(codec))) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_alloc_context3 failed"; - return AVERROR(ENOMEM); - } - // multithreading heuristics - // if user defined, - if (num_threads > max_threads) { - num_threads = max_threads; - } - - if (num_threads > 0) { - // if user defined, respect that - // note that default thread_type will be used - codecCtx_->thread_count = num_threads; - } else { - // otherwise set sensible defaults - codecCtx_->thread_count = 8; - codecCtx_->thread_type = FF_THREAD_SLICE; - } - - int ret; - // Copy codec parameters from input stream to output codec context - if ((ret = avcodec_parameters_to_context(codecCtx_, steam->codecpar)) < 0) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_parameters_to_context failed"; - return ret; - } - - // after avcodec_open2, value of codecCtx_->time_base is NOT meaningful - if ((ret = avcodec_open2(codecCtx_, codec, nullptr)) < 0) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_open2 failed: " << Util::generateErrorDesc(ret); - avcodec_free_context(&codecCtx_); - codecCtx_ = nullptr; - return ret; - } - - frame_ = av_frame_alloc(); - - switch (format_.type) { - case TYPE_VIDEO: - fps_ = av_q2d(av_guess_frame_rate(inputCtx_, steam, nullptr)); - break; - case TYPE_AUDIO: - fps_ = codecCtx_->sample_rate; - break; - default: - fps_ = 30.0; - } - - if ((ret = initFormat())) { - LOG(ERROR) << "initFormat failed, type: " << format_.type; - } - - if (metadata) { - DecoderMetadata header; - header.format = format_; - header.fps = fps_; - header.num = steam->time_base.num; - header.den = steam->time_base.den; - header.duration = - av_rescale_q(steam->duration, steam->time_base, timeBaseQ); - metadata->push_back(header); - } - - return ret; -} - -// send the raw data packet (compressed frame) to the decoder, through the codec -// context and receive the raw data frame (uncompressed frame) from the -// decoder, through the same codec context -int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) { - int consumed = 0; - int result = avcodec_send_packet(codecCtx_, packet); - if (result == AVERROR(EAGAIN)) { - *gotFrame = false; // no bytes get consumed, fetch frame - } else if (result == AVERROR_EOF) { - *gotFrame = false; // more than one flush packet - if (packet) { - // got packet after flush, this is an error - return result; - } - } else if (result < 0) { - LOG(ERROR) << "avcodec_send_packet failed, err: " - << Util::generateErrorDesc(result); - return result; // error - } else { - consumed = packet ? packet->size : 0; // all bytes get consumed - } - - result = avcodec_receive_frame(codecCtx_, frame_); - - if (result >= 0) { - *gotFrame = true; // frame is available - } else if (result == AVERROR(EAGAIN)) { - *gotFrame = false; // no frames at this time, needs more packets - if (!consumed) { - // precaution, if no packages got consumed and no frames are available - return result; - } - } else if (result == AVERROR_EOF) { - *gotFrame = false; // the last frame has been flushed - // precaution, if no more frames are available assume we consume all bytes - consumed = 0; - } else { // error - LOG(ERROR) << "avcodec_receive_frame failed, err: " - << Util::generateErrorDesc(result); - return result; - } - return consumed; -} - -// General decoding function: -// given the packet, analyse the metadata, and write the -// metadata and the buffer to the DecoderOutputImage. -int Stream::decodePacket( - const AVPacket* packet, - DecoderOutputMessage* out, - bool headerOnly, - bool* hasMsg) { - int consumed; - bool gotFrame = false; - *hasMsg = false; - if ((consumed = analyzePacket(packet, &gotFrame)) >= 0 && - (packet == nullptr || gotFrame)) { - int result; - if ((result = getMessage(out, !gotFrame, headerOnly)) < 0) { - return result; // report error - } - *hasMsg = result > 0; - } - return consumed; -} - -int Stream::flush(DecoderOutputMessage* out, bool headerOnly) { - bool hasMsg = false; - int result = decodePacket(nullptr, out, headerOnly, &hasMsg); - if (result < 0) { - avcodec_flush_buffers(codecCtx_); - return result; - } - if (!hasMsg) { - avcodec_flush_buffers(codecCtx_); - return 0; - } - return 1; -} - -// Sets the header and payload via stream::setHeader and copyFrameBytes -// functions that are defined in type stream subclass (VideoStream, AudioStream, -// ...) -int Stream::getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly) { - if (flush) { - // only flush of audio frames makes sense - if (format_.type == TYPE_AUDIO) { - int processed = 0; - size_t total = 0; - // grab all audio bytes by chunks - do { - if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) { - return processed; - } - total += processed; - } while (processed); - - if (total) { - // set header if message bytes are available - setHeader(&out->header, flush); - return 1; - } - } - return 0; - } else { - if (format_.type == TYPE_AUDIO) { - int processed = 0; - if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) { - return processed; - } - if (processed) { - // set header if message bytes are available - setHeader(&out->header, flush); - return 1; - } - return 0; - } else { - // set header - setHeader(&out->header, flush); - - if (headerOnly) { - // Only header is requisted - return 1; - } - - return copyFrameBytes(out->payload.get(), flush); - } - } -} - -void Stream::setHeader(DecoderHeader* header, bool flush) { - header->seqno = numGenerator_++; - - setFramePts(header, flush); - - if (convertPtsToWallTime_) { - keeper_.adjust(header->pts); - } - - header->format = format_; - header->keyFrame = 0; - header->fps = std::numeric_limits::quiet_NaN(); -} - -void Stream::setFramePts(DecoderHeader* header, bool flush) { - if (flush) { - header->pts = nextPts_; // already in us - } else { - header->pts = frame_->best_effort_timestamp; - if (header->pts == AV_NOPTS_VALUE) { - header->pts = nextPts_; - } else { - header->pts = av_rescale_q( - header->pts, - inputCtx_->streams[format_.stream]->time_base, - timeBaseQ); - } - - switch (format_.type) { - case TYPE_AUDIO: - nextPts_ = header->pts + frame_->nb_samples * AV_TIME_BASE / fps_; - break; - case TYPE_VIDEO: - nextPts_ = header->pts + AV_TIME_BASE / fps_; - break; - default: - nextPts_ = header->pts; - } - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/stream.h b/torchvision/csrc/io/decoder/stream.h deleted file mode 100644 index 6250dd9ecd2..00000000000 --- a/torchvision/csrc/io/decoder/stream.h +++ /dev/null @@ -1,80 +0,0 @@ -#pragma once - -#include -#include "defs.h" -#include "time_keeper.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one media stream (audio or video). - */ - -class Stream { - public: - Stream( - AVFormatContext* inputCtx, - MediaFormat format, - bool convertPtsToWallTime, - int64_t loggingUuid); - virtual ~Stream(); - - // returns 0 - on success or negative error - // num_threads sets up the codec context for multithreading if needed - // default is set to single thread in order to not break BC - int openCodec(std::vector* metadata, int num_threads = 1); - // returns 1 - if packet got consumed, 0 - if it's not, and < 0 on error - int decodePacket( - const AVPacket* packet, - DecoderOutputMessage* out, - bool headerOnly, - bool* hasMsg); - // returns stream index - int getIndex() const { - return format_.stream; - } - // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error - int flush(DecoderOutputMessage* out, bool headerOnly); - // return media format - MediaFormat getMediaFormat() const { - return format_; - } - - protected: - virtual int initFormat() = 0; - // returns number processed bytes from packet, or negative error - virtual int analyzePacket(const AVPacket* packet, bool* gotFrame); - // returns number processed bytes from packet, or negative error - virtual int copyFrameBytes(ByteStorage* out, bool flush) = 0; - // sets output format - virtual void setHeader(DecoderHeader* header, bool flush); - // set frame pts - virtual void setFramePts(DecoderHeader* header, bool flush); - // finds codec - virtual AVCodec* findCodec(AVCodecParameters* params); - - private: - // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error - int getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly); - - protected: - AVFormatContext* const inputCtx_; - MediaFormat format_; - const bool convertPtsToWallTime_; - int64_t loggingUuid_; - - AVCodecContext* codecCtx_{nullptr}; - AVFrame* frame_{nullptr}; - - std::atomic numGenerator_{0}; - TimeKeeper keeper_; - // estimated next frame pts for flushing the last frame - int64_t nextPts_{0}; - double fps_{30.}; - // this is a dumb conservative limit; ideally we'd use - // int max_threads = at::get_num_threads(); but this would cause - // fb sync to fail as it would add dependency to ATen to the decoder API - const int max_threads = 12; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_sampler.cpp b/torchvision/csrc/io/decoder/subtitle_sampler.cpp deleted file mode 100644 index d0df24d3e35..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_sampler.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "subtitle_sampler.h" -#include -#include "util.h" - -namespace ffmpeg { - -SubtitleSampler::~SubtitleSampler() { - cleanUp(); -} - -void SubtitleSampler::shutdown() { - cleanUp(); -} - -bool SubtitleSampler::init(const SamplerParameters& params) { - cleanUp(); - // set formats - params_ = params; - return true; -} - -int SubtitleSampler::sample(AVSubtitle* sub, ByteStorage* out) { - if (!sub || !out) { - return 0; // flush - } - - out->ensure(Util::size(*sub)); - - return Util::serialize(*sub, out); -} - -int SubtitleSampler::sample(const ByteStorage* in, ByteStorage* out) { - if (in && out) { - // Get a writable copy - if (size_t len = in->length()) { - out->ensure(len); - memcpy(out->writableTail(), in->data(), len); - } - return out->length(); - } - return 0; -} - -void SubtitleSampler::cleanUp() {} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_sampler.h b/torchvision/csrc/io/decoder/subtitle_sampler.h deleted file mode 100644 index 4aee811ed56..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_sampler.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class transcode audio frames from one format into another - */ - -class SubtitleSampler : public MediaSampler { - public: - SubtitleSampler() = default; - ~SubtitleSampler() override; - - bool init(const SamplerParameters& params) override; - int sample(const ByteStorage* in, ByteStorage* out) override; - void shutdown() override; - - // returns number processed/scaling bytes - int sample(AVSubtitle* sub, ByteStorage* out); - - // helper serialization/deserialization methods - static void serialize(const AVSubtitle& sub, ByteStorage* out); - static bool deserialize(const ByteStorage& buf, AVSubtitle* sub); - - private: - // close resources - void cleanUp(); -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_stream.cpp b/torchvision/csrc/io/decoder/subtitle_stream.cpp deleted file mode 100644 index 3416f702d7e..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_stream.cpp +++ /dev/null @@ -1,96 +0,0 @@ -#include "subtitle_stream.h" -#include -#include "util.h" - -namespace ffmpeg { -const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE}; - -SubtitleStream::SubtitleStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format) - : Stream( - inputCtx, - MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime, - 0) { - memset(&sub_, 0, sizeof(sub_)); -} - -void SubtitleStream::releaseSubtitle() { - if (sub_.release) { - avsubtitle_free(&sub_); - memset(&sub_, 0, sizeof(sub_)); - } -} - -SubtitleStream::~SubtitleStream() { - releaseSubtitle(); - sampler_.shutdown(); -} - -int SubtitleStream::initFormat() { - if (!codecCtx_->subtitle_header) { - LOG(ERROR) << "No subtitle header found"; - } else { - VLOG(1) << "Subtitle header found!"; - } - return 0; -} - -int SubtitleStream::analyzePacket(const AVPacket* packet, bool* gotFrame) { - // clean-up - releaseSubtitle(); - - // FIXME: should this even be created? - AVPacket* avPacket; - avPacket = av_packet_alloc(); - if (avPacket == nullptr) { - LOG(ERROR) - << "decoder as not able to allocate the subtitle-specific packet."; - // alternative to ENOMEM - return AVERROR_BUFFER_TOO_SMALL; - } - avPacket->data = nullptr; - avPacket->size = 0; - // check flush packet - auto pkt = packet ? packet : avPacket; - - int gotFramePtr = 0; - // is these a better way than cast from const? - int result = - avcodec_decode_subtitle2(codecCtx_, &sub_, &gotFramePtr, (AVPacket*)pkt); - - if (result < 0) { - LOG(ERROR) << "avcodec_decode_subtitle2 failed, err: " - << Util::generateErrorDesc(result); - // free the packet we've created - av_packet_free(&avPacket); - return result; - } else if (result == 0) { - result = pkt->size; // discard the rest of the package - } - - sub_.release = gotFramePtr; - *gotFrame = gotFramePtr > 0; - - // set proper pts in us - if (gotFramePtr) { - sub_.pts = av_rescale_q( - pkt->pts, inputCtx_->streams[format_.stream]->time_base, timeBaseQ); - } - - av_packet_free(&avPacket); - return result; -} - -int SubtitleStream::copyFrameBytes(ByteStorage* out, bool flush) { - return sampler_.sample(flush ? nullptr : &sub_, out); -} - -void SubtitleStream::setFramePts(DecoderHeader* header, bool) { - header->pts = sub_.pts; // already in us -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_stream.h b/torchvision/csrc/io/decoder/subtitle_stream.h deleted file mode 100644 index 6c366e11f50..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_stream.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include "stream.h" -#include "subtitle_sampler.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one subtitle stream. - */ -struct AVSubtitleKeeper : AVSubtitle { - int64_t release{0}; -}; - -class SubtitleStream : public Stream { - public: - SubtitleStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format); - ~SubtitleStream() override; - - protected: - void setFramePts(DecoderHeader* header, bool flush) override; - - private: - int initFormat() override; - int analyzePacket(const AVPacket* packet, bool* gotFrame) override; - int copyFrameBytes(ByteStorage* out, bool flush) override; - void releaseSubtitle(); - - private: - SubtitleSampler sampler_; - AVSubtitleKeeper sub_; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/sync_decoder.cpp b/torchvision/csrc/io/decoder/sync_decoder.cpp deleted file mode 100644 index 1f03ef8eb95..00000000000 --- a/torchvision/csrc/io/decoder/sync_decoder.cpp +++ /dev/null @@ -1,97 +0,0 @@ -#include "sync_decoder.h" -#include - -namespace ffmpeg { - -SyncDecoder::AVByteStorage::AVByteStorage(size_t n) { - ensure(n); -} - -SyncDecoder::AVByteStorage::~AVByteStorage() { - av_free(buffer_); -} - -void SyncDecoder::AVByteStorage::ensure(size_t n) { - if (tail() < n) { - capacity_ = offset_ + length_ + n; - buffer_ = static_cast(av_realloc(buffer_, capacity_)); - } -} - -uint8_t* SyncDecoder::AVByteStorage::writableTail() { - TORCH_CHECK_LE(offset_ + length_, capacity_); - return buffer_ + offset_ + length_; -} - -void SyncDecoder::AVByteStorage::append(size_t n) { - TORCH_CHECK_LE(n, tail()); - length_ += n; -} - -void SyncDecoder::AVByteStorage::trim(size_t n) { - TORCH_CHECK_LE(n, length_); - offset_ += n; - length_ -= n; -} - -const uint8_t* SyncDecoder::AVByteStorage::data() const { - return buffer_ + offset_; -} - -size_t SyncDecoder::AVByteStorage::length() const { - return length_; -} - -size_t SyncDecoder::AVByteStorage::tail() const { - TORCH_CHECK_LE(offset_ + length_, capacity_); - return capacity_ - offset_ - length_; -} - -void SyncDecoder::AVByteStorage::clear() { - offset_ = 0; - length_ = 0; -} - -std::unique_ptr SyncDecoder::createByteStorage(size_t n) { - return std::make_unique(n); -} - -void SyncDecoder::onInit() { - eof_ = false; - queue_.clear(); -} - -int SyncDecoder::decode(DecoderOutputMessage* out, uint64_t timeoutMs) { - if (eof_ && queue_.empty()) { - return ENODATA; - } - - if (queue_.empty()) { - int result = getFrame(timeoutMs); - // assign EOF - eof_ = result == ENODATA; - // check unrecoverable error, any error but ENODATA - if (result && result != ENODATA) { - return result; - } - - // still empty - if (queue_.empty()) { - if (eof_) { - return ENODATA; - } else { - LOG(INFO) << "Queue is empty"; - return ETIMEDOUT; - } - } - } - - *out = std::move(queue_.front()); - queue_.pop_front(); - return 0; -} - -void SyncDecoder::push(DecoderOutputMessage&& buffer) { - queue_.push_back(std::move(buffer)); -} -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/sync_decoder.h b/torchvision/csrc/io/decoder/sync_decoder.h deleted file mode 100644 index b7cf7b625ac..00000000000 --- a/torchvision/csrc/io/decoder/sync_decoder.h +++ /dev/null @@ -1,48 +0,0 @@ -#pragma once - -#include -#include "decoder.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode media streams. - * Media bytes can be explicitly provided through read-callback - * or fetched internally by FFMPEG library - */ -class SyncDecoder : public Decoder { - public: - // Allocation of memory must be done with a proper alignment. - class AVByteStorage : public ByteStorage { - public: - explicit AVByteStorage(size_t n); - ~AVByteStorage() override; - void ensure(size_t n) override; - uint8_t* writableTail() override; - void append(size_t n) override; - void trim(size_t n) override; - const uint8_t* data() const override; - size_t length() const override; - size_t tail() const override; - void clear() override; - - private: - size_t offset_{0}; - size_t length_{0}; - size_t capacity_{0}; - uint8_t* buffer_{nullptr}; - }; - - public: - int decode(DecoderOutputMessage* out, uint64_t timeoutMs) override; - - private: - void push(DecoderOutputMessage&& buffer) override; - void onInit() override; - std::unique_ptr createByteStorage(size_t n) override; - - private: - std::list queue_; - bool eof_{false}; -}; -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/sync_decoder_test.cpp b/torchvision/csrc/io/decoder/sync_decoder_test.cpp deleted file mode 100644 index 085966ce687..00000000000 --- a/torchvision/csrc/io/decoder/sync_decoder_test.cpp +++ /dev/null @@ -1,416 +0,0 @@ -#include -#include -#include -#include "memory_buffer.h" -#include "sync_decoder.h" -#include "util.h" - -using namespace ffmpeg; - -namespace { -struct VideoFileStats { - std::string name; - size_t durationPts{0}; - int num{0}; - int den{0}; - int fps{0}; -}; - -void gotAllTestFiles( - const std::string& folder, - std::vector* stats) { - DIR* d = opendir(folder.c_str()); - CHECK(d); - struct dirent* dir; - while ((dir = readdir(d))) { - if (dir->d_type != DT_DIR && 0 != strcmp(dir->d_name, "README")) { - VideoFileStats item; - item.name = folder + '/' + dir->d_name; - LOG(INFO) << "Found video file: " << item.name; - stats->push_back(std::move(item)); - } - } - closedir(d); -} - -void gotFilesStats(std::vector& stats) { - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.formats = {MediaFormat(0)}; - params.headerOnly = true; - params.preventStaleness = false; - size_t avgProvUs = 0; - const size_t rounds = 100; - for (auto& item : stats) { - LOG(INFO) << "Decoding video file in memory: " << item.name; - FILE* f = fopen(item.name.c_str(), "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - for (size_t i = 0; i < rounds; ++i) { - SyncDecoder decoder; - std::vector metadata; - const auto now = std::chrono::steady_clock::now(); - CHECK(decoder.init( - params, - MemoryBuffer::getCallback(buffer.data(), buffer.size()), - &metadata)); - const auto then = std::chrono::steady_clock::now(); - decoder.shutdown(); - avgProvUs += - std::chrono::duration_cast(then - now) - .count(); - TORCH_CHECK_EQ(metadata.size(), 1); - item.num = metadata[0].num; - item.den = metadata[0].den; - item.fps = metadata[0].fps; - item.durationPts = - av_rescale_q(metadata[0].duration, AV_TIME_BASE_Q, {1, item.fps}); - } - } - LOG(INFO) << "Probing (us) " << avgProvUs / stats.size() / rounds; -} - -size_t measurePerformanceUs( - const std::vector& stats, - size_t rounds, - size_t num, - size_t stride) { - size_t avgClipDecodingUs = 0; - std::srand(time(nullptr)); - for (const auto& item : stats) { - FILE* f = fopen(item.name.c_str(), "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - for (size_t i = 0; i < rounds; ++i) { - // randomy select clip - size_t rOffset = std::rand(); - size_t fOffset = rOffset % item.durationPts; - size_t clipFrames = num + (num - 1) * stride; - if (fOffset + clipFrames > item.durationPts) { - fOffset = item.durationPts - clipFrames; - } - - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.preventStaleness = false; - - for (size_t n = 0; n < num; ++n) { - std::list msgs; - - params.startOffset = - av_rescale_q(fOffset, {1, item.fps}, AV_TIME_BASE_Q); - params.endOffset = params.startOffset + 100; - - auto now = std::chrono::steady_clock::now(); - SyncDecoder decoder; - CHECK(decoder.init( - params, - MemoryBuffer::getCallback(buffer.data(), buffer.size()), - nullptr)); - DecoderOutputMessage out; - while (0 == decoder.decode(&out, params.timeoutMs)) { - msgs.push_back(std::move(out)); - } - - decoder.shutdown(); - - const auto then = std::chrono::steady_clock::now(); - - fOffset += 1 + stride; - - avgClipDecodingUs += - std::chrono::duration_cast(then - now) - .count(); - } - } - } - - return avgClipDecodingUs / rounds / num / stats.size(); -} - -void runDecoder(SyncDecoder& decoder) { - DecoderOutputMessage out; - size_t audioFrames = 0, videoFrames = 0, totalBytes = 0; - while (0 == decoder.decode(&out, 10000)) { - if (out.header.format.type == TYPE_AUDIO) { - ++audioFrames; - } else if (out.header.format.type == TYPE_VIDEO) { - ++videoFrames; - } else if (out.header.format.type == TYPE_SUBTITLE && out.payload) { - // deserialize - LOG(INFO) << "Deserializing subtitle"; - AVSubtitle sub; - memset(&sub, 0, sizeof(sub)); - EXPECT_TRUE(Util::deserialize(*out.payload, &sub)); - LOG(INFO) << "Found subtitles" << ", num rects: " << sub.num_rects; - for (int i = 0; i < sub.num_rects; ++i) { - std::string text = "picture"; - if (sub.rects[i]->type == SUBTITLE_TEXT) { - text = sub.rects[i]->text; - } else if (sub.rects[i]->type == SUBTITLE_ASS) { - text = sub.rects[i]->ass; - } - - LOG(INFO) << "Rect num: " << i << ", type:" << sub.rects[i]->type - << ", text: " << text; - } - - avsubtitle_free(&sub); - } - if (out.payload) { - totalBytes += out.payload->length(); - } - } - LOG(INFO) << "Decoded audio frames: " << audioFrames - << ", video frames: " << videoFrames - << ", total bytes: " << totalBytes; -} -} // namespace - -TEST(SyncDecoder, TestSyncDecoderPerformance) { - // Measure the average time of decoding per clip - // 1. list of the videos in testing directory - // 2. for each video got number of frames with timestamps - // 3. randomly select frame offset - // 4. adjust offset for number frames and strides, - // if it's out out upper boundary - // 5. repeat multiple times, measuring and accumulating decoding time - // per clip. - /* - 1) 4 x 2 - 2) 8 x 8 - 3) 16 x 8 - 4) 32 x 4 - */ - const std::string kFolder = "pytorch/vision/test/assets/videos"; - std::vector stats; - gotAllTestFiles(kFolder, &stats); - gotFilesStats(stats); - - const size_t kRounds = 10; - - auto new4x2 = measurePerformanceUs(stats, kRounds, 4, 2); - auto new8x8 = measurePerformanceUs(stats, kRounds, 8, 8); - auto new16x8 = measurePerformanceUs(stats, kRounds, 16, 8); - auto new32x4 = measurePerformanceUs(stats, kRounds, 32, 4); - LOG(INFO) << "Clip decoding (us)" << ", new(4x2): " << new4x2 - << ", new(8x8): " << new8x8 << ", new(16x8): " << new16x8 - << ", new(32x4): " << new32x4; -} - -TEST(SyncDecoder, Test) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestSubtitles) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - params.uri = "vue/synergy/data/robotsub.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestHeadersOnly) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.headerOnly = true; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/SOX5yA1l24A.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/WUzgd7C1pWA.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestHeadersOnlyDownSampling) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.headerOnly = true; - MediaFormat format; - format.type = TYPE_AUDIO; - format.format.audio.samples = 8000; - params.formats.insert(format); - - format.type = TYPE_VIDEO; - format.format.video.width = 224; - format.format.video.height = 224; - params.formats.insert(format); - - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/SOX5yA1l24A.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/WUzgd7C1pWA.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestInitOnlyNoShutdown) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.headerOnly = false; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - std::vector metadata; - CHECK(decoder.init(params, nullptr, &metadata)); -} - -TEST(SyncDecoder, TestMemoryBuffer) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.endOffset = 9000000; - params.seekAccuracy = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - FILE* f = fopen( - "pytorch/vision/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi", - "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - CHECK(decoder.init( - params, - MemoryBuffer::getCallback(buffer.data(), buffer.size()), - nullptr)); - LOG(INFO) << "Decoding from memory bytes: " << buffer.size(); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.endOffset = 9000000; - params.seekAccuracy = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - params.maxSeekableBytes = buffer.size() + 1; - MemoryBuffer object(buffer.data(), buffer.size()); - CHECK(decoder.init( - params, - [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable - -> int { - if (out) { // see defs.h file - // read mode - return object.read(out, size); - } - // seek mode - if (!timeoutMs) { - // seek capability, yes - no - return -1; - } - return object.seek(size, whence); - }, - nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.endOffset = 9000000; - params.seekAccuracy = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - params.maxSeekableBytes = buffer.size() / 2; - MemoryBuffer object(buffer.data(), buffer.size()); - CHECK(!decoder.init( - params, - [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable - -> int { - if (out) { // see defs.h file - // read mode - return object.read(out, size); - } - // seek mode - if (!timeoutMs) { - // seek capability, yes - no - return -1; - } - return object.seek(size, whence); - }, - nullptr)); -} diff --git a/torchvision/csrc/io/decoder/time_keeper.cpp b/torchvision/csrc/io/decoder/time_keeper.cpp deleted file mode 100644 index 845c76cddc8..00000000000 --- a/torchvision/csrc/io/decoder/time_keeper.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include "time_keeper.h" -#include "defs.h" - -namespace ffmpeg { - -namespace { -const long kMaxTimeBaseDiference = 10; -} - -long TimeKeeper::adjust(long& decoderTimestamp) { - const long now = std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count(); - - if (startTime_ == 0) { - startTime_ = now; - } - if (streamTimestamp_ == 0) { - streamTimestamp_ = decoderTimestamp; - } - - const auto runOut = startTime_ + decoderTimestamp - streamTimestamp_; - - if (std::labs((now - runOut) / AV_TIME_BASE) > kMaxTimeBaseDiference) { - streamTimestamp_ = startTime_ - now + decoderTimestamp; - } - - const auto sleepAdvised = runOut - now; - - decoderTimestamp += startTime_ - streamTimestamp_; - - return sleepAdvised > 0 ? sleepAdvised : 0; -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/time_keeper.h b/torchvision/csrc/io/decoder/time_keeper.h deleted file mode 100644 index e4d4718c705..00000000000 --- a/torchvision/csrc/io/decoder/time_keeper.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include -#include - -namespace ffmpeg { - -/** - * Class keeps the track of the decoded timestamps (us) for media streams. - */ - -class TimeKeeper { - public: - TimeKeeper() = default; - - // adjust provided @timestamp to the corrected value - // return advised sleep time before next frame processing in (us) - long adjust(long& decoderTimestamp); - - private: - long startTime_{0}; - long streamTimestamp_{0}; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/util.cpp b/torchvision/csrc/io/decoder/util.cpp deleted file mode 100644 index 7198d2174ed..00000000000 --- a/torchvision/csrc/io/decoder/util.cpp +++ /dev/null @@ -1,401 +0,0 @@ -#include "util.h" -#include - -namespace ffmpeg { - -namespace Serializer { - -// fixed size types -template -inline size_t getSize(const T& x) { - return sizeof(x); -} - -template -inline bool serializeItem( - uint8_t* dest, - size_t len, - size_t& pos, - const T& src) { - VLOG(6) << "Generic serializeItem"; - const auto required = sizeof(src); - if (len < pos + required) { - return false; - } - memcpy(dest + pos, &src, required); - pos += required; - return true; -} - -template -inline bool deserializeItem( - const uint8_t* src, - size_t len, - size_t& pos, - T& dest) { - const auto required = sizeof(dest); - if (len < pos + required) { - return false; - } - memcpy(&dest, src + pos, required); - pos += required; - return true; -} - -// AVSubtitleRect specialization -inline size_t getSize(const AVSubtitleRect& x) { - auto rectBytes = [](const AVSubtitleRect& y) -> size_t { - size_t s = 0; - switch (y.type) { - case SUBTITLE_BITMAP: - for (int i = 0; i < y.nb_colors; ++i) { - s += sizeof(y.linesize[i]); - s += y.linesize[i]; - } - break; - case SUBTITLE_TEXT: - s += sizeof(size_t); - s += strlen(y.text); - break; - case SUBTITLE_ASS: - s += sizeof(size_t); - s += strlen(y.ass); - break; - default: - break; - } - return s; - }; - return getSize(x.x) + getSize(x.y) + getSize(x.w) + getSize(x.h) + - getSize(x.nb_colors) + getSize(x.type) + getSize(x.flags) + rectBytes(x); -} - -// AVSubtitle specialization -inline size_t getSize(const AVSubtitle& x) { - auto rectBytes = [](const AVSubtitle& y) -> size_t { - size_t s = getSize(y.num_rects); - for (unsigned i = 0; i < y.num_rects; ++i) { - s += getSize(*y.rects[i]); - } - return s; - }; - return getSize(x.format) + getSize(x.start_display_time) + - getSize(x.end_display_time) + getSize(x.pts) + rectBytes(x); -} - -inline bool serializeItem( - uint8_t* dest, - size_t len, - size_t& pos, - const AVSubtitleRect& src) { - auto rectSerialize = - [](uint8_t* d, size_t l, size_t& p, const AVSubtitleRect& x) -> size_t { - switch (x.type) { - case SUBTITLE_BITMAP: - for (int i = 0; i < x.nb_colors; ++i) { - if (!serializeItem(d, l, p, x.linesize[i])) { - return false; - } - if (p + x.linesize[i] > l) { - return false; - } - memcpy(d + p, x.data[i], x.linesize[i]); - p += x.linesize[i]; - } - return true; - case SUBTITLE_TEXT: { - const size_t s = strlen(x.text); - if (!serializeItem(d, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - memcpy(d + p, x.text, s); - p += s; - return true; - } - case SUBTITLE_ASS: { - const size_t s = strlen(x.ass); - if (!serializeItem(d, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - memcpy(d + p, x.ass, s); - p += s; - return true; - } - default: - return true; - } - }; - return serializeItem(dest, len, pos, src.x) && - serializeItem(dest, len, pos, src.y) && - serializeItem(dest, len, pos, src.w) && - serializeItem(dest, len, pos, src.h) && - serializeItem(dest, len, pos, src.nb_colors) && - serializeItem(dest, len, pos, src.type) && - serializeItem(dest, len, pos, src.flags) && - rectSerialize(dest, len, pos, src); -} - -inline bool serializeItem( - uint8_t* dest, - size_t len, - size_t& pos, - const AVSubtitle& src) { - auto rectSerialize = - [](uint8_t* d, size_t l, size_t& p, const AVSubtitle& x) -> bool { - bool res = serializeItem(d, l, p, x.num_rects); - for (unsigned i = 0; res && i < x.num_rects; ++i) { - res = serializeItem(d, l, p, *(x.rects[i])); - } - return res; - }; - VLOG(6) << "AVSubtitle serializeItem"; - return serializeItem(dest, len, pos, src.format) && - serializeItem(dest, len, pos, src.start_display_time) && - serializeItem(dest, len, pos, src.end_display_time) && - serializeItem(dest, len, pos, src.pts) && - rectSerialize(dest, len, pos, src); -} - -inline bool deserializeItem( - const uint8_t* src, - size_t len, - size_t& pos, - AVSubtitleRect& dest) { - auto rectDeserialize = - [](const uint8_t* y, size_t l, size_t& p, AVSubtitleRect& x) -> bool { - switch (x.type) { - case SUBTITLE_BITMAP: - for (int i = 0; i < x.nb_colors; ++i) { - if (!deserializeItem(y, l, p, x.linesize[i])) { - return false; - } - if (p + x.linesize[i] > l) { - return false; - } - x.data[i] = (uint8_t*)av_malloc(x.linesize[i]); - memcpy(x.data[i], y + p, x.linesize[i]); - p += x.linesize[i]; - } - return true; - case SUBTITLE_TEXT: { - size_t s = 0; - if (!deserializeItem(y, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - x.text = (char*)av_malloc(s + 1); - memcpy(x.text, y + p, s); - x.text[s] = 0; - p += s; - return true; - } - case SUBTITLE_ASS: { - size_t s = 0; - if (!deserializeItem(y, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - x.ass = (char*)av_malloc(s + 1); - memcpy(x.ass, y + p, s); - x.ass[s] = 0; - p += s; - return true; - } - default: - return true; - } - }; - - return deserializeItem(src, len, pos, dest.x) && - deserializeItem(src, len, pos, dest.y) && - deserializeItem(src, len, pos, dest.w) && - deserializeItem(src, len, pos, dest.h) && - deserializeItem(src, len, pos, dest.nb_colors) && - deserializeItem(src, len, pos, dest.type) && - deserializeItem(src, len, pos, dest.flags) && - rectDeserialize(src, len, pos, dest); -} - -inline bool deserializeItem( - const uint8_t* src, - size_t len, - size_t& pos, - AVSubtitle& dest) { - auto rectDeserialize = - [](const uint8_t* y, size_t l, size_t& p, AVSubtitle& x) -> bool { - bool res = deserializeItem(y, l, p, x.num_rects); - if (res && x.num_rects) { - x.rects = - (AVSubtitleRect**)av_malloc(x.num_rects * sizeof(AVSubtitleRect*)); - } - for (unsigned i = 0; res && i < x.num_rects; ++i) { - x.rects[i] = (AVSubtitleRect*)av_malloc(sizeof(AVSubtitleRect)); - memset(x.rects[i], 0, sizeof(AVSubtitleRect)); - res = deserializeItem(y, l, p, *x.rects[i]); - } - return res; - }; - return deserializeItem(src, len, pos, dest.format) && - deserializeItem(src, len, pos, dest.start_display_time) && - deserializeItem(src, len, pos, dest.end_display_time) && - deserializeItem(src, len, pos, dest.pts) && - rectDeserialize(src, len, pos, dest); -} -} // namespace Serializer - -namespace Util { -std::string generateErrorDesc(int errorCode) { - std::array buffer; - if (av_strerror(errorCode, buffer.data(), buffer.size()) < 0) { - return std::string("Unknown error code: ") + std::to_string(errorCode); - } - buffer.back() = 0; - return std::string(buffer.data()); -} - -size_t serialize(const AVSubtitle& sub, ByteStorage* out) { - const auto len = size(sub); - size_t pos = 0; - if (!Serializer::serializeItem(out->writableTail(), len, pos, sub)) { - return 0; - } - out->append(len); - return len; -} - -bool deserialize(const ByteStorage& buf, AVSubtitle* sub) { - size_t pos = 0; - return Serializer::deserializeItem(buf.data(), buf.length(), pos, *sub); -} - -size_t size(const AVSubtitle& sub) { - return Serializer::getSize(sub); -} - -bool validateVideoFormat(const VideoFormat& f) { - // clang-format off - /* - Valid parameters values for decoder - ____________________________________________________________________________________ - | W | H | minDimension | maxDimension | cropImage | algorithm | - |__________________________________________________________________________________| - | 0 | 0 | 0 | 0 | N/A | original | - |__________________________________________________________________________________| - | >0 | 0 | N/A | N/A | N/A | scale keeping W | - |__________________________________________________________________________________| - | 0 | >0 | N/A | N/A | N/A | scale keeping H | - |__________________________________________________________________________________| - | >0 | >0 | N/A | N/A | 0 | stretch/scale | - |__________________________________________________________________________________| - | >0 | >0 | N/A | N/A | >0 | scale/crop | - |__________________________________________________________________________________| - | 0 | 0 | >0 | 0 | N/A |scale to min dimension | - |__________________________________________________________________________________| - | 0 | 0 | 0 | >0 | N/A |scale to max dimension | - |__________________________________________________________________________________| - | 0 | 0 | >0 | >0 | N/A |stretch to min/max dimension| - |_____|_____|______________|______________|___________|____________________________| - - */ - // clang-format on - return (f.width == 0 && // #1, #6, #7 and #8 - f.height == 0 && f.cropImage == 0) || - (f.width != 0 && // #4 and #5 - f.height != 0 && f.minDimension == 0 && f.maxDimension == 0) || - (((f.width != 0 && // #2 - f.height == 0) || - (f.width == 0 && // #3 - f.height != 0)) && - f.minDimension == 0 && f.maxDimension == 0 && f.cropImage == 0); -} - -void setFormatDimensions( - size_t& destW, - size_t& destH, - size_t userW, - size_t userH, - size_t srcW, - size_t srcH, - size_t minDimension, - size_t maxDimension, - size_t cropImage) { - // rounding rules - // int -> double -> round up - // if fraction is >= 0.5 or round down if fraction is < 0.5 - // int result = double(value) + 0.5 - // here we rounding double to int according to the above rule - - // #1, #6, #7 and #8 - if (userW == 0 && userH == 0) { - if (minDimension > 0 && maxDimension == 0) { // #6 - if (srcW > srcH) { - // landscape - destH = minDimension; - destW = round(double(srcW * minDimension) / srcH); - } else { - // portrait - destW = minDimension; - destH = round(double(srcH * minDimension) / srcW); - } - } else if (minDimension == 0 && maxDimension > 0) { // #7 - if (srcW > srcH) { - // landscape - destW = maxDimension; - destH = round(double(srcH * maxDimension) / srcW); - } else { - // portrait - destH = maxDimension; - destW = round(double(srcW * maxDimension) / srcH); - } - } else if (minDimension > 0 && maxDimension > 0) { // #8 - if (srcW > srcH) { - // landscape - destW = maxDimension; - destH = minDimension; - } else { - // portrait - destW = minDimension; - destH = maxDimension; - } - } else { // #1 - destW = srcW; - destH = srcH; - } - } else if (userW != 0 && userH == 0) { // #2 - destW = userW; - destH = round(double(srcH * userW) / srcW); - } else if (userW == 0 && userH != 0) { // #3 - destW = round(double(srcW * userH) / srcH); - destH = userH; - } else { // userW != 0 && userH != 0 - if (cropImage == 0) { // #4 - destW = userW; - destH = userH; - } else { // #5 - double userSlope = double(userH) / userW; - double srcSlope = double(srcH) / srcW; - if (srcSlope < userSlope) { - destW = round(double(srcW * userH) / srcH); - destH = userH; - } else { - destW = userW; - destH = round(double(srcH * userW) / srcW); - } - } - } - // prevent zeros - destW = std::max(destW, size_t(1UL)); - destH = std::max(destH, size_t(1UL)); -} -} // namespace Util -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/util.h b/torchvision/csrc/io/decoder/util.h deleted file mode 100644 index 01b550e5bbc..00000000000 --- a/torchvision/csrc/io/decoder/util.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * FFMPEG library utility functions. - */ - -namespace Util { -std::string generateErrorDesc(int errorCode); -size_t serialize(const AVSubtitle& sub, ByteStorage* out); -bool deserialize(const ByteStorage& buf, AVSubtitle* sub); -size_t size(const AVSubtitle& sub); -void setFormatDimensions( - size_t& destW, - size_t& destH, - size_t userW, - size_t userH, - size_t srcW, - size_t srcH, - size_t minDimension, - size_t maxDimension, - size_t cropImage); -bool validateVideoFormat(const VideoFormat& format); -} // namespace Util -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/util_test.cpp b/torchvision/csrc/io/decoder/util_test.cpp deleted file mode 100644 index 0a093d9561b..00000000000 --- a/torchvision/csrc/io/decoder/util_test.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include -#include -#include "util.h" - -TEST(Util, TestSetFormatDimensions) { - // clang-format off - const size_t test_cases[][9] = { - // (userW, userH, srcW, srcH, minDimension, maxDimension, cropImage, destW, destH) - {0, 0, 172, 128, 0, 0, 0, 172, 128}, // #1 - {86, 0, 172, 128, 0, 0, 0, 86, 64}, // #2 - {64, 0, 128, 172, 0, 0, 0, 64, 86}, // #2 - {0, 32, 172, 128, 0, 0, 0, 43, 32}, // #3 - {32, 0, 128, 172, 0, 0, 0, 32, 43}, // #3 - {60, 50, 172, 128, 0, 0, 0, 60, 50}, // #4 - {50, 60, 128, 172, 0, 0, 0, 50, 60}, // #4 - {86, 40, 172, 128, 0, 0, 1, 86, 64}, // #5 - {86, 92, 172, 128, 0, 0, 1, 124, 92}, // #5 - {0, 0, 172, 128, 256, 0, 0, 344, 256}, // #6 - {0, 0, 128, 172, 256, 0, 0, 256, 344}, // #6 - {0, 0, 128, 172, 0, 344, 0, 256, 344}, // #7 - {0, 0, 172, 128, 0, 344, 0, 344, 256}, // #7 - {0, 0, 172, 128, 100, 344, 0, 344, 100},// #8 - {0, 0, 128, 172, 100, 344, 0, 100, 344} // #8 - }; - // clang-format onn - - for (const auto& tc : test_cases) { - size_t destW = 0; - size_t destH = 0; - ffmpeg::Util::setFormatDimensions(destW, destH, tc[0], tc[1], tc[2], tc[3], tc[4], tc[5], tc[6]); - CHECK(destW == tc[7]); - CHECK(destH == tc[8]); - } -} diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp deleted file mode 100644 index 8b712609e34..00000000000 --- a/torchvision/csrc/io/decoder/video_sampler.cpp +++ /dev/null @@ -1,337 +0,0 @@ -#include "video_sampler.h" -#include -#include "util.h" - -// www.ffmpeg.org/doxygen/0.5/swscale-example_8c-source.html - -namespace ffmpeg { - -namespace { - -// Setup the data pointers and linesizes based on the specified image -// parameters and the provided array. This sets up "planes" to point to a -// "buffer" -// NOTE: this is most likely culprit behind #3534 -// -// Args: -// fmt: desired output video format -// buffer: source constant image buffer (in different format) that will contain -// the final image after SWScale planes: destination data pointer to be filled -// lineSize: target destination linesize (always {0}) -int preparePlanes( - const VideoFormat& fmt, - const uint8_t* buffer, - uint8_t** planes, - int* lineSize) { - int result; - - // NOTE: 1 at the end of av_fill_arrays is the value used for alignment - if ((result = av_image_fill_arrays( - planes, - lineSize, - buffer, - (AVPixelFormat)fmt.format, - fmt.width, - fmt.height, - 1)) < 0) { - LOG(ERROR) << "av_image_fill_arrays failed, err: " - << Util::generateErrorDesc(result); - } - return result; -} - -// Scale (and crop) the image slice in srcSlice and put the resulting scaled -// slice to `planes` buffer, which is mapped to be `out` via preparePlanes as -// `sws_scale` cannot access buffers directly. -// -// Args: -// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if -// scale) srcSlice: frame data in YUV420P srcStride: the array containing the -// strides for each plane of the source -// image (from AVFrame->linesize[0]) -// out: destination buffer -// planes: indirect destination buffer (mapped to "out" via preparePlanes) -// lines: destination linesize; constant {0} -int transformImage( - SwsContext* context, - const uint8_t* const srcSlice[], - int srcStride[], - VideoFormat inFormat, - VideoFormat outFormat, - uint8_t* out, - uint8_t* planes[], - int lines[]) { - int result; - if ((result = preparePlanes(outFormat, out, planes, lines)) < 0) { - return result; - } - if (context) { - // NOTE: srcY stride always 0: this is a parameter of YUV format - if ((result = sws_scale( - context, srcSlice, srcStride, 0, inFormat.height, planes, lines)) < - 0) { - LOG(ERROR) << "sws_scale failed, err: " - << Util::generateErrorDesc(result); - return result; - } - } else if ( - inFormat.width == outFormat.width && - inFormat.height == outFormat.height && - inFormat.format == outFormat.format) { - // Copy planes without using sws_scale if sws_getContext failed. - av_image_copy( - planes, - lines, - (const uint8_t**)srcSlice, - srcStride, - (AVPixelFormat)inFormat.format, - inFormat.width, - inFormat.height); - } else { - LOG(ERROR) << "Invalid scale context format " << inFormat.format; - return AVERROR(EINVAL); - } - return 0; -} -} // namespace - -VideoSampler::VideoSampler(int swsFlags, int64_t loggingUuid) - : swsFlags_(swsFlags), loggingUuid_(loggingUuid) {} - -VideoSampler::~VideoSampler() { - cleanUp(); -} - -void VideoSampler::shutdown() { - cleanUp(); -} - -bool VideoSampler::init(const SamplerParameters& params) { - cleanUp(); - - if (params.out.video.cropImage != 0) { - if (!Util::validateVideoFormat(params.out.video)) { - LOG(ERROR) << "Invalid video format" - << ", width: " << params.out.video.width - << ", height: " << params.out.video.height - << ", format: " << params.out.video.format - << ", minDimension: " << params.out.video.minDimension - << ", crop: " << params.out.video.cropImage; - - return false; - } - - scaleFormat_.format = params.out.video.format; - Util::setFormatDimensions( - scaleFormat_.width, - scaleFormat_.height, - params.out.video.width, - params.out.video.height, - params.in.video.width, - params.in.video.height, - 0, - 0, - 1); - - if (!(scaleFormat_ == params_.out.video)) { // crop required - cropContext_ = sws_getContext( - params.out.video.width, - params.out.video.height, - (AVPixelFormat)params.out.video.format, - params.out.video.width, - params.out.video.height, - (AVPixelFormat)params.out.video.format, - swsFlags_, - nullptr, - nullptr, - nullptr); - - if (!cropContext_) { - LOG(ERROR) << "sws_getContext failed for crop context"; - return false; - } - - const auto scaleImageSize = av_image_get_buffer_size( - (AVPixelFormat)scaleFormat_.format, - scaleFormat_.width, - scaleFormat_.height, - 1); - scaleBuffer_.resize(scaleImageSize); - } - } else { - scaleFormat_ = params.out.video; - } - - VLOG(1) << "Input format #" << loggingUuid_ << ", width " - << params.in.video.width << ", height " << params.in.video.height - << ", format " << params.in.video.format << ", minDimension " - << params.in.video.minDimension << ", cropImage " - << params.in.video.cropImage; - VLOG(1) << "Scale format #" << loggingUuid_ << ", width " - << scaleFormat_.width << ", height " << scaleFormat_.height - << ", format " << scaleFormat_.format << ", minDimension " - << scaleFormat_.minDimension << ", cropImage " - << scaleFormat_.cropImage; - VLOG(1) << "Crop format #" << loggingUuid_ << ", width " - << params.out.video.width << ", height " << params.out.video.height - << ", format " << params.out.video.format << ", minDimension " - << params.out.video.minDimension << ", cropImage " - << params.out.video.cropImage; - - // set output format - params_ = params; - - if (params.in.video.format == AV_PIX_FMT_YUV420P) { - /* When the video width and height are not multiples of 8, - * and there is no size change in the conversion, - * a blurry screen will appear on the right side - * This problem was discovered in 2012 and - * continues to exist in version 4.1.3 in 2019 - * This problem can be avoided by increasing SWS_ACCURATE_RND - * details https://trac.ffmpeg.org/ticket/1582 - */ - if ((params.in.video.width & 0x7) || (params.in.video.height & 0x7)) { - VLOG(1) << "The width " << params.in.video.width << " and height " - << params.in.video.height << " the image is not a multiple of 8, " - << "the decoding speed may be reduced"; - swsFlags_ |= SWS_ACCURATE_RND; - } - } - - scaleContext_ = sws_getContext( - params.in.video.width, - params.in.video.height, - (AVPixelFormat)params.in.video.format, - scaleFormat_.width, - scaleFormat_.height, - (AVPixelFormat)scaleFormat_.format, - swsFlags_, - nullptr, - nullptr, - nullptr); - // sws_getContext might fail if in/out format == AV_PIX_FMT_PAL8 (png format) - // Return true if input and output formats/width/height are identical - // Check scaleContext_ for nullptr in transformImage to copy planes directly - - if (params.in.video.width == scaleFormat_.width && - params.in.video.height == scaleFormat_.height && - params.in.video.format == scaleFormat_.format) { - return true; - } - return scaleContext_ != nullptr; -} - -// Main body of the sample function called from one of the overloads below -// -// Args: -// srcSlice: decoded AVFrame->data perpared buffer -// srcStride: linesize (usually obtained from AVFrame->linesize) -// out: return buffer (ByteStorage*) -int VideoSampler::sample( - const uint8_t* const srcSlice[], - int srcStride[], - ByteStorage* out) { - int result; - // scaled and cropped image - int outImageSize = av_image_get_buffer_size( - (AVPixelFormat)params_.out.video.format, - params_.out.video.width, - params_.out.video.height, - 1); - - out->ensure(outImageSize); - - uint8_t* scalePlanes[4] = {nullptr}; - int scaleLines[4] = {0}; - // perform scale first - if ((result = transformImage( - scaleContext_, - srcSlice, - srcStride, - params_.in.video, - scaleFormat_, - // for crop use internal buffer - cropContext_ ? scaleBuffer_.data() : out->writableTail(), - scalePlanes, - scaleLines))) { - return result; - } - - // is crop required? - if (cropContext_) { - uint8_t* cropPlanes[4] = {nullptr}; - int cropLines[4] = {0}; - - if (params_.out.video.height < scaleFormat_.height) { - // Destination image is wider of source image: cut top and bottom - for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) { - scalePlanes[i] += scaleLines[i] * - (scaleFormat_.height - params_.out.video.height) / 2; - } - } else { - // Source image is wider of destination image: cut sides - for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) { - scalePlanes[i] += scaleLines[i] * - (scaleFormat_.width - params_.out.video.width) / 2 / - scaleFormat_.width; - } - } - - // crop image - if ((result = transformImage( - cropContext_, - scalePlanes, - scaleLines, - params_.out.video, - params_.out.video, - out->writableTail(), - cropPlanes, - cropLines))) { - return result; - } - } - - out->append(outImageSize); - return outImageSize; -} - -// Call from `video_stream.cpp::114` - occurs during file reads -int VideoSampler::sample(AVFrame* frame, ByteStorage* out) { - if (!frame) { - return 0; // no flush for videos - } - - return sample(frame->data, frame->linesize, out); -} - -// Call from `video_stream.cpp::114` - not sure when this occurs -int VideoSampler::sample(const ByteStorage* in, ByteStorage* out) { - if (!in) { - return 0; // no flush for videos - } - - int result; - uint8_t* inPlanes[4] = {nullptr}; - int inLineSize[4] = {0}; - - if ((result = preparePlanes( - params_.in.video, in->data(), inPlanes, inLineSize)) < 0) { - return result; - } - - return sample(inPlanes, inLineSize, out); -} - -void VideoSampler::cleanUp() { - if (scaleContext_) { - sws_freeContext(scaleContext_); - scaleContext_ = nullptr; - } - if (cropContext_) { - sws_freeContext(cropContext_); - cropContext_ = nullptr; - scaleBuffer_.clear(); - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/video_sampler.h b/torchvision/csrc/io/decoder/video_sampler.h deleted file mode 100644 index 47247f2c0c5..00000000000 --- a/torchvision/csrc/io/decoder/video_sampler.h +++ /dev/null @@ -1,44 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class transcode video frames from one format into another - */ - -class VideoSampler : public MediaSampler { - public: - VideoSampler(int swsFlags = SWS_AREA, int64_t loggingUuid = 0); - - ~VideoSampler() override; - - // MediaSampler overrides - bool init(const SamplerParameters& params) override; - int sample(const ByteStorage* in, ByteStorage* out) override; - void shutdown() override; - - // returns number processed/scaling bytes - int sample(AVFrame* frame, ByteStorage* out); - int getImageBytes() const; - - private: - // close resources - void cleanUp(); - // helper functions for rescaling, cropping, etc. - int sample( - const uint8_t* const srcSlice[], - int srcStride[], - ByteStorage* out); - - private: - VideoFormat scaleFormat_; - SwsContext* scaleContext_{nullptr}; - SwsContext* cropContext_{nullptr}; - int swsFlags_{SWS_AREA}; - std::vector scaleBuffer_; - int64_t loggingUuid_{0}; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/video_stream.cpp b/torchvision/csrc/io/decoder/video_stream.cpp deleted file mode 100644 index fa08c65cac1..00000000000 --- a/torchvision/csrc/io/decoder/video_stream.cpp +++ /dev/null @@ -1,131 +0,0 @@ -#include "video_stream.h" -#include -#include "util.h" - -namespace ffmpeg { - -namespace { -bool operator==(const VideoFormat& x, const AVFrame& y) { - return x.width == static_cast(y.width) && - x.height == static_cast(y.height) && x.format == y.format; -} - -bool operator==(const VideoFormat& x, const AVCodecContext& y) { - return x.width == static_cast(y.width) && - x.height == static_cast(y.height) && x.format == y.pix_fmt; -} - -VideoFormat& toVideoFormat(VideoFormat& x, const AVFrame& y) { - x.width = y.width; - x.height = y.height; - x.format = y.format; - return x; -} - -VideoFormat& toVideoFormat(VideoFormat& x, const AVCodecContext& y) { - x.width = y.width; - x.height = y.height; - x.format = y.pix_fmt; - return x; -} -} // namespace - -VideoStream::VideoStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const VideoFormat& format, - int64_t loggingUuid) - : Stream( - inputCtx, - MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime, - loggingUuid) {} - -VideoStream::~VideoStream() { - if (sampler_) { - sampler_->shutdown(); - sampler_.reset(); - } -} - -int VideoStream::initFormat() { - // set output format - if (!Util::validateVideoFormat(format_.format.video)) { - LOG(ERROR) << "Invalid video format" - << ", width: " << format_.format.video.width - << ", height: " << format_.format.video.height - << ", format: " << format_.format.video.format - << ", minDimension: " << format_.format.video.minDimension - << ", crop: " << format_.format.video.cropImage; - return -1; - } - - // keep aspect ratio - Util::setFormatDimensions( - format_.format.video.width, - format_.format.video.height, - format_.format.video.width, - format_.format.video.height, - codecCtx_->width, - codecCtx_->height, - format_.format.video.minDimension, - format_.format.video.maxDimension, - 0); - - if (format_.format.video.format == AV_PIX_FMT_NONE) { - format_.format.video.format = codecCtx_->pix_fmt; - } - return format_.format.video.width != 0 && format_.format.video.height != 0 && - format_.format.video.format != AV_PIX_FMT_NONE - ? 0 - : -1; -} - -// copies frame bytes via sws_scale call in video_sampler.cpp -int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) { - if (!sampler_) { - sampler_ = std::make_unique(SWS_AREA, loggingUuid_); - } - - // check if input format gets changed - if (flush ? !(sampler_->getInputFormat().video == *codecCtx_) - : !(sampler_->getInputFormat().video == *frame_)) { - // - reinit sampler - SamplerParameters params; - params.type = format_.type; - params.out = format_.format; - params.in = FormatUnion(0); - flush ? toVideoFormat(params.in.video, *codecCtx_) - : toVideoFormat(params.in.video, *frame_); - if (!sampler_->init(params)) { - return -1; - } - - VLOG(1) << "Set input video sampler format" - << ", width: " << params.in.video.width - << ", height: " << params.in.video.height - << ", format: " << params.in.video.format - << " : output video sampler format" - << ", width: " << format_.format.video.width - << ", height: " << format_.format.video.height - << ", format: " << format_.format.video.format - << ", minDimension: " << format_.format.video.minDimension - << ", crop: " << format_.format.video.cropImage; - } - // calls to a sampler that converts the frame from YUV422 to RGB24, and - // optionally crops and resizes the frame. Frame bytes are copied from - // frame_->data to out buffer - return sampler_->sample(flush ? nullptr : frame_, out); -} - -void VideoStream::setHeader(DecoderHeader* header, bool flush) { - Stream::setHeader(header, flush); - if (!flush) { // no frames for video flush - header->keyFrame = frame_->key_frame; - header->fps = av_q2d(av_guess_frame_rate( - inputCtx_, inputCtx_->streams[format_.stream], nullptr)); - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/video_stream.h b/torchvision/csrc/io/decoder/video_stream.h deleted file mode 100644 index e6a8bf02b65..00000000000 --- a/torchvision/csrc/io/decoder/video_stream.h +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include "stream.h" -#include "video_sampler.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one video stream. - */ - -class VideoStream : public Stream { - public: - VideoStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const VideoFormat& format, - int64_t loggingUuid); - ~VideoStream() override; - - private: - int initFormat() override; - int copyFrameBytes(ByteStorage* out, bool flush) override; - void setHeader(DecoderHeader* header, bool flush) override; - - private: - std::unique_ptr sampler_; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp deleted file mode 100644 index 8f1fb3fb5b9..00000000000 --- a/torchvision/csrc/io/video/video.cpp +++ /dev/null @@ -1,387 +0,0 @@ -#include "video.h" - -#include - -using namespace ffmpeg; - -namespace vision { -namespace video { - -namespace { - -const size_t decoderTimeoutMs = 600000; -const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24; - -// returns number of written bytes -template -size_t fillTensorList(DecoderOutputMessage& msgs, torch::Tensor& frame) { - const auto& msg = msgs; - T* frameData = frame.numel() > 0 ? frame.data_ptr() : nullptr; - if (frameData) { - auto sizeInBytes = msg.payload->length(); - memcpy(frameData, msg.payload->data(), sizeInBytes); - } - return sizeof(T); -} - -size_t fillVideoTensor(DecoderOutputMessage& msgs, torch::Tensor& videoFrame) { - return fillTensorList(msgs, videoFrame); -} - -size_t fillAudioTensor(DecoderOutputMessage& msgs, torch::Tensor& audioFrame) { - return fillTensorList(msgs, audioFrame); -} - -std::array, 4>::const_iterator -_parse_type(const std::string& stream_string) { - static const std::array, 4> types = {{ - {"video", TYPE_VIDEO}, - {"audio", TYPE_AUDIO}, - {"subtitle", TYPE_SUBTITLE}, - {"cc", TYPE_CC}, - }}; - auto device = std::find_if( - types.begin(), - types.end(), - [stream_string](const std::pair& p) { - return p.first == stream_string; - }); - if (device != types.end()) { - return device; - } - TORCH_CHECK( - false, "Expected one of [audio, video, subtitle, cc] ", stream_string); -} - -std::string parse_type_to_string(const std::string& stream_string) { - auto device = _parse_type(stream_string); - return device->first; -} - -MediaType parse_type_to_mt(const std::string& stream_string) { - auto device = _parse_type(stream_string); - return device->second; -} - -std::tuple _parseStream(const std::string& streamString) { - TORCH_CHECK(!streamString.empty(), "Stream string must not be empty"); - static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?"); - std::smatch match; - - TORCH_CHECK( - std::regex_match(streamString, match, regex), - "Invalid stream string: '", - streamString, - "'"); - - std::string type_ = "video"; - type_ = parse_type_to_string(match[1].str()); - long index_ = -1; - if (match[2].matched) { - try { - index_ = std::stoi(match[2].str()); - } catch (const std::exception&) { - TORCH_CHECK( - false, - "Could not parse device index '", - match[2].str(), - "' in device string '", - streamString, - "'"); - } - } - return std::make_tuple(type_, index_); -} - -} // namespace - -void Video::_getDecoderParams( - double videoStartS, - int64_t getPtsOnly, - std::string stream, - long stream_id = -1, - bool fastSeek = true, - bool all_streams = false, - int64_t num_threads = 1, - double seekFrameMarginUs = 10) { - int64_t videoStartUs = int64_t(videoStartS * 1e6); - - params.timeoutMs = decoderTimeoutMs; - params.startOffset = videoStartUs; - params.seekAccuracy = seekFrameMarginUs; - params.fastSeek = fastSeek; - params.headerOnly = false; - params.numThreads = num_threads; - - params.preventStaleness = false; // not sure what this is about - - if (all_streams == true) { - MediaFormat format; - format.stream = -2; - format.type = TYPE_AUDIO; - params.formats.insert(format); - - format.type = TYPE_VIDEO; - format.stream = -2; - format.format.video.width = 0; - format.format.video.height = 0; - format.format.video.cropImage = 0; - format.format.video.format = defaultVideoPixelFormat; - params.formats.insert(format); - - format.type = TYPE_SUBTITLE; - format.stream = -2; - params.formats.insert(format); - - format.type = TYPE_CC; - format.stream = -2; - params.formats.insert(format); - } else { - // parse stream type - MediaType stream_type = parse_type_to_mt(stream); - - // TODO: reset params.formats - std::set formats; - params.formats = formats; - // Define new format - MediaFormat format; - format.type = stream_type; - format.stream = stream_id; - if (stream_type == TYPE_VIDEO) { - format.format.video.width = 0; - format.format.video.height = 0; - format.format.video.cropImage = 0; - format.format.video.format = defaultVideoPixelFormat; - } - params.formats.insert(format); - } - -} // _get decoder params - -void Video::initFromFile( - std::string videoPath, - std::string stream, - int64_t numThreads) { - TORCH_CHECK(!initialized, "Video object can only be initialized once"); - initialized = true; - params.uri = videoPath; - _init(stream, numThreads); -} - -void Video::initFromMemory( - torch::Tensor videoTensor, - std::string stream, - int64_t numThreads) { - TORCH_CHECK(!initialized, "Video object can only be initialized once"); - initialized = true; - callback = MemoryBuffer::getCallback( - videoTensor.data_ptr(), videoTensor.size(0)); - _init(stream, numThreads); -} - -void Video::_init(std::string stream, int64_t numThreads) { - // set number of threads global - numThreads_ = numThreads; - // parse stream information - current_stream = _parseStream(stream); - // note that in the initial call we want to get all streams - _getDecoderParams( - 0, // video start - 0, // headerOnly - std::get<0>(current_stream), // stream info - remove that - long(-1), // stream_id parsed from info above change to -2 - false, // fastseek: we're using the default param here - true, // read all streams - numThreads_ // global number of Threads for decoding - ); - - std::string logMessage, logType; - - // locals - std::vector audioFPS, videoFPS; - std::vector audioDuration, videoDuration, ccDuration, subsDuration; - std::vector audioTB, videoTB, ccTB, subsTB; - c10::Dict> audioMetadata; - c10::Dict> videoMetadata; - c10::Dict> ccMetadata; - c10::Dict> subsMetadata; - - // callback and metadata defined in struct - DecoderInCallback tmp_callback = callback; - succeeded = decoder.init(params, std::move(tmp_callback), &metadata); - if (succeeded) { - for (const auto& header : metadata) { - double fps = double(header.fps); - double duration = double(header.duration) * 1e-6; // * timeBase; - - if (header.format.type == TYPE_VIDEO) { - videoFPS.push_back(fps); - videoDuration.push_back(duration); - } else if (header.format.type == TYPE_AUDIO) { - audioFPS.push_back(fps); - audioDuration.push_back(duration); - } else if (header.format.type == TYPE_CC) { - ccDuration.push_back(duration); - } else if (header.format.type == TYPE_SUBTITLE) { - subsDuration.push_back(duration); - }; - } - } - // audio - audioMetadata.insert("duration", audioDuration); - audioMetadata.insert("framerate", audioFPS); - // video - videoMetadata.insert("duration", videoDuration); - videoMetadata.insert("fps", videoFPS); - // subs - subsMetadata.insert("duration", subsDuration); - // cc - ccMetadata.insert("duration", ccDuration); - // put all to a data - streamsMetadata.insert("video", videoMetadata); - streamsMetadata.insert("audio", audioMetadata); - streamsMetadata.insert("subtitles", subsMetadata); - streamsMetadata.insert("cc", ccMetadata); - - succeeded = setCurrentStream(stream); - if (std::get<1>(current_stream) != -1) { - LOG(INFO) - << "Stream index set to " << std::get<1>(current_stream) - << ". If you encounter trouble, consider switching it to automatic stream discovery. \n"; - } -} - -Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { - C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); - if (!videoPath.empty()) { - initFromFile(videoPath, stream, numThreads); - } -} // video - -bool Video::setCurrentStream(std::string stream = "video") { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - if ((!stream.empty()) && (_parseStream(stream) != current_stream)) { - current_stream = _parseStream(stream); - } - - double ts = 0; - if (seekTS > 0) { - ts = seekTS; - } - - _getDecoderParams( - ts, // video start - 0, // headerOnly - std::get<0>(current_stream), // stream - long(std::get<1>( - current_stream)), // stream_id parsed from info above change to -2 - false, // fastseek param set to 0 false by default (changed in seek) - false, // read all streams - numThreads_ // global number of threads - ); - - // callback and metadata defined in Video.h - DecoderInCallback tmp_callback = callback; - return (decoder.init(params, std::move(tmp_callback), &metadata)); -} - -std::tuple Video::getCurrentStream() const { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - return current_stream; -} - -c10::Dict>> Video:: - getStreamMetadata() const { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - return streamsMetadata; -} - -void Video::Seek(double ts, bool fastSeek = false) { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - // initialize the class variables used for seeking and retrurn - _getDecoderParams( - ts, // video start - 0, // headerOnly - std::get<0>(current_stream), // stream - long(std::get<1>( - current_stream)), // stream_id parsed from info above change to -2 - fastSeek, // fastseek - false, // read all streams - numThreads_ // global number of threads - ); - - // callback and metadata defined in Video.h - DecoderInCallback tmp_callback = callback; - succeeded = decoder.init(params, std::move(tmp_callback), &metadata); -} - -std::tuple Video::Next() { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - // if failing to decode simply return a null tensor (note, should we - // raise an exception?) - double frame_pts_s; - torch::Tensor outFrame = torch::zeros({0}, torch::kByte); - - // decode single frame - DecoderOutputMessage out; - int64_t res = decoder.decode(&out, decoderTimeoutMs); - // if successful - if (res == 0) { - frame_pts_s = double(double(out.header.pts) * 1e-6); - - auto header = out.header; - const auto& format = header.format; - - // initialize the output variables based on type - - if (format.type == TYPE_VIDEO) { - // note: this can potentially be optimized - // by having the global tensor that we fill at decode time - // (would avoid allocations) - int outHeight = format.format.video.height; - int outWidth = format.format.video.width; - int numChannels = 3; - outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte); - fillVideoTensor(out, outFrame); - outFrame = outFrame.permute({2, 0, 1}); - - } else if (format.type == TYPE_AUDIO) { - int outAudioChannels = format.format.audio.channels; - int bytesPerSample = av_get_bytes_per_sample( - static_cast(format.format.audio.format)); - int frameSizeTotal = out.payload->length(); - - TORCH_CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0); - int numAudioSamples = - frameSizeTotal / (outAudioChannels * bytesPerSample); - - outFrame = - torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat); - - fillAudioTensor(out, outFrame); - } - // currently not supporting other formats (will do soon) - - out.payload.reset(); - } else if (res == ENODATA) { - LOG(INFO) << "Decoder ran out of frames (ENODATA)\n"; - } else { - LOG(ERROR) << "Decoder failed with ERROR_CODE " << res; - } - - return std::make_tuple(outFrame, frame_pts_s); -} - -static auto registerVideo = - torch::class_