diff --git a/VIDEO_READER_MOVE_PLAN.md b/VIDEO_READER_MOVE_PLAN.md new file mode 100644 index 00000000000..98109a2b47f --- /dev/null +++ b/VIDEO_READER_MOVE_PLAN.md @@ -0,0 +1,517 @@ +# Video Reader Move Plan: torchvision → fb/ (Internal Only) + +## Overview + +Move the `video_reader` backend from the open-source `torchvision/` folder to the internal-only `fb/` folder. This allows: +- ✅ Removal from GitHub/open source +- ✅ Internal Meta users continue to have access +- ✅ Existing `fb/datasets/video_clip_sampler.py` keeps working + +--- + +## Part 1: C++ Files to Move + +### 1.1 Decoder Core (`decoder/`) + +**From:** `torchvision/csrc/io/decoder/` +**To:** `fb/csrc/io/decoder/` + +| File | Description | +|------|-------------| +| `audio_sampler.cpp` | Audio frame sampling | +| `audio_sampler.h` | | +| `audio_stream.cpp` | Audio stream handling | +| `audio_stream.h` | | +| `cc_stream.cpp` | Closed caption stream | +| `cc_stream.h` | | +| `decoder.cpp` | Main FFmpeg decoder class | +| `decoder.h` | | +| `defs.h` | Common definitions | +| `memory_buffer.cpp` | Memory buffer utils | +| `memory_buffer.h` | | +| `seekable_buffer.cpp` | Seekable buffer for streaming | +| `seekable_buffer.h` | | +| `stream.cpp` | Base stream class | +| `stream.h` | | +| `subtitle_sampler.cpp` | Subtitle sampling | +| `subtitle_sampler.h` | | +| `subtitle_stream.cpp` | Subtitle stream handling | +| `subtitle_stream.h` | | +| `sync_decoder.cpp` | Synchronous decoder wrapper | +| `sync_decoder.h` | | +| `time_keeper.cpp` | Timestamp management | +| `time_keeper.h` | | +| `util.cpp` | Utility functions | +| `util.h` | | +| `video_sampler.cpp` | Video frame sampling | +| `video_sampler.h` | | +| `video_stream.cpp` | Video stream handling | +| `video_stream.h` | | + +**Test files (move to `fb/csrc/io/decoder/` or `fb/tests/`):** +| File | Description | +|------|-------------| +| `sync_decoder_test.cpp` | Unit tests for sync_decoder | +| `util_test.cpp` | Unit tests for utilities | + +### 1.2 
Video Utils (`video/`) + +**From:** `torchvision/csrc/io/video/` +**To:** `fb/csrc/io/video/` + +| File | Description | +|------|-------------| +| `video.cpp` | Video class implementation | +| `video.h` | Video class header | + +### 1.3 Video Reader Ops (`video_reader/`) + +**From:** `torchvision/csrc/io/video_reader/` +**To:** `fb/csrc/io/video_reader/` + +| File | Description | +|------|-------------| +| `video_reader.cpp` | torch.ops.video_reader registration | +| `video_reader.h` | | + +--- + +## Part 2: Python Files to Move + +**From:** `torchvision/io/` +**To:** `fb/io/` + +| File | Description | +|------|-------------| +| `_video_opt.py` | Core video_reader Python API (`_read_video_from_memory`, etc.) | +| `video_reader.py` | `VideoReader` class | +| `_video_deprecation_warning.py` | Deprecation warning helper (can stay in torchvision or be duplicated) | + +--- + +## Part 3: BUCK Target Changes + +### 3.1 Current Targets (in `pytorch/vision/BUCK`) + +```python +# Lines 501-539: decoder_streaming +fbcode_target( + _kind = cpp_library, + name = "decoder_streaming", + srcs = glob(["torchvision/csrc/io/decoder/*.cpp"], exclude = [...]), + ... +) + +# Lines 541-585: Tests +fbcode_target(_kind = cpp_unittest, name = "sync_decoder_test", ...) +fbcode_target(_kind = cpp_unittest, name = "sync_decoder_test_ffmpeg_7_1", ...) +fbcode_target(_kind = cpp_unittest, name = "util_test", ...) +fbcode_target(_kind = cpp_unittest, name = "util_test_ffmpeg_7_1", ...) + +# Lines 587-613: video_reader +fbcode_target( + _kind = cpp_library, + name = "video_reader", + srcs = glob([ + "torchvision/csrc/io/video/*.cpp", + "torchvision/csrc/io/video_reader/*.cpp", + ]), + ... +) + +# Lines 615-640: video_reader_cpu +fbcode_target( + _kind = cpp_library, + name = "video_reader_cpu", + ... 
+) +``` + +### 3.2 New Targets (create `pytorch/vision/fb/BUCK` or add to existing) + +```python +# fb/BUCK - New or updated file + +load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +# C++ decoder library +cpp_library( + name = "decoder_streaming", + srcs = glob( + ["csrc/io/decoder/*.cpp"], + exclude = [ + "csrc/io/decoder/sync_decoder_test.cpp", + "csrc/io/decoder/util_test.cpp", + ], + ), + headers = glob(["csrc/io/decoder/*.h"]), + propagated_pp_flags = [ + "-Ipytorch/vision/fb/csrc/io/decoder", + ], + exported_deps = [ + "//caffe2/c10:c10", + ] + select({ + "DEFAULT": [], + "ovr_config//third-party/ffmpeg/constraints:7.1": [ + "fbsource//third-party/ffmpeg/ffmpeg_7_1:avcodec-network", + "fbsource//third-party/ffmpeg/ffmpeg_7_1:avfilter-network", + ], + }), + exported_external_deps = select({ + "DEFAULT": [ + ("ffmpeg-ref", None, "avfilter"), + ("ffmpeg-ref", None, "avcodec"), + ], + "ovr_config//third-party/ffmpeg/constraints:7.1": [], + }), +) + +# C++ video_reader library +cpp_library( + name = "video_reader", + srcs = glob([ + "csrc/io/video/*.cpp", + "csrc/io/video_reader/*.cpp", + ]), + headers = glob([ + "csrc/io/video/*.h", + "csrc/io/video_reader/*.h", + ]), + link_whole = True, + preprocessor_flags = [ + "-Ipytorch/vision/fb/csrc/io/video", + "-Ipytorch/vision/fb/csrc/io/video_reader", + "-DTORCH_EXTENSION_NAME=video_reader", + ], + propagated_pp_flags = [ + "-Ipytorch/vision/fb/csrc/io/video", + "-Ipytorch/vision/fb/csrc/io/video_reader", + ], + supports_python_dlopen = True, + exported_deps = [ + ":decoder_streaming", + "//caffe2:torch-cpp", + ], +) + +# CPU-only variant +cpp_library( + name = "video_reader_cpu", + srcs = glob([ + "csrc/io/video/*.cpp", + "csrc/io/video_reader/*.cpp", + ]), + headers = glob([ + "csrc/io/video/*.h", + "csrc/io/video_reader/*.h", + ]), + link_whole = False, + 
preprocessor_flags = [ + "-Ipytorch/vision/fb/csrc/io/video", + "-Ipytorch/vision/fb/csrc/io/video_reader", + "-DTORCH_EXTENSION_NAME=video_reader", + ], + propagated_pp_flags = [ + "-Ipytorch/vision/fb/csrc/io/video", + "-Ipytorch/vision/fb/csrc/io/video_reader", + ], + exported_deps = [ + ":decoder_streaming", + "//caffe2:torch-cpp-cpu", + ], +) + +# Tests +cpp_unittest( + name = "sync_decoder_test", + srcs = ["csrc/io/decoder/sync_decoder_test.cpp"], + deps = [":decoder_streaming"], +) + +cpp_unittest( + name = "sync_decoder_test_ffmpeg_7_1", + srcs = ["csrc/io/decoder/sync_decoder_test.cpp"], + modifiers = ["ovr_config//third-party/ffmpeg/constraints:7.1"], + deps = [":decoder_streaming"], +) + +cpp_unittest( + name = "util_test", + srcs = ["csrc/io/decoder/util_test.cpp"], + deps = [":decoder_streaming"], +) + +cpp_unittest( + name = "util_test_ffmpeg_7_1", + srcs = ["csrc/io/decoder/util_test.cpp"], + modifiers = ["ovr_config//third-party/ffmpeg/constraints:7.1"], + deps = [":decoder_streaming"], +) + +# Python library for video_reader API +python_library( + name = "video_reader_py", + srcs = [ + "io/_video_opt.py", + "io/video_reader.py", + ], + deps = [ + "//pytorch/vision:torchvision", # For extension loading + ], + cpp_deps = [ + ":video_reader", + ], +) +``` + +### 3.3 Remove from `pytorch/vision/BUCK` + +Delete these targets from the main BUCK file: +- `:decoder_streaming` (lines 501-539) +- `:sync_decoder_test` (lines 541-550) +- `:sync_decoder_test_ffmpeg_7_1` (lines 552-562) +- `:util_test` (lines 564-573) +- `:util_test_ffmpeg_7_1` (lines 575-585) +- `:video_reader` (lines 587-613) +- `:video_reader_cpu` (lines 615-640) + +--- + +## Part 4: Update Include Paths in C++ Files + +After moving, update `#include` statements in moved files: + +### In `fb/csrc/io/decoder/*.cpp` files: +```cpp +// Before +#include "sync_decoder.h" + +// After (if using full paths) +#include "pytorch/vision/fb/csrc/io/decoder/sync_decoder.h" +// Or keep relative if 
propagated_pp_flags handles it +``` + +### In `fb/csrc/io/video/*.cpp` and `fb/csrc/io/video_reader/*.cpp`: +```cpp +// Before +#include "pytorch/vision/torchvision/csrc/io/decoder/sync_decoder.h" + +// After +#include "pytorch/vision/fb/csrc/io/decoder/sync_decoder.h" +``` + +--- + +## Part 5: Update Python Imports + +### 5.1 Create `fb/io/__init__.py` + +```python +# fb/io/__init__.py +from ._video_opt import ( + _HAS_CPU_VIDEO_DECODER, + _HAS_VIDEO_OPT, + _probe_video_from_file, + _probe_video_from_memory, + _read_video_from_file, + _read_video_from_memory, + _read_video_timestamps_from_file, + _read_video_timestamps_from_memory, + Timebase, + VideoMetaData, +) +from .video_reader import VideoReader + +__all__ = [ + "_read_video_from_file", + "_read_video_timestamps_from_file", + "_probe_video_from_file", + "_read_video_from_memory", + "_read_video_timestamps_from_memory", + "_probe_video_from_memory", + "_HAS_CPU_VIDEO_DECODER", + "_HAS_VIDEO_OPT", + "VideoMetaData", + "Timebase", + "VideoReader", +] +``` + +### 5.2 Update `fb/io/_video_opt.py` + +```python +# Change this line: +from ..extension import _load_library + +# To: +from torchvision.extension import _load_library + +# OR create fb/extension.py that loads from fb/BUCK target +``` + +### 5.3 Update `fb/io/video_reader.py` + +```python +# Change: +from ..utils import _log_api_usage_once +from ._video_deprecation_warning import _raise_video_deprecation_warning +from ._video_opt import _HAS_CPU_VIDEO_DECODER + +# To: +from torchvision.utils import _log_api_usage_once +from torchvision.io._video_deprecation_warning import _raise_video_deprecation_warning +from ._video_opt import _HAS_CPU_VIDEO_DECODER +``` + +### 5.4 Update `fb/datasets/video_clip_sampler.py` + +```python +# Change line 8: +from torchvision.io import _probe_video_from_memory, _read_video_from_memory, Timebase + +# To: +from pytorch.vision.fb.io import _probe_video_from_memory, _read_video_from_memory, Timebase + +# OR if using package 
structure: +from ..io import _probe_video_from_memory, _read_video_from_memory, Timebase +``` + +--- + +## Part 6: Update torchvision's Public API + +### 6.1 Update `torchvision/io/__init__.py` + +Remove video_reader exports (or make them conditional): + +```python +# Remove these lines: +from ._video_opt import ( + _HAS_CPU_VIDEO_DECODER, + _HAS_VIDEO_OPT, + _probe_video_from_file, + _probe_video_from_memory, + _read_video_from_file, + _read_video_from_memory, + _read_video_timestamps_from_file, + _read_video_timestamps_from_memory, + Timebase, + VideoMetaData, +) +from .video_reader import VideoReader + +# Replace with stubs that raise deprecation errors for OSS: +_HAS_CPU_VIDEO_DECODER = False +_HAS_VIDEO_OPT = False + +def _stub_not_available(*args, **kwargs): + raise RuntimeError( + "video_reader backend is not available in open-source torchvision. " + "Use PyAV or TorchCodec instead." + ) + +_probe_video_from_file = _stub_not_available +_probe_video_from_memory = _stub_not_available +_read_video_from_file = _stub_not_available +_read_video_from_memory = _stub_not_available +_read_video_timestamps_from_file = _stub_not_available +_read_video_timestamps_from_memory = _stub_not_available + +class Timebase: + pass # Keep for compatibility + +class VideoMetaData: + pass # Keep for compatibility + +class VideoReader: + def __init__(self, *args, **kwargs): + raise RuntimeError( + "VideoReader with video_reader backend is not available. " + "Use backend='pyav' or migrate to TorchCodec." + ) +``` + +### 6.2 Update `torchvision/__init__.py` + +```python +# In set_video_backend(), remove "video_reader" as valid option for OSS: +def set_video_backend(backend: str) -> None: + # OSS version: only pyav + if backend not in ("pyav",): + raise ValueError(f"Invalid video backend: {backend}. Use 'pyav'.") + ... 
+``` + +--- + +## Part 7: Update External Dependencies + +Update these BUCK files to point to the new targets: + +| File | Change | +|------|--------| +| `cu_tdm/dps/worker/udf/BUCK` | `//pytorch/vision:video_reader` → `//pytorch/vision/fb:video_reader` | +| `cu_tdm/dps/worker/udf/BUCK` | `//pytorch/vision:decoder_streaming` → `//pytorch/vision/fb:decoder_streaming` | +| `fblearner/predictor/model/BUCK` | `//pytorch/vision:video_reader_cpu` → `//pytorch/vision/fb:video_reader_cpu` | +| `mitra/projects/xray_video_integrity/transforms/BUCK` | `//pytorch/vision:video_reader` → `//pytorch/vision/fb:video_reader` | +| `fblearner/flow/.../video_transformers.py` | Update `get_torch_custom_op_targets()` return value | + +--- + +## Part 8: File Move Commands + +```bash +# Create directory structure (run all commands below from fbcode/pytorch/vision) +mkdir -p fb/csrc/io/decoder +mkdir -p fb/csrc/io/video +mkdir -p fb/csrc/io/video_reader +mkdir -p fb/io + +# Move C++ decoder files +sl mv torchvision/csrc/io/decoder/*.cpp fb/csrc/io/decoder/ +sl mv torchvision/csrc/io/decoder/*.h fb/csrc/io/decoder/ + +# Move C++ video files +sl mv torchvision/csrc/io/video/*.cpp fb/csrc/io/video/ +sl mv torchvision/csrc/io/video/*.h fb/csrc/io/video/ + +# Move C++ video_reader files +sl mv torchvision/csrc/io/video_reader/*.cpp fb/csrc/io/video_reader/ +sl mv torchvision/csrc/io/video_reader/*.h fb/csrc/io/video_reader/ + +# Move Python files +sl mv torchvision/io/_video_opt.py fb/io/ +sl mv torchvision/io/video_reader.py fb/io/ +``` + +--- + +## Part 9: Testing Checklist + +After migration: + +- [ ] `buck build //pytorch/vision/fb:decoder_streaming` +- [ ] `buck build //pytorch/vision/fb:video_reader` +- [ ] `buck build //pytorch/vision/fb:video_reader_cpu` +- [ ] `buck test //pytorch/vision/fb:sync_decoder_test` +- [ ] `buck test //pytorch/vision/fb:util_test` +- [ ] `buck build //cu_tdm/dps/worker/udf:udf` +- [ ] `buck build 
//fblearner/predictor/model:pytorch_predictor_container` +- [ ] Test `fb/datasets/video_clip_sampler.py` still works +- [ ] Verify OSS build no longer includes video_reader code + +--- + +## Summary + +| Category | Count | +|----------|-------| +| C++ files to move | 35 files (33 sources/headers + 2 tests) | +| Python files to move | 2-3 files | +| BUCK targets to move | 7 targets | +| External BUCK deps to update | 4-5 files | +| New files to create | `fb/io/__init__.py`, update `fb/BUCK` | + +**Estimated effort:** 1-2 days for migration + testing diff --git a/torchvision/csrc/io/decoder/audio_sampler.cpp b/torchvision/csrc/io/decoder/audio_sampler.cpp deleted file mode 100644 index b158d3438b8..00000000000 --- a/torchvision/csrc/io/decoder/audio_sampler.cpp +++ /dev/null @@ -1,254 +0,0 @@ -#include "audio_sampler.h" -#include -#include "util.h" - -#define AVRESAMPLE_MAX_CHANNELS 32 - -// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24 -namespace ffmpeg { - -namespace { -int preparePlanes( - const AudioFormat& fmt, - const uint8_t* buffer, - int numSamples, - uint8_t** planes) { - int result; - if ((result = av_samples_fill_arrays( - planes, - nullptr, // linesize is not needed - buffer, - fmt.channels, - numSamples, - (AVSampleFormat)fmt.format, - 1)) < 0) { - LOG(ERROR) << "av_samples_fill_arrays failed, err: " - << Util::generateErrorDesc(result) - << ", numSamples: " << numSamples << ", fmt: " << fmt.format; - } - return result; -} -} // namespace - -AudioSampler::AudioSampler(void* logCtx) : logCtx_(logCtx) {} - -AudioSampler::~AudioSampler() { - cleanUp(); -} - -void AudioSampler::shutdown() { - cleanUp(); -} - -bool AudioSampler::init(const SamplerParameters& params) { - cleanUp(); - - if (params.type != MediaType::TYPE_AUDIO) { - LOG(ERROR) << "Invalid media type, expected MediaType::TYPE_AUDIO"; - return false; - } - -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) - AVChannelLayout channel_out; - AVChannelLayout channel_in; - 
av_channel_layout_default(&channel_out, params.out.audio.channels); - av_channel_layout_default(&channel_in, params.in.audio.channels); - int ret = swr_alloc_set_opts2( - &swrContext_, - &channel_out, - (AVSampleFormat)params.out.audio.format, - params.out.audio.samples, - &channel_in, - (AVSampleFormat)params.in.audio.format, - params.in.audio.samples, - 0, - logCtx_); - if (ret < 0 || swrContext_ == nullptr) { - LOG(ERROR) << "Cannot allocate SwrContext"; - return false; - } -#else - swrContext_ = swr_alloc_set_opts( - nullptr, - av_get_default_channel_layout(params.out.audio.channels), - (AVSampleFormat)params.out.audio.format, - params.out.audio.samples, - av_get_default_channel_layout(params.in.audio.channels), - (AVSampleFormat)params.in.audio.format, - params.in.audio.samples, - 0, - logCtx_); - if (swrContext_ == nullptr) { - LOG(ERROR) << "Cannot allocate SwrContext"; - return false; - } -#endif - - int result; - if ((result = swr_init(swrContext_)) < 0) { - LOG(ERROR) << "swr_init failed, err: " << Util::generateErrorDesc(result) - << ", in -> format: " << params.in.audio.format - << ", channels: " << params.in.audio.channels - << ", samples: " << params.in.audio.samples - << ", out -> format: " << params.out.audio.format - << ", channels: " << params.out.audio.channels - << ", samples: " << params.out.audio.samples; - return false; - } - - // set formats - params_ = params; - return true; -} - -int AudioSampler::numOutputSamples(int inSamples) const { - return swr_get_out_samples(swrContext_, inSamples); -} - -int AudioSampler::sample( - const uint8_t* inPlanes[], - int inNumSamples, - ByteStorage* out, - int outNumSamples) { - int result; - int outBufferBytes = av_samples_get_buffer_size( - nullptr, - params_.out.audio.channels, - outNumSamples, - (AVSampleFormat)params_.out.audio.format, - 1); - - if (out) { - out->ensure(outBufferBytes); - - uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; - - if ((result = preparePlanes( - params_.out.audio, 
- out->writableTail(), - outNumSamples, - outPlanes)) < 0) { - return result; - } - - if ((result = swr_convert( - swrContext_, - &outPlanes[0], - outNumSamples, - inPlanes, - inNumSamples)) < 0) { - LOG(ERROR) << "swr_convert failed, err: " - << Util::generateErrorDesc(result); - return result; - } - - TORCH_CHECK_LE(result, outNumSamples); - - if (result) { - if ((result = av_samples_get_buffer_size( - nullptr, - params_.out.audio.channels, - result, - (AVSampleFormat)params_.out.audio.format, - 1)) >= 0) { - out->append(result); - } else { - LOG(ERROR) << "av_samples_get_buffer_size failed, err: " - << Util::generateErrorDesc(result); - } - } - } else { - // allocate a temporary buffer - auto* tmpBuffer = static_cast(av_malloc(outBufferBytes)); - if (!tmpBuffer) { - LOG(ERROR) << "av_alloc failed, for size: " << outBufferBytes; - return -1; - } - - uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; - - if ((result = preparePlanes( - params_.out.audio, tmpBuffer, outNumSamples, outPlanes)) < 0) { - av_free(tmpBuffer); - return result; - } - - if ((result = swr_convert( - swrContext_, - &outPlanes[0], - outNumSamples, - inPlanes, - inNumSamples)) < 0) { - LOG(ERROR) << "swr_convert failed, err: " - << Util::generateErrorDesc(result); - av_free(tmpBuffer); - return result; - } - - av_free(tmpBuffer); - - TORCH_CHECK_LE(result, outNumSamples); - - if (result) { - result = av_samples_get_buffer_size( - nullptr, - params_.out.audio.channels, - result, - (AVSampleFormat)params_.out.audio.format, - 1); - } - } - - return result; -} - -int AudioSampler::sample(AVFrame* frame, ByteStorage* out) { - const auto outNumSamples = numOutputSamples(frame ? frame->nb_samples : 0); - - if (!outNumSamples) { - return 0; - } - - return sample( - frame ? (const uint8_t**)&frame->data[0] : nullptr, - frame ? 
frame->nb_samples : 0, - out, - outNumSamples); -} - -int AudioSampler::sample(const ByteStorage* in, ByteStorage* out) { - const auto inSampleSize = - av_get_bytes_per_sample((AVSampleFormat)params_.in.audio.format); - - const auto inNumSamples = - !in ? 0 : in->length() / inSampleSize / params_.in.audio.channels; - - const auto outNumSamples = numOutputSamples(inNumSamples); - - if (!outNumSamples) { - return 0; - } - - uint8_t* inPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; - int result; - if (in && - (result = preparePlanes( - params_.in.audio, in->data(), inNumSamples, inPlanes)) < 0) { - return result; - } - - return sample( - in ? (const uint8_t**)inPlanes : nullptr, - inNumSamples, - out, - outNumSamples); -} - -void AudioSampler::cleanUp() { - if (swrContext_) { - swr_free(&swrContext_); - swrContext_ = nullptr; - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/audio_sampler.h b/torchvision/csrc/io/decoder/audio_sampler.h deleted file mode 100644 index e105bbe4de2..00000000000 --- a/torchvision/csrc/io/decoder/audio_sampler.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class transcode audio frames from one format into another - */ - -class AudioSampler : public MediaSampler { - public: - explicit AudioSampler(void* logCtx); - ~AudioSampler() override; - - // MediaSampler overrides - bool init(const SamplerParameters& params) override; - int sample(const ByteStorage* in, ByteStorage* out) override; - void shutdown() override; - - int sample(AVFrame* frame, ByteStorage* out); - - private: - // close resources - void cleanUp(); - // helper functions for rescaling, cropping, etc. 
- int numOutputSamples(int inSamples) const; - int sample( - const uint8_t* inPlanes[], - int inNumSamples, - ByteStorage* out, - int outNumSamples); - - private: - SwrContext* swrContext_{nullptr}; - void* logCtx_{nullptr}; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/audio_stream.cpp b/torchvision/csrc/io/decoder/audio_stream.cpp deleted file mode 100644 index c3a003434b8..00000000000 --- a/torchvision/csrc/io/decoder/audio_stream.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include "audio_stream.h" -#include -#include "util.h" - -namespace ffmpeg { - -namespace { -static int get_nb_channels(const AVFrame* frame, const AVCodecContext* codec) { -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) - return frame ? frame->ch_layout.nb_channels : codec->ch_layout.nb_channels; -#else - return frame ? frame->channels : codec->channels; -#endif -} - -bool operator==(const AudioFormat& x, const AVFrame& y) { - return x.samples == static_cast(y.sample_rate) && - x.channels == static_cast(get_nb_channels(&y, nullptr)) && - x.format == y.format; -} - -bool operator==(const AudioFormat& x, const AVCodecContext& y) { - return x.samples == static_cast(y.sample_rate) && - x.channels == static_cast(get_nb_channels(nullptr, &y)) && - x.format == y.sample_fmt; -} - -AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) { - x.samples = y.sample_rate; - x.channels = get_nb_channels(&y, nullptr); - x.format = y.format; - return x; -} - -AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) { - x.samples = y.sample_rate; - x.channels = get_nb_channels(nullptr, &y); - x.format = y.sample_fmt; - return x; -} -} // namespace - -AudioStream::AudioStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const AudioFormat& format) - : Stream( - inputCtx, - MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime, - 0) {} - -AudioStream::~AudioStream() { - if (sampler_) { - sampler_->shutdown(); - 
sampler_.reset(); - } -} - -int AudioStream::initFormat() { - // set output format - if (format_.format.audio.samples == 0) { - format_.format.audio.samples = codecCtx_->sample_rate; - } -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) - if (format_.format.audio.channels == 0) { - format_.format.audio.channels = codecCtx_->ch_layout.nb_channels; - } -#else - if (format_.format.audio.channels == 0) { - format_.format.audio.channels = codecCtx_->channels; - } -#endif - if (format_.format.audio.format == AV_SAMPLE_FMT_NONE) { - format_.format.audio.format = codecCtx_->sample_fmt; - } - - return format_.format.audio.samples != 0 && - format_.format.audio.channels != 0 && - format_.format.audio.format != AV_SAMPLE_FMT_NONE - ? 0 - : -1; -} - -// copies audio sample bytes via swr_convert call in audio_sampler.cpp -int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) { - if (!sampler_) { - sampler_ = std::make_unique(codecCtx_); - } - // check if input format gets changed - if (flush ? !(sampler_->getInputFormat().audio == *codecCtx_) - : !(sampler_->getInputFormat().audio == *frame_)) { - // - reinit sampler - SamplerParameters params; - params.type = format_.type; - params.out = format_.format; - params.in = FormatUnion(); - flush ? toAudioFormat(params.in.audio, *codecCtx_) - : toAudioFormat(params.in.audio, *frame_); - if (!sampler_->init(params)) { - return -1; - } - - VLOG(1) << "Set input audio sampler format" - << ", samples: " << params.in.audio.samples - << ", channels: " << params.in.audio.channels - << ", format: " << params.in.audio.format - << " : output audio sampler format" - << ", samples: " << format_.format.audio.samples - << ", channels: " << format_.format.audio.channels - << ", format: " << format_.format.audio.format; - } - // calls to a sampler that converts the audio samples and copies them to the - // out buffer via ffmpeg::swr_convert - return sampler_->sample(flush ? 
nullptr : frame_, out); -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/audio_stream.h b/torchvision/csrc/io/decoder/audio_stream.h deleted file mode 100644 index 2d6457b68f5..00000000000 --- a/torchvision/csrc/io/decoder/audio_stream.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include "audio_sampler.h" -#include "stream.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one audio stream. - */ - -class AudioStream : public Stream { - public: - AudioStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const AudioFormat& format); - ~AudioStream() override; - - private: - int initFormat() override; - int copyFrameBytes(ByteStorage* out, bool flush) override; - - private: - std::unique_ptr sampler_; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/cc_stream.cpp b/torchvision/csrc/io/decoder/cc_stream.cpp deleted file mode 100644 index 89174c396fd..00000000000 --- a/torchvision/csrc/io/decoder/cc_stream.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include "cc_stream.h" - -namespace ffmpeg { - -CCStream::CCStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format) - : SubtitleStream(inputCtx, index, convertPtsToWallTime, format) { - format_.type = TYPE_CC; -} - -AVCodec* CCStream::findCodec(AVCodecParameters* params) { - if (params->codec_id == AV_CODEC_ID_BIN_DATA && - params->codec_type == AVMEDIA_TYPE_DATA) { - // obtain subtitles codec - params->codec_id = AV_CODEC_ID_MOV_TEXT; - params->codec_type = AVMEDIA_TYPE_SUBTITLE; - } - return Stream::findCodec(params); -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/cc_stream.h b/torchvision/csrc/io/decoder/cc_stream.h deleted file mode 100644 index 3a1d169f014..00000000000 --- a/torchvision/csrc/io/decoder/cc_stream.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include "subtitle_stream.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode 
one closed captions stream. - */ -class CCStream : public SubtitleStream { - public: - CCStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format); - - private: - AVCodec* findCodec(AVCodecParameters* params) override; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp deleted file mode 100644 index 7221445840e..00000000000 --- a/torchvision/csrc/io/decoder/decoder.cpp +++ /dev/null @@ -1,764 +0,0 @@ -#include "decoder.h" -#include -#include -#include -#include -#include -#include "audio_stream.h" -#include "cc_stream.h" -#include "subtitle_stream.h" -#include "util.h" -#include "video_stream.h" - -namespace ffmpeg { - -namespace { - -constexpr size_t kIoBufferSize = 96 * 1024; -constexpr size_t kIoPaddingSize = AV_INPUT_BUFFER_PADDING_SIZE; -constexpr size_t kLogBufferSize = 1024; - -bool mapFfmpegType(AVMediaType media, MediaType* type) { - switch (media) { - case AVMEDIA_TYPE_AUDIO: - *type = TYPE_AUDIO; - return true; - case AVMEDIA_TYPE_VIDEO: - *type = TYPE_VIDEO; - return true; - case AVMEDIA_TYPE_SUBTITLE: - *type = TYPE_SUBTITLE; - return true; - case AVMEDIA_TYPE_DATA: - *type = TYPE_CC; - return true; - default: - return false; - } -} - -std::unique_ptr createStream( - MediaType type, - AVFormatContext* ctx, - int idx, - bool convertPtsToWallTime, - const FormatUnion& format, - int64_t loggingUuid) { - switch (type) { - case TYPE_AUDIO: - return std::make_unique( - ctx, idx, convertPtsToWallTime, format.audio); - case TYPE_VIDEO: - return std::make_unique( - // negative loggingUuid indicates video streams. 
- ctx, - idx, - convertPtsToWallTime, - format.video, - -loggingUuid); - case TYPE_SUBTITLE: - return std::make_unique( - ctx, idx, convertPtsToWallTime, format.subtitle); - case TYPE_CC: - return std::make_unique( - ctx, idx, convertPtsToWallTime, format.subtitle); - default: - return nullptr; - } -} - -} // Namespace - -/* static */ -void Decoder::logFunction(void* avcl, int level, const char* cfmt, va_list vl) { - if (!avcl) { - // Nothing can be done here - return; - } - - AVClass* avclass = *reinterpret_cast(avcl); - if (!avclass) { - // Nothing can be done here - return; - } - Decoder* decoder = nullptr; - if (strcmp(avclass->class_name, "AVFormatContext") == 0) { - AVFormatContext* context = reinterpret_cast(avcl); - if (context) { - decoder = reinterpret_cast(context->opaque); - } - } else if (strcmp(avclass->class_name, "AVCodecContext") == 0) { - AVCodecContext* context = reinterpret_cast(avcl); - if (context) { - decoder = reinterpret_cast(context->opaque); - } - } else if (strcmp(avclass->class_name, "AVIOContext") == 0) { - AVIOContext* context = reinterpret_cast(avcl); - // only if opaque was assigned to Decoder pointer - if (context && context->read_packet == Decoder::readFunction) { - decoder = reinterpret_cast(context->opaque); - } - } else if (strcmp(avclass->class_name, "SWResampler") == 0) { - // expect AVCodecContext as parent - if (avclass->parent_log_context_offset) { - AVClass** parent = - *(AVClass***)(((uint8_t*)avcl) + avclass->parent_log_context_offset); - AVCodecContext* context = reinterpret_cast(parent); - if (context) { - decoder = reinterpret_cast(context->opaque); - } - } - } else if (strcmp(avclass->class_name, "SWScaler") == 0) { - // cannot find a way to pass context pointer through SwsContext struct - } else { - VLOG(2) << "Unknown context class: " << avclass->class_name; - } - - if (decoder != nullptr && decoder->enableLogLevel(level)) { - char buf[kLogBufferSize] = {0}; - // Format the line - int* prefix = 
decoder->getPrintPrefix(); - *prefix = 1; - av_log_format_line(avcl, level, cfmt, vl, buf, sizeof(buf) - 1, prefix); - // pass message to the decoder instance - std::string msg(buf); - decoder->logCallback(level, msg); - } -} - -bool Decoder::enableLogLevel(int level) const { - return ssize_t(level) <= params_.logLevel; -} - -void Decoder::logCallback(int level, const std::string& message) { - LOG(INFO) << "Msg, uuid=" << params_.loggingUuid << " level=" << level - << " msg=" << message; -} - -/* static */ -int Decoder::shutdownFunction(void* ctx) { - Decoder* decoder = (Decoder*)ctx; - if (decoder == nullptr) { - return 1; - } - return decoder->shutdownCallback(); -} - -int Decoder::shutdownCallback() { - return interrupted_ ? 1 : 0; -} - -/* static */ -int Decoder::readFunction(void* opaque, uint8_t* buf, int size) { - Decoder* decoder = reinterpret_cast(opaque); - if (decoder == nullptr) { - return 0; - } - int bytesRead = decoder->readCallback(buf, size); - return bytesRead == 0 ? AVERROR_EOF : bytesRead; -} - -/* static */ -int64_t Decoder::seekFunction(void* opaque, int64_t offset, int whence) { - Decoder* decoder = reinterpret_cast(opaque); - if (decoder == nullptr) { - return -1; - } - return decoder->seekCallback(offset, whence); -} - -int Decoder::readCallback(uint8_t* buf, int size) { - return seekableBuffer_.read(buf, size, params_.timeoutMs); -} - -int64_t Decoder::seekCallback(int64_t offset, int whence) { - return seekableBuffer_.seek(offset, whence, params_.timeoutMs); -} - -/* static */ -void Decoder::initOnce() { - static std::once_flag flagInit; - std::call_once(flagInit, []() { -#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 - av_register_all(); - avcodec_register_all(); -#endif - avformat_network_init(); - av_log_set_callback(Decoder::logFunction); - av_log_set_level(AV_LOG_ERROR); - VLOG(1) << "Registered ffmpeg libs"; - }); -} - -Decoder::Decoder() { - initOnce(); -} - -Decoder::~Decoder() { - cleanUp(); -} - -// Initialise the format 
context that holds information about the container and -// fill it with minimal information about the format (codecs are not opened -// here). Function reads in information about the streams from the container -// into inputCtx and then passes it to decoder::openStreams. Finally, if seek is -// specified within the decoder parameters, it seeks into the correct frame -// (note, the seek defined here is "precise" seek). -bool Decoder::init( - const DecoderParameters& params, - DecoderInCallback&& in, - std::vector* metadata) { - cleanUp(); - - if ((params.uri.empty() || in) && (!params.uri.empty() || !in)) { - LOG(ERROR) - << "uuid=" << params_.loggingUuid - << " either external URI gets provided or explicit input callback"; - return false; - } - - // set callback and params - params_ = params; - - if (!(inputCtx_ = avformat_alloc_context())) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " cannot allocate format context"; - return false; - } - - AVInputFormat* fmt = nullptr; - int result = 0; - if (in) { - ImageType type = ImageType::UNKNOWN; - if ((result = seekableBuffer_.init( - std::forward(in), - params_.timeoutMs, - params_.maxSeekableBytes, - params_.isImage ? 
&type : nullptr)) < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " can't initiate seekable buffer"; - cleanUp(); - return false; - } - - if (params_.isImage) { - const char* fmtName = "image2"; - switch (type) { - case ImageType::JPEG: - fmtName = "jpeg_pipe"; - break; - case ImageType::PNG: - fmtName = "png_pipe"; - break; - case ImageType::TIFF: - fmtName = "tiff_pipe"; - break; - default: - break; - } - - fmt = (AVInputFormat*)av_find_input_format(fmtName); - } - - const size_t avioCtxBufferSize = kIoBufferSize; - uint8_t* avioCtxBuffer = - (uint8_t*)av_malloc(avioCtxBufferSize + kIoPaddingSize); - if (!avioCtxBuffer) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " av_malloc cannot allocate " << avioCtxBufferSize - << " bytes"; - cleanUp(); - return false; - } - - if (!(avioCtx_ = avio_alloc_context( - avioCtxBuffer, - avioCtxBufferSize, - 0, - reinterpret_cast(this), - &Decoder::readFunction, - nullptr, - result == 1 ? &Decoder::seekFunction : nullptr))) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " avio_alloc_context failed"; - av_free(avioCtxBuffer); - cleanUp(); - return false; - } - - avioCtx_->max_packet_size = params.maxEncodedBufferSize; - - inputCtx_->pb = avioCtx_; - inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO; - } - - inputCtx_->opaque = reinterpret_cast(this); - inputCtx_->interrupt_callback.callback = Decoder::shutdownFunction; - inputCtx_->interrupt_callback.opaque = reinterpret_cast(this); - - // add network timeout - inputCtx_->flags |= AVFMT_FLAG_NONBLOCK; - - AVDictionary* options = nullptr; - if (params_.listen) { - av_dict_set_int(&options, "listen", 1, 0); - } - if (params_.timeoutMs > 0) { - av_dict_set_int(&options, "analyzeduration", params_.timeoutMs * 1000, 0); - av_dict_set_int(&options, "stimeout", params_.timeoutMs * 1000, 0); - av_dict_set_int(&options, "rw_timeout", params_.timeoutMs * 1000, 0); - if (!params_.tlsCertFile.empty()) { - av_dict_set(&options, "cert_file", params_.tlsCertFile.data(), 0); - } 
- if (!params_.tlsKeyFile.empty()) { - av_dict_set(&options, "key_file", params_.tlsKeyFile.data(), 0); - } - } - - av_dict_set_int(&options, "probesize", params_.probeSize, 0); - - interrupted_ = false; - - // ffmpeg avformat_open_input call can hang if media source doesn't respond - // set a guard for handle such situations, if requested - std::promise p; - std::future f = p.get_future(); - std::unique_ptr guard; - if (params_.preventStaleness) { - guard = std::make_unique([&f, this]() { - auto timeout = std::chrono::milliseconds(params_.timeoutMs); - if (std::future_status::timeout == f.wait_for(timeout)) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " cannot open stream within " << params_.timeoutMs - << " ms"; - interrupted_ = true; - } - }); - } - - if (fmt) { - result = avformat_open_input(&inputCtx_, nullptr, fmt, &options); - } else { - result = - avformat_open_input(&inputCtx_, params_.uri.c_str(), nullptr, &options); - } - - av_dict_free(&options); - - if (guard) { - p.set_value(true); - guard->join(); - guard.reset(); - } - - if (result < 0 || interrupted_) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " avformat_open_input failed, error=" - << Util::generateErrorDesc(result); - cleanUp(); - return false; - } - - result = avformat_find_stream_info(inputCtx_, nullptr); - - if (result < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " avformat_find_stream_info failed, error=" - << Util::generateErrorDesc(result); - cleanUp(); - return false; - } - - if (!openStreams(metadata)) { - LOG(ERROR) << "uuid=" << params_.loggingUuid << " cannot activate streams"; - cleanUp(); - return false; - } - // SyncDecoder inherits Decoder which would override onInit. - onInit(); - - if (params.startOffset != 0) { - auto offset = params.startOffset <= params.seekAccuracy - ? 
0 - : params.startOffset - params.seekAccuracy; - - av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD); - } - - for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) { - if ( -#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 - inputCtx_->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO -#else // FFMPEG 4.0+ - inputCtx_->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO -#endif - && inputCtx_->streams[i]->duration > 0) { - // There is at least two 1/r_frame_rates from the frame before the last - // one until the video duration, let's prefer to set duration after the - // frame before the last one, but as early as possible - double correction = 2 * inputCtx_->streams[i]->r_frame_rate.den / - (double)inputCtx_->streams[i]->r_frame_rate.num - - 1 / (double)AV_TIME_BASE; - videoDurationMs_ = 1000 * inputCtx_->streams[i]->duration * - inputCtx_->streams[i]->time_base.num / - (double)inputCtx_->streams[i]->time_base.den - - 1000 * correction; - break; - } - } - - VLOG(1) << "Decoder initialized, log level: " << params_.logLevel; - VLOG(1) << "Video duration: " << videoDurationMs_; - return true; -} - -// open appropriate CODEC for every type of stream and move it to the class -// variable `streams_` and make sure it is in range for decoding -bool Decoder::openStreams(std::vector* metadata) { - for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) { - // - find the corespondent format at params_.formats set - MediaFormat format; -#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 - const auto media = inputCtx_->streams[i]->codec->codec_type; -#else // FFMPEG 4.0+ - const auto media = inputCtx_->streams[i]->codecpar->codec_type; -#endif - if (!mapFfmpegType(media, &format.type)) { - VLOG(1) << "Stream media: " << media << " at index " << i - << " gets ignored, unknown type"; - - continue; // unsupported type - } - - // check format - auto it = params_.formats.find(format); - if (it == params_.formats.end()) { - VLOG(1) << "Stream type: " << 
format.type << " at index: " << i - << " gets ignored, caller is not interested"; - continue; // clients don't care about this media format - } - - // do we have stream of this type? - auto stream = findByType(format); - - // should we process this stream? - - if (it->stream == -2 || // all streams of this type are welcome - (!stream && (it->stream == -1 || it->stream == i))) { // new stream - VLOG(1) << "Stream type: " << format.type << " found, at index: " << i; - auto stream_2 = createStream( - format.type, - inputCtx_, - i, - params_.convertPtsToWallTime, - it->format, - params_.loggingUuid); - CHECK(stream_2); - if (stream_2->openCodec(metadata, params_.numThreads) < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " open codec failed, stream_idx=" << i; - return false; - } - streams_.emplace(i, std::move(stream_2)); - inRange_.set(i, true); - } - } - - return true; -} - -void Decoder::shutdown() { - cleanUp(); -} - -void Decoder::interrupt() { - interrupted_ = true; -} - -void Decoder::cleanUp() { - if (!interrupted_) { - interrupted_ = true; - } - - if (inputCtx_) { - for (auto& stream : streams_) { - // Drain stream buffers. - DecoderOutputMessage msg; - while (msg.payload = nullptr, stream.second->flush(&msg, true) > 0) { - } - stream.second.reset(); - } - streams_.clear(); - avformat_close_input(&inputCtx_); - } - if (avioCtx_) { - av_freep(&avioCtx_->buffer); - av_freep(&avioCtx_); - } - - // reset callback - seekableBuffer_.shutdown(); -} - -// function does actual work, derived class calls it in working thread -// periodically. 
On success method returns 0, ENODATA on EOF, ETIMEDOUT if -// no frames got decoded in the specified timeout time, AVERROR_BUFFER_TOO_SMALL -// when unable to allocate packet and error on unrecoverable error -int Decoder::getFrame(size_t workingTimeInMs) { - if (inRange_.none()) { - return ENODATA; - } - // decode frames until cache is full and leave thread - // once decode() method gets called and grab some bytes - // run this method again - // init package - // update 03/22: moving memory management to ffmpeg - AVPacket* avPacket; - avPacket = av_packet_alloc(); - if (avPacket == nullptr) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " decoder as not able to allocate the packet."; - return AVERROR_BUFFER_TOO_SMALL; - } - avPacket->data = nullptr; - avPacket->size = 0; - - auto end = std::chrono::steady_clock::now() + - std::chrono::milliseconds(workingTimeInMs); - // return true if elapsed time less than timeout - auto watcher = [end]() -> bool { - return std::chrono::steady_clock::now() <= end; - }; - - int result = 0; - size_t decodingErrors = 0; - bool decodedFrame = false; - while (!interrupted_ && inRange_.any() && !decodedFrame) { - if (watcher() == false) { - LOG(ERROR) << "uuid=" << params_.loggingUuid << " hit ETIMEDOUT"; - result = ETIMEDOUT; - break; - } - result = av_read_frame(inputCtx_, avPacket); - if (result == AVERROR(EAGAIN)) { - VLOG(4) << "Decoder is busy..."; - std::this_thread::yield(); - result = 0; // reset error, EAGAIN is not an error at all - // reset the packet to default settings - av_packet_unref(avPacket); - continue; - } else if (result == AVERROR_EOF) { - flushStreams(); - VLOG(1) << "End of stream"; - result = ENODATA; - break; - } else if ( - result == AVERROR(EPERM) && params_.skipOperationNotPermittedPackets) { - // reset error, lets skip packets with EPERM - result = 0; - // reset the packet to default settings - av_packet_unref(avPacket); - continue; - } else if (result < 0) { - flushStreams(); - LOG(ERROR) << 
"uuid=" << params_.loggingUuid - << " error detected: " << Util::generateErrorDesc(result); - break; - } - - // get stream; if stream cannot be found reset the packet to - // default settings - auto stream = findByIndex(avPacket->stream_index); - if (stream == nullptr || !inRange_.test(stream->getIndex())) { - av_packet_unref(avPacket); - continue; - } - - size_t numConsecutiveNoBytes = 0; - // it can be only partial decoding of the package bytes - do { - // decode package - bool gotFrame = false; - bool hasMsg = false; - // packet either got consumed completely or not at all - if ((result = processPacket( - stream, avPacket, &gotFrame, &hasMsg, params_.fastSeek)) < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " processPacket failed with code: " << result; - break; - } - - if (!gotFrame && params_.maxProcessNoBytes != 0 && - ++numConsecutiveNoBytes > params_.maxProcessNoBytes) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " exceeding max amount of consecutive no bytes"; - break; - } - if (result > 0) { - numConsecutiveNoBytes = 0; - } - - decodedFrame |= hasMsg; - } while (result == 0); - - // post loop check - if (result < 0) { - if (params_.maxPackageErrors != 0 && // check errors - ++decodingErrors >= params_.maxPackageErrors) { // reached the limit - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " exceeding max amount of consecutive package errors"; - break; - } - } else { - decodingErrors = 0; // reset on success - } - - result = 0; - - av_packet_unref(avPacket); - - if (params_.uniformSampling > 1) { - if (doSeek_) { - double duration = - videoDurationMs_ > 0 ? 
videoDurationMs_ : params_.expectedDuration; - double step = - (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1)); - avformat_seek_file( - inputCtx_, - -1, - static_cast(step * kFramesDecoded_) + 1, - static_cast(step * (kFramesDecoded_ + 1)), - static_cast(step * (kFramesDecoded_ + 1)), - 0); - ++kFramesDecoded_; - doSeek_ = false; - } - } - } - - av_packet_free(&avPacket); - VLOG(2) << "Interrupted loop" << ", interrupted_ " << interrupted_ - << ", inRange_.any() " << inRange_.any() << ", decodedFrame " - << decodedFrame << ", result " << result; - - // loop can be terminated, either by: - // 1. explicitly interrupted - // 3. unrecoverable error or ENODATA (end of stream) or ETIMEDOUT (timeout) - // 4. decoded frames pts are out of the specified range - // 5. success decoded frame - if (interrupted_) { - return EINTR; - } - if (result != 0) { - return result; - } - if (inRange_.none()) { - return ENODATA; - } - return 0; -} - -// find stream by stream index -Stream* Decoder::findByIndex(int streamIndex) const { - auto it = streams_.find(streamIndex); - return it != streams_.end() ? it->second.get() : nullptr; -} - -// find stream by type; note finds only the first stream of a given type -Stream* Decoder::findByType(const MediaFormat& format) const { - for (auto& stream : streams_) { - if (stream.second->getMediaFormat().type == format.type) { - return stream.second.get(); - } - } - return nullptr; -} - -// given the stream and packet, decode the frame buffers into the -// DecoderOutputMessage data structure via stream::decodePacket function. -int Decoder::processPacket( - Stream* stream, - AVPacket* packet, - bool* gotFrame, - bool* hasMsg, - bool fastSeek) { - // decode package - int result; - DecoderOutputMessage msg; - msg.payload = params_.headerOnly ? 
nullptr : createByteStorage(0); - *hasMsg = false; - if ((result = stream->decodePacket( - packet, &msg, params_.headerOnly, gotFrame)) >= 0 && - *gotFrame) { - // check end offset - bool endInRange = - params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; - inRange_.set(stream->getIndex(), endInRange); - // if fastseek is enabled, we're returning the first - // frame that we decode after (potential) seek. - // By default, we perform accurate seek to the closest - // following frame - bool startCondition = true; - if (!fastSeek) { - startCondition = msg.header.pts >= params_.startOffset; - } - if (endInRange && startCondition) { - *hasMsg = pushMsg(std::move(msg)); - } - } - return result; -} - -bool Decoder::pushMsg(DecoderOutputMessage&& msg) { - pastDecodedPTS_ = currentDecodedPTS_; - currentDecodedPTS_ = msg.header.pts; - - if (params_.uniformSampling <= 1) { - push(std::move(msg)); - return true; - } - - double duration = - videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration; - double step = - (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1)); - if (pastDecodedPTS_ < step * kFramesDecoded_ && - step * kFramesDecoded_ <= currentDecodedPTS_) { - push(std::move(msg)); - doSeek_ = true; - return true; - } - - return false; -} - -void Decoder::flushStreams() { - VLOG(1) << "Flushing streams..."; - for (auto& stream : streams_) { - DecoderOutputMessage msg; - while (msg.payload = (params_.headerOnly ? 
nullptr : createByteStorage(0)), - stream.second->flush(&msg, params_.headerOnly) > 0) { - // check end offset - bool endInRange = - params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; - inRange_.set(stream.second->getIndex(), endInRange); - if (endInRange && msg.header.pts >= params_.startOffset) { - pushMsg(std::move(msg)); - } else { - msg.payload.reset(); - } - } - } -} - -int Decoder::decode_all(const DecoderOutCallback& callback) { - int result; - do { - DecoderOutputMessage out; - if (0 == (result = decode(&out, params_.timeoutMs))) { - callback(std::move(out)); - } - } while (result == 0); - return result; -} -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/decoder.h b/torchvision/csrc/io/decoder/decoder.h deleted file mode 100644 index 172a011f93e..00000000000 --- a/torchvision/csrc/io/decoder/decoder.h +++ /dev/null @@ -1,100 +0,0 @@ -#pragma once - -#include -#include -#include "seekable_buffer.h" -#include "stream.h" - -#if defined(_MSC_VER) -#include -using ssize_t = SSIZE_T; -#endif - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode media streams. - * Media bytes can be explicitly provided through read-callback - * or fetched internally by FFMPEG library - */ -class Decoder : public MediaDecoder { - public: - Decoder(); - ~Decoder() override; - - // MediaDecoder overrides - bool init( - const DecoderParameters& params, - DecoderInCallback&& in, - std::vector* metadata) override; - int decode_all(const DecoderOutCallback& callback) override; - void shutdown() override; - void interrupt() override; - - protected: - // function does actual work, derived class calls it in working thread - // periodically. On success method returns 0, ENOADATA on EOF, ETIMEDOUT if - // no frames got decoded in the specified timeout time, and error on - // unrecoverable error. 
- int getFrame(size_t workingTimeInMs = 100); - - // Derived class must override method and consume the provided message - virtual void push(DecoderOutputMessage&& buffer) = 0; - - // Fires on init call - virtual void onInit() {} - - public: - // C-style FFMPEG API requires C/static methods for callbacks - static void logFunction(void* avcl, int level, const char* cfmt, va_list vl); - static int shutdownFunction(void* ctx); - static int readFunction(void* opaque, uint8_t* buf, int size); - static int64_t seekFunction(void* opaque, int64_t offset, int whence); - // can be called by any classes or API - static void initOnce(); - - int* getPrintPrefix() { - return &printPrefix; - } - double videoDurationMs_ = -1; - - private: - // mark below function for a proper invocation - bool enableLogLevel(int level) const; - void logCallback(int level, const std::string& message); - int readCallback(uint8_t* buf, int size); - int64_t seekCallback(int64_t offset, int whence); - int shutdownCallback(); - - bool openStreams(std::vector* metadata); - Stream* findByIndex(int streamIndex) const; - Stream* findByType(const MediaFormat& format) const; - int processPacket( - Stream* stream, - AVPacket* packet, - bool* gotFrame, - bool* hasMsg, - bool fastSeek = false); - void flushStreams(); - void cleanUp(); - bool pushMsg(DecoderOutputMessage&& - msg); // returns whether frame is passed to downstream - - protected: - DecoderParameters params_; - - private: - SeekableBuffer seekableBuffer_; - int printPrefix{1}; - - std::atomic interrupted_{false}; - AVFormatContext* inputCtx_{nullptr}; - AVIOContext* avioCtx_{nullptr}; - std::unordered_map> streams_; - std::bitset<64> inRange_; - int kFramesDecoded_{0}; - int64_t pastDecodedPTS_{-1}; - int64_t currentDecodedPTS_{-1}; - bool doSeek_{false}; -}; -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h deleted file mode 100644 index d2dc5c7935b..00000000000 --- 
a/torchvision/csrc/io/decoder/defs.h +++ /dev/null @@ -1,415 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -extern "C" { -#include -#include -#include -#include -#include -#include -#include "libswscale/swscale.h" -} - -namespace ffmpeg { - -// bit mask of formats, keep them in form 2^n -enum MediaType : size_t { - TYPE_AUDIO = 1, - TYPE_VIDEO = 2, - TYPE_SUBTITLE = 4, - TYPE_CC = 8, // closed captions from transport streams -}; - -// audio -struct AudioFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - bool operator==(const AudioFormat& x) const { - return x.format == format && x.samples == samples && x.channels == channels; - } - - size_t samples{0}; // number samples per second (frequency) - size_t channels{0}; // number of channels - long format{-1}; // AVSampleFormat, auto AV_SAMPLE_FMT_NONE - size_t padding[2]; - // -- alignment 40 bytes -}; - -// video -struct VideoFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - bool operator==(const VideoFormat& x) const { - return x.format == format && x.width == width && x.height == height; - } - /* - When width = 0, height = 0, minDimension = 0, and maxDimension = 0, - keep the original frame resolution - When width = 0, height = 0, minDimension != 0, and maxDimension = 0, - keep the aspect ratio and resize the frame so that shorter edge size is - minDimension - When width = 0, height = 0, minDimension = 0, and maxDimension != 0, - keep the aspect ratio and resize the frame so that longer edge size is - maxDimension - When width = 0, height = 0, minDimension != 0, and maxDimension != 0, - resize the frame so that shorter edge size is minDimension, and - longer edge size is maxDimension. 
The aspect ratio may not be preserved - When width = 0, height != 0, minDimension = 0, and maxDimension = 0, - keep the aspect ratio and resize the frame so that frame height is $height - When width != 0, height = 0, minDimension = 0, and maxDimension = 0, - keep the aspect ratio and resize the frame so that frame width is $width - When width != 0, height != 0, minDimension = 0, and maxDimension = 0, - resize the frame so that frame width and height are set to $width and - $height, - respectively - */ - size_t width{0}; // width in pixels - size_t height{0}; // height in pixels - long format{-1}; // AVPixelFormat, auto AV_PIX_FMT_NONE - size_t minDimension{0}; // choose min dimension and rescale accordingly - size_t maxDimension{0}; // choose max dimension and rescale accordingly - size_t cropImage{0}; // request image crop - // -- alignment 40 bytes -}; - -// subtitle/cc -struct SubtitleFormat { - long type{0}; // AVSubtitleType, auto SUBTITLE_NONE - size_t padding[4]; - // -- alignment 40 bytes -}; - -union FormatUnion { - FormatUnion() : audio() {} - explicit FormatUnion(int) : video() {} - explicit FormatUnion(char) : subtitle() {} - explicit FormatUnion(double) : subtitle() {} - AudioFormat audio; - VideoFormat video; - SubtitleFormat subtitle; - // -- alignment 40 bytes -}; - -/* - MediaFormat data structure serves as input/output parameter. 
- Caller assigns values for input formats - or leave default values for auto detection - For output formats all fields will be set to the specific values -*/ -struct MediaFormat { - // for using map/set data structures - bool operator<(const MediaFormat& x) const { - return type < x.type; - } - bool operator==(const MediaFormat& x) const { - if (type != x.type) { - return false; - } - switch (type) { - case TYPE_AUDIO: - return format.audio == x.format.audio; - case TYPE_VIDEO: - return format.video == x.format.video; - case TYPE_SUBTITLE: - case TYPE_CC: - return true; - default: - return false; - } - } - - explicit MediaFormat(long s = -1) : type(TYPE_AUDIO), stream(s), format() {} - explicit MediaFormat(int x, long s = -1) - : type(TYPE_VIDEO), stream(s), format(x) {} - explicit MediaFormat(char x, long s = -1) - : type(TYPE_SUBTITLE), stream(s), format(x) {} - explicit MediaFormat(double x, long s = -1) - : type(TYPE_CC), stream(s), format(x) {} - - static MediaFormat makeMediaFormat(AudioFormat format, long stream) { - MediaFormat result(stream); - result.format.audio = format; - return result; - } - - static MediaFormat makeMediaFormat(VideoFormat format, long stream) { - MediaFormat result(0, stream); - result.format.video = format; - return result; - } - - static MediaFormat makeMediaFormat(SubtitleFormat format, long stream) { - MediaFormat result('0', stream); - result.format.subtitle = format; - return result; - } - - // format type - MediaType type; - // stream index: - // set -1 for one stream auto detection, -2 for all streams auto detection, - // >= 0, specified stream, if caller knows the stream index (unlikely) - long stream; - // union keeps one of the possible formats, defined by MediaType - FormatUnion format; -}; - -struct DecoderParameters { - // local file, remote file, http url, rtmp stream uri, etc. 
anything that - // ffmpeg can recognize - std::string uri{std::string()}; - // timeout on getting bytes for decoding - size_t timeoutMs{1000}; - // logging level, default AV_LOG_PANIC - long logLevel{0}; - // when decoder would give up, 0 means never - size_t maxPackageErrors{0}; - // max allowed consecutive times no bytes are processed. 0 means for infinite. - size_t maxProcessNoBytes{0}; - // start offset (us) - long startOffset{0}; - // end offset (us) - long endOffset{-1}; - // logging id - int64_t loggingUuid{0}; - // internal max seekable buffer size - size_t maxSeekableBytes{0}; - // adjust header pts to the epoch time - bool convertPtsToWallTime{false}; - // indicate if input stream is an encoded image - bool isImage{false}; - // listen and wait for new rtmp stream - bool listen{false}; - // don't copy frame body, only header - bool headerOnly{false}; - // enable fast seek (seek only to keyframes) - bool fastSeek{false}; - // interrupt init method on timeout - bool preventStaleness{true}; - // seek tolerated accuracy (us) - double seekAccuracy{1000000.0}; - // Allow multithreaded decoding for numThreads > 1; - // 0 numThreads=0 sets up sensible defaults - int numThreads{1}; - // what media types should be processed, default none - std::set formats; - - // can be used for asynchronous decoders - size_t cacheSize{8192}; // mow many bytes to cache before stop reading bytes - size_t cacheTimeoutMs{1000}; // timeout on bytes writing - bool enforceCacheSize{false}; // drop output frames if cache is full - bool mergeAudioMessages{false}; // combine collocated audio messages together - - std::string tlsCertFile; - std::string tlsKeyFile; - - // Skip packets that fail with EPERM errors and continue decoding. - bool skipOperationNotPermittedPackets{false}; - - // probing size in bytes, i.e. the size of the data to analyze to get stream - // information. 
A higher value will enable detecting more information in case - // it is dispersed into the stream, but will increase latency. Must be an - // integer not lesser than 32. It is 5000000 by default. - int64_t probeSize{5000000}; - - // Expected duration of the video to be decoded, mainly used with uniform - // sampling - float expectedDuration{0.0f}; - - // Sample N key-frames from the video roughly uniformly across the timeline - int uniformSampling{0}; - - // with 0, ffmpeg allocates buffers of size 32768 bytes for encoded frames. - // Override this with bigger buffer size if needed. - int64_t maxEncodedBufferSize{0}; -}; - -struct DecoderHeader { - // message id, from 0 till ... - size_t seqno{0}; - // decoded timestamp in microseconds from either beginning of the stream or - // from epoch time, see DecoderParameters::convertPtsToWallTime - long pts{0}; - // decoded key frame - size_t keyFrame{0}; - // frames per second, valid only for video streams - double fps{0}; - // format specifies what kind frame is in a payload - MediaFormat format; -}; - -// Abstract interface ByteStorage class -class ByteStorage { - public: - virtual ~ByteStorage() = default; - // makes sure that buffer has at least n bytes available for writing, if not - // storage must reallocate memory. 
- virtual void ensure(size_t n) = 0; - // caller must not to write more than available bytes - virtual uint8_t* writableTail() = 0; - // caller confirms that n bytes were written to the writable tail - virtual void append(size_t n) = 0; - // caller confirms that n bytes were read from the read buffer - virtual void trim(size_t n) = 0; - // gives an access to the beginning of the read buffer - virtual const uint8_t* data() const = 0; - // returns the stored size in bytes - virtual size_t length() const = 0; - // returns available capacity for writable tail - virtual size_t tail() const = 0; - // clears content, keeps capacity - virtual void clear() = 0; -}; - -struct DecoderOutputMessage { - DecoderHeader header; - std::unique_ptr payload; -}; - -/* - * External provider of the ecnoded bytes, specific implementation is left for - * different use cases, like file, memory, external network end-points, etc. - * Normally input/output parameter @out set to valid, not null buffer pointer, - * which indicates "read" call, however there are "seek" modes as well. - - * @out != nullptr => read from the current offset, @whence got ignored, - * @size bytes to read => return number bytes got read, 0 if no more bytes - * available, < 0 on error. - - * @out == nullptr, @timeoutMs == 0 => does provider support "seek" - * capability in a first place? @size & @whence got ignored, return 0 on - * success, < 0 if "seek" mode is not supported. - - * @out == nullptr, @timeoutMs != 0 => normal seek call - * offset == @size, i.e. @whence = [SEEK_SET, SEEK_CUR, SEEK_END, AVSEEK_SIZE) - * return < 0 on error, position if @whence = [SEEK_SET, SEEK_CUR, SEEK_END], - * length of buffer if @whence = [AVSEEK_SIZE]. 
- */ -using DecoderInCallback = - std::function; - -using DecoderOutCallback = std::function; - -struct DecoderMetadata { - // time base numerator - long num{0}; - // time base denominator - long den{1}; - // duration of the stream, in miscroseconds, if available - long duration{-1}; - // frames per second, valid only for video streams - double fps{0}; - // format specifies what kind frame is in a payload - MediaFormat format; -}; -/** - * Abstract class for decoding media bytes - * It has two different modes. Internal media bytes retrieval for given uri and - * external media bytes provider in case of memory streams - */ -class MediaDecoder { - public: - virtual ~MediaDecoder() = default; - - /** - * Initializes media decoder with parameters, - * calls callback when media bytes are available. - * Media bytes get fetched internally from provided URI - * or invokes provided input callback to get media bytes. - * Input callback must be empty for the internal media provider - * Caller can provide non-null pointer for the input container - * if headers to obtain the streams metadata (optional) - */ - virtual bool init( - const DecoderParameters& params, - DecoderInCallback&& in, - std::vector* metadata) = 0; - - /** - * Polls available decoded one frame from decoder - * Returns error code, 0 - for success - */ - virtual int decode(DecoderOutputMessage* out, uint64_t timeoutMs) = 0; - - /** - * Polls available decoded bytes from decoder, till EOF or error - */ - virtual int decode_all(const DecoderOutCallback& callback) = 0; - - /** - * Stops calling callback, releases resources - */ - virtual void shutdown() = 0; - - /** - * Interrupts whatever decoder is doing at any time - */ - virtual void interrupt() = 0; - - /** - * Factory to create ByteStorage class instances, particular implementation is - * left to the derived class. 
Caller provides the initially allocated size - */ - virtual std::unique_ptr createByteStorage(size_t n) = 0; -}; - -struct SamplerParameters { - MediaType type{TYPE_AUDIO}; - FormatUnion in; - FormatUnion out; - int64_t loggingUuid{0}; -}; - -/** - * Abstract class for sampling media bytes - */ -class MediaSampler { - public: - virtual ~MediaSampler() = default; - - /** - * Initializes media sampler with parameters - */ - virtual bool init(const SamplerParameters& params) = 0; - - /** - * Samples media bytes - * Returns error code < 0, or >=0 - for success, indicating number of bytes - * processed. - * set @in to null for flushing data - */ - virtual int sample(const ByteStorage* in, ByteStorage* out) = 0; - - /** - * Releases resources - */ - virtual void shutdown() = 0; - - /* - * Returns media type - */ - MediaType getMediaType() const { - return params_.type; - } - /* - * Returns formats - */ - FormatUnion getInputFormat() const { - return params_.in; - } - FormatUnion getOutFormat() const { - return params_.out; - } - - protected: - SamplerParameters params_; -}; -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/memory_buffer.cpp b/torchvision/csrc/io/decoder/memory_buffer.cpp deleted file mode 100644 index 4e420c3b3cd..00000000000 --- a/torchvision/csrc/io/decoder/memory_buffer.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include "memory_buffer.h" -#include - -namespace ffmpeg { - -MemoryBuffer::MemoryBuffer(const uint8_t* buffer, size_t size) - : buffer_(buffer), len_(size) {} - -int MemoryBuffer::read(uint8_t* buf, int size) { - if (pos_ < len_) { - auto available = std::min(int(len_ - pos_), size); - memcpy(buf, buffer_ + pos_, available); - pos_ += available; - return available; - } - - return 0; -} - -int64_t MemoryBuffer::seek(int64_t offset, int whence) { - if (whence & AVSEEK_SIZE) { - return len_; - } - - // remove force flag - whence &= ~AVSEEK_FORCE; - - switch (whence) { - case SEEK_SET: - if (offset >= 0 && offset <= len_) { - pos_ = 
offset; - } - break; - case SEEK_END: - if (len_ + offset >= 0 && len_ + offset <= len_) { - pos_ = len_ + offset; - } - break; - case SEEK_CUR: - if (pos_ + offset > 0 && pos_ + offset <= len_) { - pos_ += offset; - } - break; - default: - LOG(ERROR) << "Unknown whence flag gets provided: " << whence; - } - return pos_; -} - -/* static */ -DecoderInCallback MemoryBuffer::getCallback( - const uint8_t* buffer, - size_t size) { - MemoryBuffer object(buffer, size); - return - [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable - -> int { - if (out) { // see defs.h file - // read mode - return object.read(out, size); - } - // seek mode - if (!timeoutMs) { - // seek capability, yes - supported - return 0; - } - return object.seek(size, whence); - }; -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/memory_buffer.h b/torchvision/csrc/io/decoder/memory_buffer.h deleted file mode 100644 index 909626d3cae..00000000000 --- a/torchvision/csrc/io/decoder/memory_buffer.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class uses external memory buffer and implements a seekable interface. - */ -class MemoryBuffer { - public: - explicit MemoryBuffer(const uint8_t* buffer, size_t size); - int64_t seek(int64_t offset, int whence); - int read(uint8_t* buf, int size); - - // static constructor for decoder callback. 
- static DecoderInCallback getCallback(const uint8_t* buffer, size_t size); - - private: - const uint8_t* buffer_; // set at construction time - long pos_{0}; // current position - long len_{0}; // bytes in buffer -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/seekable_buffer.cpp b/torchvision/csrc/io/decoder/seekable_buffer.cpp deleted file mode 100644 index 41e3e689c7b..00000000000 --- a/torchvision/csrc/io/decoder/seekable_buffer.cpp +++ /dev/null @@ -1,139 +0,0 @@ -#include "seekable_buffer.h" -#include -#include -#include "memory_buffer.h" - -namespace ffmpeg { - -int SeekableBuffer::init( - DecoderInCallback&& in, - uint64_t timeoutMs, - size_t maxSeekableBytes, - ImageType* type) { - shutdown(); - isSeekable_ = in(nullptr, 0, 0, 0) == 0; - if (isSeekable_) { // seekable - if (type) { - if (!readBytes(in, 8, timeoutMs)) { - return -1; - } - setImageType(type); - end_ = 0; - eof_ = false; - std::vector().swap(buffer_); - // reset callback - if (in(nullptr, 0, SEEK_SET, timeoutMs)) { - return -1; - } - } - inCallback_ = std::forward(in); - return 1; - } - - if (!readBytes(in, maxSeekableBytes + (type ? 
8 : 0), timeoutMs)) { - return -1; - } - - if (type) { - setImageType(type); - } - - if (eof_) { - end_ = 0; - eof_ = false; - // reuse MemoryBuffer functionality - inCallback_ = MemoryBuffer::getCallback(buffer_.data(), buffer_.size()); - isSeekable_ = true; - return 1; - } - inCallback_ = std::forward(in); - return 0; -} - -bool SeekableBuffer::readBytes( - DecoderInCallback& in, - size_t maxBytes, - uint64_t timeoutMs) { - // Resize to th minimum 4K page or less - buffer_.resize(std::min(maxBytes, size_t(4 * 1024UL))); - end_ = 0; - eof_ = false; - - auto end = - std::chrono::steady_clock::now() + std::chrono::milliseconds(timeoutMs); - auto watcher = [end]() -> bool { - return std::chrono::steady_clock::now() <= end; - }; - - bool hasTime = true; - while (!eof_ && end_ < maxBytes && (hasTime = watcher())) { - // lets read all bytes into available buffer - auto res = in(buffer_.data() + end_, buffer_.size() - end_, 0, timeoutMs); - if (res > 0) { - end_ += res; - if (end_ == buffer_.size()) { - buffer_.resize(std::min(size_t(end_ * 4UL), maxBytes)); - } - } else if (res == 0) { - eof_ = true; - } else { - // error - return false; - } - } - - buffer_.resize(end_); - - return hasTime; -} - -void SeekableBuffer::setImageType(ImageType* type) { - if (buffer_.size() > 2 && buffer_[0] == 0xFF && buffer_[1] == 0xD8 && - buffer_[2] == 0xFF) { - *type = ImageType::JPEG; - } else if ( - buffer_.size() > 3 && buffer_[1] == 'P' && buffer_[2] == 'N' && - buffer_[3] == 'G') { - *type = ImageType::PNG; - } else if ( - buffer_.size() > 1 && - ((buffer_[0] == 0x49 && buffer_[1] == 0x49) || - (buffer_[0] == 0x4D && buffer_[1] == 0x4D))) { - *type = ImageType::TIFF; - } else { - *type = ImageType::UNKNOWN; - } -} - -int SeekableBuffer::read(uint8_t* buf, int size, uint64_t timeoutMs) { - if (isSeekable_) { - return inCallback_(buf, size, 0, timeoutMs); - } - if (pos_ < end_) { - // read cached bytes for non-seekable callback - auto available = std::min(int(end_ - pos_), size); - 
memcpy(buf, buffer_.data() + pos_, available); - pos_ += available; - return available; - } else if (!eof_) { - // normal sequential read (see defs.h file), i.e. @buf != null - auto res = inCallback_(buf, size, 0, timeoutMs); // read through - eof_ = res == 0; - return res; - } else { - return 0; - } -} - -int64_t SeekableBuffer::seek(int64_t offset, int whence, uint64_t timeoutMs) { - return inCallback_(nullptr, offset, whence, timeoutMs); -} - -void SeekableBuffer::shutdown() { - pos_ = end_ = 0; - eof_ = false; - std::vector().swap(buffer_); - inCallback_ = nullptr; -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/seekable_buffer.h b/torchvision/csrc/io/decoder/seekable_buffer.h deleted file mode 100644 index 9d5729f5306..00000000000 --- a/torchvision/csrc/io/decoder/seekable_buffer.h +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class uses internal buffer to store initial size bytes as a seekable cache - * from Media provider and let ffmpeg to seek and read bytes from cache - * and beyond - reading bytes directly from Media provider - */ -enum class ImageType { - UNKNOWN = 0, - JPEG = 1, - PNG = 2, - TIFF = 3, -}; - -class SeekableBuffer { - public: - // @type is optional, not nullptr only is image detection required - // \returns 1 is buffer seekable, 0 - if not seekable, < 0 on error - int init( - DecoderInCallback&& in, - uint64_t timeoutMs, - size_t maxSeekableBytes, - ImageType* type); - int read(uint8_t* buf, int size, uint64_t timeoutMs); - int64_t seek(int64_t offset, int whence, uint64_t timeoutMs); - void shutdown(); - - private: - bool readBytes(DecoderInCallback& in, size_t maxBytes, uint64_t timeoutMs); - void setImageType(ImageType* type); - - private: - DecoderInCallback inCallback_; - std::vector buffer_; // resized at init time - long pos_{0}; // current position (SEEK_CUR iff pos_ < end_) - long end_{0}; // current buffer size - bool eof_{0}; // indicates the EOF - bool 
isSeekable_{false}; // is callback seekable -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/stream.cpp b/torchvision/csrc/io/decoder/stream.cpp deleted file mode 100644 index 7969741e72c..00000000000 --- a/torchvision/csrc/io/decoder/stream.cpp +++ /dev/null @@ -1,288 +0,0 @@ -#include "stream.h" -#include -#include -#include "util.h" - -namespace ffmpeg { -const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE}; - -Stream::Stream( - AVFormatContext* inputCtx, - MediaFormat format, - bool convertPtsToWallTime, - int64_t loggingUuid) - : inputCtx_(inputCtx), - format_(format), - convertPtsToWallTime_(convertPtsToWallTime), - loggingUuid_(loggingUuid) {} - -Stream::~Stream() { - if (frame_) { - av_free(frame_); - } - if (codecCtx_) { - avcodec_free_context(&codecCtx_); - } -} - -// look up the proper CODEC querying the function -AVCodec* Stream::findCodec(AVCodecParameters* params) { - return (AVCodec*)avcodec_find_decoder(params->codec_id); -} - -// Allocate memory for the AVCodecContext, which will hold the context for -// decode/encode process. Then fill this codec context with CODEC parameters -// defined in stream parameters. 
Open the codec, and allocate the global frame -// defined in the header file -int Stream::openCodec(std::vector* metadata, int num_threads) { - AVStream* steam = inputCtx_->streams[format_.stream]; - - AVCodec* codec = findCodec(steam->codecpar); - if (!codec) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_find_decoder failed for codec_id: " - << int(steam->codecpar->codec_id); - return AVERROR(EINVAL); - } - - if (!(codecCtx_ = avcodec_alloc_context3(codec))) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_alloc_context3 failed"; - return AVERROR(ENOMEM); - } - // multithreading heuristics - // if user defined, - if (num_threads > max_threads) { - num_threads = max_threads; - } - - if (num_threads > 0) { - // if user defined, respect that - // note that default thread_type will be used - codecCtx_->thread_count = num_threads; - } else { - // otherwise set sensible defaults - codecCtx_->thread_count = 8; - codecCtx_->thread_type = FF_THREAD_SLICE; - } - - int ret; - // Copy codec parameters from input stream to output codec context - if ((ret = avcodec_parameters_to_context(codecCtx_, steam->codecpar)) < 0) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_parameters_to_context failed"; - return ret; - } - - // after avcodec_open2, value of codecCtx_->time_base is NOT meaningful - if ((ret = avcodec_open2(codecCtx_, codec, nullptr)) < 0) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_open2 failed: " << Util::generateErrorDesc(ret); - avcodec_free_context(&codecCtx_); - codecCtx_ = nullptr; - return ret; - } - - frame_ = av_frame_alloc(); - - switch (format_.type) { - case TYPE_VIDEO: - fps_ = av_q2d(av_guess_frame_rate(inputCtx_, steam, nullptr)); - break; - case TYPE_AUDIO: - fps_ = codecCtx_->sample_rate; - break; - default: - fps_ = 30.0; - } - - if ((ret = initFormat())) { - LOG(ERROR) << "initFormat failed, type: " << format_.type; - } - - if (metadata) { - DecoderMetadata header; - 
header.format = format_; - header.fps = fps_; - header.num = steam->time_base.num; - header.den = steam->time_base.den; - header.duration = - av_rescale_q(steam->duration, steam->time_base, timeBaseQ); - metadata->push_back(header); - } - - return ret; -} - -// send the raw data packet (compressed frame) to the decoder, through the codec -// context and receive the raw data frame (uncompressed frame) from the -// decoder, through the same codec context -int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) { - int consumed = 0; - int result = avcodec_send_packet(codecCtx_, packet); - if (result == AVERROR(EAGAIN)) { - *gotFrame = false; // no bytes get consumed, fetch frame - } else if (result == AVERROR_EOF) { - *gotFrame = false; // more than one flush packet - if (packet) { - // got packet after flush, this is an error - return result; - } - } else if (result < 0) { - LOG(ERROR) << "avcodec_send_packet failed, err: " - << Util::generateErrorDesc(result); - return result; // error - } else { - consumed = packet ? packet->size : 0; // all bytes get consumed - } - - result = avcodec_receive_frame(codecCtx_, frame_); - - if (result >= 0) { - *gotFrame = true; // frame is available - } else if (result == AVERROR(EAGAIN)) { - *gotFrame = false; // no frames at this time, needs more packets - if (!consumed) { - // precaution, if no packages got consumed and no frames are available - return result; - } - } else if (result == AVERROR_EOF) { - *gotFrame = false; // the last frame has been flushed - // precaution, if no more frames are available assume we consume all bytes - consumed = 0; - } else { // error - LOG(ERROR) << "avcodec_receive_frame failed, err: " - << Util::generateErrorDesc(result); - return result; - } - return consumed; -} - -// General decoding function: -// given the packet, analyse the metadata, and write the -// metadata and the buffer to the DecoderOutputImage. 
-int Stream::decodePacket( - const AVPacket* packet, - DecoderOutputMessage* out, - bool headerOnly, - bool* hasMsg) { - int consumed; - bool gotFrame = false; - *hasMsg = false; - if ((consumed = analyzePacket(packet, &gotFrame)) >= 0 && - (packet == nullptr || gotFrame)) { - int result; - if ((result = getMessage(out, !gotFrame, headerOnly)) < 0) { - return result; // report error - } - *hasMsg = result > 0; - } - return consumed; -} - -int Stream::flush(DecoderOutputMessage* out, bool headerOnly) { - bool hasMsg = false; - int result = decodePacket(nullptr, out, headerOnly, &hasMsg); - if (result < 0) { - avcodec_flush_buffers(codecCtx_); - return result; - } - if (!hasMsg) { - avcodec_flush_buffers(codecCtx_); - return 0; - } - return 1; -} - -// Sets the header and payload via stream::setHeader and copyFrameBytes -// functions that are defined in type stream subclass (VideoStream, AudioStream, -// ...) -int Stream::getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly) { - if (flush) { - // only flush of audio frames makes sense - if (format_.type == TYPE_AUDIO) { - int processed = 0; - size_t total = 0; - // grab all audio bytes by chunks - do { - if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) { - return processed; - } - total += processed; - } while (processed); - - if (total) { - // set header if message bytes are available - setHeader(&out->header, flush); - return 1; - } - } - return 0; - } else { - if (format_.type == TYPE_AUDIO) { - int processed = 0; - if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) { - return processed; - } - if (processed) { - // set header if message bytes are available - setHeader(&out->header, flush); - return 1; - } - return 0; - } else { - // set header - setHeader(&out->header, flush); - - if (headerOnly) { - // Only header is requisted - return 1; - } - - return copyFrameBytes(out->payload.get(), flush); - } - } -} - -void Stream::setHeader(DecoderHeader* header, bool flush) { - 
header->seqno = numGenerator_++; - - setFramePts(header, flush); - - if (convertPtsToWallTime_) { - keeper_.adjust(header->pts); - } - - header->format = format_; - header->keyFrame = 0; - header->fps = std::numeric_limits::quiet_NaN(); -} - -void Stream::setFramePts(DecoderHeader* header, bool flush) { - if (flush) { - header->pts = nextPts_; // already in us - } else { - header->pts = frame_->best_effort_timestamp; - if (header->pts == AV_NOPTS_VALUE) { - header->pts = nextPts_; - } else { - header->pts = av_rescale_q( - header->pts, - inputCtx_->streams[format_.stream]->time_base, - timeBaseQ); - } - - switch (format_.type) { - case TYPE_AUDIO: - nextPts_ = header->pts + frame_->nb_samples * AV_TIME_BASE / fps_; - break; - case TYPE_VIDEO: - nextPts_ = header->pts + AV_TIME_BASE / fps_; - break; - default: - nextPts_ = header->pts; - } - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/stream.h b/torchvision/csrc/io/decoder/stream.h deleted file mode 100644 index 6250dd9ecd2..00000000000 --- a/torchvision/csrc/io/decoder/stream.h +++ /dev/null @@ -1,80 +0,0 @@ -#pragma once - -#include -#include "defs.h" -#include "time_keeper.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one media stream (audio or video). 
- */ - -class Stream { - public: - Stream( - AVFormatContext* inputCtx, - MediaFormat format, - bool convertPtsToWallTime, - int64_t loggingUuid); - virtual ~Stream(); - - // returns 0 - on success or negative error - // num_threads sets up the codec context for multithreading if needed - // default is set to single thread in order to not break BC - int openCodec(std::vector* metadata, int num_threads = 1); - // returns 1 - if packet got consumed, 0 - if it's not, and < 0 on error - int decodePacket( - const AVPacket* packet, - DecoderOutputMessage* out, - bool headerOnly, - bool* hasMsg); - // returns stream index - int getIndex() const { - return format_.stream; - } - // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error - int flush(DecoderOutputMessage* out, bool headerOnly); - // return media format - MediaFormat getMediaFormat() const { - return format_; - } - - protected: - virtual int initFormat() = 0; - // returns number processed bytes from packet, or negative error - virtual int analyzePacket(const AVPacket* packet, bool* gotFrame); - // returns number processed bytes from packet, or negative error - virtual int copyFrameBytes(ByteStorage* out, bool flush) = 0; - // sets output format - virtual void setHeader(DecoderHeader* header, bool flush); - // set frame pts - virtual void setFramePts(DecoderHeader* header, bool flush); - // finds codec - virtual AVCodec* findCodec(AVCodecParameters* params); - - private: - // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error - int getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly); - - protected: - AVFormatContext* const inputCtx_; - MediaFormat format_; - const bool convertPtsToWallTime_; - int64_t loggingUuid_; - - AVCodecContext* codecCtx_{nullptr}; - AVFrame* frame_{nullptr}; - - std::atomic numGenerator_{0}; - TimeKeeper keeper_; - // estimated next frame pts for flushing the last frame - int64_t nextPts_{0}; - double fps_{30.}; - // this is a dumb 
conservative limit; ideally we'd use - // int max_threads = at::get_num_threads(); but this would cause - // fb sync to fail as it would add dependency to ATen to the decoder API - const int max_threads = 12; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_sampler.cpp b/torchvision/csrc/io/decoder/subtitle_sampler.cpp deleted file mode 100644 index d0df24d3e35..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_sampler.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "subtitle_sampler.h" -#include -#include "util.h" - -namespace ffmpeg { - -SubtitleSampler::~SubtitleSampler() { - cleanUp(); -} - -void SubtitleSampler::shutdown() { - cleanUp(); -} - -bool SubtitleSampler::init(const SamplerParameters& params) { - cleanUp(); - // set formats - params_ = params; - return true; -} - -int SubtitleSampler::sample(AVSubtitle* sub, ByteStorage* out) { - if (!sub || !out) { - return 0; // flush - } - - out->ensure(Util::size(*sub)); - - return Util::serialize(*sub, out); -} - -int SubtitleSampler::sample(const ByteStorage* in, ByteStorage* out) { - if (in && out) { - // Get a writable copy - if (size_t len = in->length()) { - out->ensure(len); - memcpy(out->writableTail(), in->data(), len); - } - return out->length(); - } - return 0; -} - -void SubtitleSampler::cleanUp() {} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_sampler.h b/torchvision/csrc/io/decoder/subtitle_sampler.h deleted file mode 100644 index 4aee811ed56..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_sampler.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class transcode audio frames from one format into another - */ - -class SubtitleSampler : public MediaSampler { - public: - SubtitleSampler() = default; - ~SubtitleSampler() override; - - bool init(const SamplerParameters& params) override; - int sample(const ByteStorage* in, ByteStorage* out) override; - void shutdown() override; - - // 
returns number processed/scaling bytes - int sample(AVSubtitle* sub, ByteStorage* out); - - // helper serialization/deserialization methods - static void serialize(const AVSubtitle& sub, ByteStorage* out); - static bool deserialize(const ByteStorage& buf, AVSubtitle* sub); - - private: - // close resources - void cleanUp(); -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_stream.cpp b/torchvision/csrc/io/decoder/subtitle_stream.cpp deleted file mode 100644 index 3416f702d7e..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_stream.cpp +++ /dev/null @@ -1,96 +0,0 @@ -#include "subtitle_stream.h" -#include -#include "util.h" - -namespace ffmpeg { -const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE}; - -SubtitleStream::SubtitleStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format) - : Stream( - inputCtx, - MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime, - 0) { - memset(&sub_, 0, sizeof(sub_)); -} - -void SubtitleStream::releaseSubtitle() { - if (sub_.release) { - avsubtitle_free(&sub_); - memset(&sub_, 0, sizeof(sub_)); - } -} - -SubtitleStream::~SubtitleStream() { - releaseSubtitle(); - sampler_.shutdown(); -} - -int SubtitleStream::initFormat() { - if (!codecCtx_->subtitle_header) { - LOG(ERROR) << "No subtitle header found"; - } else { - VLOG(1) << "Subtitle header found!"; - } - return 0; -} - -int SubtitleStream::analyzePacket(const AVPacket* packet, bool* gotFrame) { - // clean-up - releaseSubtitle(); - - // FIXME: should this even be created? - AVPacket* avPacket; - avPacket = av_packet_alloc(); - if (avPacket == nullptr) { - LOG(ERROR) - << "decoder as not able to allocate the subtitle-specific packet."; - // alternative to ENOMEM - return AVERROR_BUFFER_TOO_SMALL; - } - avPacket->data = nullptr; - avPacket->size = 0; - // check flush packet - auto pkt = packet ? 
packet : avPacket; - - int gotFramePtr = 0; - // is these a better way than cast from const? - int result = - avcodec_decode_subtitle2(codecCtx_, &sub_, &gotFramePtr, (AVPacket*)pkt); - - if (result < 0) { - LOG(ERROR) << "avcodec_decode_subtitle2 failed, err: " - << Util::generateErrorDesc(result); - // free the packet we've created - av_packet_free(&avPacket); - return result; - } else if (result == 0) { - result = pkt->size; // discard the rest of the package - } - - sub_.release = gotFramePtr; - *gotFrame = gotFramePtr > 0; - - // set proper pts in us - if (gotFramePtr) { - sub_.pts = av_rescale_q( - pkt->pts, inputCtx_->streams[format_.stream]->time_base, timeBaseQ); - } - - av_packet_free(&avPacket); - return result; -} - -int SubtitleStream::copyFrameBytes(ByteStorage* out, bool flush) { - return sampler_.sample(flush ? nullptr : &sub_, out); -} - -void SubtitleStream::setFramePts(DecoderHeader* header, bool) { - header->pts = sub_.pts; // already in us -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_stream.h b/torchvision/csrc/io/decoder/subtitle_stream.h deleted file mode 100644 index 6c366e11f50..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_stream.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include "stream.h" -#include "subtitle_sampler.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one subtitle stream. 
- */ -struct AVSubtitleKeeper : AVSubtitle { - int64_t release{0}; -}; - -class SubtitleStream : public Stream { - public: - SubtitleStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format); - ~SubtitleStream() override; - - protected: - void setFramePts(DecoderHeader* header, bool flush) override; - - private: - int initFormat() override; - int analyzePacket(const AVPacket* packet, bool* gotFrame) override; - int copyFrameBytes(ByteStorage* out, bool flush) override; - void releaseSubtitle(); - - private: - SubtitleSampler sampler_; - AVSubtitleKeeper sub_; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/sync_decoder.cpp b/torchvision/csrc/io/decoder/sync_decoder.cpp deleted file mode 100644 index 1f03ef8eb95..00000000000 --- a/torchvision/csrc/io/decoder/sync_decoder.cpp +++ /dev/null @@ -1,97 +0,0 @@ -#include "sync_decoder.h" -#include - -namespace ffmpeg { - -SyncDecoder::AVByteStorage::AVByteStorage(size_t n) { - ensure(n); -} - -SyncDecoder::AVByteStorage::~AVByteStorage() { - av_free(buffer_); -} - -void SyncDecoder::AVByteStorage::ensure(size_t n) { - if (tail() < n) { - capacity_ = offset_ + length_ + n; - buffer_ = static_cast(av_realloc(buffer_, capacity_)); - } -} - -uint8_t* SyncDecoder::AVByteStorage::writableTail() { - TORCH_CHECK_LE(offset_ + length_, capacity_); - return buffer_ + offset_ + length_; -} - -void SyncDecoder::AVByteStorage::append(size_t n) { - TORCH_CHECK_LE(n, tail()); - length_ += n; -} - -void SyncDecoder::AVByteStorage::trim(size_t n) { - TORCH_CHECK_LE(n, length_); - offset_ += n; - length_ -= n; -} - -const uint8_t* SyncDecoder::AVByteStorage::data() const { - return buffer_ + offset_; -} - -size_t SyncDecoder::AVByteStorage::length() const { - return length_; -} - -size_t SyncDecoder::AVByteStorage::tail() const { - TORCH_CHECK_LE(offset_ + length_, capacity_); - return capacity_ - offset_ - length_; -} - -void SyncDecoder::AVByteStorage::clear() { - 
offset_ = 0; - length_ = 0; -} - -std::unique_ptr SyncDecoder::createByteStorage(size_t n) { - return std::make_unique(n); -} - -void SyncDecoder::onInit() { - eof_ = false; - queue_.clear(); -} - -int SyncDecoder::decode(DecoderOutputMessage* out, uint64_t timeoutMs) { - if (eof_ && queue_.empty()) { - return ENODATA; - } - - if (queue_.empty()) { - int result = getFrame(timeoutMs); - // assign EOF - eof_ = result == ENODATA; - // check unrecoverable error, any error but ENODATA - if (result && result != ENODATA) { - return result; - } - - // still empty - if (queue_.empty()) { - if (eof_) { - return ENODATA; - } else { - LOG(INFO) << "Queue is empty"; - return ETIMEDOUT; - } - } - } - - *out = std::move(queue_.front()); - queue_.pop_front(); - return 0; -} - -void SyncDecoder::push(DecoderOutputMessage&& buffer) { - queue_.push_back(std::move(buffer)); -} -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/sync_decoder.h b/torchvision/csrc/io/decoder/sync_decoder.h deleted file mode 100644 index b7cf7b625ac..00000000000 --- a/torchvision/csrc/io/decoder/sync_decoder.h +++ /dev/null @@ -1,48 +0,0 @@ -#pragma once - -#include -#include "decoder.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode media streams. - * Media bytes can be explicitly provided through read-callback - * or fetched internally by FFMPEG library - */ -class SyncDecoder : public Decoder { - public: - // Allocation of memory must be done with a proper alignment. 
- class AVByteStorage : public ByteStorage { - public: - explicit AVByteStorage(size_t n); - ~AVByteStorage() override; - void ensure(size_t n) override; - uint8_t* writableTail() override; - void append(size_t n) override; - void trim(size_t n) override; - const uint8_t* data() const override; - size_t length() const override; - size_t tail() const override; - void clear() override; - - private: - size_t offset_{0}; - size_t length_{0}; - size_t capacity_{0}; - uint8_t* buffer_{nullptr}; - }; - - public: - int decode(DecoderOutputMessage* out, uint64_t timeoutMs) override; - - private: - void push(DecoderOutputMessage&& buffer) override; - void onInit() override; - std::unique_ptr createByteStorage(size_t n) override; - - private: - std::list queue_; - bool eof_{false}; -}; -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/sync_decoder_test.cpp b/torchvision/csrc/io/decoder/sync_decoder_test.cpp deleted file mode 100644 index 085966ce687..00000000000 --- a/torchvision/csrc/io/decoder/sync_decoder_test.cpp +++ /dev/null @@ -1,416 +0,0 @@ -#include -#include -#include -#include "memory_buffer.h" -#include "sync_decoder.h" -#include "util.h" - -using namespace ffmpeg; - -namespace { -struct VideoFileStats { - std::string name; - size_t durationPts{0}; - int num{0}; - int den{0}; - int fps{0}; -}; - -void gotAllTestFiles( - const std::string& folder, - std::vector* stats) { - DIR* d = opendir(folder.c_str()); - CHECK(d); - struct dirent* dir; - while ((dir = readdir(d))) { - if (dir->d_type != DT_DIR && 0 != strcmp(dir->d_name, "README")) { - VideoFileStats item; - item.name = folder + '/' + dir->d_name; - LOG(INFO) << "Found video file: " << item.name; - stats->push_back(std::move(item)); - } - } - closedir(d); -} - -void gotFilesStats(std::vector& stats) { - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.formats = {MediaFormat(0)}; - params.headerOnly = true; - 
params.preventStaleness = false; - size_t avgProvUs = 0; - const size_t rounds = 100; - for (auto& item : stats) { - LOG(INFO) << "Decoding video file in memory: " << item.name; - FILE* f = fopen(item.name.c_str(), "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - for (size_t i = 0; i < rounds; ++i) { - SyncDecoder decoder; - std::vector metadata; - const auto now = std::chrono::steady_clock::now(); - CHECK(decoder.init( - params, - MemoryBuffer::getCallback(buffer.data(), buffer.size()), - &metadata)); - const auto then = std::chrono::steady_clock::now(); - decoder.shutdown(); - avgProvUs += - std::chrono::duration_cast(then - now) - .count(); - TORCH_CHECK_EQ(metadata.size(), 1); - item.num = metadata[0].num; - item.den = metadata[0].den; - item.fps = metadata[0].fps; - item.durationPts = - av_rescale_q(metadata[0].duration, AV_TIME_BASE_Q, {1, item.fps}); - } - } - LOG(INFO) << "Probing (us) " << avgProvUs / stats.size() / rounds; -} - -size_t measurePerformanceUs( - const std::vector& stats, - size_t rounds, - size_t num, - size_t stride) { - size_t avgClipDecodingUs = 0; - std::srand(time(nullptr)); - for (const auto& item : stats) { - FILE* f = fopen(item.name.c_str(), "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - for (size_t i = 0; i < rounds; ++i) { - // randomy select clip - size_t rOffset = std::rand(); - size_t fOffset = rOffset % item.durationPts; - size_t clipFrames = num + (num - 1) * stride; - if (fOffset + clipFrames > item.durationPts) { - fOffset = item.durationPts - clipFrames; - } - - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.preventStaleness = 
false; - - for (size_t n = 0; n < num; ++n) { - std::list msgs; - - params.startOffset = - av_rescale_q(fOffset, {1, item.fps}, AV_TIME_BASE_Q); - params.endOffset = params.startOffset + 100; - - auto now = std::chrono::steady_clock::now(); - SyncDecoder decoder; - CHECK(decoder.init( - params, - MemoryBuffer::getCallback(buffer.data(), buffer.size()), - nullptr)); - DecoderOutputMessage out; - while (0 == decoder.decode(&out, params.timeoutMs)) { - msgs.push_back(std::move(out)); - } - - decoder.shutdown(); - - const auto then = std::chrono::steady_clock::now(); - - fOffset += 1 + stride; - - avgClipDecodingUs += - std::chrono::duration_cast(then - now) - .count(); - } - } - } - - return avgClipDecodingUs / rounds / num / stats.size(); -} - -void runDecoder(SyncDecoder& decoder) { - DecoderOutputMessage out; - size_t audioFrames = 0, videoFrames = 0, totalBytes = 0; - while (0 == decoder.decode(&out, 10000)) { - if (out.header.format.type == TYPE_AUDIO) { - ++audioFrames; - } else if (out.header.format.type == TYPE_VIDEO) { - ++videoFrames; - } else if (out.header.format.type == TYPE_SUBTITLE && out.payload) { - // deserialize - LOG(INFO) << "Deserializing subtitle"; - AVSubtitle sub; - memset(&sub, 0, sizeof(sub)); - EXPECT_TRUE(Util::deserialize(*out.payload, &sub)); - LOG(INFO) << "Found subtitles" << ", num rects: " << sub.num_rects; - for (int i = 0; i < sub.num_rects; ++i) { - std::string text = "picture"; - if (sub.rects[i]->type == SUBTITLE_TEXT) { - text = sub.rects[i]->text; - } else if (sub.rects[i]->type == SUBTITLE_ASS) { - text = sub.rects[i]->ass; - } - - LOG(INFO) << "Rect num: " << i << ", type:" << sub.rects[i]->type - << ", text: " << text; - } - - avsubtitle_free(&sub); - } - if (out.payload) { - totalBytes += out.payload->length(); - } - } - LOG(INFO) << "Decoded audio frames: " << audioFrames - << ", video frames: " << videoFrames - << ", total bytes: " << totalBytes; -} -} // namespace - -TEST(SyncDecoder, TestSyncDecoderPerformance) { - // 
Measure the average time of decoding per clip - // 1. list of the videos in testing directory - // 2. for each video got number of frames with timestamps - // 3. randomly select frame offset - // 4. adjust offset for number frames and strides, - // if it's out out upper boundary - // 5. repeat multiple times, measuring and accumulating decoding time - // per clip. - /* - 1) 4 x 2 - 2) 8 x 8 - 3) 16 x 8 - 4) 32 x 4 - */ - const std::string kFolder = "pytorch/vision/test/assets/videos"; - std::vector stats; - gotAllTestFiles(kFolder, &stats); - gotFilesStats(stats); - - const size_t kRounds = 10; - - auto new4x2 = measurePerformanceUs(stats, kRounds, 4, 2); - auto new8x8 = measurePerformanceUs(stats, kRounds, 8, 8); - auto new16x8 = measurePerformanceUs(stats, kRounds, 16, 8); - auto new32x4 = measurePerformanceUs(stats, kRounds, 32, 4); - LOG(INFO) << "Clip decoding (us)" << ", new(4x2): " << new4x2 - << ", new(8x8): " << new8x8 << ", new(16x8): " << new16x8 - << ", new(32x4): " << new32x4; -} - -TEST(SyncDecoder, Test) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestSubtitles) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - params.uri = "vue/synergy/data/robotsub.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestHeadersOnly) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.headerOnly = true; - params.formats = {MediaFormat(), 
MediaFormat(0), MediaFormat('0')}; - - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/SOX5yA1l24A.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/WUzgd7C1pWA.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestHeadersOnlyDownSampling) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.headerOnly = true; - MediaFormat format; - format.type = TYPE_AUDIO; - format.format.audio.samples = 8000; - params.formats.insert(format); - - format.type = TYPE_VIDEO; - format.format.video.width = 224; - format.format.video.height = 224; - params.formats.insert(format); - - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/SOX5yA1l24A.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/WUzgd7C1pWA.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestInitOnlyNoShutdown) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.headerOnly = false; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - std::vector metadata; - CHECK(decoder.init(params, nullptr, &metadata)); -} - -TEST(SyncDecoder, TestMemoryBuffer) { - SyncDecoder 
decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.endOffset = 9000000; - params.seekAccuracy = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - FILE* f = fopen( - "pytorch/vision/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi", - "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - CHECK(decoder.init( - params, - MemoryBuffer::getCallback(buffer.data(), buffer.size()), - nullptr)); - LOG(INFO) << "Decoding from memory bytes: " << buffer.size(); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.endOffset = 9000000; - params.seekAccuracy = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - params.maxSeekableBytes = buffer.size() + 1; - MemoryBuffer object(buffer.data(), buffer.size()); - CHECK(decoder.init( - params, - [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable - -> int { - if (out) { // see defs.h file - // read mode - return object.read(out, size); - } - // seek mode - if (!timeoutMs) { - // seek capability, yes - no - return -1; - } - return object.seek(size, whence); - }, - nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - 
params.endOffset = 9000000; - params.seekAccuracy = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - params.maxSeekableBytes = buffer.size() / 2; - MemoryBuffer object(buffer.data(), buffer.size()); - CHECK(!decoder.init( - params, - [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable - -> int { - if (out) { // see defs.h file - // read mode - return object.read(out, size); - } - // seek mode - if (!timeoutMs) { - // seek capability, yes - no - return -1; - } - return object.seek(size, whence); - }, - nullptr)); -} diff --git a/torchvision/csrc/io/decoder/time_keeper.cpp b/torchvision/csrc/io/decoder/time_keeper.cpp deleted file mode 100644 index 845c76cddc8..00000000000 --- a/torchvision/csrc/io/decoder/time_keeper.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include "time_keeper.h" -#include "defs.h" - -namespace ffmpeg { - -namespace { -const long kMaxTimeBaseDiference = 10; -} - -long TimeKeeper::adjust(long& decoderTimestamp) { - const long now = std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count(); - - if (startTime_ == 0) { - startTime_ = now; - } - if (streamTimestamp_ == 0) { - streamTimestamp_ = decoderTimestamp; - } - - const auto runOut = startTime_ + decoderTimestamp - streamTimestamp_; - - if (std::labs((now - runOut) / AV_TIME_BASE) > kMaxTimeBaseDiference) { - streamTimestamp_ = startTime_ - now + decoderTimestamp; - } - - const auto sleepAdvised = runOut - now; - - decoderTimestamp += startTime_ - streamTimestamp_; - - return sleepAdvised > 0 ? 
sleepAdvised : 0; -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/time_keeper.h b/torchvision/csrc/io/decoder/time_keeper.h deleted file mode 100644 index e4d4718c705..00000000000 --- a/torchvision/csrc/io/decoder/time_keeper.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include -#include - -namespace ffmpeg { - -/** - * Class keeps the track of the decoded timestamps (us) for media streams. - */ - -class TimeKeeper { - public: - TimeKeeper() = default; - - // adjust provided @timestamp to the corrected value - // return advised sleep time before next frame processing in (us) - long adjust(long& decoderTimestamp); - - private: - long startTime_{0}; - long streamTimestamp_{0}; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/util.cpp b/torchvision/csrc/io/decoder/util.cpp deleted file mode 100644 index 7198d2174ed..00000000000 --- a/torchvision/csrc/io/decoder/util.cpp +++ /dev/null @@ -1,401 +0,0 @@ -#include "util.h" -#include - -namespace ffmpeg { - -namespace Serializer { - -// fixed size types -template -inline size_t getSize(const T& x) { - return sizeof(x); -} - -template -inline bool serializeItem( - uint8_t* dest, - size_t len, - size_t& pos, - const T& src) { - VLOG(6) << "Generic serializeItem"; - const auto required = sizeof(src); - if (len < pos + required) { - return false; - } - memcpy(dest + pos, &src, required); - pos += required; - return true; -} - -template -inline bool deserializeItem( - const uint8_t* src, - size_t len, - size_t& pos, - T& dest) { - const auto required = sizeof(dest); - if (len < pos + required) { - return false; - } - memcpy(&dest, src + pos, required); - pos += required; - return true; -} - -// AVSubtitleRect specialization -inline size_t getSize(const AVSubtitleRect& x) { - auto rectBytes = [](const AVSubtitleRect& y) -> size_t { - size_t s = 0; - switch (y.type) { - case SUBTITLE_BITMAP: - for (int i = 0; i < y.nb_colors; ++i) { - s += sizeof(y.linesize[i]); - s += 
y.linesize[i]; - } - break; - case SUBTITLE_TEXT: - s += sizeof(size_t); - s += strlen(y.text); - break; - case SUBTITLE_ASS: - s += sizeof(size_t); - s += strlen(y.ass); - break; - default: - break; - } - return s; - }; - return getSize(x.x) + getSize(x.y) + getSize(x.w) + getSize(x.h) + - getSize(x.nb_colors) + getSize(x.type) + getSize(x.flags) + rectBytes(x); -} - -// AVSubtitle specialization -inline size_t getSize(const AVSubtitle& x) { - auto rectBytes = [](const AVSubtitle& y) -> size_t { - size_t s = getSize(y.num_rects); - for (unsigned i = 0; i < y.num_rects; ++i) { - s += getSize(*y.rects[i]); - } - return s; - }; - return getSize(x.format) + getSize(x.start_display_time) + - getSize(x.end_display_time) + getSize(x.pts) + rectBytes(x); -} - -inline bool serializeItem( - uint8_t* dest, - size_t len, - size_t& pos, - const AVSubtitleRect& src) { - auto rectSerialize = - [](uint8_t* d, size_t l, size_t& p, const AVSubtitleRect& x) -> size_t { - switch (x.type) { - case SUBTITLE_BITMAP: - for (int i = 0; i < x.nb_colors; ++i) { - if (!serializeItem(d, l, p, x.linesize[i])) { - return false; - } - if (p + x.linesize[i] > l) { - return false; - } - memcpy(d + p, x.data[i], x.linesize[i]); - p += x.linesize[i]; - } - return true; - case SUBTITLE_TEXT: { - const size_t s = strlen(x.text); - if (!serializeItem(d, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - memcpy(d + p, x.text, s); - p += s; - return true; - } - case SUBTITLE_ASS: { - const size_t s = strlen(x.ass); - if (!serializeItem(d, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - memcpy(d + p, x.ass, s); - p += s; - return true; - } - default: - return true; - } - }; - return serializeItem(dest, len, pos, src.x) && - serializeItem(dest, len, pos, src.y) && - serializeItem(dest, len, pos, src.w) && - serializeItem(dest, len, pos, src.h) && - serializeItem(dest, len, pos, src.nb_colors) && - serializeItem(dest, len, pos, src.type) && - 
serializeItem(dest, len, pos, src.flags) && - rectSerialize(dest, len, pos, src); -} - -inline bool serializeItem( - uint8_t* dest, - size_t len, - size_t& pos, - const AVSubtitle& src) { - auto rectSerialize = - [](uint8_t* d, size_t l, size_t& p, const AVSubtitle& x) -> bool { - bool res = serializeItem(d, l, p, x.num_rects); - for (unsigned i = 0; res && i < x.num_rects; ++i) { - res = serializeItem(d, l, p, *(x.rects[i])); - } - return res; - }; - VLOG(6) << "AVSubtitle serializeItem"; - return serializeItem(dest, len, pos, src.format) && - serializeItem(dest, len, pos, src.start_display_time) && - serializeItem(dest, len, pos, src.end_display_time) && - serializeItem(dest, len, pos, src.pts) && - rectSerialize(dest, len, pos, src); -} - -inline bool deserializeItem( - const uint8_t* src, - size_t len, - size_t& pos, - AVSubtitleRect& dest) { - auto rectDeserialize = - [](const uint8_t* y, size_t l, size_t& p, AVSubtitleRect& x) -> bool { - switch (x.type) { - case SUBTITLE_BITMAP: - for (int i = 0; i < x.nb_colors; ++i) { - if (!deserializeItem(y, l, p, x.linesize[i])) { - return false; - } - if (p + x.linesize[i] > l) { - return false; - } - x.data[i] = (uint8_t*)av_malloc(x.linesize[i]); - memcpy(x.data[i], y + p, x.linesize[i]); - p += x.linesize[i]; - } - return true; - case SUBTITLE_TEXT: { - size_t s = 0; - if (!deserializeItem(y, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - x.text = (char*)av_malloc(s + 1); - memcpy(x.text, y + p, s); - x.text[s] = 0; - p += s; - return true; - } - case SUBTITLE_ASS: { - size_t s = 0; - if (!deserializeItem(y, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - x.ass = (char*)av_malloc(s + 1); - memcpy(x.ass, y + p, s); - x.ass[s] = 0; - p += s; - return true; - } - default: - return true; - } - }; - - return deserializeItem(src, len, pos, dest.x) && - deserializeItem(src, len, pos, dest.y) && - deserializeItem(src, len, pos, dest.w) && - deserializeItem(src, len, pos, 
dest.h) && - deserializeItem(src, len, pos, dest.nb_colors) && - deserializeItem(src, len, pos, dest.type) && - deserializeItem(src, len, pos, dest.flags) && - rectDeserialize(src, len, pos, dest); -} - -inline bool deserializeItem( - const uint8_t* src, - size_t len, - size_t& pos, - AVSubtitle& dest) { - auto rectDeserialize = - [](const uint8_t* y, size_t l, size_t& p, AVSubtitle& x) -> bool { - bool res = deserializeItem(y, l, p, x.num_rects); - if (res && x.num_rects) { - x.rects = - (AVSubtitleRect**)av_malloc(x.num_rects * sizeof(AVSubtitleRect*)); - } - for (unsigned i = 0; res && i < x.num_rects; ++i) { - x.rects[i] = (AVSubtitleRect*)av_malloc(sizeof(AVSubtitleRect)); - memset(x.rects[i], 0, sizeof(AVSubtitleRect)); - res = deserializeItem(y, l, p, *x.rects[i]); - } - return res; - }; - return deserializeItem(src, len, pos, dest.format) && - deserializeItem(src, len, pos, dest.start_display_time) && - deserializeItem(src, len, pos, dest.end_display_time) && - deserializeItem(src, len, pos, dest.pts) && - rectDeserialize(src, len, pos, dest); -} -} // namespace Serializer - -namespace Util { -std::string generateErrorDesc(int errorCode) { - std::array buffer; - if (av_strerror(errorCode, buffer.data(), buffer.size()) < 0) { - return std::string("Unknown error code: ") + std::to_string(errorCode); - } - buffer.back() = 0; - return std::string(buffer.data()); -} - -size_t serialize(const AVSubtitle& sub, ByteStorage* out) { - const auto len = size(sub); - size_t pos = 0; - if (!Serializer::serializeItem(out->writableTail(), len, pos, sub)) { - return 0; - } - out->append(len); - return len; -} - -bool deserialize(const ByteStorage& buf, AVSubtitle* sub) { - size_t pos = 0; - return Serializer::deserializeItem(buf.data(), buf.length(), pos, *sub); -} - -size_t size(const AVSubtitle& sub) { - return Serializer::getSize(sub); -} - -bool validateVideoFormat(const VideoFormat& f) { - // clang-format off - /* - Valid parameters values for decoder - 
____________________________________________________________________________________ - | W | H | minDimension | maxDimension | cropImage | algorithm | - |__________________________________________________________________________________| - | 0 | 0 | 0 | 0 | N/A | original | - |__________________________________________________________________________________| - | >0 | 0 | N/A | N/A | N/A | scale keeping W | - |__________________________________________________________________________________| - | 0 | >0 | N/A | N/A | N/A | scale keeping H | - |__________________________________________________________________________________| - | >0 | >0 | N/A | N/A | 0 | stretch/scale | - |__________________________________________________________________________________| - | >0 | >0 | N/A | N/A | >0 | scale/crop | - |__________________________________________________________________________________| - | 0 | 0 | >0 | 0 | N/A |scale to min dimension | - |__________________________________________________________________________________| - | 0 | 0 | 0 | >0 | N/A |scale to max dimension | - |__________________________________________________________________________________| - | 0 | 0 | >0 | >0 | N/A |stretch to min/max dimension| - |_____|_____|______________|______________|___________|____________________________| - - */ - // clang-format on - return (f.width == 0 && // #1, #6, #7 and #8 - f.height == 0 && f.cropImage == 0) || - (f.width != 0 && // #4 and #5 - f.height != 0 && f.minDimension == 0 && f.maxDimension == 0) || - (((f.width != 0 && // #2 - f.height == 0) || - (f.width == 0 && // #3 - f.height != 0)) && - f.minDimension == 0 && f.maxDimension == 0 && f.cropImage == 0); -} - -void setFormatDimensions( - size_t& destW, - size_t& destH, - size_t userW, - size_t userH, - size_t srcW, - size_t srcH, - size_t minDimension, - size_t maxDimension, - size_t cropImage) { - // rounding rules - // int -> double -> round up - // if fraction is >= 0.5 or round down if fraction is < 0.5 
- // int result = double(value) + 0.5 - // here we rounding double to int according to the above rule - - // #1, #6, #7 and #8 - if (userW == 0 && userH == 0) { - if (minDimension > 0 && maxDimension == 0) { // #6 - if (srcW > srcH) { - // landscape - destH = minDimension; - destW = round(double(srcW * minDimension) / srcH); - } else { - // portrait - destW = minDimension; - destH = round(double(srcH * minDimension) / srcW); - } - } else if (minDimension == 0 && maxDimension > 0) { // #7 - if (srcW > srcH) { - // landscape - destW = maxDimension; - destH = round(double(srcH * maxDimension) / srcW); - } else { - // portrait - destH = maxDimension; - destW = round(double(srcW * maxDimension) / srcH); - } - } else if (minDimension > 0 && maxDimension > 0) { // #8 - if (srcW > srcH) { - // landscape - destW = maxDimension; - destH = minDimension; - } else { - // portrait - destW = minDimension; - destH = maxDimension; - } - } else { // #1 - destW = srcW; - destH = srcH; - } - } else if (userW != 0 && userH == 0) { // #2 - destW = userW; - destH = round(double(srcH * userW) / srcW); - } else if (userW == 0 && userH != 0) { // #3 - destW = round(double(srcW * userH) / srcH); - destH = userH; - } else { // userW != 0 && userH != 0 - if (cropImage == 0) { // #4 - destW = userW; - destH = userH; - } else { // #5 - double userSlope = double(userH) / userW; - double srcSlope = double(srcH) / srcW; - if (srcSlope < userSlope) { - destW = round(double(srcW * userH) / srcH); - destH = userH; - } else { - destW = userW; - destH = round(double(srcH * userW) / srcW); - } - } - } - // prevent zeros - destW = std::max(destW, size_t(1UL)); - destH = std::max(destH, size_t(1UL)); -} -} // namespace Util -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/util.h b/torchvision/csrc/io/decoder/util.h deleted file mode 100644 index 01b550e5bbc..00000000000 --- a/torchvision/csrc/io/decoder/util.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace 
ffmpeg { - -/** - * FFMPEG library utility functions. - */ - -namespace Util { -std::string generateErrorDesc(int errorCode); -size_t serialize(const AVSubtitle& sub, ByteStorage* out); -bool deserialize(const ByteStorage& buf, AVSubtitle* sub); -size_t size(const AVSubtitle& sub); -void setFormatDimensions( - size_t& destW, - size_t& destH, - size_t userW, - size_t userH, - size_t srcW, - size_t srcH, - size_t minDimension, - size_t maxDimension, - size_t cropImage); -bool validateVideoFormat(const VideoFormat& format); -} // namespace Util -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/util_test.cpp b/torchvision/csrc/io/decoder/util_test.cpp deleted file mode 100644 index 0a093d9561b..00000000000 --- a/torchvision/csrc/io/decoder/util_test.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include -#include -#include "util.h" - -TEST(Util, TestSetFormatDimensions) { - // clang-format off - const size_t test_cases[][9] = { - // (userW, userH, srcW, srcH, minDimension, maxDimension, cropImage, destW, destH) - {0, 0, 172, 128, 0, 0, 0, 172, 128}, // #1 - {86, 0, 172, 128, 0, 0, 0, 86, 64}, // #2 - {64, 0, 128, 172, 0, 0, 0, 64, 86}, // #2 - {0, 32, 172, 128, 0, 0, 0, 43, 32}, // #3 - {32, 0, 128, 172, 0, 0, 0, 32, 43}, // #3 - {60, 50, 172, 128, 0, 0, 0, 60, 50}, // #4 - {50, 60, 128, 172, 0, 0, 0, 50, 60}, // #4 - {86, 40, 172, 128, 0, 0, 1, 86, 64}, // #5 - {86, 92, 172, 128, 0, 0, 1, 124, 92}, // #5 - {0, 0, 172, 128, 256, 0, 0, 344, 256}, // #6 - {0, 0, 128, 172, 256, 0, 0, 256, 344}, // #6 - {0, 0, 128, 172, 0, 344, 0, 256, 344}, // #7 - {0, 0, 172, 128, 0, 344, 0, 344, 256}, // #7 - {0, 0, 172, 128, 100, 344, 0, 344, 100},// #8 - {0, 0, 128, 172, 100, 344, 0, 100, 344} // #8 - }; - // clang-format onn - - for (const auto& tc : test_cases) { - size_t destW = 0; - size_t destH = 0; - ffmpeg::Util::setFormatDimensions(destW, destH, tc[0], tc[1], tc[2], tc[3], tc[4], tc[5], tc[6]); - CHECK(destW == tc[7]); - CHECK(destH == tc[8]); - } -} diff --git 
a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp deleted file mode 100644 index 8b712609e34..00000000000 --- a/torchvision/csrc/io/decoder/video_sampler.cpp +++ /dev/null @@ -1,337 +0,0 @@ -#include "video_sampler.h" -#include -#include "util.h" - -// www.ffmpeg.org/doxygen/0.5/swscale-example_8c-source.html - -namespace ffmpeg { - -namespace { - -// Setup the data pointers and linesizes based on the specified image -// parameters and the provided array. This sets up "planes" to point to a -// "buffer" -// NOTE: this is most likely culprit behind #3534 -// -// Args: -// fmt: desired output video format -// buffer: source constant image buffer (in different format) that will contain -// the final image after SWScale planes: destination data pointer to be filled -// lineSize: target destination linesize (always {0}) -int preparePlanes( - const VideoFormat& fmt, - const uint8_t* buffer, - uint8_t** planes, - int* lineSize) { - int result; - - // NOTE: 1 at the end of av_fill_arrays is the value used for alignment - if ((result = av_image_fill_arrays( - planes, - lineSize, - buffer, - (AVPixelFormat)fmt.format, - fmt.width, - fmt.height, - 1)) < 0) { - LOG(ERROR) << "av_image_fill_arrays failed, err: " - << Util::generateErrorDesc(result); - } - return result; -} - -// Scale (and crop) the image slice in srcSlice and put the resulting scaled -// slice to `planes` buffer, which is mapped to be `out` via preparePlanes as -// `sws_scale` cannot access buffers directly. 
-// -// Args: -// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if -// scale) srcSlice: frame data in YUV420P srcStride: the array containing the -// strides for each plane of the source -// image (from AVFrame->linesize[0]) -// out: destination buffer -// planes: indirect destination buffer (mapped to "out" via preparePlanes) -// lines: destination linesize; constant {0} -int transformImage( - SwsContext* context, - const uint8_t* const srcSlice[], - int srcStride[], - VideoFormat inFormat, - VideoFormat outFormat, - uint8_t* out, - uint8_t* planes[], - int lines[]) { - int result; - if ((result = preparePlanes(outFormat, out, planes, lines)) < 0) { - return result; - } - if (context) { - // NOTE: srcY stride always 0: this is a parameter of YUV format - if ((result = sws_scale( - context, srcSlice, srcStride, 0, inFormat.height, planes, lines)) < - 0) { - LOG(ERROR) << "sws_scale failed, err: " - << Util::generateErrorDesc(result); - return result; - } - } else if ( - inFormat.width == outFormat.width && - inFormat.height == outFormat.height && - inFormat.format == outFormat.format) { - // Copy planes without using sws_scale if sws_getContext failed. 
- av_image_copy( - planes, - lines, - (const uint8_t**)srcSlice, - srcStride, - (AVPixelFormat)inFormat.format, - inFormat.width, - inFormat.height); - } else { - LOG(ERROR) << "Invalid scale context format " << inFormat.format; - return AVERROR(EINVAL); - } - return 0; -} -} // namespace - -VideoSampler::VideoSampler(int swsFlags, int64_t loggingUuid) - : swsFlags_(swsFlags), loggingUuid_(loggingUuid) {} - -VideoSampler::~VideoSampler() { - cleanUp(); -} - -void VideoSampler::shutdown() { - cleanUp(); -} - -bool VideoSampler::init(const SamplerParameters& params) { - cleanUp(); - - if (params.out.video.cropImage != 0) { - if (!Util::validateVideoFormat(params.out.video)) { - LOG(ERROR) << "Invalid video format" - << ", width: " << params.out.video.width - << ", height: " << params.out.video.height - << ", format: " << params.out.video.format - << ", minDimension: " << params.out.video.minDimension - << ", crop: " << params.out.video.cropImage; - - return false; - } - - scaleFormat_.format = params.out.video.format; - Util::setFormatDimensions( - scaleFormat_.width, - scaleFormat_.height, - params.out.video.width, - params.out.video.height, - params.in.video.width, - params.in.video.height, - 0, - 0, - 1); - - if (!(scaleFormat_ == params_.out.video)) { // crop required - cropContext_ = sws_getContext( - params.out.video.width, - params.out.video.height, - (AVPixelFormat)params.out.video.format, - params.out.video.width, - params.out.video.height, - (AVPixelFormat)params.out.video.format, - swsFlags_, - nullptr, - nullptr, - nullptr); - - if (!cropContext_) { - LOG(ERROR) << "sws_getContext failed for crop context"; - return false; - } - - const auto scaleImageSize = av_image_get_buffer_size( - (AVPixelFormat)scaleFormat_.format, - scaleFormat_.width, - scaleFormat_.height, - 1); - scaleBuffer_.resize(scaleImageSize); - } - } else { - scaleFormat_ = params.out.video; - } - - VLOG(1) << "Input format #" << loggingUuid_ << ", width " - << params.in.video.width << ", 
height " << params.in.video.height - << ", format " << params.in.video.format << ", minDimension " - << params.in.video.minDimension << ", cropImage " - << params.in.video.cropImage; - VLOG(1) << "Scale format #" << loggingUuid_ << ", width " - << scaleFormat_.width << ", height " << scaleFormat_.height - << ", format " << scaleFormat_.format << ", minDimension " - << scaleFormat_.minDimension << ", cropImage " - << scaleFormat_.cropImage; - VLOG(1) << "Crop format #" << loggingUuid_ << ", width " - << params.out.video.width << ", height " << params.out.video.height - << ", format " << params.out.video.format << ", minDimension " - << params.out.video.minDimension << ", cropImage " - << params.out.video.cropImage; - - // set output format - params_ = params; - - if (params.in.video.format == AV_PIX_FMT_YUV420P) { - /* When the video width and height are not multiples of 8, - * and there is no size change in the conversion, - * a blurry screen will appear on the right side - * This problem was discovered in 2012 and - * continues to exist in version 4.1.3 in 2019 - * This problem can be avoided by increasing SWS_ACCURATE_RND - * details https://trac.ffmpeg.org/ticket/1582 - */ - if ((params.in.video.width & 0x7) || (params.in.video.height & 0x7)) { - VLOG(1) << "The width " << params.in.video.width << " and height " - << params.in.video.height << " the image is not a multiple of 8, " - << "the decoding speed may be reduced"; - swsFlags_ |= SWS_ACCURATE_RND; - } - } - - scaleContext_ = sws_getContext( - params.in.video.width, - params.in.video.height, - (AVPixelFormat)params.in.video.format, - scaleFormat_.width, - scaleFormat_.height, - (AVPixelFormat)scaleFormat_.format, - swsFlags_, - nullptr, - nullptr, - nullptr); - // sws_getContext might fail if in/out format == AV_PIX_FMT_PAL8 (png format) - // Return true if input and output formats/width/height are identical - // Check scaleContext_ for nullptr in transformImage to copy planes directly - - if 
(params.in.video.width == scaleFormat_.width && - params.in.video.height == scaleFormat_.height && - params.in.video.format == scaleFormat_.format) { - return true; - } - return scaleContext_ != nullptr; -} - -// Main body of the sample function called from one of the overloads below -// -// Args: -// srcSlice: decoded AVFrame->data perpared buffer -// srcStride: linesize (usually obtained from AVFrame->linesize) -// out: return buffer (ByteStorage*) -int VideoSampler::sample( - const uint8_t* const srcSlice[], - int srcStride[], - ByteStorage* out) { - int result; - // scaled and cropped image - int outImageSize = av_image_get_buffer_size( - (AVPixelFormat)params_.out.video.format, - params_.out.video.width, - params_.out.video.height, - 1); - - out->ensure(outImageSize); - - uint8_t* scalePlanes[4] = {nullptr}; - int scaleLines[4] = {0}; - // perform scale first - if ((result = transformImage( - scaleContext_, - srcSlice, - srcStride, - params_.in.video, - scaleFormat_, - // for crop use internal buffer - cropContext_ ? scaleBuffer_.data() : out->writableTail(), - scalePlanes, - scaleLines))) { - return result; - } - - // is crop required? 
- if (cropContext_) { - uint8_t* cropPlanes[4] = {nullptr}; - int cropLines[4] = {0}; - - if (params_.out.video.height < scaleFormat_.height) { - // Destination image is wider of source image: cut top and bottom - for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) { - scalePlanes[i] += scaleLines[i] * - (scaleFormat_.height - params_.out.video.height) / 2; - } - } else { - // Source image is wider of destination image: cut sides - for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) { - scalePlanes[i] += scaleLines[i] * - (scaleFormat_.width - params_.out.video.width) / 2 / - scaleFormat_.width; - } - } - - // crop image - if ((result = transformImage( - cropContext_, - scalePlanes, - scaleLines, - params_.out.video, - params_.out.video, - out->writableTail(), - cropPlanes, - cropLines))) { - return result; - } - } - - out->append(outImageSize); - return outImageSize; -} - -// Call from `video_stream.cpp::114` - occurs during file reads -int VideoSampler::sample(AVFrame* frame, ByteStorage* out) { - if (!frame) { - return 0; // no flush for videos - } - - return sample(frame->data, frame->linesize, out); -} - -// Call from `video_stream.cpp::114` - not sure when this occurs -int VideoSampler::sample(const ByteStorage* in, ByteStorage* out) { - if (!in) { - return 0; // no flush for videos - } - - int result; - uint8_t* inPlanes[4] = {nullptr}; - int inLineSize[4] = {0}; - - if ((result = preparePlanes( - params_.in.video, in->data(), inPlanes, inLineSize)) < 0) { - return result; - } - - return sample(inPlanes, inLineSize, out); -} - -void VideoSampler::cleanUp() { - if (scaleContext_) { - sws_freeContext(scaleContext_); - scaleContext_ = nullptr; - } - if (cropContext_) { - sws_freeContext(cropContext_); - cropContext_ = nullptr; - scaleBuffer_.clear(); - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/video_sampler.h b/torchvision/csrc/io/decoder/video_sampler.h deleted file mode 100644 index 47247f2c0c5..00000000000 --- 
a/torchvision/csrc/io/decoder/video_sampler.h +++ /dev/null @@ -1,44 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class transcode video frames from one format into another - */ - -class VideoSampler : public MediaSampler { - public: - VideoSampler(int swsFlags = SWS_AREA, int64_t loggingUuid = 0); - - ~VideoSampler() override; - - // MediaSampler overrides - bool init(const SamplerParameters& params) override; - int sample(const ByteStorage* in, ByteStorage* out) override; - void shutdown() override; - - // returns number processed/scaling bytes - int sample(AVFrame* frame, ByteStorage* out); - int getImageBytes() const; - - private: - // close resources - void cleanUp(); - // helper functions for rescaling, cropping, etc. - int sample( - const uint8_t* const srcSlice[], - int srcStride[], - ByteStorage* out); - - private: - VideoFormat scaleFormat_; - SwsContext* scaleContext_{nullptr}; - SwsContext* cropContext_{nullptr}; - int swsFlags_{SWS_AREA}; - std::vector scaleBuffer_; - int64_t loggingUuid_{0}; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/video_stream.cpp b/torchvision/csrc/io/decoder/video_stream.cpp deleted file mode 100644 index fa08c65cac1..00000000000 --- a/torchvision/csrc/io/decoder/video_stream.cpp +++ /dev/null @@ -1,131 +0,0 @@ -#include "video_stream.h" -#include -#include "util.h" - -namespace ffmpeg { - -namespace { -bool operator==(const VideoFormat& x, const AVFrame& y) { - return x.width == static_cast(y.width) && - x.height == static_cast(y.height) && x.format == y.format; -} - -bool operator==(const VideoFormat& x, const AVCodecContext& y) { - return x.width == static_cast(y.width) && - x.height == static_cast(y.height) && x.format == y.pix_fmt; -} - -VideoFormat& toVideoFormat(VideoFormat& x, const AVFrame& y) { - x.width = y.width; - x.height = y.height; - x.format = y.format; - return x; -} - -VideoFormat& toVideoFormat(VideoFormat& x, const AVCodecContext& y) { - x.width = 
y.width; - x.height = y.height; - x.format = y.pix_fmt; - return x; -} -} // namespace - -VideoStream::VideoStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const VideoFormat& format, - int64_t loggingUuid) - : Stream( - inputCtx, - MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime, - loggingUuid) {} - -VideoStream::~VideoStream() { - if (sampler_) { - sampler_->shutdown(); - sampler_.reset(); - } -} - -int VideoStream::initFormat() { - // set output format - if (!Util::validateVideoFormat(format_.format.video)) { - LOG(ERROR) << "Invalid video format" - << ", width: " << format_.format.video.width - << ", height: " << format_.format.video.height - << ", format: " << format_.format.video.format - << ", minDimension: " << format_.format.video.minDimension - << ", crop: " << format_.format.video.cropImage; - return -1; - } - - // keep aspect ratio - Util::setFormatDimensions( - format_.format.video.width, - format_.format.video.height, - format_.format.video.width, - format_.format.video.height, - codecCtx_->width, - codecCtx_->height, - format_.format.video.minDimension, - format_.format.video.maxDimension, - 0); - - if (format_.format.video.format == AV_PIX_FMT_NONE) { - format_.format.video.format = codecCtx_->pix_fmt; - } - return format_.format.video.width != 0 && format_.format.video.height != 0 && - format_.format.video.format != AV_PIX_FMT_NONE - ? 0 - : -1; -} - -// copies frame bytes via sws_scale call in video_sampler.cpp -int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) { - if (!sampler_) { - sampler_ = std::make_unique(SWS_AREA, loggingUuid_); - } - - // check if input format gets changed - if (flush ? !(sampler_->getInputFormat().video == *codecCtx_) - : !(sampler_->getInputFormat().video == *frame_)) { - // - reinit sampler - SamplerParameters params; - params.type = format_.type; - params.out = format_.format; - params.in = FormatUnion(0); - flush ? 
toVideoFormat(params.in.video, *codecCtx_) - : toVideoFormat(params.in.video, *frame_); - if (!sampler_->init(params)) { - return -1; - } - - VLOG(1) << "Set input video sampler format" - << ", width: " << params.in.video.width - << ", height: " << params.in.video.height - << ", format: " << params.in.video.format - << " : output video sampler format" - << ", width: " << format_.format.video.width - << ", height: " << format_.format.video.height - << ", format: " << format_.format.video.format - << ", minDimension: " << format_.format.video.minDimension - << ", crop: " << format_.format.video.cropImage; - } - // calls to a sampler that converts the frame from YUV422 to RGB24, and - // optionally crops and resizes the frame. Frame bytes are copied from - // frame_->data to out buffer - return sampler_->sample(flush ? nullptr : frame_, out); -} - -void VideoStream::setHeader(DecoderHeader* header, bool flush) { - Stream::setHeader(header, flush); - if (!flush) { // no frames for video flush - header->keyFrame = frame_->key_frame; - header->fps = av_q2d(av_guess_frame_rate( - inputCtx_, inputCtx_->streams[format_.stream], nullptr)); - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/video_stream.h b/torchvision/csrc/io/decoder/video_stream.h deleted file mode 100644 index e6a8bf02b65..00000000000 --- a/torchvision/csrc/io/decoder/video_stream.h +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include "stream.h" -#include "video_sampler.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one video stream. 
- */ - -class VideoStream : public Stream { - public: - VideoStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const VideoFormat& format, - int64_t loggingUuid); - ~VideoStream() override; - - private: - int initFormat() override; - int copyFrameBytes(ByteStorage* out, bool flush) override; - void setHeader(DecoderHeader* header, bool flush) override; - - private: - std::unique_ptr sampler_; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp deleted file mode 100644 index 8f1fb3fb5b9..00000000000 --- a/torchvision/csrc/io/video/video.cpp +++ /dev/null @@ -1,387 +0,0 @@ -#include "video.h" - -#include - -using namespace ffmpeg; - -namespace vision { -namespace video { - -namespace { - -const size_t decoderTimeoutMs = 600000; -const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24; - -// returns number of written bytes -template -size_t fillTensorList(DecoderOutputMessage& msgs, torch::Tensor& frame) { - const auto& msg = msgs; - T* frameData = frame.numel() > 0 ? 
frame.data_ptr() : nullptr; - if (frameData) { - auto sizeInBytes = msg.payload->length(); - memcpy(frameData, msg.payload->data(), sizeInBytes); - } - return sizeof(T); -} - -size_t fillVideoTensor(DecoderOutputMessage& msgs, torch::Tensor& videoFrame) { - return fillTensorList(msgs, videoFrame); -} - -size_t fillAudioTensor(DecoderOutputMessage& msgs, torch::Tensor& audioFrame) { - return fillTensorList(msgs, audioFrame); -} - -std::array, 4>::const_iterator -_parse_type(const std::string& stream_string) { - static const std::array, 4> types = {{ - {"video", TYPE_VIDEO}, - {"audio", TYPE_AUDIO}, - {"subtitle", TYPE_SUBTITLE}, - {"cc", TYPE_CC}, - }}; - auto device = std::find_if( - types.begin(), - types.end(), - [stream_string](const std::pair& p) { - return p.first == stream_string; - }); - if (device != types.end()) { - return device; - } - TORCH_CHECK( - false, "Expected one of [audio, video, subtitle, cc] ", stream_string); -} - -std::string parse_type_to_string(const std::string& stream_string) { - auto device = _parse_type(stream_string); - return device->first; -} - -MediaType parse_type_to_mt(const std::string& stream_string) { - auto device = _parse_type(stream_string); - return device->second; -} - -std::tuple _parseStream(const std::string& streamString) { - TORCH_CHECK(!streamString.empty(), "Stream string must not be empty"); - static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?"); - std::smatch match; - - TORCH_CHECK( - std::regex_match(streamString, match, regex), - "Invalid stream string: '", - streamString, - "'"); - - std::string type_ = "video"; - type_ = parse_type_to_string(match[1].str()); - long index_ = -1; - if (match[2].matched) { - try { - index_ = std::stoi(match[2].str()); - } catch (const std::exception&) { - TORCH_CHECK( - false, - "Could not parse device index '", - match[2].str(), - "' in device string '", - streamString, - "'"); - } - } - return std::make_tuple(type_, index_); -} - -} // namespace - -void 
Video::_getDecoderParams( - double videoStartS, - int64_t getPtsOnly, - std::string stream, - long stream_id = -1, - bool fastSeek = true, - bool all_streams = false, - int64_t num_threads = 1, - double seekFrameMarginUs = 10) { - int64_t videoStartUs = int64_t(videoStartS * 1e6); - - params.timeoutMs = decoderTimeoutMs; - params.startOffset = videoStartUs; - params.seekAccuracy = seekFrameMarginUs; - params.fastSeek = fastSeek; - params.headerOnly = false; - params.numThreads = num_threads; - - params.preventStaleness = false; // not sure what this is about - - if (all_streams == true) { - MediaFormat format; - format.stream = -2; - format.type = TYPE_AUDIO; - params.formats.insert(format); - - format.type = TYPE_VIDEO; - format.stream = -2; - format.format.video.width = 0; - format.format.video.height = 0; - format.format.video.cropImage = 0; - format.format.video.format = defaultVideoPixelFormat; - params.formats.insert(format); - - format.type = TYPE_SUBTITLE; - format.stream = -2; - params.formats.insert(format); - - format.type = TYPE_CC; - format.stream = -2; - params.formats.insert(format); - } else { - // parse stream type - MediaType stream_type = parse_type_to_mt(stream); - - // TODO: reset params.formats - std::set formats; - params.formats = formats; - // Define new format - MediaFormat format; - format.type = stream_type; - format.stream = stream_id; - if (stream_type == TYPE_VIDEO) { - format.format.video.width = 0; - format.format.video.height = 0; - format.format.video.cropImage = 0; - format.format.video.format = defaultVideoPixelFormat; - } - params.formats.insert(format); - } - -} // _get decoder params - -void Video::initFromFile( - std::string videoPath, - std::string stream, - int64_t numThreads) { - TORCH_CHECK(!initialized, "Video object can only be initialized once"); - initialized = true; - params.uri = videoPath; - _init(stream, numThreads); -} - -void Video::initFromMemory( - torch::Tensor videoTensor, - std::string stream, - int64_t 
numThreads) { - TORCH_CHECK(!initialized, "Video object can only be initialized once"); - initialized = true; - callback = MemoryBuffer::getCallback( - videoTensor.data_ptr(), videoTensor.size(0)); - _init(stream, numThreads); -} - -void Video::_init(std::string stream, int64_t numThreads) { - // set number of threads global - numThreads_ = numThreads; - // parse stream information - current_stream = _parseStream(stream); - // note that in the initial call we want to get all streams - _getDecoderParams( - 0, // video start - 0, // headerOnly - std::get<0>(current_stream), // stream info - remove that - long(-1), // stream_id parsed from info above change to -2 - false, // fastseek: we're using the default param here - true, // read all streams - numThreads_ // global number of Threads for decoding - ); - - std::string logMessage, logType; - - // locals - std::vector audioFPS, videoFPS; - std::vector audioDuration, videoDuration, ccDuration, subsDuration; - std::vector audioTB, videoTB, ccTB, subsTB; - c10::Dict> audioMetadata; - c10::Dict> videoMetadata; - c10::Dict> ccMetadata; - c10::Dict> subsMetadata; - - // callback and metadata defined in struct - DecoderInCallback tmp_callback = callback; - succeeded = decoder.init(params, std::move(tmp_callback), &metadata); - if (succeeded) { - for (const auto& header : metadata) { - double fps = double(header.fps); - double duration = double(header.duration) * 1e-6; // * timeBase; - - if (header.format.type == TYPE_VIDEO) { - videoFPS.push_back(fps); - videoDuration.push_back(duration); - } else if (header.format.type == TYPE_AUDIO) { - audioFPS.push_back(fps); - audioDuration.push_back(duration); - } else if (header.format.type == TYPE_CC) { - ccDuration.push_back(duration); - } else if (header.format.type == TYPE_SUBTITLE) { - subsDuration.push_back(duration); - }; - } - } - // audio - audioMetadata.insert("duration", audioDuration); - audioMetadata.insert("framerate", audioFPS); - // video - 
videoMetadata.insert("duration", videoDuration); - videoMetadata.insert("fps", videoFPS); - // subs - subsMetadata.insert("duration", subsDuration); - // cc - ccMetadata.insert("duration", ccDuration); - // put all to a data - streamsMetadata.insert("video", videoMetadata); - streamsMetadata.insert("audio", audioMetadata); - streamsMetadata.insert("subtitles", subsMetadata); - streamsMetadata.insert("cc", ccMetadata); - - succeeded = setCurrentStream(stream); - if (std::get<1>(current_stream) != -1) { - LOG(INFO) - << "Stream index set to " << std::get<1>(current_stream) - << ". If you encounter trouble, consider switching it to automatic stream discovery. \n"; - } -} - -Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { - C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); - if (!videoPath.empty()) { - initFromFile(videoPath, stream, numThreads); - } -} // video - -bool Video::setCurrentStream(std::string stream = "video") { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - if ((!stream.empty()) && (_parseStream(stream) != current_stream)) { - current_stream = _parseStream(stream); - } - - double ts = 0; - if (seekTS > 0) { - ts = seekTS; - } - - _getDecoderParams( - ts, // video start - 0, // headerOnly - std::get<0>(current_stream), // stream - long(std::get<1>( - current_stream)), // stream_id parsed from info above change to -2 - false, // fastseek param set to 0 false by default (changed in seek) - false, // read all streams - numThreads_ // global number of threads - ); - - // callback and metadata defined in Video.h - DecoderInCallback tmp_callback = callback; - return (decoder.init(params, std::move(tmp_callback), &metadata)); -} - -std::tuple Video::getCurrentStream() const { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - return current_stream; -} - -c10::Dict>> Video:: - getStreamMetadata() const { - TORCH_CHECK(initialized, "Video object has to be initialized 
first"); - return streamsMetadata; -} - -void Video::Seek(double ts, bool fastSeek = false) { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - // initialize the class variables used for seeking and retrurn - _getDecoderParams( - ts, // video start - 0, // headerOnly - std::get<0>(current_stream), // stream - long(std::get<1>( - current_stream)), // stream_id parsed from info above change to -2 - fastSeek, // fastseek - false, // read all streams - numThreads_ // global number of threads - ); - - // callback and metadata defined in Video.h - DecoderInCallback tmp_callback = callback; - succeeded = decoder.init(params, std::move(tmp_callback), &metadata); -} - -std::tuple Video::Next() { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - // if failing to decode simply return a null tensor (note, should we - // raise an exception?) - double frame_pts_s; - torch::Tensor outFrame = torch::zeros({0}, torch::kByte); - - // decode single frame - DecoderOutputMessage out; - int64_t res = decoder.decode(&out, decoderTimeoutMs); - // if successful - if (res == 0) { - frame_pts_s = double(double(out.header.pts) * 1e-6); - - auto header = out.header; - const auto& format = header.format; - - // initialize the output variables based on type - - if (format.type == TYPE_VIDEO) { - // note: this can potentially be optimized - // by having the global tensor that we fill at decode time - // (would avoid allocations) - int outHeight = format.format.video.height; - int outWidth = format.format.video.width; - int numChannels = 3; - outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte); - fillVideoTensor(out, outFrame); - outFrame = outFrame.permute({2, 0, 1}); - - } else if (format.type == TYPE_AUDIO) { - int outAudioChannels = format.format.audio.channels; - int bytesPerSample = av_get_bytes_per_sample( - static_cast(format.format.audio.format)); - int frameSizeTotal = out.payload->length(); - - 
TORCH_CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0); - int numAudioSamples = - frameSizeTotal / (outAudioChannels * bytesPerSample); - - outFrame = - torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat); - - fillAudioTensor(out, outFrame); - } - // currently not supporting other formats (will do soon) - - out.payload.reset(); - } else if (res == ENODATA) { - LOG(INFO) << "Decoder ran out of frames (ENODATA)\n"; - } else { - LOG(ERROR) << "Decoder failed with ERROR_CODE " << res; - } - - return std::make_tuple(outFrame, frame_pts_s); -} - -static auto registerVideo = - torch::class_