From 69f90df9805f4c4ad5a1d73732693c859b744ca3 Mon Sep 17 00:00:00 2001 From: Conn O'Griofa Date: Fri, 11 Nov 2022 01:37:41 +0000 Subject: [PATCH 1/2] video: force callbacks on problematic encoders * Add encoder flag "FORCE_CALLBACK" for encoders that don't perform well with callback capture. * Set new flag for "amdvce" and "nvenc" on Windows only. * When flag is enabled, reduce timeout delay to 2x frame interval, and force a callback if a timeout transpires. This should mean that a callback is guaranteed approximately every third frame. Before change: * amdvce has a noticeable issue where the last received image can stutter or freeze for a long period if the capture framerate is low (such as on desktop content). After change: * forced callback with a 2x frame delay interval (e.g. 32ms for 60fps) ensures that the last received frame will not be delayed for more than two frames, and desktop usage feels much smoother. Note: using too low of a frame delay interval will result in a forced capture rate of the client framerate (60fps), which can make low framerate content jittery. Using 2x frame delay alleviates this issue, allowing the encoder rate to throttle to ~20fps minimum, thus avoiding jitter and reducing encoder strain. 
--- src/main.cpp | 1 + src/main.h | 1 + src/platform/windows/display_vram.cpp | 10 ++++++---- src/video.cpp | 6 ++++-- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 58271f83d02..cf2d71dad41 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -45,6 +45,7 @@ bl::sources::severity_logger error(4); // Recoverable errors bl::sources::severity_logger fatal(5); // Unrecoverable errors bool display_cursor = true; +bool force_callback = false; using text_sink = bl::sinks::asynchronous_sink; boost::shared_ptr sink; diff --git a/src/main.h b/src/main.h index 89c4dbdc6f4..4267a2586a8 100644 --- a/src/main.h +++ b/src/main.h @@ -12,6 +12,7 @@ extern util::ThreadPool task_pool; extern bool display_cursor; +extern bool force_callback; extern boost::log::sources::severity_logger verbose; extern boost::log::sources::severity_logger debug; diff --git a/src/platform/windows/display_vram.cpp b/src/platform/windows/display_vram.cpp index 3cf7c97789b..29b2117007a 100644 --- a/src/platform/windows/display_vram.cpp +++ b/src/platform/windows/display_vram.cpp @@ -573,14 +573,16 @@ capture_e display_vram_t::capture(snapshot_cb_t &&snapshot_cb, std::shared_ptr<: } next_frame = now + delay; - auto status = snapshot(img.get(), 1000ms, *cursor); + auto status = snapshot(img.get(), force_callback ? 
std::chrono::duration_cast(delay * 2) : 1000ms, *cursor); switch(status) { case platf::capture_e::reinit: case platf::capture_e::error: return status; case platf::capture_e::timeout: - std::this_thread::sleep_for(1ms); - continue; + if(!force_callback) { + std::this_thread::sleep_for(1ms); + continue; + } case platf::capture_e::ok: img = snapshot_cb(img); break; @@ -885,4 +887,4 @@ int init() { return 0; } -} // namespace platf::dxgi \ No newline at end of file +} // namespace platf::dxgi diff --git a/src/video.cpp b/src/video.cpp index bec77c81b9d..1fc7be06fc2 100644 --- a/src/video.cpp +++ b/src/video.cpp @@ -240,6 +240,7 @@ enum flag_e { H264_ONLY = 0x02, // When HEVC is too heavy LIMITED_GOP_SIZE = 0x04, // Some encoders don't like it when you have an infinite GOP_SIZE. *cough* VAAPI *cough* SINGLE_SLICE_ONLY = 0x08, // Never use multiple slices <-- Older intel iGPU's ruin it for everyone else :P + FORCE_CALLBACK = 0x10, // Force callbacks with short timeouts for encoders that don't perform well with callback-based capture }; struct encoder_t { @@ -438,7 +439,7 @@ static encoder_t nvenc { "h264_nvenc"s, }, #ifdef _WIN32 - DEFAULT, + FORCE_CALLBACK, dxgi_make_hwdevice_ctx #else PARALLEL_ENCODING, @@ -474,7 +475,7 @@ static encoder_t amdvce { std::make_optional({ "qp_p"s, &config::video.qp }), "h264_amf"s, }, - DEFAULT, + FORCE_CALLBACK, dxgi_make_hwdevice_ctx }; #endif @@ -1404,6 +1405,7 @@ void capture( void *channel_data) { auto idr_events = mail->event(mail::idr); + force_callback = encoders.front().flags & FORCE_CALLBACK; idr_events->raise(true); if(encoders.front().flags & PARALLEL_ENCODING) { From 9710d47d1a28c2c8689be4c73f0443c09705baf8 Mon Sep 17 00:00:00 2001 From: Conn O'Griofa Date: Fri, 11 Nov 2022 02:05:58 +0000 Subject: [PATCH 2/2] video: amdvce: add workaround to reduce hw buffer size The AMF encoder does not seem to output frames in realtime even if the "ultralowlatency" usage profile is used. 
Reducing the HW buffers from 16 -> 2 helps to reduce latency and minimize delayed frames. Note that it is necessary to set "initial_pool_size" after av_hwframe_ctx_init() in order to resize the buffer without causing issues elsewhere in the ffmpeg code. Resizing "initial_pool_size" affects the following: * https://github.com/FFmpeg/FFmpeg/blob/a78f136f3fa039fd7ad664fd6e6e976f1448c68b/libavcodec/amfenc.c#L224 * https://github.com/FFmpeg/FFmpeg/blob/a78f136f3fa039fd7ad664fd6e6e976f1448c68b/libavcodec/amfenc.c#L278-L279 --- src/video.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/video.cpp b/src/video.cpp index 1fc7be06fc2..4417bb04108 100644 --- a/src/video.cpp +++ b/src/video.cpp @@ -71,7 +71,7 @@ util::Either dxgi_make_hwdevice_ctx(platf::hwdevice_t *hwdevice_c util::Either vaapi_make_hwdevice_ctx(platf::hwdevice_t *hwdevice_ctx); util::Either cuda_make_hwdevice_ctx(platf::hwdevice_t *hwdevice_ctx); -int hwframe_ctx(ctx_t &ctx, buffer_t &hwdevice, AVPixelFormat format); +int hwframe_ctx(ctx_t &ctx, buffer_t &hwdevice, AVPixelFormat format, bool amf_lowlatency); class swdevice_t : public platf::hwdevice_t { public: @@ -932,7 +932,7 @@ std::optional make_session(const encoder_t &encoder, const config_t & } hwdevice_ctx = std::move(buf_or_error.left()); - if(hwframe_ctx(ctx, hwdevice_ctx, sw_fmt)) { + if(hwframe_ctx(ctx, hwdevice_ctx, sw_fmt, (encoder.name == "amdvce"sv))) { return std::nullopt; } @@ -1682,7 +1682,7 @@ int init() { return 0; } -int hwframe_ctx(ctx_t &ctx, buffer_t &hwdevice, AVPixelFormat format) { +int hwframe_ctx(ctx_t &ctx, buffer_t &hwdevice, AVPixelFormat format, bool amf_lowlatency) { buffer_t frame_ref { av_hwframe_ctx_alloc(hwdevice.get()) }; auto frame_ctx = (AVHWFramesContext *)frame_ref->data; @@ -1696,6 +1696,12 @@ int hwframe_ctx(ctx_t &ctx, buffer_t &hwdevice, AVPixelFormat format) { return err; } + if(amf_lowlatency) { + // reduce amf encoder's hw buffers from 16 -> 2 to minimize buffered frames + 
// note: pool size is deliberately set after initialization + frame_ctx->initial_pool_size = 3; + } + ctx->hw_frames_ctx = av_buffer_ref(frame_ref.get()); return 0;