From fb4696e561ca06ca483e157c354506ddd22e5b8f Mon Sep 17 00:00:00 2001
From: Cameron Gutman <aicommander@gmail.com>
Date: Mon, 24 Jul 2023 20:09:34 -0500
Subject: [PATCH 1/3] Correct multi-FEC block size calculations

We were too conservative in determining our max data size before needing to split, which resulted in many frames being split into multiple FEC blocks unnecessarily.

We also just used a hardcoded split into 3 blocks instead of actually calculating how many blocks are actually required.
---
 src/stream.cpp | 84 ++++++++++++++++++++++++++++----------------------
 1 file changed, 47 insertions(+), 37 deletions(-)
diff --git a/src/stream.cpp b/src/stream.cpp
index 83e4a128b1e..02f230b7b98 100644
--- a/src/stream.cpp
+++ b/src/stream.cpp
@@ -656,7 +656,7 @@ namespace stream {
       auto parity_shards = (data_shards * fecpercentage + 99) / 100;
 
       // increase the FEC percentage for this frame if the parity shard minimum is not met
-      if (parity_shards < minparityshards) {
+      if (parity_shards < minparityshards && fecpercentage != 0) {
         parity_shards = minparityshards;
         fecpercentage = (100 * parity_shards) / data_shards;
 
@@ -664,15 +664,6 @@ namespace stream {
       }
 
       auto nr_shards = data_shards + parity_shards;
-      if (nr_shards > DATA_SHARDS_MAX) {
-        BOOST_LOG(warning)
-          << "Number of fragments for reed solomon exceeds DATA_SHARDS_MAX"sv << std::endl
-          << nr_shards << " > "sv << DATA_SHARDS_MAX
-          << ", skipping error correction"sv;
-
-        nr_shards = data_shards;
-        fecpercentage = 0;
-      }
 
       util::buffer_t<char> shards { nr_shards * (blocksize + prefixsize) };
       util::buffer_t<uint8_t *> shards_p { nr_shards };
@@ -691,7 +682,7 @@ namespace stream {
         next += copy_len;
       }
 
-      if (data_shards + parity_shards <= DATA_SHARDS_MAX) {
+      if (fecpercentage != 0) {
         // packets = parity_shards + data_shards
         rs_t rs { reed_solomon_new(data_shards, parity_shards) };
 
@@ -1336,38 +1327,57 @@ namespace stream {
 
       payload = std::string_view { (char *) payload_new.data(), payload_new.size() };
 
-      // With a fecpercentage of 255, if payload_new is broken up into more than a 100 data_shards
-      // it will generate greater than DATA_SHARDS_MAX shards.
-      // Therefore, we start breaking the data up into three separate fec blocks.
-      auto multi_fec_threshold = 90 * blocksize;
-
-      // We can go up to 4 fec blocks, but 3 is plenty
-      constexpr auto MAX_FEC_BLOCKS = 3;
+      // There are 2 bits for FEC block count for a maximum of 4 FEC blocks
+      constexpr auto MAX_FEC_BLOCKS = 4;
+
+      // The max number of data shards per block is found by solving this system of equations for D:
+      // D = 255 - P
+      // P = D * F
+      // which results in the solution:
+      // D = 255 / (1 + F)
+      // multiplied by 100 since F is the percentage as an integer:
+      // D = (255 * 100) / (100 + F)
+      auto max_data_shards_per_fec_block = (DATA_SHARDS_MAX * 100) / (100 + fecPercentage);
+
+      // Compute the number of FEC blocks needed for this frame using the block size and max shards
+      auto max_data_per_fec_block = max_data_shards_per_fec_block * blocksize;
+      auto fec_blocks_needed = (payload.size() + (max_data_per_fec_block - 1)) / max_data_per_fec_block;
+
+      // If the number of FEC blocks needed exceeds the protocol limit, turn off FEC for this frame.
+      // For normal FEC percentages, this should only happen for enormous frames (over 800 packets at 20%).
+      if (fec_blocks_needed > MAX_FEC_BLOCKS) {
+        BOOST_LOG(warning) << "Skipping FEC for abnormally large encoded frame (needed "sv << fec_blocks_needed << " FEC blocks)"sv;
+        fecPercentage = 0;
+        fec_blocks_needed = MAX_FEC_BLOCKS;
+      }
 
       std::array<std::string_view, MAX_FEC_BLOCKS> fec_blocks;
       decltype(fec_blocks)::iterator
         fec_blocks_begin = std::begin(fec_blocks),
-        fec_blocks_end = std::begin(fec_blocks) + 1;
+        fec_blocks_end = std::begin(fec_blocks) + fec_blocks_needed;
 
-      auto lastBlockIndex = 0;
-      if (payload.size() > multi_fec_threshold) {
-        BOOST_LOG(verbose) << "Generating multiple FEC blocks"sv;
+      BOOST_LOG(verbose) << "Generating "sv << fec_blocks_needed << " FEC blocks"sv;
 
-        // Align individual fec blocks to blocksize
-        auto unaligned_size = payload.size() / MAX_FEC_BLOCKS;
-        auto aligned_size = ((unaligned_size + (blocksize - 1)) / blocksize) * blocksize;
+      // Align individual FEC blocks to blocksize
+      auto unaligned_size = payload.size() / fec_blocks_needed;
+      auto aligned_size = ((unaligned_size + (blocksize - 1)) / blocksize) * blocksize;
 
-        // Break the data up into 3 blocks, each containing multiple complete video packets.
-        fec_blocks[0] = payload.substr(0, aligned_size);
-        fec_blocks[1] = payload.substr(aligned_size, aligned_size);
-        fec_blocks[2] = payload.substr(aligned_size * 2);
-
-        lastBlockIndex = 2 << 6;
-        fec_blocks_end = std::end(fec_blocks);
+      // If we exceed the 10-bit FEC packet index (which means our frame exceeded 4096 packets),
+      // the frame will be unrecoverable. Log an error for this case.
+      if (aligned_size / blocksize >= 1024) {
+        BOOST_LOG(error) << "Encoder produced a frame too large to send! Is the encoder broken? (needed "sv << (aligned_size / blocksize) << " packets)"sv;
       }
-      else {
-        BOOST_LOG(verbose) << "Generating single FEC block"sv;
-        fec_blocks[0] = payload;
+
+      // Split the data into aligned FEC blocks
+      for (int x = 0; x < fec_blocks_needed; ++x) {
+        if (x == fec_blocks_needed - 1) {
+          // The last block must extend to the end of the payload
+          fec_blocks[x] = payload.substr(x * aligned_size);
+        }
+        else {
+          // Earlier blocks just extend to the next block offset
+          fec_blocks[x] = payload.substr(x * aligned_size, aligned_size);
+        }
       }
 
       try {
@@ -1383,7 +1393,7 @@ namespace stream {
 
             // Match multiFecFlags with Moonlight
             inspect->packet.multiFecFlags = 0x10;
-            inspect->packet.multiFecBlocks = (blockIndex << 4) | lastBlockIndex;
+            inspect->packet.multiFecBlocks = (blockIndex << 4) | ((fec_blocks_needed - 1) << 6);
 
             if (x == 0) {
               inspect->packet.flags |= FLAG_SOF;
@@ -1415,7 +1425,7 @@ namespace stream {
             inspect->rtp.sequenceNumber = util::endian::big<uint16_t>(lowseq + x);
             inspect->rtp.timestamp = util::endian::big<uint32_t>(timestamp);
 
-            inspect->packet.multiFecBlocks = (blockIndex << 4) | lastBlockIndex;
+            inspect->packet.multiFecBlocks = (blockIndex << 4) | ((fec_blocks_needed - 1) << 6);
             inspect->packet.frameIndex = packet->frame_index();
 
             // Encrypt this shard if video encryption is enabled

From 945d1f838336a1b10f2c8918ddb8455a9b56544b Mon Sep 17 00:00:00 2001
From: ns6089 <61738816+ns6089@users.noreply.github.com>
Date: Thu, 4 Jul 2024 15:41:27 +0300
Subject: [PATCH 2/3] Maximize timer precision on Windows

---
 cmake/compile_definitions/windows.cmake |  1 +
 src/platform/windows/misc.cpp           | 55 +++++++++++++++++++++++--
 2 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/cmake/compile_definitions/windows.cmake b/cmake/compile_definitions/windows.cmake
index fa5c0614d5a..7643d1d9efc 100644
--- a/cmake/compile_definitions/windows.cmake
+++ b/cmake/compile_definitions/windows.cmake
@@ -67,6 +67,7 @@ list(PREPEND PLATFORM_LIBRARIES
         libstdc++.a
         libwinpthread.a
         libssp.a
+        ntdll
         ksuser
         wsock32
         ws2_32
diff --git a/src/platform/windows/misc.cpp b/src/platform/windows/misc.cpp
index c2f2134802c..32c5da98abc 100644
--- a/src/platform/windows/misc.cpp
+++ b/src/platform/windows/misc.cpp
@@ -61,6 +61,38 @@
   #define WLAN_API_MAKE_VERSION(_major, _minor) (((DWORD) (_minor)) << 16 | (_major))
 #endif
 
+#include <winternl.h>
+extern "C" {
+NTSTATUS NTAPI
+NtSetTimerResolution(ULONG DesiredResolution, BOOLEAN SetResolution, PULONG CurrentResolution);
+}
+
+namespace {
+
+  std::atomic<bool> used_nt_set_timer_resolution = false;
+
+  bool
+  nt_set_timer_resolution_max() {
+    ULONG minimum, maximum, current;
+    if (!NT_SUCCESS(NtQueryTimerResolution(&minimum, &maximum, &current)) ||
+        !NT_SUCCESS(NtSetTimerResolution(maximum, TRUE, &current))) {
+      return false;
+    }
+    return true;
+  }
+
+  bool
+  nt_set_timer_resolution_min() {
+    ULONG minimum, maximum, current;
+    if (!NT_SUCCESS(NtQueryTimerResolution(&minimum, &maximum, &current)) ||
+        !NT_SUCCESS(NtSetTimerResolution(minimum, TRUE, &current))) {
+      return false;
+    }
+    return true;
+  }
+
+}  // namespace
+
 namespace bp = boost::process;
 
 using namespace std::literals;
@@ -1115,8 +1147,15 @@ namespace platf {
     // Enable MMCSS scheduling for DWM
     DwmEnableMMCSS(true);
 
-    // Reduce timer period to 1ms
-    timeBeginPeriod(1);
+    // Reduce timer period to 0.5ms
+    if (nt_set_timer_resolution_max()) {
+      used_nt_set_timer_resolution = true;
+    }
+    else {
+      BOOST_LOG(error) << "NtSetTimerResolution() failed, falling back to timeBeginPeriod()";
+      timeBeginPeriod(1);
+      used_nt_set_timer_resolution = false;
+    }
 
     // Promote ourselves to high priority class
     SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
@@ -1199,8 +1238,16 @@ namespace platf {
     // Demote ourselves back to normal priority class
     SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS);
 
-    // End our 1ms timer request
-    timeEndPeriod(1);
+    // End our 0.5ms timer request
+    if (used_nt_set_timer_resolution) {
+      used_nt_set_timer_resolution = false;
+      if (!nt_set_timer_resolution_min()) {
+        BOOST_LOG(error) << "nt_set_timer_resolution_min() failed even though nt_set_timer_resolution_max() succeeded";
+      }
+    }
+    else {
+      timeEndPeriod(1);
+    }
 
     // Disable MMCSS scheduling for DWM
     DwmEnableMMCSS(false);

From 4fb843878a229deffbadf1a917db6d4edd43652b Mon Sep 17 00:00:00 2001
From: ns6089 <61738816+ns6089@users.noreply.github.com>
Date: Thu, 4 Jul 2024 15:49:14 +0300
Subject: [PATCH 3/3] Add static pacing to video network packets

---
 src/platform/common.h                 |  28 +++++
 src/platform/linux/misc.cpp           |  17 +++
 src/platform/macos/misc.mm            |  16 +++
 src/platform/windows/display.h        |   5 +-
 src/platform/windows/display_base.cpp |  35 +-----
 src/platform/windows/misc.cpp         |  51 ++++++++
 src/stream.cpp                        | 162 +++++++++++++++++++++-----
 src/utility.h                         |   3 +-
 8 files changed, 255 insertions(+), 62 deletions(-)

diff --git a/src/platform/common.h b/src/platform/common.h
index f42d366dac0..8a92c269e29 100644
--- a/src/platform/common.h
+++ b/src/platform/common.h
@@ -10,6 +10,8 @@
 #include <mutex>
 #include <string>
 
+#include <boost/core/noncopyable.hpp>
+
 #include "src/config.h"
 #include "src/logging.h"
 #include "src/stat_trackers.h"
@@ -799,4 +801,30 @@ namespace platf {
    */
   std::vector<supported_gamepad_t> &
   supported_gamepads(input_t *input);
+
+  struct high_precision_timer: private boost::noncopyable {
+    virtual ~high_precision_timer() = default;
+
+    /**
+     * @brief Sleep for the duration
+     * @param duration Sleep duration
+     */
+    virtual void
+    sleep_for(const std::chrono::nanoseconds &duration) = 0;
+
+    /**
+     * @brief Check if platform-specific timer backend has been initialized successfully
+     * @return `true` on success, `false` on error
+     */
+    virtual
+    operator bool() = 0;
+  };
+
+  /**
+   * @brief Create platform-specific timer capable of high-precision sleep
+   * @return A unique pointer to timer
+   */
+  std::unique_ptr<high_precision_timer>
+  create_high_precision_timer();
+
 }  // namespace platf
diff --git a/src/platform/linux/misc.cpp b/src/platform/linux/misc.cpp
index 551fab2c57e..25ea197046e 100644
--- a/src/platform/linux/misc.cpp
+++ b/src/platform/linux/misc.cpp
@@ -933,4 +933,21 @@ namespace platf {
 
     return std::make_unique<deinit_t>();
   }
+
+  class linux_high_precision_timer: public high_precision_timer {
+  public:
+    void
+    sleep_for(const std::chrono::nanoseconds &duration) override {
+      std::this_thread::sleep_for(duration);
+    }
+
+    operator bool() override {
+      return true;
+    }
+  };
+
+  std::unique_ptr<high_precision_timer>
+  create_high_precision_timer() {
+    return std::make_unique<linux_high_precision_timer>();
+  }
 }  // namespace platf
diff --git a/src/platform/macos/misc.mm b/src/platform/macos/misc.mm
index b075c1c24de..ae5f266da6d 100644
--- a/src/platform/macos/misc.mm
+++ b/src/platform/macos/misc.mm
@@ -507,6 +507,22 @@
     return std::make_unique<qos_t>(sockfd, reset_options);
   }
 
+  class macos_high_precision_timer: public high_precision_timer {
+  public:
+    void
+    sleep_for(const std::chrono::nanoseconds &duration) override {
+      std::this_thread::sleep_for(duration);
+    }
+
+    operator bool() override {
+      return true;
+    }
+  };
+
+  std::unique_ptr<high_precision_timer>
+  create_high_precision_timer() {
+    return std::make_unique<macos_high_precision_timer>();
+  }
 }  // namespace platf
 
 namespace dyn {
diff --git a/src/platform/windows/display.h b/src/platform/windows/display.h
index b257abb8e85..3e035490394 100644
--- a/src/platform/windows/display.h
+++ b/src/platform/windows/display.h
@@ -161,9 +161,6 @@ namespace platf::dxgi {
     int
     init(const ::video::config_t &config, const std::string &display_name);
 
-    void
-    high_precision_sleep(std::chrono::nanoseconds duration);
-
     capture_e
     capture(const push_captured_image_cb_t &push_captured_image_cb, const pull_free_image_cb_t &pull_free_image_cb, bool *cursor) override;
 
@@ -184,7 +181,7 @@ namespace platf::dxgi {
     DXGI_FORMAT capture_format;
     D3D_FEATURE_LEVEL feature_level;
 
-    util::safe_ptr_v2<std::remove_pointer_t<HANDLE>, BOOL, CloseHandle> timer;
+    std::unique_ptr<high_precision_timer> timer = create_high_precision_timer();
 
     typedef enum _D3DKMT_SCHEDULINGPRIORITYCLASS {
       D3DKMT_SCHEDULINGPRIORITYCLASS_IDLE,  ///< Idle priority class
diff --git a/src/platform/windows/display_base.cpp b/src/platform/windows/display_base.cpp
index 664fff8780f..6ab0f0c2c81 100644
--- a/src/platform/windows/display_base.cpp
+++ b/src/platform/windows/display_base.cpp
@@ -182,27 +182,6 @@ namespace platf::dxgi {
     release_frame();
   }
 
-  void
-  display_base_t::high_precision_sleep(std::chrono::nanoseconds duration) {
-    if (!timer) {
-      BOOST_LOG(error) << "Attempting high_precision_sleep() with uninitialized timer";
-      return;
-    }
-    if (duration < 0s) {
-      BOOST_LOG(error) << "Attempting high_precision_sleep() with negative duration";
-      return;
-    }
-    if (duration > 5s) {
-      BOOST_LOG(error) << "Attempting high_precision_sleep() with unexpectedly large duration (>5s)";
-      return;
-    }
-
-    LARGE_INTEGER due_time;
-    due_time.QuadPart = duration.count() / -100;
-    SetWaitableTimer(timer.get(), &due_time, 0, nullptr, nullptr, false);
-    WaitForSingleObject(timer.get(), INFINITE);
-  }
-
   capture_e
   display_base_t::capture(const push_captured_image_cb_t &push_captured_image_cb, const pull_free_image_cb_t &pull_free_image_cb, bool *cursor) {
     auto adjust_client_frame_rate = [&]() -> DXGI_RATIONAL {
@@ -268,7 +247,7 @@ namespace platf::dxgi {
           status = capture_e::timeout;
         }
         else {
-          high_precision_sleep(sleep_period);
+          timer->sleep_for(sleep_period);
           std::chrono::nanoseconds overshoot_ns = std::chrono::steady_clock::now() - sleep_target;
           log_sleep_overshoot(overshoot_ns);
 
@@ -799,15 +778,9 @@ namespace platf::dxgi {
         << "Max Full Luminance : "sv << desc1.MaxFullFrameLuminance << " nits"sv;
     }
 
-    // Use CREATE_WAITABLE_TIMER_HIGH_RESOLUTION if supported (Windows 10 1809+)
-    timer.reset(CreateWaitableTimerEx(nullptr, nullptr, CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, TIMER_ALL_ACCESS));
-    if (!timer) {
-      timer.reset(CreateWaitableTimerEx(nullptr, nullptr, 0, TIMER_ALL_ACCESS));
-      if (!timer) {
-        auto winerr = GetLastError();
-        BOOST_LOG(error) << "Failed to create timer: "sv << winerr;
-        return -1;
-      }
+    if (!timer || !*timer) {
+      BOOST_LOG(error) << "Uninitialized high precision timer";
+      return -1;
     }
 
     return 0;
diff --git a/src/platform/windows/misc.cpp b/src/platform/windows/misc.cpp
index 32c5da98abc..82632537ee2 100644
--- a/src/platform/windows/misc.cpp
+++ b/src/platform/windows/misc.cpp
@@ -1803,4 +1803,55 @@ namespace platf {
 
     return output;
   }
+
+  class win32_high_precision_timer: public high_precision_timer {
+  public:
+    win32_high_precision_timer() {
+      // Use CREATE_WAITABLE_TIMER_HIGH_RESOLUTION if supported (Windows 10 1809+)
+      timer = CreateWaitableTimerEx(nullptr, nullptr, CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, TIMER_ALL_ACCESS);
+      if (!timer) {
+        timer = CreateWaitableTimerEx(nullptr, nullptr, 0, TIMER_ALL_ACCESS);
+        if (!timer) {
+          BOOST_LOG(error) << "Unable to create high_precision_timer, CreateWaitableTimerEx() failed: " << GetLastError();
+        }
+      }
+    }
+
+    ~win32_high_precision_timer() {
+      if (timer) CloseHandle(timer);
+    }
+
+    void
+    sleep_for(const std::chrono::nanoseconds &duration) override {
+      if (!timer) {
+        BOOST_LOG(error) << "Attempting high_precision_timer::sleep_for() with uninitialized timer";
+        return;
+      }
+      if (duration < 0s) {
+        BOOST_LOG(error) << "Attempting high_precision_timer::sleep_for() with negative duration";
+        return;
+      }
+      if (duration > 5s) {
+        BOOST_LOG(error) << "Attempting high_precision_timer::sleep_for() with unexpectedly large duration (>5s)";
+        return;
+      }
+
+      LARGE_INTEGER due_time;
+      due_time.QuadPart = duration.count() / -100;
+      SetWaitableTimer(timer, &due_time, 0, nullptr, nullptr, false);
+      WaitForSingleObject(timer, INFINITE);
+    }
+
+    operator bool() override {
+      return timer != NULL;
+    }
+
+  private:
+    HANDLE timer = NULL;
+  };
+
+  std::unique_ptr<high_precision_timer>
+  create_high_precision_timer() {
+    return std::make_unique<win32_high_precision_timer>();
+  }
 }  // namespace platf
diff --git a/src/stream.cpp b/src/stream.cpp
index 02f230b7b98..46887c3a0aa 100644
--- a/src/stream.cpp
+++ b/src/stream.cpp
@@ -29,6 +29,8 @@ extern "C" {
 #include "thread_safe.h"
 #include "utility.h"
 
+#include "platform/common.h"
+
 #define IDX_START_A 0
 #define IDX_START_B 1
 #define IDX_INVALIDATE_REF_FRAMES 2
@@ -1246,13 +1248,29 @@ namespace stream {
     platf::adjust_thread_priority(platf::thread_priority_e::high);
 
     stat_trackers::min_max_avg_tracker<uint16_t> frame_processing_latency_tracker;
+
+    stat_trackers::min_max_avg_tracker<double> frame_send_batch_latency_tracker;
+    stat_trackers::min_max_avg_tracker<double> frame_fec_latency_tracker;
+    stat_trackers::min_max_avg_tracker<double> frame_network_latency_tracker;
+
     crypto::aes_t iv(12);
 
+    auto timer = platf::create_high_precision_timer();
+    if (!timer || !*timer) {
+      BOOST_LOG(error) << "Failed to create timer, aborting video broadcast thread";
+      return;
+    }
+
+    auto ratecontrol_next_frame_start = std::chrono::steady_clock::now();
+
     while (auto packet = packets->pop()) {
       if (shutdown_event->peek()) {
         break;
       }
 
+      auto frame_packet_start_time = std::chrono::steady_clock::now();
+      std::chrono::nanoseconds fec_time = 0ns;
+
       auto session = (session_t *) packet->channel_data;
       auto lowseq = session->video.lowseq;
 
@@ -1381,6 +1399,21 @@ namespace stream {
       }
 
       try {
+        // Use around 80% of 1Gbps          1Gbps            percent    ms     packet      byte
+        size_t ratecontrol_packets_in_1ms = std::giga::num * 80 / 100 / 1000 / blocksize / 8;
+
+        // Send less than 64K in a single batch.
+        // On Windows, batches above 64K seem to bypass SO_SNDBUF regardless of its size,
+        // appear in "Other I/O" and begin waiting for interrupts.
+        // This gives inconsistent performance so we'd rather avoid it.
+        size_t send_batch_size = 64 * 1024 / blocksize;
+
+        // Don't ignore the last ratecontrol group of the previous frame
+        auto ratecontrol_frame_start = std::max(ratecontrol_next_frame_start, std::chrono::steady_clock::now());
+
+        size_t ratecontrol_frame_packets_sent = 0;
+        size_t ratecontrol_group_packets_sent = 0;
+
         auto blockIndex = 0;
         std::for_each(fec_blocks_begin, fec_blocks_end, [&](std::string_view &current_payload) {
           auto packets = (current_payload.size() + (blocksize - 1)) / blocksize;
@@ -1404,10 +1437,27 @@ namespace stream {
             }
           }
 
+          auto fec_start = std::chrono::steady_clock::now();
+
           // If video encryption is enabled, we allocate space for the encryption header before each shard
           auto shards = fec::encode(current_payload, blocksize, fecPercentage, session->config.minRequiredFecPackets,
             session->video.cipher ? sizeof(video_packet_enc_prefix_t) : 0);
 
+          fec_time += std::chrono::steady_clock::now() - fec_start;
+
+          auto peer_address = session->video.peer.address();
+          auto batch_info = platf::batched_send_info_t {
+            nullptr,
+            shards.prefixsize + shards.blocksize,
+            0,
+            (uintptr_t) sock.native_handle(),
+            peer_address,
+            session->video.peer.port(),
+            session->localAddress,
+          };
+
+          size_t next_shard_to_send = 0;
+
           // set FEC info now that we know for sure what our percentage will be for this frame
           for (auto x = 0; x < shards.size(); ++x) {
             auto *inspect = (video_packet_raw_t *) shards.data(x);
@@ -1448,37 +1498,89 @@ namespace stream {
               std::copy(std::begin(iv), std::end(iv), prefix->iv);
               session->video.cipher->encrypt(std::string_view { (char *) inspect, (size_t) blocksize }, prefix->tag, &iv);
             }
-          }
 
-          auto peer_address = session->video.peer.address();
-          auto batch_info = platf::batched_send_info_t {
-            shards.shards.begin(),
-            shards.prefixsize + shards.blocksize,
-            shards.nr_shards,
-            (uintptr_t) sock.native_handle(),
-            peer_address,
-            session->video.peer.port(),
-            session->localAddress,
-          };
+            if (x - next_shard_to_send + 1 >= send_batch_size ||
+                x + 1 == shards.size()) {
+              // Do pacing within the frame.
+              // Also trigger pacing before the first send_batch() of the frame
+              // to account for the last send_batch() of the previous frame.
+              if (ratecontrol_group_packets_sent >= ratecontrol_packets_in_1ms ||
+                  ratecontrol_frame_packets_sent == 0) {
+                auto due = ratecontrol_frame_start +
+                           std::chrono::duration_cast<std::chrono::nanoseconds>(1ms) *
+                             ratecontrol_frame_packets_sent / ratecontrol_packets_in_1ms;
+
+                auto now = std::chrono::steady_clock::now();
+                if (now < due) {
+                  timer->sleep_for(due - now);
+                }
+
+                ratecontrol_group_packets_sent = 0;
+              }
 
-          // Use a batched send if it's supported on this platform
-          if (!platf::send_batch(batch_info)) {
-            // Batched send is not available, so send each packet individually
-            BOOST_LOG(verbose) << "Falling back to unbatched send"sv;
-            for (auto x = 0; x < shards.size(); ++x) {
-              auto send_info = platf::send_info_t {
-                shards.prefix(x),
-                shards.prefixsize + shards.blocksize,
-                (uintptr_t) sock.native_handle(),
-                peer_address,
-                session->video.peer.port(),
-                session->localAddress,
-              };
-
-              platf::send(send_info);
+              size_t current_batch_size = x - next_shard_to_send + 1;
+              batch_info.buffer = shards.prefix(next_shard_to_send);
+              batch_info.block_count = current_batch_size;
+
+              auto batch_start_time = std::chrono::steady_clock::now();
+              // Use a batched send if it's supported on this platform
+              if (!platf::send_batch(batch_info)) {
+                // Batched send is not available, so send each packet individually
+                BOOST_LOG(verbose) << "Falling back to unbatched send"sv;
+                for (auto y = 0; y < current_batch_size; y++) {
+                  auto send_info = platf::send_info_t {
+                    shards.prefix(next_shard_to_send + y),
+                    shards.prefixsize + shards.blocksize,
+                    (uintptr_t) sock.native_handle(),
+                    peer_address,
+                    session->video.peer.port(),
+                    session->localAddress,
+                  };
+
+                  platf::send(send_info);
+                }
+              }
+              if (config::sunshine.min_log_level <= 1) {
+                // Print send_batch() latency stats to debug log every 20 seconds
+                auto print_info = [&](double min_latency, double max_latency, double avg_latency) {
+                  auto f = stat_trackers::one_digit_after_decimal();
+                  BOOST_LOG(debug) << "Network: individual send_batch() latency (min/max/avg): " << f % min_latency << "ms/" << f % max_latency << "ms/" << f % avg_latency << "ms";
+                };
+                double send_batch_latency = (std::chrono::steady_clock::now() - batch_start_time).count() / 1000000.;
+                frame_send_batch_latency_tracker.collect_and_callback_on_interval(send_batch_latency, print_info, 20s);
+              }
+
+              ratecontrol_group_packets_sent += current_batch_size;
+              ratecontrol_frame_packets_sent += current_batch_size;
+              next_shard_to_send = x + 1;
             }
           }
 
+          // remember this in case the next frame comes immediately
+          ratecontrol_next_frame_start = ratecontrol_frame_start +
+                                         std::chrono::duration_cast<std::chrono::nanoseconds>(1ms) *
+                                           ratecontrol_frame_packets_sent / ratecontrol_packets_in_1ms;
+
+          if (config::sunshine.min_log_level <= 1) {
+            // Print frame FEC latency stats to debug log every 20 seconds
+            auto print_info = [&](double min_latency, double max_latency, double avg_latency) {
+              auto f = stat_trackers::one_digit_after_decimal();
+              BOOST_LOG(debug) << "Network: frame FEC latency (min/max/avg): " << f % min_latency << "ms/" << f % max_latency << "ms/" << f % avg_latency << "ms";
+            };
+            double fec_latency = fec_time.count() / 1000000.;
+            frame_fec_latency_tracker.collect_and_callback_on_interval(fec_latency, print_info, 20s);
+          }
+
+          if (config::sunshine.min_log_level <= 1) {
+            // Print frame network latency stats to debug log every 20 seconds
+            auto print_info = [&](double min_latency, double max_latency, double avg_latency) {
+              auto f = stat_trackers::one_digit_after_decimal();
+              BOOST_LOG(debug) << "Network: frame complete network latency (min/max/avg): " << f % min_latency << "ms/" << f % max_latency << "ms/" << f % avg_latency << "ms";
+            };
+            double network_latency = (std::chrono::steady_clock::now() - frame_packet_start_time).count() / 1000000.;
+            frame_network_latency_tracker.collect_and_callback_on_interval(network_latency, print_info, 20s);
+          }
+
           if (packet->is_idr()) {
             BOOST_LOG(verbose) << "Key Frame ["sv << packet->frame_index() << "] :: send ["sv << shards.size() << "] shards..."sv;
           }
@@ -1628,6 +1730,14 @@ namespace stream {
       return -1;
     }
 
+    // Set video socket send buffer size (SO_SENDBUF) to 1MB
+    try {
+      ctx.video_sock.set_option(boost::asio::socket_base::send_buffer_size(1024 * 1024));
+    }
+    catch (...) {
+      BOOST_LOG(error) << "Failed to set video socket send buffer size (SO_SENDBUF)";
+    }
+
     ctx.video_sock.bind(udp::endpoint(protocol, video_port), ec);
     if (ec) {
       BOOST_LOG(fatal) << "Couldn't bind Video server to port ["sv << video_port << "]: "sv << ec.message();
diff --git a/src/utility.h b/src/utility.h
index 3da006f8d47..c3b4f3d813b 100644
--- a/src/utility.h
+++ b/src/utility.h
@@ -606,7 +606,8 @@ namespace util {
       return _deleter;
     }
 
-    explicit operator bool() const {
+    explicit
+    operator bool() const {
       return _p != nullptr;
     }