diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.Fcntl.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.Fcntl.cs
index 71c97699fd07e7..0ce5d006a059ae 100644
--- a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.Fcntl.cs
+++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.Fcntl.cs
@@ -22,6 +22,9 @@ internal static partial class Fcntl
[LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_FcntlSetFD", SetLastError = true)]
internal static partial int SetFD(SafeHandle fd, int flags);
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_FcntlSetFD", SetLastError = true)]
+ internal static partial int SetFD(IntPtr fd, int flags);
+
[LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_FcntlGetFD", SetLastError = true)]
internal static partial int GetFD(SafeHandle fd);
diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.IoUringShim.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.IoUringShim.cs
new file mode 100644
index 00000000000000..1a2216d8d6723c
--- /dev/null
+++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.IoUringShim.cs
@@ -0,0 +1,58 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Runtime.InteropServices;
+
+internal static partial class Interop
+{
+ internal static partial class Sys
+ {
+ /// Wraps io_uring_setup(2): creates an io_uring instance.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimSetup")]
+ internal static unsafe partial Error IoUringShimSetup(
+ uint entries, void* parms, int* ringFd);
+
+ /// Wraps io_uring_enter(2): submits SQEs and/or waits for CQEs.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimEnter")]
+ internal static unsafe partial Error IoUringShimEnter(
+ int ringFd, uint toSubmit, uint minComplete, uint flags, int* result);
+
+ /// Wraps io_uring_enter2(2) with IORING_ENTER_EXT_ARG for bounded waits.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimEnterExt")]
+ internal static unsafe partial Error IoUringShimEnterExt(
+ int ringFd, uint toSubmit, uint minComplete, uint flags, void* arg, int* result);
+
+ /// Wraps io_uring_register(2): registers resources (files, buffers, ring fds).
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimRegister")]
+ internal static unsafe partial Error IoUringShimRegister(
+ int ringFd, uint opcode, void* arg, uint nrArgs, int* result);
+
+ /// Wraps mmap(2): maps io_uring SQ/CQ ring memory.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimMmap")]
+ internal static unsafe partial Error IoUringShimMmap(
+ int ringFd, ulong size, ulong offset, void** mappedPtr);
+
+ /// Wraps munmap(2): unmaps io_uring ring memory.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimMunmap")]
+ internal static unsafe partial Error IoUringShimMunmap(
+ void* addr, ulong size);
+
+ /// Creates an eventfd for io_uring wakeup signaling.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimCreateEventFd")]
+ internal static unsafe partial Error IoUringShimCreateEventFd(
+ int* eventFd);
+
+ /// Writes to an eventfd to wake the io_uring event loop.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimWriteEventFd")]
+ internal static partial Error IoUringShimWriteEventFd(int eventFd);
+
+ /// Reads from an eventfd to consume a wakeup signal.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimReadEventFd")]
+ internal static unsafe partial Error IoUringShimReadEventFd(
+ int eventFd, ulong* value);
+
+ /// Wraps close(2): closes a file descriptor.
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimCloseFd")]
+ internal static partial Error IoUringShimCloseFd(int fd);
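+
+        // Typical lifecycle (sketch, assuming the managed event loop drives these entry points):
+        // IoUringShimSetup creates the ring fd and fills io_uring_params; IoUringShimMmap maps the
+        // SQ/CQ rings using the offsets returned in those params; IoUringShimEnter/IoUringShimEnterExt
+        // submit SQEs and wait for CQEs; IoUringShimMunmap and IoUringShimCloseFd tear the instance down.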
+ }
+}
diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.Linux.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.Linux.cs
new file mode 100644
index 00000000000000..1472d04c8b676a
--- /dev/null
+++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.Linux.cs
@@ -0,0 +1,150 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Net.Sockets;
+using System.Runtime.InteropServices;
+
+internal static partial class Interop
+{
+ internal static partial class Sys
+ {
+ /// Derived SQ ring state computed after mmap, used by the managed submission path.
+ [StructLayout(LayoutKind.Sequential)]
+ internal struct IoUringSqRingInfo
+ {
+ public IntPtr SqeBase;
+ public IntPtr SqTailPtr;
+ public IntPtr SqHeadPtr;
+ public uint SqMask;
+ public uint SqEntries;
+ public uint SqeSize;
+ public byte UsesNoSqArray;
+ public int RingFd;
+ public int RegisteredRingFd;
+ public byte UsesEnterExtArg;
+ public byte UsesRegisteredFiles;
+ }
+
+ /// Mirrors kernel struct io_sqring_offsets (40 bytes). Fields at offset 28+ (resv1, user_addr) are unused.
+ [StructLayout(LayoutKind.Explicit, Size = 40)]
+ internal struct IoUringSqOffsets
+ {
+ [FieldOffset(0)] public uint Head;
+ [FieldOffset(4)] public uint Tail;
+ [FieldOffset(8)] public uint RingMask;
+ [FieldOffset(12)] public uint RingEntries;
+ [FieldOffset(16)] public uint Flags;
+ [FieldOffset(20)] public uint Dropped;
+ [FieldOffset(24)] public uint Array;
+ // resv1 at 28, user_addr at 32 - not needed by managed code
+ }
+
+ /// Mirrors kernel struct io_cqring_offsets (40 bytes). Fields at offset 28+ (resv1, user_addr) are unused.
+ [StructLayout(LayoutKind.Explicit, Size = 40)]
+ internal struct IoUringCqOffsets
+ {
+ [FieldOffset(0)] public uint Head;
+ [FieldOffset(4)] public uint Tail;
+ [FieldOffset(8)] public uint RingMask;
+ [FieldOffset(12)] public uint RingEntries;
+ [FieldOffset(16)] public uint Overflow;
+ [FieldOffset(20)] public uint Cqes;
+ [FieldOffset(24)] public uint Flags;
+ // resv1 at 28, user_addr at 32 - not needed by managed code
+ }
+
+ /// Mirrors kernel struct io_uring_params (120 bytes), passed to io_uring_setup.
+ [StructLayout(LayoutKind.Explicit, Size = 120)]
+ internal struct IoUringParams
+ {
+ [FieldOffset(0)] public uint SqEntries;
+ [FieldOffset(4)] public uint CqEntries;
+ [FieldOffset(8)] public uint Flags;
+ [FieldOffset(12)] public uint SqThreadCpu;
+ [FieldOffset(16)] public uint SqThreadIdle;
+ [FieldOffset(20)] public uint Features;
+ [FieldOffset(24)] public uint WqFd;
+ // resv[3] at 28-39
+ [FieldOffset(40)] public IoUringSqOffsets SqOff;
+ [FieldOffset(80)] public IoUringCqOffsets CqOff;
+ }
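+
+        // For cross-checking the explicit offsets above against the kernel UAPI
+        // (linux/io_uring.h, shown as an illustrative C sketch, not compiled here):
+        //   struct io_uring_params {
+        //       __u32 sq_entries, cq_entries, flags, sq_thread_cpu, sq_thread_idle;
+        //       __u32 features, wq_fd, resv[3];      // resv occupies offsets 28-39
+        //       struct io_sqring_offsets sq_off;     // offset 40
+        //       struct io_cqring_offsets cq_off;     // offset 80
+        //   };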
+
+ /// Mirrors kernel struct io_uring_cqe (16 bytes), read from the CQ ring.
+ [StructLayout(LayoutKind.Explicit, Size = 16)]
+ internal struct IoUringCqe
+ {
+ [FieldOffset(0)] public ulong UserData;
+ [FieldOffset(8)] public int Result;
+ [FieldOffset(12)] public uint Flags;
+ }
+
+ /// Mirrors kernel struct io_uring_buf (16 bytes), used by provided-buffer rings.
+ [StructLayout(LayoutKind.Explicit, Size = 16)]
+ internal struct IoUringBuf
+ {
+ [FieldOffset(0)] public ulong Address;
+ [FieldOffset(8)] public uint Length;
+ [FieldOffset(12)] public ushort BufferId;
+ [FieldOffset(14)] public ushort Reserved;
+ }
+
+        /// <summary>
+        /// Mirrors the header overlay of kernel struct io_uring_buf_ring (16 bytes).
+        /// In UAPI this shares offset 0 with the first io_uring_buf entry via a union.
+        /// </summary>
+ [StructLayout(LayoutKind.Explicit, Size = 16)]
+ internal struct IoUringBufRingHeader
+ {
+ [FieldOffset(0)] public ulong Reserved1;
+ [FieldOffset(8)] public uint Reserved2;
+ [FieldOffset(12)] public ushort Reserved3;
+ [FieldOffset(14)] public ushort Tail;
+ }
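+
+        // Kernel UAPI reference for the overlay above (illustrative C, from linux/io_uring.h):
+        //   struct io_uring_buf_ring {
+        //       union {
+        //           struct { __u64 resv1; __u32 resv2; __u16 resv3; __u16 tail; };
+        //           struct io_uring_buf bufs[];      // flexible array of 16-byte entries
+        //       };
+        //   };
+        // The tail overlays the first entry's trailing fields, which is why the header struct
+        // here mirrors io_uring_buf's 16-byte shape.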
+
+ /// Mirrors kernel struct io_uring_buf_reg (40 bytes), used for pbuf ring registration.
+ [StructLayout(LayoutKind.Explicit, Size = 40)]
+ internal struct IoUringBufReg
+ {
+ [FieldOffset(0)] public ulong RingAddress;
+ [FieldOffset(8)] public uint RingEntries;
+ [FieldOffset(12)] public ushort BufferGroupId;
+ [FieldOffset(14)] public ushort Padding;
+ [FieldOffset(16)] public ulong Reserved0;
+ [FieldOffset(24)] public ulong Reserved1;
+ [FieldOffset(32)] public ulong Reserved2;
+ }
+
+ /// Derived CQ ring state computed after mmap, used by the managed completion drain path.
+ [StructLayout(LayoutKind.Sequential)]
+ internal struct IoUringCqRingInfo
+ {
+ public IntPtr CqeBase; // io_uring_cqe* base of CQE array
+ public IntPtr CqTailPtr; // uint32_t* kernel writes CQ tail
+ public IntPtr CqHeadPtr; // uint32_t* managed advances CQ head
+ public uint CqMask; // CqEntries - 1
+ public uint CqEntries; // number of CQ slots
+ public uint CqeSize; // sizeof(io_uring_cqe) = 16
+ public IntPtr CqOverflowPtr; // uint32_t* kernel CQ overflow counter
+ }
+
+ /// Mirrors kernel struct io_uring_getevents_arg, used with IORING_ENTER_EXT_ARG.
+ [StructLayout(LayoutKind.Sequential)]
+ internal struct IoUringGeteventsArg
+ {
+ public ulong Sigmask;
+ public uint SigmaskSize;
+ public uint MinWaitUsec;
+ public ulong Ts;
+ }
+
+ /// Mirrors kernel struct __kernel_timespec, used for io_uring timeout arguments.
+ [StructLayout(LayoutKind.Sequential)]
+ internal struct IoUringKernelTimespec
+ {
+ public long TvSec;
+ public long TvNsec;
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj b/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj
index bdb03b5a7b5548..89eaa2c02785ea 100644
--- a/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj
+++ b/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj
@@ -197,9 +197,34 @@
+    <Compile Include="System\Net\Sockets\IoUringProvidedBufferRing.Linux.cs" />
+    <Compile Include="System\Net\Sockets\MpscQueue.cs" />
+    <Compile Include="System\Net\Sockets\SocketAsyncContext.IoUring.Linux.cs" />
+    <Compile Include="$(CommonPath)Interop\Unix\System.Native\Interop.IoUringShim.cs" Link="Common\Interop\Unix\System.Native\Interop.IoUringShim.cs" />
+    <Compile Include="$(CommonPath)Interop\Unix\System.Native\Interop.SocketEvent.Linux.cs" Link="Common\Interop\Unix\System.Native\Interop.SocketEvent.Linux.cs" />
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/IoUringProvidedBufferRing.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/IoUringProvidedBufferRing.Linux.cs
new file mode 100644
index 00000000000000..98c6f93862417d
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/IoUringProvidedBufferRing.Linux.cs
@@ -0,0 +1,1126 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ internal sealed partial class SocketAsyncEngine
+ {
+ private const string IoUringAdaptiveBufferSizingSwitchName = "System.Net.Sockets.IoUringAdaptiveBufferSizing";
+ private const int IoUringProvidedBufferRingEntries = (int)IoUringConstants.QueueEntries;
+ private const int IoUringProvidedBufferSizeDefault = 4096;
+ private const ushort IoUringProvidedBufferGroupIdStart = 0x8000;
+ private static readonly int s_ioUringProvidedBufferSize = GetConfiguredIoUringProvidedBufferSize();
+ private static readonly bool s_ioUringAdaptiveBufferSizingEnabled = IsAdaptiveIoUringProvidedBufferSizingEnabled();
+ private static readonly bool s_ioUringRegisterBuffersEnabled = IsIoUringRegisterBuffersEnabled();
+ private bool _adaptiveBufferSizingEnabled;
+ private ushort _nextIoUringProvidedBufferGroupId = IoUringProvidedBufferGroupIdStart;
+
+        /// <summary>
+        /// Initializes a provided-buffer ring and registers it with the kernel when supported.
+        /// Failures are non-fatal and leave completion mode enabled without provided buffers.
+        /// </summary>
+ private void InitializeIoUringProvidedBufferRingIfSupported(int ringFd)
+ {
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: false,
+ hasRegisteredBuffers: false);
+ _adaptiveBufferSizingEnabled = false;
+ _ioUringProvidedBufferGroupId = 0;
+ _ioUringProvidedBufferRing = null;
+ ushort initialGroupId = AllocateProvidedBufferGroupId();
+
+ if (!IoUringProvidedBufferRing.TryCreate(
+ initialGroupId,
+ IoUringProvidedBufferRingEntries,
+ s_ioUringProvidedBufferSize,
+ s_ioUringAdaptiveBufferSizingEnabled,
+ out IoUringProvidedBufferRing? bufferRing) ||
+ bufferRing is null)
+ {
+ return;
+ }
+
+ Interop.Error registerError = bufferRing.Register(ringFd);
+ if (registerError != Interop.Error.SUCCESS)
+ {
+ bufferRing.Dispose();
+ return;
+ }
+
+ _ioUringProvidedBufferRing = bufferRing;
+ _ioUringProvidedBufferGroupId = bufferRing.BufferGroupId;
+ _adaptiveBufferSizingEnabled = s_ioUringAdaptiveBufferSizingEnabled;
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: true,
+ hasRegisteredBuffers: TryRegisterProvidedBuffersWithTelemetry(bufferRing, ringFd, isReregistration: false));
+
+ SocketsTelemetry.Log.IoUringProvidedBufferCurrentSize(bufferRing.BufferSize);
+ }
+
+        /// <summary>
+        /// Evaluates adaptive buffer-sizing recommendations and hot-swaps the provided-buffer ring when safe.
+        /// Must run on the event-loop thread.
+        /// </summary>
+ private void EvaluateProvidedBufferRingResize()
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "Provided-buffer resize evaluation must run on the io_uring event-loop thread.");
+ if (!_adaptiveBufferSizingEnabled || _managedRingFd < 0)
+ {
+ return;
+ }
+
+ IoUringProvidedBufferRing? currentRing = _ioUringProvidedBufferRing;
+ if (currentRing is null)
+ {
+ return;
+ }
+
+ int currentBufferSize = currentRing.BufferSize;
+ int recommendedBufferSize = currentRing.RecommendedBufferSize;
+ if (recommendedBufferSize == 0 || recommendedBufferSize == currentBufferSize)
+ {
+ return;
+ }
+
+ if (!IsProvidedBufferResizeQuiescent(currentRing))
+ {
+ return;
+ }
+
+ ushort newGroupId = AllocateProvidedBufferGroupId(_ioUringProvidedBufferGroupId);
+ if (!IoUringProvidedBufferRing.TryCreate(
+ newGroupId,
+ IoUringProvidedBufferRingEntries,
+ recommendedBufferSize,
+ adaptiveSizingEnabled: true,
+ out IoUringProvidedBufferRing? replacementRing) ||
+ replacementRing is null)
+ {
+ return;
+ }
+
+ AssertProvidedBufferResizeQuiescent(currentRing);
+
+ bool restorePreviousBufferRegistration = _ioUringCapabilities.HasRegisteredBuffers;
+ TryUnregisterProvidedBuffersIfRegistered(currentRing, _managedRingFd, restorePreviousBufferRegistration);
+
+ if (replacementRing.Register(_managedRingFd) != Interop.Error.SUCCESS)
+ {
+ replacementRing.Dispose();
+ if (restorePreviousBufferRegistration)
+ {
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: true,
+ hasRegisteredBuffers: TryRegisterProvidedBuffersWithTelemetry(
+ currentRing,
+ _managedRingFd,
+ isReregistration: true));
+ }
+
+ return;
+ }
+
+ currentRing.Unregister(_managedRingFd);
+ currentRing.Dispose();
+
+ _ioUringProvidedBufferRing = replacementRing;
+ _ioUringProvidedBufferGroupId = replacementRing.BufferGroupId;
+ RefreshIoUringMultishotRecvSupport();
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: true,
+ hasRegisteredBuffers: TryRegisterProvidedBuffersWithTelemetry(
+ replacementRing,
+ _managedRingFd,
+ isReregistration: true));
+
+ SocketsTelemetry.Log.IoUringProvidedBufferResize();
+ SocketsTelemetry.Log.IoUringProvidedBufferCurrentSize(replacementRing.BufferSize);
+ }
+
+ private bool IsProvidedBufferResizeQuiescent(IoUringProvidedBufferRing currentRing)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "Provided-buffer resize quiescence must be evaluated on the io_uring event-loop thread.");
+
+ if (currentRing.InUseCount != 0)
+ {
+ return false;
+ }
+
+ if (_cqOverflowRecoveryActive)
+ {
+ return false;
+ }
+
+ // Ring swap frees/replaces native buffer-ring memory. Delay swap until all tracked
+ // io_uring operations have drained so no in-flight SQE can still reference the old ring.
+ return Volatile.Read(ref _trackedIoUringOperationCount) == 0;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private ushort AllocateProvidedBufferGroupId(ushort avoidGroupId = 0)
+ {
+ ushort candidate = _nextIoUringProvidedBufferGroupId;
+ for (int attempts = 0; attempts < ushort.MaxValue; attempts++)
+ {
+ if (candidate != 0 &&
+ candidate != ushort.MaxValue &&
+ candidate != avoidGroupId)
+ {
+ _nextIoUringProvidedBufferGroupId = GetNextProvidedBufferGroupId(candidate);
+ return candidate;
+ }
+
+ candidate = GetNextProvidedBufferGroupId(candidate);
+ }
+
+ Debug.Fail("Unable to allocate an io_uring provided-buffer group id.");
+ _nextIoUringProvidedBufferGroupId = IoUringProvidedBufferGroupIdStart;
+ return IoUringProvidedBufferGroupIdStart;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static ushort GetNextProvidedBufferGroupId(ushort currentGroupId)
+ {
+ ushort nextGroupId = unchecked((ushort)(currentGroupId + 1));
+ if (nextGroupId < IoUringProvidedBufferGroupIdStart || nextGroupId == ushort.MaxValue)
+ {
+ nextGroupId = IoUringProvidedBufferGroupIdStart;
+ }
+
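+            // For example, 0x8000 advances to 0x8001; 0xFFFE advances to 0xFFFF, which is
+            // remapped to 0x8000; and wraparound past 0xFFFF lands below the start value and
+            // is likewise remapped, keeping ids in [0x8000, 0xFFFE].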
+ return nextGroupId;
+ }
+
+ [Conditional("DEBUG")]
+ private void AssertProvidedBufferResizeQuiescent(IoUringProvidedBufferRing currentRing)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "Provided-buffer resize assertions must run on the io_uring event-loop thread.");
+ Debug.Assert(
+ currentRing.InUseCount == 0,
+ "Provided-buffer resize requires no checked-out buffers before ring swap.");
+ Debug.Assert(
+ !_cqOverflowRecoveryActive,
+ "Provided-buffer resize must not run during CQ overflow recovery.");
+ Debug.Assert(
+ Volatile.Read(ref _trackedIoUringOperationCount) == 0,
+ "Provided-buffer resize requires no tracked io_uring operations before old ring disposal.");
+ }
+
+ private static int GetConfiguredIoUringProvidedBufferSize()
+ {
+#if DEBUG
+ string? configuredValue = Environment.GetEnvironmentVariable(
+ IoUringTestEnvironmentVariables.ProvidedBufferSize);
+
+ if (!string.IsNullOrWhiteSpace(configuredValue))
+ {
+ return int.TryParse(configuredValue, out int parsedSize) && parsedSize > 0
+ ? parsedSize
+ : IoUringProvidedBufferSizeDefault;
+ }
+#endif
+
+ return IoUringProvidedBufferSizeDefault;
+ }
+
+ private static bool IsAdaptiveIoUringProvidedBufferSizingEnabled()
+ {
+ bool enabled = AppContext.TryGetSwitch(IoUringAdaptiveBufferSizingSwitchName, out bool configured) && configured;
+
+#if DEBUG
+ string? configuredValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.AdaptiveBufferSizing);
+ if (string.Equals(configuredValue, "1", StringComparison.Ordinal))
+ {
+ return true;
+ }
+
+ if (string.Equals(configuredValue, "0", StringComparison.Ordinal))
+ {
+ return false;
+ }
+#endif
+
+ return enabled;
+ }
+
+ private static bool IsIoUringRegisterBuffersEnabled()
+ {
+#if DEBUG
+ // Test-only override for deterministic tests.
+ string? configuredValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.RegisterBuffers);
+ if (string.Equals(configuredValue, "1", StringComparison.Ordinal))
+ {
+ return true;
+ }
+
+ if (string.Equals(configuredValue, "0", StringComparison.Ordinal))
+ {
+ return false;
+ }
+#endif
+
+ // Default: enabled.
+ return true;
+ }
+
+ private static bool TryRegisterProvidedBuffersWithTelemetry(
+ IoUringProvidedBufferRing bufferRing,
+ int ringFd,
+ bool isReregistration)
+ {
+ if (!s_ioUringRegisterBuffersEnabled || ringFd < 0)
+ {
+ return false;
+ }
+
+ // REGISTER_BUFFERS is orthogonal to provided-buffer selection (RECV + IOSQE_BUFFER_SELECT).
+ // Any performance benefit for this path is kernel-dependent and must be validated empirically.
+ bool registered = bufferRing.TryRegisterBuffersWithKernel(ringFd);
+ if (isReregistration)
+ {
+ SocketsTelemetry.Log.IoUringRegisteredBuffersReregistration(registered);
+ }
+ else
+ {
+ SocketsTelemetry.Log.IoUringRegisteredBuffersResult(
+ registered,
+ IoUringProvidedBufferRingEntries,
+ bufferRing.BufferSize);
+ }
+
+ return registered;
+ }
+
+ private void TryUnregisterProvidedBuffersIfRegistered(
+ IoUringProvidedBufferRing bufferRing,
+ int ringFd,
+ bool hasRegisteredBuffers)
+ {
+ if (!hasRegisteredBuffers || ringFd < 0)
+ {
+ return;
+ }
+
+ bufferRing.TryUnregisterBuffersFromKernel(ringFd);
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: _ioUringCapabilities.SupportsProvidedBufferRings,
+ hasRegisteredBuffers: false);
+ }
+
+ /// Unregisters and disposes the provided-buffer ring.
+ private void FreeIoUringProvidedBufferRing()
+ {
+ IoUringProvidedBufferRing? bufferRing = _ioUringProvidedBufferRing;
+ bool hadRegisteredBuffers = _ioUringCapabilities.HasRegisteredBuffers;
+ _ioUringProvidedBufferRing = null;
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: false,
+ hasRegisteredBuffers: false);
+ _adaptiveBufferSizingEnabled = false;
+ _ioUringProvidedBufferGroupId = 0;
+
+ if (bufferRing is null)
+ {
+ return;
+ }
+
+ int recycledForTeardown = bufferRing.RecycleCheckedOutBuffersForTeardown();
+ if (recycledForTeardown > 0)
+ {
+ SocketsTelemetry.Log.IoUringProvidedBufferRecycle(recycledForTeardown);
+ }
+
+ TryUnregisterProvidedBuffersIfRegistered(bufferRing, _managedRingFd, hadRegisteredBuffers);
+
+ if (_managedRingFd >= 0)
+ {
+ bufferRing.Unregister(_managedRingFd);
+ }
+
+ bufferRing.Dispose();
+ SetIoUringProvidedBufferCapabilityState(
+ supportsProvidedBufferRings: false,
+ hasRegisteredBuffers: false);
+ }
+
+        /// <summary>
+        /// Owns a managed provided-buffer ring registration: native ring memory, pinned managed
+        /// buffers, buffer-id lifecycle, and recycle counters.
+        /// Lifetime is engine-managed and deterministic via <see cref="Dispose"/>; no finalizer is used.
+        /// </summary>
+ private sealed unsafe class IoUringProvidedBufferRing : IDisposable
+ {
+ private const int AdaptiveWindowCompletionCount = 256;
+ private const int AdaptiveMinBufferSize = 128;
+ private const int AdaptiveMaxBufferSize = 65536;
+ private const int PreparedReceiveMinimumReserve = 8;
+ private const int PreparedReceiveMaximumReserve = 64;
+ private const byte BufferStatePosted = 1;
+ private const byte BufferStateCheckedOut = 2;
+#if DEBUG
+ private static int s_testForceCreateOomOnce = -1;
+#endif
+
+ private readonly ushort _bufferGroupId;
+ private readonly int _bufferSize;
+ private readonly uint _ringEntries;
+ private readonly uint _ringMask;
+ private readonly bool _adaptiveSizingEnabled;
+ private readonly byte[][] _buffers;
+ private readonly nint[] _bufferAddresses;
+ private readonly byte[] _bufferStates;
+ private readonly ulong[] _postedBufferStateBits;
+ private Interop.Sys.IoUringBuf* _ringBuffers;
+ private Interop.Sys.IoUringBufRingHeader* _ringHeader;
+ private readonly void* _ringMemory;
+ private bool _registered;
+ private bool _disposed;
+ private int _availableCount;
+ private int _inUseCount;
+ private long _recycledCount;
+ private long _allocationFailureCount;
+ private long _totalCompletionBytes;
+ private long _totalCompletionCount;
+ private long _completionsAboveHighWatermark;
+ private long _completionsBelowLowWatermark;
+ private int _recommendedBufferSize;
+ private uint _nextPreparedReceiveBufferHint;
+ private uint _nextPreparedReceivePostedWordHint;
+ private bool _deferTailPublish;
+ private bool _deferredTailDirty;
+ private ushort _deferredTailValue;
+ private int _debugOwningThreadId;
+
+ internal ushort BufferGroupId => _bufferGroupId;
+ internal int BufferSize => _bufferSize;
+ internal int AvailableCount => Volatile.Read(ref _availableCount);
+ // Writers are single-threaded via AssertSingleThreadAccess; Volatile.Read keeps
+ // diagnostics/resize sampling conservative when observed outside mutation sites.
+ internal int InUseCount => Volatile.Read(ref _inUseCount);
+ internal long RecycledCount => Interlocked.Read(ref _recycledCount);
+ internal long AllocationFailureCount => Interlocked.Read(ref _allocationFailureCount);
+ internal int RecommendedBufferSize => Volatile.Read(ref _recommendedBufferSize);
+ internal int TotalBufferCountForTest => _bufferStates.Length;
+
+ private IoUringProvidedBufferRing(ushort bufferGroupId, int ringEntries, int bufferSize, bool adaptiveSizingEnabled)
+ {
+ ArgumentOutOfRangeException.ThrowIfNegativeOrZero(ringEntries);
+ if (!BitOperations.IsPow2((uint)ringEntries) || ringEntries > ushort.MaxValue)
+ {
+ throw new ArgumentOutOfRangeException(nameof(ringEntries));
+ }
+
+ ArgumentOutOfRangeException.ThrowIfNegativeOrZero(bufferSize);
+
+ _bufferGroupId = bufferGroupId;
+ _bufferSize = bufferSize;
+ _adaptiveSizingEnabled = adaptiveSizingEnabled;
+ _ringEntries = (uint)ringEntries;
+ _ringMask = (uint)ringEntries - 1;
+ _availableCount = ringEntries;
+ _recommendedBufferSize = bufferSize;
+ _buffers = new byte[ringEntries][];
+ _bufferAddresses = new nint[ringEntries];
+                _bufferStates = GC.AllocateUninitializedArray<byte>(ringEntries);
+ _postedBufferStateBits = new ulong[(ringEntries + 63) / 64];
+
+ nuint ringByteCount = checked((nuint)ringEntries * (nuint)sizeof(Interop.Sys.IoUringBuf));
+ _ringMemory = NativeMemory.AlignedAlloc(ringByteCount, (nuint)Environment.SystemPageSize);
+ if (_ringMemory is null)
+ {
+ throw new OutOfMemoryException();
+ }
+
+ NativeMemory.Clear(_ringMemory, ringByteCount);
+ _ringBuffers = (Interop.Sys.IoUringBuf*)_ringMemory;
+ _ringHeader = (Interop.Sys.IoUringBufRingHeader*)_ringMemory;
+
+ int initializedCount = 0;
+ try
+ {
+ for (int i = 0; i < ringEntries; i++)
+ {
+                        byte[] buffer = GC.AllocateUninitializedArray<byte>(bufferSize, pinned: true);
+ _buffers[i] = buffer;
+ _bufferAddresses[i] = (nint)Unsafe.AsPointer(ref MemoryMarshal.GetArrayDataReference(buffer));
+ _bufferStates[i] = BufferStatePosted;
+ SetPostedBufferBit((ushort)i, isPosted: true);
+
+ WriteBufferDescriptor((uint)i, (ushort)i);
+ initializedCount++;
+ }
+
+ PublishTail((ushort)initializedCount);
+ }
+ catch
+ {
+ _allocationFailureCount++;
+ Array.Clear(_buffers, 0, initializedCount);
+ Array.Clear(_bufferAddresses, 0, initializedCount);
+ NativeMemory.AlignedFree(_ringMemory);
+ throw;
+ }
+ }
+
+#if DEBUG
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static bool TryConsumeForcedCreateOutOfMemoryForTest()
+ {
+ int configured = Volatile.Read(ref s_testForceCreateOomOnce);
+ if (configured < 0)
+ {
+ configured = string.Equals(
+ Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.ForceProvidedBufferRingOomOnce),
+ "1",
+ StringComparison.Ordinal) ? 1 : 0;
+ Volatile.Write(ref s_testForceCreateOomOnce, configured);
+ }
+
+ if (configured == 0)
+ {
+ return false;
+ }
+
+ return Interlocked.Exchange(ref s_testForceCreateOomOnce, 0) != 0;
+ }
+#endif
+
+ internal static bool TryCreate(
+ ushort bufferGroupId,
+ int ringEntries,
+ int bufferSize,
+ bool adaptiveSizingEnabled,
+ out IoUringProvidedBufferRing? bufferRing)
+ {
+#if DEBUG
+ if (TryConsumeForcedCreateOutOfMemoryForTest())
+ {
+ if (NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Error(null, "io_uring provided-buffer ring create forced OOM via test hook.");
+ }
+
+ bufferRing = null;
+ return false;
+ }
+#endif
+
+ try
+ {
+ bufferRing = new IoUringProvidedBufferRing(bufferGroupId, ringEntries, bufferSize, adaptiveSizingEnabled);
+ return true;
+ }
+ catch (ArgumentOutOfRangeException exception)
+ {
+ if (NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Error(null, $"io_uring provided-buffer ring create rejected configuration: {exception.Message}");
+ }
+ }
+ catch (OutOfMemoryException)
+ {
+ if (NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Error(null, "io_uring provided-buffer ring create failed: out of memory.");
+ }
+ }
+
+ bufferRing = null;
+ return false;
+ }
+
+ /// Records a completion's bytes-transferred for adaptive sizing decisions.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void RecordCompletionUtilization(int bytesTransferred)
+ {
+ AssertSingleThreadAccess();
+ if (!_adaptiveSizingEnabled || bytesTransferred <= 0)
+ {
+ return;
+ }
+
+ int clampedBytes = Math.Min(bytesTransferred, _bufferSize);
+ _totalCompletionBytes += clampedBytes;
+ long count = ++_totalCompletionCount;
+
+ int highWatermark = (_bufferSize * 3) / 4;
+ int lowWatermark = _bufferSize / 4;
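+                // Illustrative arithmetic: with the default 4096-byte buffers, completions above
+                // 3072 bytes count toward growth and completions below 1024 bytes toward shrink;
+                // EvaluateAdaptiveResize then runs once per 256-completion window.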
+ if (clampedBytes > highWatermark)
+ {
+ _completionsAboveHighWatermark++;
+ }
+ else if (clampedBytes < lowWatermark)
+ {
+ _completionsBelowLowWatermark++;
+ }
+
+ if ((count & (AdaptiveWindowCompletionCount - 1)) == 0)
+ {
+ EvaluateAdaptiveResize();
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void EvaluateAdaptiveResize()
+ {
+ AssertSingleThreadAccess();
+ if (!_adaptiveSizingEnabled)
+ {
+ return;
+ }
+
+ long windowBytes = _totalCompletionBytes;
+ long aboveHigh = _completionsAboveHighWatermark;
+ long belowLow = _completionsBelowLowWatermark;
+ _totalCompletionBytes = 0;
+ _completionsAboveHighWatermark = 0;
+ _completionsBelowLowWatermark = 0;
+
+ int currentSize = _bufferSize;
+ int recommendedSize = currentSize;
+ if (aboveHigh > AdaptiveWindowCompletionCount / 2 ||
+ windowBytes > (long)AdaptiveWindowCompletionCount * ((long)currentSize * 3 / 4))
+ {
+ recommendedSize = Math.Min(currentSize * 2, AdaptiveMaxBufferSize);
+ }
+ else if (belowLow > AdaptiveWindowCompletionCount / 2 ||
+ windowBytes < (long)AdaptiveWindowCompletionCount * ((long)currentSize / 4))
+ {
+ recommendedSize = Math.Max(currentSize / 2, AdaptiveMinBufferSize);
+ }
+
+ Volatile.Write(ref _recommendedBufferSize, recommendedSize);
+ }
+
+ internal Interop.Error Register(int ringFd)
+ {
+ Debug.Assert(!_disposed);
+
+ if (_registered)
+ {
+ return Interop.Error.SUCCESS;
+ }
+
+ Interop.Sys.IoUringBufReg registration = default;
+ registration.RingAddress = (ulong)(nuint)_ringMemory;
+ registration.RingEntries = _ringEntries;
+ registration.BufferGroupId = _bufferGroupId;
+
+ int result;
+ Interop.Error registerError = Interop.Sys.IoUringShimRegister(
+ ringFd,
+ IoUringConstants.RegisterPbufRing,
+                    &registration,
+ 1u,
+ &result);
+ if (registerError == Interop.Error.SUCCESS)
+ {
+ _registered = true;
+ }
+
+ return registerError;
+ }
+
+ internal Interop.Error Unregister(int ringFd)
+ {
+ if (!_registered)
+ {
+ return Interop.Error.SUCCESS;
+ }
+
+ Interop.Sys.IoUringBufReg registration = default;
+ registration.BufferGroupId = _bufferGroupId;
+ int result;
+ Interop.Error unregisterError = Interop.Sys.IoUringShimRegister(
+ ringFd,
+ IoUringConstants.UnregisterPbufRing,
+ ®istration,
+ 1u,
+ &result);
+ if (unregisterError == Interop.Error.SUCCESS)
+ {
+ _registered = false;
+ }
+
+ return unregisterError;
+ }
+
+            /// <summary>
+            /// Attempts to register pinned buffer payload pages with the kernel via IORING_REGISTER_BUFFERS.
+            /// Failure is non-fatal and callers should gracefully continue with unregistered buffers.
+            /// This does not switch recv SQEs to fixed-buffer opcodes; provided-buffer recv stays on
+            /// IORING_OP_RECV + IOSQE_BUFFER_SELECT.
+            /// </summary>
+ internal bool TryRegisterBuffersWithKernel(int ringFd)
+ {
+ if (_disposed || ringFd < 0 || _buffers.Length == 0)
+ {
+ return false;
+ }
+
+ nuint allocationSize = checked((nuint)_buffers.Length * (nuint)sizeof(Interop.Sys.IOVector));
+ Interop.Sys.IOVector* iovecArray;
+ try
+ {
+ iovecArray = (Interop.Sys.IOVector*)NativeMemory.Alloc(allocationSize);
+ }
+ catch (OutOfMemoryException)
+ {
+ return false;
+ }
+
+ try
+ {
+ for (int i = 0; i < _buffers.Length; i++)
+ {
+ nint bufferAddress = _bufferAddresses[i];
+ if (bufferAddress == 0)
+ {
+ return false;
+ }
+
+ iovecArray[i].Base = (byte*)bufferAddress;
+ iovecArray[i].Count = (UIntPtr)_bufferSize;
+ }
+
+ int result;
+ Interop.Error registerError = Interop.Sys.IoUringShimRegister(
+ ringFd,
+ IoUringConstants.RegisterBuffers,
+ iovecArray,
+ (uint)_buffers.Length,
+ &result);
+ return registerError == Interop.Error.SUCCESS;
+ }
+ finally
+ {
+ NativeMemory.Free(iovecArray);
+ }
+ }
+
+ /// Unregisters previously registered pinned buffers via IORING_UNREGISTER_BUFFERS.
+ internal bool TryUnregisterBuffersFromKernel(int ringFd)
+ {
+ if (_disposed || ringFd < 0)
+ {
+ return false;
+ }
+
+ int result;
+ Interop.Error unregisterError = Interop.Sys.IoUringShimRegister(
+ ringFd,
+ IoUringConstants.UnregisterBuffers,
+ null,
+ 0u,
+ &result);
+ return unregisterError == Interop.Error.SUCCESS;
+ }
+
+ /// Acquires a kernel-selected buffer id for completion processing.
+ internal bool TryAcquireBufferForCompletion(ushort bufferId, out byte* buffer, out int bufferLength)
+ {
+ AssertSingleThreadAccess();
+ buffer = null;
+ bufferLength = 0;
+
+ if (bufferId >= _ringEntries)
+ {
+ _allocationFailureCount++;
+ return false;
+ }
+
+ byte state = _bufferStates[bufferId];
+ if (state != BufferStatePosted)
+ {
+ Debug.Assert(
+ state == BufferStateCheckedOut,
+ $"Unexpected provided-buffer state during acquire: id={bufferId}, state={state}");
+ _allocationFailureCount++;
+ return false;
+ }
+
+ _bufferStates[bufferId] = BufferStateCheckedOut;
+ SetPostedBufferBit(bufferId, isPosted: false);
+ Debug.Assert(_availableCount > 0, "Provided-buffer available count underflow.");
+ _availableCount--;
+ _inUseCount++;
+
+ nint bufferAddress = _bufferAddresses[bufferId];
+ if (bufferAddress == 0)
+ {
+ _bufferStates[bufferId] = BufferStatePosted;
+ SetPostedBufferBit(bufferId, isPosted: true);
+ _availableCount++;
+ _inUseCount--;
+ _allocationFailureCount++;
+ return false;
+ }
+
+ buffer = (byte*)bufferAddress;
+ bufferLength = _bufferSize;
+ return true;
+ }
+
+            /// <summary>
+            /// Acquires any currently posted provided buffer for fixed-recv submission.
+            /// The acquired buffer remains checked out until completion recycles it.
+            /// </summary>
+ internal bool TryAcquireBufferForPreparedReceive(out ushort bufferId, out byte* buffer, out int bufferLength)
+ {
+ AssertSingleThreadAccess();
+ bufferId = 0;
+ buffer = null;
+ bufferLength = 0;
+
+ // Keep a reserve for kernel-selected (IOSQE_BUFFER_SELECT) receive completions so
+ // fixed-recv one-shots don't deplete the provided-buffer pool under sustained load.
+ int reserveCount = GetPreparedReceiveReserveCount();
+ if (Volatile.Read(ref _availableCount) <= reserveCount)
+ {
+ return false;
+ }
+
+ uint searchStart = _nextPreparedReceiveBufferHint;
+ int maxAttempts = _postedBufferStateBits.Length + 1;
+ for (int attempt = 0; attempt < maxAttempts && TryFindPostedBufferId(searchStart, out ushort candidateId); attempt++)
+ {
+ if (TryAcquireBufferForCompletion(candidateId, out buffer, out bufferLength))
+ {
+ bufferId = candidateId;
+ uint nextSearchStart = ((uint)candidateId + 1) & _ringMask;
+ _nextPreparedReceiveBufferHint = nextSearchStart;
+ _nextPreparedReceivePostedWordHint = nextSearchStart >> 6;
+ return true;
+ }
+
+ searchStart = ((uint)candidateId + 1) & _ringMask;
+ _nextPreparedReceiveBufferHint = searchStart;
+ _nextPreparedReceivePostedWordHint = searchStart >> 6;
+ }
+
+ return false;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private int GetPreparedReceiveReserveCount()
+ {
+ int ringEntryCount = (int)_ringEntries;
+ int dynamicReserve = ringEntryCount / 16;
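+                // For example, 256 ring entries reserve 16 buffers; 64 entries clamp up to the
+                // 8-buffer minimum; 4096 entries clamp down to the 64-buffer maximum.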
+ return Math.Clamp(dynamicReserve, PreparedReceiveMinimumReserve, PreparedReceiveMaximumReserve);
+ }
+
+ /// Returns the pointer/length for a buffer that is already checked out.
+ internal bool TryGetCheckedOutBuffer(ushort bufferId, out byte* buffer, out int bufferLength)
+ {
+ buffer = null;
+ bufferLength = 0;
+
+ if (bufferId >= _ringEntries || _bufferStates[bufferId] != BufferStateCheckedOut)
+ {
+ return false;
+ }
+
+ nint bufferAddress = _bufferAddresses[bufferId];
+ if (bufferAddress == 0)
+ {
+ _allocationFailureCount++;
+ return false;
+ }
+
+ buffer = (byte*)bufferAddress;
+ bufferLength = _bufferSize;
+ return true;
+ }
+
+ /// Returns a previously acquired buffer id back to the provided-buffer ring.
+ internal bool TryRecycleBufferFromCompletion(ushort bufferId)
+ {
+ AssertSingleThreadAccess();
+ if (bufferId >= _ringEntries)
+ {
+ return false;
+ }
+
+ byte state = _bufferStates[bufferId];
+ if (state != BufferStateCheckedOut)
+ {
+ Debug.Assert(
+ state == BufferStatePosted,
+ $"Unexpected provided-buffer state during recycle: id={bufferId}, state={state}");
+ return false;
+ }
+
+ RecycleCheckedOutBuffer(bufferId);
+ return true;
+ }
+
+            /// <summary>
+            /// Recycles any still-checked-out ids back into the ring during teardown.
+            /// Returns the number of ids recycled.
+            /// </summary>
+ internal int RecycleCheckedOutBuffersForTeardown()
+ {
+ AssertSingleThreadAccess();
+ int recycledCount = 0;
+ for (ushort bufferId = 0; bufferId < _ringEntries; bufferId++)
+ {
+ if (_bufferStates[bufferId] != BufferStateCheckedOut)
+ {
+ continue;
+ }
+
+ RecycleCheckedOutBuffer(bufferId);
+ recycledCount++;
+ }
+
+ return recycledCount;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void BeginDeferredRecyclePublish()
+ {
+ AssertSingleThreadAccess();
+ if (_deferTailPublish)
+ {
+ return;
+ }
+
+ _deferTailPublish = true;
+ _deferredTailDirty = false;
+ _deferredTailValue = ReadTail();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void EndDeferredRecyclePublish()
+ {
+ AssertSingleThreadAccess();
+ if (!_deferTailPublish)
+ {
+ return;
+ }
+
+ _deferTailPublish = false;
+ if (_deferredTailDirty)
+ {
+ PublishTail(_deferredTailValue);
+ _deferredTailDirty = false;
+ }
+ }
+
+            /// <summary>
+            /// Marks every provided buffer as checked out for deterministic test-only depletion setup.
+            /// </summary>
+ internal void ForceAllBuffersCheckedOutForTest()
+ {
+ AssertSingleThreadAccess();
+ for (int i = 0; i < _bufferStates.Length; i++)
+ {
+ _bufferStates[i] = BufferStateCheckedOut;
+ }
+
+ Array.Clear(_postedBufferStateBits);
+ _nextPreparedReceivePostedWordHint = 0;
+ Volatile.Write(ref _availableCount, 0);
+ Volatile.Write(ref _inUseCount, _bufferStates.Length);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void RecycleCheckedOutBuffer(ushort bufferId)
+ {
+ ushort tail = _deferTailPublish ? _deferredTailValue : ReadTail();
+ uint ringIndex = (uint)tail & _ringMask;
+ WriteBufferDescriptor(ringIndex, bufferId);
+ _bufferStates[bufferId] = BufferStatePosted;
+ SetPostedBufferBit(bufferId, isPosted: true);
+ _availableCount++;
+ Debug.Assert(_inUseCount > 0, "Provided-buffer in-use count underflow.");
+ _inUseCount--;
+ ushort nextTail = unchecked((ushort)(tail + 1));
+ if (_deferTailPublish)
+ {
+ _deferredTailValue = nextTail;
+ _deferredTailDirty = true;
+ }
+ else
+ {
+ PublishTail(nextTail);
+ }
+ _recycledCount++;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void SetPostedBufferBit(ushort bufferId, bool isPosted)
+ {
+ int wordIndex = bufferId >> 6;
+ ulong bit = 1UL << (bufferId & 63);
+ if (isPosted)
+ {
+ bool wordWasEmpty = _postedBufferStateBits[wordIndex] == 0;
+ _postedBufferStateBits[wordIndex] |= bit;
+ if (wordWasEmpty)
+ {
+ _nextPreparedReceivePostedWordHint = (uint)wordIndex;
+ }
+ }
+ else
+ {
+ _postedBufferStateBits[wordIndex] &= ~bit;
+ }
+ }
+
+ private bool TryFindPostedBufferId(uint startIndex, out ushort bufferId)
+ {
+ int wordCount = _postedBufferStateBits.Length;
+ if (wordCount == 0)
+ {
+ bufferId = 0;
+ return false;
+ }
+
+ int hintWord = (int)(_nextPreparedReceivePostedWordHint % (uint)wordCount);
+ if (TryFindBitInWord(hintWord, _postedBufferStateBits[hintWord], out bufferId))
+ {
+ _nextPreparedReceivePostedWordHint = (uint)hintWord;
+ return true;
+ }
+
+ uint startWord = startIndex >> 6;
+ int bitOffset = (int)(startIndex & 63);
+ if (startWord >= (uint)wordCount)
+ {
+ bufferId = 0;
+ return false;
+ }
+
+ if (TryFindBitInWord((int)startWord, _postedBufferStateBits[startWord] & (~0UL << bitOffset), out bufferId))
+ {
+ _nextPreparedReceivePostedWordHint = startWord;
+ return true;
+ }
+
+ for (int word = (int)startWord + 1; word < wordCount; word++)
+ {
+ if (TryFindBitInWord(word, _postedBufferStateBits[word], out bufferId))
+ {
+ _nextPreparedReceivePostedWordHint = (uint)word;
+ return true;
+ }
+ }
+
+ for (int word = 0; word < (int)startWord; word++)
+ {
+ if (TryFindBitInWord(word, _postedBufferStateBits[word], out bufferId))
+ {
+ _nextPreparedReceivePostedWordHint = (uint)word;
+ return true;
+ }
+ }
+
+ bufferId = 0;
+ return false;
+ }
+
+ private bool TryFindBitInWord(int wordIndex, ulong wordBits, out ushort bufferId)
+ {
+ while (wordBits != 0)
+ {
+ int bitIndex = BitOperations.TrailingZeroCount(wordBits);
+ int candidate = (wordIndex << 6) + bitIndex;
+ if ((uint)candidate < _ringEntries)
+ {
+ bufferId = (ushort)candidate;
+ return true;
+ }
+
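+                    // Clear the lowest set bit and keep scanning; candidates at or beyond
+                    // _ringEntries would be stray padding bits in the final word and are
+                    // skipped defensively.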
+ wordBits &= wordBits - 1;
+ }
+
+ bufferId = 0;
+ return false;
+ }
+
+ [Conditional("DEBUG")]
+ private void AssertSingleThreadAccess()
+ {
+ int currentThreadId = Environment.CurrentManagedThreadId;
+ int ownerThreadId = Volatile.Read(ref _debugOwningThreadId);
+ if (ownerThreadId == 0)
+ {
+ int prior = Interlocked.CompareExchange(ref _debugOwningThreadId, currentThreadId, comparand: 0);
+ ownerThreadId = prior == 0 ? currentThreadId : prior;
+ }
+
+ Debug.Assert(
+ ownerThreadId == currentThreadId,
+ $"IoUringProvidedBufferRing mutable state must be accessed from one thread. Owner={ownerThreadId}, current={currentThreadId}");
+ }
+
+ public void Dispose()
+ {
+ if (_disposed)
+ {
+ return;
+ }
+
+#if DEBUG
+ int checkedOutBufferCount = 0;
+ for (int i = 0; i < _bufferStates.Length; i++)
+ {
+ if (_bufferStates[i] == BufferStateCheckedOut)
+ {
+ checkedOutBufferCount++;
+ }
+ }
+
+ Debug.Assert(
+ checkedOutBufferCount == 0,
+ $"Disposing provided-buffer ring with outstanding checked-out buffers: {checkedOutBufferCount}");
+#endif
+
+ Debug.Assert(
+ !_registered,
+ "Provided-buffer ring must be unregistered before disposing native ring memory.");
+ if (_registered)
+ {
+ return;
+ }
+
+ _ringBuffers = null;
+ _ringHeader = null;
+ NativeMemory.AlignedFree(_ringMemory);
+ _disposed = true;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private ushort ReadTail() =>
+                Volatile.Read(ref Unsafe.AsRef<ushort>(&_ringHeader->Tail));
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void PublishTail(ushort tail) =>
+                Volatile.Write(ref Unsafe.AsRef<ushort>(&_ringHeader->Tail), tail);
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void WriteBufferDescriptor(uint ringIndex, ushort bufferId)
+ {
+ Debug.Assert(ringIndex < _ringEntries);
+ Debug.Assert(bufferId < _ringEntries);
+ Debug.Assert(_bufferAddresses[bufferId] != 0);
+
+ Interop.Sys.IoUringBuf* bufferSlot = _ringBuffers + ringIndex;
+ bufferSlot->Address = (ulong)(nuint)_bufferAddresses[bufferId];
+ bufferSlot->Length = (uint)_bufferSize;
+ bufferSlot->BufferId = bufferId;
+ bufferSlot->Reserved = 0;
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs
new file mode 100644
index 00000000000000..ee2b5ff767f392
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs
@@ -0,0 +1,421 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+    /// <summary>
+    /// Lock-free multi-producer, single-consumer queue optimized for the io_uring
+    /// event loop pattern where many threads enqueue work items but exactly one
+    /// thread drains them.
+    /// </summary>
+    /// <remarks>
+    /// Liveness contract:
+    /// TryDequeue/IsEmpty may observe a producer between index claim and publish
+    /// (Interlocked.Increment followed by Volatile.Write), and can transiently report
+    /// no available item even though an enqueue is in progress. Callers must provide
+    /// their own wakeup/progress mechanism after Enqueue.
+    /// </remarks>
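+    /// <example>
+    /// A hypothetical pairing with an event loop (names are illustrative, not part of this PR):
+    /// <code>
+    /// // Producer threads: publish, then supply the wakeup the queue itself does not provide.
+    /// queue.Enqueue(request);
+    /// engine.WakeEventLoop();
+    ///
+    /// // Single consumer (event-loop) thread:
+    /// while (queue.TryDequeue(out Request request))
+    /// {
+    ///     Process(request);
+    /// }
+    /// </code>
+    /// </example>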
+    internal sealed class MpscQueue<T>
+ {
+ private const int DefaultSegmentSize = 256;
+ private const int UnlinkedSegmentCacheCapacity = 4;
+ private const int MaxEnqueueSlowAttempts = 2048;
+#if DEBUG
+ private static int s_testSegmentAllocationFailuresRemaining;
+#endif
+
+ private readonly int _segmentSize;
+ private PaddedSegment _head;
+ private PaddedSegment _tail;
+ // Segment cache is shared by:
+ // - unlinked segments that lost tail->next publication races, and
+ // - drained head segments returned only after producer quiescence checks.
+        // Cache bookkeeping is protected by a small lock because it is reached only on slow paths.
+ private readonly Lock _cachedUnlinkedSegmentGate = new Lock();
+ private readonly Segment?[] _cachedUnlinkedSegments = new Segment?[UnlinkedSegmentCacheCapacity];
+ private int _cachedUnlinkedSegmentCount;
+ private int _activeEnqueueOperations;
+
+ internal MpscQueue(int segmentSize = DefaultSegmentSize)
+ {
+ ArgumentOutOfRangeException.ThrowIfNegativeOrZero(segmentSize);
+ _segmentSize = segmentSize;
+ Segment initial = new Segment(segmentSize);
+ _head.Value = initial;
+ _tail.Value = initial;
+ }
+
+        /// <summary>
+        /// Attempts to enqueue an item.
+        /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool TryEnqueue(T item)
+ {
+ if (TryEnqueueFast(item))
+ {
+ return true;
+ }
+
+ return TryEnqueueSlowWithProducerTracking(item);
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private bool TryEnqueueSlowWithProducerTracking(T item)
+ {
+ // Only slow-path producers can retain stale segment references long enough to race with
+ // drained-segment recycling. Fast-path success doesn't need this accounting.
+ Interlocked.Increment(ref _activeEnqueueOperations);
+ try
+ {
+ return TryEnqueueSlow(item);
+ }
+ finally
+ {
+ Interlocked.Decrement(ref _activeEnqueueOperations);
+ }
+ }
+
+        /// <summary>
+        /// Enqueues an item, retrying until an enqueue slot is observed.
+        /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void Enqueue(T item)
+ {
+ SpinWait spinner = default;
+ while (!TryEnqueue(item))
+ {
+ spinner.SpinOnce();
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryEnqueueFast(T item)
+ {
+ Segment tail = Volatile.Read(ref _tail.Value)!;
+ // Snapshot incarnation before claiming a slot. If the segment is recycled
+ // between this read and the Interlocked.Increment, the incarnation will differ.
+ int incarnation = Volatile.Read(ref tail.Incarnation);
+ int index = Interlocked.Increment(ref tail.EnqueueIndex.Value) - 1;
+ // A stale claim can over-increment the old segment index before incarnation
+ // mismatch is detected; this is safe because ResetForReuse resets EnqueueIndex.
+ if ((uint)index < (uint)tail.Entries.Length)
+ {
+ // Verify segment was not recycled while we were claiming the slot.
+ // A recycled segment has a different incarnation because ResetForReuse
+ // increments it. Without this check, TryReturnDrainedSegmentToCache can
+ // recycle the segment (since fast-path producers are not tracked by
+ // _activeEnqueueOperations) and we would write into reused memory.
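+                // Example interleaving this defends against: producer P1 reads segment S as the
+                // tail and stalls; other producers fill S and advance the tail; the consumer
+                // drains S and recycles it (fast-path producers are not tracked); another
+                // producer rents S and ResetForReuse bumps Incarnation; P1 resumes, claims a
+                // slot, sees the incarnation mismatch, and falls back to the tracked slow path.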
+ if (Volatile.Read(ref tail.Incarnation) == incarnation)
+ {
+ ref SegmentEntry entry = ref tail.Entries[index];
+ entry.Item = item;
+ Volatile.Write(ref entry.State, 1);
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private bool TryEnqueueSlow(T item)
+ {
+ SpinWait spinner = default;
+ for (int attempt = 0; attempt < MaxEnqueueSlowAttempts; attempt++)
+ {
+ Segment tail = Volatile.Read(ref _tail.Value)!;
+ int index = Interlocked.Increment(ref tail.EnqueueIndex.Value) - 1;
+ if ((uint)index < (uint)tail.Entries.Length)
+ {
+ ref SegmentEntry entry = ref tail.Entries[index];
+ entry.Item = item;
+ Volatile.Write(ref entry.State, 1);
+ return true;
+ }
+
+ Segment? next = Volatile.Read(ref tail.Next);
+ if (next is null)
+ {
+ Segment newSegment;
+ try
+ {
+ newSegment = RentUnlinkedSegment();
+ }
+ catch (OutOfMemoryException)
+ {
+ return false;
+ }
+
+ if (Interlocked.CompareExchange(ref tail.Next, newSegment, null) is null)
+ {
+ next = newSegment;
+ }
+ else
+ {
+ // Another producer linked its own segment first. Reuse ours later.
+ ReturnUnlinkedSegment(newSegment);
+ next = Volatile.Read(ref tail.Next);
+ }
+ }
+
+ if (next is not null)
+ {
+ Interlocked.CompareExchange(ref _tail.Value, next, tail);
+ }
+
+ spinner.SpinOnce();
+ }
+
+ return false;
+ }
+
+        /// <summary>
+        /// Attempts to dequeue an item. Must only be called by the single consumer thread.
+        /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool TryDequeue(out T item)
+ {
+ if (TryDequeueFast(out item))
+ {
+ return true;
+ }
+
+ return TryDequeueSlow(out item);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool TryDequeueFromSegment(Segment head, out T item)
+ {
+ int index = head.DequeueIndex;
+ if ((uint)index >= (uint)head.Entries.Length)
+ {
+ item = default!;
+ return false;
+ }
+
+ // Acquire published slot before reading the item value.
+ ref SegmentEntry entry = ref head.Entries[index];
+ if (Volatile.Read(ref entry.State) != 1)
+ {
+ item = default!;
+ return false;
+ }
+
+ item = entry.Item;
+            if (RuntimeHelpers.IsReferenceOrContainsReferences<T>())
+ {
+ entry.Item = default!;
+ }
+
+ head.DequeueIndex = index + 1;
+ return true;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryDequeueFast(out T item)
+ {
+ Segment head = Volatile.Read(ref _head.Value)!;
+ return TryDequeueFromSegment(head, out item);
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private bool TryDequeueSlow(out T item)
+ {
+ Segment head = Volatile.Read(ref _head.Value)!;
+ while ((uint)head.DequeueIndex >= (uint)head.Entries.Length)
+ {
+ Segment? next = Volatile.Read(ref head.Next);
+ if (next is null)
+ {
+ item = default!;
+ return false;
+ }
+
+ // Consumer publishes head advance; producers read _head when resolving slow-path
+ // enqueue progress, so this store must be visible across cores.
+ Volatile.Write(ref _head.Value, next);
+ TryReturnDrainedSegmentToCache(head);
+ head = next;
+ }
+
+ return TryDequeueFromSegment(head, out item);
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void TryReturnDrainedSegmentToCache(Segment drainedSegment)
+ {
+ // Safe reuse requires producer quiescence and tail advancement away from this segment.
+ // Without these checks, a producer that captured a stale segment pointer could publish
+ // into a reset segment after it has been recycled.
+ if (Volatile.Read(ref _activeEnqueueOperations) != 0 ||
+ ReferenceEquals(Volatile.Read(ref _tail.Value), drainedSegment))
+ {
+ return;
+ }
+
+ ReturnUnlinkedSegment(drainedSegment);
+ }
+
+        /// <summary>
+        /// Returns whether the queue currently appears empty (snapshot, not linearizable).
+        /// A return value of <see langword="true"/> can also mean an enqueue is mid-flight.
+        /// </summary>
+ internal bool IsEmpty
+ {
+ get
+ {
+ Segment head = Volatile.Read(ref _head.Value)!;
+ while (true)
+ {
+ int index = head.DequeueIndex;
+ if ((uint)index >= (uint)head.Entries.Length)
+ {
+ Segment? next = Volatile.Read(ref head.Next);
+ if (next is null)
+ {
+ return true;
+ }
+
+ head = next;
+ continue;
+ }
+
+ return Volatile.Read(ref head.Entries[index].State) != 1;
+ }
+ }
+ }
+
+ private Segment RentUnlinkedSegment()
+ {
+ lock (_cachedUnlinkedSegmentGate)
+ {
+ if (_cachedUnlinkedSegmentCount != 0)
+ {
+ int nextIndex = _cachedUnlinkedSegmentCount - 1;
+ Segment segment = _cachedUnlinkedSegments[nextIndex]!;
+ _cachedUnlinkedSegments[nextIndex] = null;
+ _cachedUnlinkedSegmentCount = nextIndex;
+ segment.ResetForReuse();
+ return segment;
+ }
+ }
+
+#if DEBUG
+ if (TryConsumeSegmentAllocationFailureForTest())
+ {
+ throw new OutOfMemoryException("Injected MpscQueue segment allocation failure for test.");
+ }
+#endif
+
+ return new Segment(_segmentSize);
+ }
+
+#if DEBUG
+ internal static void SetSegmentAllocationFailuresForTest(int failureCount)
+ {
+ ArgumentOutOfRangeException.ThrowIfNegative(failureCount);
+
+ Volatile.Write(ref s_testSegmentAllocationFailuresRemaining, failureCount);
+ }
+
+ private static bool TryConsumeSegmentAllocationFailureForTest()
+ {
+ while (true)
+ {
+ int remainingFailures = Volatile.Read(ref s_testSegmentAllocationFailuresRemaining);
+ if (remainingFailures <= 0)
+ {
+ return false;
+ }
+
+ if (Interlocked.CompareExchange(
+ ref s_testSegmentAllocationFailuresRemaining,
+ remainingFailures - 1,
+ remainingFailures) == remainingFailures)
+ {
+ return true;
+ }
+ }
+ }
+#endif
+
+ private void ReturnUnlinkedSegment(Segment segment)
+ {
+ segment.ResetForReuse();
+ lock (_cachedUnlinkedSegmentGate)
+ {
+ if (_cachedUnlinkedSegmentCount < _cachedUnlinkedSegments.Length)
+ {
+ _cachedUnlinkedSegments[_cachedUnlinkedSegmentCount++] = segment;
+ }
+ }
+ }
+
+ private sealed class Segment
+ {
+ internal readonly SegmentEntry[] Entries;
+ internal int Incarnation;
+ internal PaddedInt32 EnqueueIndex;
+ internal int DequeueIndex;
+ internal Segment? Next;
+
+ internal Segment(int size)
+ {
+ Entries = new SegmentEntry[size];
+ ResetForReuse();
+ }
+
+ internal void ResetForReuse()
+ {
+ Interlocked.Increment(ref Incarnation);
+ EnqueueIndex.Value = 0;
+ DequeueIndex = 0;
+ Next = null;
+                if (RuntimeHelpers.IsReferenceOrContainsReferences<T>())
+ {
+ Array.Clear(Entries);
+ }
+ else
+ {
+ for (int i = 0; i < Entries.Length; i++)
+ {
+ Entries[i].State = 0;
+ }
+ }
+ }
+ }
+
+ private struct SegmentEntry
+ {
+ internal T Item;
+ internal int State;
+ }
+
+#if TARGET_ARM64 || TARGET_LOONGARCH64
+ private const int CacheLineWordCount = 16; // 128-byte cache line / sizeof(nint)
+#else
+ private const int CacheLineWordCount = 8; // 64-byte cache line / sizeof(nint)
+#endif
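+
+        // Size sketch: one payload word plus (CacheLineWordCount - 1) padding words spans a full
+        // cache line (16 * 8 = 128 bytes on arm64/loongarch64, 8 * 8 = 64 bytes elsewhere), so
+        // _head and _tail cannot share a line and producer tail traffic does not invalidate the
+        // consumer's head line.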
+
+ [InlineArray(CacheLineWordCount - 1)]
+ private struct CacheLinePadding
+ {
+ internal nint _element0;
+ }
+
+ private struct PaddedSegment
+ {
+ internal Segment? Value;
+ internal CacheLinePadding _padding;
+ }
+
+ private struct PaddedInt32
+ {
+ internal int Value;
+ internal CacheLinePadding _padding;
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs
new file mode 100644
index 00000000000000..12014434570b1b
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs
@@ -0,0 +1,2764 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Buffers;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ internal sealed partial class SocketAsyncContext
+ {
+ private const int MultishotAcceptQueueMaxSize = 256;
+ private const int PersistentMultishotRecvDataQueueMaxSize = 16;
+ private const int IoUringUserDataTagShift = 56;
+ private const byte IoUringReservedCompletionTag = 2;
+ private const long MultishotAcceptStateDisarmed = 0;
+ private const long MultishotAcceptStateArming = 1;
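+        // Packed-state sketch: _multishotAcceptState holds 0 (disarmed), 1 (arming), or the
+        // reserved-completion user_data of the armed SQE, whose top byte (bits 56-63) carries
+        // IoUringReservedCompletionTag; DecodeMultishotAcceptUserData checks that tag before
+        // treating the state as a user_data value.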
+        private Queue<PreAcceptedConnection>? _multishotAcceptQueue;
+ private long _multishotAcceptState; // 0=disarmed, 1=arming, otherwise encoded reserved-completion user_data
+ private ulong _persistentMultishotRecvUserData; // user_data of armed multishot recv SQE
+ private int _persistentMultishotRecvArmed; // 0=not armed, 1=armed
+        private Queue<BufferedPersistentMultishotRecvData>? _persistentMultishotRecvDataQueue;
+ private BufferedPersistentMultishotRecvData _persistentMultishotRecvDataHead;
+ private bool _hasPersistentMultishotRecvDataHead;
+ private int _persistentMultishotRecvDataHeadOffset;
+ private Lock? _multishotAcceptQueueGate;
+ private Lock? _persistentMultishotRecvDataGate;
+
+ private readonly struct BufferedPersistentMultishotRecvData
+ {
+ internal readonly byte[] Data;
+ internal readonly int Length;
+ internal readonly bool UsesPooledBuffer;
+
+ internal BufferedPersistentMultishotRecvData(byte[] data, int length, bool usesPooledBuffer)
+ {
+ Data = data;
+ Length = length;
+ UsesPooledBuffer = usesPooledBuffer;
+ }
+ }
+
+ /// Holds a pre-accepted connection's fd and socket address from a multishot accept CQE.
+ private readonly struct PreAcceptedConnection
+ {
+ internal readonly IntPtr FileDescriptor;
+ internal readonly byte[] SocketAddressData;
+ internal readonly int SocketAddressLength;
+ internal readonly bool UsesPooledBuffer;
+
+ internal PreAcceptedConnection(IntPtr fileDescriptor, byte[] socketAddressData, int socketAddressLength, bool usesPooledBuffer)
+ {
+ FileDescriptor = fileDescriptor;
+ SocketAddressData = socketAddressData;
+ SocketAddressLength = socketAddressLength;
+ UsesPooledBuffer = usesPooledBuffer;
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private Lock EnsureMultishotAcceptQueueGate() => EnsureLockInitialized(ref _multishotAcceptQueueGate);
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private Lock EnsurePersistentMultishotRecvDataGate() => EnsureLockInitialized(ref _persistentMultishotRecvDataGate);
+
+ private int PersistentMultishotRecvBufferedCount =>
+ (_persistentMultishotRecvDataQueue?.Count ?? 0) + (_hasPersistentMultishotRecvDataHead ? 1 : 0);
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Lock EnsureLockInitialized(ref Lock? gate)
+ {
+ Lock? existing = Volatile.Read(ref gate);
+ if (existing is not null)
+ {
+ return existing;
+ }
+
+ Lock created = new Lock();
+ Lock? prior = Interlocked.CompareExchange(ref gate, created, null);
+ return prior ?? created;
+ }
+
+ /// Returns whether this context's engine is using io_uring completion mode.
+ private bool IsIoUringCompletionModeEnabled()
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+ return engine is not null && engine.IsIoUringCompletionModeEnabled;
+ }
+
+ /// Returns the total count of non-pinnable buffer prepare fallbacks across active engines.
+ internal static long GetIoUringNonPinnablePrepareFallbackCount() =>
+ SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackCount();
+
+ /// Test-only setter for the non-pinnable fallback counter.
+ internal static void SetIoUringNonPinnablePrepareFallbackCountForTest(long value) =>
+ SocketAsyncEngine.SetIoUringNonPinnablePrepareFallbackCountForTest(value);
+
+ internal static bool TryGetSocketAsyncContextForTest(Socket socket, out SocketAsyncContext? context)
+ {
+ try
+ {
+ context = socket.SafeHandle.AsyncContext;
+ return true;
+ }
+ catch (ObjectDisposedException)
+ {
+ context = null;
+ return false;
+ }
+ }
+
+ internal static bool IsMultishotAcceptArmedForTest(Socket socket)
+ {
+ if (!TryGetSocketAsyncContextForTest(socket, out SocketAsyncContext? context) || context is null)
+ {
+ return false;
+ }
+
+ return context.IsMultishotAcceptArmed;
+ }
+
+ internal static int GetMultishotAcceptQueueCountForTest(Socket socket)
+ {
+ if (!TryGetSocketAsyncContextForTest(socket, out SocketAsyncContext? context) || context is null)
+ {
+ return 0;
+ }
+
+ Lock gate = context.EnsureMultishotAcceptQueueGate();
+ lock (gate)
+ {
+ return context._multishotAcceptQueue?.Count ?? 0;
+ }
+ }
+
+ internal static bool IsPersistentMultishotRecvArmedForTest(Socket socket)
+ {
+ if (!TryGetSocketAsyncContextForTest(socket, out SocketAsyncContext? context) || context is null)
+ {
+ return false;
+ }
+
+ return context.IsPersistentMultishotRecvArmed();
+ }
+
+ internal static ulong GetPersistentMultishotRecvUserDataForTest(Socket socket)
+ {
+ if (!TryGetSocketAsyncContextForTest(socket, out SocketAsyncContext? context) || context is null)
+ {
+ return 0;
+ }
+
+ return context.IsPersistentMultishotRecvArmed() ? context.PersistentMultishotRecvUserData : 0;
+ }
+
+ internal static int GetPersistentMultishotRecvBufferedCountForTest(Socket socket)
+ {
+ if (!TryGetSocketAsyncContextForTest(socket, out SocketAsyncContext? context) || context is null)
+ {
+ return 0;
+ }
+
+ Lock gate = context.EnsurePersistentMultishotRecvDataGate();
+ lock (gate)
+ {
+ return context.PersistentMultishotRecvBufferedCount;
+ }
+ }
+
+ /// Returns whether a multishot accept SQE is currently armed for this context.
+ internal bool IsMultishotAcceptArmed => Volatile.Read(ref _multishotAcceptState) != MultishotAcceptStateDisarmed;
+
+ /// Returns the user_data payload for the armed multishot accept SQE, if any.
+ internal ulong MultishotAcceptUserData => DecodeMultishotAcceptUserData(Volatile.Read(ref _multishotAcceptState));
+
+ /// Clears multishot accept armed-state for this context.
+ internal void DisarmMultishotAccept()
+ {
+ Volatile.Write(ref _multishotAcceptState, MultishotAcceptStateDisarmed);
+ }
+
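+ // Decodes the packed accept state: only values whose top byte carries the reserved
+ // completion tag are real user_data payloads; the sentinel states (0=disarmed,
+ // 1=arming) decode to 0 so callers never issue a cancellation against a bogus user_data.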
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static ulong DecodeMultishotAcceptUserData(long packedState)
+ {
+ ulong rawState = (ulong)packedState;
+ return (byte)(rawState >> IoUringUserDataTagShift) == IoUringReservedCompletionTag
+ ? rawState
+ : 0;
+ }
+
+ /// Returns whether a persistent multishot recv SQE is currently armed for this context.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool IsPersistentMultishotRecvArmed() =>
+ Volatile.Read(ref _persistentMultishotRecvArmed) != 0;
+
+ /// Records that a persistent multishot recv SQE has been armed for this context.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void SetPersistentMultishotRecvArmed(ulong userData)
+ {
+ Volatile.Write(ref _persistentMultishotRecvUserData, userData);
+ Volatile.Write(ref _persistentMultishotRecvArmed, 1);
+ }
+
+ /// Clears this context's armed persistent multishot recv state.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void ClearPersistentMultishotRecvArmed()
+ {
+ Volatile.Write(ref _persistentMultishotRecvUserData, 0);
+ Volatile.Write(ref _persistentMultishotRecvArmed, 0);
+ }
+
+ /// Gets the user_data of the armed persistent multishot recv SQE, or 0 if none is armed.
+ internal ulong PersistentMultishotRecvUserData =>
+ Volatile.Read(ref _persistentMultishotRecvUserData);
+
+ /// <summary>
+ /// Clears persistent multishot recv armed-state and requests ASYNC_CANCEL for
+ /// the armed user_data when available.
+ /// </summary>
+ internal void RequestPersistentMultishotRecvCancel()
+ {
+ ulong recvUserData = Volatile.Read(ref _persistentMultishotRecvUserData);
+ ClearPersistentMultishotRecvArmed();
+ if (recvUserData != 0)
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+ engine?.TryRequestIoUringCancellation(recvUserData);
+ }
+ }
+
+ /// Copies an early multishot-recv payload into the per-socket replay queue.
+ internal bool TryBufferEarlyPersistentMultishotRecvData(ReadOnlySpan<byte> payload)
+ {
+ if (payload.Length == 0)
+ {
+ return true;
+ }
+
+ EnsurePersistentMultishotRecvDataQueueInitialized();
+ Queue<BufferedPersistentMultishotRecvData>? queue = _persistentMultishotRecvDataQueue;
+ if (queue is null)
+ {
+ return false;
+ }
+
+ byte[] copy = ArrayPool<byte>.Shared.Rent(payload.Length);
+ payload.CopyTo(copy);
+ Lock gate = EnsurePersistentMultishotRecvDataGate();
+ lock (gate)
+ {
+ if (PersistentMultishotRecvBufferedCount >= PersistentMultishotRecvDataQueueMaxSize)
+ {
+ ArrayPool<byte>.Shared.Return(copy);
+ return false;
+ }
+
+ // Publish queue count only after enqueue to avoid teardown observing phantom items.
+ queue.Enqueue(new BufferedPersistentMultishotRecvData(copy, payload.Length, usesPooledBuffer: true));
+ }
+
+ return true;
+ }
+
+ /// Attempts to drain buffered multishot-recv payload into the caller's destination.
+ internal bool TryConsumeBufferedPersistentMultishotRecvData(Memory<byte> destination, out int bytesTransferred)
+ {
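+ // Drain model: one "head" entry plus a byte offset lets a single buffered payload
+ // be consumed across several partial receives; further payloads stay queued until
+ // the head is fully drained and its pooled buffer released.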
+ bytesTransferred = 0;
+ if (destination.Length == 0)
+ {
+ return false;
+ }
+
+ Lock gate = EnsurePersistentMultishotRecvDataGate();
+ byte[] sourceBuffer;
+ int sourceOffset;
+ int toCopy;
+ bool releaseHeadAfterCopy;
+ BufferedPersistentMultishotRecvData sourceHead;
+ lock (gate)
+ {
+ if (!TryAcquirePersistentMultishotRecvDataHead(out BufferedPersistentMultishotRecvData buffered))
+ {
+ return false;
+ }
+
+ int headOffset = _persistentMultishotRecvDataHeadOffset;
+ int remaining = buffered.Length - headOffset;
+ Debug.Assert(remaining > 0);
+ if (remaining <= 0)
+ {
+ ReleasePersistentMultishotRecvDataHead();
+ return false;
+ }
+
+ toCopy = Math.Min(destination.Length, remaining);
+ sourceBuffer = buffered.Data;
+ sourceOffset = headOffset;
+ sourceHead = buffered;
+ _persistentMultishotRecvDataHeadOffset = headOffset + toCopy;
+ releaseHeadAfterCopy = _persistentMultishotRecvDataHeadOffset >= buffered.Length;
+ }
+
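+ // The copy runs outside the gate so producers are not blocked behind a potentially
+ // large memcpy; the identity re-check below guards against the head having been
+ // released and replaced (e.g. by teardown) while the gate was dropped.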
+ sourceBuffer.AsSpan(sourceOffset, toCopy).CopyTo(destination.Span);
+ bytesTransferred = toCopy;
+
+ if (releaseHeadAfterCopy)
+ {
+ lock (gate)
+ {
+ if (_hasPersistentMultishotRecvDataHead &&
+ _persistentMultishotRecvDataHead.Length == sourceHead.Length &&
+ ReferenceEquals(_persistentMultishotRecvDataHead.Data, sourceHead.Data) &&
+ _persistentMultishotRecvDataHeadOffset >= sourceHead.Length)
+ {
+ ReleasePersistentMultishotRecvDataHead();
+ }
+ }
+ }
+
+ return true;
+ }
+
+ /// Ensures the pre-accepted connection queue exists.
+ private void EnsureMultishotAcceptQueueInitialized()
+ {
+ if (_multishotAcceptQueue is null)
+ {
+ Lock gate = EnsureMultishotAcceptQueueGate();
+ lock (gate)
+ {
+ _multishotAcceptQueue ??= new Queue<PreAcceptedConnection>();
+ }
+ }
+ }
+
+ /// <summary>
+ /// Attempts to enqueue a pre-accepted connection from a multishot accept CQE.
+ /// Caller is responsible for closing <paramref name="acceptedFd"/> when this returns false.
+ /// </summary>
+ internal bool TryEnqueuePreAcceptedConnection(IntPtr acceptedFd, ReadOnlySpan<byte> socketAddressData, int socketAddressLen)
+ {
+ EnsureMultishotAcceptQueueInitialized();
+ Queue<PreAcceptedConnection>? queue = _multishotAcceptQueue;
+ if (queue is null)
+ {
+ return false;
+ }
+
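+ // Clamp the reported address length to [0, buffer length]; accept(2)-style APIs may
+ // report an address length larger than the supplied buffer when truncation occurred.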
+ int length = socketAddressLen;
+ if (length < 0)
+ {
+ length = 0;
+ }
+
+ if ((uint)length > (uint)socketAddressData.Length)
+ {
+ length = socketAddressData.Length;
+ }
+
+ Lock gate = EnsureMultishotAcceptQueueGate();
+ lock (gate)
+ {
+ if (queue.Count >= MultishotAcceptQueueMaxSize)
+ {
+ return false;
+ }
+
+ byte[] copy;
+ if (length != 0)
+ {
+ copy = ArrayPool<byte>.Shared.Rent(length);
+ socketAddressData.Slice(0, length).CopyTo(copy);
+ }
+ else
+ {
+ copy = Array.Empty<byte>();
+ }
+
+ queue.Enqueue(new PreAcceptedConnection(acceptedFd, copy, length, usesPooledBuffer: length != 0));
+ }
+
+ return true;
+ }
+
+ /// <summary>
+ /// Attempts to dequeue a pre-accepted connection from the multishot accept queue.
+ /// Returns true if a connection was available, populating the operation fields.
+ /// </summary>
+ internal bool TryDequeuePreAcceptedConnection(AcceptOperation operation)
+ {
+ EnsureMultishotAcceptQueueInitialized();
+ Queue<PreAcceptedConnection>? queue = _multishotAcceptQueue;
+ if (queue is null)
+ {
+ return false;
+ }
+
+ PreAcceptedConnection accepted;
+ Lock gate = EnsureMultishotAcceptQueueGate();
+ lock (gate)
+ {
+ if (queue.Count == 0)
+ {
+ return false;
+ }
+
+ accepted = queue.Dequeue();
+ }
+
+ try
+ {
+ operation.AcceptedFileDescriptor = accepted.FileDescriptor;
+ int socketAddressLen = accepted.SocketAddressLength;
+ if ((uint)socketAddressLen > (uint)operation.SocketAddress.Length)
+ {
+ socketAddressLen = operation.SocketAddress.Length;
+ }
+
+ if (socketAddressLen != 0)
+ {
+ accepted.SocketAddressData.AsSpan(0, socketAddressLen).CopyTo(operation.SocketAddress.Span);
+ }
+
+ operation.AcceptSocketAddressLength = socketAddressLen;
+ operation.SocketAddress = operation.SocketAddress.Slice(0, socketAddressLen);
+ operation.ErrorCode = SocketError.Success;
+ return true;
+ }
+ finally
+ {
+ ReturnPooledBufferIfNeeded(accepted.SocketAddressData, accepted.UsesPooledBuffer);
+ }
+ }
+
+ /// Removes a completed io_uring operation from its queue, then signals its event or dispatches its completion callback.
+ internal bool TryCompleteIoUringOperation(AsyncOperation operation)
+ {
+ bool removed =
+ operation is ReadOperation readOperation ? _receiveQueue.TryRemoveCompletedOperation(this, readOperation) :
+ operation is WriteOperation writeOperation ? _sendQueue.TryRemoveCompletedOperation(this, writeOperation) :
+ false;
+ if (!removed)
+ {
+ return false;
+ }
+
+ ManualResetEventSlim? e = operation.Event;
+ if (e is not null)
+ {
+ e.Set();
+ return true;
+ }
+
+ operation.CancellationRegistration.Dispose();
+ if (ShouldDispatchCompletionCallback(operation))
+ {
+ if (PreferInlineCompletions)
+ {
+ // Inline completion: invoke directly on the event-loop thread,
+ // matching the epoll path (HandleEventsInline). This avoids the
+ // ThreadPool hop for latency-sensitive workloads that opted in
+ // via DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS=1.
+ operation.InvokeCallback(allowPooling: true);
+ }
+ else
+ {
+ operation.QueueIoUringCompletionCallback();
+ }
+ }
+
+ return true;
+ }
+
+ /// Enqueues an operation for deferred SQE preparation on the event loop thread.
+ private bool TryEnqueueIoUringPreparation(AsyncOperation operation, long prepareSequence)
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+ return engine is not null && engine.TryEnqueueIoUringPreparation(operation, prepareSequence);
+ }
+
+ /// Applies cancellation and/or untracking to an operation's io_uring state.
+ private void HandleIoUringCancellationTransition(
+ AsyncOperation operation,
+ bool requestKernelCancellation,
+ bool untrackAndClear)
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+ ulong userData = operation.IoUringUserData;
+ if (userData == 0)
+ {
+ return;
+ }
+
+ if (requestKernelCancellation)
+ {
+ engine?.TryRequestIoUringCancellation(userData);
+ }
+
+ if (untrackAndClear)
+ {
+ bool clearAllowed = engine?.TryUntrackIoUringOperation(userData, operation) ?? true;
+ if (clearAllowed)
+ {
+ operation.ClearIoUringUserData();
+ }
+ }
+ }
+
+ /// Requests kernel-level ASYNC_CANCEL for an in-flight operation.
+ private void TryRequestIoUringCancellation(AsyncOperation operation)
+ {
+ HandleIoUringCancellationTransition(
+ operation,
+ requestKernelCancellation: true,
+ untrackAndClear: false);
+ }
+
+ /// Removes an operation from the registry and clears its user_data.
+ internal void TryUntrackIoUringOperation(AsyncOperation operation)
+ {
+ HandleIoUringCancellationTransition(
+ operation,
+ requestKernelCancellation: false,
+ untrackAndClear: true);
+ }
+
+ /// Stages an operation for io_uring preparation if completion mode is active.
+ static partial void LinuxTryStageIoUringOperation(AsyncOperation operation)
+ {
+ if (operation.Event is null && operation.AssociatedContext.IsIoUringCompletionModeEnabled())
+ {
+ if (!operation.TryQueueIoUringPreparation())
+ {
+ operation.EmitReadinessFallbackForQueueOverflow();
+ }
+ }
+ }
+
+ partial void LinuxTryDequeuePreAcceptedConnection(AcceptOperation operation, ref bool dequeued)
+ {
+ dequeued = TryDequeuePreAcceptedConnection(operation);
+ }
+
+ partial void LinuxTryConsumeBufferedPersistentMultishotRecvData(Memory<byte> destination, ref bool consumed, ref int bytesTransferred)
+ {
+ consumed = TryConsumeBufferedPersistentMultishotRecvData(destination, out bytesTransferred);
+ }
+
+ /// Cleans up multishot-accept state and queued pre-accepted descriptors during abort.
+ partial void LinuxOnStopAndAbort()
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+ if (IsPersistentMultishotRecvArmed())
+ {
+ RequestPersistentMultishotRecvCancel();
+ }
+
+ ulong armedUserData = GetArmedMultishotAcceptUserDataForCancellation();
+ if (engine is not null && armedUserData != 0)
+ {
+ engine.TryRequestIoUringCancellation(armedUserData);
+ }
+
+ DisarmMultishotAccept();
+
+ Queue<PreAcceptedConnection>? multishotAcceptQueue = _multishotAcceptQueue;
+ if (multishotAcceptQueue is not null)
+ {
+ while (true)
+ {
+ PreAcceptedConnection accepted;
+ Lock gate = EnsureMultishotAcceptQueueGate();
+ lock (gate)
+ {
+ if (multishotAcceptQueue.Count == 0)
+ {
+ break;
+ }
+
+ accepted = multishotAcceptQueue.Dequeue();
+ }
+
+ Interop.Sys.Close(accepted.FileDescriptor);
+ ReturnPooledBufferIfNeeded(accepted.SocketAddressData, accepted.UsesPooledBuffer);
+ }
+ }
+
+ Lock persistentGate = EnsurePersistentMultishotRecvDataGate();
+ lock (persistentGate)
+ {
+ ReleasePersistentMultishotRecvDataHead();
+
+ Queue<BufferedPersistentMultishotRecvData>? bufferedQueue = _persistentMultishotRecvDataQueue;
+ if (bufferedQueue is not null)
+ {
+ while (bufferedQueue.Count != 0)
+ {
+ BufferedPersistentMultishotRecvData buffered = bufferedQueue.Dequeue();
+ ReturnPooledBufferIfNeeded(buffered.Data, buffered.UsesPooledBuffer);
+ }
+ }
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void EnsurePersistentMultishotRecvDataQueueInitialized()
+ {
+ if (_persistentMultishotRecvDataQueue is null)
+ {
+ Lock gate = EnsurePersistentMultishotRecvDataGate();
+ lock (gate)
+ {
+ _persistentMultishotRecvDataQueue ??= new Queue<BufferedPersistentMultishotRecvData>();
+ }
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryAcquirePersistentMultishotRecvDataHead(out BufferedPersistentMultishotRecvData buffered)
+ {
+ if (_hasPersistentMultishotRecvDataHead)
+ {
+ buffered = _persistentMultishotRecvDataHead;
+ return true;
+ }
+
+ Queue<BufferedPersistentMultishotRecvData>? queue = _persistentMultishotRecvDataQueue;
+ if (queue is null || queue.Count == 0)
+ {
+ buffered = default;
+ return false;
+ }
+
+ BufferedPersistentMultishotRecvData dequeued = queue.Dequeue();
+ _persistentMultishotRecvDataHead = dequeued;
+ _hasPersistentMultishotRecvDataHead = true;
+ _persistentMultishotRecvDataHeadOffset = 0;
+ buffered = dequeued;
+ return true;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ReleasePersistentMultishotRecvDataHead()
+ {
+ if (!_hasPersistentMultishotRecvDataHead)
+ {
+ return;
+ }
+
+ BufferedPersistentMultishotRecvData head = _persistentMultishotRecvDataHead;
+ _persistentMultishotRecvDataHead = default;
+ _hasPersistentMultishotRecvDataHead = false;
+ _persistentMultishotRecvDataHeadOffset = 0;
+ ReturnPooledBufferIfNeeded(head.Data, head.UsesPooledBuffer);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void ReturnPooledBufferIfNeeded(byte[] buffer, bool usesPooledBuffer)
+ {
+ if (usesPooledBuffer)
+ {
+ ArrayPool<byte>.Shared.Return(buffer);
+ }
+ }
+
+ private ulong GetArmedMultishotAcceptUserDataForCancellation()
+ {
+ long packedState = Volatile.Read(ref _multishotAcceptState);
+ ulong userData = DecodeMultishotAcceptUserData(packedState);
+ if (userData != 0 || packedState == MultishotAcceptStateDisarmed)
+ {
+ return userData;
+ }
+
+ // A transient "arming without published user_data" state can race this read.
+ // Bounded spin is best-effort; a miss is benign because later cancellation
+ // and teardown paths still disarm and clean up safely.
+ SpinWait spinner = default;
+ do
+ {
+ spinner.SpinOnce();
+ packedState = Volatile.Read(ref _multishotAcceptState);
+ userData = DecodeMultishotAcceptUserData(packedState);
+ if (userData != 0 || packedState == MultishotAcceptStateDisarmed)
+ {
+ break;
+ }
+ } while (!spinner.NextSpinWillYield);
+
+ return userData;
+ }
+
+ internal abstract partial class AsyncOperation
+ {
+ /// Outcome of processing an io_uring CQE, determining the dispatch action.
+ internal enum IoUringCompletionResult
+ {
+ Completed = 0,
+ Pending = 1,
+ Canceled = 2,
+ Ignored = 3
+ }
+
+ /// Tri-state result from direct (managed) SQE preparation.
+ internal enum IoUringDirectPrepareResult
+ {
+ Unsupported = 0, // Direct path unavailable for this shape; caller keeps operation pending.
+ Prepared = 1, // SQE written
+ PrepareFailed = 2 // Direct preparation failed; caller handles retry/fallback without native prepare.
+ }
+
+ /// Tracks whether a receive operation prepared as one-shot or multishot.
+ internal enum IoUringReceiveSubmissionMode : byte
+ {
+ None = 0,
+ OneShot = 1,
+ Multishot = 2
+ }
+
+ private long _ioUringPrepareSequence;
+ private int _ioUringPrepareQueued;
+ private int _ioUringPreparationReusable;
+ private MemoryHandle _ioUringPinnedBuffer;
+ private int _ioUringPinnedBufferActive;
+ private int _ioUringCompletionSocketAddressLen;
+ private int _ioUringCompletionControlBufferLen;
+ private int _ioUringReceiveSubmissionMode;
+ private int _ioUringSlotExhaustionRetryCount;
+ internal ulong IoUringUserData;
+
+ /// Requests kernel cancellation if the flag is set.
+ partial void LinuxRequestIoUringCancellationIfNeeded(bool requestIoUringCancellation)
+ {
+ if (requestIoUringCancellation)
+ {
+ AssociatedContext.TryRequestIoUringCancellation(this);
+ }
+ }
+
+ /// Untracks this operation unless it is in the Canceled state awaiting a terminal CQE.
+ partial void LinuxUntrackIoUringOperation()
+ {
+ // Canceled operations remain tracked until the terminal CQE arrives so that
+ // pinned/user-owned resources are not released while the kernel may still
+ // reference them. Dispatch will clear resources on that terminal completion.
+ if (_state == State.Canceled)
+ {
+ return;
+ }
+
+ AssociatedContext.TryUntrackIoUringOperation(this);
+ }
+
+ /// Resets all io_uring preparation state and advances the prepare sequence.
+ partial void ResetIoUringState()
+ {
+ ReleaseIoUringPreparationResources();
+ IoUringUserData = 0;
+ Volatile.Write(ref _ioUringPreparationReusable, 0);
+ _ioUringCompletionSocketAddressLen = 0;
+ _ioUringCompletionControlBufferLen = 0;
+ _ioUringReceiveSubmissionMode = (int)IoUringReceiveSubmissionMode.None;
+ _ioUringSlotExhaustionRetryCount = 0;
+ long nextPrepareSequence = unchecked(_ioUringPrepareSequence + 1);
+ // Keep sequence strictly positive so stale queued work from previous resets never matches.
+ if (nextPrepareSequence <= 0)
+ {
+ nextPrepareSequence = 1;
+ }
+
+ Volatile.Write(ref _ioUringPrepareSequence, nextPrepareSequence);
+ Volatile.Write(ref _ioUringPrepareQueued, 0);
+ }
+
+ /// Marks this operation as ready for SQE preparation and returns its sequence number.
+ internal long MarkReadyForIoUringPreparation()
+ {
+ long prepareSequence = Volatile.Read(ref _ioUringPrepareSequence);
+ Debug.Assert(prepareSequence > 0);
+ Volatile.Write(ref _ioUringPrepareQueued, 1);
+ return prepareSequence;
+ }
+
+ /// Cancels a pending preparation if the sequence number still matches.
+ internal void CancelPendingIoUringPreparation(long prepareSequence)
+ {
+ if (Volatile.Read(ref _ioUringPrepareSequence) == prepareSequence)
+ {
+ Volatile.Write(ref _ioUringPrepareQueued, 0);
+ }
+ }
+
+ /// Attempts to prepare an SQE for this operation via the managed direct path.
+ internal bool TryPrepareIoUring(SocketAsyncContext context, long prepareSequence)
+ {
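+ // Prepare-sequence protocol: MarkReadyForIoUringPreparation publishes the current
+ // sequence and sets the queued flag; this method claims the work by swapping the
+ // flag back to 0 and re-validating the sequence, so work queued before a reset can
+ // never prepare an SQE on behalf of a recycled operation.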
+ if (prepareSequence <= 0 ||
+ Volatile.Read(ref _ioUringPrepareSequence) != prepareSequence ||
+ Interlocked.Exchange(ref _ioUringPrepareQueued, 0) == 0 ||
+ _state != State.Waiting)
+ {
+ return false;
+ }
+
+ if (Interlocked.Exchange(ref _ioUringPreparationReusable, 0) == 0)
+ {
+ ReleaseIoUringPreparationResources();
+ }
+
+ SocketAsyncEngine? engine = Volatile.Read(ref context._asyncEngine);
+ if (engine is null || !engine.IsIoUringDirectSqeEnabled)
+ {
+ // Managed completion mode assumes direct SQE submission.
+ // If direct submission is unavailable, keep operation pending for fallback handling.
+ ErrorCode = SocketError.Success;
+ IoUringUserData = 0;
+ return false;
+ }
+
+ IoUringDirectPrepareResult directResult = IoUringPrepareDirect(context, engine, out ulong directUserData);
+ if (directResult == IoUringDirectPrepareResult.Prepared)
+ {
+ _ioUringSlotExhaustionRetryCount = 0;
+ IoUringUserData = ErrorCode == SocketError.Success ? directUserData : 0;
+ return true;
+ }
+
+ if (directResult == IoUringDirectPrepareResult.PrepareFailed)
+ {
+ IoUringUserData = 0;
+ return false;
+ }
+
+ // Direct preparation unsupported for this operation shape.
+ // Leave operation pending so caller can use completion-path fallback semantics.
+ ErrorCode = SocketError.Success;
+ IoUringUserData = 0;
+ return false;
+ }
+
+ /// Queues this operation for deferred preparation on the event loop thread.
+ internal bool TryQueueIoUringPreparation()
+ {
+ if (!AssociatedContext.IsIoUringCompletionModeEnabled())
+ {
+ return false;
+ }
+
+ long prepareSequence = MarkReadyForIoUringPreparation();
+ if (AssociatedContext.TryEnqueueIoUringPreparation(this, prepareSequence))
+ {
+ return true;
+ }
+
+ CancelPendingIoUringPreparation(prepareSequence);
+ return false;
+ }
+
+ /// Returns whether this operation is currently in the waiting state.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool IsInWaitingState() => _state == State.Waiting;
+
+ /// Increments and returns the slot-exhaustion retry count for this operation.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal int IncrementIoUringSlotExhaustionRetryCount() => ++_ioUringSlotExhaustionRetryCount;
+
+ /// Resets slot-exhaustion retry tracking for this operation.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void ResetIoUringSlotExhaustionRetryCount() => _ioUringSlotExhaustionRetryCount = 0;
+
+ /// <summary>
+ /// Emits a readiness fallback event when io_uring prepare-queue staging fails.
+ /// </summary>
+ internal void EmitReadinessFallbackForQueueOverflow()
+ {
+ Interop.Sys.SocketEvents fallbackEvents = GetIoUringFallbackSocketEvents();
+ if (fallbackEvents == Interop.Sys.SocketEvents.None)
+ {
+ return;
+ }
+
+ SocketAsyncContext context = AssociatedContext;
+ SocketAsyncEngine? engine = Volatile.Read(ref context._asyncEngine);
+ if (engine is null)
+ {
+ return;
+ }
+
+ engine.EnqueueReadinessFallbackEvent(
+ context,
+ fallbackEvents,
+ countAsPrepareQueueOverflowFallback: true);
+ }
+
+ /// Processes a CQE result and returns the dispatch action for the completion handler.
+ internal IoUringCompletionResult ProcessIoUringCompletionResult(int result, uint flags, uint auxiliaryData)
+ {
+ Trace($"Enter, result={result}, flags={flags}, auxiliaryData={auxiliaryData}");
+
+ // Claim ownership of completion processing; if cancellation already won, do not publish completion.
+ State oldState = Interlocked.CompareExchange(ref _state, State.Running, State.Waiting);
+ if (oldState == State.Canceled)
+ {
+ Trace("Exit, previously canceled");
+ return IoUringCompletionResult.Canceled;
+ }
+
+ if (oldState != State.Waiting)
+ {
+ Trace("Exit, ignored");
+ return IoUringCompletionResult.Ignored;
+ }
+
+ if (ProcessIoUringCompletionViaDiscriminator(AssociatedContext, result, auxiliaryData))
+ {
+ _state = State.Complete;
+ Trace("Exit, completed");
+ return IoUringCompletionResult.Completed;
+ }
+
+ // Incomplete path (e.g. transient retry): mirror TryComplete state transition handling.
+ State newState;
+ while (true)
+ {
+ State state = _state;
+ Debug.Assert(state is State.Running or State.RunningWithPendingCancellation, $"Unexpected operation state: {(State)state}");
+
+ newState = (state == State.Running ? State.Waiting : State.Canceled);
+ if (state == Interlocked.CompareExchange(ref _state, newState, state))
+ {
+ break;
+ }
+ }
+
+ if (newState == State.Canceled)
+ {
+ ProcessCancellation();
+ Trace("Exit, canceled while pending");
+ return IoUringCompletionResult.Canceled;
+ }
+
+ Trace("Exit, pending");
+ return IoUringCompletionResult.Pending;
+ }
+
+ /// Stores recvmsg output lengths from the CQE for post-completion processing.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void SetIoUringCompletionMessageMetadata(int socketAddressLen, int controlBufferLen)
+ {
+ _ioUringCompletionSocketAddressLen = socketAddressLen;
+ _ioUringCompletionControlBufferLen = controlBufferLen;
+ }
+
+ /// Releases preparation resources and resets the user_data to zero.
+ internal void ClearIoUringUserData()
+ {
+ ReleaseIoUringPreparationResources();
+ IoUringUserData = 0;
+ Volatile.Write(ref _ioUringPreparationReusable, 0);
+ _ioUringCompletionSocketAddressLen = 0;
+ _ioUringCompletionControlBufferLen = 0;
+ _ioUringReceiveSubmissionMode = (int)IoUringReceiveSubmissionMode.None;
+ _ioUringSlotExhaustionRetryCount = 0;
+ }
+
+ /// Clears user_data without releasing preparation resources for pending requeue.
+ internal void ResetIoUringUserDataForRequeue()
+ {
+ IoUringUserData = 0;
+ _ioUringCompletionSocketAddressLen = 0;
+ _ioUringCompletionControlBufferLen = 0;
+ }
+
+ /// Records whether the current receive preparation uses one-shot or multishot mode.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ protected void SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode mode)
+ {
+ Volatile.Write(ref _ioUringReceiveSubmissionMode, (int)mode);
+ }
+
+ /// Marks preparation resources as reusable so the next prepare skips re-pinning.
+ internal void MarkIoUringPreparationReusable()
+ {
+ Volatile.Write(ref _ioUringPreparationReusable, 1);
+ }
+
+ /// Socket address length reported by the kernel in the CQE.
+ protected int IoUringCompletionSocketAddressLen => _ioUringCompletionSocketAddressLen;
+ /// Control buffer length reported by the kernel in the CQE.
+ protected int IoUringCompletionControlBufferLen => _ioUringCompletionControlBufferLen;
+
+ /// Pins a buffer and returns the raw pointer, recording the handle for later release.
+ protected unsafe byte* PinIoUringBuffer(Memory<byte> buffer)
+ {
+ ReleasePinnedIoUringBuffer();
+ if (buffer.Length == 0)
+ {
+ return null;
+ }
+
+ _ioUringPinnedBuffer = buffer.Pin();
+ Volatile.Write(ref _ioUringPinnedBufferActive, 1);
+ return (byte*)_ioUringPinnedBuffer.Pointer;
+ }
+
+ /// Attempts to pin a buffer, falling back to the readiness path if not pinnable.
+ protected unsafe bool TryPinIoUringBuffer(Memory<byte> buffer, out byte* pinnedBuffer)
+ {
+ if (Volatile.Read(ref _ioUringPinnedBufferActive) != 0)
+ {
+ pinnedBuffer = (byte*)_ioUringPinnedBuffer.Pointer;
+ if (buffer.Length > 0 && pinnedBuffer is null)
+ {
+ ReleasePinnedIoUringBuffer();
+ RecordIoUringNonPinnablePrepareFallback("null-reused-pin-pointer", buffer.Length);
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ return true;
+ }
+
+ try
+ {
+ pinnedBuffer = PinIoUringBuffer(buffer);
+ if (buffer.Length > 0 && pinnedBuffer is null)
+ {
+ ReleasePinnedIoUringBuffer();
+ RecordIoUringNonPinnablePrepareFallback("null-pin-pointer", buffer.Length);
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ return true;
+ }
+ catch (NotSupportedException)
+ {
+ pinnedBuffer = null;
+ RecordIoUringNonPinnablePrepareFallback("pin-not-supported", buffer.Length);
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+ }
+
+ /// Transfers ownership of the active pinned buffer to the caller.
+ internal MemoryHandle TransferPinnedBuffer()
+ {
+ if (Interlocked.Exchange(ref _ioUringPinnedBufferActive, 0) == 0)
+ {
+ return default;
+ }
+
+ MemoryHandle pinnedBuffer = _ioUringPinnedBuffer;
+ _ioUringPinnedBuffer = default;
+ return pinnedBuffer;
+ }
+
+ /// <summary>
+ /// Attempts to pin a socket address buffer, reusing an existing pin when possible.
+ /// Caller is responsible for setting operation ErrorCode on failure if needed.
+ /// </summary>
+ protected static unsafe bool TryPinIoUringSocketAddress(
+ Memory<byte> socketAddress,
+ ref MemoryHandle pinnedSocketAddress,
+ ref int pinnedSocketAddressActive,
+ out byte* rawSocketAddress)
+ {
+ rawSocketAddress = null;
+ if (socketAddress.Length == 0)
+ {
+ return true;
+ }
+
+ if (Volatile.Read(ref pinnedSocketAddressActive) != 0)
+ {
+ rawSocketAddress = (byte*)pinnedSocketAddress.Pointer;
+ if (rawSocketAddress is null)
+ {
+ pinnedSocketAddress.Dispose();
+ pinnedSocketAddress = default;
+ Volatile.Write(ref pinnedSocketAddressActive, 0);
+ return false;
+ }
+
+ return true;
+ }
+
+ try
+ {
+ pinnedSocketAddress = socketAddress.Pin();
+ Volatile.Write(ref pinnedSocketAddressActive, 1);
+ }
+ catch (NotSupportedException)
+ {
+ rawSocketAddress = null;
+ return false;
+ }
+
+ rawSocketAddress = (byte*)pinnedSocketAddress.Pointer;
+ if (rawSocketAddress is null)
+ {
+ pinnedSocketAddress.Dispose();
+ pinnedSocketAddress = default;
+ Volatile.Write(ref pinnedSocketAddressActive, 0);
+ return false;
+ }
+
+ return true;
+ }
+
+ /// <summary>
+ /// Pins a socket address buffer and normalizes pinning failures to a non-terminal fallback signal.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ protected unsafe bool TryPinIoUringSocketAddressForPrepare(
+ Memory<byte> socketAddress,
+ ref MemoryHandle pinnedSocketAddress,
+ ref int pinnedSocketAddressActive,
+ out byte* rawSocketAddress)
+ {
+ if (TryPinIoUringSocketAddress(
+ socketAddress,
+ ref pinnedSocketAddress,
+ ref pinnedSocketAddressActive,
+ out rawSocketAddress))
+ {
+ return true;
+ }
+
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ /// Releases an operation-owned pinned socket-address buffer and message-header allocation.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ protected static unsafe void ReleaseIoUringSocketAddressAndMessageHeader(
+ ref MemoryHandle pinnedSocketAddress,
+ ref int pinnedSocketAddressActive,
+ ref IntPtr messageHeader)
+ {
+ if (Interlocked.Exchange(ref pinnedSocketAddressActive, 0) != 0)
+ {
+ pinnedSocketAddress.Dispose();
+ pinnedSocketAddress = default;
+ }
+
+ IntPtr header = Interlocked.Exchange(ref messageHeader, IntPtr.Zero);
+ if (header != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)header);
+ }
+ }
+
+ /// Records a telemetry counter for a non-pinnable buffer fallback.
+ private void RecordIoUringNonPinnablePrepareFallback(string reason, int bufferLength)
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref AssociatedContext._asyncEngine);
+ if (engine is null || !engine.IsIoUringCompletionModeEnabled)
+ {
+ return;
+ }
+
+ engine.RecordIoUringNonPinnablePrepareFallback();
+ long count = SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackCount();
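+ // Sample the log message roughly once per 64 fallbacks to bound EventSource noise.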
+ if (NetEventSource.Log.IsEnabled() && (count & 0x3F) == 1)
+ {
+ LogIoUringNonPinnablePrepareFallback(reason, bufferLength, count);
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ void LogIoUringNonPinnablePrepareFallback(string fallbackReason, int fallbackBufferLength, long fallbackCount)
+ {
+ NetEventSource.Info(
+ AssociatedContext,
+ $"io_uring prepare fallback due to non-pinnable buffer: reason={fallbackReason}, length={fallbackBufferLength}, count={fallbackCount}");
+ }
+ }
+
+ /// Releases the currently pinned buffer handle if active.
+ private void ReleasePinnedIoUringBuffer()
+ {
+ if (Interlocked.Exchange(ref _ioUringPinnedBufferActive, 0) != 0)
+ {
+ _ioUringPinnedBuffer.Dispose();
+ _ioUringPinnedBuffer = default;
+ }
+ }
+
+ /// Releases the pinned buffer when the operation shape (single vs list) changes.
+ protected void ReleaseIoUringPinnedBufferForShapeTransition() =>
+ ReleasePinnedIoUringBuffer();
+
+ /// Releases all preparation resources including the pinned buffer and subclass resources.
+ private void ReleaseIoUringPreparationResources()
+ {
+ ReleasePinnedIoUringBuffer();
+ ReleaseIoUringPreparationResourcesCore();
+ }
+
+ /// Subclass hook to release operation-specific preparation resources.
+ protected virtual void ReleaseIoUringPreparationResourcesCore()
+ {
+ }
+
+ /// Frees a set of GCHandles used for buffer list pinning.
+ protected static void ReleasePinnedHandles(GCHandle[] pinnedHandles, int count)
+ {
+ if (count <= 0)
+ {
+ return;
+ }
+
+ int releaseCount = count < pinnedHandles.Length ? count : pinnedHandles.Length;
+ for (int i = 0; i < releaseCount; i++)
+ {
+ if (pinnedHandles[i].IsAllocated)
+ {
+ pinnedHandles[i].Free();
+ }
+ }
+ }
+
+ /// Rents an array from the shared pool for temporary io_uring preparation use.
+ private static T[] RentIoUringArray<T>(int minimumLength) =>
+ minimumLength == 0 ? Array.Empty<T>() : ArrayPool<T>.Shared.Rent(minimumLength);
+
+ /// Returns a rented array to the shared pool.
+ private static void ReturnIoUringArray<T>(T[] array, bool clearArray = false)
+ {
+ if (array.Length != 0)
+ {
+ ArrayPool<T>.Shared.Return(array, clearArray);
+ }
+ }
+
+ /// Releases pinned handles and returns the iovec array to the pool.
+ protected static void ReleaseIoUringPinnedHandlesAndIovecs(
+ ref GCHandle[]? pinnedHandles,
+ ref Interop.Sys.IOVector[]? iovecs,
+ ref int pinnedHandleCount)
+ {
+ GCHandle[]? handles = Interlocked.Exchange(ref pinnedHandles, null);
+ int handleCount = Interlocked.Exchange(ref pinnedHandleCount, 0);
+ if (handles is not null)
+ {
+ ReleasePinnedHandles(handles, handleCount);
+ ReturnIoUringArray(handles, clearArray: true);
+ }
+
+ Interop.Sys.IOVector[]? vectors = Interlocked.Exchange(ref iovecs, null);
+ if (vectors is not null)
+ {
+ ReturnIoUringArray(vectors, clearArray: true);
+ }
+ }
+
+ /// Pins a list of buffer segments and builds an iovec array for scatter/gather I/O.
+ protected static unsafe bool TryPinBufferListForIoUring(
+ IList<ArraySegment<byte>> buffers,
+ int startIndex,
+ int startOffset,
+ out GCHandle[] pinnedHandles,
+ out Interop.Sys.IOVector[] iovecs,
+ out int iovCount,
+ out int pinnedHandleCount,
+ out SocketError errorCode)
+ {
+ iovCount = 0;
+ pinnedHandleCount = 0;
+ if ((uint)startIndex > (uint)buffers.Count)
+ {
+ errorCode = SocketError.InvalidArgument;
+ pinnedHandles = Array.Empty<GCHandle>();
+ iovecs = Array.Empty<Interop.Sys.IOVector>();
+ return false;
+ }
+
+ int remainingBufferCount = buffers.Count - startIndex;
+ pinnedHandles = RentIoUringArray<GCHandle>(remainingBufferCount);
+ iovecs = RentIoUringArray<Interop.Sys.IOVector>(remainingBufferCount);
+
+ int currentOffset = startOffset;
+ byte[]? lastPinnedArray = null;
+ GCHandle lastPinnedHandle = default;
+ try
+ {
+ for (int i = 0; i < remainingBufferCount; i++, currentOffset = 0)
+ {
+ ArraySegment<byte> buffer = buffers[startIndex + i];
+ RangeValidationHelpers.ValidateSegment(buffer);
+
+ if ((uint)currentOffset > (uint)buffer.Count)
+ {
+ ReleasePinnedHandles(pinnedHandles, pinnedHandleCount);
+ ReturnIoUringArray(pinnedHandles, clearArray: true);
+ ReturnIoUringArray(iovecs, clearArray: true);
+ errorCode = SocketError.InvalidArgument;
+ return false;
+ }
+
+ int bufferCount = buffer.Count - currentOffset;
+ byte* basePtr = null;
+ if (bufferCount != 0)
+ {
+ byte[] array = buffer.Array!;
+ GCHandle handle;
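+ // Adjacent segments often share one backing array; reuse the previous
+ // GCHandle rather than pinning the same array twice.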
+ if (ReferenceEquals(array, lastPinnedArray))
+ {
+ handle = lastPinnedHandle;
+ }
+ else
+ {
+ handle = GCHandle.Alloc(array, GCHandleType.Pinned);
+ pinnedHandles[pinnedHandleCount] = handle;
+ pinnedHandleCount++;
+ lastPinnedArray = array;
+ lastPinnedHandle = handle;
+ }
+
+ basePtr = &((byte*)handle.AddrOfPinnedObject())[buffer.Offset + currentOffset];
+ }
+
+ iovecs[i].Base = basePtr;
+ iovecs[i].Count = (UIntPtr)bufferCount;
+ iovCount++;
+ }
+ }
+ catch
+ {
+ ReleasePinnedHandles(pinnedHandles, pinnedHandleCount);
+ ReturnIoUringArray(pinnedHandles, clearArray: true);
+ ReturnIoUringArray(iovecs, clearArray: true);
+ throw;
+ }
+
+ errorCode = SocketError.Success;
+ return true;
+ }
+
+ /// Prepares an SQE via the managed direct path. Override in subclasses for direct submission.
+ protected virtual IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ return IoUringDirectPrepareResult.Unsupported;
+ }
+
+ /// <summary>
+ /// Routes a CQE using an operation-kind discriminator to avoid virtual completion dispatch
+ /// on this hot path.
+ /// </summary>
+ private bool ProcessIoUringCompletionViaDiscriminator(SocketAsyncContext context, int result, uint auxiliaryData)
+ {
+ IoUringCompletionDispatchKind kind = GetIoUringCompletionDispatchKind();
+ if (result >= 0)
+ {
+ return kind switch
+ {
+ IoUringCompletionDispatchKind.BufferListSendOperation => ((BufferListSendOperation)this).ProcessIoUringCompletionSuccessBufferListSend(result),
+ IoUringCompletionDispatchKind.BufferMemoryReceiveOperation => ((BufferMemoryReceiveOperation)this).ProcessIoUringCompletionSuccessBufferMemoryReceive(result, auxiliaryData),
+ IoUringCompletionDispatchKind.BufferListReceiveOperation => ((BufferListReceiveOperation)this).ProcessIoUringCompletionSuccessBufferListReceive(result, auxiliaryData),
+ IoUringCompletionDispatchKind.ReceiveMessageFromOperation => ((ReceiveMessageFromOperation)this).ProcessIoUringCompletionSuccessReceiveMessageFrom(result, auxiliaryData),
+ IoUringCompletionDispatchKind.AcceptOperation => ((AcceptOperation)this).ProcessIoUringCompletionSuccessAccept(result, auxiliaryData),
+ IoUringCompletionDispatchKind.ConnectOperation => ((ConnectOperation)this).ProcessIoUringCompletionSuccessConnect(context),
+ IoUringCompletionDispatchKind.SendOperation => ((SendOperation)this).ProcessIoUringCompletionSuccessSend(result),
+ _ => ProcessIoUringCompletionSuccessDefault(result)
+ };
+ }
+
+ return kind switch
+ {
+ IoUringCompletionDispatchKind.ReceiveMessageFromOperation => ((ReceiveMessageFromOperation)this).ProcessIoUringCompletionErrorReceiveMessageFrom(result),
+ IoUringCompletionDispatchKind.AcceptOperation => ((AcceptOperation)this).ProcessIoUringCompletionErrorAccept(result),
+ IoUringCompletionDispatchKind.ConnectOperation => ((ConnectOperation)this).ProcessIoUringCompletionErrorConnect(context, result),
+ IoUringCompletionDispatchKind.ReadOperation or
+ IoUringCompletionDispatchKind.BufferMemoryReceiveOperation or
+ IoUringCompletionDispatchKind.BufferListReceiveOperation => ((ReadOperation)this).ProcessIoUringCompletionErrorRead(result),
+ IoUringCompletionDispatchKind.WriteOperation or
+ IoUringCompletionDispatchKind.SendOperation or
+ IoUringCompletionDispatchKind.BufferListSendOperation => ((WriteOperation)this).ProcessIoUringCompletionErrorWrite(result),
+ _ => ProcessIoUringCompletionErrorDefault(result)
+ };
+ }
+
+ /// Processes a successful (non-negative) io_uring completion result.
+ private bool ProcessIoUringCompletionSuccessDefault(int result)
+ {
+ Debug.Assert(result >= 0, $"Expected non-negative io_uring result, got {result}");
+ ErrorCode = SocketError.Success;
+ return true;
+ }
+
+ /// Processes a failed (negative) io_uring completion result.
+ private bool ProcessIoUringCompletionErrorDefault(int result)
+ {
+ Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}");
+ ErrorCode = SocketPal.GetSocketErrorForErrorCode(GetIoUringPalError(result));
+ return true;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private IoUringCompletionDispatchKind GetIoUringCompletionDispatchKind()
+ {
+ int dispatchKind = _ioUringCompletionDispatchKind;
+ return dispatchKind != 0 ?
+ (IoUringCompletionDispatchKind)dispatchKind :
+ IoUringCompletionDispatchKind.Default;
+ }
+
+ /// Whether preparation resources should be preserved when the operation is requeued.
+ internal virtual bool ShouldReuseIoUringPreparationResourcesOnPending => false;
+
+ /// Returns whether the negative result represents EAGAIN/EWOULDBLOCK.
+ protected static bool IsIoUringRetryableError(int result)
+ {
+ if (result >= 0)
+ {
+ return false;
+ }
+
+ Interop.Error error = GetIoUringPalError(result);
+ return error == Interop.Error.EAGAIN || error == Interop.Error.EWOULDBLOCK;
+ }
+
+ /// Converts a negative io_uring result to a SocketError, returning false for retryable errors.
+ protected static bool ProcessIoUringErrorResult(int result, out SocketError errorCode)
+ {
+ Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}");
+
+ if (IsIoUringRetryableError(result))
+ {
+ errorCode = SocketError.Success;
+ return false;
+ }
+
+ errorCode = SocketPal.GetSocketErrorForErrorCode(GetIoUringPalError(result));
+ return true;
+ }
+
+ /// Converts a negative io_uring CQE result (raw -errno) to PAL error space.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ protected static Interop.Error GetIoUringPalError(int result)
+ {
+ Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}");
+ int platformErrno = -result;
+ return Interop.Sys.ConvertErrorPlatformToPal(platformErrno);
+ }
+
+ /// Returns the epoll event mask to use when falling back from io_uring to readiness notification.
+ internal virtual Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
+ Interop.Sys.SocketEvents.None;
+
+ /// <summary>
+ /// Copies payload bytes from a provided-buffer ring selection into the operation's target memory.
+ /// Returns false when this operation shape does not support provided-buffer payload materialization.
+ /// </summary>
+ internal virtual unsafe bool TryProcessIoUringProvidedBufferCompletion(
+ byte* providedBuffer,
+ int providedBufferLength,
+ int bytesTransferred,
+ ref uint auxiliaryData)
+ {
+ _ = providedBuffer;
+ _ = providedBufferLength;
+ _ = bytesTransferred;
+ _ = auxiliaryData;
+ return false;
+ }
+ }
+
+ internal abstract partial class ReadOperation
+ {
+ internal bool ProcessIoUringCompletionErrorRead(int result) =>
+ ProcessIoUringErrorResult(result, out ErrorCode);
+
+ /// <inheritdoc/>
+ // Retained only for defensive fallback paths; regular completion mode avoids readiness fallback.
+ internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
+ Interop.Sys.SocketEvents.Read;
+ }
+
+ private abstract partial class WriteOperation
+ {
+ internal bool ProcessIoUringCompletionErrorWrite(int result) =>
+ ProcessIoUringErrorResult(result, out ErrorCode);
+
+ /// <inheritdoc/>
+ // Retained only for defensive fallback paths; regular completion mode avoids readiness fallback.
+ internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
+ Interop.Sys.SocketEvents.Write;
+ }
+
+ private abstract partial class SendOperation
+ {
+ internal bool ProcessIoUringCompletionSuccessSend(int result)
+ {
+ if (result == 0)
+ {
+ // A zero-byte completion for a non-empty send payload indicates peer close
+ // on stream sockets; report reset instead of a spurious success/0-byte write.
+ if (Count > 0)
+ {
+ ErrorCode = SocketError.ConnectionReset;
+ return true;
+ }
+
+ ErrorCode = SocketError.Success;
+ return true;
+ }
+
+ Debug.Assert(result > 0, $"Expected positive io_uring send completion size, got {result}");
+ Debug.Assert(result <= Count, $"Unexpected io_uring send completion size: result={result}, count={Count}");
+
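+ // Math.Min clamps defensively in release builds, where the asserts above are
+ // compiled out; an oversized result would otherwise drive Count negative.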
+ int sent = Math.Min(result, Count);
+ BytesTransferred += sent;
+ Offset += sent;
+ Count -= sent;
+ ErrorCode = SocketError.Success;
+ return Count == 0;
+ }
+ }
+
+ private partial class BufferMemorySendOperation
+ {
+ private IntPtr _ioUringMessageHeader;
+ private MemoryHandle _ioUringPinnedSocketAddress;
+ private int _ioUringPinnedSocketAddressActive;
+
+ /// <inheritdoc/>
+ internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;
+
+ /// <inheritdoc/>
+ protected override unsafe void ReleaseIoUringPreparationResourcesCore()
+ {
+ ReleaseIoUringSocketAddressAndMessageHeader(
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ ref _ioUringMessageHeader);
+ }
+
+ /// Gets a message header buffer and sets the common sendmsg fields.
+ private unsafe Interop.Sys.MessageHeader* GetOrCreateIoUringSendMessageHeader(byte* rawSocketAddress)
+ {
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ if (messageHeader is null)
+ {
+ messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
+ _ioUringMessageHeader = (IntPtr)messageHeader;
+ }
+
+ messageHeader->SocketAddress = rawSocketAddress;
+ messageHeader->SocketAddressLen = SocketAddress.Length;
+ messageHeader->ControlBuffer = null;
+ messageHeader->ControlBufferLen = 0;
+ messageHeader->Flags = SocketFlags.None;
+ return messageHeader;
+ }
+
+ /// Configures a message header with zero or one iovec entry.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void ConfigureSingleIov(
+ Interop.Sys.MessageHeader* messageHeader,
+ byte* rawBuffer,
+ int bufferLength,
+ Interop.Sys.IOVector* iov)
+ {
+ if (bufferLength == 0)
+ {
+ messageHeader->IOVectors = null;
+ messageHeader->IOVectorCount = 0;
+ return;
+ }
+
+ iov->Base = rawBuffer;
+ iov->Count = (UIntPtr)bufferLength;
+ messageHeader->IOVectors = iov;
+ messageHeader->IOVectorCount = 1;
+ }
+
+ /// Builds a connected send or sendmsg preparation request.
+ private unsafe IoUringDirectPrepareResult IoUringPrepareDirectSendMessage(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ if (!TryPinIoUringSocketAddressForPrepare(
+ SocketAddress,
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (rawBuffer is not null)
+ {
+ rawBuffer += Offset;
+ }
+
+ Interop.Sys.MessageHeader* messageHeader = GetOrCreateIoUringSendMessageHeader(rawSocketAddress);
+ Interop.Sys.IOVector sendIov;
+ ConfigureSingleIov(messageHeader, rawBuffer, Count, &sendIov);
+
+ IoUringDirectPrepareResult sendMessagePrepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
+ context._socket,
+ messageHeader,
+ Count,
+ Flags,
+ out userData,
+ out SocketError sendMessageErrorCode);
+ ErrorCode = sendMessageErrorCode;
+ return sendMessagePrepareResult;
+ }
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ if (SocketAddress.Length == 0)
+ {
+ if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (rawBuffer is not null)
+ {
+ rawBuffer += Offset;
+ }
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectSendWithZeroCopyFallback(
+ context._socket,
+ rawBuffer,
+ Count,
+ Flags,
+ out bool usedZeroCopy,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
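+ // Zero-copy sends hand the pinned buffer to the engine (keyed by userData): with
+ // SEND_ZC the kernel may still reference the buffer until its final notification
+ // CQE, so the pin must outlive this operation's own completion dispatch.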
+ if (usedZeroCopy && prepareResult == IoUringDirectPrepareResult.Prepared)
+ {
+ engine.TransferIoUringZeroCopyPinHold(userData, TransferPinnedBuffer());
+ }
+
+ return prepareResult;
+ }
+
+ return IoUringPrepareDirectSendMessage(context, engine, out userData);
+ }
+ }
+
+ private sealed partial class BufferListSendOperation
+ {
+ private GCHandle[]? _ioUringPinnedBufferHandles;
+ private Interop.Sys.IOVector[]? _ioUringIovecs;
+ private int _ioUringPinnedHandleCount;
+ private int _ioUringPreparedBufferCount = -1;
+ private int _ioUringPreparedStartIndex = -1;
+ private int _ioUringPreparedStartOffset = -1;
+ private int _ioUringPreparedIovCount;
+
+ /// <inheritdoc/>
+ internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;
+
+ /// <inheritdoc/>
+ protected override void ReleaseIoUringPreparationResourcesCore()
+ {
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+ _ioUringPreparedBufferCount = -1;
+ _ioUringPreparedStartIndex = -1;
+ _ioUringPreparedStartOffset = -1;
+ _ioUringPreparedIovCount = 0;
+ }
+
+ /// Pins buffer segments starting at BufferIndex/Offset and builds the iovec array.
+ private bool TryPinIoUringBuffers(
+ IList<ArraySegment<byte>> buffers,
+ int startIndex,
+ int startOffset,
+ out int iovCount)
+ {
+ if (_ioUringPinnedBufferHandles is not null &&
+ _ioUringIovecs is not null &&
+ _ioUringPreparedBufferCount == buffers.Count &&
+ _ioUringPreparedStartIndex == startIndex &&
+ _ioUringPreparedStartOffset == startOffset &&
+ _ioUringPreparedIovCount <= _ioUringIovecs.Length)
+ {
+ iovCount = _ioUringPreparedIovCount;
+ return true;
+ }
+
+ // Release any existing pinned handles and rented arrays before creating new ones.
+ // This handles the partial-send case where BufferIndex/Offset advanced, causing the
+ // reuse check above to fail while old resources are still held.
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+
+ if (!TryPinBufferListForIoUring(
+ buffers,
+ startIndex,
+ startOffset,
+ out GCHandle[] pinnedHandles,
+ out Interop.Sys.IOVector[] iovecs,
+ out iovCount,
+ out int pinnedHandleCount,
+ out SocketError errorCode))
+ {
+ ErrorCode = errorCode;
+ return false;
+ }
+
+ _ioUringPinnedBufferHandles = pinnedHandles;
+ _ioUringIovecs = iovecs;
+ _ioUringPinnedHandleCount = pinnedHandleCount;
+ _ioUringPreparedBufferCount = buffers.Count;
+ _ioUringPreparedStartIndex = startIndex;
+ _ioUringPreparedStartOffset = startOffset;
+ _ioUringPreparedIovCount = iovCount;
+ return true;
+ }
+
+ /// Advances the buffer position after a partial send, returning true when all data is sent.
+ private bool AdvanceSendBufferPosition(int bytesSent)
+ {
+ IList<ArraySegment<byte>>? buffers = Buffers;
+ if (buffers is null || bytesSent <= 0)
+ {
+ return buffers is null || BufferIndex >= buffers.Count;
+ }
+
+ int remaining = bytesSent;
+ int index = BufferIndex;
+ int offset = Offset;
+
+ while (remaining > 0 && index < buffers.Count)
+ {
+ int available = buffers[index].Count - offset;
+ Debug.Assert(available >= 0, "Unexpected negative buffer availability during io_uring send completion.");
+
+ if (available > remaining)
+ {
+ offset += remaining;
+ break;
+ }
+
+ remaining -= Math.Max(available, 0);
+ index++;
+ offset = 0;
+ }
+
+ BufferIndex = index;
+ Offset = offset;
+ return index >= buffers.Count;
+ }
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ if (context.IsPersistentMultishotRecvArmed())
+ {
+ context.RequestPersistentMultishotRecvCancel();
+ }
+
+ IList<ArraySegment<byte>>? buffers = Buffers;
+ if (buffers is null)
+ {
+ ErrorCode = SocketError.Success;
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if ((uint)BufferIndex > (uint)buffers.Count)
+ {
+ ErrorCode = SocketError.Success;
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (!TryPinIoUringBuffers(buffers, BufferIndex, Offset, out int iovCount))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ byte* rawSocketAddress = null;
+ if (SocketAddress.Length != 0 && !TryPinIoUringBuffer(SocketAddress, out rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ Interop.Sys.MessageHeader messageHeader;
+ messageHeader.SocketAddress = rawSocketAddress;
+ messageHeader.SocketAddressLen = SocketAddress.Length;
+ messageHeader.ControlBuffer = null;
+ messageHeader.ControlBufferLen = 0;
+ messageHeader.Flags = SocketFlags.None;
+
+ Interop.Sys.IOVector[] iovecs = _ioUringIovecs!;
+ if (iovCount != 0)
+ {
+ fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0])
+ {
+ messageHeader.IOVectors = iovecsPtr;
+ messageHeader.IOVectorCount = iovCount;
+ // Buffer-list sends can be many small segments (e.g. 4KB chunks). Use
+ // aggregate payload size for zero-copy eligibility, not per-segment size.
+ long totalPayloadBytes = 0;
+ for (int i = 0; i < iovCount; i++)
+ {
+ totalPayloadBytes += (long)(nuint)iovecs[i].Count;
+ if (totalPayloadBytes >= int.MaxValue)
+ {
+ totalPayloadBytes = int.MaxValue;
+ break;
+ }
+ }
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
+ context._socket,
+ &messageHeader,
+ (int)totalPayloadBytes,
+ Flags,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+ }
+
+ messageHeader.IOVectors = null;
+ messageHeader.IOVectorCount = 0;
+ IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
+ context._socket,
+ &messageHeader,
+ payloadLength: 0,
+ Flags,
+ out userData,
+ out SocketError zeroIovErrorCode);
+ ErrorCode = zeroIovErrorCode;
+ return zeroIovPrepareResult;
+ }
+
+ internal bool ProcessIoUringCompletionSuccessBufferListSend(int result)
+ {
+ if (result == 0)
+ {
+ // Buffer-list sends can represent empty payloads; only treat result=0 as
+ // reset when there are still bytes pending across remaining segments.
+ if (HasPendingBufferListSendBytes())
+ {
+ ErrorCode = SocketError.ConnectionReset;
+ return true;
+ }
+
+ ErrorCode = SocketError.Success;
+ return true;
+ }
+
+ Debug.Assert(result > 0, $"Expected positive io_uring send completion size, got {result}");
+ BytesTransferred += result;
+ bool complete = AdvanceSendBufferPosition(result);
+ ErrorCode = SocketError.Success;
+ return complete;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool HasPendingBufferListSendBytes()
+ {
+ IList<ArraySegment<byte>>? buffers = Buffers;
+ if (buffers is null || BufferIndex >= buffers.Count)
+ {
+ return false;
+ }
+
+ int index = BufferIndex;
+ int offset = Offset;
+ while (index < buffers.Count)
+ {
+ int available = buffers[index].Count - offset;
+ if (available > 0)
+ {
+ return true;
+ }
+
+ index++;
+ offset = 0;
+ }
+
+ return false;
+ }
+ }
+
+ private sealed partial class BufferMemoryReceiveOperation
+ {
+ private IntPtr _ioUringMessageHeader;
+ private MemoryHandle _ioUringPinnedSocketAddress;
+ private int _ioUringPinnedSocketAddressActive;
+
+ /// <inheritdoc/>
+ internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;
+
+ /// <inheritdoc/>
+ protected override unsafe void ReleaseIoUringPreparationResourcesCore()
+ {
+ ReleaseIoUringSocketAddressAndMessageHeader(
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ ref _ioUringMessageHeader);
+ }
+
+ /// Gets a message header buffer and sets the common recvmsg fields.
+ private unsafe Interop.Sys.MessageHeader* GetOrCreateIoUringReceiveMessageHeader(byte* rawSocketAddress)
+ {
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ if (messageHeader is null)
+ {
+ messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
+ _ioUringMessageHeader = (IntPtr)messageHeader;
+ }
+
+ InitializeReceiveMessageHeader(messageHeader, rawSocketAddress);
+ return messageHeader;
+ }
+
+ /// Initializes recvmsg header fields shared by direct preparation variants.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe void InitializeReceiveMessageHeader(Interop.Sys.MessageHeader* messageHeader, byte* rawSocketAddress)
+ {
+ messageHeader->SocketAddress = rawSocketAddress;
+ messageHeader->SocketAddressLen = SocketAddress.Length;
+ messageHeader->ControlBuffer = null;
+ messageHeader->ControlBufferLen = 0;
+ messageHeader->Flags = SocketFlags.None;
+ }
+
+ /// Configures a message header with a single iovec entry.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void ConfigureSingleIov(
+ Interop.Sys.MessageHeader* messageHeader,
+ byte* rawBuffer,
+ int bufferLength,
+ Interop.Sys.IOVector* iov)
+ {
+ // Keep a single iovec even for zero-length receives so recvmsg preserves
+ // completion-mode readiness probe behavior for zero-byte operations.
+ iov->Base = rawBuffer;
+ iov->Count = (UIntPtr)bufferLength;
+ messageHeader->IOVectors = iov;
+ messageHeader->IOVectorCount = 1;
+ }
+
+ /// <summary>Builds a connected or receive-from recvmsg operation.</summary>
+ private unsafe IoUringDirectPrepareResult IoUringPrepareDirectReceiveMessage(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (!TryPinIoUringSocketAddressForPrepare(
+ SocketAddress,
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ Interop.Sys.MessageHeader* messageHeader = GetOrCreateIoUringReceiveMessageHeader(rawSocketAddress);
+ Interop.Sys.IOVector receiveIov;
+ ConfigureSingleIov(messageHeader, rawBuffer, Buffer.Length, &receiveIov);
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+
+ /// <summary>
+ /// Returns whether this operation shape is eligible for multishot recv submission.
+ /// Eligible: connected TCP receive (no socket address, no recvmsg flags) with a non-empty buffer.
+ /// Ineligible: zero-byte probes and recvmsg-based receive paths (SetReceivedFlags/socket address).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsEligibleForIoUringMultishotRecv()
+ {
+ if (SetReceivedFlags || SocketAddress.Length != 0)
+ {
+ return false;
+ }
+
+ // Multishot recv uses IORING_OP_RECV (no msg_flags). Message-oriented sockets
+ // rely on MSG_TRUNC to report truncation, which is not observable in this path.
+ if (SocketPal.GetSockOpt(
+ AssociatedContext._socket,
+ SocketOptionLevel.Socket,
+ SocketOptionName.Type,
+ out int socketTypeValue) != SocketError.Success)
+ {
+ // If the socket type cannot be determined, stay correct by disabling multishot recv.
+ return false;
+ }
+
+ SocketType socketType = (SocketType)socketTypeValue;
+ if (socketType == SocketType.Dgram ||
+ socketType == SocketType.Raw ||
+ socketType == SocketType.Seqpacket)
+ {
+ return false;
+ }
+
+ return Buffer.Length != 0;
+ }
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ if (SetReceivedFlags || SocketAddress.Length != 0)
+ {
+ if (context.IsPersistentMultishotRecvArmed())
+ {
+ context.RequestPersistentMultishotRecvCancel();
+ }
+
+ SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.OneShot);
+ IoUringDirectPrepareResult receiveMessagePrepareResult =
+ IoUringPrepareDirectReceiveMessage(context, engine, out userData);
+ if (receiveMessagePrepareResult != IoUringDirectPrepareResult.Prepared || ErrorCode != SocketError.Success)
+ {
+ SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None);
+ }
+
+ return receiveMessagePrepareResult;
+ }
+
+ bool allowMultishotRecv = IsEligibleForIoUringMultishotRecv() && engine.SupportsMultishotRecv;
+ if (!allowMultishotRecv && context.IsPersistentMultishotRecvArmed())
+ {
+ context.RequestPersistentMultishotRecvCancel();
+ }
+
+ SetIoUringReceiveSubmissionMode(
+ allowMultishotRecv ? IoUringReceiveSubmissionMode.Multishot : IoUringReceiveSubmissionMode.OneShot);
+
+ // Persistent multishot receive: if one is already armed, attach this operation to
+ // that existing user_data instead of submitting a new recv SQE.
+ if (allowMultishotRecv && context.IsPersistentMultishotRecvArmed())
+ {
+ ulong armedUserData = context.PersistentMultishotRecvUserData;
+ if (armedUserData != 0 &&
+ engine.TryReplaceIoUringTrackedOperation(armedUserData, this))
+ {
+ SocketsTelemetry.Log.IoUringPersistentMultishotRecvReuse();
+ userData = armedUserData;
+ ErrorCode = SocketError.Success;
+ return IoUringDirectPrepareResult.Prepared;
+ }
+
+ // Stale armed-state; clear and submit a fresh SQE below.
+ context.ClearPersistentMultishotRecvArmed();
+ }
+
+ if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer))
+ {
+ ErrorCode = SocketError.Success;
+ SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None);
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectRecv(
+ context._socket,
+ rawBuffer,
+ Buffer.Length,
+ Flags,
+ allowMultishotRecv,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ if (allowMultishotRecv &&
+ prepareResult == IoUringDirectPrepareResult.Prepared &&
+ errorCode == SocketError.Success)
+ {
+ context.SetPersistentMultishotRecvArmed(userData);
+ }
+
+ if (prepareResult != IoUringDirectPrepareResult.Prepared || errorCode != SocketError.Success)
+ {
+ SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None);
+ }
+
+ return prepareResult;
+ }
+
+ internal bool ProcessIoUringCompletionSuccessBufferMemoryReceive(int result, uint auxiliaryData)
+ {
+ BytesTransferred = result;
+ ReceivedFlags = SetReceivedFlags ? (SocketFlags)(int)auxiliaryData : SocketFlags.None;
+
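+ // Clamp the kernel-reported address length to [0, SocketAddress.Length] before
+ // slicing so a malformed CQE value can never over-read the address buffer.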
+ if (SocketAddress.Length != 0)
+ {
+ int socketAddressLen = IoUringCompletionSocketAddressLen;
+ if (socketAddressLen < 0)
+ {
+ socketAddressLen = 0;
+ }
+
+ if ((uint)socketAddressLen > (uint)SocketAddress.Length)
+ {
+ socketAddressLen = SocketAddress.Length;
+ }
+
+ SocketAddress = SocketAddress.Slice(0, socketAddressLen);
+ }
+ ErrorCode = SocketError.Success;
+ return true;
+ }
+
+ /// <inheritdoc/>
+ internal override unsafe bool TryProcessIoUringProvidedBufferCompletion(
+ byte* providedBuffer,
+ int providedBufferLength,
+ int bytesTransferred,
+ ref uint auxiliaryData)
+ {
+ _ = auxiliaryData;
+
+ if (bytesTransferred <= 0)
+ {
+ return true;
+ }
+
+ if (SetReceivedFlags || SocketAddress.Length != 0)
+ {
+ return false;
+ }
+
+ if ((uint)bytesTransferred > (uint)providedBufferLength ||
+ (uint)bytesTransferred > (uint)Buffer.Length)
+ {
+ return false;
+ }
+
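+ // The ring owns and recycles provided buffers, so materialize the payload into
+ // the caller's buffer before this completion handler returns.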
+ new ReadOnlySpan<byte>(providedBuffer, bytesTransferred).CopyTo(Buffer.Span);
+ return true;
+ }
+ }
+
+ private sealed partial class BufferListReceiveOperation
+ {
+ private GCHandle[]? _ioUringPinnedBufferHandles;
+ private Interop.Sys.IOVector[]? _ioUringIovecs;
+ private int _ioUringPinnedHandleCount;
+ private IntPtr _ioUringMessageHeader;
+ private int _ioUringPreparedIovCount;
+ private int _ioUringPreparedBufferCount = -1;
+
+ /// <inheritdoc/>
+ internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;
+
+ /// <inheritdoc/>
+ protected override unsafe void ReleaseIoUringPreparationResourcesCore()
+ {
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+ _ioUringPreparedIovCount = 0;
+ _ioUringPreparedBufferCount = -1;
+
+ IntPtr messageHeader = Interlocked.Exchange(ref _ioUringMessageHeader, IntPtr.Zero);
+ if (messageHeader != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)messageHeader);
+ }
+ }
+
+ /// <summary>Pins all buffer segments and builds the iovec array.</summary>
+ private bool TryPinIoUringBuffers(IList<ArraySegment<byte>> buffers, out int iovCount)
+ {
+ if (_ioUringPinnedBufferHandles is not null &&
+ _ioUringIovecs is not null &&
+ _ioUringPreparedIovCount != 0 &&
+ _ioUringPreparedIovCount <= _ioUringIovecs.Length &&
+ _ioUringPreparedBufferCount == buffers.Count)
+ {
+ iovCount = _ioUringPreparedIovCount;
+ return true;
+ }
+
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+
+ if (!TryPinBufferListForIoUring(
+ buffers,
+ startIndex: 0,
+ startOffset: 0,
+ out GCHandle[] pinnedHandles,
+ out Interop.Sys.IOVector[] iovecs,
+ out iovCount,
+ out int pinnedHandleCount,
+ out SocketError errorCode))
+ {
+ ErrorCode = errorCode;
+ return false;
+ }
+
+ _ioUringPinnedBufferHandles = pinnedHandles;
+ _ioUringIovecs = iovecs;
+ _ioUringPinnedHandleCount = pinnedHandleCount;
+ _ioUringPreparedIovCount = iovCount;
+ _ioUringPreparedBufferCount = buffers.Count;
+ return true;
+ }
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ IList<ArraySegment<byte>>? buffers = Buffers;
+ if (buffers is null)
+ {
+ ErrorCode = SocketError.Success;
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (!TryPinIoUringBuffers(buffers, out int iovCount))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ byte* rawSocketAddress = null;
+ if (SocketAddress.Length != 0 && !TryPinIoUringBuffer(SocketAddress, out rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ if (messageHeader is null)
+ {
+ messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
+ _ioUringMessageHeader = (IntPtr)messageHeader;
+ }
+
+ messageHeader->SocketAddress = rawSocketAddress;
+ messageHeader->SocketAddressLen = SocketAddress.Length;
+ messageHeader->ControlBuffer = null;
+ messageHeader->ControlBufferLen = 0;
+ messageHeader->Flags = SocketFlags.None;
+
+ Interop.Sys.IOVector[] iovecs = _ioUringIovecs!;
+ if (iovCount != 0)
+ {
+ fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0])
+ {
+ messageHeader->IOVectors = iovecsPtr;
+ messageHeader->IOVectorCount = iovCount;
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+ }
+
+ messageHeader->IOVectors = null;
+ messageHeader->IOVectorCount = 0;
+ IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError zeroIovErrorCode);
+ ErrorCode = zeroIovErrorCode;
+ return zeroIovPrepareResult;
+ }
+
+ internal unsafe bool ProcessIoUringCompletionSuccessBufferListReceive(int result, uint auxiliaryData)
+ {
+ BytesTransferred = result;
+ ReceivedFlags = (SocketFlags)(int)auxiliaryData;
+ ErrorCode = SocketError.Success;
+
+ if (_ioUringMessageHeader != IntPtr.Zero && SocketAddress.Length != 0)
+ {
+ int socketAddressLen = IoUringCompletionSocketAddressLen;
+ if (socketAddressLen < 0)
+ {
+ socketAddressLen = 0;
+ }
+
+ if ((uint)socketAddressLen > (uint)SocketAddress.Length)
+ {
+ socketAddressLen = SocketAddress.Length;
+ }
+
+ SocketAddress = SocketAddress.Slice(0, socketAddressLen);
+ }
+
+ return true;
+ }
+ }
+
+ private sealed partial class ReceiveMessageFromOperation
+ {
+ private GCHandle[]? _ioUringPinnedBufferHandles;
+ private Interop.Sys.IOVector[]? _ioUringIovecs;
+ private int _ioUringPinnedHandleCount;
+ private int _ioUringPreparedIovCount;
+ private int _ioUringPreparedBufferListCount = -1;
+ private IntPtr _ioUringMessageHeader;
+ private IntPtr _ioUringControlBuffer;
+ private int _ioUringControlBufferLength;
+ private MemoryHandle _ioUringPinnedSocketAddress;
+ private int _ioUringPinnedSocketAddressActive;
+
+ /// <inheritdoc/>
+ internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true;
+
+ /// <inheritdoc/>
+ protected override unsafe void ReleaseIoUringPreparationResourcesCore()
+ {
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+ _ioUringPreparedIovCount = 0;
+ _ioUringPreparedBufferListCount = -1;
+
+ IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero);
+ if (controlBuffer != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)controlBuffer);
+ }
+ _ioUringControlBufferLength = 0;
+
+ ReleaseIoUringSocketAddressAndMessageHeader(
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ ref _ioUringMessageHeader);
+ }
+
+ /// <summary>Pins buffer segments and builds the iovec array for recvmsg.</summary>
+ private bool TryPinIoUringBuffers(IList<ArraySegment<byte>> buffers, out int iovCount)
+ {
+ if (_ioUringPinnedBufferHandles is not null &&
+ _ioUringIovecs is not null &&
+ _ioUringPreparedIovCount <= _ioUringIovecs.Length &&
+ _ioUringPreparedBufferListCount == buffers.Count)
+ {
+ iovCount = _ioUringPreparedIovCount;
+ return true;
+ }
+
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+
+ if (!TryPinBufferListForIoUring(
+ buffers,
+ startIndex: 0,
+ startOffset: 0,
+ out GCHandle[] pinnedHandles,
+ out Interop.Sys.IOVector[] iovecs,
+ out iovCount,
+ out int pinnedHandleCount,
+ out SocketError errorCode))
+ {
+ ErrorCode = errorCode;
+ return false;
+ }
+
+ _ioUringPinnedBufferHandles = pinnedHandles;
+ _ioUringIovecs = iovecs;
+ _ioUringPinnedHandleCount = pinnedHandleCount;
+ _ioUringPreparedIovCount = iovCount;
+ _ioUringPreparedBufferListCount = buffers.Count;
+ return true;
+ }
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ if (context.IsPersistentMultishotRecvArmed())
+ {
+ context.RequestPersistentMultishotRecvCancel();
+ }
+
+ IList<ArraySegment<byte>>? buffers = Buffers;
+ byte* rawBuffer = null;
+ int iovCount;
+ if (buffers is not null)
+ {
+ ReleaseIoUringPinnedBufferForShapeTransition();
+ if (!TryPinIoUringBuffers(buffers, out iovCount))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+ }
+ else
+ {
+ if (!TryPinIoUringBuffer(Buffer, out rawBuffer))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (_ioUringPinnedBufferHandles is not null || _ioUringIovecs is not null)
+ {
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+ _ioUringPreparedIovCount = 0;
+ _ioUringPreparedBufferListCount = -1;
+ }
+
+ iovCount = 1;
+ }
+
+ if (!TryPinIoUringSocketAddressForPrepare(
+ SocketAddress,
+ ref _ioUringPinnedSocketAddress,
+ ref _ioUringPinnedSocketAddressActive,
+ out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ if (messageHeader is null)
+ {
+ messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader));
+ _ioUringMessageHeader = (IntPtr)messageHeader;
+ }
+
+ messageHeader->SocketAddress = rawSocketAddress;
+ messageHeader->SocketAddressLen = SocketAddress.Length;
+ messageHeader->Flags = SocketFlags.None;
+
+ int controlBufferLen = Interop.Sys.GetControlMessageBufferSize(Convert.ToInt32(IsIPv4), Convert.ToInt32(IsIPv6));
+ if (controlBufferLen < 0)
+ {
+ ErrorCode = SocketError.Success;
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
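+ // Reuse the cached native control buffer when its length still matches; otherwise
+ // free and reallocate so the cmsg space fits the current IPv4/IPv6 shape.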
+ if (controlBufferLen != 0)
+ {
+ if (_ioUringControlBuffer == IntPtr.Zero || _ioUringControlBufferLength != controlBufferLen)
+ {
+ IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero);
+ if (controlBuffer != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)controlBuffer);
+ }
+
+ void* rawControlBuffer = NativeMemory.Alloc((nuint)controlBufferLen);
+ _ioUringControlBuffer = (IntPtr)rawControlBuffer;
+ _ioUringControlBufferLength = controlBufferLen;
+ }
+
+ messageHeader->ControlBuffer = (byte*)_ioUringControlBuffer;
+ messageHeader->ControlBufferLen = controlBufferLen;
+ }
+ else
+ {
+ IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero);
+ if (controlBuffer != IntPtr.Zero)
+ {
+ NativeMemory.Free((void*)controlBuffer);
+ }
+
+ _ioUringControlBufferLength = 0;
+ messageHeader->ControlBuffer = null;
+ messageHeader->ControlBufferLen = 0;
+ }
+
+ if (buffers is not null)
+ {
+ Interop.Sys.IOVector[] iovecs = _ioUringIovecs!;
+ if (iovCount != 0)
+ {
+ fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0])
+ {
+ messageHeader->IOVectors = iovecsPtr;
+ messageHeader->IOVectorCount = iovCount;
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+ }
+
+ messageHeader->IOVectors = null;
+ messageHeader->IOVectorCount = 0;
+ IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError zeroIovErrorCode);
+ ErrorCode = zeroIovErrorCode;
+ return zeroIovPrepareResult;
+ }
+
+ Interop.Sys.IOVector iov;
+ iov.Base = rawBuffer;
+ iov.Count = (UIntPtr)Buffer.Length;
+ messageHeader->IOVectors = &iov;
+ messageHeader->IOVectorCount = 1;
+ IoUringDirectPrepareResult singleBufferPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage(
+ context._socket,
+ messageHeader,
+ Flags,
+ out userData,
+ out SocketError singleBufferErrorCode);
+ ErrorCode = singleBufferErrorCode;
+ return singleBufferPrepareResult;
+ }
+
+ internal unsafe bool ProcessIoUringCompletionSuccessReceiveMessageFrom(int result, uint auxiliaryData)
+ {
+ BytesTransferred = result;
+ ReceivedFlags = (SocketFlags)(int)auxiliaryData;
+ ErrorCode = SocketError.Success;
+ IPPacketInformation = default;
+
+ if (_ioUringMessageHeader != IntPtr.Zero)
+ {
+ Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader;
+ int socketAddressCapacity = SocketAddress.Length;
+ int socketAddressLen = IoUringCompletionSocketAddressLen;
+ if (socketAddressLen < 0)
+ {
+ socketAddressLen = 0;
+ }
+
+ if ((uint)socketAddressLen > (uint)socketAddressCapacity)
+ {
+ socketAddressLen = socketAddressCapacity;
+ }
+
+ if (socketAddressLen == 0 && socketAddressCapacity != 0)
+ {
+ socketAddressLen = socketAddressCapacity;
+ SocketAddress.Span.Clear();
+ }
+
+ int controlBufferCapacity = messageHeader->ControlBufferLen;
+ int controlBufferLen = IoUringCompletionControlBufferLen;
+ if (controlBufferLen < 0)
+ {
+ controlBufferLen = 0;
+ }
+
+ if ((uint)controlBufferLen > (uint)controlBufferCapacity)
+ {
+ controlBufferLen = controlBufferCapacity;
+ }
+
+ messageHeader->SocketAddressLen = socketAddressLen;
+ messageHeader->ControlBufferLen = controlBufferLen;
+ messageHeader->Flags = ReceivedFlags;
+
+ SocketAddress = SocketAddress.Slice(0, socketAddressLen);
+
+ IPPacketInformation = SocketPal.GetIoUringIPPacketInformation(messageHeader, IsIPv4, IsIPv6);
+ }
+
+ return true;
+ }
+
+ internal bool ProcessIoUringCompletionErrorReceiveMessageFrom(int result)
+ {
+ if (!ProcessIoUringErrorResult(result, out ErrorCode))
+ {
+ return false;
+ }
+
+ IPPacketInformation = default;
+ return true;
+ }
+ }
+
+ internal sealed partial class AcceptOperation
+ {
+ /// <inheritdoc/>
+ internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
+ Interop.Sys.SocketEvents.Read;
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ AcceptSocketAddressLength = SocketAddress.Length;
+ if (!TryPinIoUringBuffer(SocketAddress, out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
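+ // Arm multishot accept at most once per context: the CAS from Disarmed to Arming
+ // elects a single operation to publish the armed user_data below.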
+ if (engine.SupportsMultishotAccept &&
+ Interlocked.CompareExchange(
+ ref context._multishotAcceptState,
+ MultishotAcceptStateArming,
+ MultishotAcceptStateDisarmed) == MultishotAcceptStateDisarmed)
+ {
+ context.EnsureMultishotAcceptQueueInitialized();
+ IoUringDirectPrepareResult multishotPrepareResult = engine.TryPrepareIoUringDirectMultishotAccept(
+ context._socket,
+ rawSocketAddress,
+ SocketAddress.Length,
+ out userData,
+ out SocketError multishotErrorCode);
+ if (multishotPrepareResult == IoUringDirectPrepareResult.Prepared)
+ {
+ Debug.Assert(
+ (byte)(userData >> IoUringUserDataTagShift) == IoUringReservedCompletionTag,
+ "Multishot accept user_data must be a reserved-completion token.");
+ Volatile.Write(ref context._multishotAcceptState, unchecked((long)userData));
+ ErrorCode = multishotErrorCode;
+ return multishotPrepareResult;
+ }
+
+ context.DisarmMultishotAccept();
+ }
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectAccept(
+ context._socket,
+ rawSocketAddress,
+ SocketAddress.Length,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+
+ internal bool ProcessIoUringCompletionSuccessAccept(int result, uint auxiliaryData)
+ {
+ AcceptedFileDescriptor = (IntPtr)result;
+ ErrorCode = SocketError.Success;
+ // Keep parity with readiness path: always honor reported address length, including 0.
+ AcceptSocketAddressLength = auxiliaryData > (uint)SocketAddress.Length ? SocketAddress.Length : (int)auxiliaryData;
+ SocketAddress = SocketAddress.Slice(0, AcceptSocketAddressLength);
+ return true;
+ }
+
+ internal bool ProcessIoUringCompletionErrorAccept(int result)
+ {
+ AcceptedFileDescriptor = (IntPtr)(-1);
+ return ProcessIoUringCompletionErrorRead(result);
+ }
+ }
+
+ private sealed partial class ConnectOperation
+ {
+ /// <inheritdoc/>
+ internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() =>
+ Interop.Sys.SocketEvents.Write;
+
+ /// <inheritdoc/>
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ if (!TryPinIoUringBuffer(SocketAddress, out byte* rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectConnect(
+ context._socket,
+ rawSocketAddress,
+ SocketAddress.Length,
+ out userData,
+ out SocketError errorCode);
+ ErrorCode = errorCode;
+ return prepareResult;
+ }
+
+ internal bool ProcessIoUringCompletionErrorConnect(SocketAsyncContext context, int result)
+ {
+ Interop.Error error = GetIoUringPalError(result);
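+ // EINPROGRESS is not a terminal connect result: report "not completed" so the
+ // operation stays pending until a later completion delivers the final status.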
+ if (error == Interop.Error.EINPROGRESS)
+ {
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ if (!ProcessIoUringCompletionErrorWrite(result))
+ {
+ return false;
+ }
+
+ context._socket.RegisterConnectResult(ErrorCode);
+ return true;
+ }
+
+ internal bool ProcessIoUringCompletionSuccessConnect(SocketAsyncContext context)
+ {
+ ErrorCode = SocketError.Success;
+ context._socket.RegisterConnectResult(ErrorCode);
+
+ if (Buffer.Length > 0)
+ {
+ Action<int, Memory<byte>, SocketFlags, SocketError>? callback = Callback;
+ Debug.Assert(callback is not null);
+ SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, default, ref BytesTransferred, callback!, default);
+ if (error == SocketError.IOPending)
+ {
+ // Callback ownership moved to the async send operation.
+ Callback = null;
+ Buffer = default;
+ }
+ else
+ {
+ if (error != SocketError.Success)
+ {
+ ErrorCode = error;
+ context._socket.RegisterConnectResult(ErrorCode);
+ }
+
+ // Follow-up send completed synchronously (success or error), so invoke the
+ // Connect callback from this operation path.
+ Buffer = default;
+ }
+ }
+
+ return true;
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs
index 4e2e117984084c..37de5ad03d346d 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
+using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
@@ -43,10 +44,10 @@ internal sealed partial class SocketAsyncContext
private BufferListReceiveOperation? _cachedBufferListReceiveOperation;
private BufferMemorySendOperation? _cachedBufferMemorySendOperation;
private BufferListSendOperation? _cachedBufferListSendOperation;
-
private void ReturnOperation(AcceptOperation operation)
{
operation.Reset();
+ operation.AcceptSocketAddressLength = 0;
operation.Callback = null;
operation.SocketAddress = default;
Volatile.Write(ref _cachedAcceptOperation, operation); // benign race condition
@@ -83,6 +84,7 @@ private void ReturnOperation(BufferListSendOperation operation)
{
operation.Reset();
operation.Buffers = null;
+ operation.SetBufferPosition(bufferIndex: 0, offset: 0);
operation.Callback = null;
operation.SocketAddress = default;
Volatile.Write(ref _cachedBufferListSendOperation, operation); // benign race condition
@@ -108,8 +110,20 @@ private BufferListSendOperation RentBufferListSendOperation() =>
Interlocked.Exchange(ref _cachedBufferListSendOperation, null) ??
new BufferListSendOperation(this);
- private abstract class AsyncOperation : IThreadPoolWorkItem
+ // Partial method hooks for io_uring completion-mode staging (Linux-only).
+ // No-op on non-Linux; implemented in SocketAsyncContext.IoUring.Linux.cs.
+ static partial void LinuxTryStageIoUringOperation(AsyncOperation operation);
+ partial void LinuxTryDequeuePreAcceptedConnection(AcceptOperation operation, ref bool dequeued);
+ partial void LinuxTryConsumeBufferedPersistentMultishotRecvData(Memory<byte> destination, ref bool consumed, ref int bytesTransferred);
+ partial void LinuxOnStopAndAbort();
+
+ internal abstract partial class AsyncOperation : IThreadPoolWorkItem
{
+ private const int CancellationCallbackBatchSize = 64;
+ private static readonly ConcurrentQueue<AsyncOperation> s_cancellationCallbackQueue = new ConcurrentQueue<AsyncOperation>();
+ private static readonly IThreadPoolWorkItem s_processCancellationCallbacks = new CancellationCallbackWorker();
+ private static int s_cancellationCallbackWorkerQueued;
+
private enum State
{
Waiting = 0,
@@ -120,6 +134,10 @@ private enum State
}
private volatile AsyncOperation.State _state;
+ private int _ioUringCompletionCallbackQueued;
+ // Defined in the shared Unix partial so operation constructors can compile
+ // for both Linux and non-Linux Unix TFMs; only Linux consumes the value.
+ private int _ioUringCompletionDispatchKind;
#if DEBUG
private bool _callbackQueued; // When true, the callback has been queued.
@@ -133,6 +151,24 @@ private enum State
public ManualResetEventSlim? Event { get; set; }
+ protected enum IoUringCompletionDispatchKind : byte
+ {
+ Default = 0,
+ ReadOperation = 1,
+ WriteOperation = 2,
+ SendOperation = 3,
+ BufferListSendOperation = 4,
+ BufferMemoryReceiveOperation = 5,
+ BufferListReceiveOperation = 6,
+ ReceiveMessageFromOperation = 7,
+ AcceptOperation = 8,
+ ConnectOperation = 9
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ protected void SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind kind) =>
+ _ioUringCompletionDispatchKind = (int)kind;
+
public AsyncOperation(SocketAsyncContext context)
{
AssociatedContext = context;
@@ -141,7 +177,9 @@ public AsyncOperation(SocketAsyncContext context)
public void Reset()
{
+ ResetIoUringState();
_state = State.Waiting;
+ _ioUringCompletionCallbackQueued = 0;
Event = null;
Next = this;
#if DEBUG
@@ -202,6 +240,16 @@ public OperationResult TryComplete(SocketAsyncContext context)
}
public bool TryCancel()
+ {
+ return TryCancelCore(requestIoUringCancellation: true);
+ }
+
+ internal bool TryCancelForTeardown()
+ {
+ return TryCancelCore(requestIoUringCancellation: false);
+ }
+
+ private bool TryCancelCore(bool requestIoUringCancellation)
{
Trace("Enter");
@@ -232,6 +280,9 @@ public bool TryCancel()
return false;
}
+ // Best effort: if completion-mode io_uring work was already submitted, request kernel-side cancellation now.
+ // Partial method: no-op on non-Linux; implemented in SocketAsyncContext.IoUring.Linux.cs.
+ LinuxRequestIoUringCancellationIfNeeded(requestIoUringCancellation);
ProcessCancellation();
// Note, we leave the operation in the OperationQueue.
@@ -245,6 +296,7 @@ public void ProcessCancellation()
Debug.Assert(_state == State.Canceled);
+ LinuxUntrackIoUringOperation();
ErrorCode = SocketError.OperationAborted;
ManualResetEventSlim? e = Event;
@@ -261,10 +313,53 @@ public void ProcessCancellation()
// we can't pool the object, as ProcessQueue may still have a reference to it, due to
// using a pattern whereby it takes the lock to grab an item, but then releases the lock
// to do further processing on the item that's still in the list.
- ThreadPool.UnsafeQueueUserWorkItem(o => ((AsyncOperation)o!).InvokeCallback(allowPooling: false), this);
+ QueueCancellationCallback(this);
+ }
+ }
+
+ private static void QueueCancellationCallback(AsyncOperation operation)
+ {
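+ // Coalesce cancellation callbacks into a single worker: enqueue first, then claim
+ // the worker flag with a CAS so at most one drain loop is scheduled at a time.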
+ s_cancellationCallbackQueue.Enqueue(operation);
+ if (Interlocked.CompareExchange(ref s_cancellationCallbackWorkerQueued, 1, 0) == 0)
+ {
+ ThreadPool.UnsafeQueueUserWorkItem(s_processCancellationCallbacks, preferLocal: false);
}
}
+ private static void ProcessQueuedCancellationCallbacks()
+ {
+ while (true)
+ {
+ int processed = 0;
+ while (processed < CancellationCallbackBatchSize &&
+ s_cancellationCallbackQueue.TryDequeue(out AsyncOperation? operation))
+ {
+ operation.InvokeCallback(allowPooling: false);
+ processed++;
+ }
+
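+ // Drained for now: release the worker flag, then re-check the queue. If an enqueue
+ // raced in, try to re-claim the flag; losing that race means another worker owns it.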
+ if (s_cancellationCallbackQueue.IsEmpty)
+ {
+ Volatile.Write(ref s_cancellationCallbackWorkerQueued, 0);
+ if (s_cancellationCallbackQueue.IsEmpty ||
+ Interlocked.CompareExchange(ref s_cancellationCallbackWorkerQueued, 1, 0) != 0)
+ {
+ return;
+ }
+
+ continue;
+ }
+
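+ // Work remains after a full batch: hand off to a fresh work item so one drain
+ // loop cannot monopolize a thread pool thread.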
+ ThreadPool.UnsafeQueueUserWorkItem(s_processCancellationCallbacks, preferLocal: false);
+ return;
+ }
+ }
+
+ private sealed class CancellationCallbackWorker : IThreadPoolWorkItem
+ {
+ void IThreadPoolWorkItem.Execute() => ProcessQueuedCancellationCallbacks();
+ }
+
public void Dispatch()
{
ManualResetEventSlim? e = Event;
@@ -288,6 +383,30 @@ public void Schedule()
ThreadPool.UnsafeQueueUserWorkItem(this, preferLocal: false);
}
+ internal void QueueIoUringCompletionCallback()
+ {
+ Debug.Assert(Event == null);
+ if (Interlocked.Exchange(ref _ioUringCompletionCallbackQueued, 1) != 0)
+ {
+ Debug.Fail("io_uring completion callback was already queued for this operation.");
+ return;
+ }
+
+ ThreadPool.UnsafeQueueUserWorkItem(this, preferLocal: false);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool TryExecuteIoUringCompletionCallback()
+ {
+ if (Interlocked.Exchange(ref _ioUringCompletionCallbackQueued, 0) == 0)
+ {
+ return false;
+ }
+
+ InvokeCallback(allowPooling: true);
+ return true;
+ }
+
public void Process() => ((IThreadPoolWorkItem)this).Execute();
void IThreadPoolWorkItem.Execute()
@@ -305,17 +424,27 @@ void IThreadPoolWorkItem.Execute()
// We could also add an abstract method that the base interface implementation
// invokes, but that adds an extra virtual dispatch.
Debug.Fail("Expected derived type to implement IThreadPoolWorkItem");
- throw new InvalidOperationException();
+ ThrowExpectedDerivedTypeToImplementThreadPoolWorkItem();
}
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowExpectedDerivedTypeToImplementThreadPoolWorkItem() =>
+ throw new InvalidOperationException();
+
// Called when op is not in the queue yet, so can't be otherwise executing
public void DoAbort()
{
+ LinuxUntrackIoUringOperation();
ErrorCode = SocketError.OperationAborted;
}
protected abstract bool DoTryComplete(SocketAsyncContext context);
+ partial void ResetIoUringState();
+ partial void LinuxRequestIoUringCancellationIfNeeded(bool requestIoUringCancellation);
+ partial void LinuxUntrackIoUringOperation();
+
public abstract void InvokeCallback(bool allowPooling);
[Conditional("SOCKETASYNCCONTEXT_TRACE")]
@@ -333,36 +462,74 @@ public void TraceWithContext(SocketAsyncContext context, string message, [Caller
// These two abstract classes differentiate the operations that go in the
// read queue vs the ones that go in the write queue.
- private abstract class ReadOperation : AsyncOperation, IThreadPoolWorkItem
+ internal abstract partial class ReadOperation : AsyncOperation, IThreadPoolWorkItem
{
- public ReadOperation(SocketAsyncContext context) : base(context) { }
+ public ReadOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.ReadOperation);
+ }
+
+ void IThreadPoolWorkItem.Execute()
+ {
+ if (TryExecuteIoUringCompletionCallback())
+ {
+ return;
+ }
- void IThreadPoolWorkItem.Execute() => AssociatedContext.ProcessAsyncReadOperation(this);
+ AssociatedContext.ProcessAsyncReadOperation(this);
+ }
}
- private abstract class WriteOperation : AsyncOperation, IThreadPoolWorkItem
+ private static bool ShouldDispatchCompletionCallback(AsyncOperation operation)
{
- public WriteOperation(SocketAsyncContext context) : base(context) { }
+ if (operation is ConnectOperation connectOperation)
+ {
+ // Connect can hand callback ownership to a follow-up send operation;
+ // dispatch here only when connect still owns the callback.
+ return connectOperation.Buffer.Length == 0 && connectOperation.Callback is not null;
+ }
- void IThreadPoolWorkItem.Execute() => AssociatedContext.ProcessAsyncWriteOperation(this);
+ return true;
}
- private abstract class SendOperation : WriteOperation
+ private abstract partial class WriteOperation : AsyncOperation, IThreadPoolWorkItem
+ {
+ public WriteOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.WriteOperation);
+ }
+
+ void IThreadPoolWorkItem.Execute()
+ {
+ if (TryExecuteIoUringCompletionCallback())
+ {
+ return;
+ }
+
+ AssociatedContext.ProcessAsyncWriteOperation(this);
+ }
+ }
+
+ private abstract partial class SendOperation : WriteOperation
{
public SocketFlags Flags;
public int BytesTransferred;
public int Offset;
public int Count;
- public SendOperation(SocketAsyncContext context) : base(context) { }
+ public SendOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.SendOperation);
+ }
public Action<int, Memory<byte>, SocketFlags, SocketError>? Callback { get; set; }
public override void InvokeCallback(bool allowPooling) =>
Callback!(BytesTransferred, SocketAddress, SocketFlags.None, ErrorCode);
+
}
- private class BufferMemorySendOperation : SendOperation
+ private partial class BufferMemorySendOperation : SendOperation
{
public Memory Buffer;
@@ -390,18 +557,27 @@ public override void InvokeCallback(bool allowPooling)
}
}
- private sealed class BufferListSendOperation : SendOperation
+ private sealed partial class BufferListSendOperation : SendOperation
{
public IList<ArraySegment<byte>>? Buffers;
public int BufferIndex;
- public BufferListSendOperation(SocketAsyncContext context) : base(context) { }
+ public BufferListSendOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.BufferListSendOperation);
+ }
protected override bool DoTryComplete(SocketAsyncContext context)
{
return SocketPal.TryCompleteSendTo(context._socket, default(ReadOnlySpan<byte>), Buffers, ref BufferIndex, ref Offset, ref Count, Flags, SocketAddress.Span, ref BytesTransferred, out ErrorCode);
}
+ internal void SetBufferPosition(int bufferIndex, int offset)
+ {
+ BufferIndex = bufferIndex;
+ Offset = offset;
+ }
+
public override void InvokeCallback(bool allowPooling)
{
var cb = Callback!;
@@ -446,15 +622,31 @@ public override void InvokeCallback(bool allowPooling) =>
Callback!(BytesTransferred, SocketAddress, ReceivedFlags, ErrorCode);
}
- private sealed class BufferMemoryReceiveOperation : ReceiveOperation
+ private sealed partial class BufferMemoryReceiveOperation : ReceiveOperation
{
public Memory<byte> Buffer;
public bool SetReceivedFlags;
- public BufferMemoryReceiveOperation(SocketAsyncContext context) : base(context) { }
+ public BufferMemoryReceiveOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.BufferMemoryReceiveOperation);
+ }
protected override bool DoTryComplete(SocketAsyncContext context)
{
+ bool consumedBufferedData = false;
+ int bufferedBytes = 0;
+ context.LinuxTryConsumeBufferedPersistentMultishotRecvData(Buffer, ref consumedBufferedData, ref bufferedBytes);
+ if (!SetReceivedFlags &&
+ SocketAddress.Length == 0 &&
+ consumedBufferedData)
+ {
+ BytesTransferred = bufferedBytes;
+ ReceivedFlags = SocketFlags.None;
+ ErrorCode = SocketError.Success;
+ return true;
+ }
+
// Zero byte read is performed to know when data is available.
// We don't have to call receive, our caller is interested in the event.
if (Buffer.Length == 0 && Flags == SocketFlags.None && SocketAddress.Length == 0)
@@ -502,11 +694,14 @@ public override void InvokeCallback(bool allowPooling)
}
}
- private sealed class BufferListReceiveOperation : ReceiveOperation
+ private sealed partial class BufferListReceiveOperation : ReceiveOperation
{
public IList<ArraySegment<byte>>? Buffers;
- public BufferListReceiveOperation(SocketAsyncContext context) : base(context) { }
+ public BufferListReceiveOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.BufferListReceiveOperation);
+ }
protected override bool DoTryComplete(SocketAsyncContext context)
{
@@ -553,7 +748,7 @@ protected override bool DoTryComplete(SocketAsyncContext context)
}
}
- private sealed class ReceiveMessageFromOperation : ReadOperation
+ private sealed partial class ReceiveMessageFromOperation : ReadOperation
{
public Memory<byte> Buffer;
public SocketFlags Flags;
@@ -565,7 +760,10 @@ private sealed class ReceiveMessageFromOperation : ReadOperation
public bool IsIPv6;
public IPPacketInformation IPPacketInformation;
- public ReceiveMessageFromOperation(SocketAsyncContext context) : base(context) { }
+ public ReceiveMessageFromOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.ReceiveMessageFromOperation);
+ }
public Action<int, Memory<byte>, SocketFlags, IPPacketInformation, SocketError>? Callback { get; set; }
@@ -613,21 +811,33 @@ public override void InvokeCallback(bool allowPooling) =>
Callback!(BytesTransferred, SocketAddress, ReceivedFlags, IPPacketInformation, ErrorCode);
}
- private sealed class AcceptOperation : ReadOperation
+ internal sealed partial class AcceptOperation : ReadOperation
{
public IntPtr AcceptedFileDescriptor;
+ public int AcceptSocketAddressLength;
- public AcceptOperation(SocketAsyncContext context) : base(context) { }
+ public AcceptOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.AcceptOperation);
+ }
public Action<IntPtr, Memory<byte>, SocketError>? Callback { get; set; }
protected override bool DoTryComplete(SocketAsyncContext context)
{
+ bool dequeuedPreAcceptedConnection = false;
+ context.LinuxTryDequeuePreAcceptedConnection(this, ref dequeuedPreAcceptedConnection);
+ if (dequeuedPreAcceptedConnection)
+ {
+ return true;
+ }
+
bool completed = SocketPal.TryCompleteAccept(context._socket, SocketAddress, out int socketAddressLen, out AcceptedFileDescriptor, out ErrorCode);
+ AcceptSocketAddressLength = socketAddressLen;
Debug.Assert(ErrorCode == SocketError.Success || AcceptedFileDescriptor == (IntPtr)(-1), $"Unexpected values: ErrorCode={ErrorCode}, AcceptedFileDescriptor={AcceptedFileDescriptor}");
if (ErrorCode == SocketError.Success)
{
- SocketAddress = SocketAddress.Slice(0, socketAddressLen);
+ SocketAddress = SocketAddress.Slice(0, AcceptSocketAddressLength);
}
return completed;
}
@@ -648,21 +858,49 @@ public override void InvokeCallback(bool allowPooling)
}
}
- private sealed class ConnectOperation : BufferMemorySendOperation
+ private sealed partial class ConnectOperation : BufferMemorySendOperation
{
- public ConnectOperation(SocketAsyncContext context) : base(context) { }
+ public ConnectOperation(SocketAsyncContext context) : base(context)
+ {
+ SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.ConnectOperation);
+ }
protected override bool DoTryComplete(SocketAsyncContext context)
{
bool result = SocketPal.TryCompleteConnect(context._socket, out ErrorCode);
context._socket.RegisterConnectResult(ErrorCode);
- if (result && ErrorCode == SocketError.Success && Buffer.Length > 0)
+ if (result && Buffer.Length > 0)
{
- SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, Memory<byte>.Empty, ref BytesTransferred, Callback!, default);
- if (error != SocketError.Success && error != SocketError.IOPending)
+ if (ErrorCode == SocketError.Success)
{
- context._socket.RegisterConnectResult(ErrorCode);
+ Action<int, Memory<byte>, SocketFlags, SocketError>? callback = Callback;
+ Debug.Assert(callback != null);
+ SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, Memory<byte>.Empty, ref BytesTransferred, callback!, default);
+ if (error == SocketError.IOPending)
+ {
+ // Callback ownership moved to the async send operation.
+ Callback = null;
+ Buffer = default;
+ }
+ else
+ {
+ if (error != SocketError.Success)
+ {
+ ErrorCode = error;
+ context._socket.RegisterConnectResult(ErrorCode);
+ }
+
+ // Follow-up send completed synchronously (success or error), so invoke the
+ // Connect callback from this operation path.
+ Buffer = default;
+ }
+ }
+ else
+ {
+ // Connect failed; no follow-up send will occur.
+ // Clear buffer so callback dispatch is not suppressed.
+ Buffer = default;
}
}
return result;
@@ -670,17 +908,18 @@ protected override bool DoTryComplete(SocketAsyncContext context)
public override void InvokeCallback(bool allowPooling)
{
- var cb = Callback!;
+ Action<int, Memory<byte>, SocketFlags, SocketError>? cb = Callback;
int bt = BytesTransferred;
Memory<byte> sa = SocketAddress;
SocketError ec = ErrorCode;
Memory<byte> buffer = Buffer;
- if (buffer.Length == 0)
+ if (cb != null && (buffer.Length == 0 || ec == SocketError.OperationAborted))
{
// Invoke callback only when we are completely done.
// In case data were provided for Connect we may or may not send them all.
- // If we did not we will need follow-up with Send operation
+ // If we did not, we will need a follow-up Send operation.
+ // On cancellation, always invoke; the send was never started.
cb(bt, sa, SocketFlags.None, ec);
}
}
@@ -890,6 +1129,9 @@ public bool StartAsyncOperation(SocketAsyncContext context, TOperation operation
operation.CancellationRegistration = cancellationToken.UnsafeRegister(s => ((TOperation)s!).TryCancel(), operation);
}
+ // Completion-mode staging: partial method is no-op on non-Linux.
+ LinuxTryStageIoUringOperation(operation);
+
return true;
case QueueState.Stopped:
@@ -898,7 +1140,7 @@ public bool StartAsyncOperation(SocketAsyncContext context, TOperation operation
break;
default:
- Environment.FailFast("unexpected queue state");
+ FailFastUnexpectedQueueState(_state);
break;
}
}
@@ -939,7 +1181,7 @@ static void HandleFailedRegistration(SocketAsyncContext context, TOperation oper
}
else
{
- throw new InternalException(error);
+ ThrowInternalException(error);
}
}
}
@@ -986,7 +1228,7 @@ static void HandleFailedRegistration(SocketAsyncContext context, TOperation oper
return null;
default:
- Environment.FailFast("unexpected queue state");
+ FailFastUnexpectedQueueState(_state);
return null;
}
}
@@ -1022,7 +1264,10 @@ internal void ProcessAsyncOperation(TOperation op)
// request for a previous operation could affect a subsequent one)
// and here we know the operation has completed.
op.CancellationRegistration.Dispose();
- op.InvokeCallback(allowPooling: true);
+ if (ShouldDispatchCompletionCallback(op))
+ {
+ op.InvokeCallback(allowPooling: true);
+ }
}
}
@@ -1129,6 +1374,59 @@ public OperationResult ProcessQueuedOperation(TOperation op)
return result;
}
+ public bool TryRemoveCompletedOperation(SocketAsyncContext context, TOperation operation)
+ {
+ using (Lock())
+ {
+ if (_tail == null || _state == QueueState.Stopped)
+ {
+ return false;
+ }
+
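+ // The queue is a circular singly-linked list where _tail.Next is the head; walk
+ // from the head until the operation is found or the walk wraps back to _tail.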
+ AsyncOperation? previous = _tail;
+ AsyncOperation? current = _tail.Next;
+ while (!ReferenceEquals(current, operation))
+ {
+ if (ReferenceEquals(current, _tail))
+ {
+ return false;
+ }
+
+ previous = current;
+ current = current!.Next;
+ }
+
+ Debug.Assert(previous != null && current != null);
+ bool removedHead = ReferenceEquals(current, _tail.Next);
+ bool removedTail = ReferenceEquals(current, _tail);
+
+ if (removedHead && removedTail)
+ {
+ _tail = null;
+ _isNextOperationSynchronous = false;
+ _state = QueueState.Ready;
+ _sequenceNumber++;
+ Trace(context, $"Removed completed {IdOf(operation)} (queue empty)");
+ return true;
+ }
+
+ previous!.Next = current!.Next;
+ if (removedTail)
+ {
+ _tail = (TOperation)previous;
+ }
+
+ if (removedHead)
+ {
+ Debug.Assert(_tail != null);
+ _isNextOperationSynchronous = _tail.Next.Event != null;
+ }
+
+ Trace(context, $"Removed completed {IdOf(operation)}");
+ return true;
+ }
+ }
+
public void CancelAndContinueProcessing(TOperation op)
{
// Note, only sync operations use this method.
@@ -1244,6 +1542,17 @@ public bool StopAndAbort(SocketAsyncContext context)
return aborted;
}
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowInternalException(Interop.Error error) =>
+ throw new InternalException(error);
+
+ [DoesNotReturn]
+ [StackTraceHidden]
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void FailFastUnexpectedQueueState(QueueState state) =>
+ Environment.FailFast($"unexpected queue state: {state}");
+
[Conditional("SOCKETASYNCCONTEXT_TRACE")]
public void Trace(SocketAsyncContext context, string message, [CallerMemberName] string? memberName = null)
{
@@ -1328,6 +1637,7 @@ public bool StopAndAbort()
// Drain queues
aborted |= _sendQueue.StopAndAbort(this);
aborted |= _receiveQueue.StopAndAbort(this);
+ LinuxOnStopAndAbort();
// We don't need to synchronize with Register.
// This method is called when the handle gets released.
@@ -1360,7 +1670,7 @@ public void SetHandleNonBlocking()
{
if (Interop.Sys.Fcntl.SetIsNonBlocking(_socket, 1) != 0)
{
- throw new SocketException((int)SocketPal.GetSocketErrorForErrorCode(Interop.Sys.GetLastError()));
+ ThrowSocketExceptionFromLastError();
}
_isHandleNonBlocking = true;
@@ -1369,11 +1679,36 @@ public void SetHandleNonBlocking()
public bool IsHandleNonBlocking => _isHandleNonBlocking;
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void ThrowIfThreadsAreNotSupported()
+ {
+ if (!Socket.OSSupportsThreads)
+ {
+ ThrowPlatformNotSupportedForMissingThreadSupport();
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void ValidateSyncOperationPreconditions(int timeout)
+ {
+ ThrowIfThreadsAreNotSupported();
+ Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ }
+
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowPlatformNotSupportedForMissingThreadSupport() =>
+ throw new PlatformNotSupportedException();
+
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowSocketExceptionFromLastError() =>
+ throw new SocketException((int)SocketPal.GetSocketErrorForErrorCode(Interop.Sys.GetLastError()));
+
private void PerformSyncOperation<TOperation>(ref OperationQueue<TOperation> queue, TOperation operation, int timeout, int observedSequenceNumber)
where TOperation : AsyncOperation
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
using (var e = new ManualResetEventSlim(false, 0))
{
@@ -1509,7 +1844,7 @@ public SocketError AcceptAsync(Memory<byte> socketAddress, out int socketAddress
public SocketError Connect(Memory<byte> socketAddress)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
+ ThrowIfThreadsAreNotSupported();
Debug.Assert(socketAddress.Length > 0, $"Unexpected socketAddressLen: {socketAddress.Length}");
// Connect is different than the usual "readiness" pattern of other operations.
@@ -1603,9 +1938,7 @@ public SocketError ReceiveAsync(Memory<byte> buffer, SocketFlags flags, out int
public SocketError ReceiveFrom(Memory<byte> buffer, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, int timeout, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1636,7 +1969,7 @@ public SocketError ReceiveFrom(Memory<byte> buffer, ref SocketFlags flags, Memor
public unsafe SocketError ReceiveFrom(Span<byte> buffer, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, int timeout, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1748,9 +2081,7 @@ public SocketError ReceiveAsync(IList<ArraySegment<byte>> buffers, SocketFlags f
public SocketError ReceiveFrom(IList<ArraySegment<byte>> buffers, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, int timeout, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1817,9 +2148,7 @@ public SocketError ReceiveFromAsync(IList<ArraySegment<byte>> buffers, SocketFla
public SocketError ReceiveMessageFrom(
Memory<byte> buffer, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, bool isIPv4, bool isIPv6, int timeout, out IPPacketInformation ipPacketInformation, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1854,9 +2183,7 @@ public SocketError ReceiveMessageFrom(
public unsafe SocketError ReceiveMessageFrom(
Span<byte> buffer, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, bool isIPv4, bool isIPv6, int timeout, out IPPacketInformation ipPacketInformation, out int bytesReceived)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
SocketFlags receivedFlags;
SocketError errorCode;
@@ -1946,9 +2273,7 @@ public SocketError SendAsync(Memory<byte> buffer, int offset, int count, SocketF
public SocketError SendTo(byte[] buffer, int offset, int count, SocketFlags flags, Memory<byte> socketAddress, int timeout, out int bytesSent)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
bytesSent = 0;
SocketError errorCode;
@@ -1978,9 +2303,7 @@ public SocketError SendTo(byte[] buffer, int offset, int count, SocketFlags flag
public unsafe SocketError SendTo(ReadOnlySpan<byte> buffer, SocketFlags flags, Memory<byte> socketAddress, int timeout, out int bytesSent)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
bytesSent = 0;
SocketError errorCode;
@@ -2057,9 +2380,7 @@ public SocketError SendAsync(IList<ArraySegment<byte>> buffers, SocketFlags flag
public SocketError SendTo(IList<ArraySegment<byte>> buffers, SocketFlags flags, Memory<byte> socketAddress, int timeout, out int bytesSent)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
bytesSent = 0;
int bufferIndex = 0;
@@ -2127,9 +2448,7 @@ public SocketError SendToAsync(IList<ArraySegment<byte>> buffers, SocketFlags fl
public SocketError SendFile(SafeFileHandle fileHandle, long offset, long count, int timeout, out long bytesSent)
{
- if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException();
-
- Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}");
+ ValidateSyncOperationPreconditions(timeout);
bytesSent = 0;
SocketError errorCode;
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringCompletionDispatch.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringCompletionDispatch.Linux.cs
new file mode 100644
index 00000000000000..1baf5c67b8e6b6
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringCompletionDispatch.Linux.cs
@@ -0,0 +1,684 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ private readonly partial struct SocketEventHandler
+ {
+ /// <summary>Delivers a completed operation to its owning socket context.</summary>
+ private void DispatchCompletedIoUringOperation(SocketAsyncContext.AsyncOperation operation, ulong userData)
+ {
+ if (!operation.AssociatedContext.TryCompleteIoUringOperation(operation))
+ {
+ _engine.RecordBenignLateIoUringCompletion(userData);
+ }
+ }
+
+ /// <summary>Completes a deferred SEND_ZC operation when its NOTIF CQE arrives.</summary>
+ public void DispatchZeroCopyIoUringNotification(ulong payload)
+ {
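+ // SEND_ZC yields two CQEs: the send result first, then a NOTIF CQE once the
+ // kernel releases the user buffer. Only the NOTIF completes the managed operation.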
+ ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
+ if (!_engine.TryTakeTrackedIoUringOperation(userData, out SocketAsyncContext.AsyncOperation? operation) || operation is null)
+ {
+ return;
+ }
+
+ Debug.Assert(
+ !_engine.IsZeroCopyNotificationPending(userData),
+ "NOTIF CQE dispatch must occur only after clearing SEND_ZC pending slot state.");
+ Debug.Assert(
+ operation.IoUringUserData == userData,
+ "Deferred SEND_ZC operation must still be tracked with its original user_data at NOTIF dispatch.");
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Completed);
+ operation.ClearIoUringUserData();
+ DispatchCompletedIoUringOperation(operation, userData);
+ }
+
+ /// <summary>Processes a single completion and dispatches it to its owning operation.</summary>
+ public void DispatchSingleIoUringCompletion(
+ ulong userData,
+ int result,
+ uint flags,
+ int socketAddressLen,
+ int controlBufferLen,
+ uint auxiliaryData,
+ bool hasFixedRecvBuffer,
+ ushort fixedRecvBufferId,
+ ref bool enqueuedFallbackEvent)
+ {
+ Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+ "DispatchSingleIoUringCompletion must only run on the event-loop thread.");
+ if (userData == 0)
+ {
+ RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+ return;
+ }
+
+ // Benign race: cancellation/abort paths may have already removed this tracked entry.
+ if (!_engine.TryTakeTrackedIoUringOperation(userData, out SocketAsyncContext.AsyncOperation? operation))
+ {
+ RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+ _engine.RecordBenignLateIoUringCompletion(userData);
+ return;
+ }
+
+ if (operation is null)
+ {
+ RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId);
+ return;
+ }
+
+ SocketAsyncContext receiveContext = operation.AssociatedContext;
+ if (receiveContext.IsPersistentMultishotRecvArmed() &&
+ receiveContext.PersistentMultishotRecvUserData == userData)
+ {
+ // Terminal CQE for persistent multishot recv (normal completion, cancel,
+ // ENOBUFS, EOF, or other error): clear armed-state so the next receive can re-arm.
+ SocketsTelemetry.Log.IoUringPersistentMultishotRecvTermination();
+ receiveContext.ClearPersistentMultishotRecvArmed();
+ }
+
+ if (operation is SocketAsyncContext.AcceptOperation acceptOperation &&
+ acceptOperation.AssociatedContext.MultishotAcceptUserData == userData)
+ {
+ acceptOperation.AssociatedContext.DisarmMultishotAccept();
+ }
+
+ uint completionAuxiliaryData = auxiliaryData;
+ int completionResultCode = result;
+ if (!TryMaterializeIoUringReceiveCompletion(
+ operation!,
+ completionResultCode,
+ flags,
+ hasFixedRecvBuffer,
+ fixedRecvBufferId,
+ ref completionAuxiliaryData))
+ {
+ completionResultCode = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.ENOBUFS);
+ completionAuxiliaryData = 0;
+ }
+
+ // Record the CQE's message metadata before processing the result so recvmsg post-processing can consume it.
+ operation!.SetIoUringCompletionMessageMetadata(socketAddressLen, controlBufferLen);
+ SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult =
+ operation.ProcessIoUringCompletionResult(completionResultCode, flags, completionAuxiliaryData);
+
+ if (completionDispatchResult == SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed &&
+ _engine.IsZeroCopyNotificationPending(userData))
+ {
+ // SEND_ZC API contract: complete managed operation only once NOTIF confirms
+ // the kernel/NIC no longer references the caller buffer.
+ _engine.AssertZeroCopyDeferredCompletionState(userData, operation);
+ if (!_engine.TryReattachTrackedIoUringOperation(userData, operation))
+ {
+ Debug.Fail("SEND_ZC deferred completion reattach failed; completing operation with EINVAL and releasing deferred slot.");
+ bool cleanedDeferredSlot = _engine.TryCleanupDeferredZeroCopyCompletionSlot(userData);
+ Debug.Assert(
+ cleanedDeferredSlot,
+ "SEND_ZC deferred completion reattach failure should release the deferred completion slot.");
+ operation.ErrorCode = SocketPal.GetSocketErrorForErrorCode(Interop.Error.EINVAL);
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Completed);
+ operation.ClearIoUringUserData();
+ DispatchCompletedIoUringOperation(operation, userData);
+ return;
+ }
+
+ return;
+ }
+
+ DispatchIoUringCompletionResult(
+ operation,
+ completionDispatchResult,
+ userData,
+ ref enqueuedFallbackEvent);
+ }
+
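+ // Multishot requests stay armed in the kernel and post one CQE per shot; every
+ // non-terminal shot carries IORING_CQE_F_MORE, while the terminal CQE (error,
+ // cancellation, or buffer exhaustion) omits it and is routed through the
+ // single-shot dispatch path above.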
+ /// <summary>
+ /// Processes a multishot completion by completing the current operation and
+ /// requesting async cancel for non-terminal shots until full item-9 dispatch lands.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public void DispatchMultishotIoUringCompletion(
+ ulong userData,
+ int result,
+ uint flags,
+ int socketAddressLen,
+ int controlBufferLen,
+ uint auxiliaryData,
+ bool hasFixedRecvBuffer,
+ ushort fixedRecvBufferId,
+ ref bool enqueuedFallbackEvent)
+ {
+ Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+ "DispatchMultishotIoUringCompletion must only run on the event-loop thread.");
+ _ = enqueuedFallbackEvent; // Transitional path never requeues via readiness fallback.
+ _ = hasFixedRecvBuffer;
+ _ = fixedRecvBufferId;
+ Debug.Assert((flags & IoUringConstants.CqeFMore) != 0,
+ "Multishot dispatch must only be used for non-terminal CQEs (IORING_CQE_F_MORE).");
+
+ if (userData == 0)
+ {
+ RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer: false, fixedRecvBufferId: 0);
+ return;
+ }
+
+ if (!_engine.TryGetTrackedIoUringOperation(userData, out SocketAsyncContext.AsyncOperation? operation) || operation is null)
+ {
+ RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer: false, fixedRecvBufferId: 0);
+ _engine.RecordBenignLateIoUringCompletion(userData);
+ return;
+ }
+
+ if (operation is SocketAsyncContext.AcceptOperation acceptOperation)
+ {
+ DispatchMultishotAcceptIoUringCompletion(
+ acceptOperation,
+ userData,
+ result,
+ flags,
+ socketAddressLen,
+ auxiliaryData);
+ return;
+ }
+
+ if (!operation.IsInWaitingState())
+ {
+ if (!TryBufferEarlyPersistentMultishotRecvCompletion(operation.AssociatedContext, result, flags))
+ {
+ _engine.TryRequestIoUringCancellation(userData);
+ }
+
+ return;
+ }
+
+ uint completionAuxiliaryData = auxiliaryData;
+ int completionResultCode = result;
+ if (!TryMaterializeIoUringReceiveCompletion(
+ operation,
+ completionResultCode,
+ flags,
+ hasFixedRecvBuffer: false,
+ fixedRecvBufferId: 0,
+ ref completionAuxiliaryData))
+ {
+ completionResultCode = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.ENOBUFS);
+ completionAuxiliaryData = 0;
+ }
+
+ operation.SetIoUringCompletionMessageMetadata(socketAddressLen, controlBufferLen);
+ SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult =
+ operation.ProcessIoUringCompletionResult(completionResultCode, flags, completionAuxiliaryData);
+
+ SocketAsyncContext context = operation.AssociatedContext;
+ bool isPersistentMultishotRecv =
+ context.IsPersistentMultishotRecvArmed() &&
+ context.PersistentMultishotRecvUserData == userData;
+
+ // Transitional multishot model cancels after the first shot.
+ // Persistent multishot receive remains armed and rebinds future operations via TryReplace.
+ if (!isPersistentMultishotRecv)
+ {
+ _engine.TryRequestIoUringCancellation(userData);
+ }
+
+ switch (completionDispatchResult)
+ {
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed:
+ DispatchCompletedIoUringOperation(operation, userData);
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending:
+ // Transitional multishot mode does not requeue intermediate shots.
+ // Cancellation is already requested above; terminal CQE cleanup path
+ // remains responsible for tracked-state/resource release.
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled:
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored:
+ break;
+
+ default:
+ Debug.Fail($"Unexpected io_uring multishot completion result: {completionDispatchResult}");
+ break;
+ }
+ }
+
+ /// <summary>
+ /// Handles transitional multishot-accept CQEs by completing one waiting operation and
+ /// canceling the multishot request. Extra successful shots are queued for dequeue on
+ /// the accept operation queue when possible.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void DispatchMultishotAcceptIoUringCompletion(
+ SocketAsyncContext.AcceptOperation operation,
+ ulong userData,
+ int result,
+ uint flags,
+ int socketAddressLen,
+ uint auxiliaryData)
+ {
+ Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+ "DispatchMultishotAcceptIoUringCompletion must only run on the event-loop thread.");
+ operation.SetIoUringCompletionMessageMetadata(socketAddressLen, 0);
+ SocketAsyncContext context = operation.AssociatedContext;
+ SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult =
+ operation.ProcessIoUringCompletionResult(result, flags, auxiliaryData);
+
+ // Transitional multishot-accept model: complete one managed accept and then
+ // issue async-cancel so terminal cleanup runs through single-shot dispatch.
+ _engine.TryRequestIoUringCancellation(userData);
+
+ switch (completionDispatchResult)
+ {
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed:
+ DispatchCompletedIoUringOperation(operation, userData);
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending:
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled:
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored:
+ if (result >= 0)
+ {
+ int addressLength = auxiliaryData > (uint)operation.SocketAddress.Length ?
+ operation.SocketAddress.Length :
+ (int)auxiliaryData;
+ if (context.TryEnqueuePreAcceptedConnection((IntPtr)result, operation.SocketAddress.Span, addressLength))
+ {
+ _engine.EnqueueReadinessFallbackEvent(context, Interop.Sys.SocketEvents.Read);
+ }
+ else
+ {
+ Interop.Sys.Close((IntPtr)result);
+ }
+ }
+ break;
+
+ default:
+ Debug.Fail($"Unexpected io_uring multishot accept completion result: {completionDispatchResult}");
+ break;
+ }
+ }
+
+ /// <summary>
+ /// For receive completions that used provided buffers (buffer-select or fixed receive),
+ /// materializes payload bytes into the operation target and recycles checked-out buffers.
+ /// </summary>
+ private unsafe bool TryMaterializeIoUringReceiveCompletion(
+ SocketAsyncContext.AsyncOperation operation,
+ int result,
+ uint flags,
+ bool hasFixedRecvBuffer,
+ ushort fixedRecvBufferId,
+ ref uint auxiliaryData)
+ {
+ bool hasSelectedBuffer = (flags & IoUringConstants.CqeFBuffer) != 0;
+ if (!hasFixedRecvBuffer && !hasSelectedBuffer)
+ {
+ return true;
+ }
+
+ IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+ if (providedBufferRing is null)
+ {
+ return false;
+ }
+
+ ushort bufferId;
+ bool reportRecycleFailureAsDepletion;
+ byte* providedBuffer = null;
+ int providedBufferLength = 0;
+ if (hasFixedRecvBuffer)
+ {
+ bufferId = fixedRecvBufferId;
+ reportRecycleFailureAsDepletion = true;
+
+ if (result > 0 &&
+ !providedBufferRing.TryGetCheckedOutBuffer(
+ bufferId,
+ out providedBuffer,
+ out providedBufferLength))
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ return false;
+ }
+ }
+ else
+ {
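+ // Buffer-select completions encode the chosen buffer id in the upper bits of
+ // cqe->flags; e.g. with the kernel's IORING_CQE_BUFFER_SHIFT of 16, flags
+ // 0x002B0001 means IORING_CQE_F_BUFFER is set and buffer 0x2B was consumed.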
+ bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+ reportRecycleFailureAsDepletion = false;
+ if (!providedBufferRing.TryAcquireBufferForCompletion(
+ bufferId,
+ out providedBuffer,
+ out providedBufferLength))
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ return false;
+ }
+ }
+
+ bool handled = result <= 0;
+ try
+ {
+ if (result > 0)
+ {
+ handled =
+ operation.TryProcessIoUringProvidedBufferCompletion(
+ providedBuffer,
+ providedBufferLength,
+ result,
+ ref auxiliaryData);
+ }
+
+ RecordProvidedBufferUtilizationIfEnabled(providedBufferRing, result);
+ }
+ finally
+ {
+ handled &= TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ bufferId,
+ reportFailureAsDepletion: reportRecycleFailureAsDepletion);
+ }
+
+ return handled;
+ }
+
+ /// <summary>
+ /// For persistent multishot recv, buffers payload bytes that arrive while no
+ /// managed receive operation is in the Waiting state.
+ /// </summary>
+ private unsafe bool TryBufferEarlyPersistentMultishotRecvCompletion(
+ SocketAsyncContext context,
+ int result,
+ uint flags)
+ {
+ if (result <= 0)
+ {
+ return true;
+ }
+
+ if ((flags & IoUringConstants.CqeFBuffer) == 0)
+ {
+ return false;
+ }
+
+ IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+ if (providedBufferRing is null)
+ {
+ return false;
+ }
+
+ ushort bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+ if (!providedBufferRing.TryAcquireBufferForCompletion(
+ bufferId,
+ out byte* providedBuffer,
+ out int providedBufferLength))
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ return false;
+ }
+
+ bool buffered = false;
+ try
+ {
+ if ((uint)result <= (uint)providedBufferLength)
+ {
+ buffered = context.TryBufferEarlyPersistentMultishotRecvData(
+ new ReadOnlySpan<byte>(providedBuffer, result));
+ if (buffered)
+ {
+ RecordProvidedBufferUtilizationIfEnabled(providedBufferRing, result);
+ _engine.RecordIoUringPersistentMultishotRecvEarlyDataForDrainBatch();
+ }
+ }
+ }
+ finally
+ {
+ buffered &= TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ bufferId,
+ reportFailureAsDepletion: false);
+ }
+
+ return buffered;
+ }
+
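+ // Buffering early shots keeps the persistent multishot request armed across the
+ // gap between managed receives; when buffering is not possible, the caller cancels
+ // the request instead (see DispatchMultishotIoUringCompletion).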
+ /// <summary>
+ /// Recycles a provided-buffer selection for completions that can no longer be
+ /// dispatched to a tracked operation (e.g., late multishot CQEs after cancel).
+ /// </summary>
+ private unsafe void RecycleUntrackedReceiveCompletionBuffers(
+ uint flags,
+ bool hasFixedRecvBuffer,
+ ushort fixedRecvBufferId)
+ {
+ IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+ if (providedBufferRing is null)
+ {
+ return;
+ }
+
+ if ((flags & IoUringConstants.CqeFBuffer) == 0)
+ {
+ if (hasFixedRecvBuffer)
+ {
+ _ = TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ fixedRecvBufferId,
+ reportFailureAsDepletion: true);
+ }
+
+ return;
+ }
+
+ ushort bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+ if (!providedBufferRing.TryAcquireBufferForCompletion(
+ bufferId,
+ out _,
+ out _))
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ }
+ else
+ {
+ _ = TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ bufferId,
+ reportFailureAsDepletion: false);
+ }
+
+ if (hasFixedRecvBuffer)
+ {
+ _ = TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ fixedRecvBufferId,
+ reportFailureAsDepletion: true);
+ }
+ }
+
+ private void RecordProvidedBufferUtilizationIfEnabled(
+ IoUringProvidedBufferRing providedBufferRing,
+ int bytesTransferred)
+ {
+ if (bytesTransferred <= 0 || !_engine._adaptiveBufferSizingEnabled)
+ {
+ return;
+ }
+
+ Debug.Assert(_engine.IsCurrentThreadEventLoopThread(),
+ "Adaptive provided-buffer utilization tracking must run on the event-loop thread.");
+ providedBufferRing.RecordCompletionUtilization(bytesTransferred);
+ }
+
+ private bool TryRecycleProvidedBufferFromCheckedOutState(
+ IoUringProvidedBufferRing providedBufferRing,
+ ushort bufferId,
+ bool reportFailureAsDepletion)
+ {
+ bool recycled = providedBufferRing.TryRecycleBufferFromCompletion(bufferId);
+ if (recycled)
+ {
+ _engine.RecordIoUringProvidedBufferRecycleForDrainBatch();
+ }
+ else if (reportFailureAsDepletion)
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ }
+
+ return recycled;
+ }
+
+ /// Requeues a pending operation or falls back to readiness notification.
+ private bool DispatchPendingIoUringOperation(SocketAsyncContext.AsyncOperation operation, ulong userData)
+ {
+ PendingIoUringReprepareResult inlineReprepareResult = TryDispatchPendingIoUringOperationInline(operation);
+ if (inlineReprepareResult == PendingIoUringReprepareResult.Prepared)
+ {
+ return false;
+ }
+
+ if (inlineReprepareResult == PendingIoUringReprepareResult.NotAttempted &&
+ operation.TryQueueIoUringPreparation())
+ {
+ _engine.RecordIoUringPendingRetryQueuedToPrepareQueue();
+ return false;
+ }
+
+ Debug.Assert(
+ inlineReprepareResult == PendingIoUringReprepareResult.Failed ||
+ !_engine._ioUringCapabilities.IsCompletionMode,
+ "Requeue should not fail in pure io_uring completion mode when inline re-prepare was not attempted.");
+
+ _engine.RecordIoUringCompletionRequeueFailure(userData);
+ operation.ClearIoUringUserData();
+ Interop.Sys.SocketEvents fallbackEvents = operation.GetIoUringFallbackSocketEvents();
+ if (fallbackEvents == Interop.Sys.SocketEvents.None)
+ {
+ return false;
+ }
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogUnexpectedCompletionFallback(_engine, fallbackEvents, userData);
+ }
+ _eventQueue.Enqueue(new SocketIOEvent(operation.AssociatedContext, fallbackEvents));
+ return true;
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static void LogUnexpectedCompletionFallback(SocketAsyncEngine engine, Interop.Sys.SocketEvents events, ulong completionUserData)
+ {
+ NetEventSource.Error(
+ engine,
+ $"io_uring completion fallback to readiness notification in unexpected path: events={events}, user_data=0x{completionUserData:x}");
+ }
+ }
+
+ /// <summary>
+ /// Outcome of an inline re-prepare attempt for a pending operation on the event-loop
+ /// thread. Inline re-prepare avoids an extra prepare-queue round-trip for
+ /// completion-mode retries.
+ /// </summary>
+ private enum PendingIoUringReprepareResult : byte
+ {
+ NotAttempted = 0,
+ Prepared = 1,
+ Failed = 2
+ }
+
+ /// <summary>
+ /// Attempts to re-prepare a pending operation inline.
+ /// Returns whether the attempt produced an SQE, was not attempted, or failed without
+ /// producing one.
+ /// </summary>
+ private PendingIoUringReprepareResult TryDispatchPendingIoUringOperationInline(SocketAsyncContext.AsyncOperation operation)
+ {
+ if (!_engine._ioUringCapabilities.IsCompletionMode || !_engine.IsCurrentThreadEventLoopThread())
+ {
+ return PendingIoUringReprepareResult.NotAttempted;
+ }
+
+ long prepareSequence = operation.MarkReadyForIoUringPreparation();
+ Interop.Error prepareError = _engine.TryPrepareAndTrackIoUringOperation(
+ operation,
+ prepareSequence,
+ out bool preparedSqe);
+ if (prepareError != Interop.Error.SUCCESS)
+ {
+ Debug.Fail($"io_uring inline re-prepare failed: {prepareError}");
+ if (NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Error(_engine, $"io_uring inline re-prepare failed: {prepareError}");
+ }
+
+ return PendingIoUringReprepareResult.Failed;
+ }
+
+ return preparedSqe ? PendingIoUringReprepareResult.Prepared : PendingIoUringReprepareResult.Failed;
+ }
+
+ /// Routes a CQE completion result to the appropriate dispatch behavior.
+ private void DispatchIoUringCompletionResult(
+ SocketAsyncContext.AsyncOperation operation,
+ SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionResult,
+ ulong userData,
+ ref bool enqueuedFallbackEvent)
+ {
+ switch (completionResult)
+ {
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed:
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Completed);
+ operation.ClearIoUringUserData();
+ DispatchCompletedIoUringOperation(operation, userData);
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending:
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Queued);
+ if (operation.ShouldReuseIoUringPreparationResourcesOnPending)
+ {
+ operation.MarkIoUringPreparationReusable();
+ operation.ResetIoUringUserDataForRequeue();
+ }
+ else
+ {
+ operation.ClearIoUringUserData();
+ }
+
+ enqueuedFallbackEvent |= DispatchPendingIoUringOperation(operation, userData);
+ break;
+
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled:
+ case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored:
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Canceled);
+ operation.ClearIoUringUserData();
+ _engine.RecordBenignLateIoUringCompletion(userData);
+ break;
+
+ default:
+ Debug.Fail($"Unexpected io_uring completion result: {completionResult}");
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Detached);
+ operation.ClearIoUringUserData();
+ _engine.RecordBenignLateIoUringCompletion(userData);
+ break;
+ }
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringConfiguration.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringConfiguration.Linux.cs
new file mode 100644
index 00000000000000..c6aa7912c60fa8
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringConfiguration.Linux.cs
@@ -0,0 +1,211 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ private readonly struct IoUringConfigurationInputs
+ {
+ internal readonly string? IoUringEnvironmentValue;
+ internal readonly bool IoUringFeatureSwitchEnabled;
+ internal readonly string? SqPollEnvironmentValue;
+ internal readonly bool SqPollFeatureSwitchEnabled;
+ internal readonly string? DirectSqeEnvironmentValue;
+ internal readonly string? ZeroCopySendEnvironmentValue;
+
+ internal IoUringConfigurationInputs(
+ string? ioUringEnvironmentValue,
+ bool ioUringFeatureSwitchEnabled,
+ string? sqPollEnvironmentValue,
+ bool sqPollFeatureSwitchEnabled,
+ string? directSqeEnvironmentValue,
+ string? zeroCopySendEnvironmentValue)
+ {
+ IoUringEnvironmentValue = ioUringEnvironmentValue;
+ IoUringFeatureSwitchEnabled = ioUringFeatureSwitchEnabled;
+ SqPollEnvironmentValue = sqPollEnvironmentValue;
+ SqPollFeatureSwitchEnabled = sqPollFeatureSwitchEnabled;
+ DirectSqeEnvironmentValue = directSqeEnvironmentValue;
+ ZeroCopySendEnvironmentValue = zeroCopySendEnvironmentValue;
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static IoUringResolvedConfiguration ResolveIoUringResolvedConfiguration()
+ {
+ IoUringConfigurationInputs inputs = ReadIoUringConfigurationInputs();
+ return new IoUringResolvedConfiguration(
+ ioUringEnabled: ResolveIoUringEnabled(inputs),
+ sqPollRequested: ResolveSqPollRequested(inputs),
+ directSqeDisabled: ResolveIoUringDirectSqeDisabled(inputs),
+ zeroCopySendOptedIn: ResolveZeroCopySendOptedIn(inputs),
+ registerBuffersEnabled: s_ioUringRegisterBuffersEnabled,
+ adaptiveProvidedBufferSizingEnabled: s_ioUringAdaptiveBufferSizingEnabled,
+ providedBufferSize: s_ioUringProvidedBufferSize,
+ prepareQueueCapacity: s_ioUringPrepareQueueCapacity,
+ cancellationQueueCapacity: s_ioUringCancellationQueueCapacity);
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static IoUringConfigurationInputs ReadIoUringConfigurationInputs()
+ {
+#if DEBUG
+ string? directSqeValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.DirectSqe);
+ string? zeroCopySendValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.ZeroCopySend);
+#else
+ string? directSqeValue = null;
+ string? zeroCopySendValue = null;
+#endif
+
+ return new IoUringConfigurationInputs(
+ ioUringEnvironmentValue: Environment.GetEnvironmentVariable(IoUringEnvironmentVariable),
+ ioUringFeatureSwitchEnabled: IsIoUringFeatureEnabled,
+ sqPollEnvironmentValue: Environment.GetEnvironmentVariable(IoUringSqPollEnvironmentVariable),
+ sqPollFeatureSwitchEnabled: IsSqPollFeatureEnabled,
+ directSqeEnvironmentValue: directSqeValue,
+ zeroCopySendEnvironmentValue: zeroCopySendValue);
+ }
+
+ /// <summary>
+ /// Checks whether direct SQE submission is disabled.
+ /// Defaults to enabled; test-only env var can disable for deterministic tests.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static bool IsIoUringDirectSqeDisabled()
+ {
+ IoUringConfigurationInputs inputs = ReadIoUringConfigurationInputs();
+ return ResolveIoUringDirectSqeDisabled(inputs);
+ }
+
+ /// Checks whether io_uring is enabled.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static bool IsIoUringEnabled()
+ {
+ IoUringConfigurationInputs inputs = ReadIoUringConfigurationInputs();
+ return ResolveIoUringEnabled(inputs);
+ }
+
+ [FeatureSwitchDefinition(UseIoUringAppContextSwitch)]
+ private static bool IsIoUringFeatureEnabled
+ {
+ get
+ {
+ if (AppContext.TryGetSwitch(UseIoUringAppContextSwitch, out bool enabled))
+ {
+ return enabled;
+ }
+
+ return false;
+ }
+ }
+
+ /// <summary>
+ /// Returns whether SEND_ZC should be enabled.
+ /// Defaults to enabled; test-only env var can disable for deterministic tests.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static bool IsZeroCopySendOptedIn()
+ {
+ IoUringConfigurationInputs inputs = ReadIoUringConfigurationInputs();
+ return ResolveZeroCopySendOptedIn(inputs);
+ }
+
+ private static bool ResolveIoUringDirectSqeDisabled(in IoUringConfigurationInputs inputs)
+ {
+#if DEBUG
+ // Test-only override for deterministic coverage.
+ string? value = inputs.DirectSqeEnvironmentValue;
+ if (string.Equals(value, "0", StringComparison.Ordinal))
+ {
+ return true;
+ }
+
+ if (string.Equals(value, "1", StringComparison.Ordinal))
+ {
+ return false;
+ }
+#endif
+
+ // Default: direct SQE enabled.
+ return false;
+ }
+
+ private static bool ResolveIoUringEnabled(in IoUringConfigurationInputs inputs)
+ {
+ // Override order: environment variable wins over AppContext switch.
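+ // e.g. a value of "1" force-enables io_uring even when the AppContext switch is
+ // off, "0" force-disables it even when the switch is on, and any other value
+ // defers to the feature switch.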
+ if (string.Equals(inputs.IoUringEnvironmentValue, "1", StringComparison.Ordinal))
+ {
+ return true;
+ }
+
+ if (string.Equals(inputs.IoUringEnvironmentValue, "0", StringComparison.Ordinal))
+ {
+ return false;
+ }
+
+ return inputs.IoUringFeatureSwitchEnabled;
+ }
+
+ private static bool ResolveZeroCopySendOptedIn(in IoUringConfigurationInputs inputs)
+ {
+#if DEBUG
+ // Test-only override for deterministic coverage.
+ string? value = inputs.ZeroCopySendEnvironmentValue;
+ if (string.Equals(value, "1", StringComparison.Ordinal))
+ {
+ return true;
+ }
+
+ if (string.Equals(value, "0", StringComparison.Ordinal))
+ {
+ return false;
+ }
+#endif
+
+ // Default: zero-copy send enabled.
+ return true;
+ }
+
+ [FeatureSwitchDefinition(UseIoUringSqPollAppContextSwitch)]
+ private static bool IsSqPollFeatureEnabled
+ {
+ get
+ {
+ if (AppContext.TryGetSwitch(UseIoUringSqPollAppContextSwitch, out bool enabled))
+ {
+ return enabled;
+ }
+
+ return false;
+ }
+ }
+
+ /// <summary>
+ /// Returns whether SQPOLL mode has been explicitly requested.
+ /// SQPOLL requires dual opt-in: AppContext switch + environment variable.
+ /// This is intentionally stricter than the primary io_uring gate
+ /// (`IsIoUringEnabled`), which accepts either source.
+ /// SQPOLL pins a kernel thread, so accidental activation should require
+ /// explicit confirmation from both configuration surfaces.
+ /// </summary>
+ private static bool IsSqPollRequested()
+ {
+ IoUringConfigurationInputs inputs = ReadIoUringConfigurationInputs();
+ return ResolveSqPollRequested(inputs);
+ }
+
+ private static bool ResolveSqPollRequested(in IoUringConfigurationInputs inputs)
+ {
+ if (!inputs.SqPollFeatureSwitchEnabled)
+ {
+ return false;
+ }
+
+ return string.Equals(inputs.SqPollEnvironmentValue, "1", StringComparison.Ordinal);
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringDiagnostics.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringDiagnostics.Linux.cs
new file mode 100644
index 00000000000000..972f6a2240615a
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringDiagnostics.Linux.cs
@@ -0,0 +1,320 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ /// Resets the native diagnostics poll countdown.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void InitializeLinuxIoUringDiagnosticsState() =>
+ _ioUringDiagnosticsPollCountdown = IoUringDiagnosticsPollInterval;
+
+ /// Logs a failed ASYNC_CANCEL SQE preparation.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringAsyncCancelPrepareFailure(SocketError cancelError, ulong userData, IoUringCancellationOrigin origin)
+ {
+ string originLabel = origin == IoUringCancellationOrigin.Teardown ? " during teardown" : string.Empty;
+ NetEventSource.Info(this, $"io_uring async-cancel prepare failed{originLabel}: error={cancelError}, user_data=0x{userData:x}");
+ }
+
+ /// Logs a failed ASYNC_CANCEL submission.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringAsyncCancelSubmitFailure(Interop.Error submitError, IoUringCancellationOrigin origin)
+ {
+ string originLabel = origin == IoUringCancellationOrigin.Teardown ? " during teardown" : string.Empty;
+ NetEventSource.Info(this, $"io_uring async-cancel submit failed{originLabel}: error={submitError}");
+ }
+
+ /// Logs a sampled counter value with its associated user_data.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringCounterSample(string message, long count, ulong userData)
+ {
+ NetEventSource.Info(this, $"{message}: count={count}, user_data=0x{userData:x}");
+ }
+
+ /// Logs a prepare queue overflow event.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringPrepareQueueOverflow(long count, int capacity)
+ {
+ NetEventSource.Info(this, $"io_uring prepare queue overflow: count={count}, capacity={capacity}");
+ }
+
+ /// Logs a cancellation queue overflow event.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringCancellationQueueOverflow(long count, int capacity)
+ {
+ NetEventSource.Info(this, $"io_uring cancellation queue overflow: count={count}, capacity={capacity}");
+ }
+
+ /// Logs a CQ overflow observation from the kernel CQ ring counter.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringCqOverflow(uint totalOverflowCount, uint delta)
+ {
+ NetEventSource.Error(this, $"io_uring CQ overflow detected: total={totalOverflowCount}, delta={delta}");
+ }
+
+ /// Logs CQ-overflow recovery activation with branch discriminator.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringCqOverflowRecoveryEntry(
+ IoUringCqOverflowRecoveryBranch branch,
+ uint totalOverflowCount,
+ uint delta)
+ {
+ NetEventSource.Error(
+ this,
+ $"io_uring CQ overflow recovery entered: branch={branch}, total={totalOverflowCount}, delta={delta}");
+ }
+
+ /// Logs CQ-overflow recovery completion for diagnostics correlation.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringCqOverflowRecoveryCompleted(
+ IoUringCqOverflowRecoveryBranch branch,
+ int completionSlotsInUse)
+ {
+ NetEventSource.Info(
+ this,
+ $"io_uring CQ overflow recovery completed: branch={branch}, completion_slots_in_use={completionSlotsInUse}");
+ }
+
+ /// Logs a deferred multishot-accept rearm nudge issued after CQ-overflow recovery.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringDeferredMultishotAcceptRearmAfterRecovery()
+ {
+ NetEventSource.Info(this, "io_uring CQ overflow recovery branch (a): deferred multishot-accept rearm nudged after drain.");
+ }
+
+ /// Logs when teardown preempts in-progress CQ-overflow recovery ownership.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringCqOverflowRecoveryTeardownPreempted()
+ {
+ NetEventSource.Info(this, "io_uring CQ overflow recovery preempted by teardown; cancellation/drain owns shutdown.");
+ }
+
+ /// Logs a failed eventfd wake signal.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringWakeFailure(Interop.Error error)
+ {
+ NetEventSource.Info(this, $"io_uring wake signal failed: error={error}");
+ }
+
+ /// Logs eventfd wake circuit-breaker transitions.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringWakeCircuitBreakerStateChanged(bool enabled, int consecutiveFailures)
+ {
+ NetEventSource.Info(
+ this,
+ $"io_uring wake circuit-breaker {(enabled ? "enabled" : "disabled")}: consecutiveWakeFailures={consecutiveFailures}");
+ }
+
+ /// Logs the final count of benign late completions at teardown.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringTeardownSummary(long lateCompletionCount)
+ {
+ NetEventSource.Info(this, $"io_uring benign late-completion total={lateCompletionCount}");
+ }
+
+ /// Logs an untrack operation mismatch.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringUntrackMismatch(ulong userData, long mismatchCount)
+ {
+ NetEventSource.Info(this, $"io_uring untrack mismatch: user_data=0x{userData:x}, count={mismatchCount}");
+ }
+
+ /// Logs the negotiated io_uring mode for this engine instance.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringModeSelection(LinuxIoUringCapabilities capabilities)
+ {
+ NetEventSource.Info(
+ this,
+ $"io_uring mode={capabilities.Mode}, is_io_uring_port={capabilities.IsIoUringPort}, supports_multishot_recv={capabilities.SupportsMultishotRecv}, supports_multishot_accept={capabilities.SupportsMultishotAccept}, zero_copy_send_enabled={capabilities.SupportsZeroCopySend}, supports_read_fixed={_supportsOpReadFixed}, supports_send_zc={_supportsOpSendZc}, supports_sendmsg_zc={_supportsOpSendMsgZc}, sqpoll_enabled={capabilities.SqPollEnabled}");
+ }
+
+ /// Logs active advanced io_uring features for this engine instance.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringAdvancedFeatureState()
+ {
+ int providedBufferSize = _ioUringProvidedBufferRing?.BufferSize ?? 0;
+ NetEventSource.Info(
+ this,
+ $"io_uring features: multishot_recv={_ioUringCapabilities.SupportsMultishotRecv}, multishot_accept={_ioUringCapabilities.SupportsMultishotAccept}, zero_copy_send_enabled={_ioUringCapabilities.SupportsZeroCopySend}, supports_read_fixed={_supportsOpReadFixed}, fixed_recv_active={_supportsOpReadFixed && _ioUringCapabilities.HasRegisteredBuffers}, supports_send_zc={_supportsOpSendZc}, supports_sendmsg_zc={_supportsOpSendMsgZc}, provided_buffers={_ioUringCapabilities.SupportsProvidedBufferRings}, registered_buffers={_ioUringCapabilities.HasRegisteredBuffers}, adaptive_buffer_sizing={_adaptiveBufferSizingEnabled}, sqpoll_enabled={_ioUringCapabilities.SqPollEnabled}, provided_buffer_size={providedBufferSize}");
+ }
+
+ /// Publishes prepare queue depth delta to telemetry and resets the counter.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ResetIoUringPrepareQueueDepthTelemetry()
+ {
+ long publishedDepth = Interlocked.Exchange(ref _ioUringPublishedPrepareQueueLength, 0);
+ if (publishedDepth != 0)
+ {
+ SocketsTelemetry.Log.IoUringPrepareQueueDepthDelta(-publishedDepth);
+ }
+ }
+
+ /// Increments a counter and logs a sample every 64 increments.
+ private void RecordIoUringCounterAndMaybeLog(ref long counter, ulong userData, string message)
+ {
+ long count = Interlocked.Increment(ref counter);
+ if ((count & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringCounterSample(message, count, userData);
+ }
+ }
+
+ /// Logs the teardown summary if any late completions were recorded.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void LogLinuxIoUringTeardownSummaryIfNeeded()
+ {
+ long lateCompletionCount = Interlocked.Read(ref _ioUringBenignLateCompletionCount);
+ if (lateCompletionCount > 0 && NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringTeardownSummary(lateCompletionCount);
+ }
+ }
+
+ /// Periodically polls native counters and publishes deltas to telemetry.
+ private void PollIoUringDiagnosticsIfNeeded(bool force)
+ {
+ if (!_ioUringCapabilities.IsIoUringPort)
+ {
+ return;
+ }
+
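+ // The countdown amortizes diagnostics publication: the common event-loop pass
+ // pays one decrement, and the full counter-delta publication runs once every
+ // IoUringDiagnosticsPollInterval iterations (or immediately when forced).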
+ if (!force)
+ {
+ int countdown = _ioUringDiagnosticsPollCountdown - 1;
+ _ioUringDiagnosticsPollCountdown = countdown;
+ if (countdown > 0)
+ {
+ return;
+ }
+ }
+
+ _ioUringDiagnosticsPollCountdown = IoUringDiagnosticsPollInterval;
+ PublishIoUringManagedDiagnosticsDelta();
+ if (!_ioUringAdvancedFeatureStateLogged && NetEventSource.Log.IsEnabled())
+ {
+ _ioUringAdvancedFeatureStateLogged = true;
+ LogIoUringAdvancedFeatureState();
+ }
+
+ if (!force)
+ {
+ EvaluateProvidedBufferRingResize();
+ }
+ }
+
+ /// Returns the non-negative delta between two counter snapshots; if the counter was reset (current < previous), the full current value is treated as the delta.
+ private static long ComputeManagedCounterDelta(long previous, long current) =>
+ current >= previous ? current - previous : current;
+
+ /// Publishes a managed counter delta from source to published baseline.
+ private static bool TryPublishManagedCounterDelta(
+ ref long sourceCounter,
+ ref long publishedCounter,
+ out long delta,
+ bool monotonic = true)
+ {
+ long current = Interlocked.Read(ref sourceCounter);
+ long previous = Interlocked.Exchange(ref publishedCounter, current);
+ delta = monotonic ? ComputeManagedCounterDelta(previous, current) : current - previous;
+ return delta != 0;
+ }
+
+ /// Computes and publishes this engine's non-pinnable fallback counter delta.
+ private bool TryPublishIoUringNonPinnablePrepareFallbackDelta(out long delta)
+ {
+ long current = Interlocked.Read(ref _ioUringNonPinnablePrepareFallbackCount);
+ long previous = Interlocked.Exchange(ref _ioUringPublishedNonPinnablePrepareFallbackCount, current);
+ delta = ComputeManagedCounterDelta(previous, current);
+ return delta != 0;
+ }
+
+ /// Publishes all managed diagnostic counter deltas to telemetry.
+ private void PublishIoUringManagedDiagnosticsDelta()
+ {
+ // Sample pending SEND_ZC NOTIF state directly from completion slots so
+ // reset/teardown paths that bypass normal completion dispatch still publish accurate gauge data.
+ SocketsTelemetry.Log.IoUringZeroCopyNotificationPendingSlots(CountZeroCopyNotificationPendingSlots());
+ if (TryPublishManagedCounterDelta(
+ ref _ioUringCompletionRequeueFailureCount,
+ ref _ioUringPublishedCompletionRequeueFailureCount,
+ out long requeueFailureDelta))
+ {
+ SocketsTelemetry.Log.IoUringCompletionRequeueFailure(requeueFailureDelta);
+ }
+
+ if (TryPublishIoUringNonPinnablePrepareFallbackDelta(out long nonPinnableFallbackDelta))
+ {
+ SocketsTelemetry.Log.IoUringPrepareNonPinnableFallback(nonPinnableFallbackDelta);
+ }
+
+ if (TryPublishManagedCounterDelta(
+ ref _ioUringPrepareQueueOverflowCount,
+ ref _ioUringPublishedPrepareQueueOverflowCount,
+ out long prepareQueueOverflowDelta))
+ {
+ SocketsTelemetry.Log.IoUringPrepareQueueOverflow(prepareQueueOverflowDelta);
+ }
+
+ if (TryPublishManagedCounterDelta(
+ ref _ioUringPrepareQueueOverflowFallbackCount,
+ ref _ioUringPublishedPrepareQueueOverflowFallbackCount,
+ out long prepareQueueOverflowFallbackDelta))
+ {
+ SocketsTelemetry.Log.IoUringPrepareQueueOverflowFallback(prepareQueueOverflowFallbackDelta);
+ }
+
+ if (TryPublishManagedCounterDelta(
+ ref _ioUringPrepareQueueLength,
+ ref _ioUringPublishedPrepareQueueLength,
+ out long prepareQueueDepthDelta,
+ monotonic: false))
+ {
+ SocketsTelemetry.Log.IoUringPrepareQueueDepthDelta(prepareQueueDepthDelta);
+ }
+
+ if (TryPublishManagedCounterDelta(
+ ref _ioUringCompletionSlotExhaustionCount,
+ ref _ioUringPublishedCompletionSlotExhaustionCount,
+ out long completionSlotExhaustionDelta))
+ {
+ SocketsTelemetry.Log.IoUringCompletionSlotExhaustion(completionSlotExhaustionDelta);
+ }
+
+ if (TryPublishManagedCounterDelta(
+ ref _ioUringCompletionSlotDrainRecoveryCount,
+ ref _ioUringPublishedCompletionSlotDrainRecoveryCount,
+ out long completionSlotDrainRecoveryDelta))
+ {
+ SocketsTelemetry.Log.IoUringCompletionSlotDrainRecovery(completionSlotDrainRecoveryDelta);
+ }
+ }
+
+ /// Counts completion slots currently waiting for SEND_ZC NOTIF CQEs.
+ private int CountZeroCopyNotificationPendingSlots()
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ return 0;
+ }
+
+ int pendingNotificationSlots = 0;
+ for (int i = 0; i < completionEntries.Length; i++)
+ {
+ ref IoUringCompletionSlot slot = ref completionEntries[i];
+ if (slot.IsZeroCopySend && slot.ZeroCopyNotificationPending)
+ {
+ pendingNotificationSlots++;
+ }
+ }
+
+ return pendingNotificationSlots;
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringRings.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringRings.Linux.cs
new file mode 100644
index 00000000000000..535807a55d6bfb
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringRings.Linux.cs
@@ -0,0 +1,366 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ /// <summary>
+ /// Maps the SQ ring, CQ ring, and SQE array into managed address space and derives
+ /// all ring pointers from the kernel-reported offsets. On failure, unmaps any
+ /// partially-mapped regions and closes the ring fd.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe bool TryMmapRings(ref IoUringSetupResult setup)
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ static bool IsOffsetInRange(ulong offset, ulong size, ulong mappedSize) =>
+ offset <= mappedSize && size <= mappedSize - offset;
+
+ ref Interop.Sys.IoUringParams p = ref setup.Params;
+ bool usesNoSqArray = (setup.NegotiatedFlags & IoUringConstants.SetupNoSqArray) != 0;
+ bool usesSqe128 = (setup.NegotiatedFlags & IoUringConstants.SetupSqe128) != 0;
+ uint negotiatedSqeSize = usesSqe128 ? 128u : (uint)sizeof(IoUringSqe);
+ if (negotiatedSqeSize != (uint)sizeof(IoUringSqe))
+ {
+ // Managed SQE writers currently mirror the 64-byte io_uring_sqe layout.
+ Interop.Sys.IoUringShimCloseFd(setup.RingFd);
+ return false;
+ }
+
+ // Compute ring sizes.
+ ulong sqRingSize = p.SqOff.Array;
+ if (!usesNoSqArray)
+ {
+ sqRingSize += p.SqEntries * (uint)sizeof(uint);
+ }
+ ulong cqRingSize = p.CqOff.Cqes + p.CqEntries * (uint)sizeof(Interop.Sys.IoUringCqe);
+ ulong sqesSize = p.SqEntries * negotiatedSqeSize;
+
+ // mmap SQ ring (and possibly CQ ring if SINGLE_MMAP).
+ bool usesSingleMmap = (p.Features & IoUringConstants.FeatureSingleMmap) != 0;
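+ // IORING_FEAT_SINGLE_MMAP (reported by the kernel since 5.4) lets the SQ and CQ
+ // rings share one mapping sized to the larger of the two; without it, each ring
+ // needs its own mmap at its dedicated offset.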
+
+ byte* sqRingPtr;
+ byte* cqRingPtr;
+
+ if (usesSingleMmap)
+ {
+ ulong ringSize = sqRingSize > cqRingSize ? sqRingSize : cqRingSize;
+ void* ptr;
+ Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, ringSize, IoUringConstants.OffSqRing, &ptr);
+ if (err != Interop.Error.SUCCESS)
+ {
+ Interop.Sys.IoUringShimCloseFd(setup.RingFd);
+ return false;
+ }
+ sqRingPtr = (byte*)ptr;
+ cqRingPtr = (byte*)ptr;
+ sqRingSize = ringSize;
+ cqRingSize = ringSize;
+ }
+ else
+ {
+ void* sqPtr;
+ Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, sqRingSize, IoUringConstants.OffSqRing, &sqPtr);
+ if (err != Interop.Error.SUCCESS)
+ {
+ Interop.Sys.IoUringShimCloseFd(setup.RingFd);
+ return false;
+ }
+ sqRingPtr = (byte*)sqPtr;
+
+ void* cqPtr;
+ err = Interop.Sys.IoUringShimMmap(setup.RingFd, cqRingSize, IoUringConstants.OffCqRing, &cqPtr);
+ if (err != Interop.Error.SUCCESS)
+ {
+ Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize);
+ Interop.Sys.IoUringShimCloseFd(setup.RingFd);
+ return false;
+ }
+ cqRingPtr = (byte*)cqPtr;
+ }
+
+ if (!IsOffsetInRange(p.SqOff.Head, sizeof(uint), sqRingSize) ||
+ !IsOffsetInRange(p.SqOff.Tail, sizeof(uint), sqRingSize) ||
+ !IsOffsetInRange(p.SqOff.RingMask, sizeof(uint), sqRingSize) ||
+ !IsOffsetInRange(p.SqOff.RingEntries, sizeof(uint), sqRingSize) ||
+ !IsOffsetInRange(p.SqOff.Flags, sizeof(uint), sqRingSize) ||
+ (!usesNoSqArray && !IsOffsetInRange(p.SqOff.Array, p.SqEntries * (uint)sizeof(uint), sqRingSize)) ||
+ !IsOffsetInRange(p.CqOff.Head, sizeof(uint), cqRingSize) ||
+ !IsOffsetInRange(p.CqOff.Tail, sizeof(uint), cqRingSize) ||
+ !IsOffsetInRange(p.CqOff.RingMask, sizeof(uint), cqRingSize) ||
+ !IsOffsetInRange(p.CqOff.RingEntries, sizeof(uint), cqRingSize) ||
+ !IsOffsetInRange(p.CqOff.Overflow, sizeof(uint), cqRingSize) ||
+ !IsOffsetInRange(p.CqOff.Cqes, p.CqEntries * (uint)sizeof(Interop.Sys.IoUringCqe), cqRingSize))
+ {
+ if (!usesSingleMmap)
+ {
+ Interop.Sys.IoUringShimMunmap(cqRingPtr, cqRingSize);
+ }
+
+ Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize);
+ Interop.Sys.IoUringShimCloseFd(setup.RingFd);
+ return false;
+ }
+
+ // mmap SQE array.
+ void* sqePtr;
+ {
+ Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, sqesSize, IoUringConstants.OffSqes, &sqePtr);
+ if (err != Interop.Error.SUCCESS)
+ {
+ if (!usesSingleMmap)
+ Interop.Sys.IoUringShimMunmap(cqRingPtr, cqRingSize);
+ Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize);
+ Interop.Sys.IoUringShimCloseFd(setup.RingFd);
+ return false;
+ }
+ }
+
+ // Derive SQ pointers and populate existing _ioUringSqRingInfo for compatibility.
+ _ioUringSqRingInfo.SqeBase = (IntPtr)sqePtr;
+ _ioUringSqRingInfo.SqTailPtr = (IntPtr)(sqRingPtr + p.SqOff.Tail);
+ _ioUringSqRingInfo.SqHeadPtr = (IntPtr)(sqRingPtr + p.SqOff.Head);
+ _ioUringSqRingInfo.SqMask = *(uint*)(sqRingPtr + p.SqOff.RingMask);
+ _ioUringSqRingInfo.SqEntries = *(uint*)(sqRingPtr + p.SqOff.RingEntries);
+ _ioUringSqRingInfo.SqeSize = negotiatedSqeSize;
+ _ioUringSqRingInfo.UsesNoSqArray = usesNoSqArray ? (byte)1 : (byte)0;
+ _ioUringSqRingInfo.RingFd = setup.RingFd;
+ _ioUringSqRingInfo.UsesEnterExtArg = setup.UsesExtArg ? (byte)1 : (byte)0;
+ _managedSqFlagsPtr = (uint*)(sqRingPtr + p.SqOff.Flags);
+
+ // Initialize SQ array identity mapping if NO_SQARRAY is not active.
+ if (!usesNoSqArray)
+ {
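+ // Without IORING_SETUP_NO_SQARRAY the kernel consumes SQE indexes through this
+ // indirection array; writing the identity mapping once lets submission publish
+ // SQE slot (tail & mask) without ever touching the array again.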
+ uint* sqArray = (uint*)(sqRingPtr + p.SqOff.Array);
+ for (uint i = 0; i < p.SqEntries; i++)
+ {
+ sqArray[i] = i;
+ }
+ }
+
+ // Derive CQ pointers.
+ _managedCqeBase = (Interop.Sys.IoUringCqe*)(cqRingPtr + p.CqOff.Cqes);
+ _managedCqTailPtr = (uint*)(cqRingPtr + p.CqOff.Tail);
+ _managedCqHeadPtr = (uint*)(cqRingPtr + p.CqOff.Head);
+ _managedCqMask = *(uint*)(cqRingPtr + p.CqOff.RingMask);
+ _managedCqEntries = *(uint*)(cqRingPtr + p.CqOff.RingEntries);
+ _managedCqOverflowPtr = (uint*)(cqRingPtr + p.CqOff.Overflow);
+
+ Debug.Assert(
+ BitOperations.IsPow2(_ioUringSqRingInfo.SqEntries),
+ $"Kernel-reported SQ entries must be power-of-two. sq_entries={_ioUringSqRingInfo.SqEntries}");
+ Debug.Assert(
+ BitOperations.IsPow2(_managedCqEntries),
+ $"Kernel-reported CQ entries must be power-of-two. cq_entries={_managedCqEntries}");
+ Debug.Assert(
+ _ioUringSqRingInfo.SqMask == _ioUringSqRingInfo.SqEntries - 1,
+ $"Unexpected SQ mask/entries contract: sq_mask={_ioUringSqRingInfo.SqMask}, sq_entries={_ioUringSqRingInfo.SqEntries}");
+ Debug.Assert(
+ _managedCqMask == _managedCqEntries - 1,
+ $"Unexpected CQ mask/entries contract: cq_mask={_managedCqMask}, cq_entries={_managedCqEntries}");
+
+ _managedObservedCqOverflow = Volatile.Read(ref *_managedCqOverflowPtr);
+ _cqOverflowRecoveryActive = false;
+ _cqOverflowRecoveryBranch = default;
+
+ // Store ring region info for teardown.
+ _managedSqRingPtr = sqRingPtr;
+ _managedCqRingPtr = cqRingPtr;
+ _managedSqRingSize = sqRingSize;
+ _managedCqRingSize = cqRingSize;
+ _managedSqesSize = sqesSize;
+ _managedUsesSingleMmap = usesSingleMmap;
+ _managedRingFd = setup.RingFd;
+ _managedUsesExtArg = setup.UsesExtArg;
+ _managedUsesNoSqArray = usesNoSqArray;
+ _managedNegotiatedFlags = setup.NegotiatedFlags;
+ _managedSqeInvariantsValidated = ValidateManagedSqeInitializationInvariants();
+ if (!_managedSqeInvariantsValidated)
+ {
+ CleanupManagedRings();
+ return false;
+ }
+
+ return true;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe void CleanupManagedRings()
+ {
+ _managedCqDrainEnabled = false;
+
+ byte* sqRingPtr = _managedSqRingPtr;
+ byte* cqRingPtr = _managedCqRingPtr;
+ ulong sqRingSize = _managedSqRingSize;
+ ulong cqRingSize = _managedCqRingSize;
+ ulong sqesSize = _managedSqesSize;
+ bool usesSingleMmap = _managedUsesSingleMmap;
+ void* sqeBase = _ioUringSqRingInfo.SqeBase.ToPointer();
+
+ // Clear all mmap-derived pointers before unmapping so any late reads fail safely.
+ _managedSqFlagsPtr = null;
+ _managedCqeBase = null;
+ _managedCqTailPtr = null;
+ _managedCqHeadPtr = null;
+ _managedCqOverflowPtr = null;
+ _managedSqRingPtr = null;
+ _managedCqRingPtr = null;
+ _managedSqRingSize = 0;
+ _managedCqRingSize = 0;
+ _managedSqesSize = 0;
+ _managedCqMask = 0;
+ _managedCqEntries = 0;
+ _managedCachedCqHead = 0;
+ _managedObservedCqOverflow = 0;
+ _ioUringSqRingInfo = default;
+ _managedSqeInvariantsValidated = false;
+
+ if (sqRingPtr != null)
+ {
+ // Unmap SQEs first
+ if (sqesSize > 0 && sqeBase != null)
+ {
+ Interop.Sys.IoUringShimMunmap(sqeBase, sqesSize);
+ }
+ // Unmap CQ ring (only if separate from SQ ring)
+ if (!usesSingleMmap && cqRingPtr != null && cqRingPtr != sqRingPtr)
+ {
+ Interop.Sys.IoUringShimMunmap(cqRingPtr, cqRingSize);
+ }
+ // Unmap SQ ring
+ Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize);
+ }
+ if (_managedRingFd >= 0)
+ {
+ Interop.Sys.IoUringShimCloseFd(_managedRingFd);
+ _managedRingFd = -1;
+ }
+ }
+
+ /// Unmaps rings and closes the ring fd.
+ partial void LinuxFreeIoUringResources()
+ {
+ // Managed io_uring teardown: release resources allocated during TryInitializeManagedIoUring.
+ // This must run BEFORE the common slot/buffer cleanup below because kernel
+ // unregister operations need the ring fd to still be open.
+ if (_ioUringInitialized)
+ {
+ // 0. Unregister/dispose provided buffer ring while the main ring fd is still open.
+ FreeIoUringProvidedBufferRing();
+
+ // 1. The registered ring fd is implicitly released when the ring fd is closed.
+ // Just mark it as inactive so no subsequent code attempts to use it.
+ _ioUringSqRingInfo.RegisteredRingFd = -1;
+
+ // 2. Close the wakeup eventfd.
+ if (_managedWakeupEventFd >= 0)
+ {
+ Interop.Sys.IoUringShimCloseFd(_managedWakeupEventFd);
+ _managedWakeupEventFd = -1;
+ }
+
+ // 3. Unmap SQ/CQ rings, SQEs and close the ring fd.
+ // Closing the ring fd also terminates any kernel SQPOLL thread for this ring.
+ CleanupManagedRings();
+
+ // 4. Disable managed flags to prevent any late operations.
+ _ioUringInitialized = false;
+ _managedCqDrainEnabled = false;
+ }
+
+ bool portClosedForTeardown = Volatile.Read(ref _ioUringPortClosedForTeardown) != 0;
+ if (!portClosedForTeardown)
+ {
+ PollIoUringDiagnosticsIfNeeded(force: true);
+ }
+
+ // Second drain intentionally catches any items enqueued after LinuxBeforeFreeNativeResources
+ // published teardown but before native port closure became globally visible.
+ DrainQueuedIoUringOperationsForTeardown();
+
+ if (_completionSlots is not null)
+ {
+ DrainTrackedIoUringOperationsForTeardown(portClosedForTeardown);
+ Debug.Assert(IsIoUringTrackingEmpty(), $"Leaked tracked io_uring operations: {Volatile.Read(ref _trackedIoUringOperationCount)}");
+
+ // Free any native memory still held by completion slots
+ for (int i = 0; i < _completionSlots.Length; i++)
+ {
+ ref IoUringCompletionSlot slot = ref _completionSlots[i];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![i];
+ if (slot.IsZeroCopySend && slot.ZeroCopyNotificationPending)
+ {
+ // Ring teardown can drop in-flight NOTIF CQEs; clear pending SEND_ZC state
+ // so teardown cannot leave slots/pin-holds logically waiting forever.
+ slot.IsZeroCopySend = false;
+ slot.ZeroCopyNotificationPending = false;
+ }
+
+ ReleaseZeroCopyPinHold(i);
+ if (slot.Kind == IoUringCompletionOperationKind.Message)
+ {
+ FreeMessageStorage(i);
+ }
+ else if (slot.Kind == IoUringCompletionOperationKind.Accept && slotStorage.NativeSocketAddressLengthPtr != null)
+ {
+ *slotStorage.NativeSocketAddressLengthPtr = 0;
+ }
+
+ // Clear all pointers that alias _completionSlotNativeStorage before freeing it.
+ slotStorage.NativeInlineStorage = null;
+ slotStorage.NativeSocketAddressLengthPtr = null;
+ slotStorage.NativeMsgHdrPtr = IntPtr.Zero;
+ slotStorage.MessageIsReceive = false;
+ slotStorage.NativeIOVectors = null;
+ slotStorage.NativeSocketAddress = null;
+ slotStorage.NativeControlBuffer = null;
+ slotStorage.ReceiveOutputSocketAddress = null;
+ slotStorage.ReceiveOutputControlBuffer = null;
+ slotStorage.ReceiveSocketAddressCapacity = 0;
+ slotStorage.ReceiveControlBufferCapacity = 0;
+ }
+
+ _completionSlots = null;
+ _trackedOperations = null;
+ _completionSlotStorage = null;
+ _trackedIoUringOperationCount = 0;
+ _zeroCopyPinHolds = null;
+ _completionSlotFreeListHead = -1;
+ _completionSlotsInUse = 0;
+ _liveAcceptCompletionSlotCount = 0;
+
+ _ioUringSlotCapacity = 0;
+ _cqOverflowRecoveryActive = false;
+ _cqOverflowRecoveryBranch = default;
+ _ioUringManagedPendingSubmissions = 0;
+ _ioUringManagedSqTail = 0;
+ _ioUringManagedSqTailLoaded = false;
+ _ioUringSqRingInfo = default;
+ _ioUringDirectSqeEnabled = false;
+ _sqPollEnabled = false;
+
+ LogLinuxIoUringTeardownSummaryIfNeeded();
+ }
+
+ if (_completionSlotNativeStorage != null)
+ {
+ NativeMemory.Free(_completionSlotNativeStorage);
+ _completionSlotNativeStorage = null;
+ _completionSlotNativeStorageStride = 0;
+ }
+
+ ResetIoUringPrepareQueueDepthTelemetry();
+
+ // Final flush of managed io_uring deltas in case teardown modified counters
+ // after the forced diagnostics poll and no further event-loop iteration runs.
+ PublishIoUringManagedDiagnosticsDelta();
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSlots.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSlots.Linux.cs
new file mode 100644
index 00000000000000..164c67891c375a
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSlots.Linux.cs
@@ -0,0 +1,461 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.ExceptionServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+using Microsoft.Win32.SafeHandles;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe nuint GetCompletionSlotNativeStorageStride()
+ {
+ nuint iovSize = (nuint)IoUringConstants.MessageInlineIovCount * (nuint)sizeof(Interop.Sys.IOVector);
+ return (nuint)sizeof(NativeMsghdr) +
+ iovSize +
+ (nuint)IoUringConstants.MessageInlineSocketAddressCapacity +
+ (nuint)IoUringConstants.MessageInlineControlBufferCapacity +
+ (nuint)sizeof(int);
+ }
+
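+ // Per-slot native storage is one contiguous block carved into, in order:
+ // [msghdr][iovec x MessageInlineIovCount][socket address][control buffer][int address length],
+ // matching the carving order in InitializeCompletionSlotNativeStorage below.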
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void InitializeCompletionSlotNativeStorage(
+ ref IoUringCompletionSlotStorage slotStorage,
+ byte* slotStorageBase)
+ {
+ slotStorage.NativeInlineStorage = slotStorageBase;
+ slotStorage.NativeMsgHdrPtr = (IntPtr)slotStorageBase;
+
+ byte* cursor = slotStorageBase + sizeof(NativeMsghdr);
+ slotStorage.NativeIOVectors = (Interop.Sys.IOVector*)cursor;
+ cursor += IoUringConstants.MessageInlineIovCount * sizeof(Interop.Sys.IOVector);
+ slotStorage.NativeSocketAddress = cursor;
+ cursor += IoUringConstants.MessageInlineSocketAddressCapacity;
+ slotStorage.NativeControlBuffer = cursor;
+ cursor += IoUringConstants.MessageInlineControlBufferCapacity;
+ slotStorage.NativeSocketAddressLengthPtr = (int*)cursor;
+
+ slotStorage.MessageIsReceive = false;
+ slotStorage.ReceiveOutputSocketAddress = null;
+ slotStorage.ReceiveOutputControlBuffer = null;
+ slotStorage.ReceiveSocketAddressCapacity = 0;
+ slotStorage.ReceiveControlBufferCapacity = 0;
+ }
+
+ /// Allocates SoA completion slot arrays and initializes the free list.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void InitializeCompletionSlotPool(int capacity)
+ {
+ Debug.Assert(
+ (ulong)capacity <= IoUringConstants.SlotIndexMask + 1UL,
+ $"Completion slot capacity {capacity} exceeds encodable slot index range {IoUringConstants.SlotIndexMask + 1UL}.");
+ Debug.Assert(
+ Unsafe.SizeOf<IoUringCompletionSlot>() == 24,
+ $"IoUringCompletionSlot size drifted: expected 24, got {Unsafe.SizeOf<IoUringCompletionSlot>()}.");
+ _completionSlots = new IoUringCompletionSlot[capacity];
+ _trackedOperations = new IoUringTrackedOperationState[capacity];
+ _completionSlotStorage = new IoUringCompletionSlotStorage[capacity];
+ _zeroCopyPinHolds = new System.Buffers.MemoryHandle[capacity];
+ _completionSlotNativeStorageStride = GetCompletionSlotNativeStorageStride();
+ Debug.Assert(
+ _completionSlotNativeStorageStride <= int.MaxValue,
+ $"Completion slot native storage stride overflow: {_completionSlotNativeStorageStride}.");
+ if (_completionSlotNativeStorageStride > int.MaxValue)
+ {
+ // FailFast-adjacent site: impossible stride overflow indicates corrupted
+ // layout assumptions during engine initialization, so keep the hard failure.
+ ThrowInternalException(Interop.Error.EOVERFLOW);
+ }
+
+ _completionSlotNativeStorage = (byte*)NativeMemory.AllocZeroed((nuint)capacity * _completionSlotNativeStorageStride);
+ // Build free list linking all slots
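+ // Intrusive singly linked free list: slot i links to slot i + 1, the final slot
+ // terminates with -1, and the head starts at slot 0, giving O(1) allocate/free
+ // with no separate queue allocation.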
+ for (int i = 0; i < capacity - 1; i++)
+ {
+ _completionSlots[i].Generation = 1;
+ _completionSlots[i].FreeListNext = i + 1;
+ InitializeCompletionSlotNativeStorage(
+ ref _completionSlotStorage[i],
+ _completionSlotNativeStorage + ((nuint)i * _completionSlotNativeStorageStride));
+ }
+ _completionSlots[capacity - 1].Generation = 1;
+ _completionSlots[capacity - 1].FreeListNext = -1;
+ InitializeCompletionSlotNativeStorage(
+ ref _completionSlotStorage[capacity - 1],
+ _completionSlotNativeStorage + ((nuint)(capacity - 1) * _completionSlotNativeStorageStride));
+ _completionSlotFreeListHead = 0;
+ _completionSlotsInUse = 0;
+ _completionSlotsHighWaterMark = 0;
+ _liveAcceptCompletionSlotCount = 0;
+ _trackedIoUringOperationCount = 0;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void SetCompletionSlotKind(ref IoUringCompletionSlot slot, IoUringCompletionOperationKind kind)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "SetCompletionSlotKind must run on the event-loop thread.");
+ IoUringCompletionOperationKind previousKind = slot.Kind;
+ if (previousKind == kind)
+ {
+ return;
+ }
+
+ slot.Kind = kind;
+ if (previousKind == IoUringCompletionOperationKind.Accept ||
+ kind == IoUringCompletionOperationKind.Accept)
+ {
+ int liveAcceptCount = _liveAcceptCompletionSlotCount;
+ if (previousKind == IoUringCompletionOperationKind.Accept)
+ {
+ liveAcceptCount--;
+ }
+
+ if (kind == IoUringCompletionOperationKind.Accept)
+ {
+ liveAcceptCount++;
+ }
+
+ Debug.Assert(liveAcceptCount >= 0);
+ Volatile.Write(ref _liveAcceptCompletionSlotCount, liveAcceptCount);
+ }
+ }
+
+ /// <summary>
+ /// Allocates a completion slot from the free list. Returns the slot index,
+ /// or -1 if the pool is exhausted (backpressure signal).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private int AllocateCompletionSlot()
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "AllocateCompletionSlot must run on the event-loop thread.");
+ Debug.Assert(_completionSlots is not null);
+ int index = _completionSlotFreeListHead;
+ if (index < 0)
+ return -1; // Pool exhausted
+
+ ref IoUringCompletionSlot slot = ref _completionSlots![index];
+ // Slot state is reset in FreeCompletionSlot; keep allocation to free-list bookkeeping only.
+ _completionSlotFreeListHead = slot.FreeListNext;
+ slot.FreeListNext = -1;
+ int inUse = ++_completionSlotsInUse;
+ if (inUse > _completionSlotsHighWaterMark)
+ {
+ _completionSlotsHighWaterMark = inUse;
+ SocketsTelemetry.Log.IoUringCompletionSlotHighWaterMark(inUse);
+ }
+ return index;
+ }
+
+ /// <summary>
+ /// Returns a completion slot to the free list, incrementing its generation
+ /// to invalidate any stale user_data references.
+ /// </summary>
+ private unsafe void FreeCompletionSlot(int index)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "FreeCompletionSlot must run on the event-loop thread.");
+ Debug.Assert(index >= 0 && index < _completionSlots!.Length);
+
+ ReleaseZeroCopyPinHold(index);
+ ref IoUringCompletionSlot slot = ref _completionSlots![index];
+ ref IoUringTrackedOperationState trackedState = ref _trackedOperations![index];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![index];
+ Debug.Assert(
+ Volatile.Read(ref trackedState.TrackedOperation) is null,
+ "Completion slot should not be freed while a tracked io_uring operation is still attached.");
+
+ SafeSocketHandle? dangerousRefSocketHandle = slotStorage.DangerousRefSocketHandle;
+ ExceptionDispatchInfo? dangerousReleaseException = null;
+ try
+ {
+ if (dangerousRefSocketHandle is not null)
+ {
+ slotStorage.DangerousRefSocketHandle = null;
+ dangerousRefSocketHandle.DangerousRelease();
+ }
+ }
+ catch (Exception ex)
+ {
+ dangerousReleaseException = ExceptionDispatchInfo.Capture(ex);
+ }
+ finally
+ {
+ if (slot.UsesFixedRecvBuffer)
+ {
+ IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing;
+ if (providedBufferRing is not null)
+ {
+ providedBufferRing.TryRecycleBufferFromCompletion(slot.FixedRecvBufferId);
+ }
+ }
+
+ // Free any native message storage
+ if (slot.Kind == IoUringCompletionOperationKind.Message)
+ {
+ FreeMessageStorage(index);
+ }
+ else if (slot.Kind == IoUringCompletionOperationKind.Accept)
+ {
+ if (slotStorage.NativeSocketAddressLengthPtr != null)
+ {
+ *slotStorage.NativeSocketAddressLengthPtr = 0;
+ }
+ }
+
+ slot.Generation = (slot.Generation + 1UL) & IoUringConstants.GenerationMask;
+ if (slot.Generation == 0)
+ {
+ slot.Generation = 1;
+ }
+ SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.None);
+ ResetDebugTestForcedResult(ref slot);
+ slot.IsZeroCopySend = false;
+ slot.ZeroCopyNotificationPending = false;
+ slot.UsesFixedRecvBuffer = false;
+ slot.FixedRecvBufferId = 0;
+ Volatile.Write(ref trackedState.TrackedOperation, null);
+ trackedState.TrackedOperationGeneration = 0;
+ slot.FreeListNext = _completionSlotFreeListHead;
+ _completionSlotFreeListHead = index;
+ _completionSlotsInUse--;
+ }
+
+ dangerousReleaseException?.Throw();
+ }
+
+ /// Disposes a retained zero-copy pin-hold for the specified completion slot.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ReleaseZeroCopyPinHold(int slotIndex)
+ {
+ System.Buffers.MemoryHandle[]? pinHolds = _zeroCopyPinHolds;
+ if (pinHolds is null || (uint)slotIndex >= (uint)pinHolds.Length)
+ {
+ return;
+ }
+
+ pinHolds[slotIndex].Dispose();
+ pinHolds[slotIndex] = default;
+ }
+
+ /// Transfers operation-owned pin state into the engine's zero-copy pin-hold table.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void TransferIoUringZeroCopyPinHold(ulong userData, System.Buffers.MemoryHandle pinHold)
+ {
+ System.Buffers.MemoryHandle[]? pinHolds = _zeroCopyPinHolds;
+ if (pinHolds is null)
+ {
+ pinHold.Dispose();
+ Debug.Fail("Zero-copy pin-hold table is unavailable while transferring pin ownership.");
+ return;
+ }
+
+ int slotIndex = DecodeCompletionSlotIndex(userData & IoUringUserDataPayloadMask);
+ if ((uint)slotIndex >= (uint)pinHolds.Length)
+ {
+ pinHold.Dispose();
+ Debug.Fail($"Invalid completion slot index while transferring zero-copy pin hold: {slotIndex}.");
+ return;
+ }
+
+ Debug.Assert(_completionSlots is not null);
+ ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
+ if (!slot.IsZeroCopySend)
+ {
+ pinHold.Dispose();
+ Debug.Fail("Zero-copy pin hold transfer requested for a non-zero-copy completion slot.");
+ return;
+ }
+
+ pinHolds[slotIndex].Dispose();
+ pinHolds[slotIndex] = pinHold;
+ }
+
+ /// <summary>
+ /// Prepares pre-allocated per-slot native message storage for sendmsg/recvmsg.
+ /// Returns false when the header shape exceeds inline capacities so callers can fall back.
+ /// </summary>
+ private unsafe bool TryPrepareInlineMessageStorage(int slotIndex, Interop.Sys.MessageHeader* messageHeader, bool isReceive)
+ {
+ Debug.Assert(sizeof(NativeMsghdr) == 56, $"NativeMsghdr size mismatch with kernel struct msghdr: expected 56, got {sizeof(NativeMsghdr)}");
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+
+ int iovCount = messageHeader->IOVectorCount;
+ int sockAddrLen = messageHeader->SocketAddressLen;
+ int controlBufLen = messageHeader->ControlBufferLen;
+ Debug.Assert(iovCount >= 0, $"Expected non-negative iovCount, got {iovCount}");
+ Debug.Assert(sockAddrLen >= 0, $"Expected non-negative socket address length, got {sockAddrLen}");
+ Debug.Assert(controlBufLen >= 0, $"Expected non-negative control buffer length, got {controlBufLen}");
+
+ if ((uint)iovCount > IoUringConstants.MessageInlineIovCount ||
+ (uint)sockAddrLen > IoUringConstants.MessageInlineSocketAddressCapacity ||
+ (uint)controlBufLen > IoUringConstants.MessageInlineControlBufferCapacity)
+ {
+ return false;
+ }
+
+ if (slotStorage.NativeInlineStorage == null)
+ {
+ return false;
+ }
+
+ if ((iovCount > 0 && messageHeader->IOVectors == null) ||
+ (sockAddrLen > 0 && messageHeader->SocketAddress == null) ||
+ (controlBufLen > 0 && messageHeader->ControlBuffer == null))
+ {
+ return false;
+ }
+
+ // Most of the inline slab is overwritten immediately; clear only msghdr header state.
+ new Span<byte>(slotStorage.NativeMsgHdrPtr.ToPointer(), sizeof(NativeMsghdr)).Clear();
+
+ NativeMsghdr* hdr = (NativeMsghdr*)slotStorage.NativeMsgHdrPtr;
+ Interop.Sys.IOVector* iovDst = slotStorage.NativeIOVectors;
+ byte* sockAddrDst = slotStorage.NativeSocketAddress;
+ byte* controlBufDst = slotStorage.NativeControlBuffer;
+
+ if (iovCount > 0)
+ {
+ nuint iovBytes = (nuint)iovCount * (nuint)sizeof(Interop.Sys.IOVector);
+ Buffer.MemoryCopy(
+ messageHeader->IOVectors,
+ iovDst,
+ (nuint)IoUringConstants.MessageInlineIovCount * (nuint)sizeof(Interop.Sys.IOVector),
+ iovBytes);
+ }
+
+ if (!isReceive)
+ {
+ if (sockAddrLen > 0)
+ {
+ Buffer.MemoryCopy(
+ messageHeader->SocketAddress,
+ sockAddrDst,
+ (nuint)IoUringConstants.MessageInlineSocketAddressCapacity,
+ (nuint)sockAddrLen);
+ }
+
+ if (controlBufLen > 0)
+ {
+ Buffer.MemoryCopy(
+ messageHeader->ControlBuffer,
+ controlBufDst,
+ (nuint)IoUringConstants.MessageInlineControlBufferCapacity,
+ (nuint)controlBufLen);
+ }
+ }
+
+ hdr->MsgName = sockAddrLen > 0 ? sockAddrDst : null;
+ hdr->MsgNameLen = (uint)sockAddrLen;
+ hdr->MsgIov = iovCount > 0 ? iovDst : null;
+ hdr->MsgIovLen = (nuint)iovCount;
+ hdr->MsgControl = controlBufLen > 0 ? controlBufDst : null;
+ hdr->MsgControlLen = (nuint)controlBufLen;
+ hdr->MsgFlags = 0;
+
+ if (isReceive)
+ {
+ slotStorage.ReceiveOutputSocketAddress = messageHeader->SocketAddress;
+ slotStorage.ReceiveOutputControlBuffer = messageHeader->ControlBuffer;
+ slotStorage.ReceiveSocketAddressCapacity = sockAddrLen;
+ slotStorage.ReceiveControlBufferCapacity = controlBufLen;
+ }
+ else
+ {
+ slotStorage.ReceiveOutputSocketAddress = null;
+ slotStorage.ReceiveOutputControlBuffer = null;
+ slotStorage.ReceiveSocketAddressCapacity = 0;
+ slotStorage.ReceiveControlBufferCapacity = 0;
+ }
+
+ slotStorage.MessageIsReceive = isReceive;
+ return true;
+ }
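+ // Example of the fallback contract: a recvmsg shaped with 6 IO vectors exceeds
+ // MessageInlineIovCount (4), so this method returns false and the caller must
+ // use its non-inline message storage path instead.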
+
+ /// Resets inline message metadata on the completion slot.
+ private unsafe void FreeMessageStorage(int slotIndex)
+ {
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+ // Slot inline storage is cleared on prepare before each reuse; avoid a second full memset on free.
+
+ slotStorage.ReceiveOutputSocketAddress = null;
+ slotStorage.ReceiveOutputControlBuffer = null;
+ slotStorage.ReceiveSocketAddressCapacity = 0;
+ slotStorage.ReceiveControlBufferCapacity = 0;
+ slotStorage.MessageIsReceive = false;
+ }
+
+ /// <summary>
+ /// After a recvmsg CQE completes, copies the kernel-written socket address and
+ /// control buffer data from the native msghdr back to the managed MessageHeader's
+ /// output buffers. For sendmsg completions this is a no-op.
+ /// Returns the actual socket address length, control buffer length, and msg_flags written by the kernel.
+ /// </summary>
+ private unsafe void CopyMessageCompletionOutputs(
+ int slotIndex,
+ out int socketAddressLen,
+ out int controlBufferLen,
+ out uint messageFlags)
+ {
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+ socketAddressLen = 0;
+ controlBufferLen = 0;
+ messageFlags = 0;
+
+ if (!slotStorage.MessageIsReceive)
+ return;
+
+ NativeMsghdr* hdr = (NativeMsghdr*)slotStorage.NativeMsgHdrPtr;
+ if (hdr == null)
+ return;
+
+ socketAddressLen = (int)hdr->MsgNameLen;
+ controlBufferLen = (int)hdr->MsgControlLen;
+ messageFlags = (uint)hdr->MsgFlags;
+
+ // Copy socket address from native buffer back to managed output buffer
+ if (slotStorage.ReceiveOutputSocketAddress != null && slotStorage.NativeSocketAddress != null &&
+ slotStorage.ReceiveSocketAddressCapacity > 0 && socketAddressLen > 0)
+ {
+ int copyLen = Math.Min(slotStorage.ReceiveSocketAddressCapacity, socketAddressLen);
+ Buffer.MemoryCopy(slotStorage.NativeSocketAddress, slotStorage.ReceiveOutputSocketAddress, copyLen, copyLen);
+ }
+
+ // Copy control buffer from native buffer back to managed output buffer
+ if (slotStorage.ReceiveOutputControlBuffer != null && slotStorage.NativeControlBuffer != null &&
+ slotStorage.ReceiveControlBufferCapacity > 0 && controlBufferLen > 0)
+ {
+ int copyLen = Math.Min(slotStorage.ReceiveControlBufferCapacity, controlBufferLen);
+ Buffer.MemoryCopy(slotStorage.NativeControlBuffer, slotStorage.ReceiveOutputControlBuffer, copyLen, copyLen);
+ }
+ }
+
+ /// <summary>
+ /// Decodes a completion slot index from a user_data payload value.
+ /// The slot index is encoded in the lower bits of the payload.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static int DecodeCompletionSlotIndex(ulong payload)
+ {
+ return (int)(payload & IoUringConstants.SlotIndexMask);
+ }
+
+ /// <summary>
+ /// Encodes a completion slot index and generation into a user_data value
+ /// with the ReservedCompletion tag.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static ulong EncodeCompletionSlotUserData(int slotIndex, ulong generation)
+ {
+ ulong payload = ((ulong)(generation & IoUringConstants.GenerationMask) << IoUringConstants.SlotIndexBits) | ((ulong)slotIndex & IoUringConstants.SlotIndexMask);
+ return EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
+ }
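+ // Worked example (illustrative values): slotIndex = 5, generation = 7 gives
+ // payload = (7UL << 13) | 5 = 0xE005; EncodeIoUringUserData then prepends the
+ // ReservedCompletion tag, yielding (2UL << 56) | 0xE005 = 0x0200_0000_0000_E005.
+ // DecodeCompletionSlotIndex(0xE005) recovers 5 by masking the low 13 bits.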
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSqeWriters.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSqeWriters.Linux.cs
new file mode 100644
index 00000000000000..6a3bbad9a3b883
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSqeWriters.Linux.cs
@@ -0,0 +1,249 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ /// Converts SocketFlags to the kernel msg_flags representation for io_uring.
+ private static bool TryConvertIoUringPrepareSocketFlags(SocketFlags flags, out uint rwFlags)
+ {
+ const SocketFlags SupportedIoUringFlags =
+ SocketFlags.OutOfBand |
+ SocketFlags.Peek |
+ SocketFlags.DontRoute;
+
+ if ((flags & ~SupportedIoUringFlags) != 0)
+ {
+ rwFlags = 0;
+ return false;
+ }
+
+ rwFlags = (uint)(int)flags;
+ return true;
+ }
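+ // The cast in TryConvertIoUringPrepareSocketFlags is value-preserving because the
+ // supported managed flags coincide numerically with the kernel constants:
+ // SocketFlags.OutOfBand (0x1) == MSG_OOB, SocketFlags.Peek (0x2) == MSG_PEEK,
+ // SocketFlags.DontRoute (0x4) == MSG_DONTROUTE.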
+
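+ /// Writes a send-like (SEND/SEND_ZC) SQE to the submission ring entry.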
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void WriteSendLikeSqe(
+ IoUringSqe* sqe,
+ byte opcode,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ byte* buffer,
+ uint length,
+ uint rwFlags)
+ {
+ sqe->Opcode = opcode;
+ sqe->Fd = sqeFd;
+ sqe->Flags = sqeFlags;
+ sqe->Addr = (ulong)(nuint)buffer;
+ sqe->Len = length;
+ sqe->RwFlags = rwFlags;
+ sqe->UserData = userData;
+ }
+
+ /// Writes a recv SQE to the submission ring entry.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void WriteRecvSqe(
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ byte* buffer,
+ uint length,
+ uint rwFlags)
+ {
+ sqe->Opcode = IoUringOpcodes.Recv;
+ sqe->Fd = sqeFd;
+ sqe->Flags = sqeFlags;
+ sqe->Ioprio = 0;
+ sqe->Addr = (ulong)(nuint)buffer;
+ sqe->Len = length;
+ sqe->RwFlags = rwFlags;
+ sqe->BufIndex = 0;
+ sqe->UserData = userData;
+ }
+
+ /// Writes a read-fixed SQE for registered-buffer receive.
+ private static unsafe void WriteReadFixedSqe(
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ byte* buffer,
+ uint length,
+ ushort bufferIndex)
+ {
+ sqe->Opcode = IoUringOpcodes.ReadFixed;
+ sqe->Fd = sqeFd;
+ sqe->Flags = sqeFlags;
+ sqe->Ioprio = 0;
+ sqe->Addr = (ulong)(nuint)buffer;
+ sqe->Len = length;
+ // For non-seekable sockets, offset is ignored; -1 matches "current position" semantics.
+ sqe->Off = ulong.MaxValue;
+ sqe->RwFlags = 0;
+ sqe->BufIndex = bufferIndex;
+ sqe->UserData = userData;
+ }
+
+ /// <summary>
+ /// Writes a one-shot recv SQE using provided-buffer selection.
+ /// The kernel chooses a buffer from the specified buffer group.
+ /// </summary>
+ private static void WriteProvidedBufferRecvSqe(
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ uint requestedLength,
+ uint rwFlags,
+ ushort bufferGroupId)
+ {
+ sqe->Opcode = IoUringOpcodes.Recv;
+ sqe->Fd = sqeFd;
+ sqe->Flags = (byte)(sqeFlags | IoUringConstants.SqeBufferSelect);
+ sqe->Ioprio = 0;
+ sqe->Addr = 0;
+ sqe->Len = requestedLength;
+ sqe->RwFlags = rwFlags;
+ sqe->BufIndex = bufferGroupId;
+ sqe->UserData = userData;
+ }
+
+ /// <summary>
+ /// Writes a multishot recv SQE to the submission ring entry.
+ /// The kernel selects buffers from a provided buffer ring (IOSQE_BUFFER_SELECT).
+ /// </summary>
+ private static void WriteMultishotRecvSqe(
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ ushort bufferGroupId)
+ {
+ sqe->Opcode = IoUringOpcodes.Recv;
+ sqe->Fd = sqeFd;
+ sqe->Flags = (byte)(sqeFlags | IoUringConstants.SqeBufferSelect);
+ sqe->Ioprio = IoUringConstants.RecvMultishot;
+ sqe->Addr = 0;
+ sqe->Len = 0;
+ sqe->RwFlags = 0;
+ sqe->BufIndex = bufferGroupId;
+ sqe->UserData = userData;
+ }
+
+ /// Writes an accept SQE to the submission ring entry.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void WriteAcceptSqe(
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ byte* socketAddress,
+ IntPtr socketAddressLengthPtr)
+ {
+ sqe->Opcode = IoUringOpcodes.Accept;
+ sqe->Fd = sqeFd;
+ sqe->Flags = sqeFlags;
+ sqe->Addr = (ulong)(nuint)socketAddress;
+ // Kernel accept prep aliases addr2 at sqe->off.
+ sqe->Off = (ulong)(nuint)socketAddressLengthPtr;
+ sqe->RwFlags = IoUringConstants.AcceptFlags;
+ sqe->UserData = userData;
+ }
+
+ /// Writes a multishot accept SQE to the submission ring entry.
+ private static unsafe void WriteMultishotAcceptSqe(
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ byte* socketAddress,
+ IntPtr socketAddressLengthPtr)
+ {
+ sqe->Opcode = IoUringOpcodes.Accept;
+ sqe->Fd = sqeFd;
+ sqe->Flags = sqeFlags;
+ sqe->Ioprio = IoUringConstants.AcceptMultishot;
+ sqe->Addr = (ulong)(nuint)socketAddress;
+ // accept4 prep aliases addr2 at sqe->off for addrlen pointer
+ sqe->Off = (ulong)(nuint)socketAddressLengthPtr;
+ sqe->RwFlags = IoUringConstants.AcceptFlags;
+ sqe->UserData = userData;
+ }
+
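+ /// Writes a sendmsg-like (SENDMSG/SENDMSG_ZC) SQE to the submission ring entry.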
+ private static void WriteSendMsgLikeSqe(
+ IoUringSqe* sqe,
+ byte opcode,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ IntPtr messageHeader,
+ uint rwFlags)
+ {
+ sqe->Opcode = opcode;
+ sqe->Fd = sqeFd;
+ sqe->Flags = sqeFlags;
+ sqe->Addr = (ulong)(nuint)messageHeader;
+ sqe->Len = 1;
+ sqe->RwFlags = rwFlags;
+ sqe->UserData = userData;
+ }
+
+ /// Writes a recvmsg SQE to the submission ring entry.
+ private static void WriteRecvMsgSqe(
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ IntPtr messageHeader,
+ uint rwFlags)
+ {
+ sqe->Opcode = IoUringOpcodes.RecvMsg;
+ sqe->Fd = sqeFd;
+ sqe->Flags = sqeFlags;
+ sqe->Addr = (ulong)(nuint)messageHeader;
+ sqe->Len = 1;
+ sqe->RwFlags = rwFlags;
+ sqe->UserData = userData;
+ }
+
+ /// Writes a connect SQE to the submission ring entry.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void WriteConnectSqe(
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ byte* socketAddress,
+ int socketAddressLen)
+ {
+ sqe->Opcode = IoUringOpcodes.Connect;
+ sqe->Fd = sqeFd;
+ sqe->Flags = sqeFlags;
+ sqe->Addr = (ulong)(nuint)socketAddress;
+ // Kernel connect prep aliases addrlen at sqe->off and requires len=0.
+ sqe->Off = (uint)socketAddressLen;
+ sqe->Len = 0;
+ sqe->UserData = userData;
+ }
+
+ /// Writes an ASYNC_CANCEL SQE targeting the specified user_data.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void WriteAsyncCancelSqe(IoUringSqe* sqe, ulong userData)
+ {
+ sqe->Opcode = IoUringOpcodes.AsyncCancel;
+ sqe->Fd = -1;
+ Debug.Assert((byte)(userData >> IoUringUserDataTagShift) == IoUringConstants.TagReservedCompletion);
+ sqe->Addr = userData;
+ sqe->UserData = 0;
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringTestHooks.Stubs.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringTestHooks.Stubs.Linux.cs
new file mode 100644
index 00000000000000..b541bfff24d9d6
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringTestHooks.Stubs.Linux.cs
@@ -0,0 +1,53 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Runtime.CompilerServices;
+
+namespace System.Net.Sockets
+{
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void ResetDebugTestForcedResult(ref IoUringCompletionSlot slot)
+ {
+ _ = slot;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void ResolveDebugTestForcedResult(ref IoUringCompletionSlot slot, ref int result)
+ {
+ _ = slot;
+ _ = result;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ApplyDebugTestForcedResult(ref IoUringCompletionSlot slot, byte opcode)
+ {
+ _ = _ioUringInitialized;
+ _ = slot;
+ _ = opcode;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void RestoreDebugTestForcedResultIfNeeded(int slotIndex, byte opcode)
+ {
+ _ = _ioUringInitialized;
+ _ = slotIndex;
+ _ = opcode;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void InitializeDebugTestHooksFromEnvironment()
+ {
+ _ = _ioUringInitialized;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryConsumeDebugForcedSubmitError(out Interop.Error forcedError)
+ {
+ _ = _ioUringInitialized;
+ forcedError = Interop.Error.SUCCESS;
+ return false;
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs
new file mode 100644
index 00000000000000..1b1c50f0c1ac2f
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs
@@ -0,0 +1,4611 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace System.Net.Sockets
+{
+ /// Linux socket engine coordinating epoll and io_uring work for process sockets.
+ /// <remarks>
+ /// io_uring completion mode currently uses one active engine/event-loop instance per process.
+ /// This keeps ownership and teardown semantics simple, but the single submit/drain loop can
+ /// become a bottleneck at high core/socket densities.
+ /// Future work may evaluate multi-engine sharding (for example by socket affinity) when high-core
+ /// throughput data justifies the additional complexity.
+ /// </remarks>
+ internal sealed unsafe partial class SocketAsyncEngine
+ {
+ /// Indicates which io_uring dispatch mode is active for this engine instance.
+ private enum IoUringMode : byte
+ {
+ Disabled = 0,
+ Completion = 1
+ }
+
+ /// Distinguishes cancellation requests issued during normal runtime from those during engine teardown.
+ private enum IoUringCancellationOrigin : byte
+ {
+ Runtime = 0,
+ Teardown = 1
+ }
+
+ /// Identifies which CQ-overflow recovery branch is active for logging/telemetry correlation.
+ private enum IoUringCqOverflowRecoveryBranch : byte
+ {
+ MultishotAcceptArming = 0,
+ Teardown = 1,
+ // Steady-state branch: normal runtime overflow recovery outside teardown/accept-arm handoff.
+ DualWave = 2
+ }
+
+ /// Tracks the lifecycle of an io_uring operation for debug assertions on valid state transitions.
+ private enum IoUringOperationLifecycleState : byte
+ {
+ Queued = 0,
+ Prepared = 1,
+ Submitted = 2,
+ Completed = 3,
+ Canceled = 4,
+ Detached = 5
+ }
+
+ /// Result of attempting to remove a tracked operation by user_data.
+ private enum IoUringTrackedOperationRemoveResult : byte
+ {
+ Removed = 0,
+ NotFound = 1,
+ Mismatch = 2
+ }
+
+ /// Immutable snapshot of negotiated io_uring capabilities for this engine instance.
+ private readonly struct LinuxIoUringCapabilities
+ {
+ private const uint FlagIsIoUringPort = 1u << 0;
+ private const uint FlagSupportsMultishotRecv = 1u << 1;
+ private const uint FlagSupportsMultishotAccept = 1u << 2;
+ private const uint FlagSupportsZeroCopySend = 1u << 3;
+ private const uint FlagSqPollEnabled = 1u << 4;
+ private const uint FlagSupportsProvidedBufferRings = 1u << 5;
+ private const uint FlagHasRegisteredBuffers = 1u << 6;
+
+ private readonly uint _flags;
+
+ /// The active io_uring dispatch mode.
+ internal IoUringMode Mode { get; }
+
+ /// Whether the engine's port was created as an io_uring instance.
+ internal bool IsIoUringPort => (_flags & FlagIsIoUringPort) != 0;
+ /// Whether multishot recv can be used by this engine instance.
+ internal bool SupportsMultishotRecv => (_flags & FlagSupportsMultishotRecv) != 0;
+ /// Whether multishot accept can be used by this engine instance.
+ internal bool SupportsMultishotAccept => (_flags & FlagSupportsMultishotAccept) != 0;
+ /// Whether zero-copy send is enabled for this engine instance.
+ internal bool SupportsZeroCopySend => (_flags & FlagSupportsZeroCopySend) != 0;
+ /// Whether SQPOLL mode is enabled for this engine instance.
+ internal bool SqPollEnabled => (_flags & FlagSqPollEnabled) != 0;
+ /// Whether provided-buffer rings are active for this engine instance.
+ internal bool SupportsProvidedBufferRings => (_flags & FlagSupportsProvidedBufferRings) != 0;
+ /// Whether provided buffers are currently registered with the kernel.
+ internal bool HasRegisteredBuffers => (_flags & FlagHasRegisteredBuffers) != 0;
+
+ /// Whether the engine is operating in full completion mode.
+ internal bool IsCompletionMode =>
+ Mode == IoUringMode.Completion;
+
+ private LinuxIoUringCapabilities(IoUringMode mode, uint flags)
+ {
+ Mode = mode;
+ _flags = flags;
+ }
+
+ internal LinuxIoUringCapabilities WithMode(IoUringMode mode) =>
+ new LinuxIoUringCapabilities(mode, _flags);
+
+ internal LinuxIoUringCapabilities WithIsIoUringPort(bool value) =>
+ WithFlag(FlagIsIoUringPort, value);
+
+ internal LinuxIoUringCapabilities WithSupportsMultishotRecv(bool value) =>
+ WithFlag(FlagSupportsMultishotRecv, value);
+
+ internal LinuxIoUringCapabilities WithSupportsMultishotAccept(bool value) =>
+ WithFlag(FlagSupportsMultishotAccept, value);
+
+ internal LinuxIoUringCapabilities WithSupportsZeroCopySend(bool value) =>
+ WithFlag(FlagSupportsZeroCopySend, value);
+
+ internal LinuxIoUringCapabilities WithSqPollEnabled(bool value) =>
+ WithFlag(FlagSqPollEnabled, value);
+
+ internal LinuxIoUringCapabilities WithSupportsProvidedBufferRings(bool value) =>
+ WithFlag(FlagSupportsProvidedBufferRings, value);
+
+ internal LinuxIoUringCapabilities WithHasRegisteredBuffers(bool value) =>
+ WithFlag(FlagHasRegisteredBuffers, value);
+
+ private LinuxIoUringCapabilities WithFlag(uint flag, bool value)
+ {
+ uint flags = value ? (_flags | flag) : (_flags & ~flag);
+ return new LinuxIoUringCapabilities(Mode, flags);
+ }
+ }
+
+ [Flags]
+ private enum IoUringConfigurationWarningFlags : byte
+ {
+ None = 0,
+ SqPollRequestedWithoutIoUring = 1 << 0,
+ DirectSqeDisabledWithoutIoUring = 1 << 1,
+ ZeroCopyOptInWithoutIoUring = 1 << 2
+ }
+
+ /// Immutable process-wide snapshot of resolved io_uring configuration inputs.
+ private readonly struct IoUringResolvedConfiguration
+ {
+ internal bool IoUringEnabled { get; }
+ internal bool SqPollRequested { get; }
+ internal bool DirectSqeDisabled { get; }
+ internal bool ZeroCopySendOptedIn { get; }
+ internal bool RegisterBuffersEnabled { get; }
+ internal bool AdaptiveProvidedBufferSizingEnabled { get; }
+ internal int ProvidedBufferSize { get; }
+ internal int PrepareQueueCapacity { get; }
+ internal int CancellationQueueCapacity { get; }
+ private readonly IoUringConfigurationWarningFlags _warningFlags;
+
+ internal IoUringResolvedConfiguration(
+ bool ioUringEnabled,
+ bool sqPollRequested,
+ bool directSqeDisabled,
+ bool zeroCopySendOptedIn,
+ bool registerBuffersEnabled,
+ bool adaptiveProvidedBufferSizingEnabled,
+ int providedBufferSize,
+ int prepareQueueCapacity,
+ int cancellationQueueCapacity)
+ {
+ IoUringEnabled = ioUringEnabled;
+ SqPollRequested = sqPollRequested;
+ DirectSqeDisabled = directSqeDisabled;
+ ZeroCopySendOptedIn = zeroCopySendOptedIn;
+ RegisterBuffersEnabled = registerBuffersEnabled;
+ AdaptiveProvidedBufferSizingEnabled = adaptiveProvidedBufferSizingEnabled;
+ ProvidedBufferSize = providedBufferSize;
+ PrepareQueueCapacity = prepareQueueCapacity;
+ CancellationQueueCapacity = cancellationQueueCapacity;
+ _warningFlags = ComputeWarningFlags(
+ ioUringEnabled,
+ sqPollRequested,
+ directSqeDisabled,
+ zeroCopySendOptedIn);
+ }
+
+ internal string ToLogString() =>
+ $"enabled={IoUringEnabled}, sqpollRequested={SqPollRequested}, directSqeDisabled={DirectSqeDisabled}, zeroCopySendOptedIn={ZeroCopySendOptedIn}, registerBuffersEnabled={RegisterBuffersEnabled}, adaptiveProvidedBufferSizingEnabled={AdaptiveProvidedBufferSizingEnabled}, providedBufferSize={ProvidedBufferSize}, prepareQueueCapacity={PrepareQueueCapacity}, cancellationQueueCapacity={CancellationQueueCapacity}";
+
+ internal bool TryGetValidationWarnings([NotNullWhen(true)] out string? warnings)
+ {
+ if (_warningFlags == IoUringConfigurationWarningFlags.None)
+ {
+ warnings = null;
+ return false;
+ }
+
+ warnings = BuildWarningMessage(_warningFlags);
+ return true;
+ }
+
+ private static IoUringConfigurationWarningFlags ComputeWarningFlags(
+ bool ioUringEnabled,
+ bool sqPollRequested,
+ bool directSqeDisabled,
+ bool zeroCopySendOptedIn)
+ {
+ IoUringConfigurationWarningFlags warnings = IoUringConfigurationWarningFlags.None;
+ if (!ioUringEnabled && sqPollRequested)
+ {
+ warnings |= IoUringConfigurationWarningFlags.SqPollRequestedWithoutIoUring;
+ }
+
+ if (!ioUringEnabled && directSqeDisabled)
+ {
+ warnings |= IoUringConfigurationWarningFlags.DirectSqeDisabledWithoutIoUring;
+ }
+
+ if (!ioUringEnabled && zeroCopySendOptedIn)
+ {
+ warnings |= IoUringConfigurationWarningFlags.ZeroCopyOptInWithoutIoUring;
+ }
+
+ return warnings;
+ }
+
+ private static string BuildWarningMessage(IoUringConfigurationWarningFlags warnings)
+ {
+ var parts = new List<string>(3);
+ if ((warnings & IoUringConfigurationWarningFlags.SqPollRequestedWithoutIoUring) != 0)
+ {
+ parts.Add("SQPOLL requested while io_uring is disabled");
+ }
+
+ if ((warnings & IoUringConfigurationWarningFlags.DirectSqeDisabledWithoutIoUring) != 0)
+ {
+ parts.Add("direct SQE disabled while io_uring is disabled");
+ }
+
+ if ((warnings & IoUringConfigurationWarningFlags.ZeroCopyOptInWithoutIoUring) != 0)
+ {
+ parts.Add("zero-copy send opted-in while io_uring is disabled");
+ }
+
+ return string.Join("; ", parts);
+ }
+ }
+
+ /// Mirrors kernel struct io_uring_sqe (64 bytes), written to the SQ ring for submission.
+ [StructLayout(LayoutKind.Explicit, Size = 64)]
+ internal struct IoUringSqe
+ {
+ [FieldOffset(0)]
+ internal byte Opcode;
+ [FieldOffset(1)]
+ internal byte Flags;
+ [FieldOffset(2)]
+ internal ushort Ioprio;
+ [FieldOffset(4)]
+ internal int Fd;
+ [FieldOffset(8)]
+ internal ulong Off;
+ [FieldOffset(16)]
+ internal ulong Addr;
+ [FieldOffset(24)]
+ internal uint Len;
+ [FieldOffset(28)]
+ internal uint RwFlags;
+ [FieldOffset(32)]
+ internal ulong UserData;
+ [FieldOffset(40)]
+ internal ushort BufIndex;
+ [FieldOffset(42)]
+ internal ushort Personality;
+ [FieldOffset(44)]
+ internal int SpliceFdIn;
+ [FieldOffset(48)]
+ internal ulong Addr3;
+ }
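+ // Layout sanity sketch (assumes the kernel's 64-byte SQE ABI): a debug guard such
+ // as Debug.Assert(Unsafe.SizeOf<IoUringSqe>() == 64) would hold by construction,
+ // mirroring the Size = 64 annotation above.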
+
+ /// Mirrors kernel struct io_uring_probe_op (8 bytes per entry in the probe ops array).
+ [StructLayout(LayoutKind.Explicit, Size = 8)]
+ private struct IoUringProbeOp
+ {
+ [FieldOffset(0)] internal byte Op;
+ [FieldOffset(1)] internal byte Resv;
+ [FieldOffset(2)] internal ushort Flags;
+ // 4 bytes reserved at offset 4
+ }
+
+ /// Mirrors kernel struct io_uring_probe (16-byte header preceding the variable-length ops array).
+ [StructLayout(LayoutKind.Explicit, Size = 16)]
+ private struct IoUringProbeHeader
+ {
+ [FieldOffset(0)] internal byte LastOp;
+ [FieldOffset(1)] internal byte OpsLen;
+ // 14 bytes reserved at offset 2
+ }
+
+ /// <summary>
+ /// Kernel ABI opcode constants as a static class (not an enum) to avoid byte-cast noise
+ /// at every SQE write site, since the SQE Opcode field is typed as byte.
+ /// </summary>
+ private static class IoUringOpcodes
+ {
+ internal const byte ReadFixed = 4;
+ internal const byte Send = 26;
+ internal const byte Recv = 27;
+ internal const byte SendMsg = 9;
+ internal const byte RecvMsg = 10;
+ internal const byte Accept = 13;
+ internal const byte Connect = 16;
+ internal const byte SendZc = 47;
+ internal const byte SendMsgZc = 48;
+ internal const byte AsyncCancel = 14;
+ internal const byte PollAdd = 6;
+ }
+
+ /// <summary>
+ /// Centralizes io_uring ABI constants that mirror the native definitions in pal_io_uring.c.
+ /// These are used by managed code that directly interacts with the io_uring submission
+ /// and completion rings (e.g., direct SQE writes via mmap'd ring access).
+ /// </summary>
+ private static class IoUringConstants
+ {
+ // Setup flags (io_uring_setup params.flags)
+ internal const uint SetupCqSize = 1u << 3;
+ internal const uint SetupSqPoll = 1u << 1;
+ internal const uint SetupSubmitAll = 1u << 7;
+ internal const uint SetupCoopTaskrun = 1u << 8;
+ internal const uint SetupSqe128 = 1u << 10;
+ internal const uint SetupSingleIssuer = 1u << 12;
+ internal const uint SetupDeferTaskrun = 1u << 13;
+ internal const uint SetupRDisabled = 1u << 6;
+ internal const uint SetupNoSqArray = 1u << 16;
+ internal const uint SetupCloexec = 1u << 19;
+
+ // Feature flags (io_uring_params.features)
+ internal const uint FeatureSingleMmap = 1u << 0;
+ internal const uint FeatureExtArg = 1u << 8;
+
+ // Enter flags (io_uring_enter flags parameter)
+ internal const uint EnterGetevents = 1u << 0;
+ internal const uint EnterSqWakeup = 1u << 1;
+ internal const uint EnterExtArg = 1u << 3;
+ internal const uint EnterRegisteredRing = 1u << 4;
+
+ // SQ ring flags (sq_ring->flags)
+ internal const uint SqNeedWakeup = 1u << 0;
+
+ // Register opcodes
+ internal const uint RegisterEnableRings = 12;
+ internal const uint RegisterBuffers = 0;
+ internal const uint UnregisterBuffers = 1;
+ internal const uint RegisterProbe = 8;
+ internal const uint RegisterRingFds = 20;
+ internal const uint UnregisterRingFds = 21;
+ internal const uint RegisterPbufRing = 22;
+ internal const uint UnregisterPbufRing = 23;
+
+ // Register helper values
+ internal const uint RegisterOffsetAuto = 0xFFFFFFFFU;
+
+ // Probe op flags
+ internal const uint ProbeOpFlagSupported = 1u << 0;
+
+ // Poll flags
+ internal const uint PollAddFlagMulti = 1u << 0;
+ internal const uint PollIn = 0x0001;
+
+ // CQE flags
+ internal const uint CqeFBuffer = 1u << 0; // IORING_CQE_F_BUFFER (buffer id in upper bits)
+ internal const uint CqeFMore = 1u << 1; // IORING_CQE_F_MORE (multishot)
+ internal const uint CqeFNotif = 1u << 3; // IORING_CQE_F_NOTIF (zero-copy notification)
+ internal const int CqeBufferShift = 16; // IORING_CQE_BUFFER_SHIFT
+
+ // Recv ioprio flags
+ internal const ushort RecvMultishot = 1 << 1; // IORING_RECV_MULTISHOT
+ // Accept ioprio flags
+ internal const ushort AcceptMultishot = 1 << 0; // IORING_ACCEPT_MULTISHOT
+
+ // SQE flags
+ internal const byte SqeBufferSelect = 1 << 5; // IOSQE_BUFFER_SELECT
+
+ // Sizing
+ internal const uint QueueEntries = 1024;
+ // Keep CQ capacity at 4x SQ entries to absorb completion bursts during short GC pauses
+ // without immediately tripping overflow recovery on busy rings.
+ internal const uint CqEntriesFactor = 4;
+ internal const uint MaxCqeDrainBatch = 128;
+ // Bounded wait trades wake latency for starvation resilience:
+ // if an eventfd wake is missed or deferred, the event loop still polls at least once
+ // every 50ms (worst-case deferred wake latency).
+ internal const long BoundedWaitTimeoutNanos = 50L * 1000 * 1000; // 50ms
+ // Circuit-breaker bounded wait used after repeated eventfd wake failures.
+ internal const long WakeFailureFallbackWaitTimeoutNanos = 1L * 1000 * 1000; // 1ms
+
+ // Completion operation pool sizing
+ internal const int CompletionOperationPoolCapacityFactor = 2;
+
+ // mmap offsets (from kernel UAPI: IORING_OFF_SQ_RING, IORING_OFF_CQ_RING, IORING_OFF_SQES)
+ internal const ulong OffSqRing = 0;
+ internal const ulong OffCqRing = 0x8000000;
+ internal const ulong OffSqes = 0x10000000;
+
+ // Minimum kernel version for io_uring engine.
+ // SEND_ZC deferred-completion logic relies on NOTIF CQE sequencing behavior stabilized in Linux 6.1.0.
+ internal const int MinKernelMajor = 6;
+ internal const int MinKernelMinor = 1;
+
+ // Zero-copy send size threshold (payloads below this use regular send).
+ internal const int ZeroCopySendThreshold = 16384; // 16KB
+
+ // User data tag values (encoded in upper bits of user_data)
+ internal const byte TagNone = 0;
+ internal const byte TagReservedCompletion = 2;
+ internal const byte TagWakeupSignal = 3;
+
+ // Accept-time flags for accepted socket descriptors: SOCK_CLOEXEC | SOCK_NONBLOCK.
+ internal const uint AcceptFlags = 0x80800;
+
+ // Message inline capacities (avoid heap allocation on common small payloads)
+ internal const int MessageInlineIovCount = 4;
+ internal const int MessageInlineSocketAddressCapacity = 128; // sizeof(sockaddr_storage)
+ internal const int MessageInlineControlBufferCapacity = 128;
+
+ // Internal discriminator for io_uring vs epoll fallback detection
+ internal const int NotSocketEventPort = int.MinValue + 1;
+
+ // Completion slot encoding
+ // Slot index is encoded into 13 bits of user_data payload => max 8192 slot IDs per engine.
+ // Capacity planning note: with persistent multishot receives occupying long-lived slots,
+ // a 4000-connection steady state leaves ~4192 slots for transient sends/connects/one-shot recvs.
+ internal const int SlotIndexBits = 13;
+ internal const ulong SlotIndexMask = (1UL << SlotIndexBits) - 1UL;
+ internal const int GenerationBits = 56 - SlotIndexBits;
+ // 43-bit generation space gives each slot ~8.8 trillion incarnations before wrap.
+ // Generation zero remains reserved as "uninitialized", so wrap remaps 2^43-1 -> 1.
+ internal const ulong GenerationMask = (1UL << GenerationBits) - 1UL;
+
+ // Test hook opcode masks (mirrors IoUringTestOpcodeMask in pal_io_uring.c)
+ internal const byte TestOpcodeMaskNone = 0;
+ internal const byte TestOpcodeMaskSend = 1 << 0;
+ internal const byte TestOpcodeMaskRecv = 1 << 1;
+ internal const byte TestOpcodeMaskSendMsg = 1 << 2;
+ internal const byte TestOpcodeMaskRecvMsg = 1 << 3;
+ internal const byte TestOpcodeMaskAccept = 1 << 4;
+ internal const byte TestOpcodeMaskConnect = 1 << 5;
+ internal const byte TestOpcodeMaskSendZc = 1 << 6;
+ internal const byte TestOpcodeMaskSendMsgZc = 1 << 7;
+ }
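+ // Sizing example: with the default QueueEntries = 1024 and CqEntriesFactor = 4,
+ // the ring is created with 4096 CQ entries, so MaxCqeDrainBatch = 128 caps each
+ // drain pass at 1/32 of CQ capacity.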
+
+ /// Captures the results of io_uring_setup(2) including ring fd, negotiated params, and feature flags.
+ private struct IoUringSetupResult
+ {
+ internal int RingFd;
+ internal Interop.Sys.IoUringParams Params;
+ internal uint NegotiatedFlags;
+ internal bool UsesExtArg;
+ internal bool SqPollNegotiated;
+ }
+
+ /// Discriminates completion slot metadata shape for operation-specific post-completion processing.
+ private enum IoUringCompletionOperationKind : byte
+ {
+ None = 0,
+ Accept = 1,
+ Message = 2,
+ }
+
+ /// <summary>
+ /// Hot per-slot metadata used on every CQE dispatch.
+ /// Keep this minimal; native pointer-heavy state is kept in <see cref="IoUringCompletionSlotStorage"/>.
+ /// Explicit 24-byte layout keeps generation/free-list state and hot flags in one compact block.
+ /// </summary>
+ [StructLayout(LayoutKind.Explicit, Size = 24)]
+ private struct IoUringCompletionSlot
+ {
+ // 0..7
+ [FieldOffset(0)]
+ public ulong Generation;
+ // 8..11 (-1 = end of free list)
+ [FieldOffset(8)]
+ public int FreeListNext;
+ // 12..15 (operation kind + hot state flags)
+ [FieldOffset(12)]
+ private uint _packedState;
+ // 16..17
+ [FieldOffset(16)]
+ public ushort FixedRecvBufferId;
+#if DEBUG
+ // 20..23 debug-only forced completion result payload.
+ [FieldOffset(20)]
+ public int TestForcedResult;
+#endif
+
+ private const uint KindMask = 0xFFu;
+ private const uint FlagIsZeroCopySend = 1u << 8;
+ private const uint FlagZeroCopyNotificationPending = 1u << 9;
+ private const uint FlagUsesFixedRecvBuffer = 1u << 10;
+#if DEBUG
+ private const uint FlagHasTestForcedResult = 1u << 11;
+#endif
+
+ public IoUringCompletionOperationKind Kind
+ {
+ get => (IoUringCompletionOperationKind)(_packedState & KindMask);
+ set => _packedState = (_packedState & ~KindMask) | ((uint)value & KindMask);
+ }
+
+ public bool IsZeroCopySend
+ {
+ get => (_packedState & FlagIsZeroCopySend) != 0;
+ set => SetFlag(FlagIsZeroCopySend, value);
+ }
+
+ public bool ZeroCopyNotificationPending
+ {
+ get => (_packedState & FlagZeroCopyNotificationPending) != 0;
+ set => SetFlag(FlagZeroCopyNotificationPending, value);
+ }
+
+ public bool UsesFixedRecvBuffer
+ {
+ get => (_packedState & FlagUsesFixedRecvBuffer) != 0;
+ set => SetFlag(FlagUsesFixedRecvBuffer, value);
+ }
+
+#if DEBUG
+ public bool HasTestForcedResult
+ {
+ get => (_packedState & FlagHasTestForcedResult) != 0;
+ set => SetFlag(FlagHasTestForcedResult, value);
+ }
+#endif
+
+ private void SetFlag(uint mask, bool value)
+ {
+ if (value)
+ {
+ _packedState |= mask;
+ }
+ else
+ {
+ _packedState &= ~mask;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Hot tracked-operation ownership state used on completion and cancellation paths.
+ /// Kept separate from native slot storage to improve cache locality in CQE dispatch.
+ /// </summary>
+ private struct IoUringTrackedOperationState
+ {
+ public SocketAsyncContext.AsyncOperation? TrackedOperation;
+ public ulong TrackedOperationGeneration;
+ }
+
+ /// <summary>
+ /// Cold per-slot native metadata: pointers and message writeback state needed only for
+ /// operation-specific completion processing.
+ /// </summary>
+ private struct IoUringCompletionSlotStorage
+ {
+ // Hold a DangerousAddRef lease for the socket fd until this slot is fully retired.
+ public SafeSocketHandle? DangerousRefSocketHandle;
+ // Per-slot pre-allocated native slab backing accept socklen_t and message inline storage.
+ public unsafe byte* NativeInlineStorage;
+ // Accept metadata
+ public unsafe int* NativeSocketAddressLengthPtr; // socklen_t* in NativeInlineStorage
+ // Message metadata (pointers to native-alloc'd msghdr/iovec)
+ public IntPtr NativeMsgHdrPtr;
+ public bool MessageIsReceive;
+ // Message metadata - deep-copied native msghdr constituents (point into NativeInlineStorage).
+ public unsafe Interop.Sys.IOVector* NativeIOVectors;
+ public unsafe byte* NativeSocketAddress;
+ public unsafe byte* NativeControlBuffer;
+ // RecvMsg output capture - pointers back to managed MessageHeader buffers for writeback
+ public unsafe byte* ReceiveOutputSocketAddress;
+ public unsafe byte* ReceiveOutputControlBuffer;
+ public int ReceiveSocketAddressCapacity;
+ public int ReceiveControlBufferCapacity;
+ }
+
+ /// <summary>
+ /// Mirrors the kernel's struct msghdr layout for direct SQE submission.
+ /// Used by <see cref="TryPrepareInlineMessageStorage"/> to build a native msghdr that
+ /// io_uring sendmsg/recvmsg opcodes can consume directly.
+ /// Must only be used on 64-bit Linux where sizeof(msghdr) == 56.
+ /// </summary>
+ [StructLayout(LayoutKind.Explicit)]
+ private unsafe struct NativeMsghdr
+ {
+ [FieldOffset(0)]
+ public void* MsgName;
+ [FieldOffset(8)]
+ public uint MsgNameLen;
+ [FieldOffset(16)]
+ public Interop.Sys.IOVector* MsgIov;
+ [FieldOffset(24)]
+ public nuint MsgIovLen;
+ [FieldOffset(32)]
+ public void* MsgControl;
+ [FieldOffset(40)]
+ public nuint MsgControlLen;
+ [FieldOffset(48)]
+ public int MsgFlags;
+ }
+
+ /// <summary>
+ /// Grouped managed ring mmap state.
+ /// Keeping these fields in a single struct reduces top-level instance-field sprawl.
+ /// </summary>
+ private unsafe struct ManagedRingState
+ {
+ public Interop.Sys.IoUringCqe* CqeBase;
+ public uint* CqTailPtr;
+ public uint* CqHeadPtr;
+ public uint CqMask;
+ public uint CqEntries;
+ public uint* CqOverflowPtr;
+ public uint ObservedCqOverflow;
+ public byte* SqRingPtr;
+ public byte* CqRingPtr;
+ public uint* SqFlagsPtr;
+ public ulong SqRingSize;
+ public ulong CqRingSize;
+ public ulong SqesSize;
+ public bool UsesSingleMmap;
+ public int RingFd;
+ public bool UsesExtArg;
+ public bool UsesNoSqArray;
+ public uint NegotiatedFlags;
+ public uint CachedCqHead;
+ public bool CqDrainEnabled;
+ public int WakeupEventFd;
+
+ public static ManagedRingState CreateDefault()
+ {
+ ManagedRingState state = default;
+ state.RingFd = -1;
+ state.WakeupEventFd = -1;
+ return state;
+ }
+ }
+
+ private const int IoUringDiagnosticsPollInterval = 64;
+ private const long DiagnosticSampleMask = IoUringDiagnosticsPollInterval - 1;
+ private const int MaxIoUringPrepareQueueDrainPerSubmit = 256;
+ private const int MaxIoUringCancelQueueDrainPerSubmit = 256;
+ private const int MaxSlotExhaustionRetries = 3;
+ private const int MaxIoUringSqeAcquireSubmitAttempts = 16;
+ private const int CqOverflowTrackedSweepDelayMilliseconds = 250;
+ private const int CqOverflowTrackedSweepMaxRearms = 8;
+ private const int IoUringWakeFailureCircuitBreakerThreshold = 8;
+ private const string IoUringEnvironmentVariable = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING";
+ private const string IoUringSqPollEnvironmentVariable = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL";
+ private const string UseIoUringAppContextSwitch = "System.Net.Sockets.UseIoUring";
+ private const string UseIoUringSqPollAppContextSwitch = "System.Net.Sockets.UseIoUringSqPoll";
+ // Configuration matrix (7 surfaces):
+ // 1) DOTNET_SYSTEM_NET_SOCKETS_IO_URING
+ // 2) AppContext: System.Net.Sockets.UseIoUring
+ // 3) DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL
+ // 4) AppContext: System.Net.Sockets.UseIoUringSqPoll
+ // 5) DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE (DEBUG)
+ // 6) DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND (DEBUG)
+ // 7) DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS (DEBUG, in IoUringProvidedBufferRing)
+ //
+ // Precedence:
+ // - Primary gate: env (surface #1) overrides AppContext (surface #2); AppContext is used only when env is unset.
+ // - SQPOLL gate: dual opt-in requires both env (surface #3 == "1") and AppContext (surface #4 == true).
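+ // Example (hypothetical host configuration): enabling SQPOLL requires both
+ // DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL=1 in the environment and
+ // AppContext.SetSwitch("System.Net.Sockets.UseIoUringSqPoll", true) at startup;
+ // either opt-in alone leaves SQPOLL disabled.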
+ private const ulong IoUringUserDataPayloadMask = 0x00FF_FFFF_FFFF_FFFFUL;
+ private const int IoUringUserDataTagShift = 56;
+ private static readonly int s_ioUringPrepareQueueCapacity = GetIoUringPrepareQueueCapacity();
+ private static readonly int s_ioUringCancellationQueueCapacity = s_ioUringPrepareQueueCapacity;
+ private int _ioUringResolvedConfigurationLogged;
+ private long _ioUringPendingRetryQueuedToPrepareQueueCount;
+ private long _ioUringNonPinnablePrepareFallbackCount;
+ private long _ioUringPublishedNonPinnablePrepareFallbackCount;
+ private MpscQueue<SocketAsyncContext.AsyncOperation>? _ioUringPrepareQueue;
+ private MpscQueue<ulong>? _ioUringCancelQueue;
+ private long _ioUringPrepareQueueLength;
+ private long _ioUringCancelQueueLength;
+ private long _ioUringPrepareQueueOverflowCount;
+ private long _ioUringCancelQueueOverflowCount;
+ private long _ioUringPrepareQueueOverflowFallbackCount;
+ private long _ioUringCompletionSlotExhaustionCount;
+ private long _ioUringCompletionSlotDrainRecoveryCount;
+ private long _ioUringPublishedPrepareQueueLength;
+ private long _ioUringBenignLateCompletionCount;
+ private long _ioUringCompletionRequeueFailureCount;
+ private long _ioUringUntrackMismatchCount;
+ private long _ioUringPublishedPrepareQueueOverflowCount;
+ private long _ioUringPublishedPrepareQueueOverflowFallbackCount;
+ private long _ioUringPublishedCompletionRequeueFailureCount;
+ private long _ioUringPublishedCompletionSlotExhaustionCount;
+ private long _ioUringPublishedCompletionSlotDrainRecoveryCount;
+ private int _ioUringDiagnosticsPollCountdown;
+ private bool _ioUringAdvancedFeatureStateLogged;
+ private int _ioUringWakeFailureConsecutiveCount;
+ private uint _ioUringWakeupGeneration;
+ private int _ioUringPortClosedForTeardown;
+ // Release-published teardown gate. Readers use Volatile.Read in enqueue/wakeup paths
+ // to prevent new io_uring work from being published after teardown begins.
+ private int _ioUringTeardownInitiated;
+ private int _ioUringSlotCapacity;
+ private bool _completionSlotDrainInProgress;
+ private bool _cqOverflowRecoveryActive;
+ private IoUringCqOverflowRecoveryBranch _cqOverflowRecoveryBranch;
+ private long _cqOverflowTrackedSweepDeadlineTicks;
+ private int _cqOverflowTrackedSweepRearmCount;
+ private uint _ioUringManagedPendingSubmissions;
+ private uint _ioUringManagedSqTail;
+ private bool _ioUringManagedSqTailLoaded;
+ private Interop.Sys.IoUringSqRingInfo _ioUringSqRingInfo;
+ private bool _managedSqeInvariantsValidated;
+ private bool _ioUringDirectSqeEnabled;
+ private ManagedRingState _ringState = ManagedRingState.CreateDefault();
+
+ // Per-opcode support flags, populated by ProbeIoUringOpcodeSupport.
+ private bool _supportsOpSend;
+ private bool _supportsOpReadFixed;
+ private bool _supportsOpRecv;
+ private bool _supportsOpSendMsg;
+ private bool _supportsOpRecvMsg;
+ private bool _supportsOpAccept;
+ private bool _supportsOpConnect;
+ private bool _supportsOpSendZc;
+ private bool _supportsOpSendMsgZc;
+ private bool _supportsOpAsyncCancel;
+ private bool _supportsMultishotRecv;
+ private bool _supportsMultishotAccept;
+ private bool _zeroCopySendEnabled;
+
+ private bool _sqPollEnabled;
+ private bool _ioUringInitialized;
+ private int _ioUringDrainTelemetryBatchActive;
+ private long _ioUringDrainBatchProvidedBufferDepletionCount;
+ private long _ioUringDrainBatchProvidedBufferRecycleCount;
+ private long _ioUringDrainBatchPersistentMultishotRecvEarlyDataCount;
+ private IoUringProvidedBufferRing? _ioUringProvidedBufferRing;
+ private ushort _ioUringProvidedBufferGroupId;
+ // SoA split: hot completion slot state and cold native storage/tracking metadata.
+ private IoUringCompletionSlot[]? _completionSlots;
+ private IoUringTrackedOperationState[]? _trackedOperations;
+ private IoUringCompletionSlotStorage[]? _completionSlotStorage;
+ private unsafe byte* _completionSlotNativeStorage;
+ private nuint _completionSlotNativeStorageStride;
+ private int _trackedIoUringOperationCount;
+ private System.Buffers.MemoryHandle[]? _zeroCopyPinHolds;
+ private int _completionSlotFreeListHead = -1;
+ private int _completionSlotsInUse;
+ private int _completionSlotsHighWaterMark;
+ private int _liveAcceptCompletionSlotCount;
+
+#if DEBUG
+ // Test hook state: forced completion result injection (mirrors native pal_io_uring.c test hooks).
+ private byte _testForceEagainOnceMask;
+ private byte _testForceEcanceledOnceMask;
+ private int _testForceSubmitEpermOnce;
+ // Test-only observability for cancel-queue full retry path.
+ private long _testCancelQueueWakeRetryCount;
+#endif
+
+ private LinuxIoUringCapabilities _ioUringCapabilities;
+
+ // Managed ring state accessors (backed by _ringState).
+ private unsafe Interop.Sys.IoUringCqe* _managedCqeBase
+ {
+ get => _ringState.CqeBase;
+ set => _ringState.CqeBase = value;
+ }
+
+ private unsafe uint* _managedCqTailPtr
+ {
+ get => _ringState.CqTailPtr;
+ set => _ringState.CqTailPtr = value;
+ }
+
+ private unsafe uint* _managedCqHeadPtr
+ {
+ get => _ringState.CqHeadPtr;
+ set => _ringState.CqHeadPtr = value;
+ }
+
+ private uint _managedCqMask
+ {
+ get => _ringState.CqMask;
+ set => _ringState.CqMask = value;
+ }
+
+ private uint _managedCqEntries
+ {
+ get => _ringState.CqEntries;
+ set => _ringState.CqEntries = value;
+ }
+
+ private unsafe uint* _managedCqOverflowPtr
+ {
+ get => _ringState.CqOverflowPtr;
+ set => _ringState.CqOverflowPtr = value;
+ }
+
+ private uint _managedObservedCqOverflow
+ {
+ get => _ringState.ObservedCqOverflow;
+ set => _ringState.ObservedCqOverflow = value;
+ }
+
+ private unsafe byte* _managedSqRingPtr
+ {
+ get => _ringState.SqRingPtr;
+ set => _ringState.SqRingPtr = value;
+ }
+
+ private unsafe byte* _managedCqRingPtr
+ {
+ get => _ringState.CqRingPtr;
+ set => _ringState.CqRingPtr = value;
+ }
+
+ private unsafe uint* _managedSqFlagsPtr
+ {
+ get => _ringState.SqFlagsPtr;
+ set => _ringState.SqFlagsPtr = value;
+ }
+
+ private ulong _managedSqRingSize
+ {
+ get => _ringState.SqRingSize;
+ set => _ringState.SqRingSize = value;
+ }
+
+ private ulong _managedCqRingSize
+ {
+ get => _ringState.CqRingSize;
+ set => _ringState.CqRingSize = value;
+ }
+
+ private ulong _managedSqesSize
+ {
+ get => _ringState.SqesSize;
+ set => _ringState.SqesSize = value;
+ }
+
+ private bool _managedUsesSingleMmap
+ {
+ get => _ringState.UsesSingleMmap;
+ set => _ringState.UsesSingleMmap = value;
+ }
+
+ private int _managedRingFd
+ {
+ get => _ringState.RingFd;
+ set => _ringState.RingFd = value;
+ }
+
+ private bool _managedUsesExtArg
+ {
+ get => _ringState.UsesExtArg;
+ set => _ringState.UsesExtArg = value;
+ }
+
+ private bool _managedUsesNoSqArray
+ {
+ get => _ringState.UsesNoSqArray;
+ set => _ringState.UsesNoSqArray = value;
+ }
+
+ private uint _managedNegotiatedFlags
+ {
+ get => _ringState.NegotiatedFlags;
+ set => _ringState.NegotiatedFlags = value;
+ }
+
+ private uint _managedCachedCqHead
+ {
+ get => _ringState.CachedCqHead;
+ set => _ringState.CachedCqHead = value;
+ }
+
+ private bool _managedCqDrainEnabled
+ {
+ get => _ringState.CqDrainEnabled;
+ set => _ringState.CqDrainEnabled = value;
+ }
+
+ private int _managedWakeupEventFd
+ {
+ get => _ringState.WakeupEventFd;
+ set => _ringState.WakeupEventFd = value;
+ }
+
+ /// Whether this engine instance is using io_uring completion mode.
+ internal bool IsIoUringCompletionModeEnabled => _ioUringCapabilities.IsCompletionMode;
+ /// Whether managed direct SQE submission is enabled.
+ internal bool IsIoUringDirectSqeEnabled => _ioUringDirectSqeEnabled;
+ /// Whether a connected send payload is eligible for the SEND_ZC path.
+ internal bool ShouldTryIoUringDirectSendZeroCopy(int payloadLength) =>
+ IsIoUringZeroCopySendEligible(payloadLength, requiresSendMessageOpcode: false);
+ /// Whether a message-based send payload is eligible for the SENDMSG_ZC path.
+ internal bool ShouldTryIoUringDirectSendMessageZeroCopy(int payloadLength) =>
+ IsIoUringZeroCopySendEligible(payloadLength, requiresSendMessageOpcode: true);
+
+ /// <summary>
+ /// Centralized zero-copy policy:
+ /// 1) process-level opt-in, 2) opcode support, 3) payload threshold.
+ /// The threshold is based on total payload bytes so buffer-list workloads (e.g. 4KB segments)
+ /// are eligible once the aggregate payload crosses the cutoff.
+ /// </summary>
+ private bool IsIoUringZeroCopySendEligible(int payloadLength, bool requiresSendMessageOpcode)
+ {
+ if (!_zeroCopySendEnabled || payloadLength < IoUringConstants.ZeroCopySendThreshold)
+ {
+ return false;
+ }
+
+ return requiresSendMessageOpcode ? _supportsOpSendMsgZc : _supportsOpSendZc;
+ }
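+ // Example outcomes under the 16KB threshold: a 4KB single-buffer send stays on
+ // the copying path, while a gathered send of eight 4KB segments (32KB aggregate)
+ // is SEND_ZC/SENDMSG_ZC eligible, provided the opcode probe reported support.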
+
+ /// <summary>
+ /// Reads the total count of pending completions that had to requeue through prepare queues
+ /// after inline completion-mode re-prepare was not used.
+ /// </summary>
+ internal static long GetIoUringPendingRetryQueuedToPrepareQueueCount()
+ {
+ long total = 0;
+ foreach (SocketAsyncEngine engine in s_engines)
+ {
+ total += Interlocked.Read(ref engine._ioUringPendingRetryQueuedToPrepareQueueCount);
+ }
+
+ return total;
+ }
+
+ internal static long GetIoUringNonPinnablePrepareFallbackCount()
+ {
+ long total = 0;
+ foreach (SocketAsyncEngine engine in s_engines)
+ {
+ total += Interlocked.Read(ref engine._ioUringNonPinnablePrepareFallbackCount);
+ }
+
+ return total;
+ }
+
+ internal static void SetIoUringNonPinnablePrepareFallbackCountForTest(long value)
+ {
+#if DEBUG
+ bool assigned = false;
+ foreach (SocketAsyncEngine engine in s_engines)
+ {
+ if (!engine.IsIoUringCompletionModeEnabled)
+ {
+ continue;
+ }
+
+ long engineValue = assigned ? 0 : value;
+ Interlocked.Exchange(ref engine._ioUringNonPinnablePrepareFallbackCount, engineValue);
+ Interlocked.Exchange(ref engine._ioUringPublishedNonPinnablePrepareFallbackCount, 0);
+ assigned = true;
+ }
+#else
+ _ = value;
+#endif
+ }
+
+ private void LogIoUringResolvedConfigurationIfNeeded(in IoUringResolvedConfiguration resolvedConfiguration)
+ {
+ if (Interlocked.Exchange(ref _ioUringResolvedConfigurationLogged, 1) != 0)
+ {
+ return;
+ }
+
+ string configuration = resolvedConfiguration.ToLogString();
+ SocketsTelemetry.Log.ReportIoUringResolvedConfiguration(configuration);
+ if (NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Info(this, $"io_uring resolved configuration: {configuration}");
+ if (resolvedConfiguration.TryGetValidationWarnings(out string? warnings))
+ {
+ NetEventSource.Info(this, $"io_uring configuration warnings: {warnings}");
+ }
+ }
+ }
+
+ private static int GetIoUringPrepareQueueCapacity()
+ {
+#if DEBUG
+ if (Environment.GetEnvironmentVariable(
+ IoUringTestEnvironmentVariables.PrepareQueueCapacity) is string configuredValue &&
+ int.TryParse(configuredValue, out int configuredCapacity) &&
+ configuredCapacity > 0)
+ {
+ return configuredCapacity;
+ }
+#endif
+
+ // Raised default to reduce fallback frequency under bursty load.
+ int scaledCapacity = s_eventBufferCount >= 32 ? checked(s_eventBufferCount * 4) : 512;
+ return Math.Max(scaledCapacity, 512);
+ }
+
+ private static uint GetIoUringQueueEntries()
+ {
+#if DEBUG
+ if (Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.QueueEntries) is string configuredValue &&
+ int.TryParse(configuredValue, out int configuredEntries) &&
+ configuredEntries >= 2 &&
+ configuredEntries <= IoUringConstants.QueueEntries &&
+ (configuredEntries & (configuredEntries - 1)) == 0)
+ {
+ return (uint)configuredEntries;
+ }
+#endif
+
+ return IoUringConstants.QueueEntries;
+ }
+
+ /// Creates a capabilities snapshot based on whether the port is io_uring.
+ private static LinuxIoUringCapabilities ResolveLinuxIoUringCapabilities(bool isIoUringPort) =>
+ default(LinuxIoUringCapabilities)
+ .WithIsIoUringPort(isIoUringPort)
+ .WithMode(isIoUringPort ? IoUringMode.Completion : IoUringMode.Disabled);
+
+ private void SetIoUringProvidedBufferCapabilityState(bool supportsProvidedBufferRings, bool hasRegisteredBuffers)
+ {
+ _ioUringCapabilities = _ioUringCapabilities
+ .WithSupportsProvidedBufferRings(supportsProvidedBufferRings)
+ .WithHasRegisteredBuffers(hasRegisteredBuffers);
+ }
+
+ /// Encodes a tag byte and payload into a 64-bit user_data value.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static ulong EncodeIoUringUserData(byte tag, ulong payload) =>
+ ((ulong)tag << IoUringUserDataTagShift) | (payload & IoUringUserDataPayloadMask);
+
+ /// Reads the next CQE from the completion ring without advancing the head.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe bool TryPeekNextCqe(out Interop.Sys.IoUringCqe* cqe, int eventLoopThreadId)
+ {
+ Debug.Assert(eventLoopThreadId == Environment.CurrentManagedThreadId,
+ "TryPeekNextCqe must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ cqe = null;
+ uint cqTail = Volatile.Read(ref *_managedCqTailPtr);
+ if (_managedCachedCqHead == cqTail) return false;
+ uint index = _managedCachedCqHead & _managedCqMask;
+ cqe = _managedCqeBase + index;
+ return true;
+ }
+
+ /// Advances the CQ head pointer by the given count, making slots available to the kernel.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe void AdvanceCqHead(uint count, int eventLoopThreadId)
+ {
+ Debug.Assert(eventLoopThreadId == Environment.CurrentManagedThreadId,
+ "AdvanceCqHead must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ _managedCachedCqHead += count;
+ Volatile.Write(ref *_managedCqHeadPtr, _managedCachedCqHead);
+ }
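+
+ // CQ consumption contract behind the two helpers above (standard io_uring shared-ring
+ // protocol): the head index is consumer-owned and only published via AdvanceCqHead,
+ // while the tail is kernel-owned and acquire-read on each peek. Entries in
+ // [head, tail) are valid CQEs; publishing a new head releases those slots to the kernel.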
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void BeginIoUringDrainTelemetryBatch()
+ {
+ _ioUringDrainBatchProvidedBufferDepletionCount = 0;
+ _ioUringDrainBatchProvidedBufferRecycleCount = 0;
+ _ioUringDrainBatchPersistentMultishotRecvEarlyDataCount = 0;
+ Volatile.Write(ref _ioUringDrainTelemetryBatchActive, 1);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void FlushIoUringDrainTelemetryBatch()
+ {
+ Volatile.Write(ref _ioUringDrainTelemetryBatchActive, 0);
+
+ long depletionCount = _ioUringDrainBatchProvidedBufferDepletionCount;
+ if (depletionCount != 0)
+ {
+ SocketsTelemetry.Log.IoUringProvidedBufferDepletion(depletionCount);
+ }
+
+ long recycleCount = _ioUringDrainBatchProvidedBufferRecycleCount;
+ if (recycleCount != 0)
+ {
+ SocketsTelemetry.Log.IoUringProvidedBufferRecycle(recycleCount);
+ }
+
+ long earlyDataCount = _ioUringDrainBatchPersistentMultishotRecvEarlyDataCount;
+ if (earlyDataCount != 0)
+ {
+ SocketsTelemetry.Log.IoUringPersistentMultishotRecvEarlyData(earlyDataCount);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void RecordIoUringProvidedBufferDepletionForDrainBatch(long count = 1)
+ {
+ if (Volatile.Read(ref _ioUringDrainTelemetryBatchActive) != 0)
+ {
+ _ioUringDrainBatchProvidedBufferDepletionCount += count;
+ return;
+ }
+
+ SocketsTelemetry.Log.IoUringProvidedBufferDepletion(count);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void RecordIoUringProvidedBufferRecycleForDrainBatch(long count = 1)
+ {
+ if (Volatile.Read(ref _ioUringDrainTelemetryBatchActive) != 0)
+ {
+ _ioUringDrainBatchProvidedBufferRecycleCount += count;
+ return;
+ }
+
+ SocketsTelemetry.Log.IoUringProvidedBufferRecycle(count);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void RecordIoUringPersistentMultishotRecvEarlyDataForDrainBatch(long count = 1)
+ {
+ if (Volatile.Read(ref _ioUringDrainTelemetryBatchActive) != 0)
+ {
+ _ioUringDrainBatchPersistentMultishotRecvEarlyDataCount += count;
+ return;
+ }
+
+ SocketsTelemetry.Log.IoUringPersistentMultishotRecvEarlyData(count);
+ }
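+
+ // The Record*ForDrainBatch helpers above coalesce per-CQE telemetry into one
+ // EventSource write per drain batch: a batch that recycles, say, 32 provided buffers
+ // flushes a single IoUringProvidedBufferRecycle(32) rather than 32 writes of 1,
+ // keeping the CQE drain loop cheap when tracing is enabled.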
+
+ /// <summary>
+ /// Drains up to <see cref="IoUringConstants.MaxCqeDrainBatch"/> CQEs from the mmap'd
+ /// completion ring and dispatches each based on the user_data tag.
+ /// Tag=2 (reserved completion) entries are dispatched directly through
+ /// <see cref="SocketEventHandler"/>.
+ /// Tag=3 (wakeup signal) entries are handled inline.
+ /// Returns true when at least one CQE was drained.
+ /// </summary>
+ private unsafe bool DrainCqeRingBatch(SocketEventHandler handler)
+ {
+ int eventLoopThreadId = Volatile.Read(ref _eventLoopManagedThreadId);
+ Debug.Assert(eventLoopThreadId == Environment.CurrentManagedThreadId,
+ "DrainCqeRingBatch must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ ObserveManagedCqOverflowCounter();
+ int drained = 0;
+ bool drainedAnyCqe = false;
+ bool enqueuedFallbackEvent = false;
+ uint deferredCqHeadAdvance = 0;
+ IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing;
+ providedBufferRing?.BeginDeferredRecyclePublish();
+ BeginIoUringDrainTelemetryBatch();
+
+ try
+ {
+ while (drained < (int)IoUringConstants.MaxCqeDrainBatch
+ && TryPeekNextCqe(out Interop.Sys.IoUringCqe* cqe, eventLoopThreadId))
+ {
+ drainedAnyCqe = true;
+ ulong userData = cqe->UserData;
+ int result = cqe->Result;
+ uint flags = cqe->Flags;
+
+ if (_cqOverflowRecoveryActive)
+ {
+ // During overflow recovery, publish head movement per CQE so the kernel can
+ // reclaim CQ ring space immediately and avoid extending overflow pressure.
+ AdvanceCqHead(1, eventLoopThreadId);
+ }
+ else
+ {
+ _managedCachedCqHead++;
+ deferredCqHeadAdvance++;
+ }
+
+ byte tag = (byte)(userData >> IoUringUserDataTagShift);
+ ulong payload = userData & IoUringUserDataPayloadMask;
+
+ if (tag == IoUringConstants.TagReservedCompletion)
+ {
+ if ((flags & IoUringConstants.CqeFNotif) != 0)
+ {
+ if (HandleZeroCopyNotification(payload))
+ {
+ handler.DispatchZeroCopyIoUringNotification(payload);
+ }
+
+ drained++;
+ continue;
+ }
+
+ bool isMultishotCompletion = false;
+ if ((flags & IoUringConstants.CqeFMore) != 0)
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ int slotIndex = DecodeCompletionSlotIndex(payload);
+ if (completionEntries is not null &&
+ (uint)slotIndex < (uint)completionEntries.Length)
+ {
+ IoUringCompletionOperationKind kind = completionEntries[slotIndex].Kind;
+ isMultishotCompletion =
+ (kind == IoUringCompletionOperationKind.Message && _ioUringCapabilities.SupportsMultishotRecv) ||
+ (kind == IoUringCompletionOperationKind.Accept && _ioUringCapabilities.SupportsMultishotAccept);
+ }
+ }
+ ResolveReservedCompletionSlotMetadata(
+ payload,
+ isMultishotCompletion,
+ ref result,
+ out int completionSocketAddressLen,
+ out int completionControlBufferLen,
+ out uint completionAuxiliaryData,
+ out bool hasFixedRecvBuffer,
+ out ushort fixedRecvBufferId);
+
+ if (isMultishotCompletion)
+ {
+ // Dispatch expects full tagged user_data so tracked-ownership decode can validate tag+generation.
+ handler.DispatchMultishotIoUringCompletion(
+ userData,
+ result,
+ flags,
+ completionSocketAddressLen,
+ completionControlBufferLen,
+ completionAuxiliaryData,
+ hasFixedRecvBuffer,
+ fixedRecvBufferId,
+ ref enqueuedFallbackEvent);
+ }
+ else
+ {
+ // Dispatch expects full tagged user_data so tracked-ownership decode can validate tag+generation.
+ handler.DispatchSingleIoUringCompletion(
+ userData,
+ result,
+ flags,
+ completionSocketAddressLen,
+ completionControlBufferLen,
+ completionAuxiliaryData,
+ hasFixedRecvBuffer,
+ fixedRecvBufferId,
+ ref enqueuedFallbackEvent);
+ }
+ }
+ else if (tag == IoUringConstants.TagWakeupSignal)
+ {
+ HandleManagedWakeupSignal(result);
+ }
+ else if (tag != IoUringConstants.TagNone)
+ {
+ Debug.Fail($"Unknown io_uring CQE user_data tag: {tag}.");
+ }
+
+ drained++;
+ }
+ }
+ finally
+ {
+ providedBufferRing?.EndDeferredRecyclePublish();
+ FlushIoUringDrainTelemetryBatch();
+ if (deferredCqHeadAdvance != 0 && _managedCqHeadPtr is not null)
+ {
+ Volatile.Write(ref *_managedCqHeadPtr, _managedCachedCqHead);
+ }
+ }
+
+ if (enqueuedFallbackEvent)
+ {
+ EnsureWorkerScheduled();
+ }
+
+ TryCompleteManagedCqOverflowRecovery();
+ AssertCompletionSlotUsageBounded();
+
+ return drainedAnyCqe;
+ }
+
+ /// <summary>
+ /// Resolves metadata for a reserved completion by applying forced test results and
+ /// copying operation-specific completion outputs (accept/recvmsg) from native storage.
+ /// </summary>
+ private void ResolveReservedCompletionSlotMetadata(
+ ulong payload,
+ bool isMultishotCompletion,
+ ref int result,
+ out int completionSocketAddressLen,
+ out int completionControlBufferLen,
+ out uint completionAuxiliaryData,
+ out bool hasFixedRecvBuffer,
+ out ushort fixedRecvBufferId)
+ {
+ completionSocketAddressLen = 0;
+ completionControlBufferLen = 0;
+ completionAuxiliaryData = 0;
+ hasFixedRecvBuffer = false;
+ fixedRecvBufferId = 0;
+
+ int slotIndex = DecodeCompletionSlotIndex(payload);
+ if ((uint)slotIndex >= (uint)_completionSlots!.Length)
+ {
+ return;
+ }
+
+ ref IoUringCompletionSlot slot = ref _completionSlots[slotIndex];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+ ulong completionGeneration = (payload >> IoUringConstants.SlotIndexBits) & IoUringConstants.GenerationMask;
+ if (completionGeneration != slot.Generation)
+ {
+ // Stale CQE for a recycled slot; ignore without mutating current slot state.
+ return;
+ }
+
+ ResolveDebugTestForcedResult(ref slot, ref result);
+
+ if (slot.UsesFixedRecvBuffer)
+ {
+ hasFixedRecvBuffer = true;
+ fixedRecvBufferId = slot.FixedRecvBufferId;
+ slot.UsesFixedRecvBuffer = false;
+ slot.FixedRecvBufferId = 0;
+ Debug.Assert(!isMultishotCompletion, "Fixed-buffer receive completions are expected to be one-shot.");
+ }
+
+ if (slot.Kind == IoUringCompletionOperationKind.Accept &&
+ slotStorage.NativeSocketAddressLengthPtr is not null)
+ {
+ int nativeSocketAddressLength = *slotStorage.NativeSocketAddressLengthPtr;
+ completionAuxiliaryData = nativeSocketAddressLength >= 0 ? (uint)nativeSocketAddressLength : 0u;
+ if (isMultishotCompletion)
+ {
+ int socketAddressCapacity = slotStorage.ReceiveSocketAddressCapacity;
+ if (socketAddressCapacity > 0 && slotStorage.NativeSocketAddress is not null)
+ {
+ Unsafe.InitBlockUnaligned(slotStorage.NativeSocketAddress, 0, (uint)socketAddressCapacity);
+ }
+
+ *slotStorage.NativeSocketAddressLengthPtr = socketAddressCapacity >= 0 ? socketAddressCapacity : 0;
+ }
+ }
+ else if (slot.Kind == IoUringCompletionOperationKind.Message)
+ {
+ CopyMessageCompletionOutputs(
+ slotIndex,
+ out completionSocketAddressLen,
+ out completionControlBufferLen,
+ out completionAuxiliaryData);
+ }
+
+ if (!isMultishotCompletion)
+ {
+ if (!slot.IsZeroCopySend)
+ {
+ FreeCompletionSlot(slotIndex);
+ }
+ else if (result < 0)
+ {
+ // Error completion path may not produce a NOTIF CQE.
+ FreeCompletionSlot(slotIndex);
+ }
+ else if (!slot.ZeroCopyNotificationPending)
+ {
+ // First CQE for zero-copy send: keep slot alive until NOTIF CQE arrives.
+ slot.ZeroCopyNotificationPending = true;
+ AssertZeroCopyNotificationPendingForPayload(payload);
+ }
+ }
+ }
+
+ /// Handles NOTIF CQEs for zero-copy sends and releases retained completion slots.
+ private bool HandleZeroCopyNotification(ulong payload)
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ return false;
+ }
+
+ int slotIndex = DecodeCompletionSlotIndex(payload);
+ if ((uint)slotIndex >= (uint)completionEntries.Length)
+ {
+ return false;
+ }
+
+ ref IoUringCompletionSlot slot = ref completionEntries[slotIndex];
+ ulong completionGeneration = (payload >> IoUringConstants.SlotIndexBits) & IoUringConstants.GenerationMask;
+ if (slot.Generation != completionGeneration)
+ {
+ return false;
+ }
+
+ if (!slot.IsZeroCopySend || !slot.ZeroCopyNotificationPending)
+ {
+ return false;
+ }
+
+ slot.IsZeroCopySend = false;
+ slot.ZeroCopyNotificationPending = false;
+ FreeCompletionSlot(slotIndex);
+ return true;
+ }
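+
+ // SEND_ZC lifecycle recap for the handling above (two CQEs per operation, per the
+ // io_uring zero-copy send contract):
+ //   1. data CQE: res == bytes sent, IORING_CQE_F_MORE set; the completion slot stays
+ //      alive with ZeroCopyNotificationPending == true (set in ResolveReservedCompletionSlotMetadata).
+ //   2. NOTIF CQE: IORING_CQE_F_NOTIF set; HandleZeroCopyNotification frees the slot once
+ //      the kernel/NIC no longer references the payload.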
+
+ /// Returns true when the completion slot for <paramref name="userData"/> is waiting on SEND_ZC NOTIF.
+ private bool IsZeroCopyNotificationPending(ulong userData)
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ return false;
+ }
+
+ int slotIndex = DecodeCompletionSlotIndex(userData & IoUringUserDataPayloadMask);
+ if ((uint)slotIndex >= (uint)completionEntries.Length)
+ {
+ return false;
+ }
+
+ ref IoUringCompletionSlot slot = ref completionEntries[slotIndex];
+ return slot.IsZeroCopySend && slot.ZeroCopyNotificationPending;
+ }
+
+ /// <summary>
+ /// Releases a deferred SEND_ZC completion slot when dispatch cannot reattach ownership.
+ /// </summary>
+ private bool TryCleanupDeferredZeroCopyCompletionSlot(ulong userData)
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ return false;
+ }
+
+ int slotIndex = DecodeCompletionSlotIndex(userData & IoUringUserDataPayloadMask);
+ if ((uint)slotIndex >= (uint)completionEntries.Length)
+ {
+ return false;
+ }
+
+ ref IoUringCompletionSlot slot = ref completionEntries[slotIndex];
+ ulong completionGeneration = ((userData & IoUringUserDataPayloadMask) >> IoUringConstants.SlotIndexBits) & IoUringConstants.GenerationMask;
+ if (slot.Generation != completionGeneration)
+ {
+ return false;
+ }
+
+ if (!slot.IsZeroCopySend || !slot.ZeroCopyNotificationPending)
+ {
+ return false;
+ }
+
+ slot.IsZeroCopySend = false;
+ slot.ZeroCopyNotificationPending = false;
+ FreeCompletionSlot(slotIndex);
+ return true;
+ }
+
+ /// Debug assertion that a reserved completion payload remains armed for SEND_ZC NOTIF.
+ [Conditional("DEBUG")]
+ private void AssertZeroCopyNotificationPendingForPayload(ulong payload)
+ {
+ ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
+ Debug.Assert(
+ IsZeroCopyNotificationPending(userData),
+ "SEND_ZC first CQE must leave the completion slot pending until NOTIF CQE arrives.");
+ }
+
+ /// Debug assertion that SEND_ZC completion dispatch is deferred until NOTIF arrives.
+ [Conditional("DEBUG")]
+ private void AssertZeroCopyDeferredCompletionState(ulong userData, SocketAsyncContext.AsyncOperation operation)
+ {
+ Debug.Assert(
+ operation.IoUringUserData == userData,
+ "Deferred SEND_ZC completion must retain the original user_data until NOTIF CQE dispatch.");
+ Debug.Assert(
+ IsZeroCopyNotificationPending(userData),
+ "Deferred SEND_ZC completion requires an armed NOTIF state.");
+ }
+
+ /// Observes kernel CQ overflow count deltas and emits telemetry/logs.
+ private unsafe void ObserveManagedCqOverflowCounter()
+ {
+ if (_managedCqOverflowPtr is null)
+ {
+ return;
+ }
+
+ uint observedOverflow = Volatile.Read(ref *_managedCqOverflowPtr);
+ uint previousOverflow = _managedObservedCqOverflow;
+ // The kernel counter is uint32 and wraps; compare via wrapped delta instead of monotonic ordering.
+ uint delta = unchecked(observedOverflow - previousOverflow);
+ if (delta == 0)
+ {
+ return;
+ }
+
+ _managedObservedCqOverflow = observedOverflow;
+ SocketsTelemetry.Log.IoUringCqOverflow(delta);
+ // Defer stale-tracked sweep scheduling until recovery completes.
+ Volatile.Write(ref _cqOverflowTrackedSweepDeadlineTicks, 0);
+ _cqOverflowTrackedSweepRearmCount = 0;
+
+ IoUringCqOverflowRecoveryBranch branch = _cqOverflowRecoveryActive ?
+ IoUringCqOverflowRecoveryBranch.DualWave :
+ DetermineCqOverflowRecoveryBranchAtEntry();
+ _cqOverflowRecoveryActive = true;
+ _cqOverflowRecoveryBranch = branch;
+ AssertLiveAcceptSlotsRemainTrackedDuringRecovery(branch);
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringCqOverflow(observedOverflow, delta);
+ LogIoUringCqOverflowRecoveryEntry(branch, observedOverflow, delta);
+ }
+ }
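+
+ // Wrapped-delta example for the uint32 overflow counter read above: with
+ // previousOverflow == 0xFFFF_FFFE and observedOverflow == 0x0000_0001,
+ // unchecked(observed - previous) == 3, correctly counting the three increments
+ // that crossed the wrap boundary where a monotonic (>) comparison would miss them.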
+
+ /// Determines the initial recovery branch discriminator for a newly observed CQ overflow.
+ private IoUringCqOverflowRecoveryBranch DetermineCqOverflowRecoveryBranchAtEntry()
+ {
+ if (Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ return IoUringCqOverflowRecoveryBranch.Teardown;
+ }
+
+ if (_ioUringCapabilities.SupportsMultishotAccept &&
+ HasLiveAcceptCompletionSlot())
+ {
+ return IoUringCqOverflowRecoveryBranch.MultishotAcceptArming;
+ }
+
+ return IoUringCqOverflowRecoveryBranch.DualWave;
+ }
+
+ /// Returns true when at least one active completion slot is currently tracking accept metadata.
+ private bool HasLiveAcceptCompletionSlot()
+ {
+ // Keep this O(1): CQ-overflow branch selection can run frequently on the event loop hot path.
+ int liveAcceptCount = Volatile.Read(ref _liveAcceptCompletionSlotCount);
+ Debug.Assert(liveAcceptCount >= 0);
+ return liveAcceptCount != 0;
+ }
+
+ /// <summary>
+ /// Completes CQ-overflow recovery once the ring is drained and no additional overflow increments are observed.
+ /// Recovery is best-effort: dropped CQEs cannot be reconstructed, so this only restores steady-state draining.
+ /// </summary>
+ private unsafe void TryCompleteManagedCqOverflowRecovery()
+ {
+ if (!_cqOverflowRecoveryActive ||
+ _managedCqOverflowPtr is null ||
+ _managedCqTailPtr is null)
+ {
+ return;
+ }
+
+ uint cqTail = Volatile.Read(ref *_managedCqTailPtr);
+ if (_managedCachedCqHead != cqTail)
+ {
+ return;
+ }
+
+ if (Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ _cqOverflowRecoveryBranch = IoUringCqOverflowRecoveryBranch.Teardown;
+ }
+
+ uint observedOverflow = Volatile.Read(ref *_managedCqOverflowPtr);
+ // The kernel counter is uint32 and wraps; compare via wrapped subtraction.
+ uint delta = unchecked(observedOverflow - _managedObservedCqOverflow);
+ if (delta > 0)
+ {
+ _managedObservedCqOverflow = observedOverflow;
+ if (_cqOverflowRecoveryBranch != IoUringCqOverflowRecoveryBranch.Teardown)
+ {
+ _cqOverflowRecoveryBranch = IoUringCqOverflowRecoveryBranch.DualWave;
+ }
+ SocketsTelemetry.Log.IoUringCqOverflow(delta);
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringCqOverflow(observedOverflow, delta);
+ LogIoUringCqOverflowRecoveryEntry(
+ _cqOverflowRecoveryBranch,
+ observedOverflow,
+ delta);
+ }
+
+ return;
+ }
+
+ _cqOverflowRecoveryActive = false;
+ _cqOverflowTrackedSweepRearmCount = 0;
+ Volatile.Write(
+ ref _cqOverflowTrackedSweepDeadlineTicks,
+ Environment.TickCount64 + CqOverflowTrackedSweepDelayMilliseconds);
+ SocketsTelemetry.Log.IoUringCqOverflowRecovery(1);
+ if (_cqOverflowRecoveryBranch == IoUringCqOverflowRecoveryBranch.MultishotAcceptArming)
+ {
+ // Phase 1 spec branch (a): if CQ overflow occurs while multishot accept is live,
+ // defer re-arm nudges until after drain completes instead of discarding active state.
+ TryQueueDeferredMultishotAcceptRearmAfterRecovery();
+ }
+ AssertCompletionSlotPoolConsistency();
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringCqOverflowRecoveryCompleted(
+ _cqOverflowRecoveryBranch,
+ _completionSlotsInUse);
+ }
+ }
+
+ /// <summary>
+ /// After CQ-overflow recovery completes, performs a delayed sweep to retire tracked operations
+ /// that remain attached despite already transitioning out of the waiting state.
+ /// </summary>
+ private void TrySweepStaleTrackedIoUringOperationsAfterCqOverflowRecovery()
+ {
+ if (!_ioUringCapabilities.IsCompletionMode ||
+ _cqOverflowRecoveryActive ||
+ !IsCurrentThreadEventLoopThread())
+ {
+ return;
+ }
+
+ long deadline = Volatile.Read(ref _cqOverflowTrackedSweepDeadlineTicks);
+ if (deadline == 0 ||
+ unchecked(Environment.TickCount64 - deadline) < 0)
+ {
+ return;
+ }
+
+ // Consume the deadline before the sweep; follow-up work can re-arm it.
+ Volatile.Write(ref _cqOverflowTrackedSweepDeadlineTicks, 0);
+
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ IoUringTrackedOperationState[]? trackedOperations = _trackedOperations;
+ if (completionEntries is null ||
+ trackedOperations is null ||
+ trackedOperations.Length != completionEntries.Length ||
+ IsIoUringTrackingEmpty())
+ {
+ return;
+ }
+
+ int detachedCount = 0;
+ int canceledWaitingCount = 0;
+
+ for (int slotIndex = 0; slotIndex < trackedOperations.Length; slotIndex++)
+ {
+ ref IoUringTrackedOperationState trackedState = ref trackedOperations[slotIndex];
+ SocketAsyncContext.AsyncOperation? operation = Volatile.Read(ref trackedState.TrackedOperation);
+ if (operation is null)
+ {
+ continue;
+ }
+
+ ulong generation = Volatile.Read(ref trackedState.TrackedOperationGeneration);
+ if (generation == 0)
+ {
+ continue;
+ }
+
+ ulong payload = EncodeCompletionSlotUserData(slotIndex, generation);
+ ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
+ if (operation.IoUringUserData != userData)
+ {
+ continue;
+ }
+
+ IoUringCompletionOperationKind kind = completionEntries[slotIndex].Kind;
+ if (ShouldSkipCqOverflowTrackedSweep(operation, userData, kind))
+ {
+ continue;
+ }
+
+ if (operation.IsInWaitingState())
+ {
+ if (operation.TryCancel())
+ {
+ canceledWaitingCount++;
+ }
+
+ continue;
+ }
+
+ if (TryUntrackTrackedIoUringOperation(userData, operation, out SocketAsyncContext.AsyncOperation? removedOperation) != IoUringTrackedOperationRemoveResult.Removed ||
+ removedOperation is null)
+ {
+ continue;
+ }
+
+ removedOperation.ClearIoUringUserData();
+ FreeCompletionSlot(slotIndex);
+ detachedCount++;
+ }
+
+ // Sweep for orphaned SEND_ZC completion slots whose NOTIF CQE was lost to CQ overflow.
+ int zeroCopyOrphanCount = SweepOrphanedZeroCopyNotificationSlots(completionEntries, trackedOperations);
+
+ int totalDrainRecovery = detachedCount + zeroCopyOrphanCount;
+ if (totalDrainRecovery != 0)
+ {
+ SocketsTelemetry.Log.IoUringCompletionSlotDrainRecovery(totalDrainRecovery);
+ }
+
+ if (canceledWaitingCount != 0)
+ {
+ if (_cqOverflowTrackedSweepRearmCount < CqOverflowTrackedSweepMaxRearms)
+ {
+ _cqOverflowTrackedSweepRearmCount++;
+ Volatile.Write(
+ ref _cqOverflowTrackedSweepDeadlineTicks,
+ Environment.TickCount64 + CqOverflowTrackedSweepDelayMilliseconds);
+ }
+ else if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringCqOverflowTrackedSweepRearmLimitReached(
+ CqOverflowTrackedSweepMaxRearms,
+ canceledWaitingCount);
+ }
+ }
+ else
+ {
+ _cqOverflowTrackedSweepRearmCount = 0;
+ }
+
+ if (NetEventSource.Log.IsEnabled() &&
+ (detachedCount != 0 || canceledWaitingCount != 0))
+ {
+ LogIoUringCqOverflowTrackedSweepResult(detachedCount, canceledWaitingCount);
+ }
+ }
+
+ /// <summary>
+ /// Scans completion slots for SEND_ZC entries stuck in ZeroCopyNotificationPending state
+ /// with no corresponding tracked operation, indicating a lost NOTIF CQE from CQ overflow.
+ /// </summary>
+ private int SweepOrphanedZeroCopyNotificationSlots(
+ IoUringCompletionSlot[] completionEntries,
+ IoUringTrackedOperationState[] trackedOperations)
+ {
+ int freedCount = 0;
+ for (int slotIndex = 0; slotIndex < completionEntries.Length; slotIndex++)
+ {
+ ref IoUringCompletionSlot slot = ref completionEntries[slotIndex];
+ if (!slot.IsZeroCopySend || !slot.ZeroCopyNotificationPending)
+ {
+ continue;
+ }
+
+ // The slot is waiting for a NOTIF CQE. Check whether any tracked operation
+ // still references this slot. If not, the first CQE was already processed and
+ // the operation was completed/dispatched, meaning the NOTIF CQE is the only
+ // thing keeping this slot alive -- and it was lost to CQ overflow.
+ ref IoUringTrackedOperationState trackedState = ref trackedOperations[slotIndex];
+ if (Volatile.Read(ref trackedState.TrackedOperation) is not null)
+ {
+ continue;
+ }
+
+ // Orphaned: NOTIF-pending with no tracked operation. Force-free the slot.
+ slot.IsZeroCopySend = false;
+ slot.ZeroCopyNotificationPending = false;
+ FreeCompletionSlot(slotIndex);
+ freedCount++;
+ }
+
+ if (freedCount != 0 && NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Info(
+ this,
+ $"io_uring CQ overflow recovery: freed {freedCount} orphaned SEND_ZC NOTIF-pending completion slot(s).");
+ }
+
+ return freedCount;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool ShouldSkipCqOverflowTrackedSweep(
+ SocketAsyncContext.AsyncOperation operation,
+ ulong userData,
+ IoUringCompletionOperationKind kind)
+ {
+ SocketAsyncContext context = operation.AssociatedContext;
+
+ if (kind == IoUringCompletionOperationKind.Accept &&
+ context.IsMultishotAcceptArmed &&
+ context.MultishotAcceptUserData == userData)
+ {
+ // Active multishot accept slots are intentionally long-lived.
+ return true;
+ }
+
+ if (kind == IoUringCompletionOperationKind.Message &&
+ context.IsPersistentMultishotRecvArmed() &&
+ context.PersistentMultishotRecvUserData == userData)
+ {
+ // Persistent multishot recv slots are intentionally long-lived.
+ return true;
+ }
+
+ return false;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringCqOverflowTrackedSweepResult(int detachedCount, int canceledWaitingCount)
+ {
+ NetEventSource.Info(
+ this,
+ $"io_uring CQ overflow stale-tracked sweep: detached={detachedCount}, canceledWaiting={canceledWaitingCount}");
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringCqOverflowTrackedSweepRearmLimitReached(int maxRearms, int canceledWaitingCount)
+ {
+ NetEventSource.Info(
+ this,
+ $"io_uring CQ overflow stale-tracked sweep rearm limit reached: maxRearms={maxRearms}, canceledWaiting={canceledWaitingCount}");
+ }
+
+ /// Debug assertion for Phase-1 branch (a): live multishot-accept slots must remain tracked during recovery.
+ [Conditional("DEBUG")]
+ private void AssertLiveAcceptSlotsRemainTrackedDuringRecovery(IoUringCqOverflowRecoveryBranch branch)
+ {
+ if (branch != IoUringCqOverflowRecoveryBranch.MultishotAcceptArming)
+ {
+ return;
+ }
+
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ return;
+ }
+
+ bool foundTrackedAccept = false;
+ for (int i = 0; i < completionEntries.Length; i++)
+ {
+ if (completionEntries[i].Kind != IoUringCompletionOperationKind.Accept)
+ {
+ continue;
+ }
+
+ ulong payload = EncodeCompletionSlotUserData(i, completionEntries[i].Generation);
+ ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
+ if (ContainsTrackedIoUringOperation(userData))
+ {
+ foundTrackedAccept = true;
+ break;
+ }
+ }
+
+ Debug.Assert(
+ foundTrackedAccept,
+ "CQ-overflow recovery branch (a) requires at least one live tracked multishot-accept slot.");
+ }
+
+ /// <summary>
+ /// After overflow recovery completes, nudges accept contexts with live multishot accept state
+ /// so the managed accept pipeline can resume dequeue/prepare flow.
+ /// </summary>
+ private void TryQueueDeferredMultishotAcceptRearmAfterRecovery()
+ {
+ if (!_ioUringCapabilities.SupportsMultishotAccept ||
+ Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ return;
+ }
+
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ return;
+ }
+
+ bool queuedAnyRearmNudge = false;
+ for (int slotIndex = 0; slotIndex < completionEntries.Length; slotIndex++)
+ {
+ if (completionEntries[slotIndex].Kind != IoUringCompletionOperationKind.Accept)
+ {
+ continue;
+ }
+
+ ulong payload = EncodeCompletionSlotUserData(slotIndex, completionEntries[slotIndex].Generation);
+ ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
+ if (!TryGetTrackedIoUringOperation(userData, out SocketAsyncContext.AsyncOperation? operation) ||
+ operation is not SocketAsyncContext.AcceptOperation acceptOperation)
+ {
+ continue;
+ }
+
+ SocketAsyncContext context = acceptOperation.AssociatedContext;
+ if (!context.IsMultishotAcceptArmed ||
+ context.MultishotAcceptUserData != userData)
+ {
+ continue;
+ }
+
+ EnqueueReadinessFallbackEvent(context, Interop.Sys.SocketEvents.Read);
+ queuedAnyRearmNudge = true;
+ }
+
+ if (queuedAnyRearmNudge && NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringDeferredMultishotAcceptRearmAfterRecovery();
+ }
+ }
+
+ /// <summary>
+ /// Handles a wakeup signal CQE by consuming the eventfd counter.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe void HandleManagedWakeupSignal(int cqeResult)
+ {
+ if (cqeResult >= 0 && _managedWakeupEventFd >= 0)
+ {
+ ulong value;
+ Interop.Error readError = Interop.Sys.IoUringShimReadEventFd(_managedWakeupEventFd, &value);
+ if (readError != Interop.Error.SUCCESS &&
+ readError != Interop.Error.EAGAIN &&
+ NetEventSource.Log.IsEnabled())
+ {
+ LogWakeupReadFailure(this, readError);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static void LogWakeupReadFailure(SocketAsyncEngine engine, Interop.Error readErrorCode)
+ {
+ NetEventSource.Error(engine, $"io_uring wakeup eventfd read failed: error={readErrorCode}");
+ }
+ }
+
+
+ private const int FdCloexec = 1;
+
+ /// io_uring completion mode does not use socket event registration updates.
+ partial void LinuxTryChangeSocketEventRegistration(
+ IntPtr socketHandle,
+ Interop.Sys.SocketEvents currentEvents,
+ Interop.Sys.SocketEvents newEvents,
+ int data,
+ ref Interop.Error error,
+ ref bool handled)
+ {
+ if (!Volatile.Read(ref _ioUringInitialized))
+ {
+ return;
+ }
+
+ handled = true;
+ error = Interop.Error.SUCCESS;
+ }
+
+ private static bool TrySetFdCloseOnExec(int fd, out Interop.Error error)
+ {
+ int currentFlags = Interop.Sys.Fcntl.GetFD((IntPtr)fd);
+ if (currentFlags < 0)
+ {
+ error = Interop.Sys.GetLastErrorInfo().Error;
+ return false;
+ }
+
+ int updatedFlags = currentFlags | FdCloexec;
+ if (updatedFlags == currentFlags)
+ {
+ error = Interop.Error.SUCCESS;
+ return true;
+ }
+
+ if (Interop.Sys.Fcntl.SetFD((IntPtr)fd, updatedFlags) == 0)
+ {
+ error = Interop.Error.SUCCESS;
+ return true;
+ }
+
+ error = Interop.Sys.GetLastErrorInfo().Error;
+ return false;
+ }
+
+ /// <summary>
+ /// Probes the kernel for supported io_uring opcodes using IORING_REGISTER_PROBE and
+ /// populates the per-opcode _supportsOp* capability flags.
+ /// When the probe syscall is unavailable (older kernels), all flags remain at their
+ /// default value (<see langword="false"/>).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe void ProbeIoUringOpcodeSupport(int ringFd)
+ {
+ // Probe buffer: 16-byte header + 256 * 8-byte ops = 2064 bytes.
+ const int maxOps = 256;
+ const int probeSize = 16 + maxOps * 8;
+ byte* probeBuffer = stackalloc byte[probeSize];
+ new Span<byte>(probeBuffer, probeSize).Clear();
+
+ int result;
+ Interop.Error err = Interop.Sys.IoUringShimRegister(
+ ringFd, IoUringConstants.RegisterProbe, probeBuffer, (uint)maxOps, &result);
+
+ if (err != Interop.Error.SUCCESS)
+ {
+ // Probe not supported (for example older kernels): per-opcode flags remain false.
+ // Direct SQE prep does not gate on these flags; this mainly affects optional feature light-up.
+ return;
+ }
+
+ // Parse: ops start at offset 16, each is 8 bytes.
+ IoUringProbeOp* ops = (IoUringProbeOp*)(probeBuffer + 16);
+ IoUringProbeHeader* header = (IoUringProbeHeader*)probeBuffer;
+ int opsCount = Math.Min((int)header->OpsLen, maxOps);
+
+ _supportsOpReadFixed = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.ReadFixed);
+ _supportsOpSend = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Send);
+ _supportsOpRecv = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Recv);
+ _supportsOpSendMsg = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendMsg);
+ _supportsOpRecvMsg = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.RecvMsg);
+ _supportsOpAccept = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Accept);
+ _supportsOpConnect = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Connect);
+ _supportsOpSendZc = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendZc);
+ _supportsOpSendMsgZc = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendMsgZc);
+ _zeroCopySendEnabled = _supportsOpSendZc && IsZeroCopySendOptedIn();
+ _supportsOpAsyncCancel = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.AsyncCancel);
+ _supportsMultishotAccept = _supportsOpAccept;
+ RefreshIoUringMultishotRecvSupport();
+ }
+
+ /// Checks whether a specific opcode is supported by the kernel's io_uring probe result.
+ private static unsafe bool IsOpcodeSupported(IoUringProbeOp* ops, int opsCount, byte opcode)
+ {
+ if (opcode >= opsCount) return false;
+ return (ops[opcode].Flags & IoUringConstants.ProbeOpFlagSupported) != 0;
+ }
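+
+ // For reference, the probe layout the parsing above assumes mirrors
+ // struct io_uring_probe / io_uring_probe_op from <linux/io_uring.h>:
+ //   header: __u8 last_op; __u8 ops_len; __u16 resv; __u32 resv2[3];  // 16 bytes
+ //   op:     __u8 op;      __u8 resv;    __u16 flags; __u32 resv2;    // 8 bytes
+ // IsOpcodeSupported indexes ops[] by opcode and tests the supported bit
+ // (IO_URING_OP_SUPPORTED) in flags.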
+
+ /// Publishes the managed SQ tail pointer to make queued SQEs visible to the kernel.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe void PublishManagedSqeTail()
+ {
+ if (!_ioUringManagedSqTailLoaded || _ioUringSqRingInfo.SqTailPtr == IntPtr.Zero)
+ {
+ return;
+ }
+
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "PublishManagedSqeTail must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ ref uint sqTailRef = ref Unsafe.AsRef<uint>((void*)_ioUringSqRingInfo.SqTailPtr);
+ Volatile.Write(ref sqTailRef, _ioUringManagedSqTail);
+ _ioUringManagedSqTailLoaded = false;
+ }
+
+ /// <summary>
+ /// Returns true when the SQPOLL kernel thread has gone idle and needs an explicit wakeup.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe bool SqNeedWakeup()
+ {
+ Debug.Assert(_sqPollEnabled, "SqNeedWakeup should only be checked in SQPOLL mode.");
+ if (_managedSqFlagsPtr == null)
+ {
+ return true;
+ }
+
+ return (Volatile.Read(ref *_managedSqFlagsPtr) & IoUringConstants.SqNeedWakeup) != 0;
+ }
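+
+ // SQPOLL interaction sketch: after publishing new SQEs, the submitter only needs an
+ // io_uring_enter(2) syscall when the polling kthread has gone idle, which the kernel
+ // signals by setting IORING_SQ_NEED_WAKEUP in the SQ flags word; the wakeup itself is
+ // requested by passing IORING_ENTER_SQ_WAKEUP to io_uring_enter.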
+
+ /// Allocates the next available SQE slot from the submission ring.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe bool TryGetNextManagedSqe(out IoUringSqe* sqe)
+ {
+ sqe = null;
+ if (!_ioUringDirectSqeEnabled)
+ {
+ return false;
+ }
+
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "TryGetNextManagedSqe must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ if (!_managedSqeInvariantsValidated)
+ {
+ return false;
+ }
+
+ ref Interop.Sys.IoUringSqRingInfo ringInfo = ref _ioUringSqRingInfo;
+ Debug.Assert(ringInfo.SqeBase != IntPtr.Zero);
+ Debug.Assert(ringInfo.SqHeadPtr != IntPtr.Zero);
+ Debug.Assert(ringInfo.SqTailPtr != IntPtr.Zero);
+ Debug.Assert(ringInfo.SqEntries != 0);
+ Debug.Assert(ringInfo.SqeSize == (uint)sizeof(IoUringSqe));
+
+ ref uint sqHeadRef = ref Unsafe.AsRef<uint>((void*)ringInfo.SqHeadPtr);
+ uint sqHead = Volatile.Read(ref sqHeadRef);
+ if (!_ioUringManagedSqTailLoaded)
+ {
+ ref uint sqTailRef = ref Unsafe.AsRef<uint>((void*)ringInfo.SqTailPtr);
+ _ioUringManagedSqTail = Volatile.Read(ref sqTailRef);
+ _ioUringManagedSqTailLoaded = true;
+ }
+
+ uint sqTail = _ioUringManagedSqTail;
+ if (sqTail - sqHead >= ringInfo.SqEntries)
+ {
+ return false;
+ }
+
+ uint index = sqTail & ringInfo.SqMask;
+ nint sqeOffset = checked((nint)((nuint)index * ringInfo.SqeSize));
+ sqe = (IoUringSqe*)((byte*)ringInfo.SqeBase + sqeOffset);
+ // Managed direct-SQE preparation in System.Net.Sockets is socket-opcode-only. Clearing the full
+ // SQE is safe because every opcode we emit initializes all fields it relies on.
+ Unsafe.WriteUnaligned(sqe, default(IoUringSqe));
+ _ioUringManagedSqTail = sqTail + 1;
+ _ioUringManagedPendingSubmissions++;
+ return true;
+ }
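+
+ // Ring-full arithmetic example for the check above: with SqEntries == 8, sqHead == 5
+ // and sqTail == 13, sqTail - sqHead == 8 >= SqEntries, so the ring is full even though
+ // both uint counters grow without bound; (sqTail & SqMask) maps the unbounded counter
+ // onto the 8 physical SQE slots.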
+
+ /// Validates immutable SQ ring invariants once at initialization.
+ private bool ValidateManagedSqeInitializationInvariants()
+ {
+ ref Interop.Sys.IoUringSqRingInfo ringInfo = ref _ioUringSqRingInfo;
+ if (ringInfo.SqeBase == IntPtr.Zero ||
+ ringInfo.SqHeadPtr == IntPtr.Zero ||
+ ringInfo.SqTailPtr == IntPtr.Zero ||
+ ringInfo.SqEntries == 0)
+ {
+ return false;
+ }
+
+ if (ringInfo.SqeSize != (uint)sizeof(IoUringSqe))
+ {
+ Debug.Fail($"Unexpected io_uring SQE size. Expected {sizeof(IoUringSqe)}, got {ringInfo.SqeSize}.");
+ return false;
+ }
+
+ return true;
+ }
+
+ /// Attempts to acquire an SQE, retrying with intermediate submits on ring full.
+ private unsafe bool TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out Interop.Error submitError)
+ {
+ sqe = null;
+ submitError = Interop.Error.SUCCESS;
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "TryAcquireManagedSqeWithRetry must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ SocketEventHandler drainHandler = default;
+ bool drainHandlerInitialized = false;
+
+ for (int attempt = 0; attempt < MaxIoUringSqeAcquireSubmitAttempts; attempt++)
+ {
+ if (TryGetNextManagedSqe(out sqe))
+ {
+ return true;
+ }
+
+ // Before retrying submission, run a CQ drain pass so completions can release
+ // slots and unblock kernel forward progress. The overflow counter is observed
+ // during drain; do not assume a single pass fully clears overflow pressure.
+ if (_managedCqDrainEnabled &&
+ _managedCqOverflowPtr is not null &&
+ _completionSlotsInUse != 0)
+ {
+ if (!drainHandlerInitialized)
+ {
+ drainHandler = new SocketEventHandler(this);
+ drainHandlerInitialized = true;
+ }
+ _ = DrainCqeRingBatch(drainHandler);
+
+ if (TryGetNextManagedSqe(out sqe))
+ {
+ return true;
+ }
+ }
+
+ submitError = SubmitIoUringOperationsNormalized();
+ if (submitError != Interop.Error.SUCCESS)
+ {
+ return false;
+ }
+ }
+
+ submitError = Interop.Error.EAGAIN;
+ return false;
+ }
+
+ /// <summary>
+ /// Common setup for direct SQE preparation: allocates a completion slot, encodes user data,
+ /// resolves the socket fd/flags, applies test hooks, and acquires an SQE. On failure,
+ /// restores test state and frees the slot.
+ /// </summary>
+ private unsafe struct IoUringDirectSqeSetupResult
+ {
+ public SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult PrepareResult;
+ public int SlotIndex;
+ public ulong UserData;
+ public int SqeFd;
+ public byte SqeFlags;
+ public IoUringSqe* Sqe;
+ public SocketError ErrorCode;
+ }
+
+ /// <summary>
+ /// Prepares a direct SQE and returns all setup data as a single struct to avoid large
+ /// out-parameter callsites in per-opcode prepare paths.
+ /// </summary>
+ /// <returns>
+ /// <see cref="SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared"/> if the SQE was acquired
+ /// (the caller must write the SQE and return Prepared),
+ /// or a terminal result (Unsupported/PrepareFailed) that the caller should return directly.
+ /// </returns>
+ private unsafe IoUringDirectSqeSetupResult TrySetupDirectSqe(
+ SafeSocketHandle socket,
+ byte opcode)
+ {
+ IoUringDirectSqeSetupResult setup = default;
+ setup.SlotIndex = -1;
+ setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ setup.ErrorCode = SocketError.Success;
+
+ if (!_ioUringDirectSqeEnabled)
+ {
+ return setup;
+ }
+
+ int slotIndex = AllocateCompletionSlot();
+ if (slotIndex < 0)
+ {
+ RecordIoUringCompletionSlotExhaustion();
+
+ if (!_completionSlotDrainInProgress)
+ {
+ _completionSlotDrainInProgress = true;
+ try
+ {
+ SocketEventHandler handler = new SocketEventHandler(this);
+ if (DrainCqeRingBatch(handler))
+ {
+ slotIndex = AllocateCompletionSlot();
+ }
+ }
+ finally
+ {
+ _completionSlotDrainInProgress = false;
+ }
+ }
+
+ if (slotIndex < 0)
+ {
+ return setup;
+ }
+
+ RecordIoUringCompletionSlotDrainRecovery();
+ }
+
+ setup.SlotIndex = slotIndex;
+ ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+ setup.UserData = EncodeCompletionSlotUserData(slotIndex, slot.Generation);
+
+ bool addedSocketRef = false;
+ try
+ {
+ // Keep the fd alive from SQE prep through CQE retirement to avoid fd-reuse races after close.
+ socket.DangerousAddRef(ref addedSocketRef);
+ }
+ catch (ObjectDisposedException)
+ {
+ FreeCompletionSlot(slotIndex);
+ setup.SlotIndex = -1;
+ setup.ErrorCode = SocketError.OperationAborted;
+ setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.PrepareFailed;
+ return setup;
+ }
+
+ if (!addedSocketRef)
+ {
+ FreeCompletionSlot(slotIndex);
+ setup.SlotIndex = -1;
+ setup.ErrorCode = SocketError.OperationAborted;
+ setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.PrepareFailed;
+ return setup;
+ }
+
+ slotStorage.DangerousRefSocketHandle = socket;
+ // GC/rooting contract for fd lifetime:
+ // Engine -> _completionSlotStorage[slotIndex].DangerousRefSocketHandle -> SafeSocketHandle.
+ // Keep this chain alive across SQE submission through CQE retirement to avoid fd reuse races.
+ SafeSocketHandle? rootedSocket = slotStorage.DangerousRefSocketHandle;
+ Debug.Assert(rootedSocket != null);
+ int socketFd = (int)(nint)rootedSocket!.DangerousGetHandle();
+ ConfigureSocketSqeFdAndFlags(socketFd, out setup.SqeFd, out setup.SqeFlags);
+ ApplyDebugTestForcedResult(ref slot, opcode);
+
+ if (!TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out Interop.Error submitError))
+ {
+ RestoreDebugTestForcedResultIfNeeded(slotIndex, opcode);
+ FreeCompletionSlot(slotIndex);
+ setup.SlotIndex = -1;
+
+ if (submitError == Interop.Error.SUCCESS ||
+ submitError == Interop.Error.EAGAIN ||
+ submitError == Interop.Error.EWOULDBLOCK)
+ {
+ return setup;
+ }
+
+ setup.ErrorCode = SocketPal.GetSocketErrorForErrorCode(submitError);
+ setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.PrepareFailed;
+ return setup;
+ }
+
+ setup.Sqe = sqe;
+ setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ return setup;
+ }
+
+ /// Prepares a send SQE via the managed direct path.
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSend(
+ SafeSocketHandle socket,
+ byte* buffer,
+ int bufferLen,
+ SocketFlags flags,
+ out ulong userData,
+ out SocketError errorCode)
+ {
+ userData = 0;
+ errorCode = SocketError.Success;
+
+ if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+ {
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ }
+
+ IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.Send);
+ if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+ {
+ errorCode = setup.ErrorCode;
+ return setup.PrepareResult;
+ }
+
+ WriteSendLikeSqe(setup.Sqe, IoUringOpcodes.Send, setup.SqeFd, setup.SqeFlags, setup.UserData, buffer, (uint)bufferLen, rwFlags);
+ userData = setup.UserData;
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// <summary>
+ /// Prepares a send SQE, preferring SEND_ZC when eligible and falling back to SEND when unavailable.
+ /// </summary>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendWithZeroCopyFallback(
+ SafeSocketHandle socket,
+ byte* buffer,
+ int bufferLen,
+ SocketFlags flags,
+ out bool usedZeroCopy,
+ out ulong userData,
+ out SocketError errorCode)
+ {
+ usedZeroCopy = false;
+ if (ShouldTryIoUringDirectSendZeroCopy(bufferLen))
+ {
+ SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult zeroCopyResult = TryPrepareIoUringDirectSendZc(
+ socket,
+ buffer,
+ bufferLen,
+ flags,
+ out userData,
+ out errorCode);
+ if (zeroCopyResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported)
+ {
+ usedZeroCopy = zeroCopyResult == SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ return zeroCopyResult;
+ }
+ }
+
+ return TryPrepareIoUringDirectSend(
+ socket,
+ buffer,
+ bufferLen,
+ flags,
+ out userData,
+ out errorCode);
+ }
+
+ /// Prepares a zero-copy send SQE via the managed direct path.
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendZc(
+ SafeSocketHandle socket,
+ byte* buffer,
+ int bufferLen,
+ SocketFlags flags,
+ out ulong userData,
+ out SocketError errorCode)
+ {
+ userData = 0;
+ errorCode = SocketError.Success;
+
+ if (!ShouldTryIoUringDirectSendZeroCopy(bufferLen))
+ {
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ }
+
+ if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+ {
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ }
+
+ IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.SendZc);
+ if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+ {
+ errorCode = setup.ErrorCode;
+ return setup.PrepareResult;
+ }
+
+ ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex];
+ slot.IsZeroCopySend = true;
+ slot.ZeroCopyNotificationPending = false;
+
+ WriteSendLikeSqe(setup.Sqe, IoUringOpcodes.SendZc, setup.SqeFd, setup.SqeFlags, setup.UserData, buffer, (uint)bufferLen, rwFlags);
+ userData = setup.UserData;
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// Prepares a recv SQE via the managed direct path.
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectRecv(
+ SafeSocketHandle socket,
+ byte* buffer,
+ int bufferLen,
+ SocketFlags flags,
+ bool allowMultishotRecv,
+ out ulong userData,
+ out SocketError errorCode)
+ {
+ userData = 0;
+ errorCode = SocketError.Success;
+
+ if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+ {
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ }
+
+ IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.Recv);
+ if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+ {
+ errorCode = setup.ErrorCode;
+ return setup.PrepareResult;
+ }
+
+ if (ShouldTryIoUringDirectFixedRecv(flags, allowMultishotRecv, bufferLen) &&
+ TryPrepareIoUringDirectRecvFixed(setup.SlotIndex, setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData, bufferLen))
+ {
+ SocketsTelemetry.Log.IoUringFixedRecvSelected();
+ userData = setup.UserData;
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ if (allowMultishotRecv &&
+ bufferLen > 0 &&
+ TryGetIoUringMultishotRecvBufferGroupId(out ushort multishotBufferGroupId))
+ {
+ WriteMultishotRecvSqe(setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData, multishotBufferGroupId);
+ }
+ else if (bufferLen > 0 &&
+ TryGetIoUringProvidedBufferGroupId(out ushort providedBufferGroupId))
+ {
+ WriteProvidedBufferRecvSqe(
+ setup.Sqe,
+ setup.SqeFd,
+ setup.SqeFlags,
+ setup.UserData,
+ (uint)bufferLen,
+ rwFlags,
+ providedBufferGroupId);
+ }
+ else
+ {
+ WriteRecvSqe(setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData, buffer, (uint)bufferLen, rwFlags);
+ }
+ userData = setup.UserData;
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ private bool ShouldTryIoUringDirectFixedRecv(SocketFlags flags, bool allowMultishotRecv, int bufferLen)
+ {
+ if (!_supportsOpReadFixed || !_ioUringCapabilities.HasRegisteredBuffers)
+ {
+ return false;
+ }
+
+ if (allowMultishotRecv || bufferLen <= 0)
+ {
+ return false;
+ }
+
+ // READ_FIXED does not provide recvmsg/socket flags semantics.
+ return flags == SocketFlags.None;
+ }
+
+ private unsafe bool TryPrepareIoUringDirectRecvFixed(
+ int slotIndex,
+ IoUringSqe* sqe,
+ int sqeFd,
+ byte sqeFlags,
+ ulong userData,
+ int requestedLength)
+ {
+ IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing;
+ if (providedBufferRing is null)
+ {
+ SocketsTelemetry.Log.IoUringFixedRecvFallback();
+ return false;
+ }
+
+ if (!providedBufferRing.TryAcquireBufferForPreparedReceive(
+ out ushort bufferId,
+ out byte* fixedBuffer,
+ out int fixedBufferLength))
+ {
+ // Under transient provided-buffer pressure, fall back to normal receive preparation.
+ SocketsTelemetry.Log.IoUringFixedRecvFallback();
+ return false;
+ }
+
+ Debug.Assert(_completionSlots is not null);
+ ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
+ slot.UsesFixedRecvBuffer = true;
+ slot.FixedRecvBufferId = bufferId;
+
+ int receiveLength = Math.Min(requestedLength, fixedBufferLength);
+ WriteReadFixedSqe(
+ sqe,
+ sqeFd,
+ sqeFlags,
+ userData,
+ fixedBuffer,
+ (uint)receiveLength,
+ bufferId);
+ return true;
+ }
+
+ /// Prepares an accept SQE via the managed direct path.
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectAccept(
+ SafeSocketHandle socket,
+ byte* socketAddress,
+ int socketAddressLen,
+ out ulong userData,
+ out SocketError errorCode)
+ {
+ userData = 0;
+ errorCode = SocketError.Success;
+ IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.Accept);
+ if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+ {
+ errorCode = setup.ErrorCode;
+ return setup.PrepareResult;
+ }
+
+ ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex];
+ SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.Accept);
+ Debug.Assert(slotStorage.NativeSocketAddressLengthPtr is not null);
+ *slotStorage.NativeSocketAddressLengthPtr = socketAddressLen;
+
+ WriteAcceptSqe(setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData, socketAddress, (IntPtr)slotStorage.NativeSocketAddressLengthPtr);
+ userData = setup.UserData;
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// Prepares a multishot accept SQE via the managed direct path.
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectMultishotAccept(
+ SafeSocketHandle socket,
+ byte* socketAddress,
+ int socketAddressLen,
+ out ulong userData,
+ out SocketError errorCode)
+ {
+ userData = 0;
+ errorCode = SocketError.Success;
+ if (!_supportsMultishotAccept)
+ {
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ }
+
+ IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.Accept);
+ if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+ {
+ errorCode = setup.ErrorCode;
+ return setup.PrepareResult;
+ }
+ _ = socketAddress;
+ _ = socketAddressLen;
+
+ ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex];
+ SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.Accept);
+ Debug.Assert(slotStorage.NativeSocketAddressLengthPtr is not null);
+ // Security hardening: multishot accept reuses a single SQE across shots, so sharing one sockaddr
+ // writeback buffer can race and surface mismatched peer addresses under bursty delivery.
+ // Transitional multishot accept only needs accepted fds, so request no sockaddr writeback.
+ *slotStorage.NativeSocketAddressLengthPtr = 0;
+ slotStorage.ReceiveSocketAddressCapacity = 0;
+
+ WriteMultishotAcceptSqe(
+ setup.Sqe,
+ setup.SqeFd,
+ setup.SqeFlags,
+ setup.UserData,
+ socketAddress: null,
+ socketAddressLengthPtr: IntPtr.Zero);
+ userData = setup.UserData;
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// Prepares a connect SQE via the managed direct path.
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectConnect(
+ SafeSocketHandle socket,
+ byte* socketAddress,
+ int socketAddressLen,
+ out ulong userData,
+ out SocketError errorCode)
+ {
+ userData = 0;
+ errorCode = SocketError.Success;
+ IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.Connect);
+ if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+ {
+ errorCode = setup.ErrorCode;
+ return setup.PrepareResult;
+ }
+
+ WriteConnectSqe(setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData, socketAddress, socketAddressLen);
+ userData = setup.UserData;
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// Prepares a sendmsg SQE via the managed direct path.
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessage(
+ SafeSocketHandle socket,
+ Interop.Sys.MessageHeader* messageHeader,
+ SocketFlags flags,
+ out ulong userData,
+ out SocketError errorCode)
+ {
+ userData = 0;
+ errorCode = SocketError.Success;
+
+ if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+ {
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ }
+
+ IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.SendMsg);
+ if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+ {
+ errorCode = setup.ErrorCode;
+ return setup.PrepareResult;
+ }
+
+ ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex];
+ SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.Message);
+ if (!TryPrepareInlineMessageStorage(setup.SlotIndex, messageHeader, isReceive: false))
+ {
+ FreeCompletionSlot(setup.SlotIndex);
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ }
+
+ WriteSendMsgLikeSqe(setup.Sqe, IoUringOpcodes.SendMsg, setup.SqeFd, setup.SqeFlags, setup.UserData, slotStorage.NativeMsgHdrPtr, rwFlags);
+ userData = setup.UserData;
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// <summary>
+ /// Prepares a sendmsg SQE, preferring SENDMSG_ZC when eligible and falling back to SENDMSG otherwise.
+ /// </summary>
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessageWithZeroCopyFallback(
+ SafeSocketHandle socket,
+ Interop.Sys.MessageHeader* messageHeader,
+ int payloadLength,
+ SocketFlags flags,
+ out ulong userData,
+ out SocketError errorCode)
+ {
+ if (ShouldTryIoUringDirectSendMessageZeroCopy(payloadLength))
+ {
+ SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult zeroCopyResult = TryPrepareIoUringDirectSendMessageZc(
+ socket,
+ messageHeader,
+ payloadLength,
+ flags,
+ out userData,
+ out errorCode);
+ if (zeroCopyResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported)
+ {
+ return zeroCopyResult;
+ }
+ }
+
+ return TryPrepareIoUringDirectSendMessage(
+ socket,
+ messageHeader,
+ flags,
+ out userData,
+ out errorCode);
+ }
+
+ /// Prepares a sendmsg_zc SQE via the managed direct path.
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessageZc(
+ SafeSocketHandle socket,
+ Interop.Sys.MessageHeader* messageHeader,
+ int payloadLength,
+ SocketFlags flags,
+ out ulong userData,
+ out SocketError errorCode)
+ {
+ userData = 0;
+ errorCode = SocketError.Success;
+
+ if (!ShouldTryIoUringDirectSendMessageZeroCopy(payloadLength))
+ {
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ }
+
+ if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+ {
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ }
+
+ IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.SendMsgZc);
+ if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+ {
+ errorCode = setup.ErrorCode;
+ return setup.PrepareResult;
+ }
+
+ ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex];
+ SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.Message);
+ slotStorage.MessageIsReceive = false;
+ // Mirror SEND_ZC semantics: first CQE is not final managed completion; operation
+ // completes only after NOTIF CQE confirms kernel/NIC no longer references payload.
+ slot.IsZeroCopySend = true;
+ slot.ZeroCopyNotificationPending = false;
+ if (!TryPrepareInlineMessageStorage(setup.SlotIndex, messageHeader, isReceive: false))
+ {
+ // Per-slot inline native slabs avoid hot-path allocations; overflow shapes
+ // fall back by returning Unsupported so upper layers can use alternate paths.
+ FreeCompletionSlot(setup.SlotIndex);
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ }
+
+ WriteSendMsgLikeSqe(setup.Sqe, IoUringOpcodes.SendMsgZc, setup.SqeFd, setup.SqeFlags, setup.UserData, slotStorage.NativeMsgHdrPtr, rwFlags);
+ userData = setup.UserData;
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// Prepares a recvmsg SQE via the managed direct path.
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectReceiveMessage(
+ SafeSocketHandle socket,
+ Interop.Sys.MessageHeader* messageHeader,
+ SocketFlags flags,
+ out ulong userData,
+ out SocketError errorCode)
+ {
+ userData = 0;
+ errorCode = SocketError.Success;
+
+ if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags))
+ {
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ }
+
+ IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.RecvMsg);
+ if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared)
+ {
+ errorCode = setup.ErrorCode;
+ return setup.PrepareResult;
+ }
+
+ ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex];
+ SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.Message);
+ if (!TryPrepareInlineMessageStorage(setup.SlotIndex, messageHeader, isReceive: true))
+ {
+ FreeCompletionSlot(setup.SlotIndex);
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ }
+
+ WriteRecvMsgSqe(setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData, slotStorage.NativeMsgHdrPtr, rwFlags);
+ userData = setup.UserData;
+ return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared;
+ }
+
+ /// Debug-only assertion that validates a state machine transition.
+ [Conditional("DEBUG")]
+ private static void AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState from,
+ IoUringOperationLifecycleState to)
+ {
+ bool isValid =
+ from == IoUringOperationLifecycleState.Queued && to == IoUringOperationLifecycleState.Prepared ||
+ from == IoUringOperationLifecycleState.Prepared && to == IoUringOperationLifecycleState.Submitted ||
+ from == IoUringOperationLifecycleState.Prepared && to == IoUringOperationLifecycleState.Detached ||
+ from == IoUringOperationLifecycleState.Submitted &&
+ (to == IoUringOperationLifecycleState.Queued ||
+ to == IoUringOperationLifecycleState.Completed ||
+ to == IoUringOperationLifecycleState.Canceled ||
+ to == IoUringOperationLifecycleState.Detached);
+
+ Debug.Assert(isValid, $"Invalid io_uring lifecycle transition: {from} -> {to}");
+ }
+
+ /// Checks whether the kernel version meets the minimum for io_uring support.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static bool IsIoUringKernelVersionSupported()
+ {
+#if DEBUG
+ if (string.Equals(
+ Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.ForceKernelVersionUnsupported),
+ "1",
+ StringComparison.Ordinal))
+ {
+ return false;
+ }
+#endif
+
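+ // The version floor also gates features this engine otherwise assumes, such as
+ // EXT_ARG bounded waits and DEFER_TASKRUN (the completion-wait path asserts that
+ // the non-EXT_ARG fallback is unexpected on supported kernels).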
+ return OperatingSystem.IsOSPlatformVersionAtLeast(
+ "Linux",
+ IoUringConstants.MinKernelMajor,
+ IoUringConstants.MinKernelMinor);
+ }
+
+ /// Recomputes whether multishot recv can be used by this engine instance.
+ /// Requires opcode support and active provided-buffer ring support.
+ private bool RefreshIoUringMultishotRecvSupport()
+ {
+ _supportsMultishotRecv =
+ _supportsOpRecv &&
+ _ioUringCapabilities.SupportsProvidedBufferRings;
+ return _supportsMultishotRecv;
+ }
+
+ /// Returns the provided-buffer group id used for buffer-select receive submissions.
+ private bool TryGetIoUringProvidedBufferGroupId(out ushort bufferGroupId)
+ {
+ if (_ioUringCapabilities.SupportsProvidedBufferRings && _ioUringProvidedBufferRing is not null)
+ {
+ bufferGroupId = _ioUringProvidedBufferGroupId;
+ return true;
+ }
+
+ bufferGroupId = default;
+ return false;
+ }
+
+ /// Returns the provided-buffer group id used for multishot recv submissions.
+ /// Multishot recv remains disabled unless both the opcode probe and provided-ring
+ /// registration succeeded for this engine instance.
+ private bool TryGetIoUringMultishotRecvBufferGroupId(out ushort bufferGroupId)
+ {
+ if (_supportsMultishotRecv && TryGetIoUringProvidedBufferGroupId(out bufferGroupId))
+ {
+ return true;
+ }
+
+ bufferGroupId = default;
+ return false;
+ }
+
+ internal bool SupportsMultishotRecv => _ioUringCapabilities.SupportsMultishotRecv;
+ internal bool SupportsMultishotAccept => _ioUringCapabilities.SupportsMultishotAccept;
+
+ /// Calls io_uring_setup and negotiates feature flags.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static unsafe bool TrySetupIoUring(bool sqPollRequested, out IoUringSetupResult setupResult)
+ {
+ setupResult = default;
+ uint queueEntries = GetIoUringQueueEntries();
+
+ // R_DISABLED defers submitter_task assignment until REGISTER_ENABLE_RINGS,
+ // which is called from the event loop thread. This ensures DEFER_TASKRUN's
+ // submitter_task check (EEXIST on mismatch) passes on all kernel versions.
+ uint flags = IoUringConstants.SetupCqSize | IoUringConstants.SetupSubmitAll
+ | IoUringConstants.SetupCoopTaskrun | IoUringConstants.SetupSingleIssuer
+ | IoUringConstants.SetupNoSqArray | IoUringConstants.SetupCloexec
+ | IoUringConstants.SetupRDisabled;
+
+ if (sqPollRequested)
+ {
+ // SQPOLL and DEFER_TASKRUN are mutually exclusive: DEFER_TASKRUN requires the
+ // submitting task to run its own task work, while SQPOLL submits from a kernel
+ // thread, so DEFER_TASKRUN is only added in the non-SQPOLL branch below.
+ flags |= IoUringConstants.SetupSqPoll;
+ if (NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Info(null, "io_uring setup: SQPOLL requested and included in initial setup flags.");
+ }
+ }
+ else
+ {
+ // Default to DEFER_TASKRUN (+ COOP_TASKRUN already in base flags): this reduces
+ // event-loop CPU by deferring task work until io_uring_enter, at the cost of
+ // potentially higher tail latency when the event loop is preempted by other work.
+ // TODO: gather benchmark data for DEFER_TASKRUN vs COOP_TASKRUN-only under mixed
+ // epoll/io_uring loads and revisit the default once p99 data is available.
+ flags |= IoUringConstants.SetupDeferTaskrun;
+ }
+
+ Interop.Sys.IoUringParams ioParams = default;
+ ioParams.Flags = flags;
+ ioParams.CqEntries = queueEntries * IoUringConstants.CqEntriesFactor;
+
+ int ringFd;
+ Interop.Error err = Interop.Sys.IoUringShimSetup(queueEntries, &ioParams, &ringFd);
+
+ // IORING_SETUP_NO_SQARRAY was introduced in Linux 6.6.
+ // IORING_SETUP_CLOEXEC was introduced in Linux 5.19.
+ // Peel unsupported setup flags on EINVAL and retry.
+ if (err == Interop.Error.EPERM)
+ {
+ // Never peel/retry on EPERM; that can bypass an explicit seccomp/kernel policy denial.
+ if (NetEventSource.Log.IsEnabled())
+ {
+ string deniedFlag = (flags & IoUringConstants.SetupSqPoll) != 0 ?
+ "IORING_SETUP_SQPOLL" :
+ "IORING_SETUP_NO_SQARRAY";
+ NetEventSource.Error(null, $"io_uring setup denied (EPERM) for {deniedFlag}; not retrying with peeled flags.");
+ }
+ }
+ else if (err == Interop.Error.EINVAL &&
+ (flags & IoUringConstants.SetupNoSqArray) != 0)
+ {
+ flags &= ~IoUringConstants.SetupNoSqArray;
+ ioParams = default;
+ ioParams.Flags = flags;
+ ioParams.CqEntries = queueEntries * IoUringConstants.CqEntriesFactor;
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Info(null, $"io_uring setup: peeled NO_SQARRAY after {err}.");
+ }
+
+ err = Interop.Sys.IoUringShimSetup(queueEntries, &ioParams, &ringFd);
+ }
+
+ if (err == Interop.Error.EINVAL &&
+ (flags & IoUringConstants.SetupCloexec) != 0)
+ {
+ flags &= ~IoUringConstants.SetupCloexec;
+ ioParams = default;
+ ioParams.Flags = flags;
+ ioParams.CqEntries = queueEntries * IoUringConstants.CqEntriesFactor;
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Info(null, $"io_uring setup: peeled CLOEXEC after {err}.");
+ }
+
+ err = Interop.Sys.IoUringShimSetup(queueEntries, &ioParams, &ringFd);
+ }
+
+ if (err != Interop.Error.SUCCESS)
+ {
+ return false;
+ }
+
+ // IORING_SETUP_CLOEXEC removes the fork/exec inheritance window on supporting kernels.
+ // Keep FD_CLOEXEC as a fallback for peeled/older setups.
+ if (!TrySetFdCloseOnExec(ringFd, out Interop.Error cloexecError))
+ {
+ if (NetEventSource.Log.IsEnabled())
+ {
+ // Ensure ring fd is not inherited across fork/exec; inherited ring fds can corrupt ownership.
+ NetEventSource.Error(null, $"io_uring setup: failed to set FD_CLOEXEC on ring fd: {cloexecError}.");
+ }
+
+ Interop.Sys.IoUringShimCloseFd(ringFd);
+ return false;
+ }
+
+ setupResult.RingFd = ringFd;
+ setupResult.Params = ioParams;
+ setupResult.NegotiatedFlags = flags;
+ setupResult.UsesExtArg = (ioParams.Features & IoUringConstants.FeatureExtArg) != 0;
+ setupResult.SqPollNegotiated = (flags & IoUringConstants.SetupSqPoll) != 0;
+ if (setupResult.SqPollNegotiated && NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Info(null, "io_uring setup: SQPOLL negotiated.");
+ }
+
+ if (setupResult.SqPollNegotiated)
+ {
+ SocketsTelemetry.Log.ReportIoUringSqPollNegotiatedWarning();
+ }
+ return true;
+ }
+
+ /// Queues a POLL_ADD SQE on the wakeup eventfd for cross-thread signaling.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe bool QueueManagedWakeupPollAdd()
+ {
+ if (_managedWakeupEventFd < 0)
+ return false;
+
+ if (!TryGetNextManagedSqe(out IoUringSqe* sqe))
+ return false;
+
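+ // IORING_POLL_ADD_MULTI keeps the poll armed across wakeups: the kernel posts one
+ // CQE per eventfd signal and the poll stays registered until a CQE arrives without
+ // IORING_CQE_F_MORE, after which it must be re-armed.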
+ sqe->Opcode = IoUringOpcodes.PollAdd;
+ sqe->Fd = _managedWakeupEventFd;
+ sqe->Len = IoUringConstants.PollAddFlagMulti; // IORING_POLL_ADD_MULTI
+ sqe->RwFlags = IoUringConstants.PollIn;
+ sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagWakeupSignal, 0);
+ return true;
+ }
+
+ /// Attempts to register the ring fd for fixed-fd submission.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe bool TryRegisterRingFd(int ringFd, out int registeredRingFd)
+ {
+ registeredRingFd = -1;
+
+ // io_uring_rsrc_update: { uint32 offset, uint32 resv, uint64 data }
+ uint* update = stackalloc uint[4]; // 16 bytes
+ update[0] = IoUringConstants.RegisterOffsetAuto; // offset = auto-assign
+ update[1] = 0; // resv
+ *(ulong*)(update + 2) = (ulong)ringFd; // data = ring fd
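+ // Layout mirrors struct io_uring_rsrc_update ({ u32 offset; u32 resv; u64 data }).
+ // The ulong store at byte offset 8 assumes the stackalloc block is 8-byte aligned,
+ // which holds for stack allocations on the 64-bit platforms this path targets.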
+
+ int result;
+ Interop.Error err = Interop.Sys.IoUringShimRegister(
+ ringFd, IoUringConstants.RegisterRingFds, update, 1u, &result);
+
+ if (err != Interop.Error.SUCCESS || result <= 0)
+ return false;
+
+ registeredRingFd = (int)update[0]; // kernel wrote assigned index back
+ return true;
+ }
+
+ /// Configures the SQE fd and flags for a socket operation.
+ /// Completion mode uses direct socket file descriptors.
+ private static void ConfigureSocketSqeFdAndFlags(int socketFd, out int sqeFd, out byte sqeFlags)
+ {
+ sqeFd = socketFd;
+ sqeFlags = 0;
+ }
+
+ /// Orchestrates complete managed io_uring initialization: kernel version check,
+ /// ring setup with flag negotiation, mmap, opcode probe, eventfd creation,
+ /// ring fd registration, and initial wakeup poll queue.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe bool TryInitializeManagedIoUring(in IoUringResolvedConfiguration resolvedConfiguration)
+ {
+ if (!IsIoUringKernelVersionSupported())
+ return false;
+
+ bool sqPollRequested = resolvedConfiguration.SqPollRequested;
+ if (!TrySetupIoUring(sqPollRequested, out IoUringSetupResult setupResult))
+ return false;
+
+ if (!TryMmapRings(ref setupResult))
+ return false;
+
+ _sqPollEnabled = setupResult.SqPollNegotiated;
+ if (NetEventSource.Log.IsEnabled())
+ {
+ if (sqPollRequested && !_sqPollEnabled)
+ {
+ NetEventSource.Info(
+ this,
+ "SQPOLL requested but not negotiated (kernel support/capabilities may be unavailable).");
+ }
+ else if (_sqPollEnabled)
+ {
+ NetEventSource.Info(this, "SQPOLL negotiated and enabled.");
+ }
+ }
+
+ // Probe opcode support.
+ ProbeIoUringOpcodeSupport(setupResult.RingFd);
+
+ // Create wakeup eventfd.
+ int eventFd;
+ Interop.Error err = Interop.Sys.IoUringShimCreateEventFd(&eventFd);
+ if (err != Interop.Error.SUCCESS)
+ {
+ // Cleanup: unmap and close
+ CleanupManagedRings();
+ return false;
+ }
+
+ if (!TrySetFdCloseOnExec(eventFd, out Interop.Error cloexecError))
+ {
+ if (NetEventSource.Log.IsEnabled())
+ {
+ // Eventfd wake channel must remain process-local across exec to prevent stale cross-process signaling.
+ NetEventSource.Error(this, $"io_uring setup: failed to set FD_CLOEXEC on wakeup eventfd: {cloexecError}.");
+ }
+
+ Interop.Sys.IoUringShimCloseFd(eventFd);
+ CleanupManagedRings();
+ return false;
+ }
+
+ _managedWakeupEventFd = eventFd;
+
+ // Try to register the ring fd for faster enter syscalls.
+ if (TryRegisterRingFd(setupResult.RingFd, out int registeredRingFd))
+ {
+ _ioUringSqRingInfo.RegisteredRingFd = registeredRingFd;
+ }
+
+ // Queue the initial wakeup POLL_ADD.
+ // Direct SQE must be enabled for QueueManagedWakeupPollAdd to work.
+ _ioUringDirectSqeEnabled = true;
+ if (!QueueManagedWakeupPollAdd())
+ {
+ _ioUringDirectSqeEnabled = false;
+ Interop.Sys.IoUringShimCloseFd(eventFd);
+ _managedWakeupEventFd = -1;
+ CleanupManagedRings();
+ return false;
+ }
+
+ // Respect process-level direct SQE toggle after the required wakeup POLL_ADD is armed.
+ if (resolvedConfiguration.DirectSqeDisabled)
+ {
+ _ioUringDirectSqeEnabled = false;
+ }
+
+ InitializeIoUringProvidedBufferRingIfSupported(setupResult.RingFd);
+ RefreshIoUringMultishotRecvSupport();
+ _ioUringInitialized = true;
+
+ InitializeDebugTestHooksFromEnvironment();
+
+ return true;
+ }
+
+ /// Validates the managed NativeMsghdr layout contract for direct io_uring message operations.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe bool IsNativeMsghdrLayoutSupportedForIoUring()
+ {
+ if (IsNativeMsghdrLayoutSupportedForIoUring(IntPtr.Size, sizeof(NativeMsghdr)))
+ {
+ return true;
+ }
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Info(
+ this,
+ $"io_uring disabled: unsupported NativeMsghdr layout (pointerSize={IntPtr.Size}, sizeof(NativeMsghdr)={sizeof(NativeMsghdr)})");
+ }
+
+ return false;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsNativeMsghdrLayoutSupportedForIoUring(int pointerSize, int nativeMsghdrSize) =>
+ pointerSize == 8 && nativeMsghdrSize == 56;
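+ // 56 bytes is sizeof(struct msghdr) on LP64 Linux: msg_name(8) + msg_namelen(4+4 pad)
+ // + msg_iov(8) + msg_iovlen(8) + msg_control(8) + msg_controllen(8) + msg_flags(4+4 pad).
+ // The direct message path hands NativeMsghdr memory to the kernel, so both the
+ // pointer size and the struct size must match exactly.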
+
+ /// Detects io_uring support and initializes the managed submission/completion paths.
+ partial void LinuxDetectAndInitializeIoUring()
+ {
+ IoUringResolvedConfiguration resolvedConfiguration = ResolveIoUringResolvedConfiguration();
+ LogIoUringResolvedConfigurationIfNeeded(in resolvedConfiguration);
+ if (!resolvedConfiguration.IoUringEnabled || !IsNativeMsghdrLayoutSupportedForIoUring() || !TryInitializeManagedIoUring(in resolvedConfiguration))
+ {
+ _ioUringCapabilities = ResolveLinuxIoUringCapabilities(isIoUringPort: false);
+ SocketsTelemetry.Log.ReportSocketEngineBackendSelected(
+ isIoUringPort: false,
+ isCompletionMode: false,
+ sqPollEnabled: false);
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringModeSelection(_ioUringCapabilities);
+ }
+
+ return;
+ }
+
+ // Managed init succeeded — set capabilities and initialize managed-side state.
+ _ioUringCapabilities = default(LinuxIoUringCapabilities)
+ .WithIsIoUringPort(true)
+ .WithMode(IoUringMode.Completion)
+ .WithSupportsMultishotRecv(_supportsMultishotRecv)
+ .WithSupportsMultishotAccept(_supportsMultishotAccept)
+ .WithSupportsZeroCopySend(_zeroCopySendEnabled)
+ .WithSqPollEnabled(_sqPollEnabled)
+ .WithSupportsProvidedBufferRings(false)
+ .WithHasRegisteredBuffers(false);
+
+ SocketsTelemetry.Log.ReportSocketEngineBackendSelected(
+ isIoUringPort: true,
+ isCompletionMode: true,
+ sqPollEnabled: _ioUringCapabilities.SqPollEnabled);
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringModeSelection(_ioUringCapabilities);
+ }
+
+ InitializeLinuxIoUringDiagnosticsState();
+
+ _ioUringSlotCapacity = (int)Math.Max(_managedCqEntries, IoUringConstants.QueueEntries);
+ // Slot pool capacity is 2x slot capacity (currently 8192 with default cq sizing).
+ // Multishot operations retain slots for their full lifetime, so this bounds
+ // concurrent long-lived multishot receives before backpressure/exhaustion.
+ _ioUringPrepareQueue = new MpscQueue();
+ _ioUringCancelQueue = new MpscQueue();
+ int completionSlotCapacity = _ioUringSlotCapacity * IoUringConstants.CompletionOperationPoolCapacityFactor;
+ InitializeCompletionSlotPool(completionSlotCapacity);
+
+ _managedCqDrainEnabled = true;
+ }
+
+ /// Enables the io_uring ring from the event loop thread, setting submitter_task
+ /// to this thread. Required because io_uring_setup uses R_DISABLED to defer
+ /// submitter_task assignment, and DEFER_TASKRUN requires submitter_task == current
+ /// on every io_uring_enter call.
+ unsafe partial void LinuxEventLoopEnableRings()
+ {
+ if (!_ioUringCapabilities.IsIoUringPort || _managedRingFd < 0)
+ {
+ return;
+ }
+
+ int result;
+ Interop.Error err = Interop.Sys.IoUringShimRegister(
+ _managedRingFd, IoUringConstants.RegisterEnableRings, null, 0, &result);
+ if (err != Interop.Error.SUCCESS)
+ {
+ if (NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Error(this, $"io_uring REGISTER_ENABLE_RINGS failed: {err}");
+ }
+ }
+ }
+
+ /// Tears down io_uring state before native resource cleanup.
+ partial void LinuxBeforeFreeNativeResources(ref bool closeSocketEventPort)
+ {
+ if (!_ioUringCapabilities.IsIoUringPort || _port == (IntPtr)(-1))
+ {
+ return;
+ }
+
+ // Publish teardown before draining queues/closing the native port so concurrent
+ // producer paths observe shutdown via acquire reads and stop queueing new work.
+ Volatile.Write(ref _ioUringTeardownInitiated, 1);
+ DrainQueuedIoUringOperationsForTeardown();
+
+ Interop.Error closeError = Interop.Sys.CloseSocketEventPort(_port);
+ if (closeError == Interop.Error.SUCCESS)
+ {
+ closeSocketEventPort = false;
+ Volatile.Write(ref _ioUringPortClosedForTeardown, 1);
+ }
+ }
+
+ /// Submits pending SQEs before entering the wait.
+ partial void LinuxEventLoopBeforeWait()
+ {
+ Interop.Error submitError = SubmitIoUringBatch();
+ if (submitError != Interop.Error.SUCCESS)
+ {
+ // FailFast site: the event-loop submit step cannot degrade safely once
+ // io_uring completion mode is active; losing submit progress would orphan tracked ops.
+ ThrowInternalException(submitError);
+ }
+ }
+
+ /// Attempts a managed completion wait using io_uring_enter with timeout.
+ unsafe partial void LinuxEventLoopTryCompletionWait(SocketEventHandler handler, ref int numEvents, ref int numCompletions, ref Interop.Error err, ref bool waitHandled)
+ {
+ if (!_ioUringCapabilities.IsCompletionMode)
+ {
+ return;
+ }
+
+ // Managed CQE drain path: read CQEs directly from mmap'd ring.
+ // First, try a non-blocking drain of any already-available CQEs.
+ bool hadCqes = DrainCqeRingBatch(handler);
+ if (hadCqes)
+ {
+ numCompletions = 1;
+ waitHandled = true;
+ err = Interop.Error.SUCCESS;
+ return;
+ }
+
+ // No CQEs available — submit pending SQEs and wait for at least 1 CQE.
+ uint enterFlags = IoUringConstants.EnterGetevents;
+ int ringFd = _managedRingFd;
+ if (_ioUringSqRingInfo.RegisteredRingFd >= 0)
+ {
+ enterFlags |= IoUringConstants.EnterRegisteredRing;
+ ringFd = _ioUringSqRingInfo.RegisteredRingFd;
+ }
+
+ uint submitCount = _sqPollEnabled ? 0u : _ioUringManagedPendingSubmissions;
+ if (_sqPollEnabled &&
+ _ioUringManagedPendingSubmissions != 0 &&
+ SqNeedWakeup())
+ {
+ enterFlags |= IoUringConstants.EnterSqWakeup;
+ }
+
+ // Snapshot the wakeup generation counter before entering the blocking syscall.
+ // After waking, we compare against it to detect wakeups that arrived during the
+ // syscall. Declared ahead of the branch so the post-wake drain loop below can use it.
+ uint wakeGenBefore = Volatile.Read(ref _ioUringWakeupGeneration);
+
+ if (_managedUsesExtArg)
+ {
+ // Bounded wait via EXT_ARG; timeout shortens when wake circuit-breaker is active.
+ uint waitEnterFlags = enterFlags | IoUringConstants.EnterExtArg;
+ int waitRingFd = ringFd;
+ Interop.Sys.IoUringKernelTimespec timeout = default;
+ timeout.TvNsec = GetManagedCompletionWaitTimeoutNanos();
+ Interop.Sys.IoUringGeteventsArg extArg = default;
+ extArg.Ts = (ulong)(nuint)(&timeout);
+
+ int result;
+ err = Interop.Sys.IoUringShimEnterExt(
+ waitRingFd, submitCount, 1, waitEnterFlags, &extArg, &result);
+ if (err == Interop.Error.EINVAL && (waitEnterFlags & IoUringConstants.EnterRegisteredRing) != 0)
+ {
+ DisableRegisteredRingFd();
+ waitEnterFlags &= ~IoUringConstants.EnterRegisteredRing;
+ waitRingFd = _managedRingFd;
+ err = Interop.Sys.IoUringShimEnterExt(
+ waitRingFd, submitCount, 1, waitEnterFlags, &extArg, &result);
+ }
+
+ if (err == Interop.Error.SUCCESS)
+ {
+ UpdateManagedPendingSubmissionCountAfterEnter(submitCount, result);
+ }
+ }
+ else
+ {
+ Debug.Assert(
+ false,
+ "Non-EXT_ARG io_uring wait fallback is unexpected on supported kernels (>= 6.1).");
+ uint waitEnterFlags = enterFlags;
+ int waitRingFd = ringFd;
+ int result;
+ err = Interop.Sys.IoUringShimEnter(
+ waitRingFd, submitCount, 1, waitEnterFlags, &result);
+ if (err == Interop.Error.EINVAL && (waitEnterFlags & IoUringConstants.EnterRegisteredRing) != 0)
+ {
+ DisableRegisteredRingFd();
+ waitEnterFlags &= ~IoUringConstants.EnterRegisteredRing;
+ waitRingFd = _managedRingFd;
+ err = Interop.Sys.IoUringShimEnter(
+ waitRingFd, submitCount, 1, waitEnterFlags, &result);
+ }
+
+ if (err == Interop.Error.SUCCESS)
+ {
+ UpdateManagedPendingSubmissionCountAfterEnter(submitCount, result);
+ }
+ }
+
+ // Drain after waking. If a producer signaled while we were inside io_uring_enter,
+ // the generation counter will have advanced past our snapshot. Drain again and
+ // refresh the snapshot until the counter is stable, so enqueued work is not
+ // delayed until the next bounded wait timeout.
+ hadCqes = false;
+ while (true)
+ {
+ hadCqes |= DrainCqeRingBatch(handler);
+ uint wakeGenAfter = Volatile.Read(ref _ioUringWakeupGeneration);
+ if (wakeGenAfter == wakeGenBefore)
+ {
+ break;
+ }
+
+ wakeGenBefore = wakeGenAfter;
+ }
+ numCompletions = hadCqes ? 1 : 0;
+ numEvents = 0;
+ waitHandled = true;
+ err = Interop.Error.SUCCESS;
+ }
+
+ /// Polls diagnostics after each event loop iteration.
+ partial void LinuxEventLoopAfterIteration()
+ {
+ PollIoUringDiagnosticsIfNeeded(force: false);
+ TrySweepStaleTrackedIoUringOperationsAfterCqOverflowRecovery();
+ }
+
+ /// Queued work item pairing an operation with its prepare sequence number for deferred SQE preparation.
+ private readonly struct IoUringPrepareWorkItem
+ {
+ /// The operation to prepare.
+ public readonly SocketAsyncContext.AsyncOperation Operation;
+ /// The sequence number that must match for the preparation to proceed.
+ public readonly long PrepareSequence;
+
+ /// Creates a work item pairing an operation with its prepare sequence number.
+ public IoUringPrepareWorkItem(SocketAsyncContext.AsyncOperation operation, long prepareSequence)
+ {
+ Operation = operation;
+ PrepareSequence = prepareSequence;
+ }
+ }
+
+ /// Enqueues an operation for deferred SQE preparation on the event loop thread.
+ internal bool TryEnqueueIoUringPreparation(SocketAsyncContext.AsyncOperation operation, long prepareSequence)
+ {
+ if (!_ioUringCapabilities.IsCompletionMode || Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ return false;
+ }
+
+ MpscQueue? prepareQueue = _ioUringPrepareQueue;
+ if (prepareQueue is null)
+ {
+ return false;
+ }
+
+ long queueLength = Interlocked.Increment(ref _ioUringPrepareQueueLength);
+ if (queueLength > s_ioUringPrepareQueueCapacity)
+ {
+ Interlocked.Decrement(ref _ioUringPrepareQueueLength);
+ long overflowCount = Interlocked.Increment(ref _ioUringPrepareQueueOverflowCount);
+ if ((overflowCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringPrepareQueueOverflow(overflowCount, s_ioUringPrepareQueueCapacity);
+ }
+
+ return false;
+ }
+
+ if (!prepareQueue.TryEnqueue(new IoUringPrepareWorkItem(operation, prepareSequence)))
+ {
+ Interlocked.Decrement(ref _ioUringPrepareQueueLength);
+ long overflowCount = Interlocked.Increment(ref _ioUringPrepareQueueOverflowCount);
+ if ((overflowCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringPrepareQueueOverflow(overflowCount, s_ioUringPrepareQueueCapacity);
+ }
+
+ return false;
+ }
+
+ WakeEventLoop();
+ return true;
+ }
+
+ /// Extracts completion-slot index and generation from tracked reserved-completion user_data.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryDecodeTrackedIoUringUserData(ulong userData, out int slotIndex, out ulong generation)
+ {
+ generation = 0;
+ slotIndex = 0;
+ if (userData == 0)
+ {
+ return false;
+ }
+
+ if ((byte)(userData >> IoUringUserDataTagShift) != IoUringConstants.TagReservedCompletion)
+ {
+ return false;
+ }
+
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ return false;
+ }
+
+ ulong payload = userData & IoUringUserDataPayloadMask;
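+ // Illustrative user_data layout (exact field widths come from IoUringConstants):
+ // [tag byte above IoUringUserDataTagShift] | [generation above SlotIndexBits] | [slot index in low bits].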
+ slotIndex = DecodeCompletionSlotIndex(payload);
+ if ((uint)slotIndex >= (uint)completionEntries.Length)
+ {
+ return false;
+ }
+
+ generation = (payload >> IoUringConstants.SlotIndexBits) & IoUringConstants.GenerationMask;
+ return true;
+ }
+
+ /// Atomically removes and returns the tracked operation matching the user_data and generation.
+ private bool TryTakeTrackedIoUringOperation(ulong userData, out SocketAsyncContext.AsyncOperation? operation)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "TryTakeTrackedIoUringOperation must run on the event-loop thread.");
+ operation = null;
+ if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation))
+ {
+ return false;
+ }
+
+ ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+ while (true)
+ {
+ SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref entry.TrackedOperation);
+ if (currentOperation is null)
+ {
+ return false;
+ }
+
+ // Writers publish generation before operation; if operation is visible here,
+ // generation must match unless this CQE belongs to an older slot incarnation.
+ if (Volatile.Read(ref entry.TrackedOperationGeneration) != generation)
+ {
+ return false;
+ }
+
+ // Single-owner handoff: exactly one completion-side CAS can null out TrackedOperation
+ // for this slot incarnation. A racing replace may swap references, but cannot create
+ // two winners for the same user_data token.
+ if (Interlocked.CompareExchange(ref entry.TrackedOperation, null, currentOperation) != currentOperation)
+ {
+ continue;
+ }
+
+ // Reset generation to zero so TryReattachTrackedIoUringOperation (used by
+ // SEND_ZC to re-register while awaiting the NOTIF CQE) can CAS from 0 to
+ // the new generation. Volatile.Write ensures visibility on ARM64 before the
+ // count decrement below, preventing a concurrent TryTrack from observing
+ // TrackedOperation == null with a stale non-zero generation.
+ Volatile.Write(ref entry.TrackedOperationGeneration, 0UL);
+ DecrementTrackedIoUringOperationCountOnEventLoop();
+ operation = currentOperation;
+ return true;
+ }
+ }
+
+ /// Returns the tracked operation for the given user_data without untracking it.
+ private bool TryGetTrackedIoUringOperation(ulong userData, out SocketAsyncContext.AsyncOperation? operation)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "TryGetTrackedIoUringOperation must run on the event-loop thread.");
+ operation = null;
+ if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation))
+ {
+ return false;
+ }
+
+ ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+ SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref entry.TrackedOperation);
+ if (currentOperation is null)
+ {
+ return false;
+ }
+
+ if (Volatile.Read(ref entry.TrackedOperationGeneration) != generation)
+ {
+ return false;
+ }
+
+ operation = currentOperation;
+ return true;
+ }
+
+ /// Returns whether an operation matching the given user_data token (slot index plus generation) is currently tracked.
+ private bool ContainsTrackedIoUringOperation(ulong userData)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "ContainsTrackedIoUringOperation must run on the event-loop thread.");
+ return TryGetTrackedIoUringOperation(userData, out _);
+ }
+
+ /// Re-attaches a completion owner after dispatch-side deferral (for example SEND_ZC waiting on NOTIF CQE).
+ private bool TryReattachTrackedIoUringOperation(ulong userData, SocketAsyncContext.AsyncOperation operation)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "TryReattachTrackedIoUringOperation must run on the event-loop thread.");
+ if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation))
+ {
+ return false;
+ }
+
+ // Verify the completion slot is still in the expected SEND_ZC NOTIF-pending state
+ // before attempting to reattach. If the slot was freed and reallocated between the
+ // first CQE dispatch and this reattach call, the slot's state will not match.
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null || (uint)slotIndex >= (uint)completionEntries.Length)
+ {
+ return false;
+ }
+
+ ref IoUringCompletionSlot slot = ref completionEntries[slotIndex];
+ if (!slot.IsZeroCopySend || !slot.ZeroCopyNotificationPending || slot.Generation != generation)
+ {
+ // Slot was freed and possibly reallocated. The NOTIF CQE was either already
+ // processed or will be discarded by HandleZeroCopyNotification's generation check.
+ return false;
+ }
+
+ ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+ if (Interlocked.CompareExchange(ref entry.TrackedOperationGeneration, generation, 0) != 0)
+ {
+ return false;
+ }
+
+ if (Interlocked.CompareExchange(ref entry.TrackedOperation, operation, null) is not null)
+ {
+ Volatile.Write(ref entry.TrackedOperationGeneration, 0);
+ return false;
+ }
+
+ IncrementTrackedIoUringOperationCountOnEventLoop();
+ return true;
+ }
+
+ /// Atomically replaces the tracked operation for the given user_data.
+ private bool TryReplaceTrackedIoUringOperation(ulong userData, SocketAsyncContext.AsyncOperation newOperation)
+ {
+ if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation))
+ {
+ return false;
+ }
+
+ ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+ while (true)
+ {
+ SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref entry.TrackedOperation);
+ if (currentOperation is null)
+ {
+ return false;
+ }
+
+ if (Volatile.Read(ref entry.TrackedOperationGeneration) != generation)
+ {
+ return false;
+ }
+
+ if (Interlocked.CompareExchange(ref entry.TrackedOperation, newOperation, currentOperation) == currentOperation)
+ {
+ return true;
+ }
+ }
+ }
+
+ /// Removes a tracked operation, optionally verifying it matches an expected reference.
+ private IoUringTrackedOperationRemoveResult TryUntrackTrackedIoUringOperation(
+ ulong userData,
+ SocketAsyncContext.AsyncOperation? expectedOperation,
+ out SocketAsyncContext.AsyncOperation? removedOperation)
+ {
+ removedOperation = null;
+ if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation))
+ {
+ return IoUringTrackedOperationRemoveResult.NotFound;
+ }
+
+ ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+ while (true)
+ {
+ SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref entry.TrackedOperation);
+ if (currentOperation is null)
+ {
+ return IoUringTrackedOperationRemoveResult.NotFound;
+ }
+
+ if (Volatile.Read(ref entry.TrackedOperationGeneration) != generation)
+ {
+ return IoUringTrackedOperationRemoveResult.NotFound;
+ }
+
+ if (expectedOperation is not null && !ReferenceEquals(currentOperation, expectedOperation))
+ {
+ return IoUringTrackedOperationRemoveResult.Mismatch;
+ }
+
+ if (Interlocked.CompareExchange(ref entry.TrackedOperation, null, currentOperation) != currentOperation)
+ {
+ continue;
+ }
+
+ // Volatile.Write ensures the generation reset is visible on ARM64 before
+ // the count decrement. This method runs from worker threads (cancellation),
+ // and a plain store could reorder past Interlocked.Decrement, leaving a
+ // window where the event loop sees TrackedOperation == null but generation != 0.
+ Volatile.Write(ref entry.TrackedOperationGeneration, 0UL);
+ Interlocked.Decrement(ref _trackedIoUringOperationCount);
+ removedOperation = currentOperation;
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Canceled);
+ return IoUringTrackedOperationRemoveResult.Removed;
+ }
+ }
+
+ /// Returns true when no io_uring operations are currently tracked.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsIoUringTrackingEmpty() =>
+ Volatile.Read(ref _trackedIoUringOperationCount) == 0;
+
+ private void IncrementTrackedIoUringOperationCountOnEventLoop()
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "Tracked-operation increments must run on the event-loop thread.");
+ // Worker-thread untracks (cancellation) decrement this counter with
+ // Interlocked.Decrement, so event-loop updates must be atomic as well;
+ // a plain read-modify-write here could lose a concurrent decrement.
+ Interlocked.Increment(ref _trackedIoUringOperationCount);
+ }
+
+ private void DecrementTrackedIoUringOperationCountOnEventLoop()
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "Tracked-operation decrements must run on the event-loop thread.");
+ int nextCount = Interlocked.Decrement(ref _trackedIoUringOperationCount);
+ Debug.Assert(nextCount >= 0, "Tracked-operation count underflow.");
+ }
+
+ /// Removes an operation from completion-slot tracking, logging on mismatch.
+ internal bool TryUntrackIoUringOperation(ulong userData, SocketAsyncContext.AsyncOperation? expectedOperation = null)
+ {
+ IoUringTrackedOperationRemoveResult removeResult = TryUntrackTrackedIoUringOperation(userData, expectedOperation, out _);
+ if (removeResult == IoUringTrackedOperationRemoveResult.Mismatch)
+ {
+ Debug.Fail("io_uring tracked operation mismatch while untracking user_data.");
+ long mismatchCount = Interlocked.Increment(ref _ioUringUntrackMismatchCount);
+ if ((mismatchCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringUntrackMismatch(userData, mismatchCount);
+ }
+
+ return false;
+ }
+
+ return true;
+ }
+
+ /// Attempts to replace the currently tracked operation for an existing user_data slot.
+ internal bool TryReplaceIoUringTrackedOperation(ulong userData, SocketAsyncContext.AsyncOperation newOperation)
+ {
+ // Replacement keeps the same slot+generation token; completion ownership is still
+ // resolved by the CompareExchange gate in TryTakeTrackedIoUringOperation.
+ return TryReplaceTrackedIoUringOperation(userData, newOperation);
+ }
+
+ /// Enqueues a user_data for ASYNC_CANCEL on the event loop thread.
+ private bool TryEnqueueIoUringCancellation(ulong userData)
+ {
+ if (!_ioUringCapabilities.IsCompletionMode || userData == 0 || Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ return false;
+ }
+
+ MpscQueue? cancelQueue = _ioUringCancelQueue;
+ if (cancelQueue is null)
+ {
+ return false;
+ }
+
+ // First attempt: enqueue directly.
+ long queueLength = Interlocked.Increment(ref _ioUringCancelQueueLength);
+ if (queueLength <= s_ioUringCancellationQueueCapacity)
+ {
+ if (cancelQueue.TryEnqueue(userData))
+ {
+ return true;
+ }
+
+ Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ }
+ else
+ {
+ Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ }
+
+ // Queue-full can be transient under cancellation bursts. Wake the event loop and retry once.
+#if DEBUG
+ // Keep a dedicated test counter so functional tests can verify the wake-and-retry path.
+ Interlocked.Increment(ref _testCancelQueueWakeRetryCount);
+#endif
+ WakeEventLoop();
+ // Retry while SpinWait remains in active-spin mode; once it would yield, take slow-path accounting.
+ SpinWait retryBackoff = default;
+ do
+ {
+ retryBackoff.SpinOnce();
+
+ queueLength = Interlocked.Increment(ref _ioUringCancelQueueLength);
+ if (queueLength <= s_ioUringCancellationQueueCapacity)
+ {
+ if (cancelQueue.TryEnqueue(userData))
+ {
+ return true;
+ }
+
+ Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ continue;
+ }
+
+ Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ } while (!retryBackoff.NextSpinWillYield);
+
+ long overflowCount = Interlocked.Increment(ref _ioUringCancelQueueOverflowCount);
+ SocketsTelemetry.Log.IoUringCancellationQueueOverflow();
+ if ((overflowCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringCancellationQueueOverflow(overflowCount, s_ioUringCancellationQueueCapacity);
+ }
+
+ return false;
+ }
+
+ /// Writes an ASYNC_CANCEL SQE directly if the engine is on the event loop thread.
+ private unsafe bool TryQueueIoUringAsyncCancel(ulong userData)
+ {
+ if (!_ioUringCapabilities.IsIoUringPort || userData == 0)
+ {
+ return false;
+ }
+
+ if (!TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out _))
+ {
+ return false;
+ }
+
+ WriteAsyncCancelSqe(sqe, userData);
+ return true;
+ }
+
+ /// Writes to the eventfd to wake the event loop from a blocking wait.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private Interop.Error ManagedWakeEventLoop()
+ {
+ return Interop.Sys.IoUringShimWriteEventFd(_managedWakeupEventFd);
+ }
+
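+ /// Returns the bounded completion-wait timeout; shortened while the wake-failure
+ /// circuit breaker is active so lost eventfd wakeups are bounded by the poll interval.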
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private long GetManagedCompletionWaitTimeoutNanos()
+ {
+ return Volatile.Read(ref _ioUringWakeFailureConsecutiveCount) >= IoUringWakeFailureCircuitBreakerThreshold
+ ? IoUringConstants.WakeFailureFallbackWaitTimeoutNanos
+ : IoUringConstants.BoundedWaitTimeoutNanos;
+ }
+
+ /// Sends a coalesced wake signal to the event loop thread.
+ private void WakeEventLoop()
+ {
+ if (!_ioUringCapabilities.IsCompletionMode || Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ return;
+ }
+
+ // Advance the wakeup generation. The event loop compares its snapshot to detect
+ // wakeups that arrived during the blocking syscall. Coalescing: only the thread
+ // that moves the counter from even to odd actually writes the eventfd.
+ uint gen = Interlocked.Increment(ref _ioUringWakeupGeneration);
+ if ((gen & 1) == 0)
+ {
+ // Another producer already advanced the generation and is responsible
+ // for the eventfd write. Our increment is visible to the consumer's
+ // post-wake generation comparison, so the work will be drained.
+ return;
+ }
+
+ Interop.Error error = ManagedWakeEventLoop();
+ if (error == Interop.Error.SUCCESS)
+ {
+ int previousFailureCount = Interlocked.Exchange(ref _ioUringWakeFailureConsecutiveCount, 0);
+ if (previousFailureCount >= IoUringWakeFailureCircuitBreakerThreshold &&
+ NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringWakeCircuitBreakerStateChanged(enabled: false, previousFailureCount);
+ }
+
+ return;
+ }
+
+ int consecutiveFailures = Interlocked.Increment(ref _ioUringWakeFailureConsecutiveCount);
+ if (consecutiveFailures == IoUringWakeFailureCircuitBreakerThreshold &&
+ NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringWakeCircuitBreakerStateChanged(enabled: true, consecutiveFailures);
+ }
+
+ // Advance generation again so the next producer can attempt an eventfd write.
+ // (Moves from odd back to even, re-enabling the coalescing gate.)
+ Interlocked.Increment(ref _ioUringWakeupGeneration);
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringWakeFailure(error);
+ }
+ }
+
+ /// Enqueues a cancellation request and wakes the event loop.
+ internal void TryRequestIoUringCancellation(ulong userData)
+ {
+ if (!TryEnqueueIoUringCancellation(userData))
+ {
+ return;
+ }
+
+ WakeEventLoop();
+ }
+
+ /// Enqueues a readiness fallback event when io_uring submission is congested.
+ internal void EnqueueReadinessFallbackEvent(
+ SocketAsyncContext context,
+ Interop.Sys.SocketEvents events,
+ bool countAsPrepareQueueOverflowFallback = false)
+ {
+ if (events == Interop.Sys.SocketEvents.None)
+ {
+ return;
+ }
+
+ _eventQueue.Enqueue(new SocketIOEvent(context, events));
+ if (countAsPrepareQueueOverflowFallback)
+ {
+ RecordIoUringPrepareQueueOverflowFallback();
+ }
+ EnsureWorkerScheduled();
+ }
+
+ /// Drains queued cancellation requests into ASYNC_CANCEL SQEs.
+ private bool DrainIoUringCancellationQueue()
+ {
+ MpscQueue? cancelQueue = _ioUringCancelQueue;
+ if (cancelQueue is null)
+ {
+ return false;
+ }
+
+ bool preparedSqe = false;
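+ // Bound the per-pass drain so a cancellation burst cannot monopolize the event
+ // loop; entries beyond the cap stay queued and are picked up on the next submit pass.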
+ for (int drained = 0; drained < MaxIoUringCancelQueueDrainPerSubmit &&
+ cancelQueue.TryDequeue(out ulong userData); drained++)
+ {
+ long remainingLength = Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ Debug.Assert(remainingLength >= 0);
+
+ // Cancellation requests can race with terminal completion/untracking.
+ // Skip stale requests to avoid issuing known -ENOENT async-cancel SQEs.
+ if (!IsTrackedIoUringOperation(userData))
+ {
+ continue;
+ }
+
+ if (TryQueueIoUringAsyncCancel(userData))
+ {
+ preparedSqe = true;
+ }
+ }
+ return preparedSqe;
+ }
+
+ /// Drains both prepare and cancel queues, then submits all pending SQEs.
+ private Interop.Error SubmitIoUringBatch()
+ {
+ if (!_ioUringCapabilities.IsIoUringPort)
+ {
+ return Interop.Error.SUCCESS;
+ }
+
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "SubmitIoUringBatch must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ bool preparedSqe = false;
+ if (_ioUringCapabilities.IsCompletionMode)
+ {
+ preparedSqe |= DrainIoUringCancellationQueue();
+
+ MpscQueue? prepareQueue = _ioUringPrepareQueue;
+ if (prepareQueue is null)
+ {
+ ThrowInternalException("io_uring invariant violation: prepare queue is null while engine is in completion mode");
+ }
+
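+ // Drain at most MaxIoUringPrepareQueueDrainPerSubmit items per pass; if the queue
+ // stays non-empty, the re-wake below schedules another pass instead of stalling the loop.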
+ for (int drained = 0; drained < MaxIoUringPrepareQueueDrainPerSubmit &&
+ prepareQueue.TryDequeue(out IoUringPrepareWorkItem workItem); drained++)
+ {
+ long remainingLength = Interlocked.Decrement(ref _ioUringPrepareQueueLength);
+ Debug.Assert(remainingLength >= 0);
+ Interop.Error prepareError = TryPrepareAndTrackIoUringOperation(
+ workItem.Operation,
+ workItem.PrepareSequence,
+ out bool preparedOperation);
+ if (prepareError != Interop.Error.SUCCESS)
+ {
+ return prepareError;
+ }
+
+ preparedSqe |= preparedOperation;
+ if (!preparedOperation && workItem.Operation.IsInWaitingState())
+ {
+ if (IsPotentialCompletionSlotExhaustion())
+ {
+ int retryCount = workItem.Operation.IncrementIoUringSlotExhaustionRetryCount();
+ if (retryCount < MaxSlotExhaustionRetries &&
+ workItem.Operation.TryQueueIoUringPreparation())
+ {
+ continue;
+ }
+ }
+
+ workItem.Operation.ResetIoUringSlotExhaustionRetryCount();
+ EmitReadinessFallbackForUnpreparedOperation(workItem.Operation);
+ }
+ }
+ }
+
+ if (!preparedSqe)
+ {
+ // Inline re-prepare paths can write SQEs outside queue drains; ensure they are submitted.
+ if (_ioUringManagedPendingSubmissions != 0)
+ {
+ return SubmitIoUringOperationsNormalized();
+ }
+
+ if ((_ioUringCancelQueue?.IsEmpty == false) || (_ioUringPrepareQueue?.IsEmpty == false))
+ {
+ WakeEventLoop();
+ }
+
+ return Interop.Error.SUCCESS;
+ }
+
+ return SubmitIoUringOperationsNormalized();
+ }
+
+ /// Prepares an operation for io_uring submission and tracks it in completion-slot metadata.
+ /// On non-prepared paths, clears operation user_data and releases preparation resources.
+ private Interop.Error TryPrepareAndTrackIoUringOperation(
+ SocketAsyncContext.AsyncOperation operation,
+ long prepareSequence,
+ out bool preparedSqe)
+ {
+ preparedSqe = false;
+
+ bool prepared = operation.TryPrepareIoUring(operation.AssociatedContext, prepareSequence);
+ if (prepared)
+ {
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Queued,
+ IoUringOperationLifecycleState.Prepared);
+ }
+
+ if (prepared && operation.ErrorCode == SocketError.Success)
+ {
+ preparedSqe = true;
+ if (!TryTrackPreparedIoUringOperation(operation))
+ {
+ // Invariant violation: tracking collision after prepare.
+ // A prepared SQE may now complete without a managed owner; do not attempt best-effort recovery.
+ operation.ClearIoUringUserData();
+ ThrowInternalException("io_uring tracking collision: prepared SQE could not be tracked by user_data");
+ }
+
+ return Interop.Error.SUCCESS;
+ }
+
+ if (prepared)
+ {
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Prepared,
+ IoUringOperationLifecycleState.Detached);
+ }
+
+ if (!TryUntrackIoUringOperation(operation.IoUringUserData, operation))
+ {
+ // Mismatch indicates token ownership confusion; avoid releasing
+ // resources that may still be associated with another tracked op.
+ ThrowInternalException("io_uring untrack mismatch: token ownership confusion during prepare cleanup");
+ }
+
+ operation.ClearIoUringUserData();
+ return Interop.Error.SUCCESS;
+ }
+
+ /// Falls back to readiness notification for an operation that remained waiting after a failed prepare attempt.
+ private void EmitReadinessFallbackForUnpreparedOperation(SocketAsyncContext.AsyncOperation operation)
+ {
+ operation.ClearIoUringUserData();
+ Interop.Sys.SocketEvents fallbackEvents = operation.GetIoUringFallbackSocketEvents();
+ if (fallbackEvents == Interop.Sys.SocketEvents.None)
+ {
+ return;
+ }
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringPrepareFallbackToReadiness(fallbackEvents);
+ }
+
+ EnqueueReadinessFallbackEvent(operation.AssociatedContext, fallbackEvents);
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ void LogIoUringPrepareFallbackToReadiness(Interop.Sys.SocketEvents events)
+ {
+ NetEventSource.Error(
+ this,
+ $"io_uring prepare fallback to readiness notification: events={events}");
+ }
+ }
+
+ /// Registers a prepared operation in completion-slot metadata.
+ private bool TryTrackPreparedIoUringOperation(SocketAsyncContext.AsyncOperation operation)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "TryTrackPreparedIoUringOperation must run on the event-loop thread.");
+ if (!TryDecodeTrackedIoUringUserData(operation.IoUringUserData, out int slotIndex, out ulong generation))
+ {
+ return false;
+ }
+
+ ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+ if (Volatile.Read(ref entry.TrackedOperationGeneration) == 0 &&
+ Volatile.Read(ref entry.TrackedOperation) is null)
+ {
+ // Publish generation before operation so readers never observe a new
+ // operation paired with a stale generation on weakly-ordered CPUs.
+ Volatile.Write(ref entry.TrackedOperationGeneration, generation);
+ Volatile.Write(ref entry.TrackedOperation, operation);
+ IncrementTrackedIoUringOperationCountOnEventLoop();
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Prepared,
+ IoUringOperationLifecycleState.Submitted);
+ return true;
+ }
+
+ if (Volatile.Read(ref entry.TrackedOperation) is null &&
+ Volatile.Read(ref entry.TrackedOperationGeneration) == generation)
+ {
+ Volatile.Write(ref entry.TrackedOperationGeneration, 0);
+ }
+
+ // Persistent multishot receive can rebind an existing tracked user_data to a new
+ // managed operation before this call. In that case, tracking is already satisfied.
+ return operation.IoUringUserData != 0 &&
+ TryGetTrackedIoUringOperation(operation.IoUringUserData, out SocketAsyncContext.AsyncOperation? trackedOperation) &&
+ ReferenceEquals(trackedOperation, operation);
+ }
+
+ /// Returns whether the given user_data is currently tracked.
+ private bool IsTrackedIoUringOperation(ulong userData)
+ {
+ return ContainsTrackedIoUringOperation(userData);
+ }
+
+ /// Returns whether current completion-slot usage indicates likely slot exhaustion pressure.
+ private bool IsPotentialCompletionSlotExhaustion()
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null || completionEntries.Length == 0)
+ {
+ return false;
+ }
+
+ int threshold = Math.Max(0, completionEntries.Length - 16);
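+ // The 16-slot headroom is a heuristic guard band: report exhaustion pressure
+ // slightly before the pool is truly empty so retries begin before allocations fail.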
+ return _completionSlotsInUse >= threshold;
+ }
+
+ /// Debug assertion that tracked completion-slot usage never exceeds pool bounds.
+ [Conditional("DEBUG")]
+ private void AssertCompletionSlotUsageBounded()
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ Debug.Assert(
+ _completionSlotsInUse == 0,
+ "Completion slot usage must be zero when the slot pool is not allocated.");
+ return;
+ }
+
+ Debug.Assert(
+ _completionSlotsInUse >= 0 && _completionSlotsInUse <= completionEntries.Length,
+ $"Completion slot usage out of bounds: inUse={_completionSlotsInUse}, capacity={completionEntries.Length}.");
+ }
+
+ /// Debug assertion that completion-slot free-list topology matches _completionSlotsInUse.
+ [Conditional("DEBUG")]
+ private void AssertCompletionSlotPoolConsistency()
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ Debug.Assert(_completionSlotsInUse == 0, "Completion slot usage must be zero when slots are not allocated.");
+ Debug.Assert(_completionSlotFreeListHead == -1, "Free-list head must be reset when slots are not allocated.");
+ return;
+ }
+
+ bool[] visited = new bool[completionEntries.Length];
+ int freeCount = 0;
+ int current = _completionSlotFreeListHead;
+ while (current >= 0)
+ {
+ Debug.Assert(
+ (uint)current < (uint)completionEntries.Length,
+ $"Completion-slot free-list index out of range: {current}.");
+ if ((uint)current >= (uint)completionEntries.Length || visited[current])
+ {
+ break;
+ }
+
+ visited[current] = true;
+ freeCount++;
+ current = completionEntries[current].FreeListNext;
+ }
+
+ int expectedInUse = completionEntries.Length - freeCount;
+ Debug.Assert(
+ expectedInUse == _completionSlotsInUse,
+ $"Completion-slot accounting mismatch: expected in-use={expectedInUse}, actual in-use={_completionSlotsInUse}, free={freeCount}, capacity={completionEntries.Length}.");
+ }
+
+ /// Returns whether the calling thread is the event loop thread.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsCurrentThreadEventLoopThread() =>
+ Volatile.Read(ref _eventLoopManagedThreadId) == Environment.CurrentManagedThreadId;
+
+ /// Returns whether a submit error is treated as benign (unsupported operation, policy denial, or interruption) rather than a real failure.
+ private static bool IsIgnoredIoUringSubmitError(Interop.Error error) =>
+ error == Interop.Error.ENOSYS ||
+ error == Interop.Error.ENOTSUP ||
+ error == Interop.Error.EOPNOTSUPP ||
+ error == Interop.Error.EINTR ||
+ error == Interop.Error.EPERM;
+
+ /// Disables the registered ring fd after an EINVAL and falls back to the raw ring fd.
+ private void DisableRegisteredRingFd()
+ {
+ _ioUringSqRingInfo.RegisteredRingFd = -1;
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringRegisteredRingFdDisabled();
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringRegisteredRingFdDisabled() =>
+ NetEventSource.Error(this, "io_uring registered ring fd disabled after EINVAL; falling back to raw ring fd.");
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void LogIoUringSubmitErrorDrained(uint rejectedCount, Interop.Error error) =>
+ NetEventSource.Error(this, $"io_uring submit returned {error}: draining {rejectedCount} rejected SQE(s) as failed completions.");
+
+ /// Completes rejected-but-published SQEs as failed completions so ignored submit
+ /// errors do not re-queue the same work indefinitely.
+ private unsafe void DrainRejectedManagedSqesAsFailedCompletions(uint rejectedSubmitCount, Interop.Error submitError)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "DrainRejectedManagedSqesAsFailedCompletions must run on the event-loop thread.");
+ if (rejectedSubmitCount == 0)
+ {
+ return;
+ }
+
+ ref Interop.Sys.IoUringSqRingInfo ringInfo = ref _ioUringSqRingInfo;
+ if (ringInfo.SqeBase == IntPtr.Zero || ringInfo.SqEntries == 0 || ringInfo.SqeSize < (uint)sizeof(IoUringSqe))
+ {
+ return;
+ }
+
+ int completionResult = -Interop.Sys.ConvertErrorPalToPlatform(submitError);
+ uint firstRejectedSqTail = _ioUringManagedSqTail - rejectedSubmitCount;
+ SocketEventHandler handler = new SocketEventHandler(this);
+ bool enqueuedFallbackEvent = false;
+
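+ // The rejected SQEs are still sitting in SQ ring memory between the pre-submit tail
+ // and the current tail; re-read each slot's user_data from the ring, since the
+ // kernel never consumed these entries.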
+ for (uint i = 0; i < rejectedSubmitCount; i++)
+ {
+ uint sqTail = firstRejectedSqTail + i;
+ uint ringIndex = sqTail & ringInfo.SqMask;
+ nint sqeOffset = checked((nint)((nuint)ringIndex * ringInfo.SqeSize));
+ IoUringSqe* sqe = (IoUringSqe*)((byte*)ringInfo.SqeBase + sqeOffset);
+ ulong sqeUserData = sqe->UserData;
+ byte tag = (byte)(sqeUserData >> IoUringUserDataTagShift);
+
+ if (tag == IoUringConstants.TagReservedCompletion)
+ {
+ handler.DispatchSingleIoUringCompletion(
+ sqeUserData,
+ completionResult,
+ flags: 0,
+ socketAddressLen: 0,
+ controlBufferLen: 0,
+ auxiliaryData: 0,
+ hasFixedRecvBuffer: false,
+ fixedRecvBufferId: 0,
+ ref enqueuedFallbackEvent);
+ }
+ else if (tag != IoUringConstants.TagNone && tag != IoUringConstants.TagWakeupSignal)
+ {
+ Debug.Fail($"Unexpected io_uring SQE user_data tag on rejected submit drain: {tag}.");
+ }
+ }
+
+ if (enqueuedFallbackEvent)
+ {
+ EnsureWorkerScheduled();
+ }
+ }
+
+ /// Returns the accepted SQE count from an io_uring_enter result, clamped to the requested submit count.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static uint ComputeAcceptedSubmissionCount(uint requestedSubmitCount, int enterResult)
+ {
+ if (requestedSubmitCount == 0 || enterResult <= 0)
+ {
+ return 0;
+ }
+
+ uint acceptedSubmitCount = (uint)enterResult;
+ return acceptedSubmitCount <= requestedSubmitCount ? acceptedSubmitCount : requestedSubmitCount;
+ }
+
+ /// Updates pending-submission accounting after an io_uring_enter wait call.
+ private void UpdateManagedPendingSubmissionCountAfterEnter(uint requestedSubmitCount, int enterResult)
+ {
+ if (_sqPollEnabled)
+ {
+ // SQPOLL consumes published SQEs asynchronously after wakeup.
+ _ioUringManagedPendingSubmissions = 0;
+ return;
+ }
+
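+ // Unlike the submit path (which drains rejects as failed completions on error),
+ // the wait path keeps rejected SQEs counted as pending so they are retried on the
+ // next submit pass.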
+ uint acceptedSubmitCount = ComputeAcceptedSubmissionCount(requestedSubmitCount, enterResult);
+ uint rejectedSubmitCount = requestedSubmitCount - acceptedSubmitCount;
+ Debug.Assert(
+ acceptedSubmitCount + rejectedSubmitCount == requestedSubmitCount,
+ "Partial-submit accounting mismatch in io_uring wait path.");
+ _ioUringManagedPendingSubmissions = rejectedSubmitCount;
+ }
+
+ /// Submits the specified number of pending SQEs via io_uring_enter.
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe Interop.Error ManagedSubmitPendingEntries(
+ uint toSubmit,
+ out uint acceptedSubmitCount)
+ {
+ acceptedSubmitCount = 0;
+ if (toSubmit == 0)
+ {
+ return Interop.Error.SUCCESS;
+ }
+
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "ManagedSubmitPendingEntries must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ if (TryConsumeDebugForcedSubmitError(out Interop.Error forcedSubmitError))
+ {
+ return forcedSubmitError;
+ }
+
+ if (_sqPollEnabled)
+ {
+ if (!SqNeedWakeup())
+ {
+ SocketsTelemetry.Log.IoUringSqPollSubmissionSkipped(toSubmit);
+ acceptedSubmitCount = toSubmit;
+ return Interop.Error.SUCCESS;
+ }
+
+ uint wakeupFlags = IoUringConstants.EnterSqWakeup;
+ int wakeupRingFd = _managedRingFd;
+ if (_ioUringSqRingInfo.RegisteredRingFd >= 0)
+ {
+ wakeupFlags |= IoUringConstants.EnterRegisteredRing;
+ wakeupRingFd = _ioUringSqRingInfo.RegisteredRingFd;
+ }
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogSqPollWakeup(this, toSubmit);
+ }
+
+ // Wakeup accounting is intentionally optimistic: this counter tracks wake requests
+ // issued by managed code, not guaranteed kernel-side SQ consumption.
+ SocketsTelemetry.Log.IoUringSqPollWakeup();
+ int wakeupResult;
+ Interop.Error wakeupError = Interop.Sys.IoUringShimEnter(wakeupRingFd, 0, 0, wakeupFlags, &wakeupResult);
+ if (wakeupError == Interop.Error.EINVAL && (wakeupFlags & IoUringConstants.EnterRegisteredRing) != 0)
+ {
+ DisableRegisteredRingFd();
+ wakeupFlags &= ~IoUringConstants.EnterRegisteredRing;
+ wakeupRingFd = _managedRingFd;
+ wakeupError = Interop.Sys.IoUringShimEnter(wakeupRingFd, 0, 0, wakeupFlags, &wakeupResult);
+ }
+
+ if (wakeupError == Interop.Error.SUCCESS)
+ {
+ acceptedSubmitCount = toSubmit;
+ }
+
+ return wakeupError;
+ }
+
+ uint enterFlags = 0;
+ int ringFd = _managedRingFd;
+ if (_ioUringSqRingInfo.RegisteredRingFd >= 0)
+ {
+ enterFlags |= IoUringConstants.EnterRegisteredRing;
+ ringFd = _ioUringSqRingInfo.RegisteredRingFd;
+ }
+
+ while (toSubmit > 0)
+ {
+ int result;
+ Interop.Error err = Interop.Sys.IoUringShimEnter(ringFd, toSubmit, 0, enterFlags, &result);
+ if (err == Interop.Error.EINVAL && (enterFlags & IoUringConstants.EnterRegisteredRing) != 0)
+ {
+ DisableRegisteredRingFd();
+ enterFlags &= ~IoUringConstants.EnterRegisteredRing;
+ ringFd = _managedRingFd;
+ err = Interop.Sys.IoUringShimEnter(ringFd, toSubmit, 0, enterFlags, &result);
+ }
+
+ if (err != Interop.Error.SUCCESS)
+ return err;
+
+ uint acceptedThisCall = ComputeAcceptedSubmissionCount(toSubmit, result);
+ if (acceptedThisCall == 0)
+ {
+ return Interop.Error.EAGAIN;
+ }
+
+ acceptedSubmitCount += acceptedThisCall;
+ toSubmit -= acceptedThisCall;
+ }
+ return Interop.Error.SUCCESS;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void LogSqPollWakeup(SocketAsyncEngine engine, uint pendingSubmissionCount) =>
+ NetEventSource.Info(engine, $"io_uring SQPOLL wakeup requested for pending SQEs: {pendingSubmissionCount}");
+
+ /// Computes pending submissions and calls ManagedSubmitPendingEntries.
+ private Interop.Error SubmitIoUringOperationsNormalized()
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "SubmitIoUringOperationsNormalized must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ PublishManagedSqeTail();
+ uint managedPending = _ioUringManagedPendingSubmissions;
+ _ioUringManagedPendingSubmissions = 0;
+
+ Interop.Error error = ManagedSubmitPendingEntries(managedPending, out uint acceptedSubmitCount);
+ uint rejectedSubmitCount = managedPending - acceptedSubmitCount;
+ Debug.Assert(
+ acceptedSubmitCount + rejectedSubmitCount == managedPending,
+ "Partial-submit accounting mismatch in io_uring submit path.");
+
+ // EFAULT indicates corrupted SQ ring memory; propagate to FailFast.
+ // All other errors drain rejected SQEs as failed completions so individual
+ // operations receive error callbacks and the engine survives.
+ bool fatalSubmitError = error == Interop.Error.EFAULT;
+ if (error != Interop.Error.SUCCESS && rejectedSubmitCount != 0 && !fatalSubmitError)
+ {
+ if (!IsIgnoredIoUringSubmitError(error))
+ {
+ LogIoUringSubmitErrorDrained(rejectedSubmitCount, error);
+ }
+
+ DrainRejectedManagedSqesAsFailedCompletions(rejectedSubmitCount, error);
+ }
+
+ return fatalSubmitError ? error : Interop.Error.SUCCESS;
+ }
+
+ /// Cancels all queued-but-not-submitted operations during teardown.
+ private void DrainQueuedIoUringOperationsForTeardown()
+ {
+ MpscQueue? prepareQueue = _ioUringPrepareQueue;
+ if (prepareQueue is not null)
+ {
+ while (prepareQueue.TryDequeue(out IoUringPrepareWorkItem workItem))
+ {
+ long remainingLength = Interlocked.Decrement(ref _ioUringPrepareQueueLength);
+ Debug.Assert(remainingLength >= 0);
+
+ SocketAsyncContext.AsyncOperation operation = workItem.Operation;
+ operation.CancelPendingIoUringPreparation(workItem.PrepareSequence);
+ operation.TryCancelForTeardown();
+ operation.ClearIoUringUserData();
+ }
+ }
+
+ MpscQueue<ulong>? cancelQueue = _ioUringCancelQueue;
+ if (cancelQueue is not null)
+ {
+ while (cancelQueue.TryDequeue(out _))
+ {
+ long remainingLength = Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ Debug.Assert(remainingLength >= 0);
+ }
+ }
+
+ // No reset needed for generation counter; teardown does not re-enter the wait loop.
+ }
+
+ /// <summary>
+ /// Cancels all tracked in-flight operations during teardown.
+ /// This includes any future long-lived operations (for example multishot recv).
+ /// </summary>
+ private void DrainTrackedIoUringOperationsForTeardown(bool portClosedForTeardown)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "DrainTrackedIoUringOperationsForTeardown must run on the event-loop thread.");
+ if (_completionSlots is null || IsIoUringTrackingEmpty())
+ {
+ return;
+ }
+
+ if (_cqOverflowRecoveryActive)
+ {
+ // Phase 1 spec branch (b): teardown preempts overflow-recovery ownership;
+ // tracked-operation drain/cancel paths become the single shutdown owner.
+ _cqOverflowRecoveryBranch = IoUringCqOverflowRecoveryBranch.Teardown;
+ _cqOverflowRecoveryActive = false;
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringCqOverflowRecoveryTeardownPreempted();
+ }
+ }
+
+ bool queuedAsyncCancel = false;
+ bool canPrepareTeardownCancels = !portClosedForTeardown && IsCurrentThreadEventLoopThread();
+ IoUringTrackedOperationState[]? trackedOperations = _trackedOperations;
+ if (trackedOperations is null)
+ {
+ return;
+ }
+
+ // Teardown uses an explicit array walk to avoid iterator state-machine allocations.
+ for (int i = 0; i < trackedOperations.Length; i++)
+ {
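+ // Interlocked.Exchange claims each tracked slot exactly once, so a racing
+ // completion cannot detach the same operation twice during teardown.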
+ SocketAsyncContext.AsyncOperation? operation = Interlocked.Exchange(ref trackedOperations[i].TrackedOperation, null);
+ if (operation is null)
+ {
+ continue;
+ }
+
+ Volatile.Write(ref trackedOperations[i].TrackedOperationGeneration, 0UL);
+ DecrementTrackedIoUringOperationCountOnEventLoop();
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Submitted,
+ IoUringOperationLifecycleState.Detached);
+
+ ulong userData = operation.IoUringUserData;
+ if (canPrepareTeardownCancels &&
+ TryQueueIoUringAsyncCancel(userData))
+ {
+ queuedAsyncCancel = true;
+ }
+
+ // Teardown policy: if the port was already closed, native ownership has been
+ // detached and it is now safe to release operation-owned resources eagerly.
+ // Otherwise, queue best-effort async cancel before releasing resources.
+ operation.TryCancelForTeardown();
+ operation.ClearIoUringUserData();
+ }
+
+ if (canPrepareTeardownCancels && queuedAsyncCancel)
+ {
+ Interop.Error submitError = SubmitIoUringOperationsNormalized();
+ if (submitError != Interop.Error.SUCCESS)
+ {
+ if (NetEventSource.Log.IsEnabled()) LogIoUringAsyncCancelSubmitFailure(submitError, IoUringCancellationOrigin.Teardown);
+ }
+ }
+ }
+
+ /// Increments the late-completion counter and samples to the log.
+ private void RecordBenignLateIoUringCompletion(ulong userData)
+ {
+ RecordIoUringCounterAndMaybeLog(ref _ioUringBenignLateCompletionCount, userData, "io_uring completion arrived after managed untrack");
+ }
+
+ /// Increments the diagnostic counter tracking pending completion retries that queued prepare work.
+ private void RecordIoUringPendingRetryQueuedToPrepareQueue()
+ {
+ Interlocked.Increment(ref _ioUringPendingRetryQueuedToPrepareQueueCount);
+ }
+
+ /// Increments the non-pinnable prepare fallback counter for this engine instance.
+ internal void RecordIoUringNonPinnablePrepareFallback()
+ {
+ Interlocked.Increment(ref _ioUringNonPinnablePrepareFallbackCount);
+ }
+
+ /// Increments the completion-slot exhaustion counter.
+ private void RecordIoUringCompletionSlotExhaustion()
+ {
+ Interlocked.Increment(ref _ioUringCompletionSlotExhaustionCount);
+ }
+
+ /// Increments the completion-slot drain-recovery counter.
+ private void RecordIoUringCompletionSlotDrainRecovery()
+ {
+ Interlocked.Increment(ref _ioUringCompletionSlotDrainRecoveryCount);
+ }
+
+ /// Increments the prepare-queue overflow fallback counter.
+ private void RecordIoUringPrepareQueueOverflowFallback()
+ {
+ Interlocked.Increment(ref _ioUringPrepareQueueOverflowFallbackCount);
+ }
+
+ /// Increments the requeue-failure counter and samples to the log.
+ private void RecordIoUringCompletionRequeueFailure(ulong userData)
+ {
+ RecordIoUringCounterAndMaybeLog(ref _ioUringCompletionRequeueFailureCount, userData, "io_uring completion requeue failed; queued readiness fallback");
+ }
+
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs
index ae9b6c9095e43f..b6f4494980fd95 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs
@@ -4,20 +4,22 @@
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
namespace System.Net.Sockets
{
- internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem
+ internal sealed unsafe partial class SocketAsyncEngine : IThreadPoolWorkItem
{
- private const int EventBufferCount =
+ private const int DefaultEventBufferCount =
#if DEBUG
32;
#else
1024;
#endif
+ private static readonly int s_eventBufferCount = GetEventBufferCount();
// Socket continuations are dispatched to the ThreadPool from the event thread.
// This avoids continuations blocking the event handling.
@@ -25,9 +27,55 @@ internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem
// PreferInlineCompletions defaults to false and can be set to true using the DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS envvar.
internal static readonly bool InlineSocketCompletionsEnabled = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS") == "1";
+#if DEBUG
+ /// <summary>
+ /// Central registry of DEBUG-only io_uring test environment variables.
+ /// These switches are intentionally unsupported for production tuning.
+ /// </summary>
+ private static class IoUringTestEnvironmentVariables
+ {
+ internal const string EventBufferCount = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_EVENT_BUFFER_COUNT";
+ internal const string QueueEntries = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_QUEUE_ENTRIES";
+ internal const string PrepareQueueCapacity = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PREPARE_QUEUE_CAPACITY";
+ internal const string DirectSqe = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE";
+ internal const string ZeroCopySend = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND";
+ internal const string ForceEagainOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_EAGAIN_ONCE_MASK";
+ internal const string ForceEcanceledOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ECANCELED_ONCE_MASK";
+ internal const string ForceSubmitEpermOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_SUBMIT_EPERM_ONCE";
+ internal const string ForceEnterEintrRetryLimitOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ENTER_EINTR_RETRY_LIMIT_ONCE";
+ internal const string ForceKernelVersionUnsupported = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_KERNEL_VERSION_UNSUPPORTED";
+ internal const string ForceProvidedBufferRingOomOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_PROVIDED_BUFFER_RING_OOM_ONCE";
+ internal const string ProvidedBufferSize = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PROVIDED_BUFFER_SIZE";
+ internal const string AdaptiveBufferSizing = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ADAPTIVE_BUFFER_SIZING";
+ internal const string RegisterBuffers = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS";
+ }
+#endif
+
+ private static int GetEventBufferCount()
+ {
+#if DEBUG
+ // Test-only knob to make wait-buffer saturation deterministic for io_uring diagnostics coverage.
+ // Only available in DEBUG builds so production code never reads test env vars.
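+ // Hypothetical usage while running DEBUG-build tests:
+ //   DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_EVENT_BUFFER_COUNT=1 dotnet test ...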
+ if (OperatingSystem.IsLinux())
+ {
+ string? configuredValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.EventBufferCount);
+ if (configuredValue is not null &&
+ int.TryParse(configuredValue, out int parsedValue) &&
+ parsedValue >= 1 &&
+ parsedValue <= DefaultEventBufferCount)
+ {
+ return parsedValue;
+ }
+ }
+#endif
+
+ return DefaultEventBufferCount;
+ }
+
private static int GetEngineCount()
{
// The responsibility of SocketAsyncEngine is to get notifications from epoll|kqueue
+ // (or io_uring on Linux when enabled in the native shim)
// and schedule corresponding work items to ThreadPool (socket reads and writes).
//
// Using TechEmpower benchmarks that generate a LOT of SMALL socket reads and writes under a VERY HIGH load
@@ -85,11 +133,12 @@ private static SocketAsyncEngine[] CreateEngines()
private readonly IntPtr _port;
private readonly Interop.Sys.SocketEvent* _buffer;
+ private int _eventLoopManagedThreadId;
//
// Queue of events generated by EventLoop() that would be processed by the thread pool
//
- private readonly ConcurrentQueue<SocketIOEvent> _eventQueue = new ConcurrentQueue<SocketIOEvent>();
+ private readonly SocketIOEventQueue _eventQueue = new SocketIOEventQueue();
// This flag is used for communication between item enqueuing and workers that process the items.
// There are two states of this flag:
@@ -143,8 +192,20 @@ private bool TryRegisterCore(IntPtr socketHandle, SocketAsyncContext context, ou
context.GlobalContextIndex = index;
}
- error = Interop.Sys.TryChangeSocketEventRegistration(_port, socketHandle, Interop.Sys.SocketEvents.None,
- Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write, context.GlobalContextIndex);
+ Interop.Error managedError = default;
+ bool managedHandled = false;
+ LinuxTryChangeSocketEventRegistration(socketHandle, Interop.Sys.SocketEvents.None,
+ Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write,
+ context.GlobalContextIndex, ref managedError, ref managedHandled);
+ if (managedHandled)
+ {
+ error = managedError;
+ }
+ else
+ {
+ error = Interop.Sys.TryChangeSocketEventRegistration(_port, socketHandle, Interop.Sys.SocketEvents.None,
+ Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write, context.GlobalContextIndex);
+ }
if (error == Interop.Error.SUCCESS)
{
return true;
@@ -182,19 +243,21 @@ private SocketAsyncEngine()
err = Interop.Sys.CreateSocketEventPort(portPtr);
if (err != Interop.Error.SUCCESS)
{
- throw new InternalException(err);
+ ThrowInternalException(err);
}
}
fixed (Interop.Sys.SocketEvent** bufferPtr = &_buffer)
{
- err = Interop.Sys.CreateSocketEventBuffer(EventBufferCount, bufferPtr);
+ err = Interop.Sys.CreateSocketEventBuffer(s_eventBufferCount, bufferPtr);
if (err != Interop.Error.SUCCESS)
{
- throw new InternalException(err);
+ ThrowInternalException(err);
}
}
+ LinuxDetectAndInitializeIoUring();
+
var thread = new Thread(static s => ((SocketAsyncEngine)s!).EventLoop())
{
IsBackground = true,
@@ -204,37 +267,92 @@ private SocketAsyncEngine()
}
catch
{
+ // Constructor failure path only: if construction throws, clean up immediately.
+ // This path is the sole caller of FreeNativeResources().
FreeNativeResources();
throw;
}
}
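+
+ // The Linux-only partial methods below compile away, declarations and call sites
+ // alike, when no implementation is provided, so non-Linux builds keep the plain
+ // epoll/kqueue paths.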
+ partial void LinuxDetectAndInitializeIoUring();
+ partial void LinuxEventLoopEnableRings();
+ partial void LinuxEventLoopBeforeWait();
+ partial void LinuxEventLoopTryCompletionWait(SocketEventHandler handler, ref int numEvents, ref int numCompletions, ref Interop.Error err, ref bool waitHandled);
+ partial void LinuxEventLoopAfterIteration();
+ partial void LinuxBeforeFreeNativeResources(ref bool closeSocketEventPort);
+ partial void LinuxFreeIoUringResources();
+ partial void LinuxTryChangeSocketEventRegistration(IntPtr socketHandle, Interop.Sys.SocketEvents currentEvents, Interop.Sys.SocketEvents newEvents, int data, ref Interop.Error error, ref bool handled);
+
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowInternalException(Interop.Error error) =>
+ throw new InternalException(error);
+
+ [DoesNotReturn]
+ [StackTraceHidden]
+ private static void ThrowInternalException(string message) =>
+ throw new InternalException(message);
+
+ [DoesNotReturn]
+ [StackTraceHidden]
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void FailFastEventLoop(Exception exception) =>
+ Environment.FailFast($"Exception thrown from SocketAsyncEngine event loop: {exception}", exception);
+
+ private void RecordAndAssertEventLoopThreadIdentity()
+ {
+ int currentThreadId = Environment.CurrentManagedThreadId;
+#if DEBUG
+ int previousThreadId = Interlocked.CompareExchange(ref _eventLoopManagedThreadId, currentThreadId, 0);
+ Debug.Assert(
+ previousThreadId == 0 || previousThreadId == currentThreadId,
+ $"SocketAsyncEngine event loop thread changed: previous={previousThreadId}, current={currentThreadId}");
+#else
+ Interlocked.CompareExchange(ref _eventLoopManagedThreadId, currentThreadId, 0);
+#endif
+ }
+
private void EventLoop()
{
try
{
+ RecordAndAssertEventLoopThreadIdentity();
+ LinuxEventLoopEnableRings();
SocketEventHandler handler = new SocketEventHandler(this);
while (true)
{
- int numEvents = EventBufferCount;
- Interop.Error err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents);
+ LinuxEventLoopBeforeWait();
+
+ int numEvents = s_eventBufferCount;
+ int numCompletions = 0;
+ Interop.Error err = default;
+ bool waitHandled = false;
+ LinuxEventLoopTryCompletionWait(handler, ref numEvents, ref numCompletions, ref err, ref waitHandled);
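+ // waitHandled stays false when the io_uring partial method is absent or declines
+ // the wait, in which case the engine falls back to the blocking epoll/kqueue wait.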
+ if (!waitHandled)
+ {
+ err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents);
+ }
+
if (err != Interop.Error.SUCCESS)
{
- throw new InternalException(err);
+ ThrowInternalException(err);
}
- // The native shim is responsible for ensuring this condition.
- Debug.Assert(numEvents > 0, $"Unexpected numEvents: {numEvents}");
+ // io_uring completion-mode wait can return with zero surfaced events/completions
+ // when woken only to flush managed prepare/cancel queues.
+ Debug.Assert(waitHandled || numEvents > 0 || numCompletions > 0, $"Unexpected wait result: events={numEvents}, completions={numCompletions}");
- if (handler.HandleSocketEvents(numEvents))
+ if (numEvents > 0 && handler.HandleSocketEvents(numEvents))
{
EnsureWorkerScheduled();
}
+
+ LinuxEventLoopAfterIteration();
}
}
catch (Exception e)
{
- Environment.FailFast("Exception thrown from SocketAsyncEngine event loop: " + e.ToString(), e);
+ FailFastEventLoop(e);
}
}
@@ -257,7 +375,7 @@ void IThreadPoolWorkItem.Execute()
// Checking for items must happen after resetting the processing state.
Interlocked.MemoryBarrier();
- ConcurrentQueue<SocketIOEvent> eventQueue = _eventQueue;
+ SocketIOEventQueue eventQueue = _eventQueue;
if (!eventQueue.TryDequeue(out SocketIOEvent ev))
{
return;
@@ -295,11 +413,22 @@ void IThreadPoolWorkItem.Execute()
private void FreeNativeResources()
{
+ Debug.Assert(
+ Volatile.Read(ref _eventLoopManagedThreadId) == 0,
+ "FreeNativeResources is only used by constructor-failure cleanup; event loop thread must not have started.");
+ bool closeSocketEventPort = true;
+ // Linux io_uring teardown may need to close the port first to ensure native
+ // ownership is detached before managed operation resources are released.
+ LinuxBeforeFreeNativeResources(ref closeSocketEventPort);
+
+ LinuxFreeIoUringResources();
+
if (_buffer != null)
{
Interop.Sys.FreeSocketEventBuffer(_buffer);
}
- if (_port != (IntPtr)(-1))
+
+ if (closeSocketEventPort && _port != (IntPtr)(-1))
{
Interop.Sys.CloseSocketEventPort(_port);
}
@@ -310,14 +439,16 @@ private void FreeNativeResources()
// To avoid this, the event handling logic is delegated to a non-inlined processing method.
// See discussion: https://github.com/dotnet/runtime/issues/37064
// SocketEventHandler holds an on-stack cache of SocketAsyncEngine members needed by the handler method.
- private readonly struct SocketEventHandler
+ private readonly partial struct SocketEventHandler
{
public Interop.Sys.SocketEvent* Buffer { get; }
- private readonly ConcurrentQueue<SocketIOEvent> _eventQueue;
+ private readonly SocketIOEventQueue _eventQueue;
+ private readonly SocketAsyncEngine _engine;
public SocketEventHandler(SocketAsyncEngine engine)
{
+ _engine = engine;
Buffer = engine._buffer;
_eventQueue = engine._eventQueue;
}
@@ -358,6 +489,25 @@ public bool HandleSocketEvents(int numEvents)
}
}
+ private sealed class SocketIOEventQueue
+ {
+#if TARGET_LINUX
+ private readonly MpscQueue<SocketIOEvent> _queue = new MpscQueue<SocketIOEvent>();
+#else
+ private readonly ConcurrentQueue<SocketIOEvent> _queue = new ConcurrentQueue<SocketIOEvent>();
+#endif
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ // Event delivery cannot drop entries. Use Enqueue's retrying contract here;
+ // io_uring prepare/cancel queues use TryEnqueue where fallback paths exist.
+ public void Enqueue(SocketIOEvent socketEvent) => _queue.Enqueue(socketEvent);
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public bool TryDequeue(out SocketIOEvent socketEvent) => _queue.TryDequeue(out socketEvent);
+
+ public bool IsEmpty => _queue.IsEmpty;
+ }
+
private readonly struct SocketIOEvent
{
public SocketAsyncContext Context { get; }
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs
new file mode 100644
index 00000000000000..38d7ef78334b34
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs
@@ -0,0 +1,12 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+namespace System.Net.Sockets
+{
+ internal static partial class SocketPal
+ {
+ /// Extracts IPPacketInformation from a completed io_uring recvmsg message header.
+ internal static unsafe IPPacketInformation GetIoUringIPPacketInformation(Interop.Sys.MessageHeader* messageHeader, bool isIPv4, bool isIPv6) =>
+ GetIPPacketInformation(messageHeader, isIPv4, isIPv6);
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs
index 1171961a204351..114bcad4dc2f3a 100644
--- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs
@@ -14,6 +14,29 @@ internal sealed partial class SocketsTelemetry : EventSource
private const string ConnectActivityName = ActivitySourceName + ".Connect";
private static readonly ActivitySource s_connectActivitySource = new ActivitySource(ActivitySourceName);
+ internal static class Keywords
+ {
+ // Stable operational counters are always published when the source is enabled on Linux.
+ // Diagnostic counters are opt-in and can evolve without name stability guarantees.
+ internal const EventKeywords IoUringDiagnostics = (EventKeywords)0x1;
+ }
+
+ internal static class IoUringCounterNames
+ {
+ internal const string PrepareNonPinnableFallbacks = "io-uring-prepare-nonpinnable-fallbacks";
+ internal const string SocketEventBufferFull = "io-uring-socket-event-buffer-full";
+ internal const string CqOverflows = "io-uring-cq-overflows";
+ internal const string CqOverflowRecoveries = "io-uring-cq-overflow-recoveries";
+ internal const string PrepareQueueOverflows = "io-uring-prepare-queue-overflows";
+ internal const string PrepareQueueOverflowFallbacks = "io-uring-prepare-queue-overflow-fallbacks";
+ internal const string CompletionSlotExhaustions = "io-uring-completion-slot-exhaustions";
+ internal const string CompletionSlotHighWaterMark = "io-uring-completion-slot-high-water-mark";
+ internal const string CancellationQueueOverflows = "io-uring-cancellation-queue-overflows";
+ internal const string ProvidedBufferDepletions = "io-uring-provided-buffer-depletions";
+ internal const string SqPollWakeups = "io-uring-sqpoll-wakeups";
+ internal const string SqPollSubmissionsSkipped = "io-uring-sqpoll-submissions-skipped";
+ }
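+
+ // Consumption sketch (assuming the standard dotnet-counters tool): these names
+ // surface as PollingCounters on the "System.Net.Sockets" EventSource, e.g.
+ //   dotnet-counters monitor --counters System.Net.Sockets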
+
public static readonly SocketsTelemetry Log = new SocketsTelemetry();
private PollingCounter? _currentOutgoingConnectAttemptsCounter;
@@ -23,6 +46,20 @@ internal sealed partial class SocketsTelemetry : EventSource
private PollingCounter? _bytesSentCounter;
private PollingCounter? _datagramsReceivedCounter;
private PollingCounter? _datagramsSentCounter;
+ // Keep io_uring counter backing fields always present so EventCounter name contracts remain stable
+ // across platforms; OnEventCommand only registers these counters on Linux.
+ private PollingCounter? _ioUringPrepareNonPinnableFallbacksCounter;
+ private PollingCounter? _ioUringSocketEventBufferFullCounter;
+ private PollingCounter? _ioUringCqOverflowCounter;
+ private PollingCounter? _ioUringCqOverflowRecoveriesCounter;
+ private PollingCounter? _ioUringPrepareQueueOverflowsCounter;
+ private PollingCounter? _ioUringPrepareQueueOverflowFallbacksCounter;
+ private PollingCounter? _ioUringCompletionSlotExhaustionsCounter;
+ private PollingCounter? _ioUringCompletionSlotHighWaterMarkCounter;
+ private PollingCounter? _ioUringCancellationQueueOverflowsCounter;
+ private PollingCounter? _ioUringProvidedBufferDepletionsCounter;
+ private PollingCounter? _ioUringSqPollWakeupsCounter;
+ private PollingCounter? _ioUringSqPollSubmissionsSkippedCounter;
private long _currentOutgoingConnectAttempts;
private long _outgoingConnectionsEstablished;
@@ -31,6 +68,92 @@ internal sealed partial class SocketsTelemetry : EventSource
private long _bytesSent;
private long _datagramsReceived;
private long _datagramsSent;
+ // Backing fields stay cross-platform for contract stability; they are only surfaced as counters on Linux.
+ private long _ioUringPrepareNonPinnableFallbacks;
+ private long _ioUringAsyncCancelRequestCqes;
+ private long _ioUringSocketEventBufferFull;
+ private long _ioUringCqOverflow;
+ private long _ioUringCqOverflowRecoveries;
+ private long _ioUringCompletionRequeueFailures;
+ private long _ioUringZeroCopyNotificationPendingSlots;
+ private long _ioUringPrepareQueueOverflows;
+ private long _ioUringPrepareQueueOverflowFallbacks;
+ private long _ioUringPrepareQueueDepth;
+ private long _ioUringCompletionSlotExhaustions;
+ private long _ioUringCompletionSlotHighWaterMark;
+ private long _ioUringCancellationQueueOverflows;
+ private long _ioUringCompletionSlotDrainRecoveries;
+ private long _ioUringProvidedBufferDepletions;
+ private long _ioUringProvidedBufferCurrentSize;
+ private long _ioUringProvidedBufferRecycles;
+ private long _ioUringProvidedBufferResizes;
+ private long _ioUringRegisteredBuffersInitialSuccess;
+ private long _ioUringRegisteredBuffersInitialFailure;
+ private long _ioUringRegisteredBuffersReregistrationSuccess;
+ private long _ioUringRegisteredBuffersReregistrationFailure;
+ private long _ioUringFixedRecvSelected;
+ private long _ioUringFixedRecvFallbacks;
+ private long _ioUringSqPollWakeups;
+ private long _ioUringSqPollSubmissionsSkipped;
+ private long _ioUringPersistentMultishotRecvReuse;
+ private long _ioUringPersistentMultishotRecvTermination;
+ private long _ioUringPersistentMultishotRecvEarlyData;
+
+ internal enum IoUringCounterFieldForTest
+ {
+ PrepareNonPinnableFallbacks,
+ AsyncCancelRequestCqes,
+ SocketEventBufferFull,
+ CqOverflow,
+ CqOverflowRecoveries,
+ CompletionRequeueFailures,
+ PrepareQueueOverflows,
+ PrepareQueueOverflowFallbacks,
+ CompletionSlotExhaustions,
+ CompletionSlotHighWaterMark,
+ CancellationQueueOverflows,
+ SqPollWakeups,
+ SqPollSubmissionsSkipped,
+ ProvidedBufferDepletions,
+ ProvidedBufferRecycles,
+ RegisteredBuffersInitialSuccess,
+ RegisteredBuffersInitialFailure,
+ RegisteredBuffersReregistrationSuccess,
+ RegisteredBuffersReregistrationFailure,
+ PersistentMultishotRecvReuse,
+ PersistentMultishotRecvTermination,
+ }
+
+ internal static ulong GetIoUringCounterValueForTest(IoUringCounterFieldForTest field)
+ {
+ // Test hook used by io_uring functional tests to validate counter deltas without
+ // reflecting over private fields that are brittle under refactoring.
+ return field switch
+ {
+ IoUringCounterFieldForTest.PrepareNonPinnableFallbacks => (ulong)Interlocked.Read(ref Log._ioUringPrepareNonPinnableFallbacks),
+ IoUringCounterFieldForTest.AsyncCancelRequestCqes => (ulong)Interlocked.Read(ref Log._ioUringAsyncCancelRequestCqes),
+ IoUringCounterFieldForTest.SocketEventBufferFull => (ulong)Interlocked.Read(ref Log._ioUringSocketEventBufferFull),
+ IoUringCounterFieldForTest.CqOverflow => (ulong)Interlocked.Read(ref Log._ioUringCqOverflow),
+ IoUringCounterFieldForTest.CqOverflowRecoveries => (ulong)Interlocked.Read(ref Log._ioUringCqOverflowRecoveries),
+ IoUringCounterFieldForTest.CompletionRequeueFailures => (ulong)Interlocked.Read(ref Log._ioUringCompletionRequeueFailures),
+ IoUringCounterFieldForTest.PrepareQueueOverflows => (ulong)Interlocked.Read(ref Log._ioUringPrepareQueueOverflows),
+ IoUringCounterFieldForTest.PrepareQueueOverflowFallbacks => (ulong)Interlocked.Read(ref Log._ioUringPrepareQueueOverflowFallbacks),
+ IoUringCounterFieldForTest.CompletionSlotExhaustions => (ulong)Interlocked.Read(ref Log._ioUringCompletionSlotExhaustions),
+ IoUringCounterFieldForTest.CompletionSlotHighWaterMark => (ulong)Interlocked.Read(ref Log._ioUringCompletionSlotHighWaterMark),
+ IoUringCounterFieldForTest.CancellationQueueOverflows => (ulong)Interlocked.Read(ref Log._ioUringCancellationQueueOverflows),
+ IoUringCounterFieldForTest.SqPollWakeups => (ulong)Interlocked.Read(ref Log._ioUringSqPollWakeups),
+ IoUringCounterFieldForTest.SqPollSubmissionsSkipped => (ulong)Interlocked.Read(ref Log._ioUringSqPollSubmissionsSkipped),
+ IoUringCounterFieldForTest.ProvidedBufferDepletions => (ulong)Interlocked.Read(ref Log._ioUringProvidedBufferDepletions),
+ IoUringCounterFieldForTest.ProvidedBufferRecycles => (ulong)Interlocked.Read(ref Log._ioUringProvidedBufferRecycles),
+ IoUringCounterFieldForTest.RegisteredBuffersInitialSuccess => (ulong)Interlocked.Read(ref Log._ioUringRegisteredBuffersInitialSuccess),
+ IoUringCounterFieldForTest.RegisteredBuffersInitialFailure => (ulong)Interlocked.Read(ref Log._ioUringRegisteredBuffersInitialFailure),
+ IoUringCounterFieldForTest.RegisteredBuffersReregistrationSuccess => (ulong)Interlocked.Read(ref Log._ioUringRegisteredBuffersReregistrationSuccess),
+ IoUringCounterFieldForTest.RegisteredBuffersReregistrationFailure => (ulong)Interlocked.Read(ref Log._ioUringRegisteredBuffersReregistrationFailure),
+ IoUringCounterFieldForTest.PersistentMultishotRecvReuse => (ulong)Interlocked.Read(ref Log._ioUringPersistentMultishotRecvReuse),
+ IoUringCounterFieldForTest.PersistentMultishotRecvTermination => (ulong)Interlocked.Read(ref Log._ioUringPersistentMultishotRecvTermination),
+ _ => 0UL,
+ };
+ }
[Event(1, Level = EventLevel.Informational)]
private void ConnectStart(string? address)
@@ -80,6 +203,33 @@ private void AcceptFailed(SocketError error, string? exceptionMessage)
}
}
+ [Event(7, Level = EventLevel.Informational)]
+ private void SocketEngineBackendSelected(string backend, int isIoUringPort, int sqPollEnabled)
+ {
+ if (IsEnabled(EventLevel.Informational, EventKeywords.All))
+ {
+ WriteEvent(eventId: 7, backend, isIoUringPort, sqPollEnabled);
+ }
+ }
+
+ [Event(8, Level = EventLevel.Warning)]
+ private void IoUringSqPollNegotiatedWarning(string message)
+ {
+ if (IsEnabled(EventLevel.Warning, EventKeywords.All))
+ {
+ WriteEvent(eventId: 8, message);
+ }
+ }
+
+ [Event(9, Level = EventLevel.Informational)]
+ private void IoUringResolvedConfiguration(string configuration)
+ {
+ if (IsEnabled(EventLevel.Informational, EventKeywords.All))
+ {
+ WriteEvent(eventId: 9, configuration);
+ }
+ }
+
[NonEvent]
public Activity? ConnectStart(SocketAddress address, ProtocolType protocolType, EndPoint endPoint, bool keepActivityCurrent)
{
@@ -189,6 +339,43 @@ public void AcceptStart(EndPoint address)
}
}
+ [NonEvent]
+ internal void ReportSocketEngineBackendSelected(bool isIoUringPort, bool isCompletionMode, bool sqPollEnabled)
+ {
+ if (!IsEnabled(EventLevel.Informational, EventKeywords.All))
+ {
+ return;
+ }
+
+ SocketEngineBackendSelected(
+ isCompletionMode ? "io_uring_completion" : "epoll",
+ isIoUringPort ? 1 : 0,
+ sqPollEnabled ? 1 : 0);
+ }
+
+ [NonEvent]
+ internal void ReportIoUringSqPollNegotiatedWarning()
+ {
+ if (!IsEnabled(EventLevel.Warning, EventKeywords.All))
+ {
+ return;
+ }
+
+ IoUringSqPollNegotiatedWarning(
+ "io_uring SQPOLL negotiated: kernel polling thread is enabled and may increase privileges in containerized environments.");
+ }
+
+ [NonEvent]
+ internal void ReportIoUringResolvedConfiguration(string configuration)
+ {
+ if (!IsEnabled(EventLevel.Informational, EventKeywords.All))
+ {
+ return;
+ }
+
+ IoUringResolvedConfiguration(configuration);
+ }
+
[NonEvent]
public void AfterAccept(SocketError error, string? exceptionMessage = null)
{
@@ -231,6 +418,222 @@ public void DatagramSent()
Interlocked.Increment(ref _datagramsSent);
}
+ [NonEvent]
+ public void IoUringPrepareNonPinnableFallback(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringPrepareNonPinnableFallbacks, count);
+ }
+
+ [NonEvent]
+ public void IoUringAsyncCancelRequestCqes(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringAsyncCancelRequestCqes, count);
+ }
+
+ [NonEvent]
+ public void IoUringSocketEventBufferFull(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringSocketEventBufferFull, count);
+ }
+
+ [NonEvent]
+ public void IoUringCqOverflow(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringCqOverflow, count);
+ }
+
+ [NonEvent]
+ public void IoUringCqOverflowRecovery(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringCqOverflowRecoveries, count);
+ }
+
+ [NonEvent]
+ public void IoUringCompletionRequeueFailure(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringCompletionRequeueFailures, count);
+ }
+
+ [NonEvent]
+ public void IoUringZeroCopyNotificationPendingSlots(int count)
+ {
+ Debug.Assert(count >= 0);
+ Volatile.Write(ref _ioUringZeroCopyNotificationPendingSlots, count);
+ }
+
+ [NonEvent]
+ public void IoUringPrepareQueueOverflow(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringPrepareQueueOverflows, count);
+ }
+
+ [NonEvent]
+ public void IoUringPrepareQueueOverflowFallback(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringPrepareQueueOverflowFallbacks, count);
+ }
+
+ [NonEvent]
+ public void IoUringPrepareQueueDepthDelta(long delta)
+ {
+ long value = Interlocked.Add(ref _ioUringPrepareQueueDepth, delta);
+ Debug.Assert(value >= 0, $"io_uring prepare queue depth cannot be negative: {value}");
+ }
+
+ [NonEvent]
+ public void IoUringCompletionSlotExhaustion(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringCompletionSlotExhaustions, count);
+ }
+
+ [NonEvent]
+ public void IoUringCompletionSlotHighWaterMark(long count)
+ {
+ Debug.Assert(count >= 0);
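+ // Lock-free monotonic max: publish the new value only if it still exceeds the
+ // currently observed high-water mark, retrying on CAS contention.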
+ while (true)
+ {
+ long observed = Volatile.Read(ref _ioUringCompletionSlotHighWaterMark);
+ if (count <= observed)
+ {
+ return;
+ }
+
+ if (Interlocked.CompareExchange(ref _ioUringCompletionSlotHighWaterMark, count, observed) == observed)
+ {
+ return;
+ }
+ }
+ }
+
+ [NonEvent]
+ public void IoUringCompletionSlotDrainRecovery(long count)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringCompletionSlotDrainRecoveries, count);
+ }
+
+ [NonEvent]
+ public void IoUringCancellationQueueOverflow(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringCancellationQueueOverflows, count);
+ }
+
+ [NonEvent]
+ public void IoUringProvidedBufferDepletion(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringProvidedBufferDepletions, count);
+ }
+
+ [NonEvent]
+ public void IoUringProvidedBufferCurrentSize(int size)
+ {
+ Debug.Assert(size >= 0);
+ Volatile.Write(ref _ioUringProvidedBufferCurrentSize, size);
+ }
+
+ [NonEvent]
+ public void IoUringProvidedBufferRecycle(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringProvidedBufferRecycles, count);
+ }
+
+ [NonEvent]
+ public void IoUringProvidedBufferResize(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringProvidedBufferResizes, count);
+ }
+
+ [NonEvent]
+ public void IoUringRegisteredBuffersResult(bool success, int bufferCount, int bufferSize)
+ {
+ Debug.Assert(bufferCount >= 0);
+ Debug.Assert(bufferSize >= 0);
+
+ if (success)
+ {
+ Interlocked.Increment(ref _ioUringRegisteredBuffersInitialSuccess);
+ }
+ else
+ {
+ Interlocked.Increment(ref _ioUringRegisteredBuffersInitialFailure);
+ }
+ }
+
+ [NonEvent]
+ public void IoUringRegisteredBuffersReregistration(bool success)
+ {
+ if (success)
+ {
+ Interlocked.Increment(ref _ioUringRegisteredBuffersReregistrationSuccess);
+ }
+ else
+ {
+ Interlocked.Increment(ref _ioUringRegisteredBuffersReregistrationFailure);
+ }
+ }
+
+ [NonEvent]
+ public void IoUringFixedRecvSelected(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringFixedRecvSelected, count);
+ }
+
+ [NonEvent]
+ public void IoUringFixedRecvFallback(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringFixedRecvFallbacks, count);
+ }
+
+ [NonEvent]
+ public void IoUringSqPollWakeup(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringSqPollWakeups, count);
+ }
+
+ [NonEvent]
+ public void IoUringSqPollSubmissionSkipped(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringSqPollSubmissionsSkipped, count);
+ }
+
+ [NonEvent]
+ public void IoUringPersistentMultishotRecvReuse(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringPersistentMultishotRecvReuse, count);
+ }
+
+ [NonEvent]
+ public void IoUringPersistentMultishotRecvTermination(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringPersistentMultishotRecvTermination, count);
+ }
+
+ [NonEvent]
+ public void IoUringPersistentMultishotRecvEarlyData(long count = 1)
+ {
+ Debug.Assert(count >= 0);
+ Interlocked.Add(ref _ioUringPersistentMultishotRecvEarlyData, count);
+ }
+
private static string GetErrorType(SocketError socketError) => socketError switch
{
// Common connect() errors expected to be seen:
@@ -291,6 +694,60 @@ protected override void OnEventCommand(EventCommandEventArgs command)
{
DisplayName = "Datagrams Sent",
};
+
+ if (!OperatingSystem.IsLinux())
+ {
+ return;
+ }
+
+ _ioUringPrepareNonPinnableFallbacksCounter ??= new PollingCounter(IoUringCounterNames.PrepareNonPinnableFallbacks, this, () => Interlocked.Read(ref _ioUringPrepareNonPinnableFallbacks))
+ {
+ DisplayName = "io_uring Prepare Non-Pinnable Fallbacks",
+ };
+ _ioUringSocketEventBufferFullCounter ??= new PollingCounter(IoUringCounterNames.SocketEventBufferFull, this, () => Interlocked.Read(ref _ioUringSocketEventBufferFull))
+ {
+ DisplayName = "io_uring Socket Event Buffer Full",
+ };
+ _ioUringCqOverflowCounter ??= new PollingCounter(IoUringCounterNames.CqOverflows, this, () => Interlocked.Read(ref _ioUringCqOverflow))
+ {
+ DisplayName = "io_uring Completion Queue Overflow",
+ };
+ _ioUringCqOverflowRecoveriesCounter ??= new PollingCounter(IoUringCounterNames.CqOverflowRecoveries, this, () => Interlocked.Read(ref _ioUringCqOverflowRecoveries))
+ {
+ DisplayName = "io_uring Completion Queue Overflow Recoveries",
+ };
+ _ioUringPrepareQueueOverflowsCounter ??= new PollingCounter(IoUringCounterNames.PrepareQueueOverflows, this, () => Interlocked.Read(ref _ioUringPrepareQueueOverflows))
+ {
+ DisplayName = "io_uring Prepare Queue Overflows",
+ };
+ _ioUringPrepareQueueOverflowFallbacksCounter ??= new PollingCounter(IoUringCounterNames.PrepareQueueOverflowFallbacks, this, () => Interlocked.Read(ref _ioUringPrepareQueueOverflowFallbacks))
+ {
+ DisplayName = "io_uring Prepare Queue Overflow Fallbacks",
+ };
+ _ioUringCompletionSlotExhaustionsCounter ??= new PollingCounter(IoUringCounterNames.CompletionSlotExhaustions, this, () => Interlocked.Read(ref _ioUringCompletionSlotExhaustions))
+ {
+ DisplayName = "io_uring Completion Slot Exhaustions",
+ };
+ _ioUringCompletionSlotHighWaterMarkCounter ??= new PollingCounter(IoUringCounterNames.CompletionSlotHighWaterMark, this, () => Interlocked.Read(ref _ioUringCompletionSlotHighWaterMark))
+ {
+ DisplayName = "io_uring Completion Slot High-Water Mark",
+ };
+ _ioUringCancellationQueueOverflowsCounter ??= new PollingCounter(IoUringCounterNames.CancellationQueueOverflows, this, () => Interlocked.Read(ref _ioUringCancellationQueueOverflows))
+ {
+ DisplayName = "io_uring Cancellation Queue Overflows",
+ };
+ _ioUringProvidedBufferDepletionsCounter ??= new PollingCounter(IoUringCounterNames.ProvidedBufferDepletions, this, () => Interlocked.Read(ref _ioUringProvidedBufferDepletions))
+ {
+ DisplayName = "io_uring Provided Buffer Depletions",
+ };
+ _ioUringSqPollWakeupsCounter ??= new PollingCounter(IoUringCounterNames.SqPollWakeups, this, () => Interlocked.Read(ref _ioUringSqPollWakeups))
+ {
+ DisplayName = "io_uring SQPOLL Wakeups",
+ };
+ _ioUringSqPollSubmissionsSkippedCounter ??= new PollingCounter(IoUringCounterNames.SqPollSubmissionsSkipped, this, () => Interlocked.Read(ref _ioUringSqPollSubmissionsSkipped))
+ {
+ DisplayName = "io_uring SQPOLL Submissions Skipped",
+ };
}
}
}
diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/InternalTestShims.Linux.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/InternalTestShims.Linux.cs
new file mode 100644
index 00000000000000..c64d942f73fa6b
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/InternalTestShims.Linux.cs
@@ -0,0 +1,697 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Reflection;
+using System.Runtime.ExceptionServices;
+
+namespace System.Net.Sockets
+{
+ /// <summary>
+ /// Linux test-only shim that mirrors internal SocketAsyncEngine test hooks through reflection.
+ /// </summary>
+ internal sealed class SocketAsyncEngine
+ {
+ private const BindingFlags StaticFlags = BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic;
+ private const BindingFlags InstanceFlags = BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic;
+
+ // Keep shim type initialization inert: all reflection is resolved lazily per call.
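+ // The DynamicDependency below keeps SocketAsyncEngine's members from being
+ // trimmed, so the lazy reflection lookups in this shim still resolve under
+ // trimming/AOT.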
+ [DynamicDependency(DynamicallyAccessedMemberTypes.All, "System.Net.Sockets.SocketAsyncEngine", "System.Net.Sockets")]
+ static SocketAsyncEngine()
+ {
+ }
+
+ private readonly object _inner;
+
+ private SocketAsyncEngine(object inner)
+ {
+ _inner = inner;
+ }
+
+ internal readonly struct IoUringNonPinnableFallbackPublicationState
+ {
+ internal IoUringNonPinnableFallbackPublicationState(long publishedCount, int publishingGate, long fallbackCount)
+ {
+ PublishedCount = publishedCount;
+ PublishingGate = publishingGate;
+ FallbackCount = fallbackCount;
+ }
+
+ internal long PublishedCount { get; }
+ internal int PublishingGate { get; }
+ internal long FallbackCount { get; }
+ }
+
+ internal readonly struct IoUringNativeDiagnosticsSnapshotForTest
+ {
+ internal IoUringNativeDiagnosticsSnapshotForTest(
+ bool hasIoUringPort,
+ ulong asyncCancelRequestCqeCount,
+ ulong asyncCancelRequestCqeEnoentCount,
+ ulong asyncCancelRequestCqeEalreadyCount,
+ ulong asyncCancelRequestCqeOtherCount,
+ ulong socketEventBufferFullCount,
+ ulong unsupportedOpcodePrepareCount,
+ ulong cqOverflowCount)
+ {
+ HasIoUringPort = hasIoUringPort;
+ AsyncCancelRequestCqeCount = asyncCancelRequestCqeCount;
+ AsyncCancelRequestCqeEnoentCount = asyncCancelRequestCqeEnoentCount;
+ AsyncCancelRequestCqeEalreadyCount = asyncCancelRequestCqeEalreadyCount;
+ AsyncCancelRequestCqeOtherCount = asyncCancelRequestCqeOtherCount;
+ SocketEventBufferFullCount = socketEventBufferFullCount;
+ UnsupportedOpcodePrepareCount = unsupportedOpcodePrepareCount;
+ CqOverflowCount = cqOverflowCount;
+ }
+
+ internal bool HasIoUringPort { get; }
+ internal ulong AsyncCancelRequestCqeCount { get; }
+ internal ulong AsyncCancelRequestCqeEnoentCount { get; }
+ internal ulong AsyncCancelRequestCqeEalreadyCount { get; }
+ internal ulong AsyncCancelRequestCqeOtherCount { get; }
+ internal ulong SocketEventBufferFullCount { get; }
+ internal ulong UnsupportedOpcodePrepareCount { get; }
+ internal ulong CqOverflowCount { get; }
+ }
+
+ internal readonly struct IoUringProvidedBufferSnapshotForTest
+ {
+ internal IoUringProvidedBufferSnapshotForTest(
+ bool hasIoUringPort,
+ bool supportsProvidedBufferRings,
+ bool hasProvidedBufferRing,
+ bool hasRegisteredBuffers,
+ bool adaptiveBufferSizingEnabled,
+ int availableCount,
+ int inUseCount,
+ int totalBufferCount,
+ int bufferSize,
+ int recommendedBufferSize,
+ long recycledCount,
+ long allocationFailureCount)
+ {
+ HasIoUringPort = hasIoUringPort;
+ SupportsProvidedBufferRings = supportsProvidedBufferRings;
+ HasProvidedBufferRing = hasProvidedBufferRing;
+ HasRegisteredBuffers = hasRegisteredBuffers;
+ AdaptiveBufferSizingEnabled = adaptiveBufferSizingEnabled;
+ AvailableCount = availableCount;
+ InUseCount = inUseCount;
+ TotalBufferCount = totalBufferCount;
+ BufferSize = bufferSize;
+ RecommendedBufferSize = recommendedBufferSize;
+ RecycledCount = recycledCount;
+ AllocationFailureCount = allocationFailureCount;
+ }
+
+ internal bool HasIoUringPort { get; }
+ internal bool SupportsProvidedBufferRings { get; }
+ internal bool HasProvidedBufferRing { get; }
+ internal bool HasRegisteredBuffers { get; }
+ internal bool AdaptiveBufferSizingEnabled { get; }
+ internal int AvailableCount { get; }
+ internal int InUseCount { get; }
+ internal int TotalBufferCount { get; }
+ internal int BufferSize { get; }
+ internal int RecommendedBufferSize { get; }
+ internal long RecycledCount { get; }
+ internal long AllocationFailureCount { get; }
+ }
+
+ internal readonly struct IoUringZeroCopySendSnapshotForTest
+ {
+ internal IoUringZeroCopySendSnapshotForTest(
+ bool hasIoUringPort,
+ bool supportsSendZc,
+ bool supportsSendMsgZc,
+ bool zeroCopySendEnabled)
+ {
+ HasIoUringPort = hasIoUringPort;
+ SupportsSendZc = supportsSendZc;
+ SupportsSendMsgZc = supportsSendMsgZc;
+ ZeroCopySendEnabled = zeroCopySendEnabled;
+ }
+
+ internal bool HasIoUringPort { get; }
+ internal bool SupportsSendZc { get; }
+ internal bool SupportsSendMsgZc { get; }
+ internal bool ZeroCopySendEnabled { get; }
+ }
+
+ internal readonly struct IoUringFixedRecvSnapshotForTest
+ {
+ internal IoUringFixedRecvSnapshotForTest(
+ bool hasIoUringPort,
+ bool supportsReadFixed,
+ bool hasRegisteredBuffers)
+ {
+ HasIoUringPort = hasIoUringPort;
+ SupportsReadFixed = supportsReadFixed;
+ HasRegisteredBuffers = hasRegisteredBuffers;
+ }
+
+ internal bool HasIoUringPort { get; }
+ internal bool SupportsReadFixed { get; }
+ internal bool HasRegisteredBuffers { get; }
+ }
+
+ internal readonly struct IoUringSqPollSnapshotForTest
+ {
+ internal IoUringSqPollSnapshotForTest(bool hasIoUringPort, bool sqPollEnabled, bool deferTaskrunEnabled)
+ {
+ HasIoUringPort = hasIoUringPort;
+ SqPollEnabled = sqPollEnabled;
+ DeferTaskrunEnabled = deferTaskrunEnabled;
+ }
+
+ internal bool HasIoUringPort { get; }
+ internal bool SqPollEnabled { get; }
+ internal bool DeferTaskrunEnabled { get; }
+ }
+
+ internal readonly struct IoUringZeroCopyPinHoldSnapshotForTest
+ {
+ internal IoUringZeroCopyPinHoldSnapshotForTest(
+ bool hasIoUringPort,
+ int activePinHolds,
+ int pendingNotificationCount)
+ {
+ HasIoUringPort = hasIoUringPort;
+ ActivePinHolds = activePinHolds;
+ PendingNotificationCount = pendingNotificationCount;
+ }
+
+ internal bool HasIoUringPort { get; }
+ internal int ActivePinHolds { get; }
+ internal int PendingNotificationCount { get; }
+ }
+
+ internal readonly struct IoUringNativeMsghdrLayoutSnapshotForTest
+ {
+ internal IoUringNativeMsghdrLayoutSnapshotForTest(
+ int size,
+ int msgNameOffset,
+ int msgNameLengthOffset,
+ int msgIovOffset,
+ int msgIovLengthOffset,
+ int msgControlOffset,
+ int msgControlLengthOffset,
+ int msgFlagsOffset)
+ {
+ Size = size;
+ MsgNameOffset = msgNameOffset;
+ MsgNameLengthOffset = msgNameLengthOffset;
+ MsgIovOffset = msgIovOffset;
+ MsgIovLengthOffset = msgIovLengthOffset;
+ MsgControlOffset = msgControlOffset;
+ MsgControlLengthOffset = msgControlLengthOffset;
+ MsgFlagsOffset = msgFlagsOffset;
+ }
+
+ internal int Size { get; }
+ internal int MsgNameOffset { get; }
+ internal int MsgNameLengthOffset { get; }
+ internal int MsgIovOffset { get; }
+ internal int MsgIovLengthOffset { get; }
+ internal int MsgControlOffset { get; }
+ internal int MsgControlLengthOffset { get; }
+ internal int MsgFlagsOffset { get; }
+ }
+
+ internal readonly struct IoUringCompletionSlotLayoutSnapshotForTest
+ {
+ internal IoUringCompletionSlotLayoutSnapshotForTest(
+ int size,
+ int generationOffset,
+ int freeListNextOffset,
+ int packedStateOffset,
+ int fixedRecvBufferIdOffset,
+ int testForcedResultOffset)
+ {
+ Size = size;
+ GenerationOffset = generationOffset;
+ FreeListNextOffset = freeListNextOffset;
+ PackedStateOffset = packedStateOffset;
+ FixedRecvBufferIdOffset = fixedRecvBufferIdOffset;
+ TestForcedResultOffset = testForcedResultOffset;
+ }
+
+ internal int Size { get; }
+ internal int GenerationOffset { get; }
+ internal int FreeListNextOffset { get; }
+ internal int PackedStateOffset { get; }
+ internal int FixedRecvBufferIdOffset { get; }
+ internal int TestForcedResultOffset { get; }
+ }
+
+ internal static IoUringNonPinnableFallbackPublicationState GetIoUringNonPinnableFallbackPublicationStateForTest()
+ {
+ object state = InvokeStatic("GetIoUringNonPinnableFallbackPublicationStateForTest")!;
+ return new IoUringNonPinnableFallbackPublicationState(
+ ReadProperty(state, "PublishedCount"),
+ ReadProperty(state, "PublishingGate"),
+ ReadProperty(state, "FallbackCount"));
+ }
+
+ internal static void SetIoUringNonPinnableFallbackPublicationStateForTest(IoUringNonPinnableFallbackPublicationState state)
+ {
+ MethodInfo setter = GetRequiredMethod(GetEngineType(), "SetIoUringNonPinnableFallbackPublicationStateForTest", StaticFlags);
+ Type stateType = setter.GetParameters()[0].ParameterType;
+ ConstructorInfo constructor = stateType.GetConstructor(
+ BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic,
+ binder: null,
+ types: new[] { typeof(long), typeof(int), typeof(long) },
+ modifiers: null) ?? throw new MissingMethodException(stateType.FullName, ".ctor(long,int,long)");
+
+ object rawState = constructor.Invoke(new object[] { state.PublishedCount, state.PublishingGate, state.FallbackCount });
+ _ = setter.Invoke(null, new object[] { rawState });
+ }
+
+ internal static long GetIoUringNonPinnablePrepareFallbackDeltaForTest() => (long)InvokeStatic("GetIoUringNonPinnablePrepareFallbackDeltaForTest")!;
+ internal static bool IsIoUringEnabledForTest() => (bool)InvokeStatic("IsIoUringEnabledForTest")!;
+ internal static bool IsSqPollRequestedForTest() => (bool)InvokeStatic("IsSqPollRequestedForTest")!;
+ internal static bool IsIoUringDirectSqeDisabledForTest() => (bool)InvokeStatic("IsIoUringDirectSqeDisabledForTest")!;
+ internal static bool IsZeroCopySendOptedInForTest() => (bool)InvokeStatic("IsZeroCopySendOptedInForTest")!;
+ internal static bool IsIoUringRegisterBuffersEnabledForTest() => (bool)InvokeStatic("IsIoUringRegisterBuffersEnabledForTest")!;
+ internal static bool IsNativeMsghdrLayoutSupportedForIoUringForTest(int pointerSize, int nativeMsghdrSize) =>
+ (bool)InvokeStatic("IsNativeMsghdrLayoutSupportedForIoUringForTest", new object?[] { pointerSize, nativeMsghdrSize })!;
+ internal static long GetIoUringPendingRetryQueuedToPrepareQueueCountForTest() => (long)InvokeStatic("GetIoUringPendingRetryQueuedToPrepareQueueCountForTest")!;
+ internal static int GetIoUringCancellationQueueCapacityForTest() => (int)InvokeStatic("GetIoUringCancellationQueueCapacityForTest")!;
+ internal static bool IsIoUringMultishotRecvSupportedForTest() => (bool)InvokeStatic("IsIoUringMultishotRecvSupportedForTest")!;
+ internal static bool IsIoUringMultishotAcceptSupportedForTest() => (bool)InvokeStatic("IsIoUringMultishotAcceptSupportedForTest")!;
+ internal static bool HasActiveIoUringEngineWithInitializedCqStateForTest() => (bool)InvokeStatic("HasActiveIoUringEngineWithInitializedCqStateForTest")!;
+ internal static int GetIoUringCompletionSlotsInUseForTest() => (int)InvokeStatic("GetIoUringCompletionSlotsInUseForTest")!;
+ internal static int GetIoUringTrackedOperationCountForTest() => (int)InvokeStatic("GetIoUringTrackedOperationCountForTest")!;
+ internal static bool IsAnyIoUringSqPollEngineNeedingWakeupForTest() => (bool)InvokeStatic("IsAnyIoUringSqPollEngineNeedingWakeupForTest")!;
+ internal static bool ValidateIoUringProvidedBufferTeardownOrderingForTest() => (bool)InvokeStatic("ValidateIoUringProvidedBufferTeardownOrderingForTest")!;
+ internal static ulong EncodeCompletionSlotUserDataForTest(int slotIndex, ulong generation) =>
+ (ulong)InvokeStatic("EncodeCompletionSlotUserDataForTest", new object?[] { slotIndex, generation })!;
+ internal static ulong IncrementCompletionSlotGenerationForTest(ulong generation) =>
+ (ulong)InvokeStatic("IncrementCompletionSlotGenerationForTest", new object?[] { generation })!;
+
+ internal static bool TryDecodeCompletionSlotUserDataForTest(ulong userData, out int slotIndex, out ulong generation)
+ {
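+ // Reflection cannot bind out parameters directly; pass a mutable args array and
+ // read the by-ref slots back after the call completes.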
+ object?[] args = new object?[] { userData, 0, 0UL };
+ bool result = (bool)InvokeStatic("TryDecodeCompletionSlotUserDataForTest", args)!;
+ slotIndex = (int)args[1]!;
+ generation = (ulong)args[2]!;
+ return result;
+ }
+
+ internal static IoUringNativeMsghdrLayoutSnapshotForTest GetIoUringNativeMsghdrLayoutForTest()
+ {
+ object snapshot = InvokeStatic("GetIoUringNativeMsghdrLayoutForTest")!;
+ return new IoUringNativeMsghdrLayoutSnapshotForTest(
+ ReadProperty(snapshot, "Size"),
+ ReadProperty(snapshot, "MsgNameOffset"),
+ ReadProperty(snapshot, "MsgNameLengthOffset"),
+ ReadProperty(snapshot, "MsgIovOffset"),
+ ReadProperty(snapshot, "MsgIovLengthOffset"),
+ ReadProperty(snapshot, "MsgControlOffset"),
+ ReadProperty(snapshot, "MsgControlLengthOffset"),
+ ReadProperty(snapshot, "MsgFlagsOffset"));
+ }
+
+ internal static IoUringCompletionSlotLayoutSnapshotForTest GetIoUringCompletionSlotLayoutForTest()
+ {
+ object snapshot = InvokeStatic("GetIoUringCompletionSlotLayoutForTest")!;
+ return new IoUringCompletionSlotLayoutSnapshotForTest(
+ ReadProperty(snapshot, "Size"),
+ ReadProperty(snapshot, "GenerationOffset"),
+ ReadProperty(snapshot, "FreeListNextOffset"),
+ ReadProperty(snapshot, "PackedStateOffset"),
+ ReadProperty(snapshot, "FixedRecvBufferIdOffset"),
+ ReadProperty(snapshot, "TestForcedResultOffset"));
+ }
+
+ internal static IoUringNativeDiagnosticsSnapshotForTest GetIoUringNativeDiagnosticsSnapshotForTest()
+ {
+ object snapshot = InvokeStatic("GetIoUringNativeDiagnosticsSnapshotForTest")!;
+ return new IoUringNativeDiagnosticsSnapshotForTest(
+ ReadProperty(snapshot, "HasIoUringPort"),
+ ReadProperty(snapshot, "AsyncCancelRequestCqeCount"),
+ ReadProperty(snapshot, "AsyncCancelRequestCqeEnoentCount"),
+ ReadProperty(snapshot, "AsyncCancelRequestCqeEalreadyCount"),
+ ReadProperty(snapshot, "AsyncCancelRequestCqeOtherCount"),
+ ReadProperty(snapshot, "SocketEventBufferFullCount"),
+ ReadProperty(snapshot, "UnsupportedOpcodePrepareCount"),
+ ReadProperty(snapshot, "CqOverflowCount"));
+ }
+
+ internal static IoUringProvidedBufferSnapshotForTest GetIoUringProvidedBufferSnapshotForTest()
+ {
+ object snapshot = InvokeStatic("GetIoUringProvidedBufferSnapshotForTest")!;
+ return new IoUringProvidedBufferSnapshotForTest(
+ ReadProperty(snapshot, "HasIoUringPort"),
+ ReadProperty(snapshot, "SupportsProvidedBufferRings"),
+ ReadProperty(snapshot, "HasProvidedBufferRing"),
+ ReadProperty(snapshot, "HasRegisteredBuffers"),
+ ReadProperty(snapshot, "AdaptiveBufferSizingEnabled"),
+ ReadProperty(snapshot, "AvailableCount"),
+ ReadProperty(snapshot, "InUseCount"),
+ ReadProperty(snapshot, "TotalBufferCount"),
+ ReadProperty(snapshot, "BufferSize"),
+ ReadProperty(snapshot, "RecommendedBufferSize"),
+ ReadProperty(snapshot, "RecycledCount"),
+ ReadProperty(snapshot, "AllocationFailureCount"));
+ }
+
+ internal static IoUringZeroCopySendSnapshotForTest GetIoUringZeroCopySendSnapshotForTest()
+ {
+ object snapshot = InvokeStatic("GetIoUringZeroCopySendSnapshotForTest")!;
+ return new IoUringZeroCopySendSnapshotForTest(
+ ReadProperty(snapshot, "HasIoUringPort"),
+ ReadProperty(snapshot, "SupportsSendZc"),
+ ReadProperty(snapshot, "SupportsSendMsgZc"),
+ ReadProperty(snapshot, "ZeroCopySendEnabled"));
+ }
+
+ internal static IoUringFixedRecvSnapshotForTest GetIoUringFixedRecvSnapshotForTest()
+ {
+ object snapshot = InvokeStatic("GetIoUringFixedRecvSnapshotForTest")!;
+ return new IoUringFixedRecvSnapshotForTest(
+ ReadProperty(snapshot, "HasIoUringPort"),
+ ReadProperty(snapshot, "SupportsReadFixed"),
+ ReadProperty(snapshot, "HasRegisteredBuffers"));
+ }
+
+ internal static IoUringSqPollSnapshotForTest GetIoUringSqPollSnapshotForTest()
+ {
+ object snapshot = InvokeStatic("GetIoUringSqPollSnapshotForTest")!;
+ return new IoUringSqPollSnapshotForTest(
+ ReadProperty(snapshot, "HasIoUringPort"),
+ ReadProperty(snapshot, "SqPollEnabled"),
+ ReadProperty(snapshot, "DeferTaskrunEnabled"));
+ }
+
+ internal static IoUringZeroCopyPinHoldSnapshotForTest GetIoUringZeroCopyPinHoldSnapshotForTest()
+ {
+ object snapshot = InvokeStatic("GetIoUringZeroCopyPinHoldSnapshotForTest")!;
+ return new IoUringZeroCopyPinHoldSnapshotForTest(
+ ReadProperty(snapshot, "HasIoUringPort"),
+ ReadProperty(snapshot, "ActivePinHolds"),
+ ReadProperty(snapshot, "PendingNotificationCount"));
+ }
+
+ internal static bool TryInjectIoUringCqOverflowForTest(uint delta, out int injectedEngineCount)
+ {
+ object?[] args = new object?[] { delta, 0 };
+ bool result = (bool)InvokeStatic("TryInjectIoUringCqOverflowForTest", args)!;
+ injectedEngineCount = (int)args[1]!;
+ return result;
+ }
+
+ internal static bool TryGetIoUringRingFdForTest(out int ringFd)
+ {
+ object?[] args = new object?[] { -1 };
+ bool result = (bool)InvokeStatic("TryGetIoUringRingFdForTest", args)!;
+ ringFd = (int)args[0]!;
+ return result;
+ }
+
+ internal static bool TryGetIoUringWakeupEventFdForTest(out int eventFd)
+ {
+ object?[] args = new object?[] { -1 };
+ bool result = (bool)InvokeStatic("TryGetIoUringWakeupEventFdForTest", args)!;
+ eventFd = (int)args[0]!;
+ return result;
+ }
+
+ internal static bool TryValidateSqNeedWakeupMatchesRawSqFlagBitForTest(out bool matches)
+ {
+ object?[] args = new object?[] { false };
+ bool result = (bool)InvokeStatic("TryValidateSqNeedWakeupMatchesRawSqFlagBitForTest", args)!;
+ matches = (bool)args[0]!;
+ return result;
+ }
+
+ internal static bool TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount)
+ {
+ object?[] args = new object?[] { 0 };
+ bool result = (bool)InvokeStatic("TryForceIoUringProvidedBufferRingExhaustionForTest", args)!;
+ forcedBufferCount = (int)args[0]!;
+ return result;
+ }
+
+ internal static bool TryRecycleForcedIoUringProvidedBufferRingForTest(out int recycledBufferCount)
+ {
+ object?[] args = new object?[] { 0 };
+ bool result = (bool)InvokeStatic("TryRecycleForcedIoUringProvidedBufferRingForTest", args)!;
+ recycledBufferCount = (int)args[0]!;
+ return result;
+ }
+
+ internal static bool TryGetFirstIoUringEngineForTest(out SocketAsyncEngine? ioUringEngine)
+ {
+ object?[] args = new object?[] { null };
+ bool result = (bool)InvokeStatic("TryGetFirstIoUringEngineForTest", args)!;
+ if (!result || args[0] is null)
+ {
+ ioUringEngine = null;
+ return false;
+ }
+
+ ioUringEngine = new SocketAsyncEngine(args[0]);
+ return true;
+ }
+
+ internal static SocketAsyncEngine[] GetActiveIoUringEnginesForTest()
+ {
+ Array engines = (Array)InvokeStatic("GetActiveIoUringEnginesForTest")!;
+ var wrappers = new SocketAsyncEngine[engines.Length];
+ for (int i = 0; i < engines.Length; i++)
+ {
+ wrappers[i] = new SocketAsyncEngine(engines.GetValue(i)!);
+ }
+
+ return wrappers;
+ }
+
+ internal bool SupportsMultishotAcceptForTest
+ {
+ get => GetInstanceProperty("SupportsMultishotAcceptForTest");
+ set => SetInstanceProperty("SupportsMultishotAcceptForTest", value);
+ }
+
+ internal bool SupportsOpSendZcForTest
+ {
+ get => GetInstanceProperty("SupportsOpSendZcForTest");
+ set => SetInstanceProperty("SupportsOpSendZcForTest", value);
+ }
+
+ internal bool ZeroCopySendEnabledForTest
+ {
+ get => GetInstanceProperty("ZeroCopySendEnabledForTest");
+ set => SetInstanceProperty("ZeroCopySendEnabledForTest", value);
+ }
+
+ internal long IoUringCancelQueueLengthForTest
+ {
+ get => GetInstanceProperty("IoUringCancelQueueLengthForTest");
+ set => SetInstanceProperty("IoUringCancelQueueLengthForTest", value);
+ }
+
+ internal long IoUringCancelQueueOverflowCountForTest => GetInstanceProperty<long>("IoUringCancelQueueOverflowCountForTest");
+ internal long IoUringCancelQueueWakeRetryCountForTest => GetInstanceProperty<long>("IoUringCancelQueueWakeRetryCountForTest");
+
+ internal int IoUringWakeupRequestedForTest
+ {
+ get => GetInstanceProperty("IoUringWakeupRequestedForTest");
+ set => SetInstanceProperty("IoUringWakeupRequestedForTest", value);
+ }
+
+ internal bool TryEnqueueIoUringCancellationForTest(ulong userData)
+ => (bool)InvokeInstance("TryEnqueueIoUringCancellationForTest", userData)!;
+
+ internal int SubmitIoUringOperationsNormalizedForTest()
+ => Convert.ToInt32(InvokeInstance("SubmitIoUringOperationsNormalizedForTest"));
+
+ private static object? InvokeStatic(string methodName, params object?[]? args)
+ {
+ MethodInfo method = GetRequiredMethod(GetEngineType(), methodName, StaticFlags);
+ return method.Invoke(null, args);
+ }
+
+ private object? InvokeInstance(string methodName, params object?[]? args)
+ {
+ MethodInfo method = GetRequiredMethod(GetEngineType(), methodName, InstanceFlags);
+ return method.Invoke(_inner, args);
+ }
+
+ private T GetInstanceProperty<T>(string propertyName)
+ {
+ PropertyInfo property = GetRequiredProperty(GetEngineType(), propertyName, InstanceFlags);
+ return (T)property.GetValue(_inner)!;
+ }
+
+ private void SetInstanceProperty(string propertyName, object? value)
+ {
+ PropertyInfo property = GetRequiredProperty(GetEngineType(), propertyName, InstanceFlags);
+ property.SetValue(_inner, value);
+ }
+
+ private static T ReadProperty<T>(object instance, string propertyName)
+ {
+ PropertyInfo property = GetRequiredProperty(instance.GetType(), propertyName, InstanceFlags);
+ return (T)property.GetValue(instance)!;
+ }
+
+ [return: DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)]
+ private static Type GetEngineType()
+ {
+ return typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true, ignoreCase: false)!;
+ }
+
+ private static MethodInfo GetRequiredMethod([DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)] Type type, string methodName, BindingFlags flags)
+ {
+ return type.GetMethod(methodName, flags) ?? throw new MissingMethodException(type.FullName, methodName);
+ }
+
+ private static PropertyInfo GetRequiredProperty([DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)] Type type, string propertyName, BindingFlags flags)
+ {
+ return type.GetProperty(propertyName, flags) ?? throw new MissingMemberException(type.FullName, propertyName);
+ }
+ }
+
+ /// <summary>
+ /// Linux test-only shim that forwards SocketAsyncContext test hooks through reflection.
+ /// </summary>
+ internal static class SocketAsyncContext
+ {
+ private const BindingFlags StaticFlags = BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic;
+
+ [DynamicDependency(DynamicallyAccessedMemberTypes.All, "System.Net.Sockets.SocketAsyncContext", "System.Net.Sockets")]
+ internal static bool IsMultishotAcceptArmedForTest(Socket socket)
+ => (bool)GetRequiredMethod(GetContextType(), "IsMultishotAcceptArmedForTest", StaticFlags).Invoke(null, new object[] { socket })!;
+
+ internal static int GetMultishotAcceptQueueCountForTest(Socket socket)
+ => (int)GetRequiredMethod(GetContextType(), "GetMultishotAcceptQueueCountForTest", StaticFlags).Invoke(null, new object[] { socket })!;
+
+ internal static bool IsPersistentMultishotRecvArmedForTest(Socket socket)
+ => (bool)GetRequiredMethod(GetContextType(), "IsPersistentMultishotRecvArmedForTest", StaticFlags).Invoke(null, new object[] { socket })!;
+
+ [return: DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)]
+ private static Type GetContextType()
+ {
+ return typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncContext", throwOnError: true, ignoreCase: false)!;
+ }
+
+ private static MethodInfo GetRequiredMethod([DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)] Type type, string methodName, BindingFlags flags)
+ {
+ return type.GetMethod(methodName, flags) ?? throw new MissingMethodException(type.FullName, methodName);
+ }
+ }
+
+ /// <summary>
+ /// Linux test-only shim that forwards SocketsTelemetry test hook access through reflection.
+ /// </summary>
+ internal static class SocketsTelemetry
+ {
+ private const BindingFlags StaticFlags = BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic;
+
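+ // Mirrors the product-side counter enum; keep member order in sync, because values
+ // are translated by ordinal via Enum.ToObject in GetIoUringCounterValueForTest.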
+ internal enum IoUringCounterFieldForTest
+ {
+ PrepareNonPinnableFallbacks,
+ AsyncCancelRequestCqes,
+ SocketEventBufferFull,
+ CqOverflow,
+ CqOverflowRecoveries,
+ CompletionRequeueFailures,
+ PrepareQueueOverflows,
+ PrepareQueueOverflowFallbacks,
+ CompletionSlotExhaustions,
+ CompletionSlotHighWaterMark,
+ CancellationQueueOverflows,
+ SqPollWakeups,
+ SqPollSubmissionsSkipped,
+ ProvidedBufferDepletions,
+ ProvidedBufferRecycles,
+ RegisteredBuffersInitialSuccess,
+ RegisteredBuffersInitialFailure,
+ RegisteredBuffersReregistrationSuccess,
+ RegisteredBuffersReregistrationFailure,
+ PersistentMultishotRecvReuse,
+ PersistentMultishotRecvTermination,
+ }
+
+ [DynamicDependency(DynamicallyAccessedMemberTypes.All, "System.Net.Sockets.SocketsTelemetry", "System.Net.Sockets")]
+ internal static ulong GetIoUringCounterValueForTest(IoUringCounterFieldForTest counter)
+ {
+ Type telemetryType = GetTelemetryType();
+ MethodInfo method = GetRequiredMethod(telemetryType, "GetIoUringCounterValueForTest", StaticFlags);
+ Type counterType = method.GetParameters()[0].ParameterType;
+ object counterValue = Enum.ToObject(counterType, (int)counter);
+ return (ulong)method.Invoke(null, new object[] { counterValue })!;
+ }
+
+ [return: DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)]
+ private static Type GetTelemetryType()
+ {
+ return typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketsTelemetry", throwOnError: true, ignoreCase: false)!;
+ }
+
+ private static MethodInfo GetRequiredMethod([DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)] Type type, string methodName, BindingFlags flags)
+ {
+ return type.GetMethod(methodName, flags) ?? throw new MissingMethodException(type.FullName, methodName);
+ }
+ }
+
+ /// <summary>
+ /// Linux test-only shim that mirrors internal MpscQueue{T} through reflection.
+ /// </summary>
+ internal sealed class MpscQueue<T>
+ {
+ private const BindingFlags InstanceFlags = BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic;
+ private readonly object _inner;
+ private readonly MethodInfo _enqueueMethod;
+ private readonly MethodInfo _tryDequeueMethod;
+ private readonly PropertyInfo _isEmptyProperty;
+
+ [DynamicDependency(DynamicallyAccessedMemberTypes.All, "System.Net.Sockets.MpscQueue`1", "System.Net.Sockets")]
+ internal MpscQueue(int segmentSize)
+ {
+ Type queueType = GetQueueType();
+ ConstructorInfo constructor = queueType.GetConstructor(
+ BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic,
+ binder: null,
+ types: new[] { typeof(int) },
+ modifiers: null) ?? throw new MissingMethodException(queueType.FullName, ".ctor(int)");
+
+ _enqueueMethod = queueType.GetMethod("Enqueue", InstanceFlags) ?? throw new MissingMethodException(queueType.FullName, "Enqueue");
+ _tryDequeueMethod = queueType.GetMethod("TryDequeue", InstanceFlags) ?? throw new MissingMethodException(queueType.FullName, "TryDequeue");
+ _isEmptyProperty = queueType.GetProperty("IsEmpty", InstanceFlags) ?? throw new MissingMemberException(queueType.FullName, "IsEmpty");
+ try
+ {
+ _inner = constructor.Invoke(new object[] { segmentSize });
+ }
+ catch (TargetInvocationException tie) when (tie.InnerException is Exception inner)
+ {
+ ExceptionDispatchInfo.Capture(inner).Throw();
+ throw;
+ }
+ }
+
+ internal void Enqueue(T item)
+ {
+ _ = _enqueueMethod.Invoke(_inner, new object?[] { item });
+ }
+
+ internal bool TryDequeue(out T item)
+ {
+ object?[] args = new object?[] { null };
+ bool dequeued = (bool)_tryDequeueMethod.Invoke(_inner, args)!;
+ item = dequeued ? (T)args[0]! : default!;
+ return dequeued;
+ }
+
+ internal bool IsEmpty => (bool)_isEmptyProperty.GetValue(_inner)!;
+
+ [return: DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)]
+ private static Type GetQueueType()
+ {
+ Type genericType = typeof(Socket).Assembly.GetType("System.Net.Sockets.MpscQueue`1", throwOnError: true, ignoreCase: false)!;
+ return genericType.MakeGenericType(typeof(T));
+ }
+ }
+}
diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs
new file mode 100644
index 00000000000000..4ff32891c1622d
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs
@@ -0,0 +1,7256 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Buffers;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Net;
+using System.Runtime.InteropServices;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.DotNet.RemoteExecutor;
+using Xunit;
+using IoUringFixedRecvSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringFixedRecvSnapshotForTest;
+using IoUringNativeDiagnosticsSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringNativeDiagnosticsSnapshotForTest;
+using IoUringProvidedBufferSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringProvidedBufferSnapshotForTest;
+using IoUringSqPollSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringSqPollSnapshotForTest;
+using IoUringZeroCopyPinHoldSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringZeroCopyPinHoldSnapshotForTest;
+using IoUringZeroCopySendSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringZeroCopySendSnapshotForTest;
+
+namespace System.Net.Sockets.Tests
+{
+ // io_uring internals and reflection-based test hooks are currently validated on CoreCLR.
+ [ConditionalClass(typeof(PlatformDetection), nameof(PlatformDetection.IsNotMonoRuntime))]
+ public partial class IoUring
+ {
+ private const int F_GETFD = 1;
+ private const int F_GETFL = 3;
+ private const int FD_CLOEXEC = 1;
+ private const int O_NONBLOCK = 0x800;
+ private const int RLIMIT_NOFILE = 7;
+
+ [StructLayout(LayoutKind.Sequential)]
+ private struct RLimit
+ {
+ public nuint Current;
+ public nuint Maximum;
+ }
+
+ private static class IoUringEnvironmentVariables
+ {
+ public const string Enabled = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING";
+ public const string ProvidedBufferSize = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PROVIDED_BUFFER_SIZE";
+ public const string AdaptiveBufferSizing = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ADAPTIVE_BUFFER_SIZING";
+ public const string RegisterBuffers = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS";
+ public const string SqPoll = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL";
+ public const string ZeroCopySend = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND";
+ public const string DirectSqe = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE";
+ public const string ForceEagainOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_EAGAIN_ONCE_MASK";
+ public const string ForceEcanceledOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ECANCELED_ONCE_MASK";
+ public const string ForceSubmitEpermOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_SUBMIT_EPERM_ONCE";
+ public const string ForceEnterEintrRetryLimitOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ENTER_EINTR_RETRY_LIMIT_ONCE";
+ public const string ForceKernelVersionUnsupported = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_KERNEL_VERSION_UNSUPPORTED";
+ public const string ForceProvidedBufferRingOomOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_PROVIDED_BUFFER_RING_OOM_ONCE";
+ public const string TestEventBufferCount = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_EVENT_BUFFER_COUNT";
+ public const string PrepareQueueCapacity = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PREPARE_QUEUE_CAPACITY";
+ public const string QueueEntries = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_QUEUE_ENTRIES";
+ public const string ThreadCount = "DOTNET_SYSTEM_NET_SOCKETS_THREAD_COUNT";
+ }
+
+ // fcntl uses C int for fd/cmd/return on Linux ABIs.
+ [LibraryImport("libc", EntryPoint = "fcntl", SetLastError = true)]
+ private static partial int Fcntl(int fd, int cmd);
+
+ [LibraryImport("libc", EntryPoint = "getrlimit", SetLastError = true)]
+ private static partial int GetRLimit(int resource, out RLimit limit);
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // Uses Linux-only io_uring publication internals.
+ public static async Task IoUringNonPinnableFallbackPublication_ConcurrentPublishers_EmitSingleDelta()
+ {
+ await RemoteExecutor.Invoke(static () =>
+ {
+ SocketAsyncEngine.IoUringNonPinnableFallbackPublicationState originalState =
+ SocketAsyncEngine.GetIoUringNonPinnableFallbackPublicationStateForTest();
+
+ try
+ {
+ const long firstFallbackCount = 17;
+ const int publisherCount = 16;
+ long[] deltas = new long[publisherCount];
+ using var start = new ManualResetEventSlim(initialState: false);
+ var tasks = new Task[publisherCount];
+
+ SocketAsyncEngine.SetIoUringNonPinnableFallbackPublicationStateForTest(
+ new SocketAsyncEngine.IoUringNonPinnableFallbackPublicationState(
+ publishedCount: 0L,
+ publishingGate: 0,
+ fallbackCount: firstFallbackCount));
+
+ for (int i = 0; i < publisherCount; i++)
+ {
+ int capturedIndex = i;
+ tasks[i] = Task.Run(() =>
+ {
+ start.Wait();
+ deltas[capturedIndex] = SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackDeltaForTest();
+ });
+ }
+
+ start.Set();
+ Task.WaitAll(tasks);
+
+ long deltaTotal = 0;
+ int nonZeroCount = 0;
+ long nonZeroValue = 0;
+ foreach (long delta in deltas)
+ {
+ deltaTotal += delta;
+ if (delta != 0)
+ {
+ nonZeroCount++;
+ nonZeroValue = delta;
+ }
+ }
+
+ Assert.Equal(firstFallbackCount, deltaTotal);
+ Assert.Equal(1, nonZeroCount);
+ Assert.Equal(firstFallbackCount, nonZeroValue);
+
+ const long secondFallbackCount = 23;
+ SocketAsyncEngine.SetIoUringNonPinnableFallbackPublicationStateForTest(
+ new SocketAsyncEngine.IoUringNonPinnableFallbackPublicationState(
+ publishedCount: firstFallbackCount,
+ publishingGate: 0,
+ fallbackCount: secondFallbackCount));
+ Assert.Equal(secondFallbackCount - firstFallbackCount, SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackDeltaForTest());
+ Assert.Equal(0, SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackDeltaForTest());
+ }
+ finally
+ {
+ SocketAsyncEngine.SetIoUringNonPinnableFallbackPublicationStateForTest(originalState);
+ }
+ }).DisposeAsync();
+ }
+
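+ // Builds RemoteInvokeOptions for a child process; passing null for a setting removes
+ // the variable so the child cannot inherit stale io_uring configuration from the host.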
+ private static RemoteInvokeOptions CreateSocketEngineOptions(
+ string? ioUringValue = "1",
+ string? forceEagainOnceMask = null,
+ string? forceEcanceledOnceMask = null,
+ bool? forceSubmitEpermOnce = null,
+ bool? forceEnterEintrRetryLimitOnce = null,
+ bool? forceKernelVersionUnsupported = null,
+ bool? forceProvidedBufferRingOomOnce = null,
+ int? testEventBufferCount = null,
+ string? testEventBufferCountRaw = null,
+ int? prepareQueueCapacity = null,
+ int? queueEntries = null,
+ int? threadCount = null,
+ int? providedBufferSize = null,
+ bool? adaptiveBufferSizingEnabled = null,
+ bool? registerBuffersEnabled = null,
+ bool? sqPollEnabled = null,
+ bool? directSqeEnabled = null,
+ bool? zeroCopySendEnabled = null)
+ {
+ static void SetOrRemoveEnvironmentVariable(RemoteInvokeOptions options, string name, string? value)
+ {
+ if (value is null)
+ {
+ options.StartInfo.EnvironmentVariables.Remove(name);
+ }
+ else
+ {
+ options.StartInfo.EnvironmentVariables[name] = value;
+ }
+ }
+
+ static void ValidateSocketEngineOptionCombination(int? configuredEventBufferCount, string? configuredEventBufferCountRaw)
+ {
+ if (configuredEventBufferCount.HasValue && configuredEventBufferCountRaw is not null)
+ {
+ throw new ArgumentException(
+ "Specify either testEventBufferCount or testEventBufferCountRaw, not both.",
+ nameof(configuredEventBufferCountRaw));
+ }
+ }
+
+ ValidateSocketEngineOptionCombination(testEventBufferCount, testEventBufferCountRaw);
+
+ RemoteInvokeOptions options = new RemoteInvokeOptions();
+ string? configuredEventBufferCount =
+ testEventBufferCountRaw ?? (testEventBufferCount.HasValue ? testEventBufferCount.Value.ToString() : null);
+ (string Name, string? Value)[] ioUringEnvironmentAssignments =
+ {
+ (IoUringEnvironmentVariables.Enabled, ioUringValue),
+ (IoUringEnvironmentVariables.ProvidedBufferSize, providedBufferSize?.ToString()),
+ (IoUringEnvironmentVariables.AdaptiveBufferSizing, adaptiveBufferSizingEnabled.HasValue ? (adaptiveBufferSizingEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.RegisterBuffers, registerBuffersEnabled.HasValue ? (registerBuffersEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.SqPoll, sqPollEnabled.HasValue ? (sqPollEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.DirectSqe, directSqeEnabled.HasValue ? (directSqeEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ZeroCopySend, zeroCopySendEnabled.HasValue ? (zeroCopySendEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ForceEagainOnceMask, string.IsNullOrEmpty(forceEagainOnceMask) ? null : forceEagainOnceMask),
+ (IoUringEnvironmentVariables.ForceEcanceledOnceMask, string.IsNullOrEmpty(forceEcanceledOnceMask) ? null : forceEcanceledOnceMask),
+ (IoUringEnvironmentVariables.ForceSubmitEpermOnce, forceSubmitEpermOnce.HasValue ? (forceSubmitEpermOnce.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ForceEnterEintrRetryLimitOnce, forceEnterEintrRetryLimitOnce.HasValue ? (forceEnterEintrRetryLimitOnce.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ForceKernelVersionUnsupported, forceKernelVersionUnsupported.HasValue ? (forceKernelVersionUnsupported.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ForceProvidedBufferRingOomOnce, forceProvidedBufferRingOomOnce.HasValue ? (forceProvidedBufferRingOomOnce.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.TestEventBufferCount, configuredEventBufferCount),
+ (IoUringEnvironmentVariables.PrepareQueueCapacity, prepareQueueCapacity?.ToString()),
+ (IoUringEnvironmentVariables.QueueEntries, queueEntries?.ToString()),
+ (IoUringEnvironmentVariables.ThreadCount, threadCount?.ToString()),
+ };
+
+ foreach ((string Name, string? Value) assignment in ioUringEnvironmentAssignments)
+ {
+ SetOrRemoveEnvironmentVariable(options, assignment.Name, assignment.Value);
+ }
+
+ options.TimeOut = (int)TimeSpan.FromMinutes(10).TotalMilliseconds;
+ return options;
+ }
+
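+ // Normalizes the Task- and ValueTask-returning Socket overloads to one awaitable shape.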
+ private static Task<T> ToTask<T>(Task<T> task) => task;
+ private static Task<T> ToTask<T>(ValueTask<T> task) => task.AsTask();
+
+ private static async Task<T> AwaitWithTimeoutAsync<T>(Task<T> task, string operationName)
+ {
+ Task completed = await Task.WhenAny(task, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.True(ReferenceEquals(task, completed), $"Timed out waiting for {operationName}");
+ return await task;
+ }
+
+ private static void AssertCanceledOrInterrupted(Exception? ex)
+ {
+ Assert.NotNull(ex);
+ Assert.True(
+ ex is OperationCanceledException ||
+ ex is SocketException socketException &&
+ (socketException.SocketErrorCode == SocketError.OperationAborted ||
+ socketException.SocketErrorCode == SocketError.Interrupted),
+ $"Unexpected exception: {ex}");
+ }
+
+ private static void AssertCanceledDisposedOrInterrupted(Exception? ex)
+ {
+ if (ex is null)
+ {
+ return;
+ }
+
+ Assert.True(
+ ex is ObjectDisposedException ||
+ ex is OperationCanceledException ||
+ ex is SocketException socketException &&
+ (socketException.SocketErrorCode == SocketError.OperationAborted ||
+ socketException.SocketErrorCode == SocketError.Interrupted),
+ $"Unexpected exception: {ex}");
+ }
+
+ private static bool IsProvidedBufferSnapshotUsable(IoUringProvidedBufferSnapshot snapshot) =>
+ snapshot.HasIoUringPort &&
+ snapshot.SupportsProvidedBufferRings &&
+ snapshot.HasProvidedBufferRing &&
+ snapshot.TotalBufferCount > 0;
+
+ private static bool IsAdaptiveSizingUsable(IoUringProvidedBufferSnapshot snapshot) =>
+ IsProvidedBufferSnapshotUsable(snapshot) && snapshot.AdaptiveBufferSizingEnabled;
+
+ private static bool IsFixedRecvEnabled(IoUringFixedRecvSnapshot snapshot) =>
+ snapshot.SupportsReadFixed && snapshot.HasRegisteredBuffers;
+
+ private static bool IsSqPollActive(IoUringSqPollSnapshot snapshot) =>
+ snapshot.HasIoUringPort && snapshot.SqPollEnabled;
+
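+ // Memory whose Pin always throws, steering preparation onto the non-pinnable
+ // fallback path exercised by the fallback scenarios below.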
+ private sealed class NonPinnableMemoryManager : MemoryManager<byte>
+ {
+ private readonly byte[] _buffer;
+
+ public NonPinnableMemoryManager(byte[] buffer)
+ {
+ _buffer = buffer;
+ }
+
+ public override Span<byte> GetSpan() => _buffer;
+
+ public override MemoryHandle Pin(int elementIndex = 0)
+ {
+ _ = elementIndex;
+ throw new NotSupportedException("Non-pinnable test memory.");
+ }
+
+ public override void Unpin()
+ {
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ }
+ }
+
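+ // Counts Pin/Unpin calls so tests can assert that completed, canceled, and
+ // aborted operations all release their pins.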
+ private sealed unsafe class TrackingPinnableMemoryManager : MemoryManager<byte>
+ {
+ private readonly byte[] _buffer;
+ private int _pinCount;
+ private int _unpinCount;
+
+ public TrackingPinnableMemoryManager(byte[] buffer)
+ {
+ _buffer = buffer;
+ }
+
+ public int PinCount => Volatile.Read(ref _pinCount);
+ public int UnpinCount => Volatile.Read(ref _unpinCount);
+
+ public override Span<byte> GetSpan() => _buffer;
+
+ public override MemoryHandle Pin(int elementIndex = 0)
+ {
+ if ((uint)elementIndex > (uint)_buffer.Length)
+ {
+ throw new ArgumentOutOfRangeException(nameof(elementIndex));
+ }
+
+ Interlocked.Increment(ref _pinCount);
+ GCHandle handle = GCHandle.Alloc(_buffer, GCHandleType.Pinned);
+ byte* pointer = (byte*)handle.AddrOfPinnedObject() + elementIndex;
+ return new MemoryHandle(pointer, handle, this);
+ }
+
+ public override void Unpin()
+ {
+ Interlocked.Increment(ref _unpinCount);
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ }
+ }
+
+#if DEBUG
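+ // Turns Debug.Fail into a thrown exception so DEBUG-only assertion failures
+ // surface as test failures instead of terminating the process.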
+ private sealed class ThrowingTraceListener : TraceListener
+ {
+ public override void Write(string? message)
+ {
+ }
+
+ public override void WriteLine(string? message)
+ {
+ }
+
+ public override void Fail(string? message, string? detailMessage)
+ {
+ throw new InvalidOperationException($"{message} {detailMessage}");
+ }
+ }
+#endif
+
+ private static long GetIoUringPrepareNonPinnableFallbackCounterValue()
+ => (long)SocketsTelemetry.GetIoUringCounterValueForTest(SocketsTelemetry.IoUringCounterFieldForTest.PrepareNonPinnableFallbacks);
+
+ private static long GetIoUringCompletionRequeueFailureCounterValue()
+ => (long)SocketsTelemetry.GetIoUringCounterValueForTest(SocketsTelemetry.IoUringCounterFieldForTest.CompletionRequeueFailures);
+
+ private static bool InvokeSocketAsyncEngineBoolMethod(string methodName)
+ {
+ return methodName switch
+ {
+ "IsIoUringEnabled" => SocketAsyncEngine.IsIoUringEnabledForTest(),
+ "IsSqPollRequested" => SocketAsyncEngine.IsSqPollRequestedForTest(),
+ "IsIoUringDirectSqeDisabled" => SocketAsyncEngine.IsIoUringDirectSqeDisabledForTest(),
+ "IsZeroCopySendOptedIn" => SocketAsyncEngine.IsZeroCopySendOptedInForTest(),
+ "IsIoUringRegisterBuffersEnabled" => SocketAsyncEngine.IsIoUringRegisterBuffersEnabledForTest(),
+ _ => throw new ArgumentOutOfRangeException(nameof(methodName), methodName, "Unknown SocketAsyncEngine bool selector."),
+ };
+ }
+
+ private static void AssertBooleanAppContextSwitch(
+ string switchName,
+ string methodName,
+ bool expectedWhenSwitchTrue,
+ bool expectedWhenSwitchFalse)
+ {
+ AppContext.SetSwitch(switchName, true);
+ Assert.Equal(expectedWhenSwitchTrue, InvokeSocketAsyncEngineBoolMethod(methodName));
+
+ AppContext.SetSwitch(switchName, false);
+ Assert.Equal(expectedWhenSwitchFalse, InvokeSocketAsyncEngineBoolMethod(methodName));
+ }
+
+ private static ulong GetIoUringTelemetryCounterValue(string fieldName)
+ {
+ SocketsTelemetry.IoUringCounterFieldForTest counter = fieldName switch
+ {
+ "_ioUringPrepareNonPinnableFallbacks" => SocketsTelemetry.IoUringCounterFieldForTest.PrepareNonPinnableFallbacks,
+ "_ioUringAsyncCancelRequestCqes" => SocketsTelemetry.IoUringCounterFieldForTest.AsyncCancelRequestCqes,
+ "_ioUringSocketEventBufferFull" => SocketsTelemetry.IoUringCounterFieldForTest.SocketEventBufferFull,
+ "_ioUringCqOverflow" => SocketsTelemetry.IoUringCounterFieldForTest.CqOverflow,
+ "_ioUringCqOverflowRecoveries" => SocketsTelemetry.IoUringCounterFieldForTest.CqOverflowRecoveries,
+ "_ioUringCompletionRequeueFailures" => SocketsTelemetry.IoUringCounterFieldForTest.CompletionRequeueFailures,
+ "_ioUringPrepareQueueOverflows" => SocketsTelemetry.IoUringCounterFieldForTest.PrepareQueueOverflows,
+ "_ioUringPrepareQueueOverflowFallbacks" => SocketsTelemetry.IoUringCounterFieldForTest.PrepareQueueOverflowFallbacks,
+ "_ioUringCompletionSlotExhaustions" => SocketsTelemetry.IoUringCounterFieldForTest.CompletionSlotExhaustions,
+ "_ioUringCompletionSlotHighWaterMark" => SocketsTelemetry.IoUringCounterFieldForTest.CompletionSlotHighWaterMark,
+ "_ioUringSqPollWakeups" => SocketsTelemetry.IoUringCounterFieldForTest.SqPollWakeups,
+ "_ioUringSqPollSubmissionsSkipped" => SocketsTelemetry.IoUringCounterFieldForTest.SqPollSubmissionsSkipped,
+ "_ioUringProvidedBufferDepletions" => SocketsTelemetry.IoUringCounterFieldForTest.ProvidedBufferDepletions,
+ "_ioUringProvidedBufferRecycles" => SocketsTelemetry.IoUringCounterFieldForTest.ProvidedBufferRecycles,
+ "_ioUringRegisteredBuffersInitialSuccess" => SocketsTelemetry.IoUringCounterFieldForTest.RegisteredBuffersInitialSuccess,
+ "_ioUringRegisteredBuffersInitialFailure" => SocketsTelemetry.IoUringCounterFieldForTest.RegisteredBuffersInitialFailure,
+ "_ioUringRegisteredBuffersReregistrationSuccess" => SocketsTelemetry.IoUringCounterFieldForTest.RegisteredBuffersReregistrationSuccess,
+ "_ioUringRegisteredBuffersReregistrationFailure" => SocketsTelemetry.IoUringCounterFieldForTest.RegisteredBuffersReregistrationFailure,
+ "_ioUringPersistentMultishotRecvReuse" => SocketsTelemetry.IoUringCounterFieldForTest.PersistentMultishotRecvReuse,
+ "_ioUringPersistentMultishotRecvTermination" => SocketsTelemetry.IoUringCounterFieldForTest.PersistentMultishotRecvTermination,
+ _ => throw new ArgumentOutOfRangeException(nameof(fieldName), fieldName, "Unknown io_uring telemetry counter selector."),
+ };
+
+ return SocketsTelemetry.GetIoUringCounterValueForTest(counter);
+ }
+
+ private static long GetIoUringPendingRetryQueuedToPrepareQueueCount()
+ => SocketAsyncEngine.GetIoUringPendingRetryQueuedToPrepareQueueCountForTest();
+
+ private static void AssertNativeMsghdrLayoutContractForIoUring()
+ {
+ SocketAsyncEngine.IoUringNativeMsghdrLayoutSnapshotForTest layout =
+ SocketAsyncEngine.GetIoUringNativeMsghdrLayoutForTest();
+
+ Assert.Equal(56, layout.Size);
+ Assert.Equal(0, layout.MsgNameOffset);
+ Assert.Equal(8, layout.MsgNameLengthOffset);
+ Assert.Equal(16, layout.MsgIovOffset);
+ Assert.Equal(24, layout.MsgIovLengthOffset);
+ Assert.Equal(32, layout.MsgControlOffset);
+ Assert.Equal(40, layout.MsgControlLengthOffset);
+ Assert.Equal(48, layout.MsgFlagsOffset);
+ }
+
+ private static void AssertNativeMsghdr32BitRejectionPathForIoUring()
+ {
+ Assert.True(SocketAsyncEngine.IsNativeMsghdrLayoutSupportedForIoUringForTest(pointerSize: 8, nativeMsghdrSize: 56));
+ Assert.False(SocketAsyncEngine.IsNativeMsghdrLayoutSupportedForIoUringForTest(pointerSize: 4, nativeMsghdrSize: 56));
+ Assert.False(SocketAsyncEngine.IsNativeMsghdrLayoutSupportedForIoUringForTest(pointerSize: 8, nativeMsghdrSize: 48));
+ }
+
+ private static void AssertIoUringCompletionSlotLayoutContractForIoUring()
+ {
+ SocketAsyncEngine.IoUringCompletionSlotLayoutSnapshotForTest layout =
+ SocketAsyncEngine.GetIoUringCompletionSlotLayoutForTest();
+
+ Assert.Equal(24, layout.Size);
+ Assert.Equal(0, layout.GenerationOffset);
+ Assert.Equal(8, layout.FreeListNextOffset);
+ Assert.Equal(12, layout.PackedStateOffset);
+ Assert.Equal(16, layout.FixedRecvBufferIdOffset);
+ if (layout.TestForcedResultOffset >= 0)
+ {
+ Assert.Equal(20, layout.TestForcedResultOffset);
+ }
+ }
+
+ private static bool TryInjectIoUringCqOverflowForTest(uint delta, out int injectedEngineCount)
+ => SocketAsyncEngine.TryInjectIoUringCqOverflowForTest(delta, out injectedEngineCount);
+
+ private static bool AssertIoUringCqReflectionTargetsStableForTest()
+ => SocketAsyncEngine.HasActiveIoUringEngineWithInitializedCqStateForTest();
+
+ private static int GetIoUringCompletionSlotsInUseForTest()
+ => SocketAsyncEngine.GetIoUringCompletionSlotsInUseForTest();
+
+ private static int GetIoUringTrackedOperationCountForTest()
+ => SocketAsyncEngine.GetIoUringTrackedOperationCountForTest();
+
+ private static ulong EncodeCompletionSlotUserDataForTest(int slotIndex, ulong generation)
+ => SocketAsyncEngine.EncodeCompletionSlotUserDataForTest(slotIndex, generation);
+
+ private static bool TryDecodeCompletionSlotUserDataForTest(ulong userData, out int slotIndex, out ulong generation)
+ => SocketAsyncEngine.TryDecodeCompletionSlotUserDataForTest(userData, out slotIndex, out generation);
+
+ private static ulong IncrementCompletionSlotGenerationForTest(ulong generation)
+ => SocketAsyncEngine.IncrementCompletionSlotGenerationForTest(generation);
+
+ private static bool IsTrackedIoUringUserDataForTest(ulong userData)
+ => SocketAsyncEngine.IsTrackedIoUringUserDataForTest(userData);
+
+ private static bool TryGetIoUringRingFdForTest(out int ringFd)
+ => SocketAsyncEngine.TryGetIoUringRingFdForTest(out ringFd);
+
+ private static bool TryGetIoUringWakeupEventFdForTest(out int eventFd)
+ => SocketAsyncEngine.TryGetIoUringWakeupEventFdForTest(out eventFd);
+
+ private static bool TryGetFirstIoUringEngineForTest(out SocketAsyncEngine? ioUringEngine)
+ {
+ return SocketAsyncEngine.TryGetFirstIoUringEngineForTest(out ioUringEngine);
+ }
+
+ private static void AssertCompletionSlotUserDataEncodingBoundaryContractForIoUring()
+ {
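+ // Widest values the encoding must round-trip: a 13-bit slot index (8191 = 2^13 - 1)
+ // and a 43-bit generation; wrap-around is expected to skip generation 0.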
+ const int MaxSlotIndex = 8191;
+ const ulong MaxGeneration = (1UL << 43) - 1;
+
+ ulong encoded = EncodeCompletionSlotUserDataForTest(MaxSlotIndex, MaxGeneration);
+ Assert.True(TryDecodeCompletionSlotUserDataForTest(encoded, out int decodedSlotIndex, out ulong decodedGeneration));
+ Assert.Equal(MaxSlotIndex, decodedSlotIndex);
+ Assert.Equal(MaxGeneration, decodedGeneration);
+
+ ulong wrappedGeneration = IncrementCompletionSlotGenerationForTest(MaxGeneration);
+ Assert.Equal(1UL, wrappedGeneration);
+
+ ulong wrappedEncoded = EncodeCompletionSlotUserDataForTest(MaxSlotIndex, wrappedGeneration);
+ Assert.True(TryDecodeCompletionSlotUserDataForTest(wrappedEncoded, out int wrappedSlotIndex, out ulong wrappedDecodedGeneration));
+ Assert.Equal(MaxSlotIndex, wrappedSlotIndex);
+ Assert.Equal(1UL, wrappedDecodedGeneration);
+ }
+
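+ // The polling waiters below take one final sample after the deadline so a
+ // completion that lands just as time expires still counts as success.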
+ private static async Task<bool> WaitForIoUringTelemetryCounterAtLeastAsync(string counterFieldName, ulong targetValue, int timeoutMilliseconds = 30000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (GetIoUringTelemetryCounterValue(counterFieldName) >= targetValue)
+ {
+ return true;
+ }
+
+ await Task.Delay(25);
+ }
+
+ return GetIoUringTelemetryCounterValue(counterFieldName) >= targetValue;
+ }
+
+ private static async Task<bool> WaitForIoUringCompletionSlotsInUseAtMostAsync(int maxValue, int timeoutMilliseconds = 10000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (GetIoUringCompletionSlotsInUseForTest() <= maxValue)
+ {
+ return true;
+ }
+
+ await Task.Delay(25);
+ }
+
+ return GetIoUringCompletionSlotsInUseForTest() <= maxValue;
+ }
+
+ private static async Task<bool> WaitForIoUringCompletionSlotsInUseAboveAsync(int baselineValue, int minimumDelta, int timeoutMilliseconds = 10000)
+ {
+ int threshold = baselineValue + minimumDelta;
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (GetIoUringCompletionSlotsInUseForTest() > threshold)
+ {
+ return true;
+ }
+
+ await Task.Delay(25);
+ }
+
+ return GetIoUringCompletionSlotsInUseForTest() > threshold;
+ }
+
+ private static async Task<bool> WaitForIoUringTrackedOperationsAtMostAsync(int maxValue, int timeoutMilliseconds = 10000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (GetIoUringTrackedOperationCountForTest() <= maxValue)
+ {
+ return true;
+ }
+
+ await Task.Delay(25);
+ }
+
+ return GetIoUringTrackedOperationCountForTest() <= maxValue;
+ }
+
+ private static bool IsIoUringMultishotRecvSupported()
+ => SocketAsyncEngine.IsIoUringMultishotRecvSupportedForTest();
+
+ private static bool IsIoUringMultishotAcceptSupported()
+ => SocketAsyncEngine.IsIoUringMultishotAcceptSupportedForTest();
+
+ private static bool IsListenerMultishotAcceptArmed(Socket listener)
+ => SocketAsyncContext.IsMultishotAcceptArmedForTest(listener);
+
+ private static int GetListenerMultishotAcceptQueueCount(Socket listener)
+ => SocketAsyncContext.GetMultishotAcceptQueueCountForTest(listener);
+
+ private static async Task<bool> WaitForMultishotAcceptArmedStateAsync(Socket listener, bool expectedArmed, int timeoutMilliseconds = 5000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (IsListenerMultishotAcceptArmed(listener) == expectedArmed)
+ {
+ return true;
+ }
+
+ await Task.Delay(20);
+ }
+
+ return IsListenerMultishotAcceptArmed(listener) == expectedArmed;
+ }
+
+ private static bool IsPersistentMultishotRecvArmed(Socket socket)
+ => SocketAsyncContext.IsPersistentMultishotRecvArmedForTest(socket);
+
+ private static ulong GetPersistentMultishotRecvUserData(Socket socket)
+ => SocketAsyncContext.GetPersistentMultishotRecvUserDataForTest(socket);
+
+ private static int GetPersistentMultishotRecvBufferedCount(Socket socket)
+ => SocketAsyncContext.GetPersistentMultishotRecvBufferedCountForTest(socket);
+
+ private static async Task<bool> WaitForPersistentMultishotRecvArmedStateAsync(Socket socket, bool expectedArmed, int timeoutMilliseconds = 5000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (IsPersistentMultishotRecvArmed(socket) == expectedArmed)
+ {
+ return true;
+ }
+
+ await Task.Delay(20);
+ }
+
+ return IsPersistentMultishotRecvArmed(socket) == expectedArmed;
+ }
+
+ private static bool HasSufficientFileDescriptorLimit(int requiredDescriptorCount)
+ {
+ if (requiredDescriptorCount <= 0)
+ {
+ return true;
+ }
+
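+ // If the limit cannot be queried, assume it is sufficient rather than skip the test.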
+ if (GetRLimit(RLIMIT_NOFILE, out RLimit limit) != 0)
+ {
+ return true;
+ }
+
+ return limit.Current >= (nuint)requiredDescriptorCount;
+ }
+
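+ // Probes /proc/self/fd in an exec'd /bin/sh child to observe whether the
+ // descriptor survives exec (i.e., whether CLOEXEC took effect).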
+ private static bool DoesExecChildObserveFileDescriptor(int fd)
+ {
+ if (fd < 0)
+ {
+ return false;
+ }
+
+ using Process process = Process.Start(
+ new ProcessStartInfo
+ {
+ FileName = "/bin/sh",
+ Arguments = $"-c \"[ -e /proc/self/fd/{fd} ]\"",
+ UseShellExecute = false,
+ })!;
+
+ process.WaitForExit();
+ return process.ExitCode == 0;
+ }
+
+ private static async Task<IoUringZeroCopyPinHoldSnapshot> WaitForZeroCopyPinHoldSnapshotAsync(
+ Func<IoUringZeroCopyPinHoldSnapshot, bool> predicate,
+ int timeoutMilliseconds = 5000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ IoUringZeroCopyPinHoldSnapshot snapshot = GetIoUringZeroCopyPinHoldSnapshot();
+ while (DateTime.UtcNow < deadline)
+ {
+ if (predicate(snapshot))
+ {
+ return snapshot;
+ }
+
+ await Task.Delay(20);
+ snapshot = GetIoUringZeroCopyPinHoldSnapshot();
+ }
+
+ return snapshot;
+ }
+
+ private static async Task AssertConnectedPairRoundTripAsync(Socket client, Socket server, byte marker)
+ {
+ byte[] payload = new byte[] { marker };
+ byte[] receiveBuffer = new byte[1];
+ Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ Assert.Equal(marker, receiveBuffer[0]);
+ }
+
+ private static async Task AssertPinsReleasedAsync(TrackingPinnableMemoryManager manager)
+ {
+ DateTime start = DateTime.UtcNow;
+ while (manager.PinCount != manager.UnpinCount)
+ {
+ if (DateTime.UtcNow - start > TimeSpan.FromSeconds(10))
+ {
+ break;
+ }
+
+ await Task.Delay(20);
+ }
+
+ Assert.True(manager.PinCount > 0, "Expected at least one pin.");
+ Assert.Equal(manager.PinCount, manager.UnpinCount);
+ }
+
+ private static IoUringNativeDiagnosticsSnapshot GetIoUringNativeDiagnosticsSnapshot()
+ {
+ return SocketAsyncEngine.GetIoUringNativeDiagnosticsSnapshotForTest();
+ }
+
+ private static IoUringProvidedBufferSnapshot GetIoUringProvidedBufferSnapshot()
+ {
+ return SocketAsyncEngine.GetIoUringProvidedBufferSnapshotForTest();
+ }
+
+ private static IoUringZeroCopySendSnapshot GetIoUringZeroCopySendSnapshot()
+ {
+ return SocketAsyncEngine.GetIoUringZeroCopySendSnapshotForTest();
+ }
+
+ private static IoUringFixedRecvSnapshot GetIoUringFixedRecvSnapshot()
+ {
+ return SocketAsyncEngine.GetIoUringFixedRecvSnapshotForTest();
+ }
+
+ private static IoUringSqPollSnapshot GetIoUringSqPollSnapshot()
+ {
+ return SocketAsyncEngine.GetIoUringSqPollSnapshotForTest();
+ }
+
+ private static bool IsAnyIoUringSqPollEngineNeedingWakeup()
+ => SocketAsyncEngine.IsAnyIoUringSqPollEngineNeedingWakeupForTest();
+
+ private static bool ValidateSqNeedWakeupMatchesRawSqFlagBit()
+ {
+ if (!SocketAsyncEngine.TryValidateSqNeedWakeupMatchesRawSqFlagBitForTest(out bool matches))
+ {
+ return false;
+ }
+
+ Assert.True(matches, "SqNeedWakeup should match the SQ_NEED_WAKEUP bit contract.");
+ return true;
+ }
+
+ private static void EnableSqPollAppContextOptIn() =>
+ AppContext.SetSwitch("System.Net.Sockets.UseIoUringSqPoll", true);
+
+ private static IoUringZeroCopyPinHoldSnapshot GetIoUringZeroCopyPinHoldSnapshot()
+ {
+ return SocketAsyncEngine.GetIoUringZeroCopyPinHoldSnapshotForTest();
+ }
+
+ private static bool TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount)
+ => SocketAsyncEngine.TryForceIoUringProvidedBufferRingExhaustionForTest(out forcedBufferCount);
+
+ private static bool TryRecycleForcedIoUringProvidedBufferRingForTest(out int recycledBufferCount)
+ => SocketAsyncEngine.TryRecycleForcedIoUringProvidedBufferRingForTest(out recycledBufferCount);
+
+ private static ulong CounterDelta(ulong before, ulong after)
+ {
+ Assert.True(after >= before, $"Expected monotonic io_uring counter. before={before}, after={after}");
+ return after - before;
+ }
+
+ private static async Task WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+ Func<Task> scenario,
+ Action<IoUringNativeDiagnosticsSnapshot, IoUringNativeDiagnosticsSnapshot> validateDelta,
+ int settleDelayMilliseconds = 0,
+ bool skipScenarioWhenIoUringUnavailable = false)
+ {
+ IoUringNativeDiagnosticsSnapshot diagnosticsBefore = GetIoUringNativeDiagnosticsSnapshot();
+ if (skipScenarioWhenIoUringUnavailable && !diagnosticsBefore.HasIoUringPort)
+ {
+ return;
+ }
+
+ await scenario();
+
+ if (settleDelayMilliseconds > 0)
+ {
+ await Task.Delay(settleDelayMilliseconds);
+ }
+
+ IoUringNativeDiagnosticsSnapshot diagnosticsAfter = GetIoUringNativeDiagnosticsSnapshot();
+ if (!diagnosticsBefore.HasIoUringPort && !diagnosticsAfter.HasIoUringPort)
+ {
+ return;
+ }
+
+ validateDelta(diagnosticsBefore, diagnosticsAfter);
+ }
+
+ private static Task<SocketAsyncEventArgs> StartReceiveMessageFromAsync(Socket socket, SocketAsyncEventArgs eventArgs)
+ => StartSocketAsyncEventArgsOperation(socket, eventArgs, static (s, args) => s.ReceiveMessageFromAsync(args));
+
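+ // Wraps the SocketAsyncEventArgs begin/Completed pattern in a Task; a false
+ // return from the start delegate means the operation completed synchronously.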
+ private static Task<SocketAsyncEventArgs> StartSocketAsyncEventArgsOperation(
+ Socket socket,
+ SocketAsyncEventArgs eventArgs,
+ Func<Socket, SocketAsyncEventArgs, bool> startOperation)
+ {
+ var tcs = new TaskCompletionSource<SocketAsyncEventArgs>(TaskCreationOptions.RunContinuationsAsynchronously);
+ EventHandler<SocketAsyncEventArgs> handler = null!;
+ handler = (_, completedArgs) =>
+ {
+ eventArgs.Completed -= handler;
+ tcs.TrySetResult(completedArgs);
+ };
+
+ eventArgs.Completed += handler;
+ if (!startOperation(socket, eventArgs))
+ {
+ eventArgs.Completed -= handler;
+ tcs.TrySetResult(eventArgs);
+ }
+
+ return tcs.Task;
+ }
+
+ private static async Task<(Socket Listener, Socket Client, Socket Server)> CreateConnectedTcpSocketTrioAsync(int listenBacklog = 1)
+ {
+ Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ try
+ {
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(listenBacklog);
+
+ Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ try
+ {
+ Task<Socket> acceptTask = listener.AcceptAsync();
+ await client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!);
+ Socket server = await acceptTask;
+ return (listener, client, server);
+ }
+ catch
+ {
+ client.Dispose();
+ throw;
+ }
+ }
+ catch
+ {
+ listener.Dispose();
+ throw;
+ }
+ }
+
+ private static async Task<(Socket Client, Socket Server)> AcceptConnectedTcpPairAsync(Socket listener, IPEndPoint endpoint)
+ {
+ Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ try
+ {
+ Task<Socket> acceptTask = listener.AcceptAsync();
+ await client.ConnectAsync(endpoint);
+ Socket server = await acceptTask;
+ return (client, server);
+ }
+ catch
+ {
+ client.Dispose();
+ throw;
+ }
+ }
+
+ private static async Task RunTcpRoundTripAsync(int iterations)
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] sendBuffer = new byte[] { 1 };
+ byte[] receiveBuffer = new byte[1];
+
+ for (int i = 0; i < iterations; i++)
+ {
+ var serverReceiveTask = server.ReceiveAsync(receiveBuffer, SocketFlags.None);
+ await Task.Yield();
+
+ int clientSent = await client.SendAsync(sendBuffer, SocketFlags.None);
+ Assert.Equal(1, clientSent);
+
+ int serverReceived = await serverReceiveTask;
+ Assert.Equal(1, serverReceived);
+ Assert.Equal(sendBuffer[0], receiveBuffer[0]);
+
+ var clientReceiveTask = client.ReceiveAsync(receiveBuffer, SocketFlags.None);
+ await Task.Yield();
+
+ int serverSent = await server.SendAsync(sendBuffer, SocketFlags.None);
+ Assert.Equal(1, serverSent);
+
+ int clientReceived = await clientReceiveTask;
+ Assert.Equal(1, clientReceived);
+ Assert.Equal(sendBuffer[0], receiveBuffer[0]);
+
+ unchecked
+ {
+ sendBuffer[0]++;
+ }
+ }
+ }
+
+ private static async Task RunUnixDomainSocketRoundTripAsync()
+ {
+ if (!Socket.OSSupportsUnixDomainSockets)
+ {
+ return;
+ }
+
+ string path = UnixDomainSocketTest.GetRandomNonExistingFilePath();
+ var endpoint = new UnixDomainSocketEndPoint(path);
+ try
+ {
+ using Socket listener = new Socket(AddressFamily.Unix, SocketType.Stream, ProtocolType.Unspecified);
+ listener.Bind(endpoint);
+ listener.Listen(1);
+
+ using Socket client = new Socket(AddressFamily.Unix, SocketType.Stream, ProtocolType.Unspecified);
+ Task<Socket> acceptTask = listener.AcceptAsync();
+ await client.ConnectAsync(endpoint);
+
+ using Socket server = await acceptTask;
+ await AssertConnectedPairRoundTripAsync(client, server, 0x31);
+ await AssertConnectedPairRoundTripAsync(server, client, 0x32);
+ }
+ finally
+ {
+ try
+ {
+ System.IO.File.Delete(path);
+ }
+ catch
+ {
+ }
+ }
+ }
+
+ private static async Task RunHybridIoUringAndEpollEngineScenarioAsync()
+ {
+ await RunTcpRoundTripAsync(4);
+
+ // With DOTNET_SYSTEM_NET_SOCKETS_THREAD_COUNT=2, one io_uring engine indicates a hybrid mix.
+ if (SocketAsyncEngine.GetActiveIoUringEnginesForTest().Length != 1)
+ {
+ return;
+ }
+
+ const int ConnectionCount = 32;
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(ConnectionCount);
+ IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+ var acceptTasks = new Task<Socket>[ConnectionCount];
+ var clients = new Socket[ConnectionCount];
+ var connectTasks = new Task[ConnectionCount];
+
+ for (int i = 0; i < ConnectionCount; i++)
+ {
+ acceptTasks[i] = listener.AcceptAsync();
+ }
+
+ for (int i = 0; i < ConnectionCount; i++)
+ {
+ clients[i] = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ connectTasks[i] = clients[i].ConnectAsync(endpoint);
+ }
+
+ await Task.WhenAll(connectTasks);
+ Socket[] servers = await Task.WhenAll(acceptTasks);
+
+ try
+ {
+ var work = new Task[ConnectionCount];
+ for (int i = 0; i < ConnectionCount; i++)
+ {
+ Socket client = clients[i];
+ Socket server = servers[i];
+ byte value = (byte)(i + 1);
+
+ work[i] = Task.Run(async () =>
+ {
+ byte[] tx = new byte[] { value };
+ byte[] rx = new byte[1];
+
+ int sent = await client.SendAsync(tx, SocketFlags.None);
+ Assert.Equal(1, sent);
+
+ int received = await server.ReceiveAsync(rx, SocketFlags.None);
+ Assert.Equal(1, received);
+ Assert.Equal(value, rx[0]);
+
+ sent = await server.SendAsync(tx, SocketFlags.None);
+ Assert.Equal(1, sent);
+
+ received = await client.ReceiveAsync(rx, SocketFlags.None);
+ Assert.Equal(1, received);
+ Assert.Equal(value, rx[0]);
+ });
+ }
+
+ await Task.WhenAll(work);
+ }
+ finally
+ {
+ for (int i = 0; i < ConnectionCount; i++)
+ {
+ servers[i].Dispose();
+ clients[i].Dispose();
+ }
+ }
+ }
+
+ private static async Task RunThreadCountTwoCancellationRoutingScenarioAsync()
+ {
+ await RunHybridIoUringAndEpollEngineScenarioAsync();
+
+ SocketAsyncEngine[] ioUringEngines = SocketAsyncEngine.GetActiveIoUringEnginesForTest();
+ if (ioUringEngines.Length != 1)
+ {
+ return;
+ }
+
+ SocketAsyncEngine ioUringEngine = ioUringEngines[0];
+ long queueLengthBefore = ioUringEngine.IoUringCancelQueueLengthForTest;
+ long wakeRetryBefore = ioUringEngine.IoUringCancelQueueWakeRetryCountForTest;
+
+ await RunCancellationSubmitContentionScenarioAsync(connectionCount: 8, cancellationsPerConnection: 64);
+
+ Assert.True(queueLengthBefore >= 0);
+ Assert.True(ioUringEngine.IoUringCancelQueueLengthForTest >= 0);
+ Assert.True(
+ ioUringEngine.IoUringCancelQueueLengthForTest <= SocketAsyncEngine.GetIoUringCancellationQueueCapacityForTest());
+ Assert.True(ioUringEngine.IoUringCancelQueueWakeRetryCountForTest >= wakeRetryBefore);
+ }
+
+ private static async Task RunKernelVersionUnsupportedFallbackScenarioAsync()
+ {
+ await RunTcpRoundTripAsync(4);
+ Assert.Equal(0, SocketAsyncEngine.GetActiveIoUringEnginesForTest().Length);
+ }
+
+ private static async Task RunTrackedOperationGenerationTransitionStressScenarioAsync(int connectionCount, int iterationsPerConnection)
+ {
+ if (!PlatformDetection.IsArm64Process)
+ {
+ return;
+ }
+
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(connectionCount);
+ IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+ int baselineCompletionSlotsInUse = GetIoUringCompletionSlotsInUseForTest();
+ int baselineTrackedOperations = GetIoUringTrackedOperationCountForTest();
+
+ var clients = new List<Socket>(connectionCount);
+ var servers = new List<Socket>(connectionCount);
+ try
+ {
+ for (int i = 0; i < connectionCount; i++)
+ {
+ (Socket client, Socket server) = await AcceptConnectedTcpPairAsync(listener, endpoint);
+ clients.Add(client);
+ servers.Add(server);
+ }
+
+ var workers = new Task[connectionCount];
+ for (int i = 0; i < connectionCount; i++)
+ {
+ Socket client = clients[i];
+ Socket server = servers[i];
+ workers[i] = Task.Run(async () =>
+ {
+ byte[] sendBuffer = new byte[1];
+ byte[] receiveBuffer = new byte[1];
+ for (int iteration = 0; iteration < iterationsPerConnection; iteration++)
+ {
+ // Stress rapid slot reuse so generation mismatches surface as stuck operations
+ // rather than silently passing under low churn.
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ int sent = await client.SendAsync(sendBuffer, SocketFlags.None);
+ Assert.Equal(1, sent);
+
+ int received = await receiveTask;
+ Assert.Equal(1, received);
+ Assert.Equal(sendBuffer[0], receiveBuffer[0]);
+
+ unchecked
+ {
+ sendBuffer[0]++;
+ }
+ }
+ });
+ }
+
+ Task workerTask = Task.WhenAll(workers);
+ Task completed = await Task.WhenAny(workerTask, Task.Delay(TimeSpan.FromSeconds(60)));
+ Assert.Same(workerTask, completed);
+ await workerTask;
+ }
+ finally
+ {
+ foreach (Socket server in servers)
+ {
+ server.Dispose();
+ }
+
+ foreach (Socket client in clients)
+ {
+ client.Dispose();
+ }
+ }
+
+ Assert.True(
+ await WaitForIoUringCompletionSlotsInUseAtMostAsync(baselineCompletionSlotsInUse + 2, timeoutMilliseconds: 15000),
+ "Completion-slot usage remained elevated after ARM64 generation-transition stress.");
+ Assert.True(
+ await WaitForIoUringTrackedOperationsAtMostAsync(baselineTrackedOperations + 2, timeoutMilliseconds: 15000),
+ "Tracked-operation count remained elevated after ARM64 generation-transition stress.");
+ }
+
+ private static async Task RunGenerationWrapAroundDispatchScenarioAsync()
+ {
+ if (!IsIoUringMultishotRecvSupported())
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] receiveBuffer = new byte[1];
+ Task<int> armReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0x5C }, SocketFlags.None));
+ Assert.Equal(1, await armReceive);
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true),
+ "Expected persistent multishot recv to arm before generation-wrap dispatch validation.");
+
+ ulong activeUserData = GetPersistentMultishotRecvUserData(server);
+ Assert.NotEqual(0UL, activeUserData);
+ Assert.True(IsTrackedIoUringUserDataForTest(activeUserData), "Active multishot user_data should be tracked.");
+ Assert.True(TryDecodeCompletionSlotUserDataForTest(activeUserData, out int slotIndex, out ulong generation));
+
+ // Derive max generation from encoding mask and verify helper wrap contract.
+ ulong maxEncodedUserData = EncodeCompletionSlotUserDataForTest(slotIndex, ulong.MaxValue);
+ Assert.True(TryDecodeCompletionSlotUserDataForTest(maxEncodedUserData, out _, out ulong maxGeneration));
+ Assert.Equal(1UL, IncrementCompletionSlotGenerationForTest(maxGeneration));
+
+ ulong staleGeneration = IncrementCompletionSlotGenerationForTest(generation);
+ ulong staleUserData = EncodeCompletionSlotUserDataForTest(slotIndex, staleGeneration);
+ if (staleUserData == activeUserData)
+ {
+ staleUserData = EncodeCompletionSlotUserDataForTest(slotIndex, generation == 1UL ? 2UL : 1UL);
+ }
+
+ Assert.NotEqual(activeUserData, staleUserData);
+ Assert.False(
+ IsTrackedIoUringUserDataForTest(staleUserData),
+ "Stale wrapped-generation user_data should be rejected during dispatch lookup.");
+ Assert.True(IsTrackedIoUringUserDataForTest(activeUserData));
+ }
+
+ private static async Task RunBufferListSendRoundTripAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] payload = new byte[] { 0x11, 0x22, 0x33, 0x44, 0x55 };
+ var sendBuffers = new List<ArraySegment<byte>>
+ {
+ new ArraySegment<byte>(payload, 0, 2),
+ new ArraySegment<byte>(payload, 2, 1),
+ new ArraySegment<byte>(payload, 3, 2)
+ };
+
+ byte[] receiveBuffer = new byte[payload.Length];
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ int sent = await client.SendAsync(sendBuffers, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ Assert.Equal(payload.Length, await receiveTask);
+ Assert.Equal(payload, receiveBuffer);
+ }
+
+ private static async Task RunReceiveMessageFromRoundTripAsync()
+ {
+ using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+
+ receiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true);
+ receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ byte[] payload = new byte[] { 0x91, 0x92, 0x93 };
+ byte[] receiveBuffer = new byte[payload.Length];
+ EndPoint remoteEndPoint = new IPEndPoint(IPAddress.Any, 0);
+
+ var receiveTask = receiver.ReceiveMessageFromAsync(receiveBuffer, SocketFlags.None, remoteEndPoint);
+ await Task.Yield();
+
+ int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+ Assert.Equal(payload.Length, sent);
+
+ SocketReceiveMessageFromResult result = await receiveTask;
+ Assert.Equal(payload.Length, result.ReceivedBytes);
+ Assert.Equal(payload, receiveBuffer);
+ Assert.Equal(sender.LocalEndPoint, result.RemoteEndPoint);
+ }
+
+ private static async Task RunReceiveMessageFromPacketInformationRoundTripAsync(bool useIpv6)
+ {
+ if (useIpv6 && !Socket.OSSupportsIPv6)
+ {
+ return;
+ }
+
+ AddressFamily addressFamily = useIpv6 ? AddressFamily.InterNetworkV6 : AddressFamily.InterNetwork;
+ SocketOptionLevel optionLevel = useIpv6 ? SocketOptionLevel.IPv6 : SocketOptionLevel.IP;
+ IPAddress loopbackAddress = useIpv6 ? IPAddress.IPv6Loopback : IPAddress.Loopback;
+ IPAddress anyAddress = useIpv6 ? IPAddress.IPv6Any : IPAddress.Any;
+
+ using Socket receiver = new Socket(addressFamily, SocketType.Dgram, ProtocolType.Udp);
+ using Socket sender = new Socket(addressFamily, SocketType.Dgram, ProtocolType.Udp);
+
+ receiver.SetSocketOption(optionLevel, SocketOptionName.PacketInformation, true);
+ receiver.Bind(new IPEndPoint(loopbackAddress, 0));
+ sender.Bind(new IPEndPoint(loopbackAddress, 0));
+
+ byte[] payload = useIpv6 ?
+ new byte[] { 0xA1, 0xA2, 0xA3 } :
+ new byte[] { 0x90, 0x91, 0x92, 0x93 };
+ byte[] receiveBuffer = new byte[payload.Length];
+ EndPoint remoteEndPoint = new IPEndPoint(anyAddress, 0);
+
+ Task<SocketReceiveMessageFromResult> receiveTask =
+ ToTask(receiver.ReceiveMessageFromAsync(receiveBuffer, SocketFlags.None, remoteEndPoint));
+ await Task.Yield();
+
+ int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+ Assert.Equal(payload.Length, sent);
+
+ SocketReceiveMessageFromResult result = await receiveTask;
+ Assert.Equal(payload.Length, result.ReceivedBytes);
+ Assert.Equal(payload, receiveBuffer);
+ Assert.Equal(sender.LocalEndPoint, result.RemoteEndPoint);
+ Assert.Equal(((IPEndPoint)sender.LocalEndPoint!).Address, result.PacketInformation.Address);
+ }
+
+ private static async Task RunNonPinnableMemorySendFallbackScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] payload = new byte[] { 0x71, 0x72, 0x73, 0x74 };
+ using var nonPinnableMemory = new NonPinnableMemoryManager(payload);
+ byte[] receiveBuffer = new byte[payload.Length];
+
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ int sent = await client.SendAsync(nonPinnableMemory.Memory, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ Assert.Equal(payload.Length, await receiveTask);
+ Assert.Equal(payload, receiveBuffer);
+ }
+
+ private static async Task RunNonPinnableMemoryReceiveFallbackScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] receiveBuffer = new byte[4];
+ using var nonPinnableMemory = new NonPinnableMemoryManager(receiveBuffer);
+ byte[] payload = new byte[] { 0x81, 0x82, 0x83, 0x84 };
+
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(nonPinnableMemory.Memory, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(payload.Length, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(payload.Length, await receiveTask);
+ Assert.Equal(payload, receiveBuffer);
+ }
+
+ private static Task RunNonPinnableMemoryFallbackScenarioAsync(bool receivePath) =>
+ receivePath ? RunNonPinnableMemoryReceiveFallbackScenarioAsync() : RunNonPinnableMemorySendFallbackScenarioAsync();
+
+ private static async Task RunNonPinnableFallbackTelemetryScenarioAsync()
+ {
+ long before = 0;
+ long after = 0;
+
+ await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+ async () =>
+ {
+ before = GetIoUringPrepareNonPinnableFallbackCounterValue();
+ await RunNonPinnableMemorySendFallbackScenarioAsync();
+ await RunNonPinnableMemoryReceiveFallbackScenarioAsync();
+ after = GetIoUringPrepareNonPinnableFallbackCounterValue();
+ },
+ (_, _) =>
+ {
+ Assert.True(
+ after > before,
+ $"Expected io_uring non-pinnable fallback telemetry to increase. before={before}, after={after}");
+ },
+ skipScenarioWhenIoUringUnavailable: true);
+ }
+
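+ // Covers all three pin-release paths for pinnable memory handed to io_uring:
+ // normal completion, token cancellation, and teardown via socket disposal.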
+ private static async Task RunPinnableMemoryPinReleaseLifecycleScenarioAsync()
+ {
+ await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+ async () =>
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ // Completion path: receive completes with data and must release pin.
+ byte[] completionPayload = new byte[] { 0x91 };
+ using var completionMemory = new TrackingPinnableMemoryManager(new byte[completionPayload.Length]);
+ Task<int> completionReceive = ToTask(server.ReceiveAsync(completionMemory.Memory, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(completionPayload, SocketFlags.None));
+ Assert.Equal(1, await completionReceive);
+ Assert.Equal(completionPayload, completionMemory.GetSpan().ToArray());
+ await AssertPinsReleasedAsync(completionMemory);
+
+ // Cancellation path: pending receive canceled by token must release pin.
+ using var cancellationMemory = new TrackingPinnableMemoryManager(new byte[16]);
+ using (var cts = new CancellationTokenSource())
+ {
+ Task canceledReceive = ToTask(server.ReceiveAsync(cancellationMemory.Memory, SocketFlags.None, cts.Token));
+ await Task.Delay(20);
+ cts.Cancel();
+
+ Exception? canceledException = await Record.ExceptionAsync(async () => await canceledReceive);
+ AssertCanceledOrInterrupted(canceledException);
+ }
+
+ await AssertPinsReleasedAsync(cancellationMemory);
+
+ // Teardown/abort path: pending receive interrupted by close must release pin.
+ using var teardownMemory = new TrackingPinnableMemoryManager(new byte[16]);
+ Task teardownReceive = ToTask(server.ReceiveAsync(teardownMemory.Memory, SocketFlags.None));
+ await Task.Yield();
+ client.Dispose();
+ server.Dispose();
+
+ Exception? teardownException = await Record.ExceptionAsync(async () => await teardownReceive);
+ AssertCanceledDisposedOrInterrupted(teardownException);
+ await AssertPinsReleasedAsync(teardownMemory);
+ },
+ static (_, _) => { },
+ skipScenarioWhenIoUringUnavailable: true);
+ }
+
+ private static async Task RunProvidedBufferRegistrationLifecycleScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] receiveBuffer = new byte[1];
+ Task<int> initialReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xA1 }, SocketFlags.None));
+ Assert.Equal(1, await initialReceive);
+
+ IoUringProvidedBufferSnapshot initialSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsProvidedBufferSnapshotUsable(initialSnapshot))
+ {
+ return;
+ }
+
+ Assert.Equal(initialSnapshot.TotalBufferCount, initialSnapshot.AvailableCount + initialSnapshot.InUseCount);
+ Assert.Equal(0, initialSnapshot.InUseCount);
+
+ using (var cts = new CancellationTokenSource())
+ {
+ Task canceledReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None, cts.Token));
+ await Task.Yield();
+ cts.Cancel();
+
+ Exception? canceledException = await Record.ExceptionAsync(async () => await canceledReceive);
+ AssertCanceledOrInterrupted(canceledException);
+ }
+
+ await Task.Delay(50);
+ IoUringProvidedBufferSnapshot postCancellationSnapshot = GetIoUringProvidedBufferSnapshot();
+ Assert.Equal(initialSnapshot.TotalBufferCount, postCancellationSnapshot.TotalBufferCount);
+ Assert.Equal(postCancellationSnapshot.TotalBufferCount, postCancellationSnapshot.AvailableCount + postCancellationSnapshot.InUseCount);
+ Assert.Equal(0, postCancellationSnapshot.InUseCount);
+ }
+
+ private static async Task RunProvidedBufferSelectReceiveScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsProvidedBufferSnapshotUsable(beforeSnapshot))
+ {
+ return;
+ }
+
+ ulong recycleBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
+ ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+
+ byte[] receiveBuffer = new byte[1];
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xB2 }, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+ Assert.Equal(0xB2, receiveBuffer[0]);
+
+ IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
+ ulong recycleAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
+ ulong depletionAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+
+ Assert.True(recycleAfter > recycleBefore, "Expected provided-buffer recycle counter to increase after a completion.");
+ Assert.Equal(depletionBefore, depletionAfter);
+ Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount + afterSnapshot.InUseCount);
+ Assert.Equal(0, afterSnapshot.InUseCount);
+ }
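+ // Drives more receive completions than the ring has buffers, so every completion must
+ // recycle its buffer back to the ring; allocation-failure and depletion counters must stay flat.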
+
+ private static async Task RunProvidedBufferRecycleReuseScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsProvidedBufferSnapshotUsable(beforeSnapshot))
+ {
+ return;
+ }
+
+ ulong recycleBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
+ ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+ long allocationFailuresBefore = beforeSnapshot.AllocationFailureCount;
+
+ int iterations = Math.Max(beforeSnapshot.TotalBufferCount + 64, 512);
+ byte[] receiveBuffer = new byte[1];
+ byte[] payload = new byte[1];
+
+ for (int i = 0; i < iterations; i++)
+ {
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ payload[0] = unchecked((byte)i);
+ Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+ Assert.Equal(payload[0], receiveBuffer[0]);
+ }
+
+ IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
+ ulong recycleAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
+ ulong depletionAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+
+ Assert.True(
+ recycleAfter >= recycleBefore + (ulong)iterations,
+ $"Expected at least {iterations} provided-buffer recycle increments. before={recycleBefore}, after={recycleAfter}");
+ Assert.Equal(depletionBefore, depletionAfter);
+ Assert.Equal(allocationFailuresBefore, afterSnapshot.AllocationFailureCount);
+ Assert.Equal(beforeSnapshot.TotalBufferCount, afterSnapshot.TotalBufferCount);
+ Assert.Equal(0, afterSnapshot.InUseCount);
+ Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount);
+ }
+
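+ // Forces the provided-buffer ring empty via a test hook, then verifies the next receive
+ // fails with NoBufferSpaceAvailable and the depletion counter increments.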
+ private static async Task RunProvidedBufferExhaustionScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] warmupBuffer = new byte[1];
+ Task<int> warmupReceive = ToTask(server.ReceiveAsync(warmupBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xC1 }, SocketFlags.None));
+ Assert.Equal(1, await warmupReceive);
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsProvidedBufferSnapshotUsable(snapshot))
+ {
+ return;
+ }
+
+ ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+ Assert.True(TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount));
+ Assert.True(forcedBufferCount > 0);
+
+ byte[] receiveBuffer = new byte[1];
+ Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xC2 }, SocketFlags.None));
+ Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.Same(receiveTask, completed);
+
+ Exception? receiveException = await Record.ExceptionAsync(async () => await receiveTask);
+ SocketException socketException = Assert.IsType<SocketException>(receiveException);
+ Assert.Equal(SocketError.NoBufferSpaceAvailable, socketException.SocketErrorCode);
+ Assert.True(
+ GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions") > depletionBefore,
+ "Expected provided-buffer depletion counter to increase when ring buffers are forced unavailable.");
+ }
+
+ private static async Task RunProvidedBufferMixedWorkloadScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsProvidedBufferSnapshotUsable(beforeSnapshot))
+ {
+ return;
+ }
+
+ using Socket udpReceiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ using Socket udpSender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ udpReceiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ udpSender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ ulong recycleBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
+ ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+
+ byte[] tcpReceiveBuffer = new byte[1];
+ byte[] udpReceiveBuffer = new byte[2];
+
+ Task<int> tcpReceive = ToTask(server.ReceiveAsync(tcpReceiveBuffer, SocketFlags.None));
+ Task<SocketReceiveFromResult> udpReceive = ToTask(
+ udpReceiver.ReceiveFromAsync(
+ udpReceiveBuffer,
+ SocketFlags.None,
+ new IPEndPoint(IPAddress.Any, 0)));
+ await Task.Yield();
+
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0xD1 }, SocketFlags.None));
+ Assert.Equal(2, await udpSender.SendToAsync(new byte[] { 0xE1, 0xE2 }, SocketFlags.None, udpReceiver.LocalEndPoint!));
+
+ Assert.Equal(1, await tcpReceive);
+ Assert.Equal(0xD1, tcpReceiveBuffer[0]);
+
+ SocketReceiveFromResult udpResult = await udpReceive;
+ Assert.Equal(2, udpResult.ReceivedBytes);
+ Assert.Equal(0xE1, udpReceiveBuffer[0]);
+ Assert.Equal(0xE2, udpReceiveBuffer[1]);
+
+ IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
+ ulong recycleAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles");
+ ulong depletionAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions");
+
+ Assert.True(recycleAfter > recycleBefore, "Expected provided-buffer recycle counter to increase in mixed workload.");
+ Assert.Equal(depletionBefore, depletionAfter);
+ Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount + afterSnapshot.InUseCount);
+ Assert.Equal(0, afterSnapshot.InUseCount);
+ }
+
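+ // Loops SendAsync until the whole buffer is transmitted; TCP may accept the payload in several partial sends.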
+ private static async Task SendExactlyAsync(Socket socket, ReadOnlyMemory<byte> buffer)
+ {
+ int totalSent = 0;
+ while (totalSent < buffer.Length)
+ {
+ int sent = await socket.SendAsync(buffer.Slice(totalSent), SocketFlags.None);
+ Assert.True(sent > 0, "Socket.SendAsync returned 0 before sending all bytes.");
+ totalSent += sent;
+ }
+ }
+
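+ // Loops ReceiveAsync until the buffer is full; TCP may deliver the payload in several chunks.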
+ private static async Task ReceiveExactlyAsync(Socket socket, Memory<byte> buffer)
+ {
+ int totalReceived = 0;
+ while (totalReceived < buffer.Length)
+ {
+ int received = await socket.ReceiveAsync(buffer.Slice(totalReceived), SocketFlags.None);
+ Assert.True(received > 0, "Socket.ReceiveAsync returned 0 before receiving all expected bytes.");
+ totalReceived += received;
+ }
+ }
+
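+ // Polls the provided-buffer snapshot until the predicate holds or the timeout elapses,
+ // returning the last observed snapshot either way so callers can assert with context.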
+ private static async Task<IoUringProvidedBufferSnapshot> WaitForProvidedBufferSnapshotAsync(
+ Func<IoUringProvidedBufferSnapshot, bool> predicate,
+ int timeoutMilliseconds = 10000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ while (DateTime.UtcNow < deadline)
+ {
+ if (predicate(snapshot))
+ {
+ return snapshot;
+ }
+
+ await Task.Delay(50);
+ snapshot = GetIoUringProvidedBufferSnapshot();
+ }
+
+ return snapshot;
+ }
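+ // A sustained stream of small (64-byte) messages should push the adaptive sizing heuristic
+ // to recommend, or apply, a smaller buffer size than the initial one.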
+
+ private static async Task RunAdaptiveProvidedBufferSmallMessageShrinkScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsAdaptiveSizingUsable(beforeSnapshot))
+ {
+ return;
+ }
+
+ int initialBufferSize = beforeSnapshot.BufferSize;
+ Assert.True(initialBufferSize > 0);
+
+ const int payloadSize = 64;
+ byte[] sendBuffer = new byte[payloadSize];
+ byte[] receiveBuffer = new byte[payloadSize];
+
+ for (int i = 0; i < 320; i++)
+ {
+ sendBuffer.AsSpan().Fill(unchecked((byte)i));
+ Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer);
+ await SendExactlyAsync(client, sendBuffer);
+ await receiveTask;
+ Assert.Equal(sendBuffer, receiveBuffer);
+ }
+
+ IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
+ snapshot => IsAdaptiveSizingUsable(snapshot) &&
+ (snapshot.RecommendedBufferSize < initialBufferSize || snapshot.BufferSize < initialBufferSize));
+
+ Assert.True(
+ afterSnapshot.RecommendedBufferSize < initialBufferSize || afterSnapshot.BufferSize < initialBufferSize,
+ $"Expected adaptive recommendation to shrink from {initialBufferSize}. " +
+ $"actual buffer={afterSnapshot.BufferSize}, recommended={afterSnapshot.RecommendedBufferSize}");
+ }
+
+ private static async Task RunAdaptiveProvidedBufferLargeMessageGrowScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsAdaptiveSizingUsable(beforeSnapshot))
+ {
+ return;
+ }
+
+ int initialBufferSize = beforeSnapshot.BufferSize;
+ Assert.True(initialBufferSize > 0);
+
+ int payloadSize = initialBufferSize;
+ byte[] sendBuffer = new byte[payloadSize];
+ byte[] receiveBuffer = new byte[payloadSize];
+ sendBuffer.AsSpan().Fill(0x5A);
+
+ for (int i = 0; i < 320; i++)
+ {
+ Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer);
+ await SendExactlyAsync(client, sendBuffer);
+ await receiveTask;
+ Assert.Equal(sendBuffer, receiveBuffer);
+ }
+
+ IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
+ snapshot => IsAdaptiveSizingUsable(snapshot) &&
+ (snapshot.RecommendedBufferSize > initialBufferSize || snapshot.BufferSize > initialBufferSize));
+
+ Assert.True(
+ afterSnapshot.RecommendedBufferSize > initialBufferSize || afterSnapshot.BufferSize > initialBufferSize,
+ $"Expected adaptive recommendation to grow from {initialBufferSize}. " +
+ $"actual buffer={afterSnapshot.BufferSize}, recommended={afterSnapshot.RecommendedBufferSize}");
+ }
+
+ private static async Task RunAdaptiveProvidedBufferMixedWorkloadStableScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsAdaptiveSizingUsable(beforeSnapshot))
+ {
+ return;
+ }
+
+ int initialBufferSize = beforeSnapshot.BufferSize;
+ Assert.True(initialBufferSize > 0);
+
+ byte[] smallSend = new byte[64];
+ byte[] smallReceive = new byte[64];
+ byte[] largeSend = new byte[initialBufferSize];
+ byte[] largeReceive = new byte[initialBufferSize];
+ smallSend.AsSpan().Fill(0x11);
+ largeSend.AsSpan().Fill(0x77);
+
+ for (int i = 0; i < 320; i++)
+ {
+ bool useLarge = (i & 1) == 1;
+ byte[] send = useLarge ? largeSend : smallSend;
+ byte[] receive = useLarge ? largeReceive : smallReceive;
+
+ Task receiveTask = ReceiveExactlyAsync(server, receive);
+ await SendExactlyAsync(client, send);
+ await receiveTask;
+ Assert.Equal(send, receive);
+ }
+
+ await Task.Delay(250);
+ IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
+ Assert.True(IsAdaptiveSizingUsable(afterSnapshot));
+ Assert.Equal(initialBufferSize, afterSnapshot.RecommendedBufferSize);
+ }
+
+ private static async Task RunAdaptiveProvidedBufferResizeSwapNoDataLossScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsAdaptiveSizingUsable(beforeSnapshot))
+ {
+ return;
+ }
+
+ int initialBufferSize = beforeSnapshot.BufferSize;
+ Assert.True(initialBufferSize > 0);
+
+ const int payloadSize = 64;
+ byte[] sendBuffer = new byte[payloadSize];
+ byte[] receiveBuffer = new byte[payloadSize];
+ for (int i = 0; i < 384; i++)
+ {
+ sendBuffer.AsSpan().Fill(unchecked((byte)i));
+ Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer);
+ await SendExactlyAsync(client, sendBuffer);
+ await receiveTask;
+ Assert.Equal(sendBuffer, receiveBuffer);
+ }
+
+ IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
+ snapshot => IsAdaptiveSizingUsable(snapshot) && snapshot.BufferSize < initialBufferSize,
+ timeoutMilliseconds: 15000);
+
+ Assert.True(
+ afterSnapshot.BufferSize < initialBufferSize,
+ $"Expected adaptive resize swap to shrink active ring. initial={initialBufferSize}, current={afterSnapshot.BufferSize}");
+ }
+
+ private static async Task RunAdaptiveProvidedBufferResizeSwapConcurrentInFlightNoDataLossScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsAdaptiveSizingUsable(beforeSnapshot))
+ {
+ return;
+ }
+
+ int initialBufferSize = beforeSnapshot.BufferSize;
+ Assert.True(initialBufferSize > 0);
+
+ const int batchSize = 64;
+ const int rounds = 24;
+
+ // Keep many receives in flight while driving enough completions to trigger adaptive
+ // resize; this exercises ring-swap safety under concurrent tracked receive activity.
+ for (int round = 0; round < rounds; round++)
+ {
+ Task<int>[] receiveTasks = new Task<int>[batchSize];
+ byte[][] receiveBuffers = new byte[batchSize][];
+ for (int i = 0; i < batchSize; i++)
+ {
+ byte[] receiveBuffer = new byte[1];
+ receiveBuffers[i] = receiveBuffer;
+ receiveTasks[i] = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ }
+
+ await Task.Yield();
+
+ for (int i = 0; i < batchSize; i++)
+ {
+ byte expected = unchecked((byte)(round + i + 1));
+ Assert.Equal(1, await client.SendAsync(new[] { expected }, SocketFlags.None));
+ }
+
+ int[] completed = await Task.WhenAll(receiveTasks);
+ for (int i = 0; i < batchSize; i++)
+ {
+ Assert.Equal(1, completed[i]);
+ Assert.NotEqual(0, receiveBuffers[i][0]);
+ }
+ }
+
+ IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
+ snapshot => IsAdaptiveSizingUsable(snapshot) && snapshot.BufferSize < initialBufferSize,
+ timeoutMilliseconds: 15000);
+
+ Assert.True(
+ afterSnapshot.BufferSize < initialBufferSize,
+ $"Expected adaptive resize swap to shrink active ring under in-flight receive stress. initial={initialBufferSize}, current={afterSnapshot.BufferSize}");
+ Assert.Equal(0, afterSnapshot.InUseCount);
+ Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount + afterSnapshot.InUseCount);
+ }
+
+ private static async Task RunAdaptiveProvidedBufferDisabledScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsProvidedBufferSnapshotUsable(beforeSnapshot))
+ {
+ return;
+ }
+
+ Assert.False(beforeSnapshot.AdaptiveBufferSizingEnabled);
+
+ int initialBufferSize = beforeSnapshot.BufferSize;
+ int initialRecommendedSize = beforeSnapshot.RecommendedBufferSize;
+
+ const int payloadSize = 64;
+ byte[] sendBuffer = new byte[payloadSize];
+ byte[] receiveBuffer = new byte[payloadSize];
+ sendBuffer.AsSpan().Fill(0xA5);
+
+ for (int i = 0; i < 320; i++)
+ {
+ Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer);
+ await SendExactlyAsync(client, sendBuffer);
+ await receiveTask;
+ Assert.Equal(sendBuffer, receiveBuffer);
+ }
+
+ await Task.Delay(250);
+ IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot();
+ Assert.True(IsProvidedBufferSnapshotUsable(afterSnapshot));
+ Assert.False(afterSnapshot.AdaptiveBufferSizingEnabled);
+ Assert.Equal(initialBufferSize, afterSnapshot.BufferSize);
+ Assert.Equal(initialRecommendedSize, afterSnapshot.RecommendedBufferSize);
+ }
+
+ private static async Task RunAdaptiveProvidedBufferSizingStateScenarioAsync(bool expectedEnabled)
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ // Warm up receive path so io_uring provided-buffer ring state is initialized.
+ byte[] receiveBuffer = new byte[1];
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsProvidedBufferSnapshotUsable(snapshot))
+ {
+ return;
+ }
+
+ Assert.Equal(expectedEnabled, snapshot.AdaptiveBufferSizingEnabled);
+ }
+
+ private static async Task RunProvidedBufferKernelRegistrationDisabledScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ // Warm up receive path so io_uring provided-buffer ring state is initialized.
+ byte[] receiveBuffer = new byte[1];
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsProvidedBufferSnapshotUsable(snapshot))
+ {
+ return;
+ }
+
+ Assert.False(snapshot.HasRegisteredBuffers);
+ }
+
+ private static async Task RunProvidedBufferKernelRegistrationSuccessScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ // Warm up receive path so io_uring provided-buffer ring state and telemetry are initialized.
+ byte[] receiveBuffer = new byte[1];
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsProvidedBufferSnapshotUsable(snapshot))
+ {
+ return;
+ }
+
+ ulong successCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialSuccess");
+ ulong failureCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialFailure");
+ Assert.True(
+ successCount + failureCount > 0,
+ "Expected at least one registered-buffer initialization attempt.");
+
+ // Best-effort success-path assertion: only enforce when registration succeeded on this machine.
+ if (!snapshot.HasRegisteredBuffers)
+ {
+ return;
+ }
+
+ Assert.True(successCount > 0, "Expected success telemetry when registered buffers are active.");
+ }
+
+ private static async Task RunProvidedBufferKernelRegistrationFailureNonFatalScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ // Warm up receive path so io_uring provided-buffer ring state and telemetry are initialized.
+ byte[] receiveBuffer = new byte[1];
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None));
+ Assert.Equal(1, await receiveTask);
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsProvidedBufferSnapshotUsable(snapshot) || snapshot.HasRegisteredBuffers)
+ {
+ // No observed registration failure in this environment.
+ return;
+ }
+
+ // Registration is not active: verify provided-buffer receive path still works.
+ byte[] payload = new byte[4096];
+ byte[] received = new byte[payload.Length];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i + 31));
+ }
+
+ Task receiveAllTask = ReceiveExactlyAsync(server, received);
+ await SendExactlyAsync(client, payload);
+ await receiveAllTask;
+ Assert.Equal(payload, received);
+
+ ulong failureCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialFailure");
+ Assert.True(failureCount > 0, "Expected failure telemetry when registered buffers are inactive.");
+ }
+
+ private static async Task RunProvidedBufferKernelReregistrationOnResizeScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsAdaptiveSizingUsable(beforeSnapshot))
+ {
+ return;
+ }
+
+ ulong reregSuccessBefore = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationSuccess");
+ ulong reregFailureBefore = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationFailure");
+
+ int initialBufferSize = beforeSnapshot.BufferSize;
+ Assert.True(initialBufferSize > 0);
+
+ const int payloadSize = 64;
+ byte[] sendBuffer = new byte[payloadSize];
+ byte[] receiveBuffer = new byte[payloadSize];
+ for (int i = 0; i < 384; i++)
+ {
+ sendBuffer.AsSpan().Fill(unchecked((byte)(i + 1)));
+ Task receivePayloadTask = ReceiveExactlyAsync(server, receiveBuffer);
+ await SendExactlyAsync(client, sendBuffer);
+ await receivePayloadTask;
+ Assert.Equal(sendBuffer, receiveBuffer);
+ }
+
+ IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync(
+ snapshot => IsAdaptiveSizingUsable(snapshot) && snapshot.BufferSize < initialBufferSize,
+ timeoutMilliseconds: 15000);
+
+ Assert.True(afterSnapshot.BufferSize < initialBufferSize);
+
+ ulong reregSuccessAfter = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationSuccess");
+ ulong reregFailureAfter = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationFailure");
+ Assert.True(
+ (reregSuccessAfter + reregFailureAfter) > (reregSuccessBefore + reregFailureBefore),
+ "Expected at least one registered-buffer re-registration attempt after adaptive resize.");
+ }
+
+ private static async Task RunProvidedBufferRegisteredBuffersDataCorrectnessScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsProvidedBufferSnapshotUsable(snapshot) || !snapshot.HasRegisteredBuffers)
+ {
+ return;
+ }
+
+ // Reuse the mixed workload profile to validate payload correctness with registered buffers active.
+ byte[] smallSend = new byte[64];
+ byte[] largeSend = new byte[Math.Max(snapshot.BufferSize, 4096)];
+ byte[] smallReceive = new byte[smallSend.Length];
+ byte[] largeReceive = new byte[largeSend.Length];
+
+ for (int i = 0; i < 64; i++)
+ {
+ smallSend.AsSpan().Fill(unchecked((byte)(i + 5)));
+ largeSend.AsSpan().Fill(unchecked((byte)(i + 11)));
+
+ Task smallReceiveTask = ReceiveExactlyAsync(server, smallReceive);
+ await SendExactlyAsync(client, smallSend);
+ await smallReceiveTask;
+ Assert.Equal(smallSend, smallReceive);
+
+ Task largeReceiveTask = ReceiveExactlyAsync(server, largeReceive);
+ await SendExactlyAsync(client, largeSend);
+ await largeReceiveTask;
+ Assert.Equal(largeSend, largeReceive);
+ }
+ }
+
+ private static async Task RunProvidedBufferRegistrationMemoryPressureScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!IsProvidedBufferSnapshotUsable(snapshot))
+ {
+ return;
+ }
+
+ int payloadSize = Math.Min(snapshot.BufferSize, 16 * 1024);
+ payloadSize = Math.Max(payloadSize, 1024);
+ byte[] payload = new byte[payloadSize];
+ byte[] received = new byte[payloadSize];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i + 41));
+ }
+
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ await SendExactlyAsync(client, payload);
+ await receiveTask;
+ Assert.Equal(payload, received);
+
+ ulong successCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialSuccess");
+ ulong failureCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialFailure");
+ if (snapshot.HasRegisteredBuffers)
+ {
+ Assert.True(successCount > 0, "Expected successful registration telemetry when buffers are registered.");
+ }
+ else
+ {
+ Assert.True(failureCount > 0, "Expected failure telemetry when registration falls back under pressure.");
+ }
+ }
+
+ private static async Task RunProvidedBufferRingForcedAllocationFailureFallbackScenarioAsync()
+ {
+ await RunTcpRoundTripAsync(4);
+
+ IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot();
+ if (!snapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ Assert.False(snapshot.HasProvidedBufferRing, "Provided-buffer ring should be disabled after forced allocation failure.");
+ Assert.False(snapshot.SupportsProvidedBufferRings, "Capability should remain disabled when provided-buffer ring creation fails.");
+
+ // Ensure sockets continue to function after provided-buffer OOM fallback.
+ await RunTcpRoundTripAsync(4);
+ }
+
+ private static Task RunProvidedBufferTeardownOrderingContractScenarioAsync()
+ {
+ Assert.True(
+ SocketAsyncEngine.ValidateIoUringProvidedBufferTeardownOrderingForTest(),
+ "Expected teardown to unregister/dispose provided buffers before ring unmap/close.");
+
+ return Task.CompletedTask;
+ }
+
+ private static async Task RunZeroCopySendStateScenarioAsync(bool expectedEnabledWhenSupported)
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] sendBuffer = new byte[64];
+ byte[] receiveBuffer = new byte[sendBuffer.Length];
+ Assert.Equal(sendBuffer.Length, await client.SendAsync(sendBuffer, SocketFlags.None));
+ await ReceiveExactlyAsync(server, receiveBuffer);
+
+ IoUringZeroCopySendSnapshot snapshot = GetIoUringZeroCopySendSnapshot();
+ if (!snapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ if (!snapshot.SupportsSendZc)
+ {
+ Assert.False(snapshot.ZeroCopySendEnabled);
+ return;
+ }
+
+ Assert.Equal(expectedEnabledWhenSupported, snapshot.ZeroCopySendEnabled);
+ }
+
+ private static async Task RunFixedRecvStateScenarioAsync(bool expectedEnabled)
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] sendBuffer = new byte[64];
+ byte[] receiveBuffer = new byte[sendBuffer.Length];
+ Assert.Equal(sendBuffer.Length, await client.SendAsync(sendBuffer, SocketFlags.None));
+ await ReceiveExactlyAsync(server, receiveBuffer);
+
+ IoUringFixedRecvSnapshot snapshot = GetIoUringFixedRecvSnapshot();
+ if (!snapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ Assert.Equal(expectedEnabled, IsFixedRecvEnabled(snapshot));
+ }
+
+ private static async Task RunFixedRecvActivationFollowsRuntimeCapabilitiesScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] sendBuffer = new byte[64];
+ byte[] receiveBuffer = new byte[sendBuffer.Length];
+ Assert.Equal(sendBuffer.Length, await client.SendAsync(sendBuffer, SocketFlags.None));
+ await ReceiveExactlyAsync(server, receiveBuffer);
+
+ IoUringFixedRecvSnapshot snapshot = GetIoUringFixedRecvSnapshot();
+ if (!snapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ Assert.Equal(snapshot.SupportsReadFixed && snapshot.HasRegisteredBuffers, IsFixedRecvEnabled(snapshot));
+ }
+
+ private static async Task RunFixedRecvDataCorrectnessScenarioAsync()
+ {
+ IoUringFixedRecvSnapshot snapshot = GetIoUringFixedRecvSnapshot();
+ if (!snapshot.HasIoUringPort || !IsFixedRecvEnabled(snapshot) || !snapshot.SupportsReadFixed || !snapshot.HasRegisteredBuffers)
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+
+ byte[] payload = new byte[32 * 1024];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i * 13));
+ }
+
+ byte[] received = new byte[payload.Length];
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ Assert.Equal(payload.Length, await client.SendAsync(payload, SocketFlags.None));
+ await receiveTask;
+ Assert.Equal(payload, received);
+ }
+
+ private static async Task RunSqPollBasicSendReceiveScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(8);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!IsSqPollActive(snapshot))
+ {
+ return;
+ }
+
+ await RunTcpRoundTripAsync(16);
+ }
+
+ private static async Task RunDeferTaskrunEventLoopInitScenarioAsync()
+ {
+ // TCP round-trips exercise io_uring_enter from the event loop thread.
+ // With DEFER_TASKRUN + SINGLE_ISSUER, the kernel checks that io_uring_enter
+ // is called from the same thread that called io_uring_setup (submitter_task).
+ // If initialization ran on the wrong thread, io_uring_enter would return
+ // EEXIST and all operations would fail.
+ await RunTcpRoundTripAsync(8);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!snapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ // Non-SQPOLL engines negotiate DEFER_TASKRUN by default.
+ Assert.False(
+ snapshot.SqPollEnabled,
+ "SQPOLL should be disabled in the default DEFER_TASKRUN initialization scenario.");
+ Assert.True(
+ snapshot.DeferTaskrunEnabled,
+ "Non-SQPOLL io_uring engines should negotiate DEFER_TASKRUN.");
+
+ // Additional round-trips after the assertion to confirm ongoing stability.
+ await RunTcpRoundTripAsync(8);
+ }
+
+ private static async Task RunSqPollRequestedScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(8);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ // Some Helix legs can run without an active io_uring port (kernel/config/runtime gating).
+ // In that case this SQPOLL-request scenario is not applicable.
+ if (!snapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ if (!snapshot.SqPollEnabled)
+ {
+ // SQPOLL wasn't active on this leg, but socket operations must continue to succeed.
+ await RunTcpRoundTripAsync(16);
+ return;
+ }
+
+ Assert.False(
+ snapshot.DeferTaskrunEnabled,
+ "SQPOLL and DEFER_TASKRUN must be mutually exclusive in negotiated io_uring setup flags.");
+ }
+
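+ // Once the kernel SQPOLL thread idles it sets SQ_NEED_WAKEUP, and the next submission must
+ // wake it through io_uring_enter; the wakeup counter should record that transition.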
+ private static async Task RunSqPollWakeupAfterIdleScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(4);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!IsSqPollActive(snapshot))
+ {
+ return;
+ }
+
+ ulong wakeupsBefore = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups");
+
+ // Let the kernel SQPOLL thread go idle and set SQ_NEED_WAKEUP.
+ bool observedNeedWakeup = false;
+ for (int i = 0; i < 25; i++)
+ {
+ await Task.Delay(100);
+ if (IsAnyIoUringSqPollEngineNeedingWakeup())
+ {
+ observedNeedWakeup = true;
+ break;
+ }
+ }
+
+ if (!observedNeedWakeup)
+ {
+ return;
+ }
+
+ await RunTcpRoundTripAsync(2);
+
+ ulong wakeupsAfter = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups");
+ Assert.True(
+ wakeupsAfter > wakeupsBefore,
+ $"Expected SQPOLL wakeups to increase after idle wake path. before={wakeupsBefore}, after={wakeupsAfter}");
+ }
+
+ private static async Task RunSqPollMultishotRecvScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(4);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!IsSqPollActive(snapshot))
+ {
+ return;
+ }
+
+ await RunMultishotRecvBasicScenarioAsync(iterations: 32);
+ }
+
+ private static async Task RunSqPollZeroCopySendScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(4);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!IsSqPollActive(snapshot))
+ {
+ return;
+ }
+
+ await RunZeroCopySendLargeBufferRoundTripScenarioAsync();
+ }
+
+ private static async Task RunSqPollTelemetryCountersScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(4);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!IsSqPollActive(snapshot))
+ {
+ return;
+ }
+
+ ulong skippedBefore = GetIoUringTelemetryCounterValue("_ioUringSqPollSubmissionsSkipped");
+ ulong wakeupsBefore = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups");
+
+ await RunTcpRoundTripAsync(32);
+ ulong skippedAfterBurst = GetIoUringTelemetryCounterValue("_ioUringSqPollSubmissionsSkipped");
+ Assert.True(
+ skippedAfterBurst > skippedBefore,
+ $"Expected SQPOLL submission-skipped counter to increase. before={skippedBefore}, after={skippedAfterBurst}");
+
+ await Task.Delay(1500);
+ await RunTcpRoundTripAsync(2);
+
+ ulong wakeupsAfter = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups");
+ Assert.True(
+ wakeupsAfter >= wakeupsBefore,
+ $"Expected SQPOLL wakeup counter to be readable/nondecreasing. before={wakeupsBefore}, after={wakeupsAfter}");
+ }
+
+ private static async Task RunSqPollNeedWakeupContractScenarioAsync()
+ {
+ EnableSqPollAppContextOptIn();
+ await RunTcpRoundTripAsync(4);
+
+ IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot();
+ if (!IsSqPollActive(snapshot))
+ {
+ return;
+ }
+
+ Assert.True(
+ ValidateSqNeedWakeupMatchesRawSqFlagBit(),
+ "Expected at least one active SQPOLL io_uring engine for SqNeedWakeup contract validation.");
+ }
+
+ private static bool IsZeroCopySendEnabledAndSupported(out IoUringZeroCopySendSnapshot snapshot)
+ {
+ snapshot = GetIoUringZeroCopySendSnapshot();
+ return snapshot.HasIoUringPort && snapshot.SupportsSendZc && snapshot.ZeroCopySendEnabled;
+ }
+
+ private static bool IsZeroCopySendMessageEnabledAndSupported(out IoUringZeroCopySendSnapshot snapshot)
+ {
+ snapshot = GetIoUringZeroCopySendSnapshot();
+ return snapshot.HasIoUringPort && snapshot.SupportsSendMsgZc && snapshot.ZeroCopySendEnabled;
+ }
+
+ private static async Task RunZeroCopySendLargeBufferRoundTripScenarioAsync()
+ {
+ if (!IsZeroCopySendEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+
+ byte[] payload = new byte[64 * 1024];
+ byte[] received = new byte[payload.Length];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)i);
+ }
+
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ int sent = await client.SendAsync(payload, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ await receiveTask;
+ Assert.Equal(payload, received);
+ }
+
+ private static async Task RunZeroCopySendSmallBufferUsesRegularSendWithForcedSendErrorScenarioAsync()
+ {
+ if (!IsZeroCopySendEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+
+ byte[] smallPayload = new byte[1024];
+ // forceEcanceledOnceMask: "send" is set by the caller. Small payloads should use regular SEND,
+ // so the first send is expected to observe the injected cancellation/interruption.
+ Exception? sendException = await Record.ExceptionAsync(async () => await client.SendAsync(smallPayload, SocketFlags.None));
+ AssertCanceledOrInterrupted(sendException);
+
+ byte[] verificationPayload = new byte[] { 0x5A };
+ byte[] verificationReceive = new byte[1];
+ Task<int> verificationReceiveTask = ToTask(server.ReceiveAsync(verificationReceive, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(verificationPayload, SocketFlags.None));
+ Assert.Equal(1, await verificationReceiveTask);
+ Assert.Equal(verificationPayload[0], verificationReceive[0]);
+ }
+
+ private static async Task RunZeroCopySendNotifCqeReleasesPinHoldsScenarioAsync()
+ {
+ if (!IsZeroCopySendEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+
+ byte[] payload = new byte[128 * 1024];
+ byte[] received = new byte[payload.Length];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i + 1));
+ }
+
+ const int iterations = 8;
+ for (int i = 0; i < iterations; i++)
+ {
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ int sent = await client.SendAsync(payload, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ await receiveTask;
+ Assert.Equal(payload, received);
+ }
+
+ IoUringZeroCopyPinHoldSnapshot releasedSnapshot = await WaitForZeroCopyPinHoldSnapshotAsync(
+ static snapshot => !snapshot.HasIoUringPort || (snapshot.ActivePinHolds == 0 && snapshot.PendingNotificationCount == 0));
+ if (!releasedSnapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ Assert.Equal(0, releasedSnapshot.ActivePinHolds);
+ Assert.Equal(0, releasedSnapshot.PendingNotificationCount);
+ }
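+ // Reset-churn stress: repeatedly aborts connections while SEND_ZC waves are in flight, then
+ // verifies pin holds, pending notifications, and completion slots all settle back to baseline.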
+
+ private static async Task RunZeroCopySendResetStormSlotRecoveryScenarioAsync()
+ {
+ if (!IsZeroCopySendEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ const int ConcurrentSendCount = 512;
+ const int SlotPressureDelta = 32;
+ TimeSpan runDuration = TimeSpan.FromSeconds(60);
+
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(ConcurrentSendCount);
+ IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+ int baselineSlotsInUse = GetIoUringCompletionSlotsInUseForTest();
+ IoUringZeroCopyPinHoldSnapshot baselineSnapshot = GetIoUringZeroCopyPinHoldSnapshot();
+
+ byte[] payload = new byte[64 * 1024];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i + 11));
+ }
+
+ DateTime deadline = DateTime.UtcNow + runDuration;
+ int rounds = 0;
+ int roundsWithConnectionReset = 0;
+ bool observedPendingNotifications = false;
+ // Long-running reset churn is intentional: leaked pending-NOTIF slots tend to show
+ // up only after repeated mid-flight resets, not short happy-path bursts.
+ while (DateTime.UtcNow < deadline)
+ {
+ (Socket client, Socket server) = await AcceptConnectedTcpPairAsync(listener, endpoint);
+ using (client)
+ using (server)
+ {
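+ // Linger with a zero timeout makes Dispose() abort the connection with RST,
+ // so in-flight sends on the peer observe ConnectionReset.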
+ server.LingerState = new LingerOption(enable: true, seconds: 0);
+ var sendTasks = new Task[ConcurrentSendCount];
+ for (int i = 0; i < sendTasks.Length; i++)
+ {
+ sendTasks[i] = ToTask(client.SendAsync(payload, SocketFlags.None));
+ }
+
+ // Wait for slot pressure rather than sleeping arbitrarily so the test
+ // only resets once a meaningful in-flight SEND_ZC wave exists.
+ Assert.True(
+ await WaitForIoUringCompletionSlotsInUseAboveAsync(baselineSlotsInUse, SlotPressureDelta, timeoutMilliseconds: 2_000),
+ $"Expected completion slots to exceed baseline {baselineSlotsInUse} by at least {SlotPressureDelta}, observed {GetIoUringCompletionSlotsInUseForTest()}.");
+
+ if (GetIoUringZeroCopyPinHoldSnapshot().PendingNotificationCount > 0)
+ {
+ observedPendingNotifications = true;
+ }
+
+ server.Dispose();
+
+ bool roundSawConnectionReset = false;
+ for (int i = 0; i < sendTasks.Length; i++)
+ {
+ Exception? ex = await Record.ExceptionAsync(async () => await sendTasks[i]);
+ if (ex is null)
+ {
+ continue;
+ }
+
+ if (ex is SocketException socketException)
+ {
+ if (socketException.SocketErrorCode == SocketError.ConnectionReset)
+ {
+ roundSawConnectionReset = true;
+ }
+
+ Assert.True(
+ socketException.SocketErrorCode == SocketError.ConnectionReset ||
+ socketException.SocketErrorCode == SocketError.ConnectionAborted ||
+ socketException.SocketErrorCode == SocketError.OperationAborted ||
+ socketException.SocketErrorCode == SocketError.Interrupted ||
+ socketException.SocketErrorCode == SocketError.Shutdown,
+ $"Unexpected socket error during reset-churn SEND_ZC stress: {socketException.SocketErrorCode}");
+ }
+ else
+ {
+ Assert.True(
+ ex is ObjectDisposedException || ex is OperationCanceledException,
+ $"Unexpected exception during reset-churn SEND_ZC stress: {ex}");
+ }
+ }
+
+ if (roundSawConnectionReset)
+ {
+ roundsWithConnectionReset++;
+ }
+ }
+
+ rounds++;
+ }
+
+ Assert.True(rounds > 0, "Expected at least one reset-churn round in the SEND_ZC recovery scenario.");
+ Assert.True(
+ observedPendingNotifications,
+ "Expected to observe at least one in-flight pending SEND_ZC notification during reset-churn stress.");
+ Assert.True(
+ (double)roundsWithConnectionReset / rounds >= 0.10,
+ $"Expected at least 10% of reset-churn rounds to include ConnectionReset; observed {roundsWithConnectionReset}/{rounds}.");
+
+ IoUringZeroCopyPinHoldSnapshot settledSnapshot = await WaitForZeroCopyPinHoldSnapshotAsync(
+ snapshot => !snapshot.HasIoUringPort ||
+ (snapshot.ActivePinHolds == baselineSnapshot.ActivePinHolds &&
+ snapshot.PendingNotificationCount == baselineSnapshot.PendingNotificationCount),
+ timeoutMilliseconds: 30_000);
+ if (!settledSnapshot.HasIoUringPort)
+ {
+ return;
+ }
+
+ Assert.Equal(baselineSnapshot.ActivePinHolds, settledSnapshot.ActivePinHolds);
+ Assert.Equal(baselineSnapshot.PendingNotificationCount, settledSnapshot.PendingNotificationCount);
+ Assert.True(
+ await WaitForIoUringCompletionSlotsInUseAtMostAsync(baselineSlotsInUse, timeoutMilliseconds: 30_000),
+ $"Expected completion slots to recover to baseline {baselineSlotsInUse}, observed {GetIoUringCompletionSlotsInUseForTest()}.");
+
+ ulong completionSlotExhaustionsAfterStress = GetIoUringTelemetryCounterValue("_ioUringCompletionSlotExhaustions");
+ await RunZeroCopySendLargeBufferRoundTripScenarioAsync();
+ ulong completionSlotExhaustionsAfterRecovery = GetIoUringTelemetryCounterValue("_ioUringCompletionSlotExhaustions");
+ Assert.Equal(
+ completionSlotExhaustionsAfterStress,
+ completionSlotExhaustionsAfterRecovery);
+ }
+
+ private static async Task RunZeroCopySendPartialSendResubmissionScenarioAsync()
+ {
+ if (!IsZeroCopySendEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ await RunLargeSendWithBackpressureAsync(useBufferListSend: false);
+ }
+
+ private static async Task RunZeroCopySendCompletionPinLifetimeScenarioAsync()
+ {
+ if (!IsZeroCopySendEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+
+ byte[] payload = new byte[96 * 1024];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i + 3));
+ }
+
+ using var trackingMemory = new TrackingPinnableMemoryManager(payload);
+ byte[] received = new byte[payload.Length];
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ int sent = await client.SendAsync(trackingMemory.Memory, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ await receiveTask;
+ await AssertPinsReleasedAsync(trackingMemory);
+ Assert.Equal(payload, received);
+ }
+
+ private static async Task RunZeroCopySendUnsupportedOpcodeFallbackScenarioAsync()
+ {
+ SocketAsyncEngine[] engines = SocketAsyncEngine.GetActiveIoUringEnginesForTest();
+ if (engines.Length == 0)
+ {
+ return;
+ }
+
+ var overrides = new List<(SocketAsyncEngine Engine, bool SupportsSendZc, bool ZeroCopyEnabled)>(engines.Length);
+ foreach (SocketAsyncEngine engine in engines)
+ {
+ overrides.Add((engine, engine.SupportsOpSendZcForTest, engine.ZeroCopySendEnabledForTest));
+ engine.SupportsOpSendZcForTest = false;
+ engine.ZeroCopySendEnabledForTest = false;
+ }
+
+ try
+ {
+ IoUringZeroCopySendSnapshot snapshot = GetIoUringZeroCopySendSnapshot();
+ Assert.False(snapshot.SupportsSendZc);
+ Assert.False(snapshot.ZeroCopySendEnabled);
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] payload = new byte[64 * 1024];
+ byte[] received = new byte[payload.Length];
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ int sent = await client.SendAsync(payload, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ await receiveTask;
+ Assert.Equal(payload, received);
+ }
+ finally
+ {
+ foreach ((SocketAsyncEngine engine, bool supports, bool enabled) in overrides)
+ {
+ engine.SupportsOpSendZcForTest = supports;
+ engine.ZeroCopySendEnabledForTest = enabled;
+ }
+ }
+ }
+
+ private static async Task RunZeroCopySendBufferListSegmentThresholdScenarioAsync()
+ {
+ if (!IsZeroCopySendMessageEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket listener = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+ _ = listener;
+
+ const int segmentCount = 8;
+ const int segmentSize = 4 * 1024;
+ int payloadLength = segmentCount * segmentSize;
+ byte[] payload = new byte[payloadLength];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i + 17));
+ }
+
+ var sendBuffers = new List<ArraySegment<byte>>(segmentCount);
+ for (int i = 0; i < segmentCount; i++)
+ {
+ sendBuffers.Add(new ArraySegment<byte>(payload, i * segmentSize, segmentSize));
+ }
+
+ byte[] received = new byte[payload.Length];
+ Task receiveTask = ReceiveExactlyAsync(server, received);
+ int sent = await client.SendAsync(sendBuffers, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ await receiveTask;
+ Assert.Equal(payload, received);
+ }
+
+ private static async Task RunZeroCopySendToAboveThresholdScenarioAsync()
+ {
+ if (!IsZeroCopySendMessageEnabledAndSupported(out _))
+ {
+ return;
+ }
+
+ using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ byte[] payload = new byte[20 * 1024];
+ for (int i = 0; i < payload.Length; i++)
+ {
+ payload[i] = unchecked((byte)(i + 23));
+ }
+
+ byte[] receiveBuffer = new byte[payload.Length];
+ Task<SocketReceiveFromResult> receiveTask =
+ ToTask(receiver.ReceiveFromAsync(receiveBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0)));
+ await Task.Yield();
+
+ int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+ Assert.Equal(payload.Length, sent);
+
+ SocketReceiveFromResult receiveResult = await receiveTask;
+ Assert.Equal(payload.Length, receiveResult.ReceivedBytes);
+ Assert.Equal(payload, receiveBuffer);
+ Assert.Equal(sender.LocalEndPoint, receiveResult.RemoteEndPoint);
+ }
+
+ private static async Task RunMultishotRecvBasicScenarioAsync(int iterations)
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ if (!IsIoUringMultishotRecvSupported())
+ {
+ return;
+ }
+
+ ulong reuseBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvReuse");
+ ulong asyncCancelBefore = GetIoUringTelemetryCounterValue("_ioUringAsyncCancelRequestCqes");
+ byte[] receiveBuffer = new byte[1];
+ byte[] payload = new byte[1];
+ for (int i = 0; i < iterations; i++)
+ {
+ Task