diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.Fcntl.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.Fcntl.cs index 71c97699fd07e7..0ce5d006a059ae 100644 --- a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.Fcntl.cs +++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.Fcntl.cs @@ -22,6 +22,9 @@ internal static partial class Fcntl [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_FcntlSetFD", SetLastError = true)] internal static partial int SetFD(SafeHandle fd, int flags); + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_FcntlSetFD", SetLastError = true)] + internal static partial int SetFD(IntPtr fd, int flags); + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_FcntlGetFD", SetLastError = true)] internal static partial int GetFD(SafeHandle fd); diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.IoUringShim.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.IoUringShim.cs new file mode 100644 index 00000000000000..1a2216d8d6723c --- /dev/null +++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.IoUringShim.cs @@ -0,0 +1,58 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Runtime.InteropServices; + +internal static partial class Interop +{ + internal static partial class Sys + { + /// Wraps io_uring_setup(2): creates an io_uring instance. + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimSetup")] + internal static unsafe partial Error IoUringShimSetup( + uint entries, void* parms, int* ringFd); + + /// Wraps io_uring_enter(2): submits SQEs and/or waits for CQEs. + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimEnter")] + internal static unsafe partial Error IoUringShimEnter( + int ringFd, uint toSubmit, uint minComplete, uint flags, int* result); + + /// Wraps io_uring_enter2(2) with IORING_ENTER_EXT_ARG for bounded waits. + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimEnterExt")] + internal static unsafe partial Error IoUringShimEnterExt( + int ringFd, uint toSubmit, uint minComplete, uint flags, void* arg, int* result); + + /// Wraps io_uring_register(2): registers resources (files, buffers, ring fds). + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimRegister")] + internal static unsafe partial Error IoUringShimRegister( + int ringFd, uint opcode, void* arg, uint nrArgs, int* result); + + /// Wraps mmap(2): maps io_uring SQ/CQ ring memory. + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimMmap")] + internal static unsafe partial Error IoUringShimMmap( + int ringFd, ulong size, ulong offset, void** mappedPtr); + + /// Wraps munmap(2): unmaps io_uring ring memory. + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimMunmap")] + internal static unsafe partial Error IoUringShimMunmap( + void* addr, ulong size); + + /// Creates an eventfd for io_uring wakeup signaling. + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimCreateEventFd")] + internal static unsafe partial Error IoUringShimCreateEventFd( + int* eventFd); + + /// Writes to an eventfd to wake the io_uring event loop. 
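+        /// Writes coalesce at the eventfd counter, so several wakeups issued before the
+        /// event loop runs are typically drained by a single IoUringShimReadEventFd call.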
+ [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimWriteEventFd")] + internal static partial Error IoUringShimWriteEventFd(int eventFd); + + /// Reads from an eventfd to consume a wakeup signal. + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimReadEventFd")] + internal static unsafe partial Error IoUringShimReadEventFd( + int eventFd, ulong* value); + + /// Wraps close(2): closes a file descriptor. + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IoUringShimCloseFd")] + internal static partial Error IoUringShimCloseFd(int fd); + } +} diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.Linux.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.Linux.cs new file mode 100644 index 00000000000000..1472d04c8b676a --- /dev/null +++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.Linux.cs @@ -0,0 +1,150 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Net.Sockets; +using System.Runtime.InteropServices; + +internal static partial class Interop +{ + internal static partial class Sys + { + /// Derived SQ ring state computed after mmap, used by the managed submission path. + [StructLayout(LayoutKind.Sequential)] + internal struct IoUringSqRingInfo + { + public IntPtr SqeBase; + public IntPtr SqTailPtr; + public IntPtr SqHeadPtr; + public uint SqMask; + public uint SqEntries; + public uint SqeSize; + public byte UsesNoSqArray; + public int RingFd; + public int RegisteredRingFd; + public byte UsesEnterExtArg; + public byte UsesRegisteredFiles; + } + + /// Mirrors kernel struct io_sqring_offsets (40 bytes). Fields at offset 28+ (resv1, user_addr) are unused. + [StructLayout(LayoutKind.Explicit, Size = 40)] + internal struct IoUringSqOffsets + { + [FieldOffset(0)] public uint Head; + [FieldOffset(4)] public uint Tail; + [FieldOffset(8)] public uint RingMask; + [FieldOffset(12)] public uint RingEntries; + [FieldOffset(16)] public uint Flags; + [FieldOffset(20)] public uint Dropped; + [FieldOffset(24)] public uint Array; + // resv1 at 28, user_addr at 32 - not needed by managed code + } + + /// Mirrors kernel struct io_cqring_offsets (40 bytes). Fields at offset 28+ (resv1, user_addr) are unused. + [StructLayout(LayoutKind.Explicit, Size = 40)] + internal struct IoUringCqOffsets + { + [FieldOffset(0)] public uint Head; + [FieldOffset(4)] public uint Tail; + [FieldOffset(8)] public uint RingMask; + [FieldOffset(12)] public uint RingEntries; + [FieldOffset(16)] public uint Overflow; + [FieldOffset(20)] public uint Cqes; + [FieldOffset(24)] public uint Flags; + // resv1 at 28, user_addr at 32 - not needed by managed code + } + + /// Mirrors kernel struct io_uring_params (120 bytes), passed to io_uring_setup. + [StructLayout(LayoutKind.Explicit, Size = 120)] + internal struct IoUringParams + { + [FieldOffset(0)] public uint SqEntries; + [FieldOffset(4)] public uint CqEntries; + [FieldOffset(8)] public uint Flags; + [FieldOffset(12)] public uint SqThreadCpu; + [FieldOffset(16)] public uint SqThreadIdle; + [FieldOffset(20)] public uint Features; + [FieldOffset(24)] public uint WqFd; + // resv[3] at 28-39 + [FieldOffset(40)] public IoUringSqOffsets SqOff; + [FieldOffset(80)] public IoUringCqOffsets CqOff; + } + + /// Mirrors kernel struct io_uring_cqe (16 bytes), read from the CQ ring. 
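+        /// Result carries the operation's return value: bytes transferred on success or a
+        /// negative errno on failure. For buffer-select completions the kernel sets
+        /// IORING_CQE_F_BUFFER in Flags and stores the chosen buffer id in Flags >> 16.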
+ [StructLayout(LayoutKind.Explicit, Size = 16)] + internal struct IoUringCqe + { + [FieldOffset(0)] public ulong UserData; + [FieldOffset(8)] public int Result; + [FieldOffset(12)] public uint Flags; + } + + /// Mirrors kernel struct io_uring_buf (16 bytes), used by provided-buffer rings. + [StructLayout(LayoutKind.Explicit, Size = 16)] + internal struct IoUringBuf + { + [FieldOffset(0)] public ulong Address; + [FieldOffset(8)] public uint Length; + [FieldOffset(12)] public ushort BufferId; + [FieldOffset(14)] public ushort Reserved; + } + + /// + /// Mirrors the header overlay of kernel struct io_uring_buf_ring (16 bytes). + /// In UAPI this shares offset 0 with the first io_uring_buf entry via a union. + /// + [StructLayout(LayoutKind.Explicit, Size = 16)] + internal struct IoUringBufRingHeader + { + [FieldOffset(0)] public ulong Reserved1; + [FieldOffset(8)] public uint Reserved2; + [FieldOffset(12)] public ushort Reserved3; + [FieldOffset(14)] public ushort Tail; + } + + /// Mirrors kernel struct io_uring_buf_reg (40 bytes), used for pbuf ring registration. + [StructLayout(LayoutKind.Explicit, Size = 40)] + internal struct IoUringBufReg + { + [FieldOffset(0)] public ulong RingAddress; + [FieldOffset(8)] public uint RingEntries; + [FieldOffset(12)] public ushort BufferGroupId; + [FieldOffset(14)] public ushort Padding; + [FieldOffset(16)] public ulong Reserved0; + [FieldOffset(24)] public ulong Reserved1; + [FieldOffset(32)] public ulong Reserved2; + } + + /// Derived CQ ring state computed after mmap, used by the managed completion drain path. + [StructLayout(LayoutKind.Sequential)] + internal struct IoUringCqRingInfo + { + public IntPtr CqeBase; // io_uring_cqe* base of CQE array + public IntPtr CqTailPtr; // uint32_t* kernel writes CQ tail + public IntPtr CqHeadPtr; // uint32_t* managed advances CQ head + public uint CqMask; // CqEntries - 1 + public uint CqEntries; // number of CQ slots + public uint CqeSize; // sizeof(io_uring_cqe) = 16 + public IntPtr CqOverflowPtr; // uint32_t* kernel CQ overflow counter + } + + /// Mirrors kernel struct io_uring_getevents_arg, used with IORING_ENTER_EXT_ARG. + [StructLayout(LayoutKind.Sequential)] + internal struct IoUringGeteventsArg + { + public ulong Sigmask; + public uint SigmaskSize; + public uint MinWaitUsec; + public ulong Ts; + } + + /// Mirrors kernel struct __kernel_timespec, used for io_uring timeout arguments. + [StructLayout(LayoutKind.Sequential)] + internal struct IoUringKernelTimespec + { + public long TvSec; + public long TvNsec; + } + + } +} diff --git a/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj b/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj index bdb03b5a7b5548..89eaa2c02785ea 100644 --- a/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj +++ b/src/libraries/System.Net.Sockets/src/System.Net.Sockets.csproj @@ -197,9 +197,34 @@ + + + + + + + + + + + + + + + + diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/IoUringProvidedBufferRing.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/IoUringProvidedBufferRing.Linux.cs new file mode 100644 index 00000000000000..98c6f93862417d --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/IoUringProvidedBufferRing.Linux.cs @@ -0,0 +1,1126 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
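+
+// Provided-buffer ring contract (io_uring UAPI, sketched): userspace describes each buffer in an
+// io_uring_buf slot and publishes it by advancing the ring tail; the kernel consumes one slot per
+// buffer-select completion and reports the chosen id in cqe->flags >> IORING_CQE_BUFFER_SHIFT.
+// Posting (or recycling) a buffer id is roughly:
+//
+//     ring[tail & mask] = { .addr = buf, .len = size, .bid = id };  // WriteBufferDescriptor
+//     store_release(&ring_header->tail, tail + 1);                  // PublishTail
+//
+// which is the two-step sequence RecycleCheckedOutBuffer performs below.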
+ +using System; +using System.Diagnostics; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; + +namespace System.Net.Sockets +{ + internal sealed partial class SocketAsyncEngine + { + private const string IoUringAdaptiveBufferSizingSwitchName = "System.Net.Sockets.IoUringAdaptiveBufferSizing"; + private const int IoUringProvidedBufferRingEntries = (int)IoUringConstants.QueueEntries; + private const int IoUringProvidedBufferSizeDefault = 4096; + private const ushort IoUringProvidedBufferGroupIdStart = 0x8000; + private static readonly int s_ioUringProvidedBufferSize = GetConfiguredIoUringProvidedBufferSize(); + private static readonly bool s_ioUringAdaptiveBufferSizingEnabled = IsAdaptiveIoUringProvidedBufferSizingEnabled(); + private static readonly bool s_ioUringRegisterBuffersEnabled = IsIoUringRegisterBuffersEnabled(); + private bool _adaptiveBufferSizingEnabled; + private ushort _nextIoUringProvidedBufferGroupId = IoUringProvidedBufferGroupIdStart; + + /// + /// Initializes a provided-buffer ring and registers it with the kernel when supported. + /// Failures are non-fatal and leave completion mode enabled without provided buffers. + /// + private void InitializeIoUringProvidedBufferRingIfSupported(int ringFd) + { + SetIoUringProvidedBufferCapabilityState( + supportsProvidedBufferRings: false, + hasRegisteredBuffers: false); + _adaptiveBufferSizingEnabled = false; + _ioUringProvidedBufferGroupId = 0; + _ioUringProvidedBufferRing = null; + ushort initialGroupId = AllocateProvidedBufferGroupId(); + + if (!IoUringProvidedBufferRing.TryCreate( + initialGroupId, + IoUringProvidedBufferRingEntries, + s_ioUringProvidedBufferSize, + s_ioUringAdaptiveBufferSizingEnabled, + out IoUringProvidedBufferRing? bufferRing) || + bufferRing is null) + { + return; + } + + Interop.Error registerError = bufferRing.Register(ringFd); + if (registerError != Interop.Error.SUCCESS) + { + bufferRing.Dispose(); + return; + } + + _ioUringProvidedBufferRing = bufferRing; + _ioUringProvidedBufferGroupId = bufferRing.BufferGroupId; + _adaptiveBufferSizingEnabled = s_ioUringAdaptiveBufferSizingEnabled; + SetIoUringProvidedBufferCapabilityState( + supportsProvidedBufferRings: true, + hasRegisteredBuffers: TryRegisterProvidedBuffersWithTelemetry(bufferRing, ringFd, isReregistration: false)); + + SocketsTelemetry.Log.IoUringProvidedBufferCurrentSize(bufferRing.BufferSize); + } + + /// + /// Evaluates adaptive buffer-sizing recommendations and hot-swaps the provided-buffer ring when safe. + /// Must run on the event-loop thread. + /// + private void EvaluateProvidedBufferRingResize() + { + Debug.Assert(IsCurrentThreadEventLoopThread(), + "Provided-buffer resize evaluation must run on the io_uring event-loop thread."); + if (!_adaptiveBufferSizingEnabled || _managedRingFd < 0) + { + return; + } + + IoUringProvidedBufferRing? 
currentRing = _ioUringProvidedBufferRing; + if (currentRing is null) + { + return; + } + + int currentBufferSize = currentRing.BufferSize; + int recommendedBufferSize = currentRing.RecommendedBufferSize; + if (recommendedBufferSize == 0 || recommendedBufferSize == currentBufferSize) + { + return; + } + + if (!IsProvidedBufferResizeQuiescent(currentRing)) + { + return; + } + + ushort newGroupId = AllocateProvidedBufferGroupId(_ioUringProvidedBufferGroupId); + if (!IoUringProvidedBufferRing.TryCreate( + newGroupId, + IoUringProvidedBufferRingEntries, + recommendedBufferSize, + adaptiveSizingEnabled: true, + out IoUringProvidedBufferRing? replacementRing) || + replacementRing is null) + { + return; + } + + AssertProvidedBufferResizeQuiescent(currentRing); + + bool restorePreviousBufferRegistration = _ioUringCapabilities.HasRegisteredBuffers; + TryUnregisterProvidedBuffersIfRegistered(currentRing, _managedRingFd, restorePreviousBufferRegistration); + + if (replacementRing.Register(_managedRingFd) != Interop.Error.SUCCESS) + { + replacementRing.Dispose(); + if (restorePreviousBufferRegistration) + { + SetIoUringProvidedBufferCapabilityState( + supportsProvidedBufferRings: true, + hasRegisteredBuffers: TryRegisterProvidedBuffersWithTelemetry( + currentRing, + _managedRingFd, + isReregistration: true)); + } + + return; + } + + currentRing.Unregister(_managedRingFd); + currentRing.Dispose(); + + _ioUringProvidedBufferRing = replacementRing; + _ioUringProvidedBufferGroupId = replacementRing.BufferGroupId; + RefreshIoUringMultishotRecvSupport(); + SetIoUringProvidedBufferCapabilityState( + supportsProvidedBufferRings: true, + hasRegisteredBuffers: TryRegisterProvidedBuffersWithTelemetry( + replacementRing, + _managedRingFd, + isReregistration: true)); + + SocketsTelemetry.Log.IoUringProvidedBufferResize(); + SocketsTelemetry.Log.IoUringProvidedBufferCurrentSize(replacementRing.BufferSize); + } + + private bool IsProvidedBufferResizeQuiescent(IoUringProvidedBufferRing currentRing) + { + Debug.Assert(IsCurrentThreadEventLoopThread(), + "Provided-buffer resize quiescence must be evaluated on the io_uring event-loop thread."); + + if (currentRing.InUseCount != 0) + { + return false; + } + + if (_cqOverflowRecoveryActive) + { + return false; + } + + // Ring swap frees/replaces native buffer-ring memory. Delay swap until all tracked + // io_uring operations have drained so no in-flight SQE can still reference the old ring. 
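+            // InUseCount == 0 alone is not enough: a buffer-select SQE that was submitted but has
+            // not yet completed holds no checked-out buffer, yet the kernel can still pick from
+            // the old ring when it runs, so the tracked-operation count must also reach zero.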
+ return Volatile.Read(ref _trackedIoUringOperationCount) == 0; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private ushort AllocateProvidedBufferGroupId(ushort avoidGroupId = 0) + { + ushort candidate = _nextIoUringProvidedBufferGroupId; + for (int attempts = 0; attempts < ushort.MaxValue; attempts++) + { + if (candidate != 0 && + candidate != ushort.MaxValue && + candidate != avoidGroupId) + { + _nextIoUringProvidedBufferGroupId = GetNextProvidedBufferGroupId(candidate); + return candidate; + } + + candidate = GetNextProvidedBufferGroupId(candidate); + } + + Debug.Fail("Unable to allocate an io_uring provided-buffer group id."); + _nextIoUringProvidedBufferGroupId = IoUringProvidedBufferGroupIdStart; + return IoUringProvidedBufferGroupIdStart; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ushort GetNextProvidedBufferGroupId(ushort currentGroupId) + { + ushort nextGroupId = unchecked((ushort)(currentGroupId + 1)); + if (nextGroupId < IoUringProvidedBufferGroupIdStart || nextGroupId == ushort.MaxValue) + { + nextGroupId = IoUringProvidedBufferGroupIdStart; + } + + return nextGroupId; + } + + [Conditional("DEBUG")] + private void AssertProvidedBufferResizeQuiescent(IoUringProvidedBufferRing currentRing) + { + Debug.Assert(IsCurrentThreadEventLoopThread(), + "Provided-buffer resize assertions must run on the io_uring event-loop thread."); + Debug.Assert( + currentRing.InUseCount == 0, + "Provided-buffer resize requires no checked-out buffers before ring swap."); + Debug.Assert( + !_cqOverflowRecoveryActive, + "Provided-buffer resize must not run during CQ overflow recovery."); + Debug.Assert( + Volatile.Read(ref _trackedIoUringOperationCount) == 0, + "Provided-buffer resize requires no tracked io_uring operations before old ring disposal."); + } + + private static int GetConfiguredIoUringProvidedBufferSize() + { +#if DEBUG + string? configuredValue = Environment.GetEnvironmentVariable( + IoUringTestEnvironmentVariables.ProvidedBufferSize); + + if (!string.IsNullOrWhiteSpace(configuredValue)) + { + return int.TryParse(configuredValue, out int parsedSize) && parsedSize > 0 + ? parsedSize + : IoUringProvidedBufferSizeDefault; + } +#endif + + return IoUringProvidedBufferSizeDefault; + } + + private static bool IsAdaptiveIoUringProvidedBufferSizingEnabled() + { + bool enabled = AppContext.TryGetSwitch(IoUringAdaptiveBufferSizingSwitchName, out bool configured) && configured; + +#if DEBUG + string? configuredValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.AdaptiveBufferSizing); + if (string.Equals(configuredValue, "1", StringComparison.Ordinal)) + { + return true; + } + + if (string.Equals(configuredValue, "0", StringComparison.Ordinal)) + { + return false; + } +#endif + + return enabled; + } + + private static bool IsIoUringRegisterBuffersEnabled() + { +#if DEBUG + // Test-only override for deterministic tests. + string? configuredValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.RegisterBuffers); + if (string.Equals(configuredValue, "1", StringComparison.Ordinal)) + { + return true; + } + + if (string.Equals(configuredValue, "0", StringComparison.Ordinal)) + { + return false; + } +#endif + + // Default: enabled. 
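+            // Call sites tolerate registration failure: TryRegisterBuffersWithKernel simply
+            // reports false and the engine continues with unregistered buffers, so defaulting
+            // to enabled cannot break the receive path on kernels that reject REGISTER_BUFFERS.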
+ return true; + } + + private static bool TryRegisterProvidedBuffersWithTelemetry( + IoUringProvidedBufferRing bufferRing, + int ringFd, + bool isReregistration) + { + if (!s_ioUringRegisterBuffersEnabled || ringFd < 0) + { + return false; + } + + // REGISTER_BUFFERS is orthogonal to provided-buffer selection (RECV + IOSQE_BUFFER_SELECT). + // Any performance benefit for this path is kernel-dependent and must be validated empirically. + bool registered = bufferRing.TryRegisterBuffersWithKernel(ringFd); + if (isReregistration) + { + SocketsTelemetry.Log.IoUringRegisteredBuffersReregistration(registered); + } + else + { + SocketsTelemetry.Log.IoUringRegisteredBuffersResult( + registered, + IoUringProvidedBufferRingEntries, + bufferRing.BufferSize); + } + + return registered; + } + + private void TryUnregisterProvidedBuffersIfRegistered( + IoUringProvidedBufferRing bufferRing, + int ringFd, + bool hasRegisteredBuffers) + { + if (!hasRegisteredBuffers || ringFd < 0) + { + return; + } + + bufferRing.TryUnregisterBuffersFromKernel(ringFd); + SetIoUringProvidedBufferCapabilityState( + supportsProvidedBufferRings: _ioUringCapabilities.SupportsProvidedBufferRings, + hasRegisteredBuffers: false); + } + + /// Unregisters and disposes the provided-buffer ring. + private void FreeIoUringProvidedBufferRing() + { + IoUringProvidedBufferRing? bufferRing = _ioUringProvidedBufferRing; + bool hadRegisteredBuffers = _ioUringCapabilities.HasRegisteredBuffers; + _ioUringProvidedBufferRing = null; + SetIoUringProvidedBufferCapabilityState( + supportsProvidedBufferRings: false, + hasRegisteredBuffers: false); + _adaptiveBufferSizingEnabled = false; + _ioUringProvidedBufferGroupId = 0; + + if (bufferRing is null) + { + return; + } + + int recycledForTeardown = bufferRing.RecycleCheckedOutBuffersForTeardown(); + if (recycledForTeardown > 0) + { + SocketsTelemetry.Log.IoUringProvidedBufferRecycle(recycledForTeardown); + } + + TryUnregisterProvidedBuffersIfRegistered(bufferRing, _managedRingFd, hadRegisteredBuffers); + + if (_managedRingFd >= 0) + { + bufferRing.Unregister(_managedRingFd); + } + + bufferRing.Dispose(); + SetIoUringProvidedBufferCapabilityState( + supportsProvidedBufferRings: false, + hasRegisteredBuffers: false); + } + + /// + /// Owns a managed provided-buffer ring registration: native ring memory, pinned managed + /// buffers, buffer-id lifecycle, and recycle counters. + /// Lifetime is process-engine managed and deterministic via ; no finalizer is used. 
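+        /// Payload buffers are allocated with GC.AllocateUninitializedArray(pinned: true), so the
+        /// addresses handed to the kernel stay stable without any GCHandle pinning bookkeeping.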
+ /// + private sealed unsafe class IoUringProvidedBufferRing : IDisposable + { + private const int AdaptiveWindowCompletionCount = 256; + private const int AdaptiveMinBufferSize = 128; + private const int AdaptiveMaxBufferSize = 65536; + private const int PreparedReceiveMinimumReserve = 8; + private const int PreparedReceiveMaximumReserve = 64; + private const byte BufferStatePosted = 1; + private const byte BufferStateCheckedOut = 2; +#if DEBUG + private static int s_testForceCreateOomOnce = -1; +#endif + + private readonly ushort _bufferGroupId; + private readonly int _bufferSize; + private readonly uint _ringEntries; + private readonly uint _ringMask; + private readonly bool _adaptiveSizingEnabled; + private readonly byte[][] _buffers; + private readonly nint[] _bufferAddresses; + private readonly byte[] _bufferStates; + private readonly ulong[] _postedBufferStateBits; + private Interop.Sys.IoUringBuf* _ringBuffers; + private Interop.Sys.IoUringBufRingHeader* _ringHeader; + private readonly void* _ringMemory; + private bool _registered; + private bool _disposed; + private int _availableCount; + private int _inUseCount; + private long _recycledCount; + private long _allocationFailureCount; + private long _totalCompletionBytes; + private long _totalCompletionCount; + private long _completionsAboveHighWatermark; + private long _completionsBelowLowWatermark; + private int _recommendedBufferSize; + private uint _nextPreparedReceiveBufferHint; + private uint _nextPreparedReceivePostedWordHint; + private bool _deferTailPublish; + private bool _deferredTailDirty; + private ushort _deferredTailValue; + private int _debugOwningThreadId; + + internal ushort BufferGroupId => _bufferGroupId; + internal int BufferSize => _bufferSize; + internal int AvailableCount => Volatile.Read(ref _availableCount); + // Writers are single-threaded via AssertSingleThreadAccess; Volatile.Read keeps + // diagnostics/resize sampling conservative when observed outside mutation sites. 
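+            // The 64-bit counters go through Interlocked.Read because plain long reads can tear
+            // on 32-bit platforms; the 32-bit counters only need Volatile.Read.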
+ internal int InUseCount => Volatile.Read(ref _inUseCount); + internal long RecycledCount => Interlocked.Read(ref _recycledCount); + internal long AllocationFailureCount => Interlocked.Read(ref _allocationFailureCount); + internal int RecommendedBufferSize => Volatile.Read(ref _recommendedBufferSize); + internal int TotalBufferCountForTest => _bufferStates.Length; + + private IoUringProvidedBufferRing(ushort bufferGroupId, int ringEntries, int bufferSize, bool adaptiveSizingEnabled) + { + ArgumentOutOfRangeException.ThrowIfNegativeOrZero(ringEntries); + if (!BitOperations.IsPow2((uint)ringEntries) || ringEntries > ushort.MaxValue) + { + throw new ArgumentOutOfRangeException(nameof(ringEntries)); + } + + ArgumentOutOfRangeException.ThrowIfNegativeOrZero(bufferSize); + + _bufferGroupId = bufferGroupId; + _bufferSize = bufferSize; + _adaptiveSizingEnabled = adaptiveSizingEnabled; + _ringEntries = (uint)ringEntries; + _ringMask = (uint)ringEntries - 1; + _availableCount = ringEntries; + _recommendedBufferSize = bufferSize; + _buffers = new byte[ringEntries][]; + _bufferAddresses = new nint[ringEntries]; + _bufferStates = GC.AllocateUninitializedArray(ringEntries); + _postedBufferStateBits = new ulong[(ringEntries + 63) / 64]; + + nuint ringByteCount = checked((nuint)ringEntries * (nuint)sizeof(Interop.Sys.IoUringBuf)); + _ringMemory = NativeMemory.AlignedAlloc(ringByteCount, (nuint)Environment.SystemPageSize); + if (_ringMemory is null) + { + throw new OutOfMemoryException(); + } + + NativeMemory.Clear(_ringMemory, ringByteCount); + _ringBuffers = (Interop.Sys.IoUringBuf*)_ringMemory; + _ringHeader = (Interop.Sys.IoUringBufRingHeader*)_ringMemory; + + int initializedCount = 0; + try + { + for (int i = 0; i < ringEntries; i++) + { + byte[] buffer = GC.AllocateUninitializedArray(bufferSize, pinned: true); + _buffers[i] = buffer; + _bufferAddresses[i] = (nint)Unsafe.AsPointer(ref MemoryMarshal.GetArrayDataReference(buffer)); + _bufferStates[i] = BufferStatePosted; + SetPostedBufferBit((ushort)i, isPosted: true); + + WriteBufferDescriptor((uint)i, (ushort)i); + initializedCount++; + } + + PublishTail((ushort)initializedCount); + } + catch + { + _allocationFailureCount++; + Array.Clear(_buffers, 0, initializedCount); + Array.Clear(_bufferAddresses, 0, initializedCount); + NativeMemory.AlignedFree(_ringMemory); + throw; + } + } + +#if DEBUG + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool TryConsumeForcedCreateOutOfMemoryForTest() + { + int configured = Volatile.Read(ref s_testForceCreateOomOnce); + if (configured < 0) + { + configured = string.Equals( + Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.ForceProvidedBufferRingOomOnce), + "1", + StringComparison.Ordinal) ? 1 : 0; + Volatile.Write(ref s_testForceCreateOomOnce, configured); + } + + if (configured == 0) + { + return false; + } + + return Interlocked.Exchange(ref s_testForceCreateOomOnce, 0) != 0; + } +#endif + + internal static bool TryCreate( + ushort bufferGroupId, + int ringEntries, + int bufferSize, + bool adaptiveSizingEnabled, + out IoUringProvidedBufferRing? 
bufferRing) + { +#if DEBUG + if (TryConsumeForcedCreateOutOfMemoryForTest()) + { + if (NetEventSource.Log.IsEnabled()) + { + NetEventSource.Error(null, "io_uring provided-buffer ring create forced OOM via test hook."); + } + + bufferRing = null; + return false; + } +#endif + + try + { + bufferRing = new IoUringProvidedBufferRing(bufferGroupId, ringEntries, bufferSize, adaptiveSizingEnabled); + return true; + } + catch (ArgumentOutOfRangeException exception) + { + if (NetEventSource.Log.IsEnabled()) + { + NetEventSource.Error(null, $"io_uring provided-buffer ring create rejected configuration: {exception.Message}"); + } + } + catch (OutOfMemoryException) + { + if (NetEventSource.Log.IsEnabled()) + { + NetEventSource.Error(null, "io_uring provided-buffer ring create failed: out of memory."); + } + } + + bufferRing = null; + return false; + } + + /// Records a completion's bytes-transferred for adaptive sizing decisions. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void RecordCompletionUtilization(int bytesTransferred) + { + AssertSingleThreadAccess(); + if (!_adaptiveSizingEnabled || bytesTransferred <= 0) + { + return; + } + + int clampedBytes = Math.Min(bytesTransferred, _bufferSize); + _totalCompletionBytes += clampedBytes; + long count = ++_totalCompletionCount; + + int highWatermark = (_bufferSize * 3) / 4; + int lowWatermark = _bufferSize / 4; + if (clampedBytes > highWatermark) + { + _completionsAboveHighWatermark++; + } + else if (clampedBytes < lowWatermark) + { + _completionsBelowLowWatermark++; + } + + if ((count & (AdaptiveWindowCompletionCount - 1)) == 0) + { + EvaluateAdaptiveResize(); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private void EvaluateAdaptiveResize() + { + AssertSingleThreadAccess(); + if (!_adaptiveSizingEnabled) + { + return; + } + + long windowBytes = _totalCompletionBytes; + long aboveHigh = _completionsAboveHighWatermark; + long belowLow = _completionsBelowLowWatermark; + _totalCompletionBytes = 0; + _completionsAboveHighWatermark = 0; + _completionsBelowLowWatermark = 0; + + int currentSize = _bufferSize; + int recommendedSize = currentSize; + if (aboveHigh > AdaptiveWindowCompletionCount / 2 || + windowBytes > (long)AdaptiveWindowCompletionCount * ((long)currentSize * 3 / 4)) + { + recommendedSize = Math.Min(currentSize * 2, AdaptiveMaxBufferSize); + } + else if (belowLow > AdaptiveWindowCompletionCount / 2 || + windowBytes < (long)AdaptiveWindowCompletionCount * ((long)currentSize / 4)) + { + recommendedSize = Math.Max(currentSize / 2, AdaptiveMinBufferSize); + } + + Volatile.Write(ref _recommendedBufferSize, recommendedSize); + } + + internal Interop.Error Register(int ringFd) + { + Debug.Assert(!_disposed); + + if (_registered) + { + return Interop.Error.SUCCESS; + } + + Interop.Sys.IoUringBufReg registration = default; + registration.RingAddress = (ulong)(nuint)_ringMemory; + registration.RingEntries = _ringEntries; + registration.BufferGroupId = _bufferGroupId; + + int result; + Interop.Error registerError = Interop.Sys.IoUringShimRegister( + ringFd, + IoUringConstants.RegisterPbufRing, + ®istration, + 1u, + &result); + if (registerError == Interop.Error.SUCCESS) + { + _registered = true; + } + + return registerError; + } + + internal Interop.Error Unregister(int ringFd) + { + if (!_registered) + { + return Interop.Error.SUCCESS; + } + + Interop.Sys.IoUringBufReg registration = default; + registration.BufferGroupId = _bufferGroupId; + int result; + Interop.Error unregisterError = Interop.Sys.IoUringShimRegister( + 
ringFd, + IoUringConstants.UnregisterPbufRing, + ®istration, + 1u, + &result); + if (unregisterError == Interop.Error.SUCCESS) + { + _registered = false; + } + + return unregisterError; + } + + /// + /// Attempts to register pinned buffer payload pages with the kernel via IORING_REGISTER_BUFFERS. + /// Failure is non-fatal and callers should gracefully continue with unregistered buffers. + /// This does not switch recv SQEs to fixed-buffer opcodes; provided-buffer recv stays on + /// IORING_OP_RECV + IOSQE_BUFFER_SELECT. + /// + internal bool TryRegisterBuffersWithKernel(int ringFd) + { + if (_disposed || ringFd < 0 || _buffers.Length == 0) + { + return false; + } + + nuint allocationSize = checked((nuint)_buffers.Length * (nuint)sizeof(Interop.Sys.IOVector)); + Interop.Sys.IOVector* iovecArray; + try + { + iovecArray = (Interop.Sys.IOVector*)NativeMemory.Alloc(allocationSize); + } + catch (OutOfMemoryException) + { + return false; + } + + try + { + for (int i = 0; i < _buffers.Length; i++) + { + nint bufferAddress = _bufferAddresses[i]; + if (bufferAddress == 0) + { + return false; + } + + iovecArray[i].Base = (byte*)bufferAddress; + iovecArray[i].Count = (UIntPtr)_bufferSize; + } + + int result; + Interop.Error registerError = Interop.Sys.IoUringShimRegister( + ringFd, + IoUringConstants.RegisterBuffers, + iovecArray, + (uint)_buffers.Length, + &result); + return registerError == Interop.Error.SUCCESS; + } + finally + { + NativeMemory.Free(iovecArray); + } + } + + /// Unregisters previously registered pinned buffers via IORING_UNREGISTER_BUFFERS. + internal bool TryUnregisterBuffersFromKernel(int ringFd) + { + if (_disposed || ringFd < 0) + { + return false; + } + + int result; + Interop.Error unregisterError = Interop.Sys.IoUringShimRegister( + ringFd, + IoUringConstants.UnregisterBuffers, + null, + 0u, + &result); + return unregisterError == Interop.Error.SUCCESS; + } + + /// Acquires a kernel-selected buffer id for completion processing. + internal bool TryAcquireBufferForCompletion(ushort bufferId, out byte* buffer, out int bufferLength) + { + AssertSingleThreadAccess(); + buffer = null; + bufferLength = 0; + + if (bufferId >= _ringEntries) + { + _allocationFailureCount++; + return false; + } + + byte state = _bufferStates[bufferId]; + if (state != BufferStatePosted) + { + Debug.Assert( + state == BufferStateCheckedOut, + $"Unexpected provided-buffer state during acquire: id={bufferId}, state={state}"); + _allocationFailureCount++; + return false; + } + + _bufferStates[bufferId] = BufferStateCheckedOut; + SetPostedBufferBit(bufferId, isPosted: false); + Debug.Assert(_availableCount > 0, "Provided-buffer available count underflow."); + _availableCount--; + _inUseCount++; + + nint bufferAddress = _bufferAddresses[bufferId]; + if (bufferAddress == 0) + { + _bufferStates[bufferId] = BufferStatePosted; + SetPostedBufferBit(bufferId, isPosted: true); + _availableCount++; + _inUseCount--; + _allocationFailureCount++; + return false; + } + + buffer = (byte*)bufferAddress; + bufferLength = _bufferSize; + return true; + } + + /// + /// Acquires any currently posted provided buffer for fixed-recv submission. + /// The acquired buffer remains checked out until completion recycles it. 
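+            /// Unlike kernel-selected (IOSQE_BUFFER_SELECT) receives, the id is chosen here in
+            /// managed code so the submission path can embed the buffer address in the SQE; the
+            /// reserve below keeps these one-shots from starving buffer-select completions.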
+ /// + internal bool TryAcquireBufferForPreparedReceive(out ushort bufferId, out byte* buffer, out int bufferLength) + { + AssertSingleThreadAccess(); + bufferId = 0; + buffer = null; + bufferLength = 0; + + // Keep a reserve for kernel-selected (IOSQE_BUFFER_SELECT) receive completions so + // fixed-recv one-shots don't deplete the provided-buffer pool under sustained load. + int reserveCount = GetPreparedReceiveReserveCount(); + if (Volatile.Read(ref _availableCount) <= reserveCount) + { + return false; + } + + uint searchStart = _nextPreparedReceiveBufferHint; + int maxAttempts = _postedBufferStateBits.Length + 1; + for (int attempt = 0; attempt < maxAttempts && TryFindPostedBufferId(searchStart, out ushort candidateId); attempt++) + { + if (TryAcquireBufferForCompletion(candidateId, out buffer, out bufferLength)) + { + bufferId = candidateId; + uint nextSearchStart = ((uint)candidateId + 1) & _ringMask; + _nextPreparedReceiveBufferHint = nextSearchStart; + _nextPreparedReceivePostedWordHint = nextSearchStart >> 6; + return true; + } + + searchStart = ((uint)candidateId + 1) & _ringMask; + _nextPreparedReceiveBufferHint = searchStart; + _nextPreparedReceivePostedWordHint = searchStart >> 6; + } + + return false; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private int GetPreparedReceiveReserveCount() + { + int ringEntryCount = (int)_ringEntries; + int dynamicReserve = ringEntryCount / 16; + return Math.Clamp(dynamicReserve, PreparedReceiveMinimumReserve, PreparedReceiveMaximumReserve); + } + + /// Returns the pointer/length for a buffer that is already checked out. + internal bool TryGetCheckedOutBuffer(ushort bufferId, out byte* buffer, out int bufferLength) + { + buffer = null; + bufferLength = 0; + + if (bufferId >= _ringEntries || _bufferStates[bufferId] != BufferStateCheckedOut) + { + return false; + } + + nint bufferAddress = _bufferAddresses[bufferId]; + if (bufferAddress == 0) + { + _allocationFailureCount++; + return false; + } + + buffer = (byte*)bufferAddress; + bufferLength = _bufferSize; + return true; + } + + /// Returns a previously acquired buffer id back to the provided-buffer ring. + internal bool TryRecycleBufferFromCompletion(ushort bufferId) + { + AssertSingleThreadAccess(); + if (bufferId >= _ringEntries) + { + return false; + } + + byte state = _bufferStates[bufferId]; + if (state != BufferStateCheckedOut) + { + Debug.Assert( + state == BufferStatePosted, + $"Unexpected provided-buffer state during recycle: id={bufferId}, state={state}"); + return false; + } + + RecycleCheckedOutBuffer(bufferId); + return true; + } + + /// + /// Recycles any still-checked-out ids back into the ring during teardown. + /// Returns the number of ids recycled. 
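+            /// Safe only once no further CQEs can reference these ids; FreeIoUringProvidedBufferRing
+            /// calls this during engine teardown before unregistering and disposing the ring.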
+ /// + internal int RecycleCheckedOutBuffersForTeardown() + { + AssertSingleThreadAccess(); + int recycledCount = 0; + for (ushort bufferId = 0; bufferId < _ringEntries; bufferId++) + { + if (_bufferStates[bufferId] != BufferStateCheckedOut) + { + continue; + } + + RecycleCheckedOutBuffer(bufferId); + recycledCount++; + } + + return recycledCount; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void BeginDeferredRecyclePublish() + { + AssertSingleThreadAccess(); + if (_deferTailPublish) + { + return; + } + + _deferTailPublish = true; + _deferredTailDirty = false; + _deferredTailValue = ReadTail(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void EndDeferredRecyclePublish() + { + AssertSingleThreadAccess(); + if (!_deferTailPublish) + { + return; + } + + _deferTailPublish = false; + if (_deferredTailDirty) + { + PublishTail(_deferredTailValue); + _deferredTailDirty = false; + } + } + + /// + /// Marks every provided buffer as checked out for deterministic test-only depletion setup. + /// + internal void ForceAllBuffersCheckedOutForTest() + { + AssertSingleThreadAccess(); + for (int i = 0; i < _bufferStates.Length; i++) + { + _bufferStates[i] = BufferStateCheckedOut; + } + + Array.Clear(_postedBufferStateBits); + _nextPreparedReceivePostedWordHint = 0; + Volatile.Write(ref _availableCount, 0); + Volatile.Write(ref _inUseCount, _bufferStates.Length); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void RecycleCheckedOutBuffer(ushort bufferId) + { + ushort tail = _deferTailPublish ? _deferredTailValue : ReadTail(); + uint ringIndex = (uint)tail & _ringMask; + WriteBufferDescriptor(ringIndex, bufferId); + _bufferStates[bufferId] = BufferStatePosted; + SetPostedBufferBit(bufferId, isPosted: true); + _availableCount++; + Debug.Assert(_inUseCount > 0, "Provided-buffer in-use count underflow."); + _inUseCount--; + ushort nextTail = unchecked((ushort)(tail + 1)); + if (_deferTailPublish) + { + _deferredTailValue = nextTail; + _deferredTailDirty = true; + } + else + { + PublishTail(nextTail); + } + _recycledCount++; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void SetPostedBufferBit(ushort bufferId, bool isPosted) + { + int wordIndex = bufferId >> 6; + ulong bit = 1UL << (bufferId & 63); + if (isPosted) + { + bool wordWasEmpty = _postedBufferStateBits[wordIndex] == 0; + _postedBufferStateBits[wordIndex] |= bit; + if (wordWasEmpty) + { + _nextPreparedReceivePostedWordHint = (uint)wordIndex; + } + } + else + { + _postedBufferStateBits[wordIndex] &= ~bit; + } + } + + private bool TryFindPostedBufferId(uint startIndex, out ushort bufferId) + { + int wordCount = _postedBufferStateBits.Length; + if (wordCount == 0) + { + bufferId = 0; + return false; + } + + int hintWord = (int)(_nextPreparedReceivePostedWordHint % (uint)wordCount); + if (TryFindBitInWord(hintWord, _postedBufferStateBits[hintWord], out bufferId)) + { + _nextPreparedReceivePostedWordHint = (uint)hintWord; + return true; + } + + uint startWord = startIndex >> 6; + int bitOffset = (int)(startIndex & 63); + if (startWord >= (uint)wordCount) + { + bufferId = 0; + return false; + } + + if (TryFindBitInWord((int)startWord, _postedBufferStateBits[startWord] & (~0UL << bitOffset), out bufferId)) + { + _nextPreparedReceivePostedWordHint = startWord; + return true; + } + + for (int word = (int)startWord + 1; word < wordCount; word++) + { + if (TryFindBitInWord(word, _postedBufferStateBits[word], out bufferId)) + { + _nextPreparedReceivePostedWordHint = 
(uint)word; + return true; + } + } + + for (int word = 0; word < (int)startWord; word++) + { + if (TryFindBitInWord(word, _postedBufferStateBits[word], out bufferId)) + { + _nextPreparedReceivePostedWordHint = (uint)word; + return true; + } + } + + bufferId = 0; + return false; + } + + private bool TryFindBitInWord(int wordIndex, ulong wordBits, out ushort bufferId) + { + while (wordBits != 0) + { + int bitIndex = BitOperations.TrailingZeroCount(wordBits); + int candidate = (wordIndex << 6) + bitIndex; + if ((uint)candidate < _ringEntries) + { + bufferId = (ushort)candidate; + return true; + } + + wordBits &= wordBits - 1; + } + + bufferId = 0; + return false; + } + + [Conditional("DEBUG")] + private void AssertSingleThreadAccess() + { + int currentThreadId = Environment.CurrentManagedThreadId; + int ownerThreadId = Volatile.Read(ref _debugOwningThreadId); + if (ownerThreadId == 0) + { + int prior = Interlocked.CompareExchange(ref _debugOwningThreadId, currentThreadId, comparand: 0); + ownerThreadId = prior == 0 ? currentThreadId : prior; + } + + Debug.Assert( + ownerThreadId == currentThreadId, + $"IoUringProvidedBufferRing mutable state must be accessed from one thread. Owner={ownerThreadId}, current={currentThreadId}"); + } + + public void Dispose() + { + if (_disposed) + { + return; + } + +#if DEBUG + int checkedOutBufferCount = 0; + for (int i = 0; i < _bufferStates.Length; i++) + { + if (_bufferStates[i] == BufferStateCheckedOut) + { + checkedOutBufferCount++; + } + } + + Debug.Assert( + checkedOutBufferCount == 0, + $"Disposing provided-buffer ring with outstanding checked-out buffers: {checkedOutBufferCount}"); +#endif + + Debug.Assert( + !_registered, + "Provided-buffer ring must be unregistered before disposing native ring memory."); + if (_registered) + { + return; + } + + _ringBuffers = null; + _ringHeader = null; + NativeMemory.AlignedFree(_ringMemory); + _disposed = true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private ushort ReadTail() => + Volatile.Read(ref Unsafe.AsRef(&_ringHeader->Tail)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void PublishTail(ushort tail) => + Volatile.Write(ref Unsafe.AsRef(&_ringHeader->Tail), tail); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void WriteBufferDescriptor(uint ringIndex, ushort bufferId) + { + Debug.Assert(ringIndex < _ringEntries); + Debug.Assert(bufferId < _ringEntries); + Debug.Assert(_bufferAddresses[bufferId] != 0); + + Interop.Sys.IoUringBuf* bufferSlot = _ringBuffers + ringIndex; + bufferSlot->Address = (ulong)(nuint)_bufferAddresses[bufferId]; + bufferSlot->Length = (uint)_bufferSize; + bufferSlot->BufferId = bufferId; + bufferSlot->Reserved = 0; + } + } + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs new file mode 100644 index 00000000000000..ee2b5ff767f392 --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/MpscQueue.cs @@ -0,0 +1,421 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Runtime.CompilerServices; +using System.Threading; + +namespace System.Net.Sockets +{ + /// + /// Lock-free multi-producer, single-consumer queue optimized for the io_uring + /// event loop pattern where many threads enqueue work items but exactly one + /// thread drains them. 
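+    /// Producers claim a slot with Interlocked.Increment on the tail segment's enqueue index
+    /// and publish the item with a release write of the slot's State flag; the single consumer
+    /// polls State before reading Item, so the hot dequeue path needs no interlocked operation.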
+ /// + /// Liveness contract: + /// TryDequeue/IsEmpty may observe a producer between index claim and publish + /// (Interlocked.Increment followed by Volatile.Write), and can transiently report + /// no available item even though an enqueue is in progress. Callers must provide + /// their own wakeup/progress mechanism after Enqueue. + /// + internal sealed class MpscQueue + { + private const int DefaultSegmentSize = 256; + private const int UnlinkedSegmentCacheCapacity = 4; + private const int MaxEnqueueSlowAttempts = 2048; +#if DEBUG + private static int s_testSegmentAllocationFailuresRemaining; +#endif + + private readonly int _segmentSize; + private PaddedSegment _head; + private PaddedSegment _tail; + // Segment cache is shared by: + // - unlinked segments that lost tail->next publication races, and + // - drained head segments returned only after producer quiescence checks. + // Cache bookkeeping is protected by a tiny lock because this path is already slow-path only. + private readonly Lock _cachedUnlinkedSegmentGate = new Lock(); + private readonly Segment?[] _cachedUnlinkedSegments = new Segment?[UnlinkedSegmentCacheCapacity]; + private int _cachedUnlinkedSegmentCount; + private int _activeEnqueueOperations; + + internal MpscQueue(int segmentSize = DefaultSegmentSize) + { + ArgumentOutOfRangeException.ThrowIfNegativeOrZero(segmentSize); + _segmentSize = segmentSize; + Segment initial = new Segment(segmentSize); + _head.Value = initial; + _tail.Value = initial; + } + + /// + /// Attempts to enqueue an item. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool TryEnqueue(T item) + { + if (TryEnqueueFast(item)) + { + return true; + } + + return TryEnqueueSlowWithProducerTracking(item); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private bool TryEnqueueSlowWithProducerTracking(T item) + { + // Only slow-path producers can retain stale segment references long enough to race with + // drained-segment recycling. Fast-path success doesn't need this accounting. + Interlocked.Increment(ref _activeEnqueueOperations); + try + { + return TryEnqueueSlow(item); + } + finally + { + Interlocked.Decrement(ref _activeEnqueueOperations); + } + } + + /// + /// Enqueues an item, retrying until an enqueue slot is observed. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void Enqueue(T item) + { + SpinWait spinner = default; + while (!TryEnqueue(item)) + { + spinner.SpinOnce(); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool TryEnqueueFast(T item) + { + Segment tail = Volatile.Read(ref _tail.Value)!; + // Snapshot incarnation before claiming a slot. If the segment is recycled + // between this read and the Interlocked.Increment, the incarnation will differ. + int incarnation = Volatile.Read(ref tail.Incarnation); + int index = Interlocked.Increment(ref tail.EnqueueIndex.Value) - 1; + // A stale claim can over-increment the old segment index before incarnation + // mismatch is detected; this is safe because ResetForReuse resets EnqueueIndex. + if ((uint)index < (uint)tail.Entries.Length) + { + // Verify segment was not recycled while we were claiming the slot. + // A recycled segment has a different incarnation because ResetForReuse + // increments it. Without this check, TryReturnDrainedSegmentToCache can + // recycle the segment (since fast-path producers are not tracked by + // _activeEnqueueOperations) and we would write into reused memory. 
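+            // Incarnation acts as a version stamp: even if the same Segment object is recycled
+            // and becomes the tail again (an ABA on the reference), ResetForReuse has bumped the
+            // stamp, so the comparison fails and the claim falls through to the slow path.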
+ if (Volatile.Read(ref tail.Incarnation) == incarnation) + { + ref SegmentEntry entry = ref tail.Entries[index]; + entry.Item = item; + Volatile.Write(ref entry.State, 1); + return true; + } + } + + return false; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private bool TryEnqueueSlow(T item) + { + SpinWait spinner = default; + for (int attempt = 0; attempt < MaxEnqueueSlowAttempts; attempt++) + { + Segment tail = Volatile.Read(ref _tail.Value)!; + int index = Interlocked.Increment(ref tail.EnqueueIndex.Value) - 1; + if ((uint)index < (uint)tail.Entries.Length) + { + ref SegmentEntry entry = ref tail.Entries[index]; + entry.Item = item; + Volatile.Write(ref entry.State, 1); + return true; + } + + Segment? next = Volatile.Read(ref tail.Next); + if (next is null) + { + Segment newSegment; + try + { + newSegment = RentUnlinkedSegment(); + } + catch (OutOfMemoryException) + { + return false; + } + + if (Interlocked.CompareExchange(ref tail.Next, newSegment, null) is null) + { + next = newSegment; + } + else + { + // Another producer linked its own segment first. Reuse ours later. + ReturnUnlinkedSegment(newSegment); + next = Volatile.Read(ref tail.Next); + } + } + + if (next is not null) + { + Interlocked.CompareExchange(ref _tail.Value, next, tail); + } + + spinner.SpinOnce(); + } + + return false; + } + + /// + /// Attempts to dequeue an item. Must only be called by the single consumer thread. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool TryDequeue(out T item) + { + if (TryDequeueFast(out item)) + { + return true; + } + + return TryDequeueSlow(out item); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool TryDequeueFromSegment(Segment head, out T item) + { + int index = head.DequeueIndex; + if ((uint)index >= (uint)head.Entries.Length) + { + item = default!; + return false; + } + + // Acquire published slot before reading the item value. + ref SegmentEntry entry = ref head.Entries[index]; + if (Volatile.Read(ref entry.State) != 1) + { + item = default!; + return false; + } + + item = entry.Item; + if (RuntimeHelpers.IsReferenceOrContainsReferences()) + { + entry.Item = default!; + } + + head.DequeueIndex = index + 1; + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool TryDequeueFast(out T item) + { + Segment head = Volatile.Read(ref _head.Value)!; + return TryDequeueFromSegment(head, out item); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private bool TryDequeueSlow(out T item) + { + Segment head = Volatile.Read(ref _head.Value)!; + while ((uint)head.DequeueIndex >= (uint)head.Entries.Length) + { + Segment? next = Volatile.Read(ref head.Next); + if (next is null) + { + item = default!; + return false; + } + + // Consumer publishes head advance; producers read _head when resolving slow-path + // enqueue progress, so this store must be visible across cores. + Volatile.Write(ref _head.Value, next); + TryReturnDrainedSegmentToCache(head); + head = next; + } + + return TryDequeueFromSegment(head, out item); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private void TryReturnDrainedSegmentToCache(Segment drainedSegment) + { + // Safe reuse requires producer quiescence and tail advancement away from this segment. + // Without these checks, a producer that captured a stale segment pointer could publish + // into a reset segment after it has been recycled. 
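+            // Fast-path producers are deliberately not counted: they are fenced by the Incarnation
+            // check in TryEnqueueFast instead, so this gate only needs to wait out slow-path
+            // producers (tracked by TryEnqueueSlowWithProducerTracking).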
+ if (Volatile.Read(ref _activeEnqueueOperations) != 0 || + ReferenceEquals(Volatile.Read(ref _tail.Value), drainedSegment)) + { + return; + } + + ReturnUnlinkedSegment(drainedSegment); + } + + /// + /// Returns whether the queue currently appears empty (snapshot, not linearizable). + /// A return value of can also mean an enqueue is mid-flight. + /// + internal bool IsEmpty + { + get + { + Segment head = Volatile.Read(ref _head.Value)!; + while (true) + { + int index = head.DequeueIndex; + if ((uint)index >= (uint)head.Entries.Length) + { + Segment? next = Volatile.Read(ref head.Next); + if (next is null) + { + return true; + } + + head = next; + continue; + } + + return Volatile.Read(ref head.Entries[index].State) != 1; + } + } + } + + private Segment RentUnlinkedSegment() + { + lock (_cachedUnlinkedSegmentGate) + { + if (_cachedUnlinkedSegmentCount != 0) + { + int nextIndex = _cachedUnlinkedSegmentCount - 1; + Segment segment = _cachedUnlinkedSegments[nextIndex]!; + _cachedUnlinkedSegments[nextIndex] = null; + _cachedUnlinkedSegmentCount = nextIndex; + segment.ResetForReuse(); + return segment; + } + } + +#if DEBUG + if (TryConsumeSegmentAllocationFailureForTest()) + { + throw new OutOfMemoryException("Injected MpscQueue segment allocation failure for test."); + } +#endif + + return new Segment(_segmentSize); + } + +#if DEBUG + internal static void SetSegmentAllocationFailuresForTest(int failureCount) + { + ArgumentOutOfRangeException.ThrowIfNegative(failureCount); + + Volatile.Write(ref s_testSegmentAllocationFailuresRemaining, failureCount); + } + + private static bool TryConsumeSegmentAllocationFailureForTest() + { + while (true) + { + int remainingFailures = Volatile.Read(ref s_testSegmentAllocationFailuresRemaining); + if (remainingFailures <= 0) + { + return false; + } + + if (Interlocked.CompareExchange( + ref s_testSegmentAllocationFailuresRemaining, + remainingFailures - 1, + remainingFailures) == remainingFailures) + { + return true; + } + } + } +#endif + + private void ReturnUnlinkedSegment(Segment segment) + { + segment.ResetForReuse(); + lock (_cachedUnlinkedSegmentGate) + { + if (_cachedUnlinkedSegmentCount < _cachedUnlinkedSegments.Length) + { + _cachedUnlinkedSegments[_cachedUnlinkedSegmentCount++] = segment; + } + } + } + + private sealed class Segment + { + internal readonly SegmentEntry[] Entries; + internal int Incarnation; + internal PaddedInt32 EnqueueIndex; + internal int DequeueIndex; + internal Segment? Next; + + internal Segment(int size) + { + Entries = new SegmentEntry[size]; + ResetForReuse(); + } + + internal void ResetForReuse() + { + Interlocked.Increment(ref Incarnation); + EnqueueIndex.Value = 0; + DequeueIndex = 0; + Next = null; + if (RuntimeHelpers.IsReferenceOrContainsReferences()) + { + Array.Clear(Entries); + } + else + { + for (int i = 0; i < Entries.Length; i++) + { + Entries[i].State = 0; + } + } + } + } + + private struct SegmentEntry + { + internal T Item; + internal int State; + } + +#if TARGET_ARM64 || TARGET_LOONGARCH64 + private const int CacheLineWordCount = 16; // 128-byte cache line / sizeof(nint) +#else + private const int CacheLineWordCount = 8; // 64-byte cache line / sizeof(nint) +#endif + + [InlineArray(CacheLineWordCount - 1)] + private struct CacheLinePadding + { + internal nint _element0; + } + + private struct PaddedSegment + { + internal Segment? 
Value; + internal CacheLinePadding _padding; + } + + private struct PaddedInt32 + { + internal int Value; + internal CacheLinePadding _padding; + } + + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs new file mode 100644 index 00000000000000..12014434570b1b --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.IoUring.Linux.cs @@ -0,0 +1,2764 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Buffers; +using System.Collections.Generic; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; + +namespace System.Net.Sockets +{ + internal sealed partial class SocketAsyncContext + { + private const int MultishotAcceptQueueMaxSize = 256; + private const int PersistentMultishotRecvDataQueueMaxSize = 16; + private const int IoUringUserDataTagShift = 56; + private const byte IoUringReservedCompletionTag = 2; + private const long MultishotAcceptStateDisarmed = 0; + private const long MultishotAcceptStateArming = 1; + private Queue? _multishotAcceptQueue; + private long _multishotAcceptState; // 0=disarmed, 1=arming, otherwise encoded reserved-completion user_data + private ulong _persistentMultishotRecvUserData; // user_data of armed multishot recv SQE + private int _persistentMultishotRecvArmed; // 0=not armed, 1=armed + private Queue? _persistentMultishotRecvDataQueue; + private BufferedPersistentMultishotRecvData _persistentMultishotRecvDataHead; + private bool _hasPersistentMultishotRecvDataHead; + private int _persistentMultishotRecvDataHeadOffset; + private Lock? _multishotAcceptQueueGate; + private Lock? _persistentMultishotRecvDataGate; + + private readonly struct BufferedPersistentMultishotRecvData + { + internal readonly byte[] Data; + internal readonly int Length; + internal readonly bool UsesPooledBuffer; + + internal BufferedPersistentMultishotRecvData(byte[] data, int length, bool usesPooledBuffer) + { + Data = data; + Length = length; + UsesPooledBuffer = usesPooledBuffer; + } + } + + /// Holds a pre-accepted connection's fd and socket address from a multishot accept CQE. + private readonly struct PreAcceptedConnection + { + internal readonly IntPtr FileDescriptor; + internal readonly byte[] SocketAddressData; + internal readonly int SocketAddressLength; + internal readonly bool UsesPooledBuffer; + + internal PreAcceptedConnection(IntPtr fileDescriptor, byte[] socketAddressData, int socketAddressLength, bool usesPooledBuffer) + { + FileDescriptor = fileDescriptor; + SocketAddressData = socketAddressData; + SocketAddressLength = socketAddressLength; + UsesPooledBuffer = usesPooledBuffer; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private Lock EnsureMultishotAcceptQueueGate() => EnsureLockInitialized(ref _multishotAcceptQueueGate); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private Lock EnsurePersistentMultishotRecvDataGate() => EnsureLockInitialized(ref _persistentMultishotRecvDataGate); + + private int PersistentMultishotRecvBufferedCount => + (_persistentMultishotRecvDataQueue?.Count ?? 0) + (_hasPersistentMultishotRecvDataHead ? 1 : 0); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Lock EnsureLockInitialized(ref Lock? gate) + { + Lock? 
existing = Volatile.Read(ref gate); + if (existing is not null) + { + return existing; + } + + Lock created = new Lock(); + Lock? prior = Interlocked.CompareExchange(ref gate, created, null); + return prior ?? created; + } + + /// Returns whether this context's engine is using io_uring completion mode. + private bool IsIoUringCompletionModeEnabled() + { + SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine); + return engine is not null && engine.IsIoUringCompletionModeEnabled; + } + + /// Returns the total count of non-pinnable buffer prepare fallbacks across active engines. + internal static long GetIoUringNonPinnablePrepareFallbackCount() => + SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackCount(); + + /// Test-only setter for the non-pinnable fallback counter. + internal static void SetIoUringNonPinnablePrepareFallbackCountForTest(long value) => + SocketAsyncEngine.SetIoUringNonPinnablePrepareFallbackCountForTest(value); + + internal static bool TryGetSocketAsyncContextForTest(Socket socket, out SocketAsyncContext? context) + { + try + { + context = socket.SafeHandle.AsyncContext; + return true; + } + catch (ObjectDisposedException) + { + context = null; + return false; + } + } + + internal static bool IsMultishotAcceptArmedForTest(Socket socket) + { + if (!TryGetSocketAsyncContextForTest(socket, out SocketAsyncContext? context) || context is null) + { + return false; + } + + return context.IsMultishotAcceptArmed; + } + + internal static int GetMultishotAcceptQueueCountForTest(Socket socket) + { + if (!TryGetSocketAsyncContextForTest(socket, out SocketAsyncContext? context) || context is null) + { + return 0; + } + + Lock gate = context.EnsureMultishotAcceptQueueGate(); + lock (gate) + { + return context._multishotAcceptQueue?.Count ?? 0; + } + } + + internal static bool IsPersistentMultishotRecvArmedForTest(Socket socket) + { + if (!TryGetSocketAsyncContextForTest(socket, out SocketAsyncContext? context) || context is null) + { + return false; + } + + return context.IsPersistentMultishotRecvArmed(); + } + + internal static ulong GetPersistentMultishotRecvUserDataForTest(Socket socket) + { + if (!TryGetSocketAsyncContextForTest(socket, out SocketAsyncContext? context) || context is null) + { + return 0; + } + + return context.IsPersistentMultishotRecvArmed() ? context.PersistentMultishotRecvUserData : 0; + } + + internal static int GetPersistentMultishotRecvBufferedCountForTest(Socket socket) + { + if (!TryGetSocketAsyncContextForTest(socket, out SocketAsyncContext? context) || context is null) + { + return 0; + } + + Lock gate = context.EnsurePersistentMultishotRecvDataGate(); + lock (gate) + { + return context.PersistentMultishotRecvBufferedCount; + } + } + + /// Returns whether a multishot accept SQE is currently armed for this context. + internal bool IsMultishotAcceptArmed => Volatile.Read(ref _multishotAcceptState) != MultishotAcceptStateDisarmed; + + /// Returns the user_data payload for the armed multishot accept SQE, if any. + internal ulong MultishotAcceptUserData => DecodeMultishotAcceptUserData(Volatile.Read(ref _multishotAcceptState)); + + /// Clears multishot accept armed-state for this context. 
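+ /// Illustrative teardown ordering (a minimal sketch of a hypothetical caller, using members
+ /// defined in this file and mirroring LinuxOnStopAndAbort below): request kernel cancellation
+ /// for the armed user_data first, then disarm, so a late CQE never sees a context that still
+ /// claims to be armed:
+ /// ulong userData = context.GetArmedMultishotAcceptUserDataForCancellation();
+ /// if (userData != 0) engine.TryRequestIoUringCancellation(userData);
+ /// context.DisarmMultishotAccept();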
+ internal void DisarmMultishotAccept()
+ {
+ Volatile.Write(ref _multishotAcceptState, MultishotAcceptStateDisarmed);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static ulong DecodeMultishotAcceptUserData(long packedState)
+ {
+ ulong rawState = (ulong)packedState;
+ return (byte)(rawState >> IoUringUserDataTagShift) == IoUringReservedCompletionTag
+ ? rawState
+ : 0;
+ }
+
+ /// Returns whether a persistent multishot recv SQE is currently armed for this context.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool IsPersistentMultishotRecvArmed() =>
+ Volatile.Read(ref _persistentMultishotRecvArmed) != 0;
+
+ /// Records that a persistent multishot recv SQE has been armed for this context.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void SetPersistentMultishotRecvArmed(ulong userData)
+ {
+ Volatile.Write(ref _persistentMultishotRecvUserData, userData);
+ Volatile.Write(ref _persistentMultishotRecvArmed, 1);
+ }
+
+ /// Clears this context's armed persistent multishot recv state.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void ClearPersistentMultishotRecvArmed()
+ {
+ Volatile.Write(ref _persistentMultishotRecvUserData, 0);
+ Volatile.Write(ref _persistentMultishotRecvArmed, 0);
+ }
+
+ /// Gets the user_data of the armed persistent multishot recv SQE, or 0 if none is armed.
+ internal ulong PersistentMultishotRecvUserData =>
+ Volatile.Read(ref _persistentMultishotRecvUserData);
+
+ ///
+ /// Clears persistent multishot recv armed-state and requests ASYNC_CANCEL for
+ /// the armed user_data when available.
+ ///
+ internal void RequestPersistentMultishotRecvCancel()
+ {
+ ulong recvUserData = Volatile.Read(ref _persistentMultishotRecvUserData);
+ ClearPersistentMultishotRecvArmed();
+ if (recvUserData != 0)
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+ engine?.TryRequestIoUringCancellation(recvUserData);
+ }
+ }
+
+ /// Copies an early multishot-recv payload into the per-socket replay queue.
+ internal bool TryBufferEarlyPersistentMultishotRecvData(ReadOnlySpan<byte> payload)
+ {
+ if (payload.Length == 0)
+ {
+ return true;
+ }
+
+ EnsurePersistentMultishotRecvDataQueueInitialized();
+ Queue<BufferedPersistentMultishotRecvData>? queue = _persistentMultishotRecvDataQueue;
+ if (queue is null)
+ {
+ return false;
+ }
+
+ byte[] copy = ArrayPool<byte>.Shared.Rent(payload.Length);
+ payload.CopyTo(copy);
+ Lock gate = EnsurePersistentMultishotRecvDataGate();
+ lock (gate)
+ {
+ if (PersistentMultishotRecvBufferedCount >= PersistentMultishotRecvDataQueueMaxSize)
+ {
+ ArrayPool<byte>.Shared.Return(copy);
+ return false;
+ }
+
+ // Publish queue count only after enqueue to avoid teardown observing phantom items.
+ queue.Enqueue(new BufferedPersistentMultishotRecvData(copy, payload.Length, usesPooledBuffer: true));
+ }
+
+ return true;
+ }
+
+ /// Attempts to drain buffered multishot-recv payload into the caller destination.
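+ /// Drains at most destination.Length bytes per call and keeps a partially consumed head entry
+ /// with its offset advanced. Worked example (hypothetical sizes): an 8-byte buffered payload
+ /// read through a 3-byte destination completes in three calls returning 3, 3, and 2 bytes.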
+ internal bool TryConsumeBufferedPersistentMultishotRecvData(Memory<byte> destination, out int bytesTransferred)
+ {
+ bytesTransferred = 0;
+ if (destination.Length == 0)
+ {
+ return false;
+ }
+
+ Lock gate = EnsurePersistentMultishotRecvDataGate();
+ byte[] sourceBuffer;
+ int sourceOffset;
+ int toCopy;
+ bool releaseHeadAfterCopy;
+ BufferedPersistentMultishotRecvData sourceHead;
+ lock (gate)
+ {
+ if (!TryAcquirePersistentMultishotRecvDataHead(out BufferedPersistentMultishotRecvData buffered))
+ {
+ return false;
+ }
+
+ int headOffset = _persistentMultishotRecvDataHeadOffset;
+ int remaining = buffered.Length - headOffset;
+ Debug.Assert(remaining > 0);
+ if (remaining <= 0)
+ {
+ ReleasePersistentMultishotRecvDataHead();
+ return false;
+ }
+
+ toCopy = Math.Min(destination.Length, remaining);
+ sourceBuffer = buffered.Data;
+ sourceOffset = headOffset;
+ sourceHead = buffered;
+ _persistentMultishotRecvDataHeadOffset = headOffset + toCopy;
+ releaseHeadAfterCopy = _persistentMultishotRecvDataHeadOffset >= buffered.Length;
+ }
+
+ sourceBuffer.AsSpan(sourceOffset, toCopy).CopyTo(destination.Span);
+ bytesTransferred = toCopy;
+
+ if (releaseHeadAfterCopy)
+ {
+ lock (gate)
+ {
+ if (_hasPersistentMultishotRecvDataHead &&
+ _persistentMultishotRecvDataHead.Length == sourceHead.Length &&
+ ReferenceEquals(_persistentMultishotRecvDataHead.Data, sourceHead.Data) &&
+ _persistentMultishotRecvDataHeadOffset >= sourceHead.Length)
+ {
+ ReleasePersistentMultishotRecvDataHead();
+ }
+ }
+ }
+
+ return true;
+ }
+
+ /// Ensures the pre-accepted connection queue exists.
+ private void EnsureMultishotAcceptQueueInitialized()
+ {
+ if (_multishotAcceptQueue is null)
+ {
+ Lock gate = EnsureMultishotAcceptQueueGate();
+ lock (gate)
+ {
+ _multishotAcceptQueue ??= new Queue<PreAcceptedConnection>();
+ }
+ }
+ }
+
+ ///
+ /// Attempts to enqueue a pre-accepted connection from a multishot accept CQE.
+ /// Caller is responsible for closing acceptedFd when this returns false.
+ ///
+ internal bool TryEnqueuePreAcceptedConnection(IntPtr acceptedFd, ReadOnlySpan<byte> socketAddressData, int socketAddressLen)
+ {
+ EnsureMultishotAcceptQueueInitialized();
+ Queue<PreAcceptedConnection>? queue = _multishotAcceptQueue;
+ if (queue is null)
+ {
+ return false;
+ }
+
+ int length = socketAddressLen;
+ if (length < 0)
+ {
+ length = 0;
+ }
+
+ if ((uint)length > (uint)socketAddressData.Length)
+ {
+ length = socketAddressData.Length;
+ }
+
+ Lock gate = EnsureMultishotAcceptQueueGate();
+ lock (gate)
+ {
+ if (queue.Count >= MultishotAcceptQueueMaxSize)
+ {
+ return false;
+ }
+
+ byte[] copy;
+ if (length != 0)
+ {
+ copy = ArrayPool<byte>.Shared.Rent(length);
+ socketAddressData.Slice(0, length).CopyTo(copy);
+ }
+ else
+ {
+ copy = Array.Empty<byte>();
+ }
+
+ queue.Enqueue(new PreAcceptedConnection(acceptedFd, copy, length, usesPooledBuffer: length != 0));
+ }
+
+ return true;
+ }
+
+ ///
+ /// Attempts to dequeue a pre-accepted connection from the multishot accept queue.
+ /// Returns true if a connection was available, populating the operation fields.
+ ///
+ internal bool TryDequeuePreAcceptedConnection(AcceptOperation operation)
+ {
+ EnsureMultishotAcceptQueueInitialized();
+ Queue<PreAcceptedConnection>?
queue = _multishotAcceptQueue; + if (queue is null) + { + return false; + } + + PreAcceptedConnection accepted; + Lock gate = EnsureMultishotAcceptQueueGate(); + lock (gate) + { + if (queue.Count == 0) + { + return false; + } + + accepted = queue.Dequeue(); + } + + try + { + operation.AcceptedFileDescriptor = accepted.FileDescriptor; + int socketAddressLen = accepted.SocketAddressLength; + if ((uint)socketAddressLen > (uint)operation.SocketAddress.Length) + { + socketAddressLen = operation.SocketAddress.Length; + } + + if (socketAddressLen != 0) + { + accepted.SocketAddressData.AsSpan(0, socketAddressLen).CopyTo(operation.SocketAddress.Span); + } + + operation.AcceptSocketAddressLength = socketAddressLen; + operation.SocketAddress = operation.SocketAddress.Slice(0, socketAddressLen); + operation.ErrorCode = SocketError.Success; + return true; + } + finally + { + ReturnPooledBufferIfNeeded(accepted.SocketAddressData, accepted.UsesPooledBuffer); + } + } + + /// Removes a completed io_uring operation from its queue and signals or dispatches its callback. + internal bool TryCompleteIoUringOperation(AsyncOperation operation) + { + bool removed = + operation is ReadOperation readOperation ? _receiveQueue.TryRemoveCompletedOperation(this, readOperation) : + operation is WriteOperation writeOperation ? _sendQueue.TryRemoveCompletedOperation(this, writeOperation) : + false; + if (!removed) + { + return false; + } + + ManualResetEventSlim? e = operation.Event; + if (e is not null) + { + e.Set(); + return true; + } + + operation.CancellationRegistration.Dispose(); + if (ShouldDispatchCompletionCallback(operation)) + { + if (PreferInlineCompletions) + { + // Inline completion: invoke directly on the event-loop thread, + // matching the epoll path (HandleEventsInline). This avoids the + // ThreadPool hop for latency-sensitive workloads that opted in + // via DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS=1. + operation.InvokeCallback(allowPooling: true); + } + else + { + operation.QueueIoUringCompletionCallback(); + } + } + + return true; + } + + /// Enqueues an operation for deferred SQE preparation on the event loop thread. + private bool TryEnqueueIoUringPreparation(AsyncOperation operation, long prepareSequence) + { + SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine); + return engine is not null && engine.TryEnqueueIoUringPreparation(operation, prepareSequence); + } + + /// Applies cancellation and/or untracking to an operation's io_uring state. + private void HandleIoUringCancellationTransition( + AsyncOperation operation, + bool requestKernelCancellation, + bool untrackAndClear) + { + SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine); + ulong userData = operation.IoUringUserData; + if (userData == 0) + { + return; + } + + if (requestKernelCancellation) + { + engine?.TryRequestIoUringCancellation(userData); + } + + if (untrackAndClear) + { + bool clearAllowed = engine?.TryUntrackIoUringOperation(userData, operation) ?? true; + if (clearAllowed) + { + operation.ClearIoUringUserData(); + } + } + } + + /// Requests kernel-level ASYNC_CANCEL for an in-flight operation. + private void TryRequestIoUringCancellation(AsyncOperation operation) + { + HandleIoUringCancellationTransition( + operation, + requestKernelCancellation: true, + untrackAndClear: false); + } + + /// Removes an operation from the registry and clears its user_data. 
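+ /// Funnels through HandleIoUringCancellationTransition with requestKernelCancellation: false,
+ /// i.e. it only removes registry tracking and clears user_data; it never submits ASYNC_CANCEL.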
+ internal void TryUntrackIoUringOperation(AsyncOperation operation)
+ {
+ HandleIoUringCancellationTransition(
+ operation,
+ requestKernelCancellation: false,
+ untrackAndClear: true);
+ }
+
+ /// Stages an operation for io_uring preparation if completion mode is active.
+ static partial void LinuxTryStageIoUringOperation(AsyncOperation operation)
+ {
+ if (operation.Event is null && operation.AssociatedContext.IsIoUringCompletionModeEnabled())
+ {
+ if (!operation.TryQueueIoUringPreparation())
+ {
+ operation.EmitReadinessFallbackForQueueOverflow();
+ }
+ }
+ }
+
+ partial void LinuxTryDequeuePreAcceptedConnection(AcceptOperation operation, ref bool dequeued)
+ {
+ dequeued = TryDequeuePreAcceptedConnection(operation);
+ }
+
+ partial void LinuxTryConsumeBufferedPersistentMultishotRecvData(Memory<byte> destination, ref bool consumed, ref int bytesTransferred)
+ {
+ consumed = TryConsumeBufferedPersistentMultishotRecvData(destination, out bytesTransferred);
+ }
+
+ /// Cleans up multishot-accept state and queued pre-accepted descriptors during abort.
+ partial void LinuxOnStopAndAbort()
+ {
+ SocketAsyncEngine? engine = Volatile.Read(ref _asyncEngine);
+ if (IsPersistentMultishotRecvArmed())
+ {
+ RequestPersistentMultishotRecvCancel();
+ }
+
+ ulong armedUserData = GetArmedMultishotAcceptUserDataForCancellation();
+ if (engine is not null && armedUserData != 0)
+ {
+ engine.TryRequestIoUringCancellation(armedUserData);
+ }
+
+ DisarmMultishotAccept();
+
+ Queue<PreAcceptedConnection>? multishotAcceptQueue = _multishotAcceptQueue;
+ if (multishotAcceptQueue is not null)
+ {
+ while (true)
+ {
+ PreAcceptedConnection accepted;
+ Lock gate = EnsureMultishotAcceptQueueGate();
+ lock (gate)
+ {
+ if (multishotAcceptQueue.Count == 0)
+ {
+ break;
+ }
+
+ accepted = multishotAcceptQueue.Dequeue();
+ }
+
+ Interop.Sys.Close(accepted.FileDescriptor);
+ ReturnPooledBufferIfNeeded(accepted.SocketAddressData, accepted.UsesPooledBuffer);
+ }
+ }
+
+ Lock persistentGate = EnsurePersistentMultishotRecvDataGate();
+ lock (persistentGate)
+ {
+ ReleasePersistentMultishotRecvDataHead();
+
+ Queue<BufferedPersistentMultishotRecvData>? bufferedQueue = _persistentMultishotRecvDataQueue;
+ if (bufferedQueue is not null)
+ {
+ while (bufferedQueue.Count != 0)
+ {
+ BufferedPersistentMultishotRecvData buffered = bufferedQueue.Dequeue();
+ ReturnPooledBufferIfNeeded(buffered.Data, buffered.UsesPooledBuffer);
+ }
+ }
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void EnsurePersistentMultishotRecvDataQueueInitialized()
+ {
+ if (_persistentMultishotRecvDataQueue is null)
+ {
+ Lock gate = EnsurePersistentMultishotRecvDataGate();
+ lock (gate)
+ {
+ _persistentMultishotRecvDataQueue ??= new Queue<BufferedPersistentMultishotRecvData>();
+ }
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool TryAcquirePersistentMultishotRecvDataHead(out BufferedPersistentMultishotRecvData buffered)
+ {
+ if (_hasPersistentMultishotRecvDataHead)
+ {
+ buffered = _persistentMultishotRecvDataHead;
+ return true;
+ }
+
+ Queue<BufferedPersistentMultishotRecvData>?
queue = _persistentMultishotRecvDataQueue; + if (queue is null || queue.Count == 0) + { + buffered = default; + return false; + } + + BufferedPersistentMultishotRecvData dequeued = queue.Dequeue(); + _persistentMultishotRecvDataHead = dequeued; + _hasPersistentMultishotRecvDataHead = true; + _persistentMultishotRecvDataHeadOffset = 0; + buffered = dequeued; + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ReleasePersistentMultishotRecvDataHead() + { + if (!_hasPersistentMultishotRecvDataHead) + { + return; + } + + BufferedPersistentMultishotRecvData head = _persistentMultishotRecvDataHead; + _persistentMultishotRecvDataHead = default; + _hasPersistentMultishotRecvDataHead = false; + _persistentMultishotRecvDataHeadOffset = 0; + ReturnPooledBufferIfNeeded(head.Data, head.UsesPooledBuffer); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ReturnPooledBufferIfNeeded(byte[] buffer, bool usesPooledBuffer) + { + if (usesPooledBuffer) + { + ArrayPool.Shared.Return(buffer); + } + } + + private ulong GetArmedMultishotAcceptUserDataForCancellation() + { + long packedState = Volatile.Read(ref _multishotAcceptState); + ulong userData = DecodeMultishotAcceptUserData(packedState); + if (userData != 0 || packedState == MultishotAcceptStateDisarmed) + { + return userData; + } + + // A transient "arming without published user_data" state can race this read. + // Bounded spin is best-effort; a miss is benign because later cancellation + // and teardown paths still unarm/cleanup safely. + SpinWait spinner = default; + do + { + spinner.SpinOnce(); + packedState = Volatile.Read(ref _multishotAcceptState); + userData = DecodeMultishotAcceptUserData(packedState); + if (userData != 0 || packedState == MultishotAcceptStateDisarmed) + { + break; + } + } while (!spinner.NextSpinWillYield); + + return userData; + } + + internal abstract partial class AsyncOperation + { + /// Outcome of processing an io_uring CQE, determining the dispatch action. + internal enum IoUringCompletionResult + { + Completed = 0, + Pending = 1, + Canceled = 2, + Ignored = 3 + } + + /// Tri-state result from direct (managed) SQE preparation. + internal enum IoUringDirectPrepareResult + { + Unsupported = 0, // Direct path unavailable for this shape; caller keeps operation pending. + Prepared = 1, // SQE written + PrepareFailed = 2 // Direct preparation failed; caller handles retry/fallback without native prepare. + } + + /// Tracks whether a receive operation prepared as one-shot or multishot. + internal enum IoUringReceiveSubmissionMode : byte + { + None = 0, + OneShot = 1, + Multishot = 2 + } + + private long _ioUringPrepareSequence; + private int _ioUringPrepareQueued; + private int _ioUringPreparationReusable; + private MemoryHandle _ioUringPinnedBuffer; + private int _ioUringPinnedBufferActive; + private int _ioUringCompletionSocketAddressLen; + private int _ioUringCompletionControlBufferLen; + private int _ioUringReceiveSubmissionMode; + private int _ioUringSlotExhaustionRetryCount; + internal ulong IoUringUserData; + + /// Requests kernel cancellation if the flag is set. + partial void LinuxRequestIoUringCancellationIfNeeded(bool requestIoUringCancellation) + { + if (requestIoUringCancellation) + { + AssociatedContext.TryRequestIoUringCancellation(this); + } + } + + /// Untracks this operation unless it is in the Canceled state awaiting a terminal CQE. 
+ partial void LinuxUntrackIoUringOperation() + { + // Canceled operations remain tracked until the terminal CQE arrives so that + // pinned/user-owned resources are not released while the kernel may still + // reference them. Dispatch will clear resources on that terminal completion. + if (_state == State.Canceled) + { + return; + } + + AssociatedContext.TryUntrackIoUringOperation(this); + } + + /// Resets all io_uring preparation state and advances the prepare sequence. + partial void ResetIoUringState() + { + ReleaseIoUringPreparationResources(); + IoUringUserData = 0; + Volatile.Write(ref _ioUringPreparationReusable, 0); + _ioUringCompletionSocketAddressLen = 0; + _ioUringCompletionControlBufferLen = 0; + _ioUringReceiveSubmissionMode = (int)IoUringReceiveSubmissionMode.None; + _ioUringSlotExhaustionRetryCount = 0; + long nextPrepareSequence = unchecked(_ioUringPrepareSequence + 1); + // Keep sequence strictly positive so stale queued work from previous resets never matches. + if (nextPrepareSequence <= 0) + { + nextPrepareSequence = 1; + } + + Volatile.Write(ref _ioUringPrepareSequence, nextPrepareSequence); + Volatile.Write(ref _ioUringPrepareQueued, 0); + } + + /// Marks this operation as ready for SQE preparation and returns its sequence number. + internal long MarkReadyForIoUringPreparation() + { + long prepareSequence = Volatile.Read(ref _ioUringPrepareSequence); + Debug.Assert(prepareSequence > 0); + Volatile.Write(ref _ioUringPrepareQueued, 1); + return prepareSequence; + } + + /// Cancels a pending preparation if the sequence number still matches. + internal void CancelPendingIoUringPreparation(long prepareSequence) + { + if (Volatile.Read(ref _ioUringPrepareSequence) == prepareSequence) + { + Volatile.Write(ref _ioUringPrepareQueued, 0); + } + } + + /// Attempts to prepare an SQE for this operation via the managed direct path. + internal bool TryPrepareIoUring(SocketAsyncContext context, long prepareSequence) + { + if (prepareSequence <= 0 || + Volatile.Read(ref _ioUringPrepareSequence) != prepareSequence || + Interlocked.Exchange(ref _ioUringPrepareQueued, 0) == 0 || + _state != State.Waiting) + { + return false; + } + + if (Interlocked.Exchange(ref _ioUringPreparationReusable, 0) == 0) + { + ReleaseIoUringPreparationResources(); + } + + SocketAsyncEngine? engine = Volatile.Read(ref context._asyncEngine); + if (engine is null || !engine.IsIoUringDirectSqeEnabled) + { + // Managed completion mode assumes direct SQE submission. + // If direct submission is unavailable, keep operation pending for fallback handling. + ErrorCode = SocketError.Success; + IoUringUserData = 0; + return false; + } + + IoUringDirectPrepareResult directResult = IoUringPrepareDirect(context, engine, out ulong directUserData); + if (directResult == IoUringDirectPrepareResult.Prepared) + { + _ioUringSlotExhaustionRetryCount = 0; + IoUringUserData = ErrorCode == SocketError.Success ? directUserData : 0; + return true; + } + + if (directResult == IoUringDirectPrepareResult.PrepareFailed) + { + IoUringUserData = 0; + return false; + } + + // Direct preparation unsupported for this operation shape. + // Leave operation pending so caller can use completion-path fallback semantics. + ErrorCode = SocketError.Success; + IoUringUserData = 0; + return false; + } + + /// Queues this operation for deferred preparation on the event loop thread. 
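+ /// Sketch of the staging handshake (members in this class): MarkReadyForIoUringPreparation()
+ /// captures the prepare sequence, TryEnqueueIoUringPreparation hands the operation to the
+ /// engine, and CancelPendingIoUringPreparation undoes the mark when the enqueue fails, so a
+ /// stale queued prepare can never run against a reset operation.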
+ internal bool TryQueueIoUringPreparation() + { + if (!AssociatedContext.IsIoUringCompletionModeEnabled()) + { + return false; + } + + long prepareSequence = MarkReadyForIoUringPreparation(); + if (AssociatedContext.TryEnqueueIoUringPreparation(this, prepareSequence)) + { + return true; + } + + CancelPendingIoUringPreparation(prepareSequence); + return false; + } + + /// Returns whether this operation is currently in the waiting state. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool IsInWaitingState() => _state == State.Waiting; + + /// Increments and returns the slot-exhaustion retry count for this operation. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal int IncrementIoUringSlotExhaustionRetryCount() => ++_ioUringSlotExhaustionRetryCount; + + /// Resets slot-exhaustion retry tracking for this operation. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void ResetIoUringSlotExhaustionRetryCount() => _ioUringSlotExhaustionRetryCount = 0; + + /// + /// Emits a readiness fallback event when io_uring prepare-queue staging fails. + /// + internal void EmitReadinessFallbackForQueueOverflow() + { + Interop.Sys.SocketEvents fallbackEvents = GetIoUringFallbackSocketEvents(); + if (fallbackEvents == Interop.Sys.SocketEvents.None) + { + return; + } + + SocketAsyncContext context = AssociatedContext; + SocketAsyncEngine? engine = Volatile.Read(ref context._asyncEngine); + if (engine is null) + { + return; + } + + engine.EnqueueReadinessFallbackEvent( + context, + fallbackEvents, + countAsPrepareQueueOverflowFallback: true); + } + + /// Processes a CQE result and returns the dispatch action for the completion handler. + internal IoUringCompletionResult ProcessIoUringCompletionResult(int result, uint flags, uint auxiliaryData) + { + Trace($"Enter, result={result}, flags={flags}, auxiliaryData={auxiliaryData}"); + + // Claim ownership of completion processing; if cancellation already won, do not publish completion. + State oldState = Interlocked.CompareExchange(ref _state, State.Running, State.Waiting); + if (oldState == State.Canceled) + { + Trace("Exit, previously canceled"); + return IoUringCompletionResult.Canceled; + } + + if (oldState != State.Waiting) + { + Trace("Exit, ignored"); + return IoUringCompletionResult.Ignored; + } + + if (ProcessIoUringCompletionViaDiscriminator(AssociatedContext, result, auxiliaryData)) + { + _state = State.Complete; + Trace("Exit, completed"); + return IoUringCompletionResult.Completed; + } + + // Incomplete path (e.g. transient retry): mirror TryComplete state transition handling. + State newState; + while (true) + { + State state = _state; + Debug.Assert(state is State.Running or State.RunningWithPendingCancellation, $"Unexpected operation state: {(State)state}"); + + newState = (state == State.Running ? State.Waiting : State.Canceled); + if (state == Interlocked.CompareExchange(ref _state, newState, state)) + { + break; + } + } + + if (newState == State.Canceled) + { + ProcessCancellation(); + Trace("Exit, canceled while pending"); + return IoUringCompletionResult.Canceled; + } + + Trace("Exit, pending"); + return IoUringCompletionResult.Pending; + } + + /// Stores recvmsg output lengths from the CQE for post-completion processing. 
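+ /// The stored lengths are consumed after the CQE is dispatched; e.g. the receive completion
+ /// path clamps IoUringCompletionSocketAddressLen and slices SocketAddress to the
+ /// kernel-reported length before surfacing the result.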
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal void SetIoUringCompletionMessageMetadata(int socketAddressLen, int controlBufferLen)
+ {
+ _ioUringCompletionSocketAddressLen = socketAddressLen;
+ _ioUringCompletionControlBufferLen = controlBufferLen;
+ }
+
+ /// Releases preparation resources and resets the user_data to zero.
+ internal void ClearIoUringUserData()
+ {
+ ReleaseIoUringPreparationResources();
+ IoUringUserData = 0;
+ Volatile.Write(ref _ioUringPreparationReusable, 0);
+ _ioUringCompletionSocketAddressLen = 0;
+ _ioUringCompletionControlBufferLen = 0;
+ _ioUringReceiveSubmissionMode = (int)IoUringReceiveSubmissionMode.None;
+ _ioUringSlotExhaustionRetryCount = 0;
+ }
+
+ /// Clears user_data without releasing preparation resources for pending requeue.
+ internal void ResetIoUringUserDataForRequeue()
+ {
+ IoUringUserData = 0;
+ _ioUringCompletionSocketAddressLen = 0;
+ _ioUringCompletionControlBufferLen = 0;
+ }
+
+ /// Records whether the current receive preparation uses one-shot or multishot mode.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ protected void SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode mode)
+ {
+ Volatile.Write(ref _ioUringReceiveSubmissionMode, (int)mode);
+ }
+
+ /// Marks preparation resources as reusable so the next prepare skips re-pinning.
+ internal void MarkIoUringPreparationReusable()
+ {
+ Volatile.Write(ref _ioUringPreparationReusable, 1);
+ }
+
+ /// Socket address length reported by the kernel in the CQE.
+ protected int IoUringCompletionSocketAddressLen => _ioUringCompletionSocketAddressLen;
+ /// Control buffer length reported by the kernel in the CQE.
+ protected int IoUringCompletionControlBufferLen => _ioUringCompletionControlBufferLen;
+
+ /// Pins a buffer and returns the raw pointer, recording the handle for later release.
+ protected unsafe byte* PinIoUringBuffer(Memory<byte> buffer)
+ {
+ ReleasePinnedIoUringBuffer();
+ if (buffer.Length == 0)
+ {
+ return null;
+ }
+
+ _ioUringPinnedBuffer = buffer.Pin();
+ Volatile.Write(ref _ioUringPinnedBufferActive, 1);
+ return (byte*)_ioUringPinnedBuffer.Pointer;
+ }
+
+ /// Attempts to pin a buffer, falling back to the readiness path if not pinnable.
+ protected unsafe bool TryPinIoUringBuffer(Memory<byte> buffer, out byte* pinnedBuffer)
+ {
+ if (Volatile.Read(ref _ioUringPinnedBufferActive) != 0)
+ {
+ pinnedBuffer = (byte*)_ioUringPinnedBuffer.Pointer;
+ if (buffer.Length > 0 && pinnedBuffer is null)
+ {
+ ReleasePinnedIoUringBuffer();
+ RecordIoUringNonPinnablePrepareFallback("null-reused-pin-pointer", buffer.Length);
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ return true;
+ }
+
+ try
+ {
+ pinnedBuffer = PinIoUringBuffer(buffer);
+ if (buffer.Length > 0 && pinnedBuffer is null)
+ {
+ ReleasePinnedIoUringBuffer();
+ RecordIoUringNonPinnablePrepareFallback("null-pin-pointer", buffer.Length);
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+
+ return true;
+ }
+ catch (NotSupportedException)
+ {
+ pinnedBuffer = null;
+ RecordIoUringNonPinnablePrepareFallback("pin-not-supported", buffer.Length);
+ ErrorCode = SocketError.Success;
+ return false;
+ }
+ }
+
+ /// Transfers ownership of the active pinned buffer to the caller.
+ internal MemoryHandle TransferPinnedBuffer() + { + if (Interlocked.Exchange(ref _ioUringPinnedBufferActive, 0) == 0) + { + return default; + } + + MemoryHandle pinnedBuffer = _ioUringPinnedBuffer; + _ioUringPinnedBuffer = default; + return pinnedBuffer; + } + + /// + /// Attempts to pin a socket address buffer, reusing an existing pin when possible. + /// Caller is responsible for setting operation ErrorCode on failure if needed. + /// + protected static unsafe bool TryPinIoUringSocketAddress( + Memory socketAddress, + ref MemoryHandle pinnedSocketAddress, + ref int pinnedSocketAddressActive, + out byte* rawSocketAddress) + { + rawSocketAddress = null; + if (socketAddress.Length == 0) + { + return true; + } + + if (Volatile.Read(ref pinnedSocketAddressActive) != 0) + { + rawSocketAddress = (byte*)pinnedSocketAddress.Pointer; + if (rawSocketAddress is null) + { + pinnedSocketAddress.Dispose(); + pinnedSocketAddress = default; + Volatile.Write(ref pinnedSocketAddressActive, 0); + return false; + } + + return true; + } + + try + { + pinnedSocketAddress = socketAddress.Pin(); + Volatile.Write(ref pinnedSocketAddressActive, 1); + } + catch (NotSupportedException) + { + rawSocketAddress = null; + return false; + } + + rawSocketAddress = (byte*)pinnedSocketAddress.Pointer; + if (rawSocketAddress is null) + { + pinnedSocketAddress.Dispose(); + pinnedSocketAddress = default; + Volatile.Write(ref pinnedSocketAddressActive, 0); + return false; + } + + return true; + } + + /// + /// Pins a socket address buffer and normalizes pinning failures to a non-terminal fallback signal. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + protected unsafe bool TryPinIoUringSocketAddressForPrepare( + Memory socketAddress, + ref MemoryHandle pinnedSocketAddress, + ref int pinnedSocketAddressActive, + out byte* rawSocketAddress) + { + if (TryPinIoUringSocketAddress( + socketAddress, + ref pinnedSocketAddress, + ref pinnedSocketAddressActive, + out rawSocketAddress)) + { + return true; + } + + ErrorCode = SocketError.Success; + return false; + } + + /// Releases an operation-owned pinned socket-address buffer and message-header allocation. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + protected static unsafe void ReleaseIoUringSocketAddressAndMessageHeader( + ref MemoryHandle pinnedSocketAddress, + ref int pinnedSocketAddressActive, + ref IntPtr messageHeader) + { + if (Interlocked.Exchange(ref pinnedSocketAddressActive, 0) != 0) + { + pinnedSocketAddress.Dispose(); + pinnedSocketAddress = default; + } + + IntPtr header = Interlocked.Exchange(ref messageHeader, IntPtr.Zero); + if (header != IntPtr.Zero) + { + NativeMemory.Free((void*)header); + } + } + + /// Records a telemetry counter for a non-pinnable buffer fallback. + private void RecordIoUringNonPinnablePrepareFallback(string reason, int bufferLength) + { + SocketAsyncEngine? 
engine = Volatile.Read(ref AssociatedContext._asyncEngine);
+ if (engine is null || !engine.IsIoUringCompletionModeEnabled)
+ {
+ return;
+ }
+
+ engine.RecordIoUringNonPinnablePrepareFallback();
+ long count = SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackCount();
+ if (NetEventSource.Log.IsEnabled() && (count & 0x3F) == 1)
+ {
+ LogIoUringNonPinnablePrepareFallback(reason, bufferLength, count);
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ void LogIoUringNonPinnablePrepareFallback(string fallbackReason, int fallbackBufferLength, long fallbackCount)
+ {
+ NetEventSource.Info(
+ AssociatedContext,
+ $"io_uring prepare fallback due to non-pinnable buffer: reason={fallbackReason}, length={fallbackBufferLength}, count={fallbackCount}");
+ }
+ }
+
+ /// Releases the currently pinned buffer handle if active.
+ private void ReleasePinnedIoUringBuffer()
+ {
+ if (Interlocked.Exchange(ref _ioUringPinnedBufferActive, 0) != 0)
+ {
+ _ioUringPinnedBuffer.Dispose();
+ _ioUringPinnedBuffer = default;
+ }
+ }
+
+ /// Releases the pinned buffer when the operation shape (single vs list) changes.
+ protected void ReleaseIoUringPinnedBufferForShapeTransition() =>
+ ReleasePinnedIoUringBuffer();
+
+ /// Releases all preparation resources including the pinned buffer and subclass resources.
+ private void ReleaseIoUringPreparationResources()
+ {
+ ReleasePinnedIoUringBuffer();
+ ReleaseIoUringPreparationResourcesCore();
+ }
+
+ /// Subclass hook to release operation-specific preparation resources.
+ protected virtual void ReleaseIoUringPreparationResourcesCore()
+ {
+ }
+
+ /// Frees a set of GCHandles used for buffer list pinning.
+ protected static void ReleasePinnedHandles(GCHandle[] pinnedHandles, int count)
+ {
+ if (count <= 0)
+ {
+ return;
+ }
+
+ int releaseCount = count < pinnedHandles.Length ? count : pinnedHandles.Length;
+ for (int i = 0; i < releaseCount; i++)
+ {
+ if (pinnedHandles[i].IsAllocated)
+ {
+ pinnedHandles[i].Free();
+ }
+ }
+ }
+
+ /// Rents an array from the shared pool for temporary io_uring preparation use.
+ private static T[] RentIoUringArray<T>(int minimumLength) =>
+ minimumLength == 0 ? Array.Empty<T>() : ArrayPool<T>.Shared.Rent(minimumLength);
+
+ /// Returns a rented array to the shared pool.
+ private static void ReturnIoUringArray<T>(T[] array, bool clearArray = false)
+ {
+ if (array.Length != 0)
+ {
+ ArrayPool<T>.Shared.Return(array, clearArray);
+ }
+ }
+
+ /// Releases pinned handles and returns the iovec array to the pool.
+ protected static void ReleaseIoUringPinnedHandlesAndIovecs(
+ ref GCHandle[]? pinnedHandles,
+ ref Interop.Sys.IOVector[]? iovecs,
+ ref int pinnedHandleCount)
+ {
+ GCHandle[]? handles = Interlocked.Exchange(ref pinnedHandles, null);
+ int handleCount = Interlocked.Exchange(ref pinnedHandleCount, 0);
+ if (handles is not null)
+ {
+ ReleasePinnedHandles(handles, handleCount);
+ ReturnIoUringArray(handles, clearArray: true);
+ }
+
+ Interop.Sys.IOVector[]? vectors = Interlocked.Exchange(ref iovecs, null);
+ if (vectors is not null)
+ {
+ ReturnIoUringArray(vectors, clearArray: true);
+ }
+ }
+
+ /// Pins a list of buffer segments and builds an iovec array for scatter/gather I/O.
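+ /// Consecutive segments backed by the same array share one pinned GCHandle, so
+ /// pinnedHandleCount can be less than iovCount. Hypothetical example: three adjacent
+ /// ArraySegment<byte> slices over one 64KB array yield three iovecs but a single pin.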
+ protected static unsafe bool TryPinBufferListForIoUring( + IList> buffers, + int startIndex, + int startOffset, + out GCHandle[] pinnedHandles, + out Interop.Sys.IOVector[] iovecs, + out int iovCount, + out int pinnedHandleCount, + out SocketError errorCode) + { + iovCount = 0; + pinnedHandleCount = 0; + if ((uint)startIndex > (uint)buffers.Count) + { + errorCode = SocketError.InvalidArgument; + pinnedHandles = Array.Empty(); + iovecs = Array.Empty(); + return false; + } + + int remainingBufferCount = buffers.Count - startIndex; + pinnedHandles = RentIoUringArray(remainingBufferCount); + iovecs = RentIoUringArray(remainingBufferCount); + + int currentOffset = startOffset; + byte[]? lastPinnedArray = null; + GCHandle lastPinnedHandle = default; + try + { + for (int i = 0; i < remainingBufferCount; i++, currentOffset = 0) + { + ArraySegment buffer = buffers[startIndex + i]; + RangeValidationHelpers.ValidateSegment(buffer); + + if ((uint)currentOffset > (uint)buffer.Count) + { + ReleasePinnedHandles(pinnedHandles, pinnedHandleCount); + ReturnIoUringArray(pinnedHandles, clearArray: true); + ReturnIoUringArray(iovecs, clearArray: true); + errorCode = SocketError.InvalidArgument; + return false; + } + + int bufferCount = buffer.Count - currentOffset; + byte* basePtr = null; + if (bufferCount != 0) + { + byte[] array = buffer.Array!; + GCHandle handle; + if (ReferenceEquals(array, lastPinnedArray)) + { + handle = lastPinnedHandle; + } + else + { + handle = GCHandle.Alloc(array, GCHandleType.Pinned); + pinnedHandles[pinnedHandleCount] = handle; + pinnedHandleCount++; + lastPinnedArray = array; + lastPinnedHandle = handle; + } + + basePtr = &((byte*)handle.AddrOfPinnedObject())[buffer.Offset + currentOffset]; + } + + iovecs[i].Base = basePtr; + iovecs[i].Count = (UIntPtr)bufferCount; + iovCount++; + } + } + catch + { + ReleasePinnedHandles(pinnedHandles, pinnedHandleCount); + ReturnIoUringArray(pinnedHandles, clearArray: true); + ReturnIoUringArray(iovecs, clearArray: true); + throw; + } + + errorCode = SocketError.Success; + return true; + } + + /// Prepares an SQE via the managed direct path. Override in subclasses for direct submission. + protected virtual IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + return IoUringDirectPrepareResult.Unsupported; + } + + /// + /// Routes a CQE using an operation-kind discriminator to avoid virtual completion dispatch + /// on this hot path. 
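+ /// The discriminator comes from _ioUringCompletionDispatchKind (see
+ /// GetIoUringCompletionDispatchKind below), so routing is a single switch on a cached int
+ /// rather than a virtual call per CQE.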
+ /// + private bool ProcessIoUringCompletionViaDiscriminator(SocketAsyncContext context, int result, uint auxiliaryData) + { + IoUringCompletionDispatchKind kind = GetIoUringCompletionDispatchKind(); + if (result >= 0) + { + return kind switch + { + IoUringCompletionDispatchKind.BufferListSendOperation => ((BufferListSendOperation)this).ProcessIoUringCompletionSuccessBufferListSend(result), + IoUringCompletionDispatchKind.BufferMemoryReceiveOperation => ((BufferMemoryReceiveOperation)this).ProcessIoUringCompletionSuccessBufferMemoryReceive(result, auxiliaryData), + IoUringCompletionDispatchKind.BufferListReceiveOperation => ((BufferListReceiveOperation)this).ProcessIoUringCompletionSuccessBufferListReceive(result, auxiliaryData), + IoUringCompletionDispatchKind.ReceiveMessageFromOperation => ((ReceiveMessageFromOperation)this).ProcessIoUringCompletionSuccessReceiveMessageFrom(result, auxiliaryData), + IoUringCompletionDispatchKind.AcceptOperation => ((AcceptOperation)this).ProcessIoUringCompletionSuccessAccept(result, auxiliaryData), + IoUringCompletionDispatchKind.ConnectOperation => ((ConnectOperation)this).ProcessIoUringCompletionSuccessConnect(context), + IoUringCompletionDispatchKind.SendOperation => ((SendOperation)this).ProcessIoUringCompletionSuccessSend(result), + _ => ProcessIoUringCompletionSuccessDefault(result) + }; + } + + return kind switch + { + IoUringCompletionDispatchKind.ReceiveMessageFromOperation => ((ReceiveMessageFromOperation)this).ProcessIoUringCompletionErrorReceiveMessageFrom(result), + IoUringCompletionDispatchKind.AcceptOperation => ((AcceptOperation)this).ProcessIoUringCompletionErrorAccept(result), + IoUringCompletionDispatchKind.ConnectOperation => ((ConnectOperation)this).ProcessIoUringCompletionErrorConnect(context, result), + IoUringCompletionDispatchKind.ReadOperation or + IoUringCompletionDispatchKind.BufferMemoryReceiveOperation or + IoUringCompletionDispatchKind.BufferListReceiveOperation => ((ReadOperation)this).ProcessIoUringCompletionErrorRead(result), + IoUringCompletionDispatchKind.WriteOperation or + IoUringCompletionDispatchKind.SendOperation or + IoUringCompletionDispatchKind.BufferListSendOperation => ((WriteOperation)this).ProcessIoUringCompletionErrorWrite(result), + _ => ProcessIoUringCompletionErrorDefault(result) + }; + } + + /// Processes a successful (non-negative) io_uring completion result. + private bool ProcessIoUringCompletionSuccessDefault(int result) + { + Debug.Assert(result >= 0, $"Expected non-negative io_uring result, got {result}"); + ErrorCode = SocketError.Success; + return true; + } + + /// Processes a failed (negative) io_uring completion result. + private bool ProcessIoUringCompletionErrorDefault(int result) + { + Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}"); + ErrorCode = SocketPal.GetSocketErrorForErrorCode(GetIoUringPalError(result)); + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private IoUringCompletionDispatchKind GetIoUringCompletionDispatchKind() + { + int dispatchKind = _ioUringCompletionDispatchKind; + return dispatchKind != 0 ? + (IoUringCompletionDispatchKind)dispatchKind : + IoUringCompletionDispatchKind.Default; + } + + /// Whether preparation resources should be preserved when the operation is requeued. + internal virtual bool ShouldReuseIoUringPreparationResourcesOnPending => false; + + /// Returns whether the negative result represents EAGAIN/EWOULDBLOCK. 
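+ /// CQE failures arrive as raw -errno; e.g. a CQE res of -11 (EAGAIN on Linux) converts via
+ /// GetIoUringPalError to Interop.Error.EAGAIN and is treated as retryable rather than terminal.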
+ protected static bool IsIoUringRetryableError(int result) + { + if (result >= 0) + { + return false; + } + + Interop.Error error = GetIoUringPalError(result); + return error == Interop.Error.EAGAIN || error == Interop.Error.EWOULDBLOCK; + } + + /// Converts a negative io_uring result to a SocketError, returning false for retryable errors. + protected static bool ProcessIoUringErrorResult(int result, out SocketError errorCode) + { + Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}"); + + if (IsIoUringRetryableError(result)) + { + errorCode = SocketError.Success; + return false; + } + + errorCode = SocketPal.GetSocketErrorForErrorCode(GetIoUringPalError(result)); + return true; + } + + /// Converts a negative io_uring CQE result (raw -errno) to PAL error space. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + protected static Interop.Error GetIoUringPalError(int result) + { + Debug.Assert(result < 0, $"Expected negative io_uring result, got {result}"); + int platformErrno = -result; + return Interop.Sys.ConvertErrorPlatformToPal(platformErrno); + } + + /// Returns the epoll event mask to use when falling back from io_uring to readiness notification. + internal virtual Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() => + Interop.Sys.SocketEvents.None; + + /// + /// Copies payload bytes from a provided-buffer ring selection into the operation's target memory. + /// Returns false when this operation shape does not support provided-buffer payload materialization. + /// + internal virtual unsafe bool TryProcessIoUringProvidedBufferCompletion( + byte* providedBuffer, + int providedBufferLength, + int bytesTransferred, + ref uint auxiliaryData) + { + _ = providedBuffer; + _ = providedBufferLength; + _ = bytesTransferred; + _ = auxiliaryData; + return false; + } + } + + internal abstract partial class ReadOperation + { + internal bool ProcessIoUringCompletionErrorRead(int result) => + ProcessIoUringErrorResult(result, out ErrorCode); + + /// + // Retained only for defensive fallback paths; regular completion mode avoids readiness fallback. + internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() => + Interop.Sys.SocketEvents.Read; + } + + private abstract partial class WriteOperation + { + internal bool ProcessIoUringCompletionErrorWrite(int result) => + ProcessIoUringErrorResult(result, out ErrorCode); + + /// + // Retained only for defensive fallback paths; regular completion mode avoids readiness fallback. + internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() => + Interop.Sys.SocketEvents.Write; + } + + private abstract partial class SendOperation + { + internal bool ProcessIoUringCompletionSuccessSend(int result) + { + if (result == 0) + { + // A zero-byte completion for a non-empty send payload indicates peer close + // on stream sockets; report reset instead of a spurious success/0-byte write. 
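+ // (A send prepared with Count == 0 that completes with result 0 is still a normal success;
+ // only a non-empty payload that transferred zero bytes is mapped to ConnectionReset below.)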
+ if (Count > 0) + { + ErrorCode = SocketError.ConnectionReset; + return true; + } + + ErrorCode = SocketError.Success; + return true; + } + + Debug.Assert(result > 0, $"Expected positive io_uring send completion size, got {result}"); + Debug.Assert(result <= Count, $"Unexpected io_uring send completion size: result={result}, count={Count}"); + + int sent = Math.Min(result, Count); + BytesTransferred += sent; + Offset += sent; + Count -= sent; + ErrorCode = SocketError.Success; + return Count == 0; + } + } + + private partial class BufferMemorySendOperation + { + private IntPtr _ioUringMessageHeader; + private MemoryHandle _ioUringPinnedSocketAddress; + private int _ioUringPinnedSocketAddressActive; + + /// + internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true; + + /// + protected override unsafe void ReleaseIoUringPreparationResourcesCore() + { + ReleaseIoUringSocketAddressAndMessageHeader( + ref _ioUringPinnedSocketAddress, + ref _ioUringPinnedSocketAddressActive, + ref _ioUringMessageHeader); + } + + /// Gets a message header buffer and sets the common sendmsg fields. + private unsafe Interop.Sys.MessageHeader* GetOrCreateIoUringSendMessageHeader(byte* rawSocketAddress) + { + Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader; + if (messageHeader is null) + { + messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader)); + _ioUringMessageHeader = (IntPtr)messageHeader; + } + + messageHeader->SocketAddress = rawSocketAddress; + messageHeader->SocketAddressLen = SocketAddress.Length; + messageHeader->ControlBuffer = null; + messageHeader->ControlBufferLen = 0; + messageHeader->Flags = SocketFlags.None; + return messageHeader; + } + + /// Configures a message header with zero or one iovec entry. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void ConfigureSingleIov( + Interop.Sys.MessageHeader* messageHeader, + byte* rawBuffer, + int bufferLength, + Interop.Sys.IOVector* iov) + { + if (bufferLength == 0) + { + messageHeader->IOVectors = null; + messageHeader->IOVectorCount = 0; + return; + } + + iov->Base = rawBuffer; + iov->Count = (UIntPtr)bufferLength; + messageHeader->IOVectors = iov; + messageHeader->IOVectorCount = 1; + } + + /// Builds a connected send or sendmsg preparation request. 
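+ /// Pins the socket address and payload, then points a native sendmsg header at them. Either
+ /// pin failing yields PrepareFailed with ErrorCode left at Success, a non-terminal signal that
+ /// lets the caller fall back without failing the socket operation.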
+ private unsafe IoUringDirectPrepareResult IoUringPrepareDirectSendMessage( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + if (!TryPinIoUringSocketAddressForPrepare( + SocketAddress, + ref _ioUringPinnedSocketAddress, + ref _ioUringPinnedSocketAddressActive, + out byte* rawSocketAddress)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (rawBuffer is not null) + { + rawBuffer += Offset; + } + + Interop.Sys.MessageHeader* messageHeader = GetOrCreateIoUringSendMessageHeader(rawSocketAddress); + Interop.Sys.IOVector sendIov; + ConfigureSingleIov(messageHeader, rawBuffer, Count, &sendIov); + + IoUringDirectPrepareResult sendMessagePrepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback( + context._socket, + messageHeader, + Count, + Flags, + out userData, + out SocketError sendMessageErrorCode); + ErrorCode = sendMessageErrorCode; + return sendMessagePrepareResult; + } + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + if (SocketAddress.Length == 0) + { + if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (rawBuffer is not null) + { + rawBuffer += Offset; + } + + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectSendWithZeroCopyFallback( + context._socket, + rawBuffer, + Count, + Flags, + out bool usedZeroCopy, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + if (usedZeroCopy && prepareResult == IoUringDirectPrepareResult.Prepared) + { + engine.TransferIoUringZeroCopyPinHold(userData, TransferPinnedBuffer()); + } + + return prepareResult; + } + + return IoUringPrepareDirectSendMessage(context, engine, out userData); + } + } + + private sealed partial class BufferListSendOperation + { + private GCHandle[]? _ioUringPinnedBufferHandles; + private Interop.Sys.IOVector[]? _ioUringIovecs; + private int _ioUringPinnedHandleCount; + private int _ioUringPreparedBufferCount = -1; + private int _ioUringPreparedStartIndex = -1; + private int _ioUringPreparedStartOffset = -1; + private int _ioUringPreparedIovCount; + + /// + internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true; + + /// + protected override void ReleaseIoUringPreparationResourcesCore() + { + ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + _ioUringPreparedBufferCount = -1; + _ioUringPreparedStartIndex = -1; + _ioUringPreparedStartOffset = -1; + _ioUringPreparedIovCount = 0; + } + + /// Pins buffer segments starting at BufferIndex/Offset and builds the iovec array. + private bool TryPinIoUringBuffers( + IList> buffers, + int startIndex, + int startOffset, + out int iovCount) + { + if (_ioUringPinnedBufferHandles is not null && + _ioUringIovecs is not null && + _ioUringPreparedBufferCount == buffers.Count && + _ioUringPreparedStartIndex == startIndex && + _ioUringPreparedStartOffset == startOffset && + _ioUringPreparedIovCount <= _ioUringIovecs.Length) + { + iovCount = _ioUringPreparedIovCount; + return true; + } + + // Release any existing pinned handles and rented arrays before creating new ones. 
+ // This handles the partial-send case where BufferIndex/Offset advanced, causing the
+ // reuse check above to fail while old resources are still held.
+ ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount);
+
+ if (!TryPinBufferListForIoUring(
+ buffers,
+ startIndex,
+ startOffset,
+ out GCHandle[] pinnedHandles,
+ out Interop.Sys.IOVector[] iovecs,
+ out iovCount,
+ out int pinnedHandleCount,
+ out SocketError errorCode))
+ {
+ ErrorCode = errorCode;
+ return false;
+ }
+
+ _ioUringPinnedBufferHandles = pinnedHandles;
+ _ioUringIovecs = iovecs;
+ _ioUringPinnedHandleCount = pinnedHandleCount;
+ _ioUringPreparedBufferCount = buffers.Count;
+ _ioUringPreparedStartIndex = startIndex;
+ _ioUringPreparedStartOffset = startOffset;
+ _ioUringPreparedIovCount = iovCount;
+ return true;
+ }
+
+ /// Advances the buffer position after a partial send, returning true when all data is sent.
+ private bool AdvanceSendBufferPosition(int bytesSent)
+ {
+ IList<ArraySegment<byte>>? buffers = Buffers;
+ if (buffers is null || bytesSent <= 0)
+ {
+ return buffers is null || BufferIndex >= buffers.Count;
+ }
+
+ int remaining = bytesSent;
+ int index = BufferIndex;
+ int offset = Offset;
+
+ while (remaining > 0 && index < buffers.Count)
+ {
+ int available = buffers[index].Count - offset;
+ Debug.Assert(available >= 0, "Unexpected negative buffer availability during io_uring send completion.");
+
+ if (available > remaining)
+ {
+ offset += remaining;
+ break;
+ }
+
+ remaining -= Math.Max(available, 0);
+ index++;
+ offset = 0;
+ }
+
+ BufferIndex = index;
+ Offset = offset;
+ return index >= buffers.Count;
+ }
+
+ ///
+ protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect(
+ SocketAsyncContext context,
+ SocketAsyncEngine engine,
+ out ulong userData)
+ {
+ userData = 0;
+ if (context.IsPersistentMultishotRecvArmed())
+ {
+ context.RequestPersistentMultishotRecvCancel();
+ }
+
+ IList<ArraySegment<byte>>? buffers = Buffers;
+ if (buffers is null)
+ {
+ ErrorCode = SocketError.Success;
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if ((uint)BufferIndex > (uint)buffers.Count)
+ {
+ ErrorCode = SocketError.Success;
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ if (!TryPinIoUringBuffers(buffers, BufferIndex, Offset, out int iovCount))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ byte* rawSocketAddress = null;
+ if (SocketAddress.Length != 0 && !TryPinIoUringBuffer(SocketAddress, out rawSocketAddress))
+ {
+ return IoUringDirectPrepareResult.PrepareFailed;
+ }
+
+ Interop.Sys.MessageHeader messageHeader;
+ messageHeader.SocketAddress = rawSocketAddress;
+ messageHeader.SocketAddressLen = SocketAddress.Length;
+ messageHeader.ControlBuffer = null;
+ messageHeader.ControlBufferLen = 0;
+ messageHeader.Flags = SocketFlags.None;
+
+ Interop.Sys.IOVector[] iovecs = _ioUringIovecs!;
+ if (iovCount != 0)
+ {
+ fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0])
+ {
+ messageHeader.IOVectors = iovecsPtr;
+ messageHeader.IOVectorCount = iovCount;
+ // Buffer-list sends can be many small segments (e.g. 4KB chunks). Use
+ // aggregate payload size for zero-copy eligibility, not per-segment size.
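+ // Worked example (hypothetical sizes): 64 segments of 4KB each are judged as one 256KB
+ // payload. The loop below saturates at int.MaxValue so the aggregate always fits the
+ // int payload-length argument.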
+ long totalPayloadBytes = 0; + for (int i = 0; i < iovCount; i++) + { + totalPayloadBytes += (long)(nuint)iovecs[i].Count; + if (totalPayloadBytes >= int.MaxValue) + { + totalPayloadBytes = int.MaxValue; + break; + } + } + + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback( + context._socket, + &messageHeader, + (int)totalPayloadBytes, + Flags, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + return prepareResult; + } + } + + messageHeader.IOVectors = null; + messageHeader.IOVectorCount = 0; + IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectSendMessageWithZeroCopyFallback( + context._socket, + &messageHeader, + payloadLength: 0, + Flags, + out userData, + out SocketError zeroIovErrorCode); + ErrorCode = zeroIovErrorCode; + return zeroIovPrepareResult; + } + + internal bool ProcessIoUringCompletionSuccessBufferListSend(int result) + { + if (result == 0) + { + // Buffer-list sends can represent empty payloads; only treat result=0 as + // reset when there are still bytes pending across remaining segments. + if (HasPendingBufferListSendBytes()) + { + ErrorCode = SocketError.ConnectionReset; + return true; + } + + ErrorCode = SocketError.Success; + return true; + } + + Debug.Assert(result > 0, $"Expected positive io_uring send completion size, got {result}"); + BytesTransferred += result; + bool complete = AdvanceSendBufferPosition(result); + ErrorCode = SocketError.Success; + return complete; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool HasPendingBufferListSendBytes() + { + IList>? buffers = Buffers; + if (buffers is null || BufferIndex >= buffers.Count) + { + return false; + } + + int index = BufferIndex; + int offset = Offset; + while (index < buffers.Count) + { + int available = buffers[index].Count - offset; + if (available > 0) + { + return true; + } + + index++; + offset = 0; + } + + return false; + } + } + + private sealed partial class BufferMemoryReceiveOperation + { + private IntPtr _ioUringMessageHeader; + private MemoryHandle _ioUringPinnedSocketAddress; + private int _ioUringPinnedSocketAddressActive; + + /// + internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true; + + /// + protected override unsafe void ReleaseIoUringPreparationResourcesCore() + { + ReleaseIoUringSocketAddressAndMessageHeader( + ref _ioUringPinnedSocketAddress, + ref _ioUringPinnedSocketAddressActive, + ref _ioUringMessageHeader); + } + + /// Gets a message header buffer and sets the common recvmsg fields. + private unsafe Interop.Sys.MessageHeader* GetOrCreateIoUringReceiveMessageHeader(byte* rawSocketAddress) + { + Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader; + if (messageHeader is null) + { + messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader)); + _ioUringMessageHeader = (IntPtr)messageHeader; + } + + InitializeReceiveMessageHeader(messageHeader, rawSocketAddress); + return messageHeader; + } + + /// Initializes recvmsg header fields shared by direct preparation variants. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe void InitializeReceiveMessageHeader(Interop.Sys.MessageHeader* messageHeader, byte* rawSocketAddress) + { + messageHeader->SocketAddress = rawSocketAddress; + messageHeader->SocketAddressLen = SocketAddress.Length; + messageHeader->ControlBuffer = null; + messageHeader->ControlBufferLen = 0; + messageHeader->Flags = SocketFlags.None; + } + + /// Configures a message header with a single iovec entry. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void ConfigureSingleIov( + Interop.Sys.MessageHeader* messageHeader, + byte* rawBuffer, + int bufferLength, + Interop.Sys.IOVector* iov) + { + // Keep a single iovec even for zero-length receives so recvmsg preserves + // completion-mode readiness probe behavior for zero-byte operations. + iov->Base = rawBuffer; + iov->Count = (UIntPtr)bufferLength; + messageHeader->IOVectors = iov; + messageHeader->IOVectorCount = 1; + } + + /// Builds a connected or receive-from recvmsg operation. + private unsafe IoUringDirectPrepareResult IoUringPrepareDirectReceiveMessage( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (!TryPinIoUringSocketAddressForPrepare( + SocketAddress, + ref _ioUringPinnedSocketAddress, + ref _ioUringPinnedSocketAddressActive, + out byte* rawSocketAddress)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + Interop.Sys.MessageHeader* messageHeader = GetOrCreateIoUringReceiveMessageHeader(rawSocketAddress); + Interop.Sys.IOVector receiveIov; + ConfigureSingleIov(messageHeader, rawBuffer, Buffer.Length, &receiveIov); + + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage( + context._socket, + messageHeader, + Flags, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + return prepareResult; + } + + /// + /// Returns whether this operation shape is eligible for multishot recv submission. + /// Eligible: connected TCP receive (no socket address, no recvmsg flags) with non-empty buffer. + /// Ineligible: zero-byte probes, recvmsg-based receive paths (SetReceivedFlags/socket address). + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool IsEligibleForIoUringMultishotRecv() + { + if (SetReceivedFlags || SocketAddress.Length != 0) + { + return false; + } + + // Multishot recv uses IORING_OP_RECV (no msg_flags). Message-oriented sockets + // rely on MSG_TRUNC to report truncation, which is not observable in this path. + if (SocketPal.GetSockOpt( + AssociatedContext._socket, + SocketOptionLevel.Socket, + SocketOptionName.Type, + out int socketTypeValue) != SocketError.Success) + { + // If type probing fails, keep completion correctness by disabling multishot recv. 
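+ // (One-shot recv is always a correct fallback here; multishot is purely an optimization.)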
+ return false; + } + + SocketType socketType = (SocketType)socketTypeValue; + if (socketType == SocketType.Dgram || + socketType == SocketType.Raw || + socketType == SocketType.Seqpacket) + { + return false; + } + + return Buffer.Length != 0; + } + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + if (SetReceivedFlags || SocketAddress.Length != 0) + { + if (context.IsPersistentMultishotRecvArmed()) + { + context.RequestPersistentMultishotRecvCancel(); + } + + SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.OneShot); + IoUringDirectPrepareResult receiveMessagePrepareResult = + IoUringPrepareDirectReceiveMessage(context, engine, out userData); + if (receiveMessagePrepareResult != IoUringDirectPrepareResult.Prepared || ErrorCode != SocketError.Success) + { + SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None); + } + + return receiveMessagePrepareResult; + } + + bool allowMultishotRecv = IsEligibleForIoUringMultishotRecv() && engine.SupportsMultishotRecv; + if (!allowMultishotRecv && context.IsPersistentMultishotRecvArmed()) + { + context.RequestPersistentMultishotRecvCancel(); + } + + SetIoUringReceiveSubmissionMode( + allowMultishotRecv ? IoUringReceiveSubmissionMode.Multishot : IoUringReceiveSubmissionMode.OneShot); + + // Persistent multishot receive: if one is already armed, attach this operation to + // that existing user_data instead of submitting a new recv SQE. + if (allowMultishotRecv && context.IsPersistentMultishotRecvArmed()) + { + ulong armedUserData = context.PersistentMultishotRecvUserData; + if (armedUserData != 0 && + engine.TryReplaceIoUringTrackedOperation(armedUserData, this)) + { + SocketsTelemetry.Log.IoUringPersistentMultishotRecvReuse(); + userData = armedUserData; + ErrorCode = SocketError.Success; + return IoUringDirectPrepareResult.Prepared; + } + + // Stale armed-state; clear and submit a fresh SQE below. + context.ClearPersistentMultishotRecvArmed(); + } + + if (!TryPinIoUringBuffer(Buffer, out byte* rawBuffer)) + { + ErrorCode = SocketError.Success; + SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None); + return IoUringDirectPrepareResult.PrepareFailed; + } + + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectRecv( + context._socket, + rawBuffer, + Buffer.Length, + Flags, + allowMultishotRecv, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + if (allowMultishotRecv && + prepareResult == IoUringDirectPrepareResult.Prepared && + errorCode == SocketError.Success) + { + context.SetPersistentMultishotRecvArmed(userData); + } + + if (prepareResult != IoUringDirectPrepareResult.Prepared || errorCode != SocketError.Success) + { + SetIoUringReceiveSubmissionMode(IoUringReceiveSubmissionMode.None); + } + + return prepareResult; + } + + internal bool ProcessIoUringCompletionSuccessBufferMemoryReceive(int result, uint auxiliaryData) + { + BytesTransferred = result; + ReceivedFlags = SetReceivedFlags ? 
(SocketFlags)(int)auxiliaryData : SocketFlags.None; + + if (SocketAddress.Length != 0) + { + int socketAddressLen = IoUringCompletionSocketAddressLen; + if (socketAddressLen < 0) + { + socketAddressLen = 0; + } + + if ((uint)socketAddressLen > (uint)SocketAddress.Length) + { + socketAddressLen = SocketAddress.Length; + } + + SocketAddress = SocketAddress.Slice(0, socketAddressLen); + } + ErrorCode = SocketError.Success; + return true; + } + + /// + internal override unsafe bool TryProcessIoUringProvidedBufferCompletion( + byte* providedBuffer, + int providedBufferLength, + int bytesTransferred, + ref uint auxiliaryData) + { + _ = auxiliaryData; + + if (bytesTransferred <= 0) + { + return true; + } + + if (SetReceivedFlags || SocketAddress.Length != 0) + { + return false; + } + + if ((uint)bytesTransferred > (uint)providedBufferLength || + (uint)bytesTransferred > (uint)Buffer.Length) + { + return false; + } + + new ReadOnlySpan<byte>(providedBuffer, bytesTransferred).CopyTo(Buffer.Span); + return true; + } + } + + private sealed partial class BufferListReceiveOperation + { + private GCHandle[]? _ioUringPinnedBufferHandles; + private Interop.Sys.IOVector[]? _ioUringIovecs; + private int _ioUringPinnedHandleCount; + private IntPtr _ioUringMessageHeader; + private int _ioUringPreparedIovCount; + private int _ioUringPreparedBufferCount = -1; + + /// + internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true; + + /// + protected override unsafe void ReleaseIoUringPreparationResourcesCore() + { + ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + _ioUringPreparedIovCount = 0; + _ioUringPreparedBufferCount = -1; + + IntPtr messageHeader = Interlocked.Exchange(ref _ioUringMessageHeader, IntPtr.Zero); + if (messageHeader != IntPtr.Zero) + { + NativeMemory.Free((void*)messageHeader); + } + } + + /// Pins all buffer segments and builds the iovec array. + private bool TryPinIoUringBuffers(IList<ArraySegment<byte>> buffers, out int iovCount) + { + if (_ioUringPinnedBufferHandles is not null && + _ioUringIovecs is not null && + _ioUringPreparedIovCount != 0 && + _ioUringPreparedIovCount <= _ioUringIovecs.Length && + _ioUringPreparedBufferCount == buffers.Count) + { + iovCount = _ioUringPreparedIovCount; + return true; + } + + ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + + if (!TryPinBufferListForIoUring( + buffers, + startIndex: 0, + startOffset: 0, + out GCHandle[] pinnedHandles, + out Interop.Sys.IOVector[] iovecs, + out iovCount, + out int pinnedHandleCount, + out SocketError errorCode)) + { + ErrorCode = errorCode; + return false; + } + + _ioUringPinnedBufferHandles = pinnedHandles; + _ioUringIovecs = iovecs; + _ioUringPinnedHandleCount = pinnedHandleCount; + _ioUringPreparedIovCount = iovCount; + _ioUringPreparedBufferCount = buffers.Count; + return true; + } + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + IList<ArraySegment<byte>>? 
buffers = Buffers; + if (buffers is null) + { + ErrorCode = SocketError.Success; + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (!TryPinIoUringBuffers(buffers, out int iovCount)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + byte* rawSocketAddress = null; + if (SocketAddress.Length != 0 && !TryPinIoUringBuffer(SocketAddress, out rawSocketAddress)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader; + if (messageHeader is null) + { + messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader)); + _ioUringMessageHeader = (IntPtr)messageHeader; + } + + messageHeader->SocketAddress = rawSocketAddress; + messageHeader->SocketAddressLen = SocketAddress.Length; + messageHeader->ControlBuffer = null; + messageHeader->ControlBufferLen = 0; + messageHeader->Flags = SocketFlags.None; + + Interop.Sys.IOVector[] iovecs = _ioUringIovecs!; + if (iovCount != 0) + { + fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0]) + { + messageHeader->IOVectors = iovecsPtr; + messageHeader->IOVectorCount = iovCount; + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage( + context._socket, + messageHeader, + Flags, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + return prepareResult; + } + } + + messageHeader->IOVectors = null; + messageHeader->IOVectorCount = 0; + IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage( + context._socket, + messageHeader, + Flags, + out userData, + out SocketError zeroIovErrorCode); + ErrorCode = zeroIovErrorCode; + return zeroIovPrepareResult; + } + + internal unsafe bool ProcessIoUringCompletionSuccessBufferListReceive(int result, uint auxiliaryData) + { + BytesTransferred = result; + ReceivedFlags = (SocketFlags)(int)auxiliaryData; + ErrorCode = SocketError.Success; + + if (_ioUringMessageHeader != IntPtr.Zero && SocketAddress.Length != 0) + { + int socketAddressLen = IoUringCompletionSocketAddressLen; + if (socketAddressLen < 0) + { + socketAddressLen = 0; + } + + if ((uint)socketAddressLen > (uint)SocketAddress.Length) + { + socketAddressLen = SocketAddress.Length; + } + + SocketAddress = SocketAddress.Slice(0, socketAddressLen); + } + + return true; + } + } + + private sealed partial class ReceiveMessageFromOperation + { + private GCHandle[]? _ioUringPinnedBufferHandles; + private Interop.Sys.IOVector[]? 
_ioUringIovecs; + private int _ioUringPinnedHandleCount; + private int _ioUringPreparedIovCount; + private int _ioUringPreparedBufferListCount = -1; + private IntPtr _ioUringMessageHeader; + private IntPtr _ioUringControlBuffer; + private int _ioUringControlBufferLength; + private MemoryHandle _ioUringPinnedSocketAddress; + private int _ioUringPinnedSocketAddressActive; + + /// + internal override bool ShouldReuseIoUringPreparationResourcesOnPending => true; + + /// + protected override unsafe void ReleaseIoUringPreparationResourcesCore() + { + ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + _ioUringPreparedIovCount = 0; + _ioUringPreparedBufferListCount = -1; + + IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero); + if (controlBuffer != IntPtr.Zero) + { + NativeMemory.Free((void*)controlBuffer); + } + _ioUringControlBufferLength = 0; + + ReleaseIoUringSocketAddressAndMessageHeader( + ref _ioUringPinnedSocketAddress, + ref _ioUringPinnedSocketAddressActive, + ref _ioUringMessageHeader); + } + + /// Pins buffer segments and builds the iovec array for recvmsg. + private bool TryPinIoUringBuffers(IList<ArraySegment<byte>> buffers, out int iovCount) + { + if (_ioUringPinnedBufferHandles is not null && + _ioUringIovecs is not null && + _ioUringPreparedIovCount <= _ioUringIovecs.Length && + _ioUringPreparedBufferListCount == buffers.Count) + { + iovCount = _ioUringPreparedIovCount; + return true; + } + + ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + + if (!TryPinBufferListForIoUring( + buffers, + startIndex: 0, + startOffset: 0, + out GCHandle[] pinnedHandles, + out Interop.Sys.IOVector[] iovecs, + out iovCount, + out int pinnedHandleCount, + out SocketError errorCode)) + { + ErrorCode = errorCode; + return false; + } + + _ioUringPinnedBufferHandles = pinnedHandles; + _ioUringIovecs = iovecs; + _ioUringPinnedHandleCount = pinnedHandleCount; + _ioUringPreparedIovCount = iovCount; + _ioUringPreparedBufferListCount = buffers.Count; + return true; + } + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + if (context.IsPersistentMultishotRecvArmed()) + { + context.RequestPersistentMultishotRecvCancel(); + } + + IList<ArraySegment<byte>>? 
buffers = Buffers; + byte* rawBuffer = null; + int iovCount; + if (buffers is not null) + { + ReleaseIoUringPinnedBufferForShapeTransition(); + if (!TryPinIoUringBuffers(buffers, out iovCount)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + } + else + { + if (!TryPinIoUringBuffer(Buffer, out rawBuffer)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (_ioUringPinnedBufferHandles is not null || _ioUringIovecs is not null) + { + ReleaseIoUringPinnedHandlesAndIovecs(ref _ioUringPinnedBufferHandles, ref _ioUringIovecs, ref _ioUringPinnedHandleCount); + _ioUringPreparedIovCount = 0; + _ioUringPreparedBufferListCount = -1; + } + + iovCount = 1; + } + + if (!TryPinIoUringSocketAddressForPrepare( + SocketAddress, + ref _ioUringPinnedSocketAddress, + ref _ioUringPinnedSocketAddressActive, + out byte* rawSocketAddress)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader; + if (messageHeader is null) + { + messageHeader = (Interop.Sys.MessageHeader*)NativeMemory.Alloc((nuint)sizeof(Interop.Sys.MessageHeader)); + _ioUringMessageHeader = (IntPtr)messageHeader; + } + + messageHeader->SocketAddress = rawSocketAddress; + messageHeader->SocketAddressLen = SocketAddress.Length; + messageHeader->Flags = SocketFlags.None; + + int controlBufferLen = Interop.Sys.GetControlMessageBufferSize(Convert.ToInt32(IsIPv4), Convert.ToInt32(IsIPv6)); + if (controlBufferLen < 0) + { + ErrorCode = SocketError.Success; + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (controlBufferLen != 0) + { + if (_ioUringControlBuffer == IntPtr.Zero || _ioUringControlBufferLength != controlBufferLen) + { + IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero); + if (controlBuffer != IntPtr.Zero) + { + NativeMemory.Free((void*)controlBuffer); + } + + void* rawControlBuffer = NativeMemory.Alloc((nuint)controlBufferLen); + _ioUringControlBuffer = (IntPtr)rawControlBuffer; + _ioUringControlBufferLength = controlBufferLen; + } + + messageHeader->ControlBuffer = (byte*)_ioUringControlBuffer; + messageHeader->ControlBufferLen = controlBufferLen; + } + else + { + IntPtr controlBuffer = Interlocked.Exchange(ref _ioUringControlBuffer, IntPtr.Zero); + if (controlBuffer != IntPtr.Zero) + { + NativeMemory.Free((void*)controlBuffer); + } + + _ioUringControlBufferLength = 0; + messageHeader->ControlBuffer = null; + messageHeader->ControlBufferLen = 0; + } + + if (buffers is not null) + { + Interop.Sys.IOVector[] iovecs = _ioUringIovecs!; + if (iovCount != 0) + { + fixed (Interop.Sys.IOVector* iovecsPtr = &iovecs[0]) + { + messageHeader->IOVectors = iovecsPtr; + messageHeader->IOVectorCount = iovCount; + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectReceiveMessage( + context._socket, + messageHeader, + Flags, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + return prepareResult; + } + } + + messageHeader->IOVectors = null; + messageHeader->IOVectorCount = 0; + IoUringDirectPrepareResult zeroIovPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage( + context._socket, + messageHeader, + Flags, + out userData, + out SocketError zeroIovErrorCode); + ErrorCode = zeroIovErrorCode; + return zeroIovPrepareResult; + } + + Interop.Sys.IOVector iov; + iov.Base = rawBuffer; + iov.Count = (UIntPtr)Buffer.Length; + messageHeader->IOVectors = &iov; + messageHeader->IOVectorCount = 1; + IoUringDirectPrepareResult 
singleBufferPrepareResult = engine.TryPrepareIoUringDirectReceiveMessage( + context._socket, + messageHeader, + Flags, + out userData, + out SocketError singleBufferErrorCode); + ErrorCode = singleBufferErrorCode; + return singleBufferPrepareResult; + } + + internal unsafe bool ProcessIoUringCompletionSuccessReceiveMessageFrom(int result, uint auxiliaryData) + { + BytesTransferred = result; + ReceivedFlags = (SocketFlags)(int)auxiliaryData; + ErrorCode = SocketError.Success; + IPPacketInformation = default; + + if (_ioUringMessageHeader != IntPtr.Zero) + { + Interop.Sys.MessageHeader* messageHeader = (Interop.Sys.MessageHeader*)_ioUringMessageHeader; + int socketAddressCapacity = SocketAddress.Length; + int socketAddressLen = IoUringCompletionSocketAddressLen; + if (socketAddressLen < 0) + { + socketAddressLen = 0; + } + + if ((uint)socketAddressLen > (uint)socketAddressCapacity) + { + socketAddressLen = socketAddressCapacity; + } + + if (socketAddressLen == 0 && socketAddressCapacity != 0) + { + socketAddressLen = socketAddressCapacity; + SocketAddress.Span.Clear(); + } + + int controlBufferCapacity = messageHeader->ControlBufferLen; + int controlBufferLen = IoUringCompletionControlBufferLen; + if (controlBufferLen < 0) + { + controlBufferLen = 0; + } + + if ((uint)controlBufferLen > (uint)controlBufferCapacity) + { + controlBufferLen = controlBufferCapacity; + } + + messageHeader->SocketAddressLen = socketAddressLen; + messageHeader->ControlBufferLen = controlBufferLen; + messageHeader->Flags = ReceivedFlags; + + SocketAddress = SocketAddress.Slice(0, socketAddressLen); + + IPPacketInformation = SocketPal.GetIoUringIPPacketInformation(messageHeader, IsIPv4, IsIPv6); + } + + return true; + } + + internal bool ProcessIoUringCompletionErrorReceiveMessageFrom(int result) + { + if (!ProcessIoUringErrorResult(result, out ErrorCode)) + { + return false; + } + + IPPacketInformation = default; + return true; + } + } + + internal sealed partial class AcceptOperation + { + /// + internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() => + Interop.Sys.SocketEvents.Read; + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + AcceptSocketAddressLength = SocketAddress.Length; + if (!TryPinIoUringBuffer(SocketAddress, out byte* rawSocketAddress)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + if (engine.SupportsMultishotAccept && + Interlocked.CompareExchange( + ref context._multishotAcceptState, + MultishotAcceptStateArming, + MultishotAcceptStateDisarmed) == MultishotAcceptStateDisarmed) + { + context.EnsureMultishotAcceptQueueInitialized(); + IoUringDirectPrepareResult multishotPrepareResult = engine.TryPrepareIoUringDirectMultishotAccept( + context._socket, + rawSocketAddress, + SocketAddress.Length, + out userData, + out SocketError multishotErrorCode); + if (multishotPrepareResult == IoUringDirectPrepareResult.Prepared) + { + Debug.Assert( + (byte)(userData >> IoUringUserDataTagShift) == IoUringReservedCompletionTag, + "Multishot accept user_data must be a reserved-completion token."); + Volatile.Write(ref context._multishotAcceptState, unchecked((long)userData)); + ErrorCode = multishotErrorCode; + return multishotPrepareResult; + } + + context.DisarmMultishotAccept(); + } + + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectAccept( + context._socket, + rawSocketAddress, + SocketAddress.Length, + out 
userData, + out SocketError errorCode); + ErrorCode = errorCode; + return prepareResult; + } + + internal bool ProcessIoUringCompletionSuccessAccept(int result, uint auxiliaryData) + { + AcceptedFileDescriptor = (IntPtr)result; + ErrorCode = SocketError.Success; + // Keep parity with readiness path: always honor reported address length, including 0. + AcceptSocketAddressLength = auxiliaryData > (uint)SocketAddress.Length ? SocketAddress.Length : (int)auxiliaryData; + SocketAddress = SocketAddress.Slice(0, AcceptSocketAddressLength); + return true; + } + + internal bool ProcessIoUringCompletionErrorAccept(int result) + { + AcceptedFileDescriptor = (IntPtr)(-1); + return ProcessIoUringCompletionErrorRead(result); + } + } + + private sealed partial class ConnectOperation + { + /// + internal override Interop.Sys.SocketEvents GetIoUringFallbackSocketEvents() => + Interop.Sys.SocketEvents.Write; + + /// + protected override unsafe IoUringDirectPrepareResult IoUringPrepareDirect( + SocketAsyncContext context, + SocketAsyncEngine engine, + out ulong userData) + { + userData = 0; + if (!TryPinIoUringBuffer(SocketAddress, out byte* rawSocketAddress)) + { + return IoUringDirectPrepareResult.PrepareFailed; + } + + IoUringDirectPrepareResult prepareResult = engine.TryPrepareIoUringDirectConnect( + context._socket, + rawSocketAddress, + SocketAddress.Length, + out userData, + out SocketError errorCode); + ErrorCode = errorCode; + return prepareResult; + } + + internal bool ProcessIoUringCompletionErrorConnect(SocketAsyncContext context, int result) + { + Interop.Error error = GetIoUringPalError(result); + if (error == Interop.Error.EINPROGRESS) + { + ErrorCode = SocketError.Success; + return false; + } + + if (!ProcessIoUringCompletionErrorWrite(result)) + { + return false; + } + + context._socket.RegisterConnectResult(ErrorCode); + return true; + } + + internal bool ProcessIoUringCompletionSuccessConnect(SocketAsyncContext context) + { + ErrorCode = SocketError.Success; + context._socket.RegisterConnectResult(ErrorCode); + + if (Buffer.Length > 0) + { + Action<int, Memory<byte>, SocketFlags, SocketError>? callback = Callback; + Debug.Assert(callback is not null); + SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, default, ref BytesTransferred, callback!, default); + if (error == SocketError.IOPending) + { + // Callback ownership moved to the async send operation. + Callback = null; + Buffer = default; + } + else + { + if (error != SocketError.Success) + { + ErrorCode = error; + context._socket.RegisterConnectResult(ErrorCode); + } + + // Follow-up send completed synchronously (success/error), so invoke + // Connect callback from this operation path. + Buffer = default; + } + } + + return true; + } + } + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs index 4e2e117984084c..37de5ad03d346d 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
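// Reviewer note (illustrative sketch, not part of the patch): the AsyncOperation hunks
// below replace one ThreadPool work item per cancellation callback with a shared
// ConcurrentQueue drained in batches by a single worker. Reduced to a standalone form
// (BatchedDispatcher and its members are hypothetical names):
//
//   using System;
//   using System.Collections.Concurrent;
//   using System.Threading;
//
//   internal static class BatchedDispatcher
//   {
//       private static readonly ConcurrentQueue<Action> s_queue = new ConcurrentQueue<Action>();
//       private static int s_workerQueued;
//
//       public static void Enqueue(Action work)
//       {
//           s_queue.Enqueue(work);
//           // At most one drain worker is in flight; concurrent producers piggyback on it.
//           if (Interlocked.CompareExchange(ref s_workerQueued, 1, 0) == 0)
//           {
//               ThreadPool.UnsafeQueueUserWorkItem(static _ => Drain(), null);
//           }
//       }
//
//       private static void Drain()
//       {
//           while (s_queue.TryDequeue(out Action? work))
//           {
//               work();
//           }
//
//           // Release the flag, then re-check: a producer racing with the release may
//           // have observed the flag as set and skipped scheduling its own worker.
//           Volatile.Write(ref s_workerQueued, 0);
//           if (!s_queue.IsEmpty && Interlocked.CompareExchange(ref s_workerQueued, 1, 0) == 0)
//           {
//               ThreadPool.UnsafeQueueUserWorkItem(static _ => Drain(), null);
//           }
//       }
//   }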
+using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; @@ -43,10 +44,10 @@ internal sealed partial class SocketAsyncContext private BufferListReceiveOperation? _cachedBufferListReceiveOperation; private BufferMemorySendOperation? _cachedBufferMemorySendOperation; private BufferListSendOperation? _cachedBufferListSendOperation; - private void ReturnOperation(AcceptOperation operation) { operation.Reset(); + operation.AcceptSocketAddressLength = 0; operation.Callback = null; operation.SocketAddress = default; Volatile.Write(ref _cachedAcceptOperation, operation); // benign race condition @@ -83,6 +84,7 @@ private void ReturnOperation(BufferListSendOperation operation) { operation.Reset(); operation.Buffers = null; + operation.SetBufferPosition(bufferIndex: 0, offset: 0); operation.Callback = null; operation.SocketAddress = default; Volatile.Write(ref _cachedBufferListSendOperation, operation); // benign race condition @@ -108,8 +110,20 @@ private BufferListSendOperation RentBufferListSendOperation() => Interlocked.Exchange(ref _cachedBufferListSendOperation, null) ?? new BufferListSendOperation(this); - private abstract class AsyncOperation : IThreadPoolWorkItem + // Partial method hooks for io_uring completion-mode staging (Linux-only). + // No-op on non-Linux; implemented in SocketAsyncContext.IoUring.Linux.cs. + static partial void LinuxTryStageIoUringOperation(AsyncOperation operation); + partial void LinuxTryDequeuePreAcceptedConnection(AcceptOperation operation, ref bool dequeued); + partial void LinuxTryConsumeBufferedPersistentMultishotRecvData(Memory<byte> destination, ref bool consumed, ref int bytesTransferred); + partial void LinuxOnStopAndAbort(); + + internal abstract partial class AsyncOperation : IThreadPoolWorkItem { + private const int CancellationCallbackBatchSize = 64; + private static readonly ConcurrentQueue<AsyncOperation> s_cancellationCallbackQueue = new ConcurrentQueue<AsyncOperation>(); + private static readonly IThreadPoolWorkItem s_processCancellationCallbacks = new CancellationCallbackWorker(); + private static int s_cancellationCallbackWorkerQueued; + private enum State { Waiting = 0, @@ -120,6 +134,10 @@ private enum State } private volatile AsyncOperation.State _state; + private int _ioUringCompletionCallbackQueued; + // Defined in the shared Unix partial so operation constructors can compile + // for both Linux and non-Linux Unix TFMs; only Linux consumes the value. + private int _ioUringCompletionDispatchKind; #if DEBUG private bool _callbackQueued; // When true, the callback has been queued. @@ -133,6 +151,24 @@ private enum State } public ManualResetEventSlim? 
Event { get; set; } + protected enum IoUringCompletionDispatchKind : byte + { + Default = 0, + ReadOperation = 1, + WriteOperation = 2, + SendOperation = 3, + BufferListSendOperation = 4, + BufferMemoryReceiveOperation = 5, + BufferListReceiveOperation = 6, + ReceiveMessageFromOperation = 7, + AcceptOperation = 8, + ConnectOperation = 9 + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + protected void SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind kind) => + _ioUringCompletionDispatchKind = (int)kind; + public AsyncOperation(SocketAsyncContext context) { AssociatedContext = context; @@ -141,7 +177,9 @@ public AsyncOperation(SocketAsyncContext context) public void Reset() { + ResetIoUringState(); _state = State.Waiting; + _ioUringCompletionCallbackQueued = 0; Event = null; Next = this; #if DEBUG @@ -202,6 +240,16 @@ public OperationResult TryComplete(SocketAsyncContext context) } public bool TryCancel() + { + return TryCancelCore(requestIoUringCancellation: true); + } + + internal bool TryCancelForTeardown() + { + return TryCancelCore(requestIoUringCancellation: false); + } + + private bool TryCancelCore(bool requestIoUringCancellation) { Trace("Enter"); @@ -232,6 +280,9 @@ public bool TryCancel() return false; } + // Best effort: if completion-mode io_uring work was already submitted, request kernel-side cancellation now. + // Partial method: no-op on non-Linux; implemented in SocketAsyncContext.IoUring.Linux.cs. + LinuxRequestIoUringCancellationIfNeeded(requestIoUringCancellation); ProcessCancellation(); // Note, we leave the operation in the OperationQueue. @@ -245,6 +296,7 @@ public void ProcessCancellation() Debug.Assert(_state == State.Canceled); + LinuxUntrackIoUringOperation(); ErrorCode = SocketError.OperationAborted; ManualResetEventSlim? e = Event; @@ -261,10 +313,53 @@ public void ProcessCancellation() // we can't pool the object, as ProcessQueue may still have a reference to it, due to // using a pattern whereby it takes the lock to grab an item, but then releases the lock // to do further processing on the item that's still in the list. - ThreadPool.UnsafeQueueUserWorkItem(o => ((AsyncOperation)o!).InvokeCallback(allowPooling: false), this); + QueueCancellationCallback(this); + } + } + + private static void QueueCancellationCallback(AsyncOperation operation) + { + s_cancellationCallbackQueue.Enqueue(operation); + if (Interlocked.CompareExchange(ref s_cancellationCallbackWorkerQueued, 1, 0) == 0) + { + ThreadPool.UnsafeQueueUserWorkItem(s_processCancellationCallbacks, preferLocal: false); } } + private static void ProcessQueuedCancellationCallbacks() + { + while (true) + { + int processed = 0; + while (processed < CancellationCallbackBatchSize && + s_cancellationCallbackQueue.TryDequeue(out AsyncOperation? operation)) + { + operation.InvokeCallback(allowPooling: false); + processed++; + } + + if (s_cancellationCallbackQueue.IsEmpty) + { + Volatile.Write(ref s_cancellationCallbackWorkerQueued, 0); + if (s_cancellationCallbackQueue.IsEmpty || + Interlocked.CompareExchange(ref s_cancellationCallbackWorkerQueued, 1, 0) != 0) + { + return; + } + + continue; + } + + ThreadPool.UnsafeQueueUserWorkItem(s_processCancellationCallbacks, preferLocal: false); + return; + } + } + + private sealed class CancellationCallbackWorker : IThreadPoolWorkItem + { + void IThreadPoolWorkItem.Execute() => ProcessQueuedCancellationCallbacks(); + } + public void Dispatch() { ManualResetEventSlim? 
e = Event; @@ -288,6 +383,30 @@ public void Schedule() ThreadPool.UnsafeQueueUserWorkItem(this, preferLocal: false); } + internal void QueueIoUringCompletionCallback() + { + Debug.Assert(Event == null); + if (Interlocked.Exchange(ref _ioUringCompletionCallbackQueued, 1) != 0) + { + Debug.Fail("io_uring completion callback was already queued for this operation."); + return; + } + + ThreadPool.UnsafeQueueUserWorkItem(this, preferLocal: false); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool TryExecuteIoUringCompletionCallback() + { + if (Interlocked.Exchange(ref _ioUringCompletionCallbackQueued, 0) == 0) + { + return false; + } + + InvokeCallback(allowPooling: true); + return true; + } + public void Process() => ((IThreadPoolWorkItem)this).Execute(); void IThreadPoolWorkItem.Execute() @@ -305,17 +424,27 @@ void IThreadPoolWorkItem.Execute() // We could also add an abstract method that the base interface implementation // invokes, but that adds an extra virtual dispatch. Debug.Fail("Expected derived type to implement IThreadPoolWorkItem"); - throw new InvalidOperationException(); + ThrowExpectedDerivedTypeToImplementThreadPoolWorkItem(); } + [DoesNotReturn] + [StackTraceHidden] + private static void ThrowExpectedDerivedTypeToImplementThreadPoolWorkItem() => + throw new InvalidOperationException(); + // Called when op is not in the queue yet, so can't be otherwise executing public void DoAbort() { + LinuxUntrackIoUringOperation(); ErrorCode = SocketError.OperationAborted; } protected abstract bool DoTryComplete(SocketAsyncContext context); + partial void ResetIoUringState(); + partial void LinuxRequestIoUringCancellationIfNeeded(bool requestIoUringCancellation); + partial void LinuxUntrackIoUringOperation(); + public abstract void InvokeCallback(bool allowPooling); [Conditional("SOCKETASYNCCONTEXT_TRACE")] @@ -333,36 +462,74 @@ public void TraceWithContext(SocketAsyncContext context, string message, [Caller // These two abstract classes differentiate the operations that go in the // read queue vs the ones that go in the write queue. - private abstract class ReadOperation : AsyncOperation, IThreadPoolWorkItem + internal abstract partial class ReadOperation : AsyncOperation, IThreadPoolWorkItem { - public ReadOperation(SocketAsyncContext context) : base(context) { } + public ReadOperation(SocketAsyncContext context) : base(context) + { + SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.ReadOperation); + } + + void IThreadPoolWorkItem.Execute() + { + if (TryExecuteIoUringCompletionCallback()) + { + return; + } - void IThreadPoolWorkItem.Execute() => AssociatedContext.ProcessAsyncReadOperation(this); + AssociatedContext.ProcessAsyncReadOperation(this); + } } - private abstract class WriteOperation : AsyncOperation, IThreadPoolWorkItem + private static bool ShouldDispatchCompletionCallback(AsyncOperation operation) { - public WriteOperation(SocketAsyncContext context) : base(context) { } + if (operation is ConnectOperation connectOperation) + { + // Connect can hand callback ownership to a follow-up send operation; + // dispatch here only when connect still owns the callback. 
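+ // (The connect completion paths clear Buffer on every terminal outcome and null out
+ // Callback only when ownership moves to a pending follow-up send, so this test
+ // dispatches the callback exactly once per logical connect.)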
+ return connectOperation.Buffer.Length == 0 && connectOperation.Callback is not null; + } - void IThreadPoolWorkItem.Execute() => AssociatedContext.ProcessAsyncWriteOperation(this); + return true; } - private abstract class SendOperation : WriteOperation + private abstract partial class WriteOperation : AsyncOperation, IThreadPoolWorkItem + { + public WriteOperation(SocketAsyncContext context) : base(context) + { + SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.WriteOperation); + } + + void IThreadPoolWorkItem.Execute() + { + if (TryExecuteIoUringCompletionCallback()) + { + return; + } + + AssociatedContext.ProcessAsyncWriteOperation(this); + } + } + + private abstract partial class SendOperation : WriteOperation { public SocketFlags Flags; public int BytesTransferred; public int Offset; public int Count; - public SendOperation(SocketAsyncContext context) : base(context) { } + public SendOperation(SocketAsyncContext context) : base(context) + { + SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.SendOperation); + } public Action<int, Memory<byte>, SocketFlags, SocketError>? Callback { get; set; } public override void InvokeCallback(bool allowPooling) => Callback!(BytesTransferred, SocketAddress, SocketFlags.None, ErrorCode); + } - private class BufferMemorySendOperation : SendOperation + private partial class BufferMemorySendOperation : SendOperation { public Memory<byte> Buffer; @@ -390,18 +557,27 @@ public override void InvokeCallback(bool allowPooling) } } - private sealed class BufferListSendOperation : SendOperation + private sealed partial class BufferListSendOperation : SendOperation { public IList<ArraySegment<byte>>? Buffers; public int BufferIndex; - public BufferListSendOperation(SocketAsyncContext context) : base(context) { } + public BufferListSendOperation(SocketAsyncContext context) : base(context) + { + SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.BufferListSendOperation); + } protected override bool DoTryComplete(SocketAsyncContext context) { return SocketPal.TryCompleteSendTo(context._socket, default(ReadOnlySpan<byte>), Buffers, ref BufferIndex, ref Offset, ref Count, Flags, SocketAddress.Span, ref BytesTransferred, out ErrorCode); } + internal void SetBufferPosition(int bufferIndex, int offset) + { + BufferIndex = bufferIndex; + Offset = offset; + } + public override void InvokeCallback(bool allowPooling) { var cb = Callback!; @@ -446,15 +622,31 @@ public override void InvokeCallback(bool allowPooling) => Callback!(BytesTransferred, SocketAddress, ReceivedFlags, ErrorCode); } - private sealed class BufferMemoryReceiveOperation : ReceiveOperation + private sealed partial class BufferMemoryReceiveOperation : ReceiveOperation { public Memory<byte> Buffer; public bool SetReceivedFlags; - public BufferMemoryReceiveOperation(SocketAsyncContext context) : base(context) { } + public BufferMemoryReceiveOperation(SocketAsyncContext context) : base(context) + { + SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.BufferMemoryReceiveOperation); + } protected override bool DoTryComplete(SocketAsyncContext context) { + bool consumedBufferedData = false; + int bufferedBytes = 0; + context.LinuxTryConsumeBufferedPersistentMultishotRecvData(Buffer, ref consumedBufferedData, ref bufferedBytes); + if (!SetReceivedFlags && + SocketAddress.Length == 0 && + consumedBufferedData) + { + BytesTransferred = bufferedBytes; + ReceivedFlags = SocketFlags.None; + ErrorCode = SocketError.Success; + return true; + } + // Zero byte read is performed to know when data is available. 
// We don't have to call receive, our caller is interested in the event. if (Buffer.Length == 0 && Flags == SocketFlags.None && SocketAddress.Length == 0) @@ -502,11 +694,14 @@ public override void InvokeCallback(bool allowPooling) } } - private sealed class BufferListReceiveOperation : ReceiveOperation + private sealed partial class BufferListReceiveOperation : ReceiveOperation { public IList<ArraySegment<byte>>? Buffers; - public BufferListReceiveOperation(SocketAsyncContext context) : base(context) { } + public BufferListReceiveOperation(SocketAsyncContext context) : base(context) + { + SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.BufferListReceiveOperation); + } protected override bool DoTryComplete(SocketAsyncContext context) { @@ -553,7 +748,7 @@ protected override bool DoTryComplete(SocketAsyncContext context) } } - private sealed class ReceiveMessageFromOperation : ReadOperation + private sealed partial class ReceiveMessageFromOperation : ReadOperation { public Memory<byte> Buffer; public SocketFlags Flags; @@ -565,7 +760,10 @@ private sealed class ReceiveMessageFromOperation : ReadOperation public bool IsIPv6; public IPPacketInformation IPPacketInformation; - public ReceiveMessageFromOperation(SocketAsyncContext context) : base(context) { } + public ReceiveMessageFromOperation(SocketAsyncContext context) : base(context) + { + SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.ReceiveMessageFromOperation); + } public Action<int, Memory<byte>, SocketFlags, IPPacketInformation, SocketError>? Callback { get; set; } @@ -613,21 +811,33 @@ public override void InvokeCallback(bool allowPooling) => Callback!(BytesTransferred, SocketAddress, ReceivedFlags, IPPacketInformation, ErrorCode); } - private sealed class AcceptOperation : ReadOperation + internal sealed partial class AcceptOperation : ReadOperation { public IntPtr AcceptedFileDescriptor; + public int AcceptSocketAddressLength; - public AcceptOperation(SocketAsyncContext context) : base(context) { } + public AcceptOperation(SocketAsyncContext context) : base(context) + { + SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.AcceptOperation); + } public Action<IntPtr, Memory<byte>, SocketError>? 
Callback { get; set; } protected override bool DoTryComplete(SocketAsyncContext context) { + bool dequeuedPreAcceptedConnection = false; + context.LinuxTryDequeuePreAcceptedConnection(this, ref dequeuedPreAcceptedConnection); + if (dequeuedPreAcceptedConnection) + { + return true; + } + bool completed = SocketPal.TryCompleteAccept(context._socket, SocketAddress, out int socketAddressLen, out AcceptedFileDescriptor, out ErrorCode); + AcceptSocketAddressLength = socketAddressLen; Debug.Assert(ErrorCode == SocketError.Success || AcceptedFileDescriptor == (IntPtr)(-1), $"Unexpected values: ErrorCode={ErrorCode}, AcceptedFileDescriptor={AcceptedFileDescriptor}"); if (ErrorCode == SocketError.Success) { - SocketAddress = SocketAddress.Slice(0, socketAddressLen); + SocketAddress = SocketAddress.Slice(0, AcceptSocketAddressLength); } return completed; } @@ -648,21 +858,49 @@ public override void InvokeCallback(bool allowPooling) } } - private sealed class ConnectOperation : BufferMemorySendOperation + private sealed partial class ConnectOperation : BufferMemorySendOperation { - public ConnectOperation(SocketAsyncContext context) : base(context) { } + public ConnectOperation(SocketAsyncContext context) : base(context) + { + SetIoUringCompletionDispatchKind(IoUringCompletionDispatchKind.ConnectOperation); + } protected override bool DoTryComplete(SocketAsyncContext context) { bool result = SocketPal.TryCompleteConnect(context._socket, out ErrorCode); context._socket.RegisterConnectResult(ErrorCode); - if (result && ErrorCode == SocketError.Success && Buffer.Length > 0) + if (result && Buffer.Length > 0) { - SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, Memory<byte>.Empty, ref BytesTransferred, Callback!, default); - if (error != SocketError.Success && error != SocketError.IOPending) + if (ErrorCode == SocketError.Success) { - context._socket.RegisterConnectResult(ErrorCode); + Action<int, Memory<byte>, SocketFlags, SocketError>? callback = Callback; + Debug.Assert(callback != null); + SocketError error = context.SendToAsync(Buffer, 0, Buffer.Length, SocketFlags.None, Memory<byte>.Empty, ref BytesTransferred, callback!, default); + if (error == SocketError.IOPending) + { + // Callback ownership moved to the async send operation. + Callback = null; + Buffer = default; + } + else + { + if (error != SocketError.Success) + { + ErrorCode = error; + context._socket.RegisterConnectResult(ErrorCode); + } + + // Follow-up send completed synchronously (success/error), so invoke + // Connect callback from this operation path. + Buffer = default; + } + } + else + { + // Connect failed; no follow-up send will occur. + // Clear the buffer so callback dispatch is not suppressed. + Buffer = default; } } return result; @@ -670,17 +908,18 @@ protected override bool DoTryComplete(SocketAsyncContext context) public override void InvokeCallback(bool allowPooling) { - var cb = Callback!; + Action<int, Memory<byte>, SocketFlags, SocketError>? cb = Callback; int bt = BytesTransferred; Memory<byte> sa = SocketAddress; SocketError ec = ErrorCode; Memory<byte> buffer = Buffer; - if (buffer.Length == 0) + if (cb != null && (buffer.Length == 0 || ec == SocketError.OperationAborted)) { // Invoke callback only when we are completely done. // In case data were provided for Connect we may or may not send them all. - // If we did not we will need follow-up with Send operation + // If we did not, we will need a follow-up Send operation. + // On cancellation, always invoke; the send was never started. 
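+ // Dispatch matrix: cb non-null with an empty buffer covers plain connects and
+ // synchronously completed follow-up sends; OperationAborted covers cancellation
+ // before any send started; a pending send owns the callback, leaving cb null here.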
cb(bt, sa, SocketFlags.None, ec); } } @@ -890,6 +1129,9 @@ public bool StartAsyncOperation(SocketAsyncContext context, TOperation operation operation.CancellationRegistration = cancellationToken.UnsafeRegister(s => ((TOperation)s!).TryCancel(), operation); } + // Completion-mode staging: partial method is no-op on non-Linux. + LinuxTryStageIoUringOperation(operation); + return true; case QueueState.Stopped: @@ -898,7 +1140,7 @@ public bool StartAsyncOperation(SocketAsyncContext context, TOperation operation break; default: - Environment.FailFast("unexpected queue state"); + FailFastUnexpectedQueueState(_state); break; } } @@ -939,7 +1181,7 @@ static void HandleFailedRegistration(SocketAsyncContext context, TOperation oper } else { - throw new InternalException(error); + ThrowInternalException(error); } } } @@ -986,7 +1228,7 @@ static void HandleFailedRegistration(SocketAsyncContext context, TOperation oper return null; default: - Environment.FailFast("unexpected queue state"); + FailFastUnexpectedQueueState(_state); return null; } } @@ -1022,7 +1264,10 @@ internal void ProcessAsyncOperation(TOperation op) // request for a previous operation could affect a subsequent one) // and here we know the operation has completed. op.CancellationRegistration.Dispose(); - op.InvokeCallback(allowPooling: true); + if (ShouldDispatchCompletionCallback(op)) + { + op.InvokeCallback(allowPooling: true); + } } } @@ -1129,6 +1374,59 @@ public OperationResult ProcessQueuedOperation(TOperation op) return result; } + public bool TryRemoveCompletedOperation(SocketAsyncContext context, TOperation operation) + { + using (Lock()) + { + if (_tail == null || _state == QueueState.Stopped) + { + return false; + } + + AsyncOperation? previous = _tail; + AsyncOperation? current = _tail.Next; + while (!ReferenceEquals(current, operation)) + { + if (ReferenceEquals(current, _tail)) + { + return false; + } + + previous = current; + current = current!.Next; + } + + Debug.Assert(previous != null && current != null); + bool removedHead = ReferenceEquals(current, _tail.Next); + bool removedTail = ReferenceEquals(current, _tail); + + if (removedHead && removedTail) + { + _tail = null; + _isNextOperationSynchronous = false; + _state = QueueState.Ready; + _sequenceNumber++; + Trace(context, $"Removed completed {IdOf(operation)} (queue empty)"); + return true; + } + + previous!.Next = current!.Next; + if (removedTail) + { + _tail = (TOperation)previous; + } + + if (removedHead) + { + Debug.Assert(_tail != null); + _isNextOperationSynchronous = _tail.Next.Event != null; + } + + Trace(context, $"Removed completed {IdOf(operation)}"); + return true; + } + } + public void CancelAndContinueProcessing(TOperation op) { // Note, only sync operations use this method. @@ -1244,6 +1542,17 @@ public bool StopAndAbort(SocketAsyncContext context) return aborted; } + [DoesNotReturn] + [StackTraceHidden] + private static void ThrowInternalException(Interop.Error error) => + throw new InternalException(error); + + [DoesNotReturn] + [StackTraceHidden] + [MethodImpl(MethodImplOptions.NoInlining)] + private static void FailFastUnexpectedQueueState(QueueState state) => + Environment.FailFast($"unexpected queue state: {state}"); + [Conditional("SOCKETASYNCCONTEXT_TRACE")] public void Trace(SocketAsyncContext context, string message, [CallerMemberName] string? 
memberName = null) { @@ -1328,6 +1637,7 @@ public bool StopAndAbort() // Drain queues aborted |= _sendQueue.StopAndAbort(this); aborted |= _receiveQueue.StopAndAbort(this); + LinuxOnStopAndAbort(); // We don't need to synchronize with Register. // This method is called when the handle gets released. @@ -1360,7 +1670,7 @@ public void SetHandleNonBlocking() { if (Interop.Sys.Fcntl.SetIsNonBlocking(_socket, 1) != 0) { - throw new SocketException((int)SocketPal.GetSocketErrorForErrorCode(Interop.Sys.GetLastError())); + ThrowSocketExceptionFromLastError(); } _isHandleNonBlocking = true; @@ -1369,11 +1679,36 @@ public bool IsHandleNonBlocking => _isHandleNonBlocking; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ThrowIfThreadsAreNotSupported() + { + if (!Socket.OSSupportsThreads) + { + ThrowPlatformNotSupportedForMissingThreadSupport(); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ValidateSyncOperationPreconditions(int timeout) + { + ThrowIfThreadsAreNotSupported(); + Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + } + + [DoesNotReturn] + [StackTraceHidden] + private static void ThrowPlatformNotSupportedForMissingThreadSupport() => + throw new PlatformNotSupportedException(); + + [DoesNotReturn] + [StackTraceHidden] + private static void ThrowSocketExceptionFromLastError() => + throw new SocketException((int)SocketPal.GetSocketErrorForErrorCode(Interop.Sys.GetLastError())); + private void PerformSyncOperation<TOperation>(ref OperationQueue<TOperation> queue, TOperation operation, int timeout, int observedSequenceNumber) where TOperation : AsyncOperation { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); using (var e = new ManualResetEventSlim(false, 0)) { @@ -1509,7 +1844,7 @@ public SocketError AcceptAsync(Memory<byte> socketAddress, out int socketAddress public SocketError Connect(Memory<byte> socketAddress) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); + ThrowIfThreadsAreNotSupported(); Debug.Assert(socketAddress.Length > 0, $"Unexpected socketAddressLen: {socketAddress.Length}"); // Connect is different than the usual "readiness" pattern of other operations. 
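// Reviewer note (illustrative sketch, not part of the patch): the Throw*/Validate*
// helpers above follow the cold-path throw-helper pattern: keep the guard tiny and
// inlineable, and move the throw into a non-returning method so the JIT does not
// inline exception construction into every call site. Standalone form (GuardSketch
// is a hypothetical name):
//
//   using System;
//   using System.Diagnostics;
//   using System.Diagnostics.CodeAnalysis;
//   using System.Runtime.CompilerServices;
//
//   internal static class GuardSketch
//   {
//       [MethodImpl(MethodImplOptions.AggressiveInlining)]
//       public static void EnsureThreadsSupported(bool osSupportsThreads)
//       {
//           if (!osSupportsThreads)
//           {
//               ThrowPlatformNotSupported(); // cold path kept out of the inlined guard
//           }
//       }
//
//       [DoesNotReturn]
//       [StackTraceHidden]
//       private static void ThrowPlatformNotSupported() =>
//           throw new PlatformNotSupportedException();
//   }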
@@ -1603,9 +1938,7 @@ public SocketError ReceiveAsync(Memory<byte> buffer, SocketFlags flags, out int public SocketError ReceiveFrom(Memory<byte> buffer, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, int timeout, out int bytesReceived) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); SocketFlags receivedFlags; SocketError errorCode; @@ -1636,7 +1969,7 @@ public SocketError ReceiveFrom(Memory<byte> buffer, ref SocketFlags flags, Memor public unsafe SocketError ReceiveFrom(Span<byte> buffer, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, int timeout, out int bytesReceived) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); + ValidateSyncOperationPreconditions(timeout); SocketFlags receivedFlags; SocketError errorCode; @@ -1748,9 +2081,7 @@ public SocketError ReceiveAsync(IList<ArraySegment<byte>> buffers, SocketFlags f public SocketError ReceiveFrom(IList<ArraySegment<byte>> buffers, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, int timeout, out int bytesReceived) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); SocketFlags receivedFlags; SocketError errorCode; @@ -1817,9 +2148,7 @@ public SocketError ReceiveFromAsync(IList<ArraySegment<byte>> buffers, SocketFla public SocketError ReceiveMessageFrom( Memory<byte> buffer, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, bool isIPv4, bool isIPv6, int timeout, out IPPacketInformation ipPacketInformation, out int bytesReceived) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); SocketFlags receivedFlags; SocketError errorCode; @@ -1854,9 +2183,7 @@ public SocketError ReceiveMessageFrom( public unsafe SocketError ReceiveMessageFrom( Span<byte> buffer, ref SocketFlags flags, Memory<byte> socketAddress, out int socketAddressLen, bool isIPv4, bool isIPv6, int timeout, out IPPacketInformation ipPacketInformation, out int bytesReceived) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); SocketFlags receivedFlags; SocketError errorCode; @@ -1946,9 +2273,7 @@ public SocketError SendAsync(Memory<byte> buffer, int offset, int count, SocketF public SocketError SendTo(byte[] buffer, int offset, int count, SocketFlags flags, Memory<byte> socketAddress, int timeout, out int bytesSent) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); bytesSent = 0; SocketError errorCode; @@ -1978,9 +2303,7 @@ public SocketError SendTo(byte[] buffer, int offset, int count, SocketFlags flag public unsafe SocketError SendTo(ReadOnlySpan<byte> buffer, SocketFlags flags, Memory<byte> socketAddress, int timeout, out int bytesSent) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); bytesSent = 0; SocketError errorCode; @@ -2057,9 +2380,7 @@ public SocketError 
SendAsync(IList<ArraySegment<byte>> buffers, SocketFlags flag public SocketError SendTo(IList<ArraySegment<byte>> buffers, SocketFlags flags, Memory<byte> socketAddress, int timeout, out int bytesSent) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); bytesSent = 0; int bufferIndex = 0; @@ -2127,9 +2448,7 @@ public SocketError SendToAsync(IList<ArraySegment<byte>> buffers, SocketFlags fl public SocketError SendFile(SafeFileHandle fileHandle, long offset, long count, int timeout, out long bytesSent) { - if (!Socket.OSSupportsThreads) throw new PlatformNotSupportedException(); - - Debug.Assert(timeout == -1 || timeout > 0, $"Unexpected timeout: {timeout}"); + ValidateSyncOperationPreconditions(timeout); bytesSent = 0; SocketError errorCode; diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringCompletionDispatch.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringCompletionDispatch.Linux.cs new file mode 100644 index 00000000000000..1baf5c67b8e6b6 --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringCompletionDispatch.Linux.cs @@ -0,0 +1,684 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace System.Net.Sockets +{ + internal sealed unsafe partial class SocketAsyncEngine + { + private readonly partial struct SocketEventHandler + { + /// Delivers a completed operation to its owning socket context. + private void DispatchCompletedIoUringOperation(SocketAsyncContext.AsyncOperation operation, ulong userData) + { + if (!operation.AssociatedContext.TryCompleteIoUringOperation(operation)) + { + _engine.RecordBenignLateIoUringCompletion(userData); + } + } + + /// Completes a deferred SEND_ZC operation when its NOTIF CQE arrives. + public void DispatchZeroCopyIoUringNotification(ulong payload) + { + ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload); + if (!_engine.TryTakeTrackedIoUringOperation(userData, out SocketAsyncContext.AsyncOperation? operation) || operation is null) + { + return; + } + + Debug.Assert( + !_engine.IsZeroCopyNotificationPending(userData), + "NOTIF CQE dispatch must occur only after clearing SEND_ZC pending slot state."); + Debug.Assert( + operation.IoUringUserData == userData, + "Deferred SEND_ZC operation must still be tracked with its original user_data at NOTIF dispatch."); + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Submitted, + IoUringOperationLifecycleState.Completed); + operation.ClearIoUringUserData(); + DispatchCompletedIoUringOperation(operation, userData); + } + + /// Processes a single completion and dispatches it to its owning operation. 
+ public void DispatchSingleIoUringCompletion( + ulong userData, + int result, + uint flags, + int socketAddressLen, + int controlBufferLen, + uint auxiliaryData, + bool hasFixedRecvBuffer, + ushort fixedRecvBufferId, + ref bool enqueuedFallbackEvent) + { + Debug.Assert(_engine.IsCurrentThreadEventLoopThread(), + "DispatchSingleIoUringCompletion must only run on the event-loop thread."); + if (userData == 0) + { + RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId); + return; + } + + // Benign race: cancellation/abort paths may have already removed this tracked entry. + if (!_engine.TryTakeTrackedIoUringOperation(userData, out SocketAsyncContext.AsyncOperation? operation)) + { + RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId); + _engine.RecordBenignLateIoUringCompletion(userData); + return; + } + + if (operation is null) + { + RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer, fixedRecvBufferId); + return; + } + + SocketAsyncContext receiveContext = operation.AssociatedContext; + if (receiveContext.IsPersistentMultishotRecvArmed() && + receiveContext.PersistentMultishotRecvUserData == userData) + { + // Terminal CQE for persistent multishot recv (normal completion, cancel, + // ENOBUFS, EOF, or other error): clear armed-state so the next receive can re-arm. + SocketsTelemetry.Log.IoUringPersistentMultishotRecvTermination(); + receiveContext.ClearPersistentMultishotRecvArmed(); + } + + if (operation is SocketAsyncContext.AcceptOperation acceptOperation && + acceptOperation.AssociatedContext.MultishotAcceptUserData == userData) + { + acceptOperation.AssociatedContext.DisarmMultishotAccept(); + } + + uint completionAuxiliaryData = auxiliaryData; + int completionResultCode = result; + if (!TryMaterializeIoUringReceiveCompletion( + operation!, + completionResultCode, + flags, + hasFixedRecvBuffer, + fixedRecvBufferId, + ref completionAuxiliaryData)) + { + completionResultCode = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.ENOBUFS); + completionAuxiliaryData = 0; + } + + // Process completion metadata before processing result to allow message post-processing. + operation!.SetIoUringCompletionMessageMetadata(socketAddressLen, controlBufferLen); + SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult = + operation.ProcessIoUringCompletionResult(completionResultCode, flags, completionAuxiliaryData); + + if (completionDispatchResult == SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed && + _engine.IsZeroCopyNotificationPending(userData)) + { + // SEND_ZC API contract: complete managed operation only once NOTIF confirms + // the kernel/NIC no longer references the caller buffer. 
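+ // (io_uring posts two CQEs for IORING_OP_SEND_ZC: the first carries the byte
+ // count with IORING_CQE_F_MORE set, and a later notification CQE carries
+ // IORING_CQE_F_NOTIF once the kernel has released the user buffer.)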
+ _engine.AssertZeroCopyDeferredCompletionState(userData, operation); + if (!_engine.TryReattachTrackedIoUringOperation(userData, operation)) + { + Debug.Fail("SEND_ZC deferred completion reattach failed; completing operation with EINVAL and releasing deferred slot."); + bool cleanedDeferredSlot = _engine.TryCleanupDeferredZeroCopyCompletionSlot(userData); + Debug.Assert( + cleanedDeferredSlot, + "SEND_ZC deferred completion reattach failure should release the deferred completion slot."); + operation.ErrorCode = SocketPal.GetSocketErrorForErrorCode(Interop.Error.EINVAL); + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Submitted, + IoUringOperationLifecycleState.Completed); + operation.ClearIoUringUserData(); + DispatchCompletedIoUringOperation(operation, userData); + return; + } + + return; + } + + DispatchIoUringCompletionResult( + operation, + completionDispatchResult, + userData, + ref enqueuedFallbackEvent); + } + + /// + /// Processes a multishot completion by completing the current operation and + /// requesting async cancel for non-terminal shots until full item-9 dispatch lands. + /// + [MethodImpl(MethodImplOptions.NoInlining)] + public void DispatchMultishotIoUringCompletion( + ulong userData, + int result, + uint flags, + int socketAddressLen, + int controlBufferLen, + uint auxiliaryData, + bool hasFixedRecvBuffer, + ushort fixedRecvBufferId, + ref bool enqueuedFallbackEvent) + { + Debug.Assert(_engine.IsCurrentThreadEventLoopThread(), + "DispatchMultishotIoUringCompletion must only run on the event-loop thread."); + _ = enqueuedFallbackEvent; // Transitional path never requeues via readiness fallback. + _ = hasFixedRecvBuffer; + _ = fixedRecvBufferId; + Debug.Assert((flags & IoUringConstants.CqeFMore) != 0, + "Multishot dispatch must only be used for non-terminal CQEs (IORING_CQE_F_MORE)."); + + if (userData == 0) + { + RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer: false, fixedRecvBufferId: 0); + return; + } + + if (!_engine.TryGetTrackedIoUringOperation(userData, out SocketAsyncContext.AsyncOperation? 
operation) || operation is null) + { + RecycleUntrackedReceiveCompletionBuffers(flags, hasFixedRecvBuffer: false, fixedRecvBufferId: 0); + _engine.RecordBenignLateIoUringCompletion(userData); + return; + } + + if (operation is SocketAsyncContext.AcceptOperation acceptOperation) + { + DispatchMultishotAcceptIoUringCompletion( + acceptOperation, + userData, + result, + flags, + socketAddressLen, + auxiliaryData); + return; + } + + if (!operation.IsInWaitingState()) + { + if (!TryBufferEarlyPersistentMultishotRecvCompletion(operation.AssociatedContext, result, flags)) + { + _engine.TryRequestIoUringCancellation(userData); + } + + return; + } + + uint completionAuxiliaryData = auxiliaryData; + int completionResultCode = result; + if (!TryMaterializeIoUringReceiveCompletion( + operation, + completionResultCode, + flags, + hasFixedRecvBuffer: false, + fixedRecvBufferId: 0, + ref completionAuxiliaryData)) + { + completionResultCode = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.ENOBUFS); + completionAuxiliaryData = 0; + } + + operation.SetIoUringCompletionMessageMetadata(socketAddressLen, controlBufferLen); + SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult = + operation.ProcessIoUringCompletionResult(completionResultCode, flags, completionAuxiliaryData); + + SocketAsyncContext context = operation.AssociatedContext; + bool isPersistentMultishotRecv = + context.IsPersistentMultishotRecvArmed() && + context.PersistentMultishotRecvUserData == userData; + + // Transitional multishot model cancels after the first shot. + // Persistent multishot receive remains armed and rebinds future operations via TryReplace. + if (!isPersistentMultishotRecv) + { + _engine.TryRequestIoUringCancellation(userData); + } + + switch (completionDispatchResult) + { + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed: + DispatchCompletedIoUringOperation(operation, userData); + break; + + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending: + // Transitional multishot mode does not requeue intermediate shots. + // Cancellation is already requested above; terminal CQE cleanup path + // remains responsible for tracked-state/resource release. + break; + + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled: + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored: + break; + + default: + Debug.Fail($"Unexpected io_uring multishot completion result: {completionDispatchResult}"); + break; + } + } + + /// + /// Handles transitional multishot-accept CQEs by completing one waiting operation and + /// canceling the multishot request. Extra successful shots are queued for dequeue on + /// the accept operation queue when possible. 
+ /// + [MethodImpl(MethodImplOptions.NoInlining)] + private void DispatchMultishotAcceptIoUringCompletion( + SocketAsyncContext.AcceptOperation operation, + ulong userData, + int result, + uint flags, + int socketAddressLen, + uint auxiliaryData) + { + Debug.Assert(_engine.IsCurrentThreadEventLoopThread(), + "DispatchMultishotAcceptIoUringCompletion must only run on the event-loop thread."); + operation.SetIoUringCompletionMessageMetadata(socketAddressLen, 0); + SocketAsyncContext context = operation.AssociatedContext; + SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionDispatchResult = + operation.ProcessIoUringCompletionResult(result, flags, auxiliaryData); + + // Transitional multishot-accept model: complete one managed accept and then + // issue async-cancel so terminal cleanup runs through single-shot dispatch. + _engine.TryRequestIoUringCancellation(userData); + + switch (completionDispatchResult) + { + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed: + DispatchCompletedIoUringOperation(operation, userData); + break; + + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending: + break; + + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled: + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored: + if (result >= 0) + { + int addressLength = auxiliaryData > (uint)operation.SocketAddress.Length ? + operation.SocketAddress.Length : + (int)auxiliaryData; + if (context.TryEnqueuePreAcceptedConnection((IntPtr)result, operation.SocketAddress.Span, addressLength)) + { + _engine.EnqueueReadinessFallbackEvent(context, Interop.Sys.SocketEvents.Read); + } + else + { + Interop.Sys.Close((IntPtr)result); + } + } + break; + + default: + Debug.Fail($"Unexpected io_uring multishot accept completion result: {completionDispatchResult}"); + break; + } + } + + /// + /// For receive completions that used provided buffers (buffer-select or fixed receive), + /// materializes payload bytes into the operation target and recycles checked-out buffers. + /// + private unsafe bool TryMaterializeIoUringReceiveCompletion( + SocketAsyncContext.AsyncOperation operation, + int result, + uint flags, + bool hasFixedRecvBuffer, + ushort fixedRecvBufferId, + ref uint auxiliaryData) + { + bool hasSelectedBuffer = (flags & IoUringConstants.CqeFBuffer) != 0; + if (!hasFixedRecvBuffer && !hasSelectedBuffer) + { + return true; + } + + IoUringProvidedBufferRing? 
providedBufferRing = _engine._ioUringProvidedBufferRing;
+ if (providedBufferRing is null)
+ {
+ return false;
+ }
+
+ ushort bufferId;
+ bool reportRecycleFailureAsDepletion;
+ byte* providedBuffer = null;
+ int providedBufferLength = 0;
+ if (hasFixedRecvBuffer)
+ {
+ bufferId = fixedRecvBufferId;
+ reportRecycleFailureAsDepletion = true;
+
+ if (result > 0 &&
+ !providedBufferRing.TryGetCheckedOutBuffer(
+ bufferId,
+ out providedBuffer,
+ out providedBufferLength))
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ return false;
+ }
+ }
+ else
+ {
+ bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+ reportRecycleFailureAsDepletion = false;
+ if (!providedBufferRing.TryAcquireBufferForCompletion(
+ bufferId,
+ out providedBuffer,
+ out providedBufferLength))
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ return false;
+ }
+ }
+
+ bool handled = result <= 0;
+ try
+ {
+ if (result > 0)
+ {
+ handled =
+ operation.TryProcessIoUringProvidedBufferCompletion(
+ providedBuffer,
+ providedBufferLength,
+ result,
+ ref auxiliaryData);
+ }
+
+ RecordProvidedBufferUtilizationIfEnabled(providedBufferRing, result);
+ }
+ finally
+ {
+ handled &= TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ bufferId,
+ reportFailureAsDepletion: reportRecycleFailureAsDepletion);
+ }
+
+ return handled;
+ }
+
+ /// <summary>
+ /// For persistent multishot recv, buffers payload bytes that arrive while no
+ /// managed receive operation is in the Waiting state.
+ /// </summary>
+ private unsafe bool TryBufferEarlyPersistentMultishotRecvCompletion(
+ SocketAsyncContext context,
+ int result,
+ uint flags)
+ {
+ if (result <= 0)
+ {
+ return true;
+ }
+
+ if ((flags & IoUringConstants.CqeFBuffer) == 0)
+ {
+ return false;
+ }
+
+ IoUringProvidedBufferRing? providedBufferRing = _engine._ioUringProvidedBufferRing;
+ if (providedBufferRing is null)
+ {
+ return false;
+ }
+
+ ushort bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift);
+ if (!providedBufferRing.TryAcquireBufferForCompletion(
+ bufferId,
+ out byte* providedBuffer,
+ out int providedBufferLength))
+ {
+ _engine.RecordIoUringProvidedBufferDepletionForDrainBatch();
+ return false;
+ }
+
+ bool buffered = false;
+ try
+ {
+ if ((uint)result <= (uint)providedBufferLength)
+ {
+ buffered = context.TryBufferEarlyPersistentMultishotRecvData(
+ new ReadOnlySpan<byte>(providedBuffer, result));
+ if (buffered)
+ {
+ RecordProvidedBufferUtilizationIfEnabled(providedBufferRing, result);
+ _engine.RecordIoUringPersistentMultishotRecvEarlyDataForDrainBatch();
+ }
+ }
+ }
+ finally
+ {
+ buffered &= TryRecycleProvidedBufferFromCheckedOutState(
+ providedBufferRing,
+ bufferId,
+ reportFailureAsDepletion: false);
+ }
+
+ return buffered;
+ }
+
+ /// <summary>
+ /// Recycles a provided-buffer selection for completions that can no longer be
+ /// dispatched to a tracked operation (e.g., late multishot CQEs after cancel).
+ /// </summary>
+ private unsafe void RecycleUntrackedReceiveCompletionBuffers(
+ uint flags,
+ bool hasFixedRecvBuffer,
+ ushort fixedRecvBufferId)
+ {
+ IoUringProvidedBufferRing?
providedBufferRing = _engine._ioUringProvidedBufferRing; + if (providedBufferRing is null) + { + return; + } + + if ((flags & IoUringConstants.CqeFBuffer) == 0) + { + if (hasFixedRecvBuffer) + { + _ = TryRecycleProvidedBufferFromCheckedOutState( + providedBufferRing, + fixedRecvBufferId, + reportFailureAsDepletion: true); + } + + return; + } + + ushort bufferId = (ushort)(flags >> IoUringConstants.CqeBufferShift); + if (!providedBufferRing.TryAcquireBufferForCompletion( + bufferId, + out _, + out _)) + { + _engine.RecordIoUringProvidedBufferDepletionForDrainBatch(); + } + else + { + _ = TryRecycleProvidedBufferFromCheckedOutState( + providedBufferRing, + bufferId, + reportFailureAsDepletion: false); + } + + if (hasFixedRecvBuffer) + { + _ = TryRecycleProvidedBufferFromCheckedOutState( + providedBufferRing, + fixedRecvBufferId, + reportFailureAsDepletion: true); + } + } + + private void RecordProvidedBufferUtilizationIfEnabled( + IoUringProvidedBufferRing providedBufferRing, + int bytesTransferred) + { + if (bytesTransferred <= 0 || !_engine._adaptiveBufferSizingEnabled) + { + return; + } + + Debug.Assert(_engine.IsCurrentThreadEventLoopThread(), + "Adaptive provided-buffer utilization tracking must run on the event-loop thread."); + providedBufferRing.RecordCompletionUtilization(bytesTransferred); + } + + private bool TryRecycleProvidedBufferFromCheckedOutState( + IoUringProvidedBufferRing providedBufferRing, + ushort bufferId, + bool reportFailureAsDepletion) + { + bool recycled = providedBufferRing.TryRecycleBufferFromCompletion(bufferId); + if (recycled) + { + _engine.RecordIoUringProvidedBufferRecycleForDrainBatch(); + } + else if (reportFailureAsDepletion) + { + _engine.RecordIoUringProvidedBufferDepletionForDrainBatch(); + } + + return recycled; + } + + /// Requeues a pending operation or falls back to readiness notification. 
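+ /// Escalation order, per the body below: inline re-prepare on the event-loop
+ /// thread (completion mode only), then the prepare queue, then a readiness
+ /// fallback event; only the readiness-fallback path returns true.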
+ private bool DispatchPendingIoUringOperation(SocketAsyncContext.AsyncOperation operation, ulong userData)
+ {
+ PendingIoUringReprepareResult inlineReprepareResult = TryDispatchPendingIoUringOperationInline(operation);
+ if (inlineReprepareResult == PendingIoUringReprepareResult.Prepared)
+ {
+ return false;
+ }
+
+ if (inlineReprepareResult == PendingIoUringReprepareResult.NotAttempted &&
+ operation.TryQueueIoUringPreparation())
+ {
+ _engine.RecordIoUringPendingRetryQueuedToPrepareQueue();
+ return false;
+ }
+
+ Debug.Assert(
+ inlineReprepareResult == PendingIoUringReprepareResult.Failed ||
+ !_engine._ioUringCapabilities.IsCompletionMode,
+ "Requeue should not fail in pure io_uring completion mode when inline re-prepare was not attempted.");
+
+ _engine.RecordIoUringCompletionRequeueFailure(userData);
+ operation.ClearIoUringUserData();
+ Interop.Sys.SocketEvents fallbackEvents = operation.GetIoUringFallbackSocketEvents();
+ if (fallbackEvents == Interop.Sys.SocketEvents.None)
+ {
+ return false;
+ }
+
+ if (NetEventSource.Log.IsEnabled())
+ {
+ LogUnexpectedCompletionFallback(_engine, fallbackEvents, userData);
+ }
+ _eventQueue.Enqueue(new SocketIOEvent(operation.AssociatedContext, fallbackEvents));
+ return true;
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static void LogUnexpectedCompletionFallback(SocketAsyncEngine engine, Interop.Sys.SocketEvents events, ulong completionUserData)
+ {
+ NetEventSource.Error(
+ engine,
+ $"io_uring completion fallback to readiness notification in unexpected path: events={events}, user_data=0x{completionUserData:x}");
+ }
+ }
+
+ /// <summary>
+ /// Outcome of an inline re-prepare attempt for a pending operation
+ /// (see <see cref="TryDispatchPendingIoUringOperationInline"/>).
+ /// </summary>
+ private enum PendingIoUringReprepareResult : byte
+ {
+ NotAttempted = 0,
+ Prepared = 1,
+ Failed = 2
+ }
+
+ /// <summary>
+ /// Attempts to re-prepare and re-track a pending operation inline on the event-loop thread,
+ /// avoiding an extra prepare-queue round-trip for completion-mode retries.
+ /// Returns whether the operation was prepared, the attempt was skipped, or preparation
+ /// failed without producing an SQE.
+ /// </summary>
+ private PendingIoUringReprepareResult TryDispatchPendingIoUringOperationInline(SocketAsyncContext.AsyncOperation operation)
+ {
+ if (!_engine._ioUringCapabilities.IsCompletionMode || !_engine.IsCurrentThreadEventLoopThread())
+ {
+ return PendingIoUringReprepareResult.NotAttempted;
+ }
+
+ long prepareSequence = operation.MarkReadyForIoUringPreparation();
+ Interop.Error prepareError = _engine.TryPrepareAndTrackIoUringOperation(
+ operation,
+ prepareSequence,
+ out bool preparedSqe);
+ if (prepareError != Interop.Error.SUCCESS)
+ {
+ Debug.Fail($"io_uring inline re-prepare failed: {prepareError}");
+ if (NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Error(_engine, $"io_uring inline re-prepare failed: {prepareError}");
+ }
+
+ return PendingIoUringReprepareResult.Failed;
+ }
+
+ return preparedSqe ? PendingIoUringReprepareResult.Prepared : PendingIoUringReprepareResult.Failed;
+ }
+
+ /// Routes a CQE completion result to the appropriate dispatch behavior.
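+ /// Mapping, as implemented below: Completed clears user_data and dispatches the
+ /// managed completion; Pending recycles or clears preparation state and requeues
+ /// (or falls back to readiness); Canceled/Ignored clear user_data and record a
+ /// benign late completion.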
+ private void DispatchIoUringCompletionResult( + SocketAsyncContext.AsyncOperation operation, + SocketAsyncContext.AsyncOperation.IoUringCompletionResult completionResult, + ulong userData, + ref bool enqueuedFallbackEvent) + { + switch (completionResult) + { + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Completed: + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Submitted, + IoUringOperationLifecycleState.Completed); + operation.ClearIoUringUserData(); + DispatchCompletedIoUringOperation(operation, userData); + break; + + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Pending: + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Submitted, + IoUringOperationLifecycleState.Queued); + if (operation.ShouldReuseIoUringPreparationResourcesOnPending) + { + operation.MarkIoUringPreparationReusable(); + operation.ResetIoUringUserDataForRequeue(); + } + else + { + operation.ClearIoUringUserData(); + } + + enqueuedFallbackEvent |= DispatchPendingIoUringOperation(operation, userData); + break; + + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Canceled: + case SocketAsyncContext.AsyncOperation.IoUringCompletionResult.Ignored: + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Submitted, + IoUringOperationLifecycleState.Canceled); + operation.ClearIoUringUserData(); + _engine.RecordBenignLateIoUringCompletion(userData); + break; + + default: + Debug.Fail($"Unexpected io_uring completion result: {completionResult}"); + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Submitted, + IoUringOperationLifecycleState.Detached); + operation.ClearIoUringUserData(); + _engine.RecordBenignLateIoUringCompletion(userData); + break; + } + } + } + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringConfiguration.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringConfiguration.Linux.cs new file mode 100644 index 00000000000000..c6aa7912c60fa8 --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringConfiguration.Linux.cs @@ -0,0 +1,211 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; + +namespace System.Net.Sockets +{ + internal sealed unsafe partial class SocketAsyncEngine + { + private readonly struct IoUringConfigurationInputs + { + internal readonly string? IoUringEnvironmentValue; + internal readonly bool IoUringFeatureSwitchEnabled; + internal readonly string? SqPollEnvironmentValue; + internal readonly bool SqPollFeatureSwitchEnabled; + internal readonly string? DirectSqeEnvironmentValue; + internal readonly string? ZeroCopySendEnvironmentValue; + + internal IoUringConfigurationInputs( + string? ioUringEnvironmentValue, + bool ioUringFeatureSwitchEnabled, + string? sqPollEnvironmentValue, + bool sqPollFeatureSwitchEnabled, + string? directSqeEnvironmentValue, + string? 
zeroCopySendEnvironmentValue) + { + IoUringEnvironmentValue = ioUringEnvironmentValue; + IoUringFeatureSwitchEnabled = ioUringFeatureSwitchEnabled; + SqPollEnvironmentValue = sqPollEnvironmentValue; + SqPollFeatureSwitchEnabled = sqPollFeatureSwitchEnabled; + DirectSqeEnvironmentValue = directSqeEnvironmentValue; + ZeroCopySendEnvironmentValue = zeroCopySendEnvironmentValue; + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static IoUringResolvedConfiguration ResolveIoUringResolvedConfiguration() + { + IoUringConfigurationInputs inputs = ReadIoUringConfigurationInputs(); + return new IoUringResolvedConfiguration( + ioUringEnabled: ResolveIoUringEnabled(inputs), + sqPollRequested: ResolveSqPollRequested(inputs), + directSqeDisabled: ResolveIoUringDirectSqeDisabled(inputs), + zeroCopySendOptedIn: ResolveZeroCopySendOptedIn(inputs), + registerBuffersEnabled: s_ioUringRegisterBuffersEnabled, + adaptiveProvidedBufferSizingEnabled: s_ioUringAdaptiveBufferSizingEnabled, + providedBufferSize: s_ioUringProvidedBufferSize, + prepareQueueCapacity: s_ioUringPrepareQueueCapacity, + cancellationQueueCapacity: s_ioUringCancellationQueueCapacity); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static IoUringConfigurationInputs ReadIoUringConfigurationInputs() + { +#if DEBUG + string? directSqeValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.DirectSqe); + string? zeroCopySendValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.ZeroCopySend); +#else + string? directSqeValue = null; + string? zeroCopySendValue = null; +#endif + + return new IoUringConfigurationInputs( + ioUringEnvironmentValue: Environment.GetEnvironmentVariable(IoUringEnvironmentVariable), + ioUringFeatureSwitchEnabled: IsIoUringFeatureEnabled, + sqPollEnvironmentValue: Environment.GetEnvironmentVariable(IoUringSqPollEnvironmentVariable), + sqPollFeatureSwitchEnabled: IsSqPollFeatureEnabled, + directSqeEnvironmentValue: directSqeValue, + zeroCopySendEnvironmentValue: zeroCopySendValue); + } + + /// + /// Checks whether direct SQE submission is disabled. + /// Defaults to enabled; test-only env var can disable for deterministic tests. + /// + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool IsIoUringDirectSqeDisabled() + { + IoUringConfigurationInputs inputs = ReadIoUringConfigurationInputs(); + return ResolveIoUringDirectSqeDisabled(inputs); + } + + /// Checks whether io_uring is enabled. + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool IsIoUringEnabled() + { + IoUringConfigurationInputs inputs = ReadIoUringConfigurationInputs(); + return ResolveIoUringEnabled(inputs); + } + + [FeatureSwitchDefinition(UseIoUringAppContextSwitch)] + private static bool IsIoUringFeatureEnabled + { + get + { + if (AppContext.TryGetSwitch(UseIoUringAppContextSwitch, out bool enabled)) + { + return enabled; + } + + return false; + } + } + + /// + /// Returns whether SEND_ZC should be enabled. + /// Defaults to enabled; test-only env var can disable for deterministic tests. + /// + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool IsZeroCopySendOptedIn() + { + IoUringConfigurationInputs inputs = ReadIoUringConfigurationInputs(); + return ResolveZeroCopySendOptedIn(inputs); + } + + private static bool ResolveIoUringDirectSqeDisabled(in IoUringConfigurationInputs inputs) + { +#if DEBUG + // Test-only override for deterministic coverage. + string? 
value = inputs.DirectSqeEnvironmentValue; + if (string.Equals(value, "0", StringComparison.Ordinal)) + { + return true; + } + + if (string.Equals(value, "1", StringComparison.Ordinal)) + { + return false; + } +#endif + + // Default: direct SQE enabled. + return false; + } + + private static bool ResolveIoUringEnabled(in IoUringConfigurationInputs inputs) + { + // Override order: environment variable wins over AppContext switch. + if (string.Equals(inputs.IoUringEnvironmentValue, "1", StringComparison.Ordinal)) + { + return true; + } + + if (string.Equals(inputs.IoUringEnvironmentValue, "0", StringComparison.Ordinal)) + { + return false; + } + + return inputs.IoUringFeatureSwitchEnabled; + } + + private static bool ResolveZeroCopySendOptedIn(in IoUringConfigurationInputs inputs) + { +#if DEBUG + // Test-only override for deterministic coverage. + string? value = inputs.ZeroCopySendEnvironmentValue; + if (string.Equals(value, "1", StringComparison.Ordinal)) + { + return true; + } + + if (string.Equals(value, "0", StringComparison.Ordinal)) + { + return false; + } +#endif + + // Default: zero-copy send enabled. + return true; + } + + [FeatureSwitchDefinition(UseIoUringSqPollAppContextSwitch)] + private static bool IsSqPollFeatureEnabled + { + get + { + if (AppContext.TryGetSwitch(UseIoUringSqPollAppContextSwitch, out bool enabled)) + { + return enabled; + } + + return false; + } + } + + /// + /// Returns whether SQPOLL mode has been explicitly requested. + /// SQPOLL requires dual opt-in: AppContext switch + environment variable. + /// This is intentionally stricter than the primary io_uring gate + /// (`IsIoUringEnabled`), which accepts either source. + /// SQPOLL pins a kernel thread, so accidental activation should require + /// explicit confirmation from both configuration surfaces. + /// + private static bool IsSqPollRequested() + { + IoUringConfigurationInputs inputs = ReadIoUringConfigurationInputs(); + return ResolveSqPollRequested(inputs); + } + + private static bool ResolveSqPollRequested(in IoUringConfigurationInputs inputs) + { + if (!inputs.SqPollFeatureSwitchEnabled) + { + return false; + } + + return string.Equals(inputs.SqPollEnvironmentValue, "1", StringComparison.Ordinal); + } + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringDiagnostics.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringDiagnostics.Linux.cs new file mode 100644 index 00000000000000..972f6a2240615a --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringDiagnostics.Linux.cs @@ -0,0 +1,320 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace System.Net.Sockets +{ + internal sealed unsafe partial class SocketAsyncEngine + { + /// Resets the native diagnostics poll countdown. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void InitializeLinuxIoUringDiagnosticsState() => + _ioUringDiagnosticsPollCountdown = IoUringDiagnosticsPollInterval; + + /// Logs a failed ASYNC_CANCEL SQE preparation. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringAsyncCancelPrepareFailure(SocketError cancelError, ulong userData, IoUringCancellationOrigin origin) + { + string originLabel = origin == IoUringCancellationOrigin.Teardown ? 
" during teardown" : string.Empty; + NetEventSource.Info(this, $"io_uring async-cancel prepare failed{originLabel}: error={cancelError}, user_data=0x{userData:x}"); + } + + /// Logs a failed ASYNC_CANCEL submission. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringAsyncCancelSubmitFailure(Interop.Error submitError, IoUringCancellationOrigin origin) + { + string originLabel = origin == IoUringCancellationOrigin.Teardown ? " during teardown" : string.Empty; + NetEventSource.Info(this, $"io_uring async-cancel submit failed{originLabel}: error={submitError}"); + } + + /// Logs a sampled counter value with its associated user_data. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringCounterSample(string message, long count, ulong userData) + { + NetEventSource.Info(this, $"{message}: count={count}, user_data=0x{userData:x}"); + } + + /// Logs a prepare queue overflow event. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringPrepareQueueOverflow(long count, int capacity) + { + NetEventSource.Info(this, $"io_uring prepare queue overflow: count={count}, capacity={capacity}"); + } + + /// Logs a cancellation queue overflow event. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringCancellationQueueOverflow(long count, int capacity) + { + NetEventSource.Info(this, $"io_uring cancellation queue overflow: count={count}, capacity={capacity}"); + } + + /// Logs a CQ overflow observation from the kernel CQ ring counter. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringCqOverflow(uint totalOverflowCount, uint delta) + { + NetEventSource.Error(this, $"io_uring CQ overflow detected: total={totalOverflowCount}, delta={delta}"); + } + + /// Logs CQ-overflow recovery activation with branch discriminator. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringCqOverflowRecoveryEntry( + IoUringCqOverflowRecoveryBranch branch, + uint totalOverflowCount, + uint delta) + { + NetEventSource.Error( + this, + $"io_uring CQ overflow recovery entered: branch={branch}, total={totalOverflowCount}, delta={delta}"); + } + + /// Logs CQ-overflow recovery completion for diagnostics correlation. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringCqOverflowRecoveryCompleted( + IoUringCqOverflowRecoveryBranch branch, + int completionSlotsInUse) + { + NetEventSource.Info( + this, + $"io_uring CQ overflow recovery completed: branch={branch}, completion_slots_in_use={completionSlotsInUse}"); + } + + /// Logs a deferred multishot-accept rearm nudge issued after CQ-overflow recovery. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringDeferredMultishotAcceptRearmAfterRecovery() + { + NetEventSource.Info(this, "io_uring CQ overflow recovery branch (a): deferred multishot-accept rearm nudged after drain."); + } + + /// Logs when teardown preempts in-progress CQ-overflow recovery ownership. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringCqOverflowRecoveryTeardownPreempted() + { + NetEventSource.Info(this, "io_uring CQ overflow recovery preempted by teardown; cancellation/drain owns shutdown."); + } + + /// Logs a failed eventfd wake signal. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringWakeFailure(Interop.Error error) + { + NetEventSource.Info(this, $"io_uring wake signal failed: error={error}"); + } + + /// Logs eventfd wake circuit-breaker transitions. 
+ [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringWakeCircuitBreakerStateChanged(bool enabled, int consecutiveFailures) + { + NetEventSource.Info( + this, + $"io_uring wake circuit-breaker {(enabled ? "enabled" : "disabled")}: consecutiveWakeFailures={consecutiveFailures}"); + } + + /// Logs the final count of benign late completions at teardown. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringTeardownSummary(long lateCompletionCount) + { + NetEventSource.Info(this, $"io_uring benign late-completion total={lateCompletionCount}"); + } + + /// Logs an untrack operation mismatch. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringUntrackMismatch(ulong userData, long mismatchCount) + { + NetEventSource.Info(this, $"io_uring untrack mismatch: user_data=0x{userData:x}, count={mismatchCount}"); + } + + /// Logs the negotiated io_uring mode for this engine instance. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringModeSelection(LinuxIoUringCapabilities capabilities) + { + NetEventSource.Info( + this, + $"io_uring mode={capabilities.Mode}, is_io_uring_port={capabilities.IsIoUringPort}, supports_multishot_recv={capabilities.SupportsMultishotRecv}, supports_multishot_accept={capabilities.SupportsMultishotAccept}, zero_copy_send_enabled={capabilities.SupportsZeroCopySend}, supports_read_fixed={_supportsOpReadFixed}, supports_send_zc={_supportsOpSendZc}, supports_sendmsg_zc={_supportsOpSendMsgZc}, sqpoll_enabled={capabilities.SqPollEnabled}"); + } + + /// Logs active advanced io_uring features for this engine instance. + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringAdvancedFeatureState() + { + int providedBufferSize = _ioUringProvidedBufferRing?.BufferSize ?? 0; + NetEventSource.Info( + this, + $"io_uring features: multishot_recv={_ioUringCapabilities.SupportsMultishotRecv}, multishot_accept={_ioUringCapabilities.SupportsMultishotAccept}, zero_copy_send_enabled={_ioUringCapabilities.SupportsZeroCopySend}, supports_read_fixed={_supportsOpReadFixed}, fixed_recv_active={_supportsOpReadFixed && _ioUringCapabilities.HasRegisteredBuffers}, supports_send_zc={_supportsOpSendZc}, supports_sendmsg_zc={_supportsOpSendMsgZc}, provided_buffers={_ioUringCapabilities.SupportsProvidedBufferRings}, registered_buffers={_ioUringCapabilities.HasRegisteredBuffers}, adaptive_buffer_sizing={_adaptiveBufferSizingEnabled}, sqpoll_enabled={_ioUringCapabilities.SqPollEnabled}, provided_buffer_size={providedBufferSize}"); + } + + /// Publishes prepare queue depth delta to telemetry and resets the counter. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ResetIoUringPrepareQueueDepthTelemetry() + { + long publishedDepth = Interlocked.Exchange(ref _ioUringPublishedPrepareQueueLength, 0); + if (publishedDepth != 0) + { + SocketsTelemetry.Log.IoUringPrepareQueueDepthDelta(-publishedDepth); + } + } + + /// Increments a counter and logs a sample every 64 increments. + private void RecordIoUringCounterAndMaybeLog(ref long counter, ulong userData, string message) + { + long count = Interlocked.Increment(ref counter); + if ((count & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled()) + { + LogIoUringCounterSample(message, count, userData); + } + } + + /// Logs the teardown summary if any late completions were recorded. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void LogLinuxIoUringTeardownSummaryIfNeeded() + { + long lateCompletionCount = Interlocked.Read(ref _ioUringBenignLateCompletionCount); + if (lateCompletionCount > 0 && NetEventSource.Log.IsEnabled()) + { + LogIoUringTeardownSummary(lateCompletionCount); + } + } + + /// Periodically polls native counters and publishes deltas to telemetry. + private void PollIoUringDiagnosticsIfNeeded(bool force) + { + if (!_ioUringCapabilities.IsIoUringPort) + { + return; + } + + if (!force) + { + int countdown = _ioUringDiagnosticsPollCountdown - 1; + _ioUringDiagnosticsPollCountdown = countdown; + if (countdown > 0) + { + return; + } + } + + _ioUringDiagnosticsPollCountdown = IoUringDiagnosticsPollInterval; + PublishIoUringManagedDiagnosticsDelta(); + if (!_ioUringAdvancedFeatureStateLogged && NetEventSource.Log.IsEnabled()) + { + _ioUringAdvancedFeatureStateLogged = true; + LogIoUringAdvancedFeatureState(); + } + + if (!force) + { + EvaluateProvidedBufferRingResize(); + } + } + + /// Returns the non-negative delta between two counter snapshots. + private static long ComputeManagedCounterDelta(long previous, long current) => + current >= previous ? current - previous : current; + + /// Publishes a managed counter delta from source to published baseline. + private static bool TryPublishManagedCounterDelta( + ref long sourceCounter, + ref long publishedCounter, + out long delta, + bool monotonic = true) + { + long current = Interlocked.Read(ref sourceCounter); + long previous = Interlocked.Exchange(ref publishedCounter, current); + delta = monotonic ? ComputeManagedCounterDelta(previous, current) : current - previous; + return delta != 0; + } + + /// Computes and publishes this engine's non-pinnable fallback counter delta. + private bool TryPublishIoUringNonPinnablePrepareFallbackDelta(out long delta) + { + long current = Interlocked.Read(ref _ioUringNonPinnablePrepareFallbackCount); + long previous = Interlocked.Exchange(ref _ioUringPublishedNonPinnablePrepareFallbackCount, current); + delta = ComputeManagedCounterDelta(previous, current); + return delta != 0; + } + + /// Publishes all managed diagnostic counter deltas to telemetry. + private void PublishIoUringManagedDiagnosticsDelta() + { + // Sample pending SEND_ZC NOTIF state directly from completion slots so + // reset/teardown paths that bypass normal completion dispatch still publish accurate gauge data. 
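+ // (Two publication shapes are used here: the NOTIF slot count is a gauge that is
+ // re-sampled in full on every poll, while the remaining counters are published as
+ // snapshot/exchange deltas so values from multiple engines can be summed without
+ // double-counting.)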
+ SocketsTelemetry.Log.IoUringZeroCopyNotificationPendingSlots(CountZeroCopyNotificationPendingSlots()); + if (TryPublishManagedCounterDelta( + ref _ioUringCompletionRequeueFailureCount, + ref _ioUringPublishedCompletionRequeueFailureCount, + out long requeueFailureDelta)) + { + SocketsTelemetry.Log.IoUringCompletionRequeueFailure(requeueFailureDelta); + } + + if (TryPublishIoUringNonPinnablePrepareFallbackDelta(out long nonPinnableFallbackDelta)) + { + SocketsTelemetry.Log.IoUringPrepareNonPinnableFallback(nonPinnableFallbackDelta); + } + + if (TryPublishManagedCounterDelta( + ref _ioUringPrepareQueueOverflowCount, + ref _ioUringPublishedPrepareQueueOverflowCount, + out long prepareQueueOverflowDelta)) + { + SocketsTelemetry.Log.IoUringPrepareQueueOverflow(prepareQueueOverflowDelta); + } + + if (TryPublishManagedCounterDelta( + ref _ioUringPrepareQueueOverflowFallbackCount, + ref _ioUringPublishedPrepareQueueOverflowFallbackCount, + out long prepareQueueOverflowFallbackDelta)) + { + SocketsTelemetry.Log.IoUringPrepareQueueOverflowFallback(prepareQueueOverflowFallbackDelta); + } + + if (TryPublishManagedCounterDelta( + ref _ioUringPrepareQueueLength, + ref _ioUringPublishedPrepareQueueLength, + out long prepareQueueDepthDelta, + monotonic: false)) + { + SocketsTelemetry.Log.IoUringPrepareQueueDepthDelta(prepareQueueDepthDelta); + } + + if (TryPublishManagedCounterDelta( + ref _ioUringCompletionSlotExhaustionCount, + ref _ioUringPublishedCompletionSlotExhaustionCount, + out long completionSlotExhaustionDelta)) + { + SocketsTelemetry.Log.IoUringCompletionSlotExhaustion(completionSlotExhaustionDelta); + } + + if (TryPublishManagedCounterDelta( + ref _ioUringCompletionSlotDrainRecoveryCount, + ref _ioUringPublishedCompletionSlotDrainRecoveryCount, + out long completionSlotDrainRecoveryDelta)) + { + SocketsTelemetry.Log.IoUringCompletionSlotDrainRecovery(completionSlotDrainRecoveryDelta); + } + } + + /// Counts completion slots currently waiting for SEND_ZC NOTIF CQEs. + private int CountZeroCopyNotificationPendingSlots() + { + IoUringCompletionSlot[]? completionEntries = _completionSlots; + if (completionEntries is null) + { + return 0; + } + + int pendingNotificationSlots = 0; + for (int i = 0; i < completionEntries.Length; i++) + { + ref IoUringCompletionSlot slot = ref completionEntries[i]; + if (slot.IsZeroCopySend && slot.ZeroCopyNotificationPending) + { + pendingNotificationSlots++; + } + } + + return pendingNotificationSlots; + } + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringRings.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringRings.Linux.cs new file mode 100644 index 00000000000000..535807a55d6bfb --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringRings.Linux.cs @@ -0,0 +1,366 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; + +namespace System.Net.Sockets +{ + internal sealed unsafe partial class SocketAsyncEngine + { + /// + /// Maps the SQ ring, CQ ring, and SQE array into managed address space and derives + /// all ring pointers from the kernel-reported offsets. On failure, unmaps any + /// partially-mapped regions and closes the ring fd. 
+ /// + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe bool TryMmapRings(ref IoUringSetupResult setup) + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static bool IsOffsetInRange(ulong offset, ulong size, ulong mappedSize) => + offset <= mappedSize && size <= mappedSize - offset; + + ref Interop.Sys.IoUringParams p = ref setup.Params; + bool usesNoSqArray = (setup.NegotiatedFlags & IoUringConstants.SetupNoSqArray) != 0; + bool usesSqe128 = (setup.NegotiatedFlags & IoUringConstants.SetupSqe128) != 0; + uint negotiatedSqeSize = usesSqe128 ? 128u : (uint)sizeof(IoUringSqe); + if (negotiatedSqeSize != (uint)sizeof(IoUringSqe)) + { + // Managed SQE writers currently mirror the 64-byte io_uring_sqe layout. + Interop.Sys.IoUringShimCloseFd(setup.RingFd); + return false; + } + + // Compute ring sizes. + ulong sqRingSize = p.SqOff.Array; + if (!usesNoSqArray) + { + sqRingSize += p.SqEntries * (uint)sizeof(uint); + } + ulong cqRingSize = p.CqOff.Cqes + p.CqEntries * (uint)sizeof(Interop.Sys.IoUringCqe); + ulong sqesSize = p.SqEntries * negotiatedSqeSize; + + // mmap SQ ring (and possibly CQ ring if SINGLE_MMAP). + bool usesSingleMmap = (p.Features & IoUringConstants.FeatureSingleMmap) != 0; + + byte* sqRingPtr; + byte* cqRingPtr; + + if (usesSingleMmap) + { + ulong ringSize = sqRingSize > cqRingSize ? sqRingSize : cqRingSize; + void* ptr; + Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, ringSize, IoUringConstants.OffSqRing, &ptr); + if (err != Interop.Error.SUCCESS) + { + Interop.Sys.IoUringShimCloseFd(setup.RingFd); + return false; + } + sqRingPtr = (byte*)ptr; + cqRingPtr = (byte*)ptr; + sqRingSize = ringSize; + cqRingSize = ringSize; + } + else + { + void* sqPtr; + Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, sqRingSize, IoUringConstants.OffSqRing, &sqPtr); + if (err != Interop.Error.SUCCESS) + { + Interop.Sys.IoUringShimCloseFd(setup.RingFd); + return false; + } + sqRingPtr = (byte*)sqPtr; + + void* cqPtr; + err = Interop.Sys.IoUringShimMmap(setup.RingFd, cqRingSize, IoUringConstants.OffCqRing, &cqPtr); + if (err != Interop.Error.SUCCESS) + { + Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize); + Interop.Sys.IoUringShimCloseFd(setup.RingFd); + return false; + } + cqRingPtr = (byte*)cqPtr; + } + + if (!IsOffsetInRange(p.SqOff.Head, sizeof(uint), sqRingSize) || + !IsOffsetInRange(p.SqOff.Tail, sizeof(uint), sqRingSize) || + !IsOffsetInRange(p.SqOff.RingMask, sizeof(uint), sqRingSize) || + !IsOffsetInRange(p.SqOff.RingEntries, sizeof(uint), sqRingSize) || + !IsOffsetInRange(p.SqOff.Flags, sizeof(uint), sqRingSize) || + (!usesNoSqArray && !IsOffsetInRange(p.SqOff.Array, p.SqEntries * (uint)sizeof(uint), sqRingSize)) || + !IsOffsetInRange(p.CqOff.Head, sizeof(uint), cqRingSize) || + !IsOffsetInRange(p.CqOff.Tail, sizeof(uint), cqRingSize) || + !IsOffsetInRange(p.CqOff.RingMask, sizeof(uint), cqRingSize) || + !IsOffsetInRange(p.CqOff.RingEntries, sizeof(uint), cqRingSize) || + !IsOffsetInRange(p.CqOff.Overflow, sizeof(uint), cqRingSize) || + !IsOffsetInRange(p.CqOff.Cqes, p.CqEntries * (uint)sizeof(Interop.Sys.IoUringCqe), cqRingSize)) + { + if (!usesSingleMmap) + { + Interop.Sys.IoUringShimMunmap(cqRingPtr, cqRingSize); + } + + Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize); + Interop.Sys.IoUringShimCloseFd(setup.RingFd); + return false; + } + + // mmap SQE array. 
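+ // For reference: these mappings use the standard io_uring offsets from
+ // <linux/io_uring.h> (IORING_OFF_SQ_RING = 0, IORING_OFF_CQ_RING = 0x8000000,
+ // IORING_OFF_SQES = 0x10000000), mirrored here by IoUringConstants.OffSqRing,
+ // OffCqRing, and OffSqes. With IORING_FEAT_SINGLE_MMAP the SQ and CQ rings share
+ // one region, which is why only the larger of the two sizes is mapped above.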
+ void* sqePtr; + { + Interop.Error err = Interop.Sys.IoUringShimMmap(setup.RingFd, sqesSize, IoUringConstants.OffSqes, &sqePtr); + if (err != Interop.Error.SUCCESS) + { + if (!usesSingleMmap) + Interop.Sys.IoUringShimMunmap(cqRingPtr, cqRingSize); + Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize); + Interop.Sys.IoUringShimCloseFd(setup.RingFd); + return false; + } + } + + // Derive SQ pointers and populate existing _ioUringSqRingInfo for compatibility. + _ioUringSqRingInfo.SqeBase = (IntPtr)sqePtr; + _ioUringSqRingInfo.SqTailPtr = (IntPtr)(sqRingPtr + p.SqOff.Tail); + _ioUringSqRingInfo.SqHeadPtr = (IntPtr)(sqRingPtr + p.SqOff.Head); + _ioUringSqRingInfo.SqMask = *(uint*)(sqRingPtr + p.SqOff.RingMask); + _ioUringSqRingInfo.SqEntries = *(uint*)(sqRingPtr + p.SqOff.RingEntries); + _ioUringSqRingInfo.SqeSize = negotiatedSqeSize; + _ioUringSqRingInfo.UsesNoSqArray = usesNoSqArray ? (byte)1 : (byte)0; + _ioUringSqRingInfo.RingFd = setup.RingFd; + _ioUringSqRingInfo.UsesEnterExtArg = setup.UsesExtArg ? (byte)1 : (byte)0; + _managedSqFlagsPtr = (uint*)(sqRingPtr + p.SqOff.Flags); + + // Initialize SQ array identity mapping if NO_SQARRAY is not active. + if (!usesNoSqArray) + { + uint* sqArray = (uint*)(sqRingPtr + p.SqOff.Array); + for (uint i = 0; i < p.SqEntries; i++) + { + sqArray[i] = i; + } + } + + // Derive CQ pointers. + _managedCqeBase = (Interop.Sys.IoUringCqe*)(cqRingPtr + p.CqOff.Cqes); + _managedCqTailPtr = (uint*)(cqRingPtr + p.CqOff.Tail); + _managedCqHeadPtr = (uint*)(cqRingPtr + p.CqOff.Head); + _managedCqMask = *(uint*)(cqRingPtr + p.CqOff.RingMask); + _managedCqEntries = *(uint*)(cqRingPtr + p.CqOff.RingEntries); + _managedCqOverflowPtr = (uint*)(cqRingPtr + p.CqOff.Overflow); + + Debug.Assert( + BitOperations.IsPow2(_ioUringSqRingInfo.SqEntries), + $"Kernel-reported SQ entries must be power-of-two. sq_entries={_ioUringSqRingInfo.SqEntries}"); + Debug.Assert( + BitOperations.IsPow2(_managedCqEntries), + $"Kernel-reported CQ entries must be power-of-two. cq_entries={_managedCqEntries}"); + Debug.Assert( + _ioUringSqRingInfo.SqMask == _ioUringSqRingInfo.SqEntries - 1, + $"Unexpected SQ mask/entries contract: sq_mask={_ioUringSqRingInfo.SqMask}, sq_entries={_ioUringSqRingInfo.SqEntries}"); + Debug.Assert( + _managedCqMask == _managedCqEntries - 1, + $"Unexpected CQ mask/entries contract: cq_mask={_managedCqMask}, cq_entries={_managedCqEntries}"); + + _managedObservedCqOverflow = Volatile.Read(ref *_managedCqOverflowPtr); + _cqOverflowRecoveryActive = false; + _cqOverflowRecoveryBranch = default; + + // Store ring region info for teardown. 
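+ // (munmap(2) requires the original mapping length, so the sizes are retained
+ // alongside the base pointers; CleanupManagedRings depends on these exact values.)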
+ _managedSqRingPtr = sqRingPtr; + _managedCqRingPtr = cqRingPtr; + _managedSqRingSize = sqRingSize; + _managedCqRingSize = cqRingSize; + _managedSqesSize = sqesSize; + _managedUsesSingleMmap = usesSingleMmap; + _managedRingFd = setup.RingFd; + _managedUsesExtArg = setup.UsesExtArg; + _managedUsesNoSqArray = usesNoSqArray; + _managedNegotiatedFlags = setup.NegotiatedFlags; + _managedSqeInvariantsValidated = ValidateManagedSqeInitializationInvariants(); + if (!_managedSqeInvariantsValidated) + { + CleanupManagedRings(); + return false; + } + + return true; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe void CleanupManagedRings() + { + _managedCqDrainEnabled = false; + + byte* sqRingPtr = _managedSqRingPtr; + byte* cqRingPtr = _managedCqRingPtr; + ulong sqRingSize = _managedSqRingSize; + ulong cqRingSize = _managedCqRingSize; + ulong sqesSize = _managedSqesSize; + bool usesSingleMmap = _managedUsesSingleMmap; + void* sqeBase = _ioUringSqRingInfo.SqeBase.ToPointer(); + + // Clear all mmap-derived pointers before unmapping so any late reads fail safely. + _managedSqFlagsPtr = null; + _managedCqeBase = null; + _managedCqTailPtr = null; + _managedCqHeadPtr = null; + _managedCqOverflowPtr = null; + _managedSqRingPtr = null; + _managedCqRingPtr = null; + _managedSqRingSize = 0; + _managedCqRingSize = 0; + _managedSqesSize = 0; + _managedCqMask = 0; + _managedCqEntries = 0; + _managedCachedCqHead = 0; + _managedObservedCqOverflow = 0; + _ioUringSqRingInfo = default; + _managedSqeInvariantsValidated = false; + + if (sqRingPtr != null) + { + // Unmap SQEs first + if (sqesSize > 0 && sqeBase != null) + { + Interop.Sys.IoUringShimMunmap(sqeBase, sqesSize); + } + // Unmap CQ ring (only if separate from SQ ring) + if (!usesSingleMmap && cqRingPtr != null && cqRingPtr != sqRingPtr) + { + Interop.Sys.IoUringShimMunmap(cqRingPtr, cqRingSize); + } + // Unmap SQ ring + Interop.Sys.IoUringShimMunmap(sqRingPtr, sqRingSize); + } + if (_managedRingFd >= 0) + { + Interop.Sys.IoUringShimCloseFd(_managedRingFd); + _managedRingFd = -1; + } + } + + /// Unmaps rings and closes the ring fd. + partial void LinuxFreeIoUringResources() + { + // Managed io_uring teardown: release resources allocated during TryInitializeManagedIoUring. + // This must run BEFORE the common slot/buffer cleanup below because kernel + // unregister operations need the ring fd to still be open. + if (_ioUringInitialized) + { + // 0. Unregister/dispose provided buffer ring while the main ring fd is still open. + FreeIoUringProvidedBufferRing(); + + // 1. The registered ring fd is implicitly released when the ring fd is closed. + // Just mark it as inactive so no subsequent code attempts to use it. + _ioUringSqRingInfo.RegisteredRingFd = -1; + + // 2. Close the wakeup eventfd. + if (_managedWakeupEventFd >= 0) + { + Interop.Sys.IoUringShimCloseFd(_managedWakeupEventFd); + _managedWakeupEventFd = -1; + } + + // 3. Unmap SQ/CQ rings, SQEs and close the ring fd. + // Closing the ring fd also terminates any kernel SQPOLL thread for this ring. + CleanupManagedRings(); + + // 4. Disable managed flags to prevent any late operations. 
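+ // (CleanupManagedRings has already nulled every mmap-derived pointer, so any
+ // straggling reader that consults these flags sees a disabled engine rather than
+ // dereferencing unmapped ring memory.)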
+ _ioUringInitialized = false; + _managedCqDrainEnabled = false; + } + + bool portClosedForTeardown = Volatile.Read(ref _ioUringPortClosedForTeardown) != 0; + if (!portClosedForTeardown) + { + PollIoUringDiagnosticsIfNeeded(force: true); + } + + // Second drain intentionally catches any items enqueued after LinuxBeforeFreeNativeResources + // published teardown but before native port closure became globally visible. + DrainQueuedIoUringOperationsForTeardown(); + + if (_completionSlots is not null) + { + DrainTrackedIoUringOperationsForTeardown(portClosedForTeardown); + Debug.Assert(IsIoUringTrackingEmpty(), $"Leaked tracked io_uring operations: {Volatile.Read(ref _trackedIoUringOperationCount)}"); + + // Free any native memory still held by completion slots + for (int i = 0; i < _completionSlots.Length; i++) + { + ref IoUringCompletionSlot slot = ref _completionSlots[i]; + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![i]; + if (slot.IsZeroCopySend && slot.ZeroCopyNotificationPending) + { + // Ring teardown can drop in-flight NOTIF CQEs; clear pending SEND_ZC state + // so teardown cannot leave slots/pin-holds logically waiting forever. + slot.IsZeroCopySend = false; + slot.ZeroCopyNotificationPending = false; + } + + ReleaseZeroCopyPinHold(i); + if (slot.Kind == IoUringCompletionOperationKind.Message) + { + FreeMessageStorage(i); + } + else if (slot.Kind == IoUringCompletionOperationKind.Accept && slotStorage.NativeSocketAddressLengthPtr != null) + { + *slotStorage.NativeSocketAddressLengthPtr = 0; + } + + // Clear all pointers that alias _completionSlotNativeStorage before freeing it. + slotStorage.NativeInlineStorage = null; + slotStorage.NativeSocketAddressLengthPtr = null; + slotStorage.NativeMsgHdrPtr = IntPtr.Zero; + slotStorage.MessageIsReceive = false; + slotStorage.NativeIOVectors = null; + slotStorage.NativeSocketAddress = null; + slotStorage.NativeControlBuffer = null; + slotStorage.ReceiveOutputSocketAddress = null; + slotStorage.ReceiveOutputControlBuffer = null; + slotStorage.ReceiveSocketAddressCapacity = 0; + slotStorage.ReceiveControlBufferCapacity = 0; + } + + _completionSlots = null; + _trackedOperations = null; + _completionSlotStorage = null; + _trackedIoUringOperationCount = 0; + _zeroCopyPinHolds = null; + _completionSlotFreeListHead = -1; + _completionSlotsInUse = 0; + _liveAcceptCompletionSlotCount = 0; + + _ioUringSlotCapacity = 0; + _cqOverflowRecoveryActive = false; + _cqOverflowRecoveryBranch = default; + _ioUringManagedPendingSubmissions = 0; + _ioUringManagedSqTail = 0; + _ioUringManagedSqTailLoaded = false; + _ioUringSqRingInfo = default; + _ioUringDirectSqeEnabled = false; + _sqPollEnabled = false; + + LogLinuxIoUringTeardownSummaryIfNeeded(); + } + + if (_completionSlotNativeStorage != null) + { + NativeMemory.Free(_completionSlotNativeStorage); + _completionSlotNativeStorage = null; + _completionSlotNativeStorageStride = 0; + } + + ResetIoUringPrepareQueueDepthTelemetry(); + + // Final flush of managed io_uring deltas in case teardown modified counters + // after the forced diagnostics poll and no further event-loop iteration runs. 
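+ // (Delta publication skips zero-valued deltas, so this second flush after the
+ // forced poll above is harmless when nothing changed in between.)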
+ PublishIoUringManagedDiagnosticsDelta(); + } + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSlots.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSlots.Linux.cs new file mode 100644 index 00000000000000..164c67891c375a --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSlots.Linux.cs @@ -0,0 +1,461 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.ExceptionServices; +using System.Runtime.InteropServices; +using System.Threading; +using Microsoft.Win32.SafeHandles; + +namespace System.Net.Sockets +{ + internal sealed unsafe partial class SocketAsyncEngine + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe nuint GetCompletionSlotNativeStorageStride() + { + nuint iovSize = (nuint)IoUringConstants.MessageInlineIovCount * (nuint)sizeof(Interop.Sys.IOVector); + return (nuint)sizeof(NativeMsghdr) + + iovSize + + (nuint)IoUringConstants.MessageInlineSocketAddressCapacity + + (nuint)IoUringConstants.MessageInlineControlBufferCapacity + + (nuint)sizeof(int); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void InitializeCompletionSlotNativeStorage( + ref IoUringCompletionSlotStorage slotStorage, + byte* slotStorageBase) + { + slotStorage.NativeInlineStorage = slotStorageBase; + slotStorage.NativeMsgHdrPtr = (IntPtr)slotStorageBase; + + byte* cursor = slotStorageBase + sizeof(NativeMsghdr); + slotStorage.NativeIOVectors = (Interop.Sys.IOVector*)cursor; + cursor += IoUringConstants.MessageInlineIovCount * sizeof(Interop.Sys.IOVector); + slotStorage.NativeSocketAddress = cursor; + cursor += IoUringConstants.MessageInlineSocketAddressCapacity; + slotStorage.NativeControlBuffer = cursor; + cursor += IoUringConstants.MessageInlineControlBufferCapacity; + slotStorage.NativeSocketAddressLengthPtr = (int*)cursor; + + slotStorage.MessageIsReceive = false; + slotStorage.ReceiveOutputSocketAddress = null; + slotStorage.ReceiveOutputControlBuffer = null; + slotStorage.ReceiveSocketAddressCapacity = 0; + slotStorage.ReceiveControlBufferCapacity = 0; + } + + /// Allocates SoA completion slot arrays and initializes the free list. 
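+ /// Slot identity sketch (exact field widths live in IoUringConstants): user_data
+ /// encodes a slot index plus a generation tag. FreeCompletionSlot bumps the
+ /// generation when a slot is recycled, so a stale CQE that still carries the old
+ /// generation fails the lookup and is recorded as a benign late completion.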
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void InitializeCompletionSlotPool(int capacity)
+ {
+ Debug.Assert(
+ (ulong)capacity <= IoUringConstants.SlotIndexMask + 1UL,
+ $"Completion slot capacity {capacity} exceeds encodable slot index range {IoUringConstants.SlotIndexMask + 1UL}.");
+ Debug.Assert(
+ Unsafe.SizeOf<IoUringCompletionSlot>() == 24,
+ $"IoUringCompletionSlot size drifted: expected 24, got {Unsafe.SizeOf<IoUringCompletionSlot>()}.");
+ _completionSlots = new IoUringCompletionSlot[capacity];
+ _trackedOperations = new IoUringTrackedOperationState[capacity];
+ _completionSlotStorage = new IoUringCompletionSlotStorage[capacity];
+ _zeroCopyPinHolds = new System.Buffers.MemoryHandle[capacity];
+ _completionSlotNativeStorageStride = GetCompletionSlotNativeStorageStride();
+ Debug.Assert(
+ _completionSlotNativeStorageStride <= int.MaxValue,
+ $"Completion slot native storage stride overflow: {_completionSlotNativeStorageStride}.");
+ if (_completionSlotNativeStorageStride > int.MaxValue)
+ {
+ // FailFast-adjacent site: impossible stride overflow indicates corrupted
+ // layout assumptions during engine initialization, so keep the hard failure.
+ ThrowInternalException(Interop.Error.EOVERFLOW);
+ }
+
+ _completionSlotNativeStorage = (byte*)NativeMemory.AllocZeroed((nuint)capacity * _completionSlotNativeStorageStride);
+ // Build free list linking all slots
+ for (int i = 0; i < capacity - 1; i++)
+ {
+ _completionSlots[i].Generation = 1;
+ _completionSlots[i].FreeListNext = i + 1;
+ InitializeCompletionSlotNativeStorage(
+ ref _completionSlotStorage[i],
+ _completionSlotNativeStorage + ((nuint)i * _completionSlotNativeStorageStride));
+ }
+ _completionSlots[capacity - 1].Generation = 1;
+ _completionSlots[capacity - 1].FreeListNext = -1;
+ InitializeCompletionSlotNativeStorage(
+ ref _completionSlotStorage[capacity - 1],
+ _completionSlotNativeStorage + ((nuint)(capacity - 1) * _completionSlotNativeStorageStride));
+ _completionSlotFreeListHead = 0;
+ _completionSlotsInUse = 0;
+ _completionSlotsHighWaterMark = 0;
+ _liveAcceptCompletionSlotCount = 0;
+ _trackedIoUringOperationCount = 0;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void SetCompletionSlotKind(ref IoUringCompletionSlot slot, IoUringCompletionOperationKind kind)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "SetCompletionSlotKind must run on the event-loop thread.");
+ IoUringCompletionOperationKind previousKind = slot.Kind;
+ if (previousKind == kind)
+ {
+ return;
+ }
+
+ slot.Kind = kind;
+ if (previousKind == IoUringCompletionOperationKind.Accept ||
+ kind == IoUringCompletionOperationKind.Accept)
+ {
+ int liveAcceptCount = _liveAcceptCompletionSlotCount;
+ if (previousKind == IoUringCompletionOperationKind.Accept)
+ {
+ liveAcceptCount--;
+ }
+
+ if (kind == IoUringCompletionOperationKind.Accept)
+ {
+ liveAcceptCount++;
+ }
+
+ Debug.Assert(liveAcceptCount >= 0);
+ Volatile.Write(ref _liveAcceptCompletionSlotCount, liveAcceptCount);
+ }
+ }
+
+ /// <summary>
+ /// Allocates a completion slot from the free list. Returns the slot index,
+ /// or -1 if the pool is exhausted (backpressure signal).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private int AllocateCompletionSlot()
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "AllocateCompletionSlot must run on the event-loop thread.");
+ Debug.Assert(_completionSlots is not null);
+ int index = _completionSlotFreeListHead;
+ if (index < 0)
+ return -1; // Pool exhausted
+
+ ref IoUringCompletionSlot slot = ref _completionSlots![index];
+ // Slot state is reset in FreeCompletionSlot; keep allocation to free-list bookkeeping only.
+ _completionSlotFreeListHead = slot.FreeListNext;
+ slot.FreeListNext = -1;
+ int inUse = ++_completionSlotsInUse;
+ if (inUse > _completionSlotsHighWaterMark)
+ {
+ _completionSlotsHighWaterMark = inUse;
+ SocketsTelemetry.Log.IoUringCompletionSlotHighWaterMark(inUse);
+ }
+ return index;
+ }
+
+ /// <summary>
+ /// Returns a completion slot to the free list, incrementing its generation
+ /// to invalidate any stale user_data references.
+ /// </summary>
+ private unsafe void FreeCompletionSlot(int index)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "FreeCompletionSlot must run on the event-loop thread.");
+ Debug.Assert(index >= 0 && index < _completionSlots!.Length);
+
+ ReleaseZeroCopyPinHold(index);
+ ref IoUringCompletionSlot slot = ref _completionSlots![index];
+ ref IoUringTrackedOperationState trackedState = ref _trackedOperations![index];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![index];
+ Debug.Assert(
+ Volatile.Read(ref trackedState.TrackedOperation) is null,
+ "Completion slot should not be freed while a tracked io_uring operation is still attached.");
+
+ SafeSocketHandle? dangerousRefSocketHandle = slotStorage.DangerousRefSocketHandle;
+ ExceptionDispatchInfo? dangerousReleaseException = null;
+ try
+ {
+ if (dangerousRefSocketHandle is not null)
+ {
+ slotStorage.DangerousRefSocketHandle = null;
+ dangerousRefSocketHandle.DangerousRelease();
+ }
+ }
+ catch (Exception ex)
+ {
+ dangerousReleaseException = ExceptionDispatchInfo.Capture(ex);
+ }
+ finally
+ {
+ if (slot.UsesFixedRecvBuffer)
+ {
+ IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing;
+ if (providedBufferRing is not null)
+ {
+ providedBufferRing.TryRecycleBufferFromCompletion(slot.FixedRecvBufferId);
+ }
+ }
+
+ // Free any native message storage
+ if (slot.Kind == IoUringCompletionOperationKind.Message)
+ {
+ FreeMessageStorage(index);
+ }
+ else if (slot.Kind == IoUringCompletionOperationKind.Accept)
+ {
+ if (slotStorage.NativeSocketAddressLengthPtr != null)
+ {
+ *slotStorage.NativeSocketAddressLengthPtr = 0;
+ }
+ }
+
+ slot.Generation = (slot.Generation + 1UL) & IoUringConstants.GenerationMask;
+ if (slot.Generation == 0)
+ {
+ slot.Generation = 1;
+ }
+ SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.None);
+ ResetDebugTestForcedResult(ref slot);
+ slot.IsZeroCopySend = false;
+ slot.ZeroCopyNotificationPending = false;
+ slot.UsesFixedRecvBuffer = false;
+ slot.FixedRecvBufferId = 0;
+ Volatile.Write(ref trackedState.TrackedOperation, null);
+ trackedState.TrackedOperationGeneration = 0;
+ slot.FreeListNext = _completionSlotFreeListHead;
+ _completionSlotFreeListHead = index;
+ _completionSlotsInUse--;
+ }
+
+ dangerousReleaseException?.Throw();
+ }
+
+ /// Disposes a retained zero-copy pin-hold for the specified completion slot.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ReleaseZeroCopyPinHold(int slotIndex)
+ {
+ System.Buffers.MemoryHandle[]?
pinHolds = _zeroCopyPinHolds; + if (pinHolds is null || (uint)slotIndex >= (uint)pinHolds.Length) + { + return; + } + + pinHolds[slotIndex].Dispose(); + pinHolds[slotIndex] = default; + } + + /// Transfers operation-owned pin state into the engine's zero-copy pin-hold table. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void TransferIoUringZeroCopyPinHold(ulong userData, System.Buffers.MemoryHandle pinHold) + { + System.Buffers.MemoryHandle[]? pinHolds = _zeroCopyPinHolds; + if (pinHolds is null) + { + pinHold.Dispose(); + Debug.Fail("Zero-copy pin-hold table is unavailable while transferring pin ownership."); + return; + } + + int slotIndex = DecodeCompletionSlotIndex(userData & IoUringUserDataPayloadMask); + if ((uint)slotIndex >= (uint)pinHolds.Length) + { + pinHold.Dispose(); + Debug.Fail($"Invalid completion slot index while transferring zero-copy pin hold: {slotIndex}."); + return; + } + + Debug.Assert(_completionSlots is not null); + ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex]; + if (!slot.IsZeroCopySend) + { + pinHold.Dispose(); + Debug.Fail("Zero-copy pin hold transfer requested for a non-zero-copy completion slot."); + return; + } + + pinHolds[slotIndex].Dispose(); + pinHolds[slotIndex] = pinHold; + } + + /// + /// Prepares pre-allocated per-slot native message storage for sendmsg/recvmsg. + /// Returns false when header shape exceeds inline capacities so callers can fall back. + /// + private unsafe bool TryPrepareInlineMessageStorage(int slotIndex, Interop.Sys.MessageHeader* messageHeader, bool isReceive) + { + Debug.Assert(sizeof(NativeMsghdr) == 56, $"NativeMsghdr size mismatch with kernel struct msghdr: expected 56, got {sizeof(NativeMsghdr)}"); + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex]; + + int iovCount = messageHeader->IOVectorCount; + int sockAddrLen = messageHeader->SocketAddressLen; + int controlBufLen = messageHeader->ControlBufferLen; + Debug.Assert(iovCount >= 0, $"Expected non-negative iovCount, got {iovCount}"); + Debug.Assert(sockAddrLen >= 0, $"Expected non-negative socket address length, got {sockAddrLen}"); + Debug.Assert(controlBufLen >= 0, $"Expected non-negative control buffer length, got {controlBufLen}"); + + if ((uint)iovCount > IoUringConstants.MessageInlineIovCount || + (uint)sockAddrLen > IoUringConstants.MessageInlineSocketAddressCapacity || + (uint)controlBufLen > IoUringConstants.MessageInlineControlBufferCapacity) + { + return false; + } + + if (slotStorage.NativeInlineStorage == null) + { + return false; + } + + if ((iovCount > 0 && messageHeader->IOVectors == null) || + (sockAddrLen > 0 && messageHeader->SocketAddress == null) || + (controlBufLen > 0 && messageHeader->ControlBuffer == null)) + { + return false; + } + + // Most of the inline slab is overwritten immediately; clear only msghdr header state. 
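            // (Illustrative note on the clear below, assuming InitializeCompletionSlotNativeStorage
            //  carves the slab as [msghdr][iovec array][sockaddr][control]: the iovec, sockaddr, and
            //  control regions are fully rewritten below whenever they are used, so zeroing only the
            //  56-byte msghdr prefix is enough to keep stale MsgName/MsgIov/MsgControl pointers from
            //  surviving slot reuse.)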
+            new Span<byte>(slotStorage.NativeMsgHdrPtr.ToPointer(), sizeof(NativeMsghdr)).Clear();
+
+            NativeMsghdr* hdr = (NativeMsghdr*)slotStorage.NativeMsgHdrPtr;
+            Interop.Sys.IOVector* iovDst = slotStorage.NativeIOVectors;
+            byte* sockAddrDst = slotStorage.NativeSocketAddress;
+            byte* controlBufDst = slotStorage.NativeControlBuffer;
+
+            if (iovCount > 0)
+            {
+                nuint iovBytes = (nuint)iovCount * (nuint)sizeof(Interop.Sys.IOVector);
+                Buffer.MemoryCopy(
+                    messageHeader->IOVectors,
+                    iovDst,
+                    (nuint)IoUringConstants.MessageInlineIovCount * (nuint)sizeof(Interop.Sys.IOVector),
+                    iovBytes);
+            }
+
+            if (!isReceive)
+            {
+                if (sockAddrLen > 0)
+                {
+                    Buffer.MemoryCopy(
+                        messageHeader->SocketAddress,
+                        sockAddrDst,
+                        (nuint)IoUringConstants.MessageInlineSocketAddressCapacity,
+                        (nuint)sockAddrLen);
+                }
+
+                if (controlBufLen > 0)
+                {
+                    Buffer.MemoryCopy(
+                        messageHeader->ControlBuffer,
+                        controlBufDst,
+                        (nuint)IoUringConstants.MessageInlineControlBufferCapacity,
+                        (nuint)controlBufLen);
+                }
+            }
+
+            hdr->MsgName = sockAddrLen > 0 ? sockAddrDst : null;
+            hdr->MsgNameLen = (uint)sockAddrLen;
+            hdr->MsgIov = iovCount > 0 ? iovDst : null;
+            hdr->MsgIovLen = (nuint)iovCount;
+            hdr->MsgControl = controlBufLen > 0 ? controlBufDst : null;
+            hdr->MsgControlLen = (nuint)controlBufLen;
+            hdr->MsgFlags = 0;
+
+            if (isReceive)
+            {
+                slotStorage.ReceiveOutputSocketAddress = messageHeader->SocketAddress;
+                slotStorage.ReceiveOutputControlBuffer = messageHeader->ControlBuffer;
+                slotStorage.ReceiveSocketAddressCapacity = sockAddrLen;
+                slotStorage.ReceiveControlBufferCapacity = controlBufLen;
+            }
+            else
+            {
+                slotStorage.ReceiveOutputSocketAddress = null;
+                slotStorage.ReceiveOutputControlBuffer = null;
+                slotStorage.ReceiveSocketAddressCapacity = 0;
+                slotStorage.ReceiveControlBufferCapacity = 0;
+            }
+
+            slotStorage.MessageIsReceive = isReceive;
+            return true;
+        }
+
+        ///
+        /// Resets inline message metadata on the completion slot.
+        ///
+        private unsafe void FreeMessageStorage(int slotIndex)
+        {
+            ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+            // Slot inline storage is cleared on prepare before each reuse; avoid a second full memset on free.
+
+            slotStorage.ReceiveOutputSocketAddress = null;
+            slotStorage.ReceiveOutputControlBuffer = null;
+            slotStorage.ReceiveSocketAddressCapacity = 0;
+            slotStorage.ReceiveControlBufferCapacity = 0;
+            slotStorage.MessageIsReceive = false;
+        }
+
+        ///
+        /// After a recvmsg CQE completes, copies the kernel-written socket address and
+        /// control buffer data from the native msghdr back to the managed MessageHeader's
+        /// output buffers. For sendmsg completions this is a no-op.
+        /// Returns the actual socket address length, control buffer length, and msg_flags written by the kernel.
+ /// + private unsafe void CopyMessageCompletionOutputs( + int slotIndex, + out int socketAddressLen, + out int controlBufferLen, + out uint messageFlags) + { + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex]; + socketAddressLen = 0; + controlBufferLen = 0; + messageFlags = 0; + + if (!slotStorage.MessageIsReceive) + return; + + NativeMsghdr* hdr = (NativeMsghdr*)slotStorage.NativeMsgHdrPtr; + if (hdr == null) + return; + + socketAddressLen = (int)hdr->MsgNameLen; + controlBufferLen = (int)hdr->MsgControlLen; + messageFlags = (uint)hdr->MsgFlags; + + // Copy socket address from native buffer back to managed output buffer + if (slotStorage.ReceiveOutputSocketAddress != null && slotStorage.NativeSocketAddress != null && + slotStorage.ReceiveSocketAddressCapacity > 0 && socketAddressLen > 0) + { + int copyLen = Math.Min(slotStorage.ReceiveSocketAddressCapacity, socketAddressLen); + Buffer.MemoryCopy(slotStorage.NativeSocketAddress, slotStorage.ReceiveOutputSocketAddress, copyLen, copyLen); + } + + // Copy control buffer from native buffer back to managed output buffer + if (slotStorage.ReceiveOutputControlBuffer != null && slotStorage.NativeControlBuffer != null && + slotStorage.ReceiveControlBufferCapacity > 0 && controlBufferLen > 0) + { + int copyLen = Math.Min(slotStorage.ReceiveControlBufferCapacity, controlBufferLen); + Buffer.MemoryCopy(slotStorage.NativeControlBuffer, slotStorage.ReceiveOutputControlBuffer, copyLen, copyLen); + } + } + + /// + /// Decodes a completion slot index from a user_data payload value. + /// The slot index is encoded in the lower bits of the payload. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int DecodeCompletionSlotIndex(ulong payload) + { + return (int)(payload & IoUringConstants.SlotIndexMask); + } + + /// + /// Encodes a completion slot index and generation into a user_data value + /// with the ReservedCompletion tag. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ulong EncodeCompletionSlotUserData(int slotIndex, ulong generation) + { + ulong payload = ((ulong)(generation & IoUringConstants.GenerationMask) << IoUringConstants.SlotIndexBits) | ((ulong)slotIndex & IoUringConstants.SlotIndexMask); + return EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload); + } + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSqeWriters.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSqeWriters.Linux.cs new file mode 100644 index 00000000000000..6a3bbad9a3b883 --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringSqeWriters.Linux.cs @@ -0,0 +1,249 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics; +using System.Runtime.CompilerServices; + +namespace System.Net.Sockets +{ + internal sealed unsafe partial class SocketAsyncEngine + { + /// Converts SocketFlags to the kernel msg_flags representation for io_uring. 
+ private static bool TryConvertIoUringPrepareSocketFlags(SocketFlags flags, out uint rwFlags) + { + const SocketFlags SupportedIoUringFlags = + SocketFlags.OutOfBand | + SocketFlags.Peek | + SocketFlags.DontRoute; + + if ((flags & ~SupportedIoUringFlags) != 0) + { + rwFlags = 0; + return false; + } + + rwFlags = (uint)(int)flags; + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void WriteSendLikeSqe( + IoUringSqe* sqe, + byte opcode, + int sqeFd, + byte sqeFlags, + ulong userData, + byte* buffer, + uint length, + uint rwFlags) + { + sqe->Opcode = opcode; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Addr = (ulong)(nuint)buffer; + sqe->Len = length; + sqe->RwFlags = rwFlags; + sqe->UserData = userData; + } + + /// Writes a recv SQE to the submission ring entry. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void WriteRecvSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + byte* buffer, + uint length, + uint rwFlags) + { + sqe->Opcode = IoUringOpcodes.Recv; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Ioprio = 0; + sqe->Addr = (ulong)(nuint)buffer; + sqe->Len = length; + sqe->RwFlags = rwFlags; + sqe->BufIndex = 0; + sqe->UserData = userData; + } + + /// Writes a read-fixed SQE for registered-buffer receive. + private static unsafe void WriteReadFixedSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + byte* buffer, + uint length, + ushort bufferIndex) + { + sqe->Opcode = IoUringOpcodes.ReadFixed; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Ioprio = 0; + sqe->Addr = (ulong)(nuint)buffer; + sqe->Len = length; + // For non-seekable sockets, offset is ignored; -1 matches "current position" semantics. + sqe->Off = ulong.MaxValue; + sqe->RwFlags = 0; + sqe->BufIndex = bufferIndex; + sqe->UserData = userData; + } + + /// + /// Writes a one-shot recv SQE using provided-buffer selection. + /// The kernel chooses a buffer from the specified buffer group. + /// + private static void WriteProvidedBufferRecvSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + uint requestedLength, + uint rwFlags, + ushort bufferGroupId) + { + sqe->Opcode = IoUringOpcodes.Recv; + sqe->Fd = sqeFd; + sqe->Flags = (byte)(sqeFlags | IoUringConstants.SqeBufferSelect); + sqe->Ioprio = 0; + sqe->Addr = 0; + sqe->Len = requestedLength; + sqe->RwFlags = rwFlags; + sqe->BufIndex = bufferGroupId; + sqe->UserData = userData; + } + + /// + /// Writes a multishot recv SQE to the submission ring entry. + /// The kernel selects buffers from a provided buffer ring (IOSQE_BUFFER_SELECT). + /// + private static void WriteMultishotRecvSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + ushort bufferGroupId) + { + sqe->Opcode = IoUringOpcodes.Recv; + sqe->Fd = sqeFd; + sqe->Flags = (byte)(sqeFlags | IoUringConstants.SqeBufferSelect); + sqe->Ioprio = IoUringConstants.RecvMultishot; + sqe->Addr = 0; + sqe->Len = 0; + sqe->RwFlags = 0; + sqe->BufIndex = bufferGroupId; + sqe->UserData = userData; + } + + /// Writes an accept SQE to the submission ring entry. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void WriteAcceptSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + byte* socketAddress, + IntPtr socketAddressLengthPtr) + { + sqe->Opcode = IoUringOpcodes.Accept; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Addr = (ulong)(nuint)socketAddress; + // Kernel accept prep aliases addr2 at sqe->off. 
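            // (For comparison, liburing's io_uring_prep_accept does the same aliasing in C:
            //  io_uring_prep_rw(IORING_OP_ACCEPT, sqe, fd, addr, 0, (__u64)(unsigned long)addrlen);
            //  i.e. the sockaddr pointer rides in sqe->addr and the socklen_t pointer in sqe->off.
            //  The managed writer below mirrors that field placement.)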
+ sqe->Off = (ulong)(nuint)socketAddressLengthPtr; + sqe->RwFlags = IoUringConstants.AcceptFlags; + sqe->UserData = userData; + } + + /// Writes a multishot accept SQE to the submission ring entry. + private static unsafe void WriteMultishotAcceptSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + byte* socketAddress, + IntPtr socketAddressLengthPtr) + { + sqe->Opcode = IoUringOpcodes.Accept; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Ioprio = IoUringConstants.AcceptMultishot; + sqe->Addr = (ulong)(nuint)socketAddress; + // accept4 prep aliases addr2 at sqe->off for addrlen pointer + sqe->Off = (ulong)(nuint)socketAddressLengthPtr; + sqe->RwFlags = IoUringConstants.AcceptFlags; + sqe->UserData = userData; + } + + private static void WriteSendMsgLikeSqe( + IoUringSqe* sqe, + byte opcode, + int sqeFd, + byte sqeFlags, + ulong userData, + IntPtr messageHeader, + uint rwFlags) + { + sqe->Opcode = opcode; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Addr = (ulong)(nuint)messageHeader; + sqe->Len = 1; + sqe->RwFlags = rwFlags; + sqe->UserData = userData; + } + + /// Writes a recvmsg SQE to the submission ring entry. + private static void WriteRecvMsgSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + IntPtr messageHeader, + uint rwFlags) + { + sqe->Opcode = IoUringOpcodes.RecvMsg; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Addr = (ulong)(nuint)messageHeader; + sqe->Len = 1; + sqe->RwFlags = rwFlags; + sqe->UserData = userData; + } + + /// Writes a connect SQE to the submission ring entry. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void WriteConnectSqe( + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + byte* socketAddress, + int socketAddressLen) + { + sqe->Opcode = IoUringOpcodes.Connect; + sqe->Fd = sqeFd; + sqe->Flags = sqeFlags; + sqe->Addr = (ulong)(nuint)socketAddress; + // Kernel connect prep aliases addrlen at sqe->off and requires len=0. + sqe->Off = (uint)socketAddressLen; + sqe->Len = 0; + sqe->UserData = userData; + } + + /// Writes an ASYNC_CANCEL SQE targeting the specified user_data. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void WriteAsyncCancelSqe(IoUringSqe* sqe, ulong userData) + { + sqe->Opcode = IoUringOpcodes.AsyncCancel; + sqe->Fd = -1; + Debug.Assert((byte)(userData >> IoUringUserDataTagShift) == IoUringConstants.TagReservedCompletion); + sqe->Addr = userData; + sqe->UserData = 0; + } + + + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringTestHooks.Stubs.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringTestHooks.Stubs.Linux.cs new file mode 100644 index 00000000000000..b541bfff24d9d6 --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.IoUringTestHooks.Stubs.Linux.cs @@ -0,0 +1,53 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Runtime.CompilerServices; + +namespace System.Net.Sockets +{ + internal sealed unsafe partial class SocketAsyncEngine + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ResetDebugTestForcedResult(ref IoUringCompletionSlot slot) + { + _ = slot; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ResolveDebugTestForcedResult(ref IoUringCompletionSlot slot, ref int result) + { + _ = slot; + _ = result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ApplyDebugTestForcedResult(ref IoUringCompletionSlot slot, byte opcode) + { + _ = _ioUringInitialized; + _ = slot; + _ = opcode; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void RestoreDebugTestForcedResultIfNeeded(int slotIndex, byte opcode) + { + _ = _ioUringInitialized; + _ = slotIndex; + _ = opcode; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void InitializeDebugTestHooksFromEnvironment() + { + _ = _ioUringInitialized; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool TryConsumeDebugForcedSubmitError(out Interop.Error forcedError) + { + _ = _ioUringInitialized; + forcedError = Interop.Error.SUCCESS; + return false; + } + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs new file mode 100644 index 00000000000000..1b1c50f0c1ac2f --- /dev/null +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Linux.cs @@ -0,0 +1,4611 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; + +namespace System.Net.Sockets +{ + /// Linux socket engine coordinating epoll and io_uring work for process sockets. + /// + /// io_uring completion mode currently uses one active engine/event-loop instance per process. + /// This keeps ownership and teardown semantics simple, but the single submit/drain loop can + /// become a bottleneck at high core/socket densities. + /// Future work may evaluate multi-engine sharding (for example by socket affinity) when high-core + /// throughput data justifies the additional complexity. + /// + internal sealed unsafe partial class SocketAsyncEngine + { + /// Indicates which io_uring dispatch mode is active for this engine instance. + private enum IoUringMode : byte + { + Disabled = 0, + Completion = 1 + } + + /// Distinguishes cancellation requests issued during normal runtime from those during engine teardown. + private enum IoUringCancellationOrigin : byte + { + Runtime = 0, + Teardown = 1 + } + + /// Identifies which CQ-overflow recovery branch is active for logging/telemetry correlation. + private enum IoUringCqOverflowRecoveryBranch : byte + { + MultishotAcceptArming = 0, + Teardown = 1, + // Steady-state branch: normal runtime overflow recovery outside teardown/accept-arm handoff. + DualWave = 2 + } + + /// Tracks the lifecycle of an io_uring operation for debug assertions on valid state transitions. 
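        /// (Presumed legal flow, inferred from the state names rather than stated by the engine:
        /// Queued -> Prepared -> Submitted -> Completed, with Canceled reachable from any
        /// pre-completion state and Detached covering operations whose ownership is released
        /// during teardown.)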
+ private enum IoUringOperationLifecycleState : byte + { + Queued = 0, + Prepared = 1, + Submitted = 2, + Completed = 3, + Canceled = 4, + Detached = 5 + } + + /// Result of attempting to remove a tracked operation by user_data. + private enum IoUringTrackedOperationRemoveResult : byte + { + Removed = 0, + NotFound = 1, + Mismatch = 2 + } + + /// Immutable snapshot of negotiated io_uring capabilities for this engine instance. + private readonly struct LinuxIoUringCapabilities + { + private const uint FlagIsIoUringPort = 1u << 0; + private const uint FlagSupportsMultishotRecv = 1u << 1; + private const uint FlagSupportsMultishotAccept = 1u << 2; + private const uint FlagSupportsZeroCopySend = 1u << 3; + private const uint FlagSqPollEnabled = 1u << 4; + private const uint FlagSupportsProvidedBufferRings = 1u << 5; + private const uint FlagHasRegisteredBuffers = 1u << 6; + + private readonly uint _flags; + + /// The active io_uring dispatch mode. + internal IoUringMode Mode { get; } + + /// Whether the engine's port was created as an io_uring instance. + internal bool IsIoUringPort => (_flags & FlagIsIoUringPort) != 0; + /// Whether multishot recv can be used by this engine instance. + internal bool SupportsMultishotRecv => (_flags & FlagSupportsMultishotRecv) != 0; + /// Whether multishot accept can be used by this engine instance. + internal bool SupportsMultishotAccept => (_flags & FlagSupportsMultishotAccept) != 0; + /// Whether zero-copy send is enabled for this engine instance. + internal bool SupportsZeroCopySend => (_flags & FlagSupportsZeroCopySend) != 0; + /// Whether SQPOLL mode is enabled for this engine instance. + internal bool SqPollEnabled => (_flags & FlagSqPollEnabled) != 0; + /// Whether provided-buffer rings are active for this engine instance. + internal bool SupportsProvidedBufferRings => (_flags & FlagSupportsProvidedBufferRings) != 0; + /// Whether provided buffers are currently registered with the kernel. + internal bool HasRegisteredBuffers => (_flags & FlagHasRegisteredBuffers) != 0; + + /// Whether the engine is operating in full completion mode. + internal bool IsCompletionMode => + Mode == IoUringMode.Completion; + + private LinuxIoUringCapabilities(IoUringMode mode, uint flags) + { + Mode = mode; + _flags = flags; + } + + internal LinuxIoUringCapabilities WithMode(IoUringMode mode) => + new LinuxIoUringCapabilities(mode, _flags); + + internal LinuxIoUringCapabilities WithIsIoUringPort(bool value) => + WithFlag(FlagIsIoUringPort, value); + + internal LinuxIoUringCapabilities WithSupportsMultishotRecv(bool value) => + WithFlag(FlagSupportsMultishotRecv, value); + + internal LinuxIoUringCapabilities WithSupportsMultishotAccept(bool value) => + WithFlag(FlagSupportsMultishotAccept, value); + + internal LinuxIoUringCapabilities WithSupportsZeroCopySend(bool value) => + WithFlag(FlagSupportsZeroCopySend, value); + + internal LinuxIoUringCapabilities WithSqPollEnabled(bool value) => + WithFlag(FlagSqPollEnabled, value); + + internal LinuxIoUringCapabilities WithSupportsProvidedBufferRings(bool value) => + WithFlag(FlagSupportsProvidedBufferRings, value); + + internal LinuxIoUringCapabilities WithHasRegisteredBuffers(bool value) => + WithFlag(FlagHasRegisteredBuffers, value); + + private LinuxIoUringCapabilities WithFlag(uint flag, bool value) + { + uint flags = value ? 
(_flags | flag) : (_flags & ~flag); + return new LinuxIoUringCapabilities(Mode, flags); + } + } + + [Flags] + private enum IoUringConfigurationWarningFlags : byte + { + None = 0, + SqPollRequestedWithoutIoUring = 1 << 0, + DirectSqeDisabledWithoutIoUring = 1 << 1, + ZeroCopyOptInWithoutIoUring = 1 << 2 + } + + /// Immutable process-wide snapshot of resolved io_uring configuration inputs. + private readonly struct IoUringResolvedConfiguration + { + internal bool IoUringEnabled { get; } + internal bool SqPollRequested { get; } + internal bool DirectSqeDisabled { get; } + internal bool ZeroCopySendOptedIn { get; } + internal bool RegisterBuffersEnabled { get; } + internal bool AdaptiveProvidedBufferSizingEnabled { get; } + internal int ProvidedBufferSize { get; } + internal int PrepareQueueCapacity { get; } + internal int CancellationQueueCapacity { get; } + private readonly IoUringConfigurationWarningFlags _warningFlags; + + internal IoUringResolvedConfiguration( + bool ioUringEnabled, + bool sqPollRequested, + bool directSqeDisabled, + bool zeroCopySendOptedIn, + bool registerBuffersEnabled, + bool adaptiveProvidedBufferSizingEnabled, + int providedBufferSize, + int prepareQueueCapacity, + int cancellationQueueCapacity) + { + IoUringEnabled = ioUringEnabled; + SqPollRequested = sqPollRequested; + DirectSqeDisabled = directSqeDisabled; + ZeroCopySendOptedIn = zeroCopySendOptedIn; + RegisterBuffersEnabled = registerBuffersEnabled; + AdaptiveProvidedBufferSizingEnabled = adaptiveProvidedBufferSizingEnabled; + ProvidedBufferSize = providedBufferSize; + PrepareQueueCapacity = prepareQueueCapacity; + CancellationQueueCapacity = cancellationQueueCapacity; + _warningFlags = ComputeWarningFlags( + ioUringEnabled, + sqPollRequested, + directSqeDisabled, + zeroCopySendOptedIn); + } + + internal string ToLogString() => + $"enabled={IoUringEnabled}, sqpollRequested={SqPollRequested}, directSqeDisabled={DirectSqeDisabled}, zeroCopySendOptedIn={ZeroCopySendOptedIn}, registerBuffersEnabled={RegisterBuffersEnabled}, adaptiveProvidedBufferSizingEnabled={AdaptiveProvidedBufferSizingEnabled}, providedBufferSize={ProvidedBufferSize}, prepareQueueCapacity={PrepareQueueCapacity}, cancellationQueueCapacity={CancellationQueueCapacity}"; + + internal bool TryGetValidationWarnings([NotNullWhen(true)] out string? 
warnings)
+            {
+                if (_warningFlags == IoUringConfigurationWarningFlags.None)
+                {
+                    warnings = null;
+                    return false;
+                }
+
+                warnings = BuildWarningMessage(_warningFlags);
+                return true;
+            }
+
+            private static IoUringConfigurationWarningFlags ComputeWarningFlags(
+                bool ioUringEnabled,
+                bool sqPollRequested,
+                bool directSqeDisabled,
+                bool zeroCopySendOptedIn)
+            {
+                IoUringConfigurationWarningFlags warnings = IoUringConfigurationWarningFlags.None;
+                if (!ioUringEnabled && sqPollRequested)
+                {
+                    warnings |= IoUringConfigurationWarningFlags.SqPollRequestedWithoutIoUring;
+                }
+
+                if (!ioUringEnabled && directSqeDisabled)
+                {
+                    warnings |= IoUringConfigurationWarningFlags.DirectSqeDisabledWithoutIoUring;
+                }
+
+                if (!ioUringEnabled && zeroCopySendOptedIn)
+                {
+                    warnings |= IoUringConfigurationWarningFlags.ZeroCopyOptInWithoutIoUring;
+                }
+
+                return warnings;
+            }
+
+            private static string BuildWarningMessage(IoUringConfigurationWarningFlags warnings)
+            {
+                var parts = new List<string>(3);
+                if ((warnings & IoUringConfigurationWarningFlags.SqPollRequestedWithoutIoUring) != 0)
+                {
+                    parts.Add("SQPOLL requested while io_uring is disabled");
+                }
+
+                if ((warnings & IoUringConfigurationWarningFlags.DirectSqeDisabledWithoutIoUring) != 0)
+                {
+                    parts.Add("direct SQE disabled while io_uring is disabled");
+                }
+
+                if ((warnings & IoUringConfigurationWarningFlags.ZeroCopyOptInWithoutIoUring) != 0)
+                {
+                    parts.Add("zero-copy send opted-in while io_uring is disabled");
+                }
+
+                return string.Join("; ", parts);
+            }
+        }
+
+        /// Mirrors kernel struct io_uring_sqe (64 bytes), written to the SQ ring for submission.
+        [StructLayout(LayoutKind.Explicit, Size = 64)]
+        internal struct IoUringSqe
+        {
+            [FieldOffset(0)]
+            internal byte Opcode;
+            [FieldOffset(1)]
+            internal byte Flags;
+            [FieldOffset(2)]
+            internal ushort Ioprio;
+            [FieldOffset(4)]
+            internal int Fd;
+            [FieldOffset(8)]
+            internal ulong Off;
+            [FieldOffset(16)]
+            internal ulong Addr;
+            [FieldOffset(24)]
+            internal uint Len;
+            [FieldOffset(28)]
+            internal uint RwFlags;
+            [FieldOffset(32)]
+            internal ulong UserData;
+            [FieldOffset(40)]
+            internal ushort BufIndex;
+            [FieldOffset(42)]
+            internal ushort Personality;
+            [FieldOffset(44)]
+            internal int SpliceFdIn;
+            [FieldOffset(48)]
+            internal ulong Addr3;
+        }
+
+        /// Mirrors kernel struct io_uring_probe_op (8 bytes per entry in the probe ops array).
+        [StructLayout(LayoutKind.Explicit, Size = 8)]
+        private struct IoUringProbeOp
+        {
+            [FieldOffset(0)] internal byte Op;
+            [FieldOffset(1)] internal byte Resv;
+            [FieldOffset(2)] internal ushort Flags;
+            // 4 bytes reserved at offset 4
+        }
+
+        /// Mirrors kernel struct io_uring_probe (16-byte header preceding the variable-length ops array).
+        [StructLayout(LayoutKind.Explicit, Size = 16)]
+        private struct IoUringProbeHeader
+        {
+            [FieldOffset(0)] internal byte LastOp;
+            [FieldOffset(1)] internal byte OpsLen;
+            // 14 bytes reserved at offset 2
+        }
+
+        ///
+        /// Kernel ABI opcode constants as a static class (not an enum) to avoid byte-cast noise
+        /// at every SQE write site, since the SQE Opcode field is typed as byte.
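        /// (Illustration of the trade-off: with an enum-typed opcode, every write site would need
        /// a cast such as sqe->Opcode = (byte)SomeOpcodeEnum.Recv; where SomeOpcodeEnum is a
        /// hypothetical enum shape, while the byte constants keep it to
        /// sqe->Opcode = IoUringOpcodes.Recv;)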
+        ///
+        private static class IoUringOpcodes
+        {
+            internal const byte ReadFixed = 4;
+            internal const byte Send = 26;
+            internal const byte Recv = 27;
+            internal const byte SendMsg = 9;
+            internal const byte RecvMsg = 10;
+            internal const byte Accept = 13;
+            internal const byte Connect = 16;
+            internal const byte SendZc = 47; // IORING_OP_SEND_ZC
+            internal const byte SendMsgZc = 48; // IORING_OP_SENDMSG_ZC
+            internal const byte AsyncCancel = 14;
+            internal const byte PollAdd = 6;
+        }
+
+        ///
+        /// Centralizes io_uring ABI constants that mirror the native definitions in pal_io_uring.c.
+        /// These are used by managed code that directly interacts with the io_uring submission
+        /// and completion rings (e.g., direct SQE writes via mmap'd ring access).
+        ///
+        private static class IoUringConstants
+        {
+            // Setup flags (io_uring_setup params.flags)
+            internal const uint SetupCqSize = 1u << 3;
+            internal const uint SetupSqPoll = 1u << 1; // IORING_SETUP_SQPOLL
+            internal const uint SetupSubmitAll = 1u << 7;
+            internal const uint SetupCoopTaskrun = 1u << 8;
+            internal const uint SetupSqe128 = 1u << 10;
+            internal const uint SetupSingleIssuer = 1u << 12;
+            internal const uint SetupDeferTaskrun = 1u << 13;
+            internal const uint SetupRDisabled = 1u << 6;
+            internal const uint SetupNoSqArray = 1u << 16;
+            internal const uint SetupCloexec = 1u << 19;
+
+            // Feature flags (io_uring_params.features)
+            internal const uint FeatureSingleMmap = 1u << 0;
+            internal const uint FeatureExtArg = 1u << 8;
+
+            // Enter flags (io_uring_enter flags parameter)
+            internal const uint EnterGetevents = 1u << 0;
+            internal const uint EnterSqWakeup = 1u << 1;
+            internal const uint EnterExtArg = 1u << 3;
+            internal const uint EnterRegisteredRing = 1u << 4;
+
+            // SQ ring flags (sq_ring->flags)
+            internal const uint SqNeedWakeup = 1u << 0;
+
+            // Register opcodes
+            internal const uint RegisterEnableRings = 12; // IORING_REGISTER_ENABLE_RINGS
+            internal const uint RegisterBuffers = 0;
+            internal const uint UnregisterBuffers = 1;
+            internal const uint RegisterProbe = 8;
+            internal const uint RegisterRingFds = 20;
+            internal const uint UnregisterRingFds = 21;
+            internal const uint RegisterPbufRing = 22;
+            internal const uint UnregisterPbufRing = 23;
+
+            // Register helper values
+            internal const uint RegisterOffsetAuto = 0xFFFFFFFFU;
+
+            // Probe op flags
+            internal const uint ProbeOpFlagSupported = 1u << 0;
+
+            // Poll flags
+            internal const uint PollAddFlagMulti = 1u << 0;
+            internal const uint PollIn = 0x0001;
+
+            // CQE flags
+            internal const uint CqeFBuffer = 1u << 0; // IORING_CQE_F_BUFFER (buffer id in upper bits)
+            internal const uint CqeFMore = 1u << 1; // IORING_CQE_F_MORE (multishot)
+            internal const uint CqeFNotif = 1u << 2; // IORING_CQE_F_NOTIF (zero-copy notification)
+            internal const int CqeBufferShift = 16; // IORING_CQE_BUFFER_SHIFT
+
+            // Recv ioprio flags
+            internal const ushort RecvMultishot = 1 << 1; // IORING_RECV_MULTISHOT
+            // Accept ioprio flags
+            internal const ushort AcceptMultishot = 1 << 0; // IORING_ACCEPT_MULTISHOT
+
+            // SQE flags
+            internal const byte SqeBufferSelect = 1 << 5; // IOSQE_BUFFER_SELECT
+
+            // Sizing
+            internal const uint QueueEntries = 1024;
+            // Keep CQ capacity at 4x SQ entries to absorb completion bursts during short GC pauses
+            // without immediately tripping overflow recovery on busy rings.
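            // (Worked example under the defaults here, assuming setup requests
            //  QueueEntries * CqEntriesFactor completions via IORING_SETUP_CQSIZE:
            //  1024 SQ entries => a 4096-entry CQ ring, so a completely full submission
            //  wave can complete four times over before the kernel reports CQ overflow.)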
+ internal const uint CqEntriesFactor = 4; + internal const uint MaxCqeDrainBatch = 128; + // Bounded wait trades wake latency for starvation resilience: + // if an eventfd wake is missed or deferred, the event loop still polls at least once + // every 50ms (worst-case deferred wake latency). + internal const long BoundedWaitTimeoutNanos = 50L * 1000 * 1000; // 50ms + // Circuit-breaker bounded wait used after repeated eventfd wake failures. + internal const long WakeFailureFallbackWaitTimeoutNanos = 1L * 1000 * 1000; // 1ms + + // Completion operation pool sizing + internal const int CompletionOperationPoolCapacityFactor = 2; + + // mmap offsets (from kernel UAPI: IORING_OFF_SQ_RING, IORING_OFF_CQ_RING, IORING_OFF_SQES) + internal const ulong OffSqRing = 0; + internal const ulong OffCqRing = 0x8000000; + internal const ulong OffSqes = 0x10000000; + + // Minimum kernel version for io_uring engine. + // SEND_ZC deferred-completion logic relies on NOTIF CQE sequencing behavior stabilized in Linux 6.1.0. + internal const int MinKernelMajor = 6; + internal const int MinKernelMinor = 1; + + // Zero-copy send size threshold (payloads below this use regular send). + internal const int ZeroCopySendThreshold = 16384; // 16KB + + // User data tag values (encoded in upper bits of user_data) + internal const byte TagNone = 0; + internal const byte TagReservedCompletion = 2; + internal const byte TagWakeupSignal = 3; + + // Accept-time flags for accepted socket descriptors: SOCK_CLOEXEC | SOCK_NONBLOCK. + internal const uint AcceptFlags = 0x80800; + + // Message inline capacities (avoid heap allocation on common small payloads) + internal const int MessageInlineIovCount = 4; + internal const int MessageInlineSocketAddressCapacity = 128; // sizeof(sockaddr_storage) + internal const int MessageInlineControlBufferCapacity = 128; + + // Internal discriminator for io_uring vs epoll fallback detection + internal const int NotSocketEventPort = int.MinValue + 1; + + // Completion slot encoding + // Slot index is encoded into 13 bits of user_data payload => max 8192 slot IDs per engine. + // Capacity planning note: with persistent multishot receives occupying long-lived slots, + // a 4000-connection steady state leaves ~4192 slots for transient sends/connects/one-shot recvs. + internal const int SlotIndexBits = 13; + internal const ulong SlotIndexMask = (1UL << SlotIndexBits) - 1UL; + internal const int GenerationBits = 56 - SlotIndexBits; + // 43-bit generation space gives each slot ~8.8 trillion incarnations before wrap. + // Generation zero remains reserved as "uninitialized", so wrap remaps 2^43-1 -> 1. + internal const ulong GenerationMask = (1UL << GenerationBits) - 1UL; + + // Test hook opcode masks (mirrors IoUringTestOpcodeMask in pal_io_uring.c) + internal const byte TestOpcodeMaskNone = 0; + internal const byte TestOpcodeMaskSend = 1 << 0; + internal const byte TestOpcodeMaskRecv = 1 << 1; + internal const byte TestOpcodeMaskSendMsg = 1 << 2; + internal const byte TestOpcodeMaskRecvMsg = 1 << 3; + internal const byte TestOpcodeMaskAccept = 1 << 4; + internal const byte TestOpcodeMaskConnect = 1 << 5; + internal const byte TestOpcodeMaskSendZc = 1 << 6; + internal const byte TestOpcodeMaskSendMsgZc = 1 << 7; + } + + /// Captures the results of io_uring_setup(2) including ring fd, negotiated params, and feature flags. 
+        private struct IoUringSetupResult
+        {
+            internal int RingFd;
+            internal Interop.Sys.IoUringParams Params;
+            internal uint NegotiatedFlags;
+            internal bool UsesExtArg;
+            internal bool SqPollNegotiated;
+        }
+
+        /// Discriminates completion slot metadata shape for operation-specific post-completion processing.
+        private enum IoUringCompletionOperationKind : byte
+        {
+            None = 0,
+            Accept = 1,
+            Message = 2,
+        }
+
+        ///
+        /// Hot per-slot metadata used on every CQE dispatch.
+        /// Keep this minimal; native pointer-heavy state is kept in IoUringCompletionSlotStorage.
+        /// Explicit 24-byte layout keeps generation/free-list state and hot flags in one compact block.
+        ///
+        [StructLayout(LayoutKind.Explicit, Size = 24)]
+        private struct IoUringCompletionSlot
+        {
+            // 0..7
+            [FieldOffset(0)]
+            public ulong Generation;
+            // 8..11 (-1 = end of free list)
+            [FieldOffset(8)]
+            public int FreeListNext;
+            // 12..15 (operation kind + hot state flags)
+            [FieldOffset(12)]
+            private uint _packedState;
+            // 16..17
+            [FieldOffset(16)]
+            public ushort FixedRecvBufferId;
+#if DEBUG
+            // 20..23 debug-only forced completion result payload.
+            [FieldOffset(20)]
+            public int TestForcedResult;
+#endif
+
+            private const uint KindMask = 0xFFu;
+            private const uint FlagIsZeroCopySend = 1u << 8;
+            private const uint FlagZeroCopyNotificationPending = 1u << 9;
+            private const uint FlagUsesFixedRecvBuffer = 1u << 10;
+#if DEBUG
+            private const uint FlagHasTestForcedResult = 1u << 11;
+#endif
+
+            public IoUringCompletionOperationKind Kind
+            {
+                get => (IoUringCompletionOperationKind)(_packedState & KindMask);
+                set => _packedState = (_packedState & ~KindMask) | ((uint)value & KindMask);
+            }
+
+            public bool IsZeroCopySend
+            {
+                get => (_packedState & FlagIsZeroCopySend) != 0;
+                set => SetFlag(FlagIsZeroCopySend, value);
+            }
+
+            public bool ZeroCopyNotificationPending
+            {
+                get => (_packedState & FlagZeroCopyNotificationPending) != 0;
+                set => SetFlag(FlagZeroCopyNotificationPending, value);
+            }
+
+            public bool UsesFixedRecvBuffer
+            {
+                get => (_packedState & FlagUsesFixedRecvBuffer) != 0;
+                set => SetFlag(FlagUsesFixedRecvBuffer, value);
+            }
+
+#if DEBUG
+            public bool HasTestForcedResult
+            {
+                get => (_packedState & FlagHasTestForcedResult) != 0;
+                set => SetFlag(FlagHasTestForcedResult, value);
+            }
+#endif
+
+            private void SetFlag(uint mask, bool value)
+            {
+                if (value)
+                {
+                    _packedState |= mask;
+                }
+                else
+                {
+                    _packedState &= ~mask;
+                }
+            }
+        }
+
+        ///
+        /// Hot tracked-operation ownership state used on completion and cancellation paths.
+        /// Kept separate from native slot storage to improve cache locality in CQE dispatch.
+        ///
+        private struct IoUringTrackedOperationState
+        {
+            public SocketAsyncContext.AsyncOperation? TrackedOperation;
+            public ulong TrackedOperationGeneration;
+        }
+
+        ///
+        /// Cold per-slot native metadata: pointers and message writeback state needed only for
+        /// operation-specific completion processing.
+        ///
+        private struct IoUringCompletionSlotStorage
+        {
+            // Hold a DangerousAddRef lease for the socket fd until this slot is fully retired.
+            public SafeSocketHandle? DangerousRefSocketHandle;
+            // Per-slot pre-allocated native slab backing accept socklen_t and message inline storage.
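            // (Assumed slab shape, sized by GetCompletionSlotNativeStorageStride(), roughly:
            //  [socklen_t][msghdr, 56B][iovec x MessageInlineIovCount][sockaddr, 128B][control, 128B].
            //  Every typed pointer below is an interior pointer into this one per-slot region, so
            //  freeing the engine-level AllocZeroed block releases all slots' native state at once.)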
+            public unsafe byte* NativeInlineStorage;
+            // Accept metadata
+            public unsafe int* NativeSocketAddressLengthPtr; // socklen_t* in NativeInlineStorage
+            // Message metadata (pointers to native-alloc'd msghdr/iovec)
+            public IntPtr NativeMsgHdrPtr;
+            public bool MessageIsReceive;
+            // Message metadata - deep-copied native msghdr constituents (point into NativeInlineStorage).
+            public unsafe Interop.Sys.IOVector* NativeIOVectors;
+            public unsafe byte* NativeSocketAddress;
+            public unsafe byte* NativeControlBuffer;
+            // RecvMsg output capture - pointers back to managed MessageHeader buffers for writeback
+            public unsafe byte* ReceiveOutputSocketAddress;
+            public unsafe byte* ReceiveOutputControlBuffer;
+            public int ReceiveSocketAddressCapacity;
+            public int ReceiveControlBufferCapacity;
+        }
+
+        ///
+        /// Mirrors the kernel's struct msghdr layout for direct SQE submission.
+        /// Used by TryPrepareInlineMessageStorage to build a native msghdr that
+        /// io_uring sendmsg/recvmsg opcodes can consume directly.
+        /// Must only be used on 64-bit Linux where sizeof(msghdr) == 56.
+        ///
+        [StructLayout(LayoutKind.Explicit)]
+        private unsafe struct NativeMsghdr
+        {
+            [FieldOffset(0)]
+            public void* MsgName;
+            [FieldOffset(8)]
+            public uint MsgNameLen;
+            [FieldOffset(16)]
+            public Interop.Sys.IOVector* MsgIov;
+            [FieldOffset(24)]
+            public nuint MsgIovLen;
+            [FieldOffset(32)]
+            public void* MsgControl;
+            [FieldOffset(40)]
+            public nuint MsgControlLen;
+            [FieldOffset(48)]
+            public int MsgFlags;
+        }
+
+        ///
+        /// Grouped managed ring mmap state.
+        /// Keeping these fields in a single struct reduces top-level instance-field sprawl.
+        ///
+        private unsafe struct ManagedRingState
+        {
+            public Interop.Sys.IoUringCqe* CqeBase;
+            public uint* CqTailPtr;
+            public uint* CqHeadPtr;
+            public uint CqMask;
+            public uint CqEntries;
+            public uint* CqOverflowPtr;
+            public uint ObservedCqOverflow;
+            public byte* SqRingPtr;
+            public byte* CqRingPtr;
+            public uint* SqFlagsPtr;
+            public ulong SqRingSize;
+            public ulong CqRingSize;
+            public ulong SqesSize;
+            public bool UsesSingleMmap;
+            public int RingFd;
+            public bool UsesExtArg;
+            public bool UsesNoSqArray;
+            public uint NegotiatedFlags;
+            public uint CachedCqHead;
+            public bool CqDrainEnabled;
+            public int WakeupEventFd;
+
+            public static ManagedRingState CreateDefault()
+            {
+                ManagedRingState state = default;
+                state.RingFd = -1;
+                state.WakeupEventFd = -1;
+                return state;
+            }
+        }
+
+        private const int IoUringDiagnosticsPollInterval = 64;
+        private const long DiagnosticSampleMask = IoUringDiagnosticsPollInterval - 1;
+        private const int MaxIoUringPrepareQueueDrainPerSubmit = 256;
+        private const int MaxIoUringCancelQueueDrainPerSubmit = 256;
+        private const int MaxSlotExhaustionRetries = 3;
+        private const int MaxIoUringSqeAcquireSubmitAttempts = 16;
+        private const int CqOverflowTrackedSweepDelayMilliseconds = 250;
+        private const int CqOverflowTrackedSweepMaxRearms = 8;
+        private const int IoUringWakeFailureCircuitBreakerThreshold = 8;
+        private const string IoUringEnvironmentVariable = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING";
+        private const string IoUringSqPollEnvironmentVariable = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL";
+        private const string UseIoUringAppContextSwitch = "System.Net.Sockets.UseIoUring";
+        private const string UseIoUringSqPollAppContextSwitch = "System.Net.Sockets.UseIoUringSqPoll";
+        // Configuration matrix (7 surfaces):
+        // 1) DOTNET_SYSTEM_NET_SOCKETS_IO_URING
+        // 2) AppContext: System.Net.Sockets.UseIoUring
+        // 3) DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL
+        // 4)
AppContext: System.Net.Sockets.UseIoUringSqPoll + // 5) DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE (DEBUG) + // 6) DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND (DEBUG) + // 7) DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS (DEBUG, in IoUringProvidedBufferRing) + // + // Precedence: + // - Primary gate: env (surface #1) overrides AppContext (surface #2); AppContext is used only when env is unset. + // - SQPOLL gate: dual opt-in requires both env (surface #3 == "1") and AppContext (surface #4 == true). + private const ulong IoUringUserDataPayloadMask = 0x00FF_FFFF_FFFF_FFFFUL; + private const int IoUringUserDataTagShift = 56; + private static readonly int s_ioUringPrepareQueueCapacity = GetIoUringPrepareQueueCapacity(); + private static readonly int s_ioUringCancellationQueueCapacity = s_ioUringPrepareQueueCapacity; + private int _ioUringResolvedConfigurationLogged; + private long _ioUringPendingRetryQueuedToPrepareQueueCount; + private long _ioUringNonPinnablePrepareFallbackCount; + private long _ioUringPublishedNonPinnablePrepareFallbackCount; + private MpscQueue? _ioUringPrepareQueue; + private MpscQueue? _ioUringCancelQueue; + private long _ioUringPrepareQueueLength; + private long _ioUringCancelQueueLength; + private long _ioUringPrepareQueueOverflowCount; + private long _ioUringCancelQueueOverflowCount; + private long _ioUringPrepareQueueOverflowFallbackCount; + private long _ioUringCompletionSlotExhaustionCount; + private long _ioUringCompletionSlotDrainRecoveryCount; + private long _ioUringPublishedPrepareQueueLength; + private long _ioUringBenignLateCompletionCount; + private long _ioUringCompletionRequeueFailureCount; + private long _ioUringUntrackMismatchCount; + private long _ioUringPublishedPrepareQueueOverflowCount; + private long _ioUringPublishedPrepareQueueOverflowFallbackCount; + private long _ioUringPublishedCompletionRequeueFailureCount; + private long _ioUringPublishedCompletionSlotExhaustionCount; + private long _ioUringPublishedCompletionSlotDrainRecoveryCount; + private int _ioUringDiagnosticsPollCountdown; + private bool _ioUringAdvancedFeatureStateLogged; + private int _ioUringWakeFailureConsecutiveCount; + private uint _ioUringWakeupGeneration; + private int _ioUringPortClosedForTeardown; + // Release-published teardown gate. Readers use Volatile.Read in enqueue/wakeup paths + // to prevent new io_uring work from being published after teardown begins. + private int _ioUringTeardownInitiated; + private int _ioUringSlotCapacity; + private bool _completionSlotDrainInProgress; + private bool _cqOverflowRecoveryActive; + private IoUringCqOverflowRecoveryBranch _cqOverflowRecoveryBranch; + private long _cqOverflowTrackedSweepDeadlineTicks; + private int _cqOverflowTrackedSweepRearmCount; + private uint _ioUringManagedPendingSubmissions; + private uint _ioUringManagedSqTail; + private bool _ioUringManagedSqTailLoaded; + private Interop.Sys.IoUringSqRingInfo _ioUringSqRingInfo; + private bool _managedSqeInvariantsValidated; + private bool _ioUringDirectSqeEnabled; + private ManagedRingState _ringState = ManagedRingState.CreateDefault(); + + // Per-opcode support flags, populated by ProbeIoUringOpcodeSupport. 
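        // (Sketch of where these flags come from, assuming the standard IORING_REGISTER_PROBE
        //  contract modeled by IoUringProbeHeader/IoUringProbeOp above: the kernel fills
        //  header.LastOp and the ops[] array, and an opcode counts as supported when
        //  opcode <= header.LastOp && (ops[opcode].Flags & IoUringConstants.ProbeOpFlagSupported) != 0.)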
+ private bool _supportsOpSend; + private bool _supportsOpReadFixed; + private bool _supportsOpRecv; + private bool _supportsOpSendMsg; + private bool _supportsOpRecvMsg; + private bool _supportsOpAccept; + private bool _supportsOpConnect; + private bool _supportsOpSendZc; + private bool _supportsOpSendMsgZc; + private bool _supportsOpAsyncCancel; + private bool _supportsMultishotRecv; + private bool _supportsMultishotAccept; + private bool _zeroCopySendEnabled; + + private bool _sqPollEnabled; + private bool _ioUringInitialized; + private int _ioUringDrainTelemetryBatchActive; + private long _ioUringDrainBatchProvidedBufferDepletionCount; + private long _ioUringDrainBatchProvidedBufferRecycleCount; + private long _ioUringDrainBatchPersistentMultishotRecvEarlyDataCount; + private IoUringProvidedBufferRing? _ioUringProvidedBufferRing; + private ushort _ioUringProvidedBufferGroupId; + // SoA split: hot completion slot state and cold native storage/tracking metadata. + private IoUringCompletionSlot[]? _completionSlots; + private IoUringTrackedOperationState[]? _trackedOperations; + private IoUringCompletionSlotStorage[]? _completionSlotStorage; + private unsafe byte* _completionSlotNativeStorage; + private nuint _completionSlotNativeStorageStride; + private int _trackedIoUringOperationCount; + private System.Buffers.MemoryHandle[]? _zeroCopyPinHolds; + private int _completionSlotFreeListHead = -1; + private int _completionSlotsInUse; + private int _completionSlotsHighWaterMark; + private int _liveAcceptCompletionSlotCount; + +#if DEBUG + // Test hook state: forced completion result injection (mirrors native pal_io_uring.c test hooks). + private byte _testForceEagainOnceMask; + private byte _testForceEcanceledOnceMask; + private int _testForceSubmitEpermOnce; + // Test-only observability for cancel-queue full retry path. + private long _testCancelQueueWakeRetryCount; +#endif + + private LinuxIoUringCapabilities _ioUringCapabilities; + + // Managed ring state accessors (backed by _ringState). 
+ private unsafe Interop.Sys.IoUringCqe* _managedCqeBase + { + get => _ringState.CqeBase; + set => _ringState.CqeBase = value; + } + + private unsafe uint* _managedCqTailPtr + { + get => _ringState.CqTailPtr; + set => _ringState.CqTailPtr = value; + } + + private unsafe uint* _managedCqHeadPtr + { + get => _ringState.CqHeadPtr; + set => _ringState.CqHeadPtr = value; + } + + private uint _managedCqMask + { + get => _ringState.CqMask; + set => _ringState.CqMask = value; + } + + private uint _managedCqEntries + { + get => _ringState.CqEntries; + set => _ringState.CqEntries = value; + } + + private unsafe uint* _managedCqOverflowPtr + { + get => _ringState.CqOverflowPtr; + set => _ringState.CqOverflowPtr = value; + } + + private uint _managedObservedCqOverflow + { + get => _ringState.ObservedCqOverflow; + set => _ringState.ObservedCqOverflow = value; + } + + private unsafe byte* _managedSqRingPtr + { + get => _ringState.SqRingPtr; + set => _ringState.SqRingPtr = value; + } + + private unsafe byte* _managedCqRingPtr + { + get => _ringState.CqRingPtr; + set => _ringState.CqRingPtr = value; + } + + private unsafe uint* _managedSqFlagsPtr + { + get => _ringState.SqFlagsPtr; + set => _ringState.SqFlagsPtr = value; + } + + private ulong _managedSqRingSize + { + get => _ringState.SqRingSize; + set => _ringState.SqRingSize = value; + } + + private ulong _managedCqRingSize + { + get => _ringState.CqRingSize; + set => _ringState.CqRingSize = value; + } + + private ulong _managedSqesSize + { + get => _ringState.SqesSize; + set => _ringState.SqesSize = value; + } + + private bool _managedUsesSingleMmap + { + get => _ringState.UsesSingleMmap; + set => _ringState.UsesSingleMmap = value; + } + + private int _managedRingFd + { + get => _ringState.RingFd; + set => _ringState.RingFd = value; + } + + private bool _managedUsesExtArg + { + get => _ringState.UsesExtArg; + set => _ringState.UsesExtArg = value; + } + + private bool _managedUsesNoSqArray + { + get => _ringState.UsesNoSqArray; + set => _ringState.UsesNoSqArray = value; + } + + private uint _managedNegotiatedFlags + { + get => _ringState.NegotiatedFlags; + set => _ringState.NegotiatedFlags = value; + } + + private uint _managedCachedCqHead + { + get => _ringState.CachedCqHead; + set => _ringState.CachedCqHead = value; + } + + private bool _managedCqDrainEnabled + { + get => _ringState.CqDrainEnabled; + set => _ringState.CqDrainEnabled = value; + } + + private int _managedWakeupEventFd + { + get => _ringState.WakeupEventFd; + set => _ringState.WakeupEventFd = value; + } + + /// Whether this engine instance is using io_uring completion mode. + internal bool IsIoUringCompletionModeEnabled => _ioUringCapabilities.IsCompletionMode; + /// Whether managed direct SQE submission is enabled. + internal bool IsIoUringDirectSqeEnabled => _ioUringDirectSqeEnabled; + /// Whether a connected send payload is eligible for the SEND_ZC path. + internal bool ShouldTryIoUringDirectSendZeroCopy(int payloadLength) => + IsIoUringZeroCopySendEligible(payloadLength, requiresSendMessageOpcode: false); + /// Whether a message-based send payload is eligible for the SENDMSG_ZC path. + internal bool ShouldTryIoUringDirectSendMessageZeroCopy(int payloadLength) => + IsIoUringZeroCopySendEligible(payloadLength, requiresSendMessageOpcode: true); + + /// + /// Centralized zero-copy policy: + /// 1) process-level opt-in, 2) opcode support, 3) payload threshold. + /// The threshold is based on total payload bytes so buffer-list workloads (e.g. 
4KB segments) + /// are eligible once the aggregate payload crosses the cutoff. + /// + private bool IsIoUringZeroCopySendEligible(int payloadLength, bool requiresSendMessageOpcode) + { + if (!_zeroCopySendEnabled || payloadLength < IoUringConstants.ZeroCopySendThreshold) + { + return false; + } + + return requiresSendMessageOpcode ? _supportsOpSendMsgZc : _supportsOpSendZc; + } + + /// + /// Reads the total count of pending completions that had to requeue through prepare queues + /// after inline completion-mode re-prepare was not used. + /// + internal static long GetIoUringPendingRetryQueuedToPrepareQueueCount() + { + long total = 0; + foreach (SocketAsyncEngine engine in s_engines) + { + total += Interlocked.Read(ref engine._ioUringPendingRetryQueuedToPrepareQueueCount); + } + + return total; + } + + internal static long GetIoUringNonPinnablePrepareFallbackCount() + { + long total = 0; + foreach (SocketAsyncEngine engine in s_engines) + { + total += Interlocked.Read(ref engine._ioUringNonPinnablePrepareFallbackCount); + } + + return total; + } + + internal static void SetIoUringNonPinnablePrepareFallbackCountForTest(long value) + { +#if DEBUG + bool assigned = false; + foreach (SocketAsyncEngine engine in s_engines) + { + if (!engine.IsIoUringCompletionModeEnabled) + { + continue; + } + + long engineValue = assigned ? 0 : value; + Interlocked.Exchange(ref engine._ioUringNonPinnablePrepareFallbackCount, engineValue); + Interlocked.Exchange(ref engine._ioUringPublishedNonPinnablePrepareFallbackCount, 0); + assigned = true; + } +#else + _ = value; +#endif + } + + private void LogIoUringResolvedConfigurationIfNeeded(in IoUringResolvedConfiguration resolvedConfiguration) + { + if (Interlocked.Exchange(ref _ioUringResolvedConfigurationLogged, 1) != 0) + { + return; + } + + string configuration = resolvedConfiguration.ToLogString(); + SocketsTelemetry.Log.ReportIoUringResolvedConfiguration(configuration); + if (NetEventSource.Log.IsEnabled()) + { + NetEventSource.Info(this, $"io_uring resolved configuration: {configuration}"); + if (resolvedConfiguration.TryGetValidationWarnings(out string? warnings)) + { + NetEventSource.Info(this, $"io_uring configuration warnings: {warnings}"); + } + } + } + + private static int GetIoUringPrepareQueueCapacity() + { +#if DEBUG + if (Environment.GetEnvironmentVariable( + IoUringTestEnvironmentVariables.PrepareQueueCapacity) is string configuredValue && + int.TryParse(configuredValue, out int configuredCapacity) && + configuredCapacity > 0) + { + return configuredCapacity; + } +#endif + + // Raised default to reduce fallback frequency under bursty load. + int scaledCapacity = s_eventBufferCount >= 32 ? checked(s_eventBufferCount * 4) : 512; + return Math.Max(scaledCapacity, 512); + } + + private static uint GetIoUringQueueEntries() + { +#if DEBUG + if (Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.QueueEntries) is string configuredValue && + int.TryParse(configuredValue, out int configuredEntries) && + configuredEntries >= 2 && + configuredEntries <= IoUringConstants.QueueEntries && + (configuredEntries & (configuredEntries - 1)) == 0) + { + return (uint)configuredEntries; + } +#endif + + return IoUringConstants.QueueEntries; + } + + /// Creates a capabilities snapshot based on whether the port is io_uring. + private static LinuxIoUringCapabilities ResolveLinuxIoUringCapabilities(bool isIoUringPort) => + default(LinuxIoUringCapabilities) + .WithIsIoUringPort(isIoUringPort) + .WithMode(isIoUringPort ? 
IoUringMode.Completion : IoUringMode.Disabled); + + private void SetIoUringProvidedBufferCapabilityState(bool supportsProvidedBufferRings, bool hasRegisteredBuffers) + { + _ioUringCapabilities = _ioUringCapabilities + .WithSupportsProvidedBufferRings(supportsProvidedBufferRings) + .WithHasRegisteredBuffers(hasRegisteredBuffers); + } + + /// Encodes a tag byte and payload into a 64-bit user_data value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ulong EncodeIoUringUserData(byte tag, ulong payload) => + ((ulong)tag << IoUringUserDataTagShift) | (payload & IoUringUserDataPayloadMask); + + /// Reads the next CQE from the completion ring without advancing the head. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe bool TryPeekNextCqe(out Interop.Sys.IoUringCqe* cqe, int eventLoopThreadId) + { + Debug.Assert(eventLoopThreadId == Environment.CurrentManagedThreadId, + "TryPeekNextCqe must only be called from the event loop thread (SINGLE_ISSUER contract)."); + cqe = null; + uint cqTail = Volatile.Read(ref *_managedCqTailPtr); + if (_managedCachedCqHead == cqTail) return false; + uint index = _managedCachedCqHead & _managedCqMask; + cqe = _managedCqeBase + index; + return true; + } + + /// Advances the CQ head pointer by the given count, making slots available to the kernel. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe void AdvanceCqHead(uint count, int eventLoopThreadId) + { + Debug.Assert(eventLoopThreadId == Environment.CurrentManagedThreadId, + "AdvanceCqHead must only be called from the event loop thread (SINGLE_ISSUER contract)."); + _managedCachedCqHead += count; + Volatile.Write(ref *_managedCqHeadPtr, _managedCachedCqHead); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void BeginIoUringDrainTelemetryBatch() + { + _ioUringDrainBatchProvidedBufferDepletionCount = 0; + _ioUringDrainBatchProvidedBufferRecycleCount = 0; + _ioUringDrainBatchPersistentMultishotRecvEarlyDataCount = 0; + Volatile.Write(ref _ioUringDrainTelemetryBatchActive, 1); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void FlushIoUringDrainTelemetryBatch() + { + Volatile.Write(ref _ioUringDrainTelemetryBatchActive, 0); + + long depletionCount = _ioUringDrainBatchProvidedBufferDepletionCount; + if (depletionCount != 0) + { + SocketsTelemetry.Log.IoUringProvidedBufferDepletion(depletionCount); + } + + long recycleCount = _ioUringDrainBatchProvidedBufferRecycleCount; + if (recycleCount != 0) + { + SocketsTelemetry.Log.IoUringProvidedBufferRecycle(recycleCount); + } + + long earlyDataCount = _ioUringDrainBatchPersistentMultishotRecvEarlyDataCount; + if (earlyDataCount != 0) + { + SocketsTelemetry.Log.IoUringPersistentMultishotRecvEarlyData(earlyDataCount); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void RecordIoUringProvidedBufferDepletionForDrainBatch(long count = 1) + { + if (Volatile.Read(ref _ioUringDrainTelemetryBatchActive) != 0) + { + _ioUringDrainBatchProvidedBufferDepletionCount += count; + return; + } + + SocketsTelemetry.Log.IoUringProvidedBufferDepletion(count); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void RecordIoUringProvidedBufferRecycleForDrainBatch(long count = 1) + { + if (Volatile.Read(ref _ioUringDrainTelemetryBatchActive) != 0) + { + _ioUringDrainBatchProvidedBufferRecycleCount += count; + return; + } + + SocketsTelemetry.Log.IoUringProvidedBufferRecycle(count); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void 
RecordIoUringPersistentMultishotRecvEarlyDataForDrainBatch(long count = 1)
+        {
+            if (Volatile.Read(ref _ioUringDrainTelemetryBatchActive) != 0)
+            {
+                _ioUringDrainBatchPersistentMultishotRecvEarlyDataCount += count;
+                return;
+            }
+
+            SocketsTelemetry.Log.IoUringPersistentMultishotRecvEarlyData(count);
+        }
+
+        ///
+        /// Drains up to IoUringConstants.MaxCqeDrainBatch CQEs from the mmap'd
+        /// completion ring and dispatches each based on the user_data tag.
+        /// Tag=2 (reserved completion) entries are dispatched directly through
+        /// the handler's single-shot and multishot completion dispatch callbacks.
+        /// Tag=3 (wakeup signal) entries are handled inline.
+        /// Returns true when at least one CQE was drained.
+        ///
+        private unsafe bool DrainCqeRingBatch(SocketEventHandler handler)
+        {
+            int eventLoopThreadId = Volatile.Read(ref _eventLoopManagedThreadId);
+            Debug.Assert(eventLoopThreadId == Environment.CurrentManagedThreadId,
+                "DrainCqeRingBatch must only be called from the event loop thread (SINGLE_ISSUER contract).");
+            ObserveManagedCqOverflowCounter();
+            int drained = 0;
+            bool drainedAnyCqe = false;
+            bool enqueuedFallbackEvent = false;
+            uint deferredCqHeadAdvance = 0;
+            IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing;
+            providedBufferRing?.BeginDeferredRecyclePublish();
+            BeginIoUringDrainTelemetryBatch();
+
+            try
+            {
+                while (drained < (int)IoUringConstants.MaxCqeDrainBatch
+                    && TryPeekNextCqe(out Interop.Sys.IoUringCqe* cqe, eventLoopThreadId))
+                {
+                    drainedAnyCqe = true;
+                    ulong userData = cqe->UserData;
+                    int result = cqe->Result;
+                    uint flags = cqe->Flags;
+
+                    if (_cqOverflowRecoveryActive)
+                    {
+                        // During overflow recovery, publish head movement per CQE so the kernel can
+                        // reclaim CQ ring space immediately and avoid extending overflow pressure.
+                        AdvanceCqHead(1, eventLoopThreadId);
+                    }
+                    else
+                    {
+                        _managedCachedCqHead++;
+                        deferredCqHeadAdvance++;
+                    }
+
+                    byte tag = (byte)(userData >> IoUringUserDataTagShift);
+                    ulong payload = userData & IoUringUserDataPayloadMask;
+
+                    if (tag == IoUringConstants.TagReservedCompletion)
+                    {
+                        if ((flags & IoUringConstants.CqeFNotif) != 0)
+                        {
+                            if (HandleZeroCopyNotification(payload))
+                            {
+                                handler.DispatchZeroCopyIoUringNotification(payload);
+                            }
+
+                            drained++;
+                            continue;
+                        }
+
+                        bool isMultishotCompletion = false;
+                        if ((flags & IoUringConstants.CqeFMore) != 0)
+                        {
+                            IoUringCompletionSlot[]? completionEntries = _completionSlots;
+                            int slotIndex = DecodeCompletionSlotIndex(payload);
+                            if (completionEntries is not null &&
+                                (uint)slotIndex < (uint)completionEntries.Length)
+                            {
+                                IoUringCompletionOperationKind kind = completionEntries[slotIndex].Kind;
+                                isMultishotCompletion =
+                                    (kind == IoUringCompletionOperationKind.Message && _ioUringCapabilities.SupportsMultishotRecv) ||
+                                    (kind == IoUringCompletionOperationKind.Accept && _ioUringCapabilities.SupportsMultishotAccept);
+                            }
+                        }
+                        ResolveReservedCompletionSlotMetadata(
+                            payload,
+                            isMultishotCompletion,
+                            ref result,
+                            out int completionSocketAddressLen,
+                            out int completionControlBufferLen,
+                            out uint completionAuxiliaryData,
+                            out bool hasFixedRecvBuffer,
+                            out ushort fixedRecvBufferId);
+
+                        if (isMultishotCompletion)
+                        {
+                            // Dispatch expects full tagged user_data so tracked-ownership decode can validate tag+generation.
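                            // (Worked example of the encoding being validated: slotIndex = 5 with
                            //  generation = 7 gives payload = (7UL << IoUringConstants.SlotIndexBits) | 5
                            //  = 0xE005, and userData = ((ulong)IoUringConstants.TagReservedCompletion << 56)
                            //  | 0xE005 = 0x0200_0000_0000_E005. Dispatch re-derives the index as
                            //  payload & SlotIndexMask and drops the CQE when
                            //  (payload >> SlotIndexBits) & GenerationMask no longer matches the slot's generation.)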
+ handler.DispatchMultishotIoUringCompletion( + userData, + result, + flags, + completionSocketAddressLen, + completionControlBufferLen, + completionAuxiliaryData, + hasFixedRecvBuffer, + fixedRecvBufferId, + ref enqueuedFallbackEvent); + } + else + { + // Dispatch expects full tagged user_data so tracked-ownership decode can validate tag+generation. + handler.DispatchSingleIoUringCompletion( + userData, + result, + flags, + completionSocketAddressLen, + completionControlBufferLen, + completionAuxiliaryData, + hasFixedRecvBuffer, + fixedRecvBufferId, + ref enqueuedFallbackEvent); + } + } + else if (tag == IoUringConstants.TagWakeupSignal) + { + HandleManagedWakeupSignal(result); + } + else if (tag != IoUringConstants.TagNone) + { + Debug.Fail($"Unknown io_uring CQE user_data tag: {tag}."); + } + + drained++; + } + } + finally + { + providedBufferRing?.EndDeferredRecyclePublish(); + FlushIoUringDrainTelemetryBatch(); + if (deferredCqHeadAdvance != 0 && _managedCqHeadPtr is not null) + { + Volatile.Write(ref *_managedCqHeadPtr, _managedCachedCqHead); + } + } + + if (enqueuedFallbackEvent) + { + EnsureWorkerScheduled(); + } + + TryCompleteManagedCqOverflowRecovery(); + AssertCompletionSlotUsageBounded(); + + return drainedAnyCqe; + } + + /// + /// Resolves metadata for a reserved completion by applying forced test results and + /// copying operation-specific completion outputs (accept/recvmsg) from native storage. + /// + private void ResolveReservedCompletionSlotMetadata( + ulong payload, + bool isMultishotCompletion, + ref int result, + out int completionSocketAddressLen, + out int completionControlBufferLen, + out uint completionAuxiliaryData, + out bool hasFixedRecvBuffer, + out ushort fixedRecvBufferId) + { + completionSocketAddressLen = 0; + completionControlBufferLen = 0; + completionAuxiliaryData = 0; + hasFixedRecvBuffer = false; + fixedRecvBufferId = 0; + + int slotIndex = DecodeCompletionSlotIndex(payload); + if ((uint)slotIndex >= (uint)_completionSlots!.Length) + { + return; + } + + ref IoUringCompletionSlot slot = ref _completionSlots[slotIndex]; + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex]; + ulong completionGeneration = (payload >> IoUringConstants.SlotIndexBits) & IoUringConstants.GenerationMask; + if (completionGeneration != slot.Generation) + { + // Stale CQE for a recycled slot; ignore without mutating current slot state. + return; + } + + ResolveDebugTestForcedResult(ref slot, ref result); + + if (slot.UsesFixedRecvBuffer) + { + hasFixedRecvBuffer = true; + fixedRecvBufferId = slot.FixedRecvBufferId; + slot.UsesFixedRecvBuffer = false; + slot.FixedRecvBufferId = 0; + Debug.Assert(!isMultishotCompletion, "Fixed-buffer receive completions are expected to be one-shot."); + } + + if (slot.Kind == IoUringCompletionOperationKind.Accept && + slotStorage.NativeSocketAddressLengthPtr is not null) + { + int nativeSocketAddressLength = *slotStorage.NativeSocketAddressLengthPtr; + completionAuxiliaryData = nativeSocketAddressLength >= 0 ? (uint)nativeSocketAddressLength : 0u; + if (isMultishotCompletion) + { + int socketAddressCapacity = slotStorage.ReceiveSocketAddressCapacity; + if (socketAddressCapacity > 0 && slotStorage.NativeSocketAddress is not null) + { + Unsafe.InitBlockUnaligned(slotStorage.NativeSocketAddress, 0, (uint)socketAddressCapacity); + } + + *slotStorage.NativeSocketAddressLengthPtr = socketAddressCapacity >= 0 ? 
socketAddressCapacity : 0;
+ }
+ }
+ else if (slot.Kind == IoUringCompletionOperationKind.Message)
+ {
+ CopyMessageCompletionOutputs(
+ slotIndex,
+ out completionSocketAddressLen,
+ out completionControlBufferLen,
+ out completionAuxiliaryData);
+ }
+
+ if (!isMultishotCompletion)
+ {
+ if (!slot.IsZeroCopySend)
+ {
+ FreeCompletionSlot(slotIndex);
+ }
+ else if (result < 0)
+ {
+ // Error completion path may not produce a NOTIF CQE.
+ FreeCompletionSlot(slotIndex);
+ }
+ else if (!slot.ZeroCopyNotificationPending)
+ {
+ // First CQE for zero-copy send: keep slot alive until NOTIF CQE arrives.
+ slot.ZeroCopyNotificationPending = true;
+ AssertZeroCopyNotificationPendingForPayload(payload);
+ }
+ }
+ }
+
+ /// Handles NOTIF CQEs for zero-copy sends and releases retained completion slots.
+ private bool HandleZeroCopyNotification(ulong payload)
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ return false;
+ }
+
+ int slotIndex = DecodeCompletionSlotIndex(payload);
+ if ((uint)slotIndex >= (uint)completionEntries.Length)
+ {
+ return false;
+ }
+
+ ref IoUringCompletionSlot slot = ref completionEntries[slotIndex];
+ ulong completionGeneration = (payload >> IoUringConstants.SlotIndexBits) & IoUringConstants.GenerationMask;
+ if (slot.Generation != completionGeneration)
+ {
+ return false;
+ }
+
+ if (!slot.IsZeroCopySend || !slot.ZeroCopyNotificationPending)
+ {
+ return false;
+ }
+
+ slot.IsZeroCopySend = false;
+ slot.ZeroCopyNotificationPending = false;
+ FreeCompletionSlot(slotIndex);
+ return true;
+ }
+
+ /// Returns true when the completion slot for userData is waiting on SEND_ZC NOTIF.
+ private bool IsZeroCopyNotificationPending(ulong userData)
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ return false;
+ }
+
+ int slotIndex = DecodeCompletionSlotIndex(userData & IoUringUserDataPayloadMask);
+ if ((uint)slotIndex >= (uint)completionEntries.Length)
+ {
+ return false;
+ }
+
+ ref IoUringCompletionSlot slot = ref completionEntries[slotIndex];
+ return slot.IsZeroCopySend && slot.ZeroCopyNotificationPending;
+ }
+
+ ///
+ /// Releases a deferred SEND_ZC completion slot when dispatch cannot reattach ownership.
+ ///
+ private bool TryCleanupDeferredZeroCopyCompletionSlot(ulong userData)
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ return false;
+ }
+
+ int slotIndex = DecodeCompletionSlotIndex(userData & IoUringUserDataPayloadMask);
+ if ((uint)slotIndex >= (uint)completionEntries.Length)
+ {
+ return false;
+ }
+
+ ref IoUringCompletionSlot slot = ref completionEntries[slotIndex];
+ ulong completionGeneration = ((userData & IoUringUserDataPayloadMask) >> IoUringConstants.SlotIndexBits) & IoUringConstants.GenerationMask;
+ if (slot.Generation != completionGeneration)
+ {
+ return false;
+ }
+
+ if (!slot.IsZeroCopySend || !slot.ZeroCopyNotificationPending)
+ {
+ return false;
+ }
+
+ slot.IsZeroCopySend = false;
+ slot.ZeroCopyNotificationPending = false;
+ FreeCompletionSlot(slotIndex);
+ return true;
+ }
+
+ /// Debug assertion that a reserved completion payload remains armed for SEND_ZC NOTIF.
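+ /// SEND_ZC completions arrive as a pair: a data CQE carrying the byte count, then an
+ /// IORING_CQE_F_NOTIF CQE once the kernel no longer references the payload buffer.
+ /// The slot must stay armed between the two CQEs.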
+ [Conditional("DEBUG")] + private void AssertZeroCopyNotificationPendingForPayload(ulong payload) + { + ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload); + Debug.Assert( + IsZeroCopyNotificationPending(userData), + "SEND_ZC first CQE must leave the completion slot pending until NOTIF CQE arrives."); + } + + /// Debug assertion that SEND_ZC completion dispatch is deferred until NOTIF arrives. + [Conditional("DEBUG")] + private void AssertZeroCopyDeferredCompletionState(ulong userData, SocketAsyncContext.AsyncOperation operation) + { + Debug.Assert( + operation.IoUringUserData == userData, + "Deferred SEND_ZC completion must retain the original user_data until NOTIF CQE dispatch."); + Debug.Assert( + IsZeroCopyNotificationPending(userData), + "Deferred SEND_ZC completion requires an armed NOTIF state."); + } + + /// Observes kernel CQ overflow count deltas and emits telemetry/logs. + private unsafe void ObserveManagedCqOverflowCounter() + { + if (_managedCqOverflowPtr is null) + { + return; + } + + uint observedOverflow = Volatile.Read(ref *_managedCqOverflowPtr); + uint previousOverflow = _managedObservedCqOverflow; + // The kernel counter is uint32 and wraps; compare via wrapped delta instead of monotonic ordering. + uint delta = unchecked(observedOverflow - previousOverflow); + if (delta == 0) + { + return; + } + + _managedObservedCqOverflow = observedOverflow; + SocketsTelemetry.Log.IoUringCqOverflow(delta); + // Defer stale-tracked sweep scheduling until recovery completes. + Volatile.Write(ref _cqOverflowTrackedSweepDeadlineTicks, 0); + _cqOverflowTrackedSweepRearmCount = 0; + + IoUringCqOverflowRecoveryBranch branch = _cqOverflowRecoveryActive ? + IoUringCqOverflowRecoveryBranch.DualWave : + DetermineCqOverflowRecoveryBranchAtEntry(); + _cqOverflowRecoveryActive = true; + _cqOverflowRecoveryBranch = branch; + AssertLiveAcceptSlotsRemainTrackedDuringRecovery(branch); + + if (NetEventSource.Log.IsEnabled()) + { + LogIoUringCqOverflow(observedOverflow, delta); + LogIoUringCqOverflowRecoveryEntry(branch, observedOverflow, delta); + } + } + + /// Determines the initial recovery branch discriminator for a newly observed CQ overflow. + private IoUringCqOverflowRecoveryBranch DetermineCqOverflowRecoveryBranchAtEntry() + { + if (Volatile.Read(ref _ioUringTeardownInitiated) != 0) + { + return IoUringCqOverflowRecoveryBranch.Teardown; + } + + if (_ioUringCapabilities.SupportsMultishotAccept && + HasLiveAcceptCompletionSlot()) + { + return IoUringCqOverflowRecoveryBranch.MultishotAcceptArming; + } + + return IoUringCqOverflowRecoveryBranch.DualWave; + } + + /// Returns true when at least one active completion slot is currently tracking accept metadata. + private bool HasLiveAcceptCompletionSlot() + { + // Keep this O(1): CQ-overflow branch selection can run frequently on the event loop hot path. + int liveAcceptCount = Volatile.Read(ref _liveAcceptCompletionSlotCount); + Debug.Assert(liveAcceptCount >= 0); + return liveAcceptCount != 0; + } + + /// + /// Completes CQ-overflow recovery once the ring is drained and no additional overflow increments are observed. + /// Recovery is best-effort: dropped CQEs cannot be reconstructed, so this only restores steady-state draining. 
+ /// + private unsafe void TryCompleteManagedCqOverflowRecovery() + { + if (!_cqOverflowRecoveryActive || + _managedCqOverflowPtr is null || + _managedCqTailPtr is null) + { + return; + } + + uint cqTail = Volatile.Read(ref *_managedCqTailPtr); + if (_managedCachedCqHead != cqTail) + { + return; + } + + if (Volatile.Read(ref _ioUringTeardownInitiated) != 0) + { + _cqOverflowRecoveryBranch = IoUringCqOverflowRecoveryBranch.Teardown; + } + + uint observedOverflow = Volatile.Read(ref *_managedCqOverflowPtr); + // The kernel counter is uint32 and wraps; compare via wrapped subtraction. + uint delta = unchecked(observedOverflow - _managedObservedCqOverflow); + if (delta > 0) + { + _managedObservedCqOverflow = observedOverflow; + if (_cqOverflowRecoveryBranch != IoUringCqOverflowRecoveryBranch.Teardown) + { + _cqOverflowRecoveryBranch = IoUringCqOverflowRecoveryBranch.DualWave; + } + SocketsTelemetry.Log.IoUringCqOverflow(delta); + + if (NetEventSource.Log.IsEnabled()) + { + LogIoUringCqOverflow(observedOverflow, delta); + LogIoUringCqOverflowRecoveryEntry( + _cqOverflowRecoveryBranch, + observedOverflow, + delta); + } + + return; + } + + _cqOverflowRecoveryActive = false; + _cqOverflowTrackedSweepRearmCount = 0; + Volatile.Write( + ref _cqOverflowTrackedSweepDeadlineTicks, + Environment.TickCount64 + CqOverflowTrackedSweepDelayMilliseconds); + SocketsTelemetry.Log.IoUringCqOverflowRecovery(1); + if (_cqOverflowRecoveryBranch == IoUringCqOverflowRecoveryBranch.MultishotAcceptArming) + { + // Phase 1 spec branch (a): if CQ overflow occurs while multishot accept is live, + // defer re-arm nudges until after drain completes instead of discarding active state. + TryQueueDeferredMultishotAcceptRearmAfterRecovery(); + } + AssertCompletionSlotPoolConsistency(); + + if (NetEventSource.Log.IsEnabled()) + { + LogIoUringCqOverflowRecoveryCompleted( + _cqOverflowRecoveryBranch, + _completionSlotsInUse); + } + } + + /// + /// After CQ-overflow recovery completes, performs a delayed sweep to retire tracked operations + /// that remain attached despite already transitioning out of the waiting state. + /// + private void TrySweepStaleTrackedIoUringOperationsAfterCqOverflowRecovery() + { + if (!_ioUringCapabilities.IsCompletionMode || + _cqOverflowRecoveryActive || + !IsCurrentThreadEventLoopThread()) + { + return; + } + + long deadline = Volatile.Read(ref _cqOverflowTrackedSweepDeadlineTicks); + if (deadline == 0 || + unchecked(Environment.TickCount64 - deadline) < 0) + { + return; + } + + // Consume the deadline before the sweep; follow-up work can re-arm it. + Volatile.Write(ref _cqOverflowTrackedSweepDeadlineTicks, 0); + + IoUringCompletionSlot[]? completionEntries = _completionSlots; + IoUringTrackedOperationState[]? trackedOperations = _trackedOperations; + if (completionEntries is null || + trackedOperations is null || + trackedOperations.Length != completionEntries.Length || + IsIoUringTrackingEmpty()) + { + return; + } + + int detachedCount = 0; + int canceledWaitingCount = 0; + + for (int slotIndex = 0; slotIndex < trackedOperations.Length; slotIndex++) + { + ref IoUringTrackedOperationState trackedState = ref trackedOperations[slotIndex]; + SocketAsyncContext.AsyncOperation? 
operation = Volatile.Read(ref trackedState.TrackedOperation); + if (operation is null) + { + continue; + } + + ulong generation = Volatile.Read(ref trackedState.TrackedOperationGeneration); + if (generation == 0) + { + continue; + } + + ulong payload = EncodeCompletionSlotUserData(slotIndex, generation); + ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload); + if (operation.IoUringUserData != userData) + { + continue; + } + + IoUringCompletionOperationKind kind = completionEntries[slotIndex].Kind; + if (ShouldSkipCqOverflowTrackedSweep(operation, userData, kind)) + { + continue; + } + + if (operation.IsInWaitingState()) + { + if (operation.TryCancel()) + { + canceledWaitingCount++; + } + + continue; + } + + if (TryUntrackTrackedIoUringOperation(userData, operation, out SocketAsyncContext.AsyncOperation? removedOperation) != IoUringTrackedOperationRemoveResult.Removed || + removedOperation is null) + { + continue; + } + + removedOperation.ClearIoUringUserData(); + FreeCompletionSlot(slotIndex); + detachedCount++; + } + + // Sweep for orphaned SEND_ZC completion slots whose NOTIF CQE was lost to CQ overflow. + int zeroCopyOrphanCount = SweepOrphanedZeroCopyNotificationSlots(completionEntries, trackedOperations); + + int totalDrainRecovery = detachedCount + zeroCopyOrphanCount; + if (totalDrainRecovery != 0) + { + SocketsTelemetry.Log.IoUringCompletionSlotDrainRecovery(totalDrainRecovery); + } + + if (canceledWaitingCount != 0) + { + if (_cqOverflowTrackedSweepRearmCount < CqOverflowTrackedSweepMaxRearms) + { + _cqOverflowTrackedSweepRearmCount++; + Volatile.Write( + ref _cqOverflowTrackedSweepDeadlineTicks, + Environment.TickCount64 + CqOverflowTrackedSweepDelayMilliseconds); + } + else if (NetEventSource.Log.IsEnabled()) + { + LogIoUringCqOverflowTrackedSweepRearmLimitReached( + CqOverflowTrackedSweepMaxRearms, + canceledWaitingCount); + } + } + else + { + _cqOverflowTrackedSweepRearmCount = 0; + } + + if (NetEventSource.Log.IsEnabled() && + (detachedCount != 0 || canceledWaitingCount != 0)) + { + LogIoUringCqOverflowTrackedSweepResult(detachedCount, canceledWaitingCount); + } + } + + /// + /// Scans completion slots for SEND_ZC entries stuck in ZeroCopyNotificationPending state + /// with no corresponding tracked operation, indicating a lost NOTIF CQE from CQ overflow. + /// + private int SweepOrphanedZeroCopyNotificationSlots( + IoUringCompletionSlot[] completionEntries, + IoUringTrackedOperationState[] trackedOperations) + { + int freedCount = 0; + for (int slotIndex = 0; slotIndex < completionEntries.Length; slotIndex++) + { + ref IoUringCompletionSlot slot = ref completionEntries[slotIndex]; + if (!slot.IsZeroCopySend || !slot.ZeroCopyNotificationPending) + { + continue; + } + + // The slot is waiting for a NOTIF CQE. Check whether any tracked operation + // still references this slot. If not, the first CQE was already processed and + // the operation was completed/dispatched, meaning the NOTIF CQE is the only + // thing keeping this slot alive -- and it was lost to CQ overflow. + ref IoUringTrackedOperationState trackedState = ref trackedOperations[slotIndex]; + if (Volatile.Read(ref trackedState.TrackedOperation) is not null) + { + continue; + } + + // Orphaned: NOTIF-pending with no tracked operation. Force-free the slot. 
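+ // If the NOTIF was merely delayed rather than dropped, freeing here is still safe:
+ // recycling the slot advances its generation, so a late NOTIF CQE fails the
+ // generation compare in HandleZeroCopyNotification and is ignored.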
+ slot.IsZeroCopySend = false; + slot.ZeroCopyNotificationPending = false; + FreeCompletionSlot(slotIndex); + freedCount++; + } + + if (freedCount != 0 && NetEventSource.Log.IsEnabled()) + { + NetEventSource.Info( + this, + $"io_uring CQ overflow recovery: freed {freedCount} orphaned SEND_ZC NOTIF-pending completion slot(s)."); + } + + return freedCount; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool ShouldSkipCqOverflowTrackedSweep( + SocketAsyncContext.AsyncOperation operation, + ulong userData, + IoUringCompletionOperationKind kind) + { + SocketAsyncContext context = operation.AssociatedContext; + + if (kind == IoUringCompletionOperationKind.Accept && + context.IsMultishotAcceptArmed && + context.MultishotAcceptUserData == userData) + { + // Active multishot accept slots are intentionally long-lived. + return true; + } + + if (kind == IoUringCompletionOperationKind.Message && + context.IsPersistentMultishotRecvArmed() && + context.PersistentMultishotRecvUserData == userData) + { + // Persistent multishot recv slots are intentionally long-lived. + return true; + } + + return false; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringCqOverflowTrackedSweepResult(int detachedCount, int canceledWaitingCount) + { + NetEventSource.Info( + this, + $"io_uring CQ overflow stale-tracked sweep: detached={detachedCount}, canceledWaiting={canceledWaitingCount}"); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringCqOverflowTrackedSweepRearmLimitReached(int maxRearms, int canceledWaitingCount) + { + NetEventSource.Info( + this, + $"io_uring CQ overflow stale-tracked sweep rearm limit reached: maxRearms={maxRearms}, canceledWaiting={canceledWaitingCount}"); + } + + /// Debug assertion for Phase-1 branch (a): live multishot-accept slots must remain tracked during recovery. + [Conditional("DEBUG")] + private void AssertLiveAcceptSlotsRemainTrackedDuringRecovery(IoUringCqOverflowRecoveryBranch branch) + { + if (branch != IoUringCqOverflowRecoveryBranch.MultishotAcceptArming) + { + return; + } + + IoUringCompletionSlot[]? completionEntries = _completionSlots; + if (completionEntries is null) + { + return; + } + + bool foundTrackedAccept = false; + for (int i = 0; i < completionEntries.Length; i++) + { + if (completionEntries[i].Kind != IoUringCompletionOperationKind.Accept) + { + continue; + } + + ulong payload = EncodeCompletionSlotUserData(i, completionEntries[i].Generation); + ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload); + if (ContainsTrackedIoUringOperation(userData)) + { + foundTrackedAccept = true; + break; + } + } + + Debug.Assert( + foundTrackedAccept, + "CQ-overflow recovery branch (a) requires at least one live tracked multishot-accept slot."); + } + + /// + /// After overflow recovery completes, nudges accept contexts with live multishot accept state + /// so the managed accept pipeline can resume dequeue/prepare flow. + /// + private void TryQueueDeferredMultishotAcceptRearmAfterRecovery() + { + if (!_ioUringCapabilities.SupportsMultishotAccept || + Volatile.Read(ref _ioUringTeardownInitiated) != 0) + { + return; + } + + IoUringCompletionSlot[]? 
completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ return;
+ }
+
+ bool queuedAnyRearmNudge = false;
+ for (int slotIndex = 0; slotIndex < completionEntries.Length; slotIndex++)
+ {
+ if (completionEntries[slotIndex].Kind != IoUringCompletionOperationKind.Accept)
+ {
+ continue;
+ }
+
+ ulong payload = EncodeCompletionSlotUserData(slotIndex, completionEntries[slotIndex].Generation);
+ ulong userData = EncodeIoUringUserData(IoUringConstants.TagReservedCompletion, payload);
+ if (!TryGetTrackedIoUringOperation(userData, out SocketAsyncContext.AsyncOperation? operation) ||
+ operation is not SocketAsyncContext.AcceptOperation acceptOperation)
+ {
+ continue;
+ }
+
+ SocketAsyncContext context = acceptOperation.AssociatedContext;
+ if (!context.IsMultishotAcceptArmed ||
+ context.MultishotAcceptUserData != userData)
+ {
+ continue;
+ }
+
+ EnqueueReadinessFallbackEvent(context, Interop.Sys.SocketEvents.Read);
+ queuedAnyRearmNudge = true;
+ }
+
+ if (queuedAnyRearmNudge && NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringDeferredMultishotAcceptRearmAfterRecovery();
+ }
+ }
+
+ ///
+ /// Handles a wakeup signal CQE by consuming the eventfd counter.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe void HandleManagedWakeupSignal(int cqeResult)
+ {
+ if (cqeResult >= 0 && _managedWakeupEventFd >= 0)
+ {
+ ulong value;
+ Interop.Error readError = Interop.Sys.IoUringShimReadEventFd(_managedWakeupEventFd, &value);
+ if (readError != Interop.Error.SUCCESS &&
+ readError != Interop.Error.EAGAIN &&
+ NetEventSource.Log.IsEnabled())
+ {
+ LogWakeupReadFailure(this, readError);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static void LogWakeupReadFailure(SocketAsyncEngine engine, Interop.Error readErrorCode)
+ {
+ NetEventSource.Error(engine, $"io_uring wakeup eventfd read failed: error={readErrorCode}");
+ }
+ }
+
+ private const int FdCloexec = 1;
+
+ /// io_uring completion mode does not use socket event registration updates.
+ partial void LinuxTryChangeSocketEventRegistration(
+ IntPtr socketHandle,
+ Interop.Sys.SocketEvents currentEvents,
+ Interop.Sys.SocketEvents newEvents,
+ int data,
+ ref Interop.Error error,
+ ref bool handled)
+ {
+ if (!Volatile.Read(ref _ioUringInitialized))
+ {
+ return;
+ }
+
+ handled = true;
+ error = Interop.Error.SUCCESS;
+ }
+
+ private static bool TrySetFdCloseOnExec(int fd, out Interop.Error error)
+ {
+ int currentFlags = Interop.Sys.Fcntl.GetFD((IntPtr)fd);
+ if (currentFlags < 0)
+ {
+ error = Interop.Sys.GetLastErrorInfo().Error;
+ return false;
+ }
+
+ int updatedFlags = currentFlags | FdCloexec;
+ if (updatedFlags == currentFlags)
+ {
+ error = Interop.Error.SUCCESS;
+ return true;
+ }
+
+ if (Interop.Sys.Fcntl.SetFD((IntPtr)fd, updatedFlags) == 0)
+ {
+ error = Interop.Error.SUCCESS;
+ return true;
+ }
+
+ error = Interop.Sys.GetLastErrorInfo().Error;
+ return false;
+ }
+
+ ///
+ /// Probes the kernel for supported io_uring opcodes using IORING_REGISTER_PROBE and
+ /// populates the per-opcode _supportsOp* capability flags.
+ /// When the probe syscall is unavailable (older kernels), all flags remain at their
+ /// default value (false).
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe void ProbeIoUringOpcodeSupport(int ringFd)
+ {
+ // Probe buffer: 16-byte header + 256 * 8-byte ops = 2064 bytes.
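+ // For reference, this mirrors the kernel ABI (include/uapi/linux/io_uring.h):
+ //   struct io_uring_probe    { __u8 last_op; __u8 ops_len; __u16 resv; __u32 resv2[3]; ... }  // 16-byte header
+ //   struct io_uring_probe_op { __u8 op; __u8 resv; __u16 flags; __u32 resv2; }                // 8 bytes per op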
+ const int maxOps = 256;
+ const int probeSize = 16 + maxOps * 8;
+ byte* probeBuffer = stackalloc byte[probeSize];
+ new Span<byte>(probeBuffer, probeSize).Clear();
+
+ int result;
+ Interop.Error err = Interop.Sys.IoUringShimRegister(
+ ringFd, IoUringConstants.RegisterProbe, probeBuffer, (uint)maxOps, &result);
+
+ if (err != Interop.Error.SUCCESS)
+ {
+ // Probe not supported (for example older kernels): per-opcode flags remain false.
+ // Direct SQE prep does not gate on these flags; this mainly affects optional feature light-up.
+ return;
+ }
+
+ // Parse: ops start at offset 16, each is 8 bytes.
+ IoUringProbeOp* ops = (IoUringProbeOp*)(probeBuffer + 16);
+ IoUringProbeHeader* header = (IoUringProbeHeader*)probeBuffer;
+ int opsCount = Math.Min((int)header->OpsLen, maxOps);
+
+ _supportsOpReadFixed = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.ReadFixed);
+ _supportsOpSend = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Send);
+ _supportsOpRecv = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Recv);
+ _supportsOpSendMsg = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendMsg);
+ _supportsOpRecvMsg = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.RecvMsg);
+ _supportsOpAccept = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Accept);
+ _supportsOpConnect = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.Connect);
+ _supportsOpSendZc = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendZc);
+ _supportsOpSendMsgZc = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.SendMsgZc);
+ _zeroCopySendEnabled = _supportsOpSendZc && IsZeroCopySendOptedIn();
+ _supportsOpAsyncCancel = IsOpcodeSupported(ops, opsCount, IoUringOpcodes.AsyncCancel);
+ _supportsMultishotAccept = _supportsOpAccept;
+ RefreshIoUringMultishotRecvSupport();
+ }
+
+ /// Checks whether a specific opcode is supported by the kernel's io_uring probe result.
+ private static unsafe bool IsOpcodeSupported(IoUringProbeOp* ops, int opsCount, byte opcode)
+ {
+ if (opcode >= opsCount) return false;
+ return (ops[opcode].Flags & IoUringConstants.ProbeOpFlagSupported) != 0;
+ }
+
+ /// Publishes the managed SQ tail pointer to make queued SQEs visible to the kernel.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe void PublishManagedSqeTail()
+ {
+ if (!_ioUringManagedSqTailLoaded || _ioUringSqRingInfo.SqTailPtr == IntPtr.Zero)
+ {
+ return;
+ }
+
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "PublishManagedSqeTail must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ ref uint sqTailRef = ref Unsafe.AsRef<uint>((void*)_ioUringSqRingInfo.SqTailPtr);
+ Volatile.Write(ref sqTailRef, _ioUringManagedSqTail);
+ _ioUringManagedSqTailLoaded = false;
+ }
+
+ ///
+ /// Returns true when the SQPOLL kernel thread has gone idle and needs an explicit wakeup.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe bool SqNeedWakeup()
+ {
+ Debug.Assert(_sqPollEnabled, "SqNeedWakeup should only be checked in SQPOLL mode.");
+ if (_managedSqFlagsPtr == null)
+ {
+ return true;
+ }
+
+ return (Volatile.Read(ref *_managedSqFlagsPtr) & IoUringConstants.SqNeedWakeup) != 0;
+ }
+
+ /// Allocates the next available SQE slot from the submission ring.
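+ /// The free-space check relies on unsigned wraparound: head and tail are free-running
+ /// uint32 counters, so (tail - head) is the in-flight count even across a 2^32 wrap;
+ /// for example tail=3, head=0xFFFFFFFE yields 5 pending entries.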
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe bool TryGetNextManagedSqe(out IoUringSqe* sqe)
+ {
+ sqe = null;
+ if (!_ioUringDirectSqeEnabled)
+ {
+ return false;
+ }
+
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "TryGetNextManagedSqe must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ if (!_managedSqeInvariantsValidated)
+ {
+ return false;
+ }
+
+ ref Interop.Sys.IoUringSqRingInfo ringInfo = ref _ioUringSqRingInfo;
+ Debug.Assert(ringInfo.SqeBase != IntPtr.Zero);
+ Debug.Assert(ringInfo.SqHeadPtr != IntPtr.Zero);
+ Debug.Assert(ringInfo.SqTailPtr != IntPtr.Zero);
+ Debug.Assert(ringInfo.SqEntries != 0);
+ Debug.Assert(ringInfo.SqeSize == (uint)sizeof(IoUringSqe));
+
+ ref uint sqHeadRef = ref Unsafe.AsRef<uint>((void*)ringInfo.SqHeadPtr);
+ uint sqHead = Volatile.Read(ref sqHeadRef);
+ if (!_ioUringManagedSqTailLoaded)
+ {
+ ref uint sqTailRef = ref Unsafe.AsRef<uint>((void*)ringInfo.SqTailPtr);
+ _ioUringManagedSqTail = Volatile.Read(ref sqTailRef);
+ _ioUringManagedSqTailLoaded = true;
+ }
+
+ uint sqTail = _ioUringManagedSqTail;
+ if (sqTail - sqHead >= ringInfo.SqEntries)
+ {
+ return false;
+ }
+
+ uint index = sqTail & ringInfo.SqMask;
+ nint sqeOffset = checked((nint)((nuint)index * ringInfo.SqeSize));
+ sqe = (IoUringSqe*)((byte*)ringInfo.SqeBase + sqeOffset);
+ // Managed direct-SQE preparation in System.Net.Sockets is socket-opcode-only. Clearing the full
+ // SQE is safe because every opcode we emit initializes all fields it relies on.
+ Unsafe.WriteUnaligned(sqe, default(IoUringSqe));
+ _ioUringManagedSqTail = sqTail + 1;
+ _ioUringManagedPendingSubmissions++;
+ return true;
+ }
+
+ /// Validates immutable SQ ring invariants once at initialization.
+ private unsafe bool ValidateManagedSqeInitializationInvariants()
+ {
+ ref Interop.Sys.IoUringSqRingInfo ringInfo = ref _ioUringSqRingInfo;
+ if (ringInfo.SqeBase == IntPtr.Zero ||
+ ringInfo.SqHeadPtr == IntPtr.Zero ||
+ ringInfo.SqTailPtr == IntPtr.Zero ||
+ ringInfo.SqEntries == 0)
+ {
+ return false;
+ }
+
+ if (ringInfo.SqeSize != (uint)sizeof(IoUringSqe))
+ {
+ Debug.Fail($"Unexpected io_uring SQE size. Expected {sizeof(IoUringSqe)}, got {ringInfo.SqeSize}.");
+ return false;
+ }
+
+ return true;
+ }
+
+ /// Attempts to acquire an SQE, retrying with intermediate submits on ring full.
+ private unsafe bool TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out Interop.Error submitError)
+ {
+ sqe = null;
+ submitError = Interop.Error.SUCCESS;
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "TryAcquireManagedSqeWithRetry must only be called from the event loop thread (SINGLE_ISSUER contract).");
+ SocketEventHandler drainHandler = default;
+ bool drainHandlerInitialized = false;
+
+ for (int attempt = 0; attempt < MaxIoUringSqeAcquireSubmitAttempts; attempt++)
+ {
+ if (TryGetNextManagedSqe(out sqe))
+ {
+ return true;
+ }
+
+ // Before retrying submission, run a CQ drain pass so completions can release
+ // slots and unblock kernel forward progress. The overflow counter is observed
+ // during drain; do not assume a single pass fully clears overflow pressure.
+ if (_managedCqDrainEnabled &&
+ _managedCqOverflowPtr is not null &&
+ _completionSlotsInUse != 0)
+ {
+ if (!drainHandlerInitialized)
+ {
+ drainHandler = new SocketEventHandler(this);
+ drainHandlerInitialized = true;
+ }
+ _ = DrainCqeRingBatch(drainHandler);
+
+ if (TryGetNextManagedSqe(out sqe))
+ {
+ return true;
+ }
+ }
+
+ submitError = SubmitIoUringOperationsNormalized();
+ if (submitError != Interop.Error.SUCCESS)
+ {
+ return false;
+ }
+ }
+
+ submitError = Interop.Error.EAGAIN;
+ return false;
+ }
+
+ ///
+ /// Result of the common setup for direct SQE preparation, which allocates a completion slot,
+ /// encodes user data, resolves the socket fd/flags, applies test hooks, and acquires an SQE.
+ /// On failure, the setup restores test state and frees the slot.
+ ///
+ private unsafe struct IoUringDirectSqeSetupResult
+ {
+ public SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult PrepareResult;
+ public int SlotIndex;
+ public ulong UserData;
+ public int SqeFd;
+ public byte SqeFlags;
+ public IoUringSqe* Sqe;
+ public SocketError ErrorCode;
+ }
+
+ ///
+ /// Prepares a direct SQE and returns all setup data as a single struct to avoid large
+ /// out-parameter callsites in per-opcode prepare paths.
+ ///
+ ///
+ /// Returns Prepared if the SQE was acquired
+ /// (the caller must write the SQE and return Prepared),
+ /// or a terminal result (Unsupported/PrepareFailed) that the caller should return directly.
+ ///
+ private unsafe IoUringDirectSqeSetupResult TrySetupDirectSqe(
+ SafeSocketHandle socket,
+ byte opcode)
+ {
+ IoUringDirectSqeSetupResult setup = default;
+ setup.SlotIndex = -1;
+ setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported;
+ setup.ErrorCode = SocketError.Success;
+
+ if (!_ioUringDirectSqeEnabled)
+ {
+ return setup;
+ }
+
+ int slotIndex = AllocateCompletionSlot();
+ if (slotIndex < 0)
+ {
+ RecordIoUringCompletionSlotExhaustion();
+
+ if (!_completionSlotDrainInProgress)
+ {
+ _completionSlotDrainInProgress = true;
+ try
+ {
+ SocketEventHandler handler = new SocketEventHandler(this);
+ if (DrainCqeRingBatch(handler))
+ {
+ slotIndex = AllocateCompletionSlot();
+ }
+ }
+ finally
+ {
+ _completionSlotDrainInProgress = false;
+ }
+ }
+
+ if (slotIndex < 0)
+ {
+ return setup;
+ }
+
+ RecordIoUringCompletionSlotDrainRecovery();
+ }
+
+ setup.SlotIndex = slotIndex;
+ ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex];
+ ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![slotIndex];
+ setup.UserData = EncodeCompletionSlotUserData(slotIndex, slot.Generation);
+
+ bool addedSocketRef = false;
+ try
+ {
+ // Keep the fd alive from SQE prep through CQE retirement to avoid fd-reuse races after close.
+ socket.DangerousAddRef(ref addedSocketRef);
+ }
+ catch (ObjectDisposedException)
+ {
+ FreeCompletionSlot(slotIndex);
+ setup.SlotIndex = -1;
+ setup.ErrorCode = SocketError.OperationAborted;
+ setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.PrepareFailed;
+ return setup;
+ }
+
+ if (!addedSocketRef)
+ {
+ FreeCompletionSlot(slotIndex);
+ setup.SlotIndex = -1;
+ setup.ErrorCode = SocketError.OperationAborted;
+ setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.PrepareFailed;
+ return setup;
+ }
+
+ slotStorage.DangerousRefSocketHandle = socket;
+ // GC/rooting contract for fd lifetime:
+ // Engine -> _completionSlotStorage[slotIndex].DangerousRefSocketHandle -> SafeSocketHandle.
+ // Keep this chain alive across SQE submission through CQE retirement to avoid fd reuse races. + SafeSocketHandle? operation = slotStorage.DangerousRefSocketHandle; + Debug.Assert(operation != null); + int socketFd = (int)(nint)operation!.DangerousGetHandle(); + ConfigureSocketSqeFdAndFlags(socketFd, out setup.SqeFd, out setup.SqeFlags); + ApplyDebugTestForcedResult(ref slot, opcode); + + if (!TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out Interop.Error submitError)) + { + RestoreDebugTestForcedResultIfNeeded(slotIndex, opcode); + FreeCompletionSlot(slotIndex); + setup.SlotIndex = -1; + + if (submitError == Interop.Error.SUCCESS || + submitError == Interop.Error.EAGAIN || + submitError == Interop.Error.EWOULDBLOCK) + { + return setup; + } + + setup.ErrorCode = SocketPal.GetSocketErrorForErrorCode(submitError); + setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.PrepareFailed; + return setup; + } + + setup.Sqe = sqe; + setup.PrepareResult = SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + return setup; + } + + /// Prepares a send SQE via the managed direct path. + internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSend( + SafeSocketHandle socket, + byte* buffer, + int bufferLen, + SocketFlags flags, + out ulong userData, + out SocketError errorCode) + { + userData = 0; + errorCode = SocketError.Success; + + if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags)) + { + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.Send); + if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared) + { + errorCode = setup.ErrorCode; + return setup.PrepareResult; + } + + WriteSendLikeSqe(setup.Sqe, IoUringOpcodes.Send, setup.SqeFd, setup.SqeFlags, setup.UserData, buffer, (uint)bufferLen, rwFlags); + userData = setup.UserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + /// + /// Prepares a send SQE, preferring SEND_ZC when eligible and falling back to SEND when unavailable. + /// + internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendWithZeroCopyFallback( + SafeSocketHandle socket, + byte* buffer, + int bufferLen, + SocketFlags flags, + out bool usedZeroCopy, + out ulong userData, + out SocketError errorCode) + { + usedZeroCopy = false; + if (ShouldTryIoUringDirectSendZeroCopy(bufferLen)) + { + SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult zeroCopyResult = TryPrepareIoUringDirectSendZc( + socket, + buffer, + bufferLen, + flags, + out userData, + out errorCode); + if (zeroCopyResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported) + { + usedZeroCopy = zeroCopyResult == SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + return zeroCopyResult; + } + } + + return TryPrepareIoUringDirectSend( + socket, + buffer, + bufferLen, + flags, + out userData, + out errorCode); + } + + /// Prepares a zero-copy send SQE via the managed direct path. 
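+ /// The first CQE reports the byte count just like plain SEND; the completion slot is then
+ /// retained until the kernel's NOTIF CQE confirms the user buffer is no longer referenced
+ /// (see HandleZeroCopyNotification).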
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendZc( + SafeSocketHandle socket, + byte* buffer, + int bufferLen, + SocketFlags flags, + out ulong userData, + out SocketError errorCode) + { + userData = 0; + errorCode = SocketError.Success; + + if (!ShouldTryIoUringDirectSendZeroCopy(bufferLen)) + { + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags)) + { + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.SendZc); + if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared) + { + errorCode = setup.ErrorCode; + return setup.PrepareResult; + } + + ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex]; + slot.IsZeroCopySend = true; + slot.ZeroCopyNotificationPending = false; + + WriteSendLikeSqe(setup.Sqe, IoUringOpcodes.SendZc, setup.SqeFd, setup.SqeFlags, setup.UserData, buffer, (uint)bufferLen, rwFlags); + userData = setup.UserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + /// Prepares a recv SQE via the managed direct path. + internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectRecv( + SafeSocketHandle socket, + byte* buffer, + int bufferLen, + SocketFlags flags, + bool allowMultishotRecv, + out ulong userData, + out SocketError errorCode) + { + userData = 0; + errorCode = SocketError.Success; + + if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags)) + { + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.Recv); + if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared) + { + errorCode = setup.ErrorCode; + return setup.PrepareResult; + } + + if (ShouldTryIoUringDirectFixedRecv(flags, allowMultishotRecv, bufferLen) && + TryPrepareIoUringDirectRecvFixed(setup.SlotIndex, setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData, bufferLen)) + { + SocketsTelemetry.Log.IoUringFixedRecvSelected(); + userData = setup.UserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + if (allowMultishotRecv && + bufferLen > 0 && + TryGetIoUringMultishotRecvBufferGroupId(out ushort multishotBufferGroupId)) + { + WriteMultishotRecvSqe(setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData, multishotBufferGroupId); + } + else if (bufferLen > 0 && + TryGetIoUringProvidedBufferGroupId(out ushort providedBufferGroupId)) + { + WriteProvidedBufferRecvSqe( + setup.Sqe, + setup.SqeFd, + setup.SqeFlags, + setup.UserData, + (uint)bufferLen, + rwFlags, + providedBufferGroupId); + } + else + { + WriteRecvSqe(setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData, buffer, (uint)bufferLen, rwFlags); + } + userData = setup.UserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + private bool ShouldTryIoUringDirectFixedRecv(SocketFlags flags, bool allowMultishotRecv, int bufferLen) + { + if (!_supportsOpReadFixed || !_ioUringCapabilities.HasRegisteredBuffers) + { + return false; + } + + if (allowMultishotRecv || bufferLen <= 0) + { + return false; + } + + // READ_FIXED does not provide recvmsg/socket flags semantics. 
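+ // For example, SocketFlags.Peek (MSG_PEEK) cannot be expressed through READ_FIXED's
+ // plain-read semantics, so any non-default flags take the normal recv path.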
+ return flags == SocketFlags.None; + } + + private unsafe bool TryPrepareIoUringDirectRecvFixed( + int slotIndex, + IoUringSqe* sqe, + int sqeFd, + byte sqeFlags, + ulong userData, + int requestedLength) + { + IoUringProvidedBufferRing? providedBufferRing = _ioUringProvidedBufferRing; + if (providedBufferRing is null) + { + SocketsTelemetry.Log.IoUringFixedRecvFallback(); + return false; + } + + if (!providedBufferRing.TryAcquireBufferForPreparedReceive( + out ushort bufferId, + out byte* fixedBuffer, + out int fixedBufferLength)) + { + // Under transient provided-buffer pressure, fall back to normal receive preparation. + SocketsTelemetry.Log.IoUringFixedRecvFallback(); + return false; + } + + Debug.Assert(_completionSlots is not null); + ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex]; + slot.UsesFixedRecvBuffer = true; + slot.FixedRecvBufferId = bufferId; + + int receiveLength = Math.Min(requestedLength, fixedBufferLength); + WriteReadFixedSqe( + sqe, + sqeFd, + sqeFlags, + userData, + fixedBuffer, + (uint)receiveLength, + bufferId); + return true; + } + + /// Prepares an accept SQE via the managed direct path. + internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectAccept( + SafeSocketHandle socket, + byte* socketAddress, + int socketAddressLen, + out ulong userData, + out SocketError errorCode) + { + userData = 0; + errorCode = SocketError.Success; + IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.Accept); + if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared) + { + errorCode = setup.ErrorCode; + return setup.PrepareResult; + } + + ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex]; + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex]; + SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.Accept); + Debug.Assert(slotStorage.NativeSocketAddressLengthPtr is not null); + *slotStorage.NativeSocketAddressLengthPtr = socketAddressLen; + + WriteAcceptSqe(setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData, socketAddress, (IntPtr)slotStorage.NativeSocketAddressLengthPtr); + userData = setup.UserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + /// Prepares a multishot accept SQE via the managed direct path. 
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectMultishotAccept( + SafeSocketHandle socket, + byte* socketAddress, + int socketAddressLen, + out ulong userData, + out SocketError errorCode) + { + userData = 0; + errorCode = SocketError.Success; + if (!_supportsMultishotAccept) + { + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.Accept); + if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared) + { + errorCode = setup.ErrorCode; + return setup.PrepareResult; + } + _ = socketAddress; + _ = socketAddressLen; + + ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex]; + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex]; + SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.Accept); + Debug.Assert(slotStorage.NativeSocketAddressLengthPtr is not null); + // Security hardening: multishot accept reuses a single SQE across shots, so sharing one sockaddr + // writeback buffer can race and surface mismatched peer addresses under bursty delivery. + // Transitional multishot accept only needs accepted fds, so request no sockaddr writeback. + *slotStorage.NativeSocketAddressLengthPtr = 0; + slotStorage.ReceiveSocketAddressCapacity = 0; + + WriteMultishotAcceptSqe( + setup.Sqe, + setup.SqeFd, + setup.SqeFlags, + setup.UserData, + socketAddress: null, + socketAddressLengthPtr: IntPtr.Zero); + userData = setup.UserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + /// Prepares a connect SQE via the managed direct path. + internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectConnect( + SafeSocketHandle socket, + byte* socketAddress, + int socketAddressLen, + out ulong userData, + out SocketError errorCode) + { + userData = 0; + errorCode = SocketError.Success; + IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.Connect); + if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared) + { + errorCode = setup.ErrorCode; + return setup.PrepareResult; + } + + WriteConnectSqe(setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData, socketAddress, socketAddressLen); + userData = setup.UserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + /// Prepares a sendmsg SQE via the managed direct path. 
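+ /// The managed msghdr is copied into per-slot native storage (TryPrepareInlineMessageStorage),
+ /// so the SQE references memory that remains valid after this call returns and until the
+ /// CQE retires the slot.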
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessage( + SafeSocketHandle socket, + Interop.Sys.MessageHeader* messageHeader, + SocketFlags flags, + out ulong userData, + out SocketError errorCode) + { + userData = 0; + errorCode = SocketError.Success; + + if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags)) + { + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.SendMsg); + if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared) + { + errorCode = setup.ErrorCode; + return setup.PrepareResult; + } + + ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex]; + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex]; + SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.Message); + if (!TryPrepareInlineMessageStorage(setup.SlotIndex, messageHeader, isReceive: false)) + { + FreeCompletionSlot(setup.SlotIndex); + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + WriteSendMsgLikeSqe(setup.Sqe, IoUringOpcodes.SendMsg, setup.SqeFd, setup.SqeFlags, setup.UserData, slotStorage.NativeMsgHdrPtr, rwFlags); + userData = setup.UserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + /// + /// Prepares a sendmsg SQE, preferring SENDMSG_ZC when eligible and falling back to SENDMSG otherwise. + /// + internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessageWithZeroCopyFallback( + SafeSocketHandle socket, + Interop.Sys.MessageHeader* messageHeader, + int payloadLength, + SocketFlags flags, + out ulong userData, + out SocketError errorCode) + { + if (ShouldTryIoUringDirectSendMessageZeroCopy(payloadLength)) + { + SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult zeroCopyResult = TryPrepareIoUringDirectSendMessageZc( + socket, + messageHeader, + payloadLength, + flags, + out userData, + out errorCode); + if (zeroCopyResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported) + { + return zeroCopyResult; + } + } + + return TryPrepareIoUringDirectSendMessage( + socket, + messageHeader, + flags, + out userData, + out errorCode); + } + + /// Prepares a sendmsg_zc SQE via the managed direct path. 
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectSendMessageZc( + SafeSocketHandle socket, + Interop.Sys.MessageHeader* messageHeader, + int payloadLength, + SocketFlags flags, + out ulong userData, + out SocketError errorCode) + { + userData = 0; + errorCode = SocketError.Success; + + if (!ShouldTryIoUringDirectSendMessageZeroCopy(payloadLength)) + { + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags)) + { + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.SendMsgZc); + if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared) + { + errorCode = setup.ErrorCode; + return setup.PrepareResult; + } + + ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex]; + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex]; + SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.Message); + slotStorage.MessageIsReceive = false; + // Mirror SEND_ZC semantics: first CQE is not final managed completion; operation + // completes only after NOTIF CQE confirms kernel/NIC no longer references payload. + slot.IsZeroCopySend = true; + slot.ZeroCopyNotificationPending = false; + if (!TryPrepareInlineMessageStorage(setup.SlotIndex, messageHeader, isReceive: false)) + { + // Per-slot inline native slabs avoid hot-path allocations; overflow shapes + // fall back by returning Unsupported so upper layers can use alternate paths. + FreeCompletionSlot(setup.SlotIndex); + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + WriteSendMsgLikeSqe(setup.Sqe, IoUringOpcodes.SendMsgZc, setup.SqeFd, setup.SqeFlags, setup.UserData, slotStorage.NativeMsgHdrPtr, rwFlags); + userData = setup.UserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + /// Prepares a recvmsg SQE via the managed direct path. 
+ internal unsafe SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult TryPrepareIoUringDirectReceiveMessage( + SafeSocketHandle socket, + Interop.Sys.MessageHeader* messageHeader, + SocketFlags flags, + out ulong userData, + out SocketError errorCode) + { + userData = 0; + errorCode = SocketError.Success; + + if (!TryConvertIoUringPrepareSocketFlags(flags, out uint rwFlags)) + { + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + IoUringDirectSqeSetupResult setup = TrySetupDirectSqe(socket, IoUringOpcodes.RecvMsg); + if (setup.PrepareResult != SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared) + { + errorCode = setup.ErrorCode; + return setup.PrepareResult; + } + + ref IoUringCompletionSlot slot = ref _completionSlots![setup.SlotIndex]; + ref IoUringCompletionSlotStorage slotStorage = ref _completionSlotStorage![setup.SlotIndex]; + SetCompletionSlotKind(ref slot, IoUringCompletionOperationKind.Message); + if (!TryPrepareInlineMessageStorage(setup.SlotIndex, messageHeader, isReceive: true)) + { + FreeCompletionSlot(setup.SlotIndex); + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Unsupported; + } + + WriteRecvMsgSqe(setup.Sqe, setup.SqeFd, setup.SqeFlags, setup.UserData, slotStorage.NativeMsgHdrPtr, rwFlags); + userData = setup.UserData; + return SocketAsyncContext.AsyncOperation.IoUringDirectPrepareResult.Prepared; + } + + /// Debug-only assertion that validates a state machine transition. + [Conditional("DEBUG")] + private static void AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState from, + IoUringOperationLifecycleState to) + { + bool isValid = + from == IoUringOperationLifecycleState.Queued && to == IoUringOperationLifecycleState.Prepared || + from == IoUringOperationLifecycleState.Prepared && to == IoUringOperationLifecycleState.Submitted || + from == IoUringOperationLifecycleState.Prepared && to == IoUringOperationLifecycleState.Detached || + from == IoUringOperationLifecycleState.Submitted && + (to == IoUringOperationLifecycleState.Queued || + to == IoUringOperationLifecycleState.Completed || + to == IoUringOperationLifecycleState.Canceled || + to == IoUringOperationLifecycleState.Detached); + + Debug.Assert(isValid, $"Invalid io_uring lifecycle transition: {from} -> {to}"); + } + + /// Checks whether the kernel version meets the minimum for io_uring support. + [MethodImpl(MethodImplOptions.NoInlining)] + private static bool IsIoUringKernelVersionSupported() + { +#if DEBUG + if (string.Equals( + Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.ForceKernelVersionUnsupported), + "1", + StringComparison.Ordinal)) + { + return false; + } +#endif + + return OperatingSystem.IsOSPlatformVersionAtLeast( + "Linux", + IoUringConstants.MinKernelMajor, + IoUringConstants.MinKernelMinor); + } + + /// + /// Recomputes whether multishot recv can be used by this engine instance. + /// Requires opcode support and active provided-buffer ring support. + /// + private bool RefreshIoUringMultishotRecvSupport() + { + _supportsMultishotRecv = + _supportsOpRecv && + _ioUringCapabilities.SupportsProvidedBufferRings; + return _supportsMultishotRecv; + } + + /// + /// Returns the provided-buffer group id used for buffer-select receive submissions. 
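+ /// Buffer-select receives pass this group id with IOSQE_BUFFER_SELECT; the kernel picks a
+ /// buffer from the ring and reports the chosen buffer id back in the CQE flags.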
+ /// + private bool TryGetIoUringProvidedBufferGroupId(out ushort bufferGroupId) + { + if (_ioUringCapabilities.SupportsProvidedBufferRings && _ioUringProvidedBufferRing is not null) + { + bufferGroupId = _ioUringProvidedBufferGroupId; + return true; + } + + bufferGroupId = default; + return false; + } + + /// + /// Returns the provided-buffer group id used for multishot recv submissions. + /// Multishot recv remains disabled unless both the opcode probe and provided-ring + /// registration succeeded for this engine instance. + /// + private bool TryGetIoUringMultishotRecvBufferGroupId(out ushort bufferGroupId) + { + if (_supportsMultishotRecv && TryGetIoUringProvidedBufferGroupId(out bufferGroupId)) + { + return true; + } + + bufferGroupId = default; + return false; + } + + internal bool SupportsMultishotRecv => _ioUringCapabilities.SupportsMultishotRecv; + internal bool SupportsMultishotAccept => _ioUringCapabilities.SupportsMultishotAccept; + + /// Calls io_uring_setup and negotiates feature flags. + [MethodImpl(MethodImplOptions.NoInlining)] + private static unsafe bool TrySetupIoUring(bool sqPollRequested, out IoUringSetupResult setupResult) + { + setupResult = default; + uint queueEntries = GetIoUringQueueEntries(); + + // R_DISABLED defers submitter_task assignment until REGISTER_ENABLE_RINGS, + // which is called from the event loop thread. This ensures DEFER_TASKRUN's + // submitter_task check (EEXIST on mismatch) passes on all kernel versions. + uint flags = IoUringConstants.SetupCqSize | IoUringConstants.SetupSubmitAll + | IoUringConstants.SetupCoopTaskrun | IoUringConstants.SetupSingleIssuer + | IoUringConstants.SetupNoSqArray | IoUringConstants.SetupCloexec + | IoUringConstants.SetupRDisabled; + + if (sqPollRequested) + { + // SQPOLL and DEFER_TASKRUN are mutually exclusive in practice. + flags |= IoUringConstants.SetupSqPoll; + if (NetEventSource.Log.IsEnabled()) + { + NetEventSource.Info(null, "io_uring setup: SQPOLL requested and included in initial setup flags."); + } + } + else + { + // Default to DEFER_TASKRUN (+ COOP_TASKRUN already in base flags): this reduces + // event-loop CPU by deferring task work until io_uring_enter, at the cost of + // potentially higher tail latency when the event loop is preempted by other work. + // Benchmark data for DEFER_TASKRUN vs COOP_TASKRUN-only under mixed epoll/io_uring + // loads and revisit the default when we have p99 data. + flags |= IoUringConstants.SetupDeferTaskrun; + } + + Interop.Sys.IoUringParams ioParams = default; + ioParams.Flags = flags; + ioParams.CqEntries = queueEntries * IoUringConstants.CqEntriesFactor; + + int ringFd; + Interop.Error err = Interop.Sys.IoUringShimSetup(queueEntries, &ioParams, &ringFd); + + // IORING_SETUP_NO_SQARRAY was introduced in Linux 6.6. + // IORING_SETUP_CLOEXEC was introduced in Linux 5.19. + // Peel unsupported setup flags on EINVAL and retry. + if (err == Interop.Error.EPERM) + { + string deniedFlag = (flags & IoUringConstants.SetupSqPoll) != 0 ? + "IORING_SETUP_SQPOLL" : + "IORING_SETUP_NO_SQARRAY"; + // Never peel/retry on EPERM; that can bypass an explicit seccomp/kernel policy denial. 
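+ // EPERM here typically indicates a seccomp filter, or on 6.6+ kernels the
+ // kernel.io_uring_disabled sysctl rejecting io_uring_setup outright.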
+ NetEventSource.Error(null, $"io_uring setup denied (EPERM) for {deniedFlag}; not retrying with peeled flags."); + } + else if (err == Interop.Error.EINVAL && + (flags & IoUringConstants.SetupNoSqArray) != 0) + { + flags &= ~IoUringConstants.SetupNoSqArray; + ioParams = default; + ioParams.Flags = flags; + ioParams.CqEntries = queueEntries * IoUringConstants.CqEntriesFactor; + + if (NetEventSource.Log.IsEnabled()) + { + NetEventSource.Info(null, $"io_uring setup: peeled NO_SQARRAY after {err}."); + } + + err = Interop.Sys.IoUringShimSetup(queueEntries, &ioParams, &ringFd); + } + + if (err == Interop.Error.EINVAL && + (flags & IoUringConstants.SetupCloexec) != 0) + { + flags &= ~IoUringConstants.SetupCloexec; + ioParams = default; + ioParams.Flags = flags; + ioParams.CqEntries = queueEntries * IoUringConstants.CqEntriesFactor; + + if (NetEventSource.Log.IsEnabled()) + { + NetEventSource.Info(null, $"io_uring setup: peeled CLOEXEC after {err}."); + } + + err = Interop.Sys.IoUringShimSetup(queueEntries, &ioParams, &ringFd); + } + + if (err != Interop.Error.SUCCESS) + { + return false; + } + + // IORING_SETUP_CLOEXEC removes the fork/exec inheritance window on supporting kernels. + // Keep FD_CLOEXEC as a fallback for peeled/older setups. + if (!TrySetFdCloseOnExec(ringFd, out Interop.Error cloexecError)) + { + if (NetEventSource.Log.IsEnabled()) + { + // Ensure ring fd is not inherited across fork/exec; inherited ring fds can corrupt ownership. + NetEventSource.Error(null, $"io_uring setup: failed to set FD_CLOEXEC on ring fd: {cloexecError}."); + } + + Interop.Sys.IoUringShimCloseFd(ringFd); + return false; + } + + setupResult.RingFd = ringFd; + setupResult.Params = ioParams; + setupResult.NegotiatedFlags = flags; + setupResult.UsesExtArg = (ioParams.Features & IoUringConstants.FeatureExtArg) != 0; + setupResult.SqPollNegotiated = (flags & IoUringConstants.SetupSqPoll) != 0; + if (setupResult.SqPollNegotiated && NetEventSource.Log.IsEnabled()) + { + NetEventSource.Info(null, "io_uring setup: SQPOLL negotiated."); + } + + if (setupResult.SqPollNegotiated) + { + SocketsTelemetry.Log.ReportIoUringSqPollNegotiatedWarning(); + } + return true; + } + + + /// Queues a POLL_ADD SQE on the wakeup eventfd for cross-thread signaling. + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe bool QueueManagedWakeupPollAdd() + { + if (_managedWakeupEventFd < 0) + return false; + + if (!TryGetNextManagedSqe(out IoUringSqe* sqe)) + return false; + + sqe->Opcode = IoUringOpcodes.PollAdd; + sqe->Fd = _managedWakeupEventFd; + sqe->Len = IoUringConstants.PollAddFlagMulti; // IORING_POLL_ADD_MULTI + sqe->RwFlags = IoUringConstants.PollIn; + sqe->UserData = EncodeIoUringUserData(IoUringConstants.TagWakeupSignal, 0); + return true; + } + + /// Attempts to register the ring fd for fixed-fd submission. 
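+ /// Once registered, io_uring_enter can be invoked with IORING_ENTER_REGISTERED_RING and the
+ /// returned index in place of the real ring fd, skipping the per-syscall fd lookup.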
+ [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe bool TryRegisterRingFd(int ringFd, out int registeredRingFd) + { + registeredRingFd = -1; + + // io_uring_rsrc_update: { uint32 offset, uint32 resv, uint64 data } + uint* update = stackalloc uint[4]; // 16 bytes + update[0] = IoUringConstants.RegisterOffsetAuto; // offset = auto-assign + update[1] = 0; // resv + *(ulong*)(update + 2) = (ulong)ringFd; // data = ring fd + + int result; + Interop.Error err = Interop.Sys.IoUringShimRegister( + ringFd, IoUringConstants.RegisterRingFds, update, 1u, &result); + + if (err != Interop.Error.SUCCESS || result <= 0) + return false; + + registeredRingFd = (int)update[0]; // kernel wrote assigned index back + return true; + } + + /// + /// Configures the SQE fd and flags for a socket operation. + /// Completion mode uses direct socket file descriptors. + /// + private static void ConfigureSocketSqeFdAndFlags(int socketFd, out int sqeFd, out byte sqeFlags) + { + sqeFd = socketFd; + sqeFlags = 0; + } + + + /// + /// Orchestrates complete managed io_uring initialization: kernel version check, + /// ring setup with flag negotiation, mmap, opcode probe, eventfd creation, + /// ring fd registration, and initial wakeup poll queue. + /// + [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe bool TryInitializeManagedIoUring(in IoUringResolvedConfiguration resolvedConfiguration) + { + if (!IsIoUringKernelVersionSupported()) + return false; + + bool sqPollRequested = resolvedConfiguration.SqPollRequested; + if (!TrySetupIoUring(sqPollRequested, out IoUringSetupResult setupResult)) + return false; + + if (!TryMmapRings(ref setupResult)) + return false; + + _sqPollEnabled = setupResult.SqPollNegotiated; + if (NetEventSource.Log.IsEnabled()) + { + if (sqPollRequested && !_sqPollEnabled) + { + NetEventSource.Info( + this, + "SQPOLL requested but not negotiated (kernel support/capabilities may be unavailable)."); + } + else if (_sqPollEnabled) + { + NetEventSource.Info(this, "SQPOLL negotiated and enabled."); + } + } + + // Probe opcode support. + ProbeIoUringOpcodeSupport(setupResult.RingFd); + + // Create wakeup eventfd. + int eventFd; + Interop.Error err = Interop.Sys.IoUringShimCreateEventFd(&eventFd); + if (err != Interop.Error.SUCCESS) + { + // Cleanup: unmap and close + CleanupManagedRings(); + return false; + } + + if (!TrySetFdCloseOnExec(eventFd, out Interop.Error cloexecError)) + { + if (NetEventSource.Log.IsEnabled()) + { + // Eventfd wake channel must remain process-local across exec to prevent stale cross-process signaling. + NetEventSource.Error(this, $"io_uring setup: failed to set FD_CLOEXEC on wakeup eventfd: {cloexecError}."); + } + + Interop.Sys.IoUringShimCloseFd(eventFd); + CleanupManagedRings(); + return false; + } + + _managedWakeupEventFd = eventFd; + + // Try to register the ring fd for faster enter syscalls. + if (TryRegisterRingFd(setupResult.RingFd, out int registeredRingFd)) + { + _ioUringSqRingInfo.RegisteredRingFd = registeredRingFd; + } + + // Queue the initial wakeup POLL_ADD. + // Direct SQE must be enabled for QueueManagedWakeupPollAdd to work. + _ioUringDirectSqeEnabled = true; + if (!QueueManagedWakeupPollAdd()) + { + _ioUringDirectSqeEnabled = false; + Interop.Sys.IoUringShimCloseFd(eventFd); + _managedWakeupEventFd = -1; + CleanupManagedRings(); + return false; + } + + // Respect process-level direct SQE toggle after the required wakeup POLL_ADD is armed. 
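+ // Ordering example: QueueManagedWakeupPollAdd above writes its SQE through the
+ // direct-SQE path, so the toggle is forced on for that single write and only then
+ // lowered to the configured value; applying the process-level opt-out first would
+ // fail the mandatory wakeup POLL_ADD even on fully capable kernels.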
+ if (resolvedConfiguration.DirectSqeDisabled) + { + _ioUringDirectSqeEnabled = false; + } + + InitializeIoUringProvidedBufferRingIfSupported(setupResult.RingFd); + RefreshIoUringMultishotRecvSupport(); + _ioUringInitialized = true; + + InitializeDebugTestHooksFromEnvironment(); + + return true; + } + + /// Validates the managed NativeMsghdr layout contract for direct io_uring message operations. + [MethodImpl(MethodImplOptions.NoInlining)] + private bool IsNativeMsghdrLayoutSupportedForIoUring() + { + if (IsNativeMsghdrLayoutSupportedForIoUring(IntPtr.Size, sizeof(NativeMsghdr))) + { + return true; + } + + if (NetEventSource.Log.IsEnabled()) + { + NetEventSource.Info( + this, + $"io_uring disabled: unsupported NativeMsghdr layout (pointerSize={IntPtr.Size}, sizeof(NativeMsghdr)={sizeof(NativeMsghdr)})"); + } + + return false; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsNativeMsghdrLayoutSupportedForIoUring(int pointerSize, int nativeMsghdrSize) => + pointerSize == 8 && nativeMsghdrSize == 56; + + /// Detects io_uring support and initializes the managed submission/completion paths. + partial void LinuxDetectAndInitializeIoUring() + { + IoUringResolvedConfiguration resolvedConfiguration = ResolveIoUringResolvedConfiguration(); + LogIoUringResolvedConfigurationIfNeeded(in resolvedConfiguration); + if (!resolvedConfiguration.IoUringEnabled || !IsNativeMsghdrLayoutSupportedForIoUring() || !TryInitializeManagedIoUring(in resolvedConfiguration)) + { + _ioUringCapabilities = ResolveLinuxIoUringCapabilities(isIoUringPort: false); + SocketsTelemetry.Log.ReportSocketEngineBackendSelected( + isIoUringPort: false, + isCompletionMode: false, + sqPollEnabled: false); + + if (NetEventSource.Log.IsEnabled()) + { + LogIoUringModeSelection(_ioUringCapabilities); + } + + return; + } + + // Managed init succeeded — set capabilities and initialize managed-side state. + _ioUringCapabilities = default(LinuxIoUringCapabilities) + .WithIsIoUringPort(true) + .WithMode(IoUringMode.Completion) + .WithSupportsMultishotRecv(_supportsMultishotRecv) + .WithSupportsMultishotAccept(_supportsMultishotAccept) + .WithSupportsZeroCopySend(_zeroCopySendEnabled) + .WithSqPollEnabled(_sqPollEnabled) + .WithSupportsProvidedBufferRings(false) + .WithHasRegisteredBuffers(false); + + SocketsTelemetry.Log.ReportSocketEngineBackendSelected( + isIoUringPort: true, + isCompletionMode: true, + sqPollEnabled: _ioUringCapabilities.SqPollEnabled); + + if (NetEventSource.Log.IsEnabled()) + { + LogIoUringModeSelection(_ioUringCapabilities); + } + + InitializeLinuxIoUringDiagnosticsState(); + + _ioUringSlotCapacity = (int)Math.Max(_managedCqEntries, IoUringConstants.QueueEntries); + // Slot pool capacity is 2x slot capacity (currently 8192 with default cq sizing). + // Multishot operations retain slots for their full lifetime, so this bounds + // concurrent long-lived multishot receives before backpressure/exhaustion. + _ioUringPrepareQueue = new MpscQueue(); + _ioUringCancelQueue = new MpscQueue(); + int completionSlotCapacity = _ioUringSlotCapacity * IoUringConstants.CompletionOperationPoolCapacityFactor; + InitializeCompletionSlotPool(completionSlotCapacity); + + _managedCqDrainEnabled = true; + } + + /// + /// Enables the io_uring ring from the event loop thread, setting submitter_task + /// to this thread. Required because io_uring_setup uses R_DISABLED to defer + /// submitter_task assignment, and DEFER_TASKRUN requires submitter_task == current + /// on every io_uring_enter call. 
+ ///
+ partial void LinuxEventLoopEnableRings()
+ {
+ if (!_ioUringCapabilities.IsIoUringPort || _managedRingFd < 0)
+ {
+ return;
+ }
+
+ int result;
+ Interop.Error err = Interop.Sys.IoUringShimRegister(
+ _managedRingFd, IoUringConstants.RegisterEnableRings, null, 0, &result);
+ if (err != Interop.Error.SUCCESS)
+ {
+ if (NetEventSource.Log.IsEnabled())
+ {
+ NetEventSource.Error(this, $"io_uring REGISTER_ENABLE_RINGS failed: {err}");
+ }
+ }
+ }
+
+ /// Tears down io_uring state before native resource cleanup.
+ partial void LinuxBeforeFreeNativeResources(ref bool closeSocketEventPort)
+ {
+ if (!_ioUringCapabilities.IsIoUringPort || _port == (IntPtr)(-1))
+ {
+ return;
+ }
+
+ // Publish teardown before draining queues/closing the native port so concurrent
+ // producer paths observe shutdown via acquire reads and stop queueing new work.
+ Volatile.Write(ref _ioUringTeardownInitiated, 1);
+ DrainQueuedIoUringOperationsForTeardown();
+
+ Interop.Error closeError = Interop.Sys.CloseSocketEventPort(_port);
+ if (closeError == Interop.Error.SUCCESS)
+ {
+ closeSocketEventPort = false;
+ Volatile.Write(ref _ioUringPortClosedForTeardown, 1);
+ }
+ }
+
+ /// Submits pending SQEs before entering the wait.
+ partial void LinuxEventLoopBeforeWait()
+ {
+ Interop.Error submitError = SubmitIoUringBatch();
+ if (submitError != Interop.Error.SUCCESS)
+ {
+ // FailFast site: the event-loop submit step cannot degrade safely once
+ // io_uring completion mode is active; losing submit progress would orphan tracked ops.
+ ThrowInternalException(submitError);
+ }
+ }
+
+ /// Attempts a managed completion wait using io_uring_enter with timeout.
+ partial void LinuxEventLoopTryCompletionWait(SocketEventHandler handler, ref int numEvents, ref int numCompletions, ref Interop.Error err, ref bool waitHandled)
+ {
+ if (!_ioUringCapabilities.IsCompletionMode)
+ {
+ return;
+ }
+
+ // Managed CQE drain path: read CQEs directly from mmap'd ring.
+ // First, try a non-blocking drain of any already-available CQEs.
+ bool hadCqes = DrainCqeRingBatch(handler);
+ if (hadCqes)
+ {
+ numCompletions = 1;
+ numEvents = 0; // no readiness events were produced on this path
+ waitHandled = true;
+ err = Interop.Error.SUCCESS;
+ return;
+ }
+
+ // No CQEs available — submit pending SQEs and wait for at least 1 CQE.
+ uint enterFlags = IoUringConstants.EnterGetevents;
+ int ringFd = _managedRingFd;
+ if (_ioUringSqRingInfo.RegisteredRingFd >= 0)
+ {
+ enterFlags |= IoUringConstants.EnterRegisteredRing;
+ ringFd = _ioUringSqRingInfo.RegisteredRingFd;
+ }
+
+ uint submitCount = _sqPollEnabled ? 0u : _ioUringManagedPendingSubmissions;
+ if (_sqPollEnabled &&
+ _ioUringManagedPendingSubmissions != 0 &&
+ SqNeedWakeup())
+ {
+ enterFlags |= IoUringConstants.EnterSqWakeup;
+ }
+
+ // Snapshot the wakeup generation counter before entering the blocking syscall.
+ // After waking, we compare to detect wakeups that arrived during the syscall.
+ // Declared ahead of the EXT_ARG/non-EXT_ARG branches so the post-wake drain
+ // below can use the same snapshot on either path.
+ uint wakeGenBefore = Volatile.Read(ref _ioUringWakeupGeneration);
+
+ if (_managedUsesExtArg)
+ {
+ // Bounded wait via EXT_ARG; timeout shortens when wake circuit-breaker is active.
+ uint waitEnterFlags = enterFlags | IoUringConstants.EnterExtArg;
+ int waitRingFd = ringFd;
+ Interop.Sys.IoUringKernelTimespec timeout = default;
+ timeout.TvNsec = GetManagedCompletionWaitTimeoutNanos();
+ Interop.Sys.IoUringGeteventsArg extArg = default;
+ extArg.Ts = (ulong)(nuint)(&timeout);
+
+ int result;
+ err = Interop.Sys.IoUringShimEnterExt(
+ waitRingFd, submitCount, 1, waitEnterFlags, &extArg, &result);
+ if (err == Interop.Error.EINVAL && (waitEnterFlags & IoUringConstants.EnterRegisteredRing) != 0)
+ {
+ DisableRegisteredRingFd();
+ waitEnterFlags &= ~IoUringConstants.EnterRegisteredRing;
+ waitRingFd = _managedRingFd;
+ err = Interop.Sys.IoUringShimEnterExt(
+ waitRingFd, submitCount, 1, waitEnterFlags, &extArg, &result);
+ }
+
+ if (err == Interop.Error.SUCCESS)
+ {
+ UpdateManagedPendingSubmissionCountAfterEnter(submitCount, result);
+ }
+ }
+ else
+ {
+ Debug.Assert(
+ false,
+ "Non-EXT_ARG io_uring wait fallback is unexpected on supported kernels (>= 6.1).");
+ uint waitEnterFlags = enterFlags;
+ int waitRingFd = ringFd;
+ int result;
+ err = Interop.Sys.IoUringShimEnter(
+ waitRingFd, submitCount, 1, waitEnterFlags, &result);
+ if (err == Interop.Error.EINVAL && (waitEnterFlags & IoUringConstants.EnterRegisteredRing) != 0)
+ {
+ DisableRegisteredRingFd();
+ waitEnterFlags &= ~IoUringConstants.EnterRegisteredRing;
+ waitRingFd = _managedRingFd;
+ err = Interop.Sys.IoUringShimEnter(
+ waitRingFd, submitCount, 1, waitEnterFlags, &result);
+ }
+
+ if (err == Interop.Error.SUCCESS)
+ {
+ UpdateManagedPendingSubmissionCountAfterEnter(submitCount, result);
+ }
+ }
+
+ // Drain after waking. If a producer signalled while we were inside io_uring_enter,
+ // the generation counter will have advanced past our snapshot; drain and re-snapshot
+ // until the counter is stable so enqueued work is not delayed until the next bounded
+ // wait timeout. (The counter only moves forward, so looping until it equals the
+ // original snapshot would never terminate once it has advanced.)
+ hadCqes = false;
+ while (true)
+ {
+ hadCqes |= DrainCqeRingBatch(handler);
+ uint wakeGenObserved = Volatile.Read(ref _ioUringWakeupGeneration);
+ if (wakeGenObserved == wakeGenBefore)
+ {
+ break;
+ }
+
+ wakeGenBefore = wakeGenObserved;
+ }
+
+ numCompletions = hadCqes ? 1 : 0;
+ numEvents = 0;
+ waitHandled = true;
+ err = Interop.Error.SUCCESS;
+ }
+
+ /// Polls diagnostics after each event loop iteration.
+ partial void LinuxEventLoopAfterIteration()
+ {
+ PollIoUringDiagnosticsIfNeeded(force: false);
+ TrySweepStaleTrackedIoUringOperationsAfterCqOverflowRecovery();
+ }
+
+ /// Queued work item pairing an operation with its prepare sequence number for deferred SQE preparation.
+ private readonly struct IoUringPrepareWorkItem
+ {
+ /// The operation to prepare.
+ public readonly SocketAsyncContext.AsyncOperation Operation;
+ /// The sequence number that must match for the preparation to proceed.
+ public readonly long PrepareSequence;
+
+ /// Creates a work item pairing an operation with its prepare sequence number.
+ public IoUringPrepareWorkItem(SocketAsyncContext.AsyncOperation operation, long prepareSequence)
+ {
+ Operation = operation;
+ PrepareSequence = prepareSequence;
+ }
+ }
+
+ /// Enqueues an operation for deferred SQE preparation on the event loop thread.
+ internal bool TryEnqueueIoUringPreparation(SocketAsyncContext.AsyncOperation operation, long prepareSequence)
+ {
+ if (!_ioUringCapabilities.IsCompletionMode || Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ return false;
+ }
+
+ MpscQueue<IoUringPrepareWorkItem>?
prepareQueue = _ioUringPrepareQueue; + if (prepareQueue is null) + { + return false; + } + + long queueLength = Interlocked.Increment(ref _ioUringPrepareQueueLength); + if (queueLength > s_ioUringPrepareQueueCapacity) + { + Interlocked.Decrement(ref _ioUringPrepareQueueLength); + long overflowCount = Interlocked.Increment(ref _ioUringPrepareQueueOverflowCount); + if ((overflowCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled()) + { + LogIoUringPrepareQueueOverflow(overflowCount, s_ioUringPrepareQueueCapacity); + } + + return false; + } + + if (!prepareQueue.TryEnqueue(new IoUringPrepareWorkItem(operation, prepareSequence))) + { + Interlocked.Decrement(ref _ioUringPrepareQueueLength); + long overflowCount = Interlocked.Increment(ref _ioUringPrepareQueueOverflowCount); + if ((overflowCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled()) + { + LogIoUringPrepareQueueOverflow(overflowCount, s_ioUringPrepareQueueCapacity); + } + + return false; + } + + WakeEventLoop(); + return true; + } + + /// Extracts completion-slot index and generation from tracked reserved-completion user_data. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool TryDecodeTrackedIoUringUserData(ulong userData, out int slotIndex, out ulong generation) + { + generation = 0; + slotIndex = 0; + if (userData == 0) + { + return false; + } + + if ((byte)(userData >> IoUringUserDataTagShift) != IoUringConstants.TagReservedCompletion) + { + return false; + } + + IoUringCompletionSlot[]? completionEntries = _completionSlots; + if (completionEntries is null) + { + return false; + } + + ulong payload = userData & IoUringUserDataPayloadMask; + slotIndex = DecodeCompletionSlotIndex(payload); + if ((uint)slotIndex >= (uint)completionEntries.Length) + { + return false; + } + + generation = (payload >> IoUringConstants.SlotIndexBits) & IoUringConstants.GenerationMask; + return true; + } + + /// Atomically removes and returns the tracked operation matching the user_data and generation. + private bool TryTakeTrackedIoUringOperation(ulong userData, out SocketAsyncContext.AsyncOperation? operation) + { + Debug.Assert(IsCurrentThreadEventLoopThread(), + "TryTakeTrackedIoUringOperation must run on the event-loop thread."); + operation = null; + if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation)) + { + return false; + } + + ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex]; + while (true) + { + SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref entry.TrackedOperation); + if (currentOperation is null) + { + return false; + } + + // Writers publish generation before operation; if operation is visible here, + // generation must match unless this CQE belongs to an older slot incarnation. + if (Volatile.Read(ref entry.TrackedOperationGeneration) != generation) + { + return false; + } + + // Single-owner handoff: exactly one completion-side CAS can null out TrackedOperation + // for this slot incarnation. A racing replace may swap references, but cannot create + // two winners for the same user_data token. + if (Interlocked.CompareExchange(ref entry.TrackedOperation, null, currentOperation) != currentOperation) + { + continue; + } + + // Reset generation to zero so TryReattachTrackedIoUringOperation (used by + // SEND_ZC to re-register while awaiting the NOTIF CQE) can CAS from 0 to + // the new generation. 
Volatile.Write ensures visibility on ARM64 before the + // count decrement below, preventing a concurrent TryTrack from observing + // TrackedOperation == null with a stale non-zero generation. + Volatile.Write(ref entry.TrackedOperationGeneration, 0UL); + DecrementTrackedIoUringOperationCountOnEventLoop(); + operation = currentOperation; + return true; + } + } + + /// Returns the tracked operation for the given user_data without untracking it. + private bool TryGetTrackedIoUringOperation(ulong userData, out SocketAsyncContext.AsyncOperation? operation) + { + Debug.Assert(IsCurrentThreadEventLoopThread(), + "TryGetTrackedIoUringOperation must run on the event-loop thread."); + operation = null; + if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation)) + { + return false; + } + + ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex]; + SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref entry.TrackedOperation); + if (currentOperation is null) + { + return false; + } + + if (Volatile.Read(ref entry.TrackedOperationGeneration) != generation) + { + return false; + } + + operation = currentOperation; + return true; + } + + /// Returns whether an operation with the given user_data and generation is currently tracked. + private bool ContainsTrackedIoUringOperation(ulong userData) + { + Debug.Assert(IsCurrentThreadEventLoopThread(), + "ContainsTrackedIoUringOperation must run on the event-loop thread."); + return TryGetTrackedIoUringOperation(userData, out _); + } + + /// Re-attaches a completion owner after dispatch-side deferral (for example SEND_ZC waiting on NOTIF CQE). + private bool TryReattachTrackedIoUringOperation(ulong userData, SocketAsyncContext.AsyncOperation operation) + { + Debug.Assert(IsCurrentThreadEventLoopThread(), + "TryReattachTrackedIoUringOperation must run on the event-loop thread."); + if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation)) + { + return false; + } + + // Verify the completion slot is still in the expected SEND_ZC NOTIF-pending state + // before attempting to reattach. If the slot was freed and reallocated between the + // first CQE dispatch and this reattach call, the slot's state will not match. + IoUringCompletionSlot[]? completionEntries = _completionSlots; + if (completionEntries is null || (uint)slotIndex >= (uint)completionEntries.Length) + { + return false; + } + + ref IoUringCompletionSlot slot = ref completionEntries[slotIndex]; + if (!slot.IsZeroCopySend || !slot.ZeroCopyNotificationPending || slot.Generation != generation) + { + // Slot was freed and possibly reallocated. The NOTIF CQE was either already + // processed or will be discarded by HandleZeroCopyNotification's generation check. + return false; + } + + ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex]; + if (Interlocked.CompareExchange(ref entry.TrackedOperationGeneration, generation, 0) != 0) + { + return false; + } + + if (Interlocked.CompareExchange(ref entry.TrackedOperation, operation, null) is not null) + { + Volatile.Write(ref entry.TrackedOperationGeneration, 0); + return false; + } + + IncrementTrackedIoUringOperationCountOnEventLoop(); + return true; + } + + /// Atomically replaces the tracked operation for the given user_data. 
+ private bool TryReplaceTrackedIoUringOperation(ulong userData, SocketAsyncContext.AsyncOperation newOperation) + { + if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation)) + { + return false; + } + + ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex]; + while (true) + { + SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref entry.TrackedOperation); + if (currentOperation is null) + { + return false; + } + + if (Volatile.Read(ref entry.TrackedOperationGeneration) != generation) + { + return false; + } + + if (Interlocked.CompareExchange(ref entry.TrackedOperation, newOperation, currentOperation) == currentOperation) + { + return true; + } + } + } + + /// Removes a tracked operation, optionally verifying it matches an expected reference. + private IoUringTrackedOperationRemoveResult TryUntrackTrackedIoUringOperation( + ulong userData, + SocketAsyncContext.AsyncOperation? expectedOperation, + out SocketAsyncContext.AsyncOperation? removedOperation) + { + removedOperation = null; + if (!TryDecodeTrackedIoUringUserData(userData, out int slotIndex, out ulong generation)) + { + return IoUringTrackedOperationRemoveResult.NotFound; + } + + ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex]; + while (true) + { + SocketAsyncContext.AsyncOperation? currentOperation = Volatile.Read(ref entry.TrackedOperation); + if (currentOperation is null) + { + return IoUringTrackedOperationRemoveResult.NotFound; + } + + if (Volatile.Read(ref entry.TrackedOperationGeneration) != generation) + { + return IoUringTrackedOperationRemoveResult.NotFound; + } + + if (expectedOperation is not null && !ReferenceEquals(currentOperation, expectedOperation)) + { + return IoUringTrackedOperationRemoveResult.Mismatch; + } + + if (Interlocked.CompareExchange(ref entry.TrackedOperation, null, currentOperation) != currentOperation) + { + continue; + } + + // Volatile.Write ensures the generation reset is visible on ARM64 before + // the count decrement. This method runs from worker threads (cancellation), + // and a plain store could reorder past Interlocked.Decrement, leaving a + // window where the event loop sees TrackedOperation == null but generation != 0. + Volatile.Write(ref entry.TrackedOperationGeneration, 0UL); + Interlocked.Decrement(ref _trackedIoUringOperationCount); + removedOperation = currentOperation; + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Submitted, + IoUringOperationLifecycleState.Canceled); + return IoUringTrackedOperationRemoveResult.Removed; + } + } + + /// Returns true when no io_uring operations are currently tracked. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsIoUringTrackingEmpty() =>
+ Volatile.Read(ref _trackedIoUringOperationCount) == 0;
+
+ private void IncrementTrackedIoUringOperationCountOnEventLoop()
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "Tracked-operation increments must run on the event-loop thread.");
+ // Worker threads decrement this count with Interlocked from cancellation paths
+ // (see TryUntrackTrackedIoUringOperation), so the event-loop update must be
+ // interlocked as well; a plain read-modify-write here could lose a concurrent
+ // decrement.
+ Interlocked.Increment(ref _trackedIoUringOperationCount);
+ }
+
+ private void DecrementTrackedIoUringOperationCountOnEventLoop()
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "Tracked-operation decrements must run on the event-loop thread.");
+ int nextCount = Interlocked.Decrement(ref _trackedIoUringOperationCount);
+ Debug.Assert(nextCount >= 0, "Tracked-operation count underflow.");
+ }
+
+ /// Removes an operation from completion-slot tracking, logging on mismatch.
+ internal bool TryUntrackIoUringOperation(ulong userData, SocketAsyncContext.AsyncOperation? expectedOperation = null)
+ {
+ IoUringTrackedOperationRemoveResult removeResult = TryUntrackTrackedIoUringOperation(userData, expectedOperation, out _);
+ if (removeResult == IoUringTrackedOperationRemoveResult.Mismatch)
+ {
+ Debug.Fail("io_uring tracked operation mismatch while untracking user_data.");
+ long mismatchCount = Interlocked.Increment(ref _ioUringUntrackMismatchCount);
+ if ((mismatchCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled())
+ {
+ LogIoUringUntrackMismatch(userData, mismatchCount);
+ }
+
+ return false;
+ }
+
+ return true;
+ }
+
+ /// Attempts to replace the currently tracked operation for an existing user_data slot.
+ internal bool TryReplaceIoUringTrackedOperation(ulong userData, SocketAsyncContext.AsyncOperation newOperation)
+ {
+ // Replacement keeps the same slot+generation token; completion ownership is still
+ // resolved by the CompareExchange gate in TryTakeTrackedIoUringOperation.
+ return TryReplaceTrackedIoUringOperation(userData, newOperation);
+ }
+
+ /// Enqueues a user_data for ASYNC_CANCEL on the event loop thread.
+ private bool TryEnqueueIoUringCancellation(ulong userData)
+ {
+ if (!_ioUringCapabilities.IsCompletionMode || userData == 0 || Volatile.Read(ref _ioUringTeardownInitiated) != 0)
+ {
+ return false;
+ }
+
+ MpscQueue<ulong>? cancelQueue = _ioUringCancelQueue;
+ if (cancelQueue is null)
+ {
+ return false;
+ }
+
+ // First attempt: enqueue directly.
+ long queueLength = Interlocked.Increment(ref _ioUringCancelQueueLength);
+ if (queueLength <= s_ioUringCancellationQueueCapacity)
+ {
+ if (cancelQueue.TryEnqueue(userData))
+ {
+ return true;
+ }
+
+ Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ }
+ else
+ {
+ Interlocked.Decrement(ref _ioUringCancelQueueLength);
+ }
+
+ // Queue-full can be transient under cancellation bursts. Wake the event loop and retry briefly.
+#if DEBUG
+ // Keep a dedicated test counter so functional tests can verify the wake-and-retry path.
+ Interlocked.Increment(ref _testCancelQueueWakeRetryCount);
+#endif
+ WakeEventLoop();
+ // Retry while SpinWait remains in active-spin mode; once it would yield, take slow-path accounting.
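+ // Reduced to a standalone sketch, the loop below is the usual bounded active-spin
+ // retry idiom (TryReserveAndEnqueue is a hypothetical stand-in for the
+ // length-reserve-then-enqueue step above):
+ //
+ //     SpinWait spinner = default;
+ //     do
+ //     {
+ //         spinner.SpinOnce();
+ //         if (TryReserveAndEnqueue()) return true;
+ //     }
+ //     while (!spinner.NextSpinWillYield);
+ //
+ // SpinWait.NextSpinWillYield becomes true after a handful of iterations (and
+ // immediately on single-processor machines), which bounds the busy-wait before
+ // falling through to overflow accounting.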
+ SpinWait retryBackoff = default; + do + { + retryBackoff.SpinOnce(); + + queueLength = Interlocked.Increment(ref _ioUringCancelQueueLength); + if (queueLength <= s_ioUringCancellationQueueCapacity) + { + if (cancelQueue.TryEnqueue(userData)) + { + return true; + } + + Interlocked.Decrement(ref _ioUringCancelQueueLength); + continue; + } + + Interlocked.Decrement(ref _ioUringCancelQueueLength); + } while (!retryBackoff.NextSpinWillYield); + + long overflowCount = Interlocked.Increment(ref _ioUringCancelQueueOverflowCount); + SocketsTelemetry.Log.IoUringCancellationQueueOverflow(); + if ((overflowCount & DiagnosticSampleMask) == 1 && NetEventSource.Log.IsEnabled()) + { + LogIoUringCancellationQueueOverflow(overflowCount, s_ioUringCancellationQueueCapacity); + } + + return false; + } + + /// Writes an ASYNC_CANCEL SQE directly if the engine is on the event loop thread. + private bool TryQueueIoUringAsyncCancel(ulong userData) + { + if (!_ioUringCapabilities.IsIoUringPort || userData == 0) + { + return false; + } + + if (!TryAcquireManagedSqeWithRetry(out IoUringSqe* sqe, out _)) + { + return false; + } + + WriteAsyncCancelSqe(sqe, userData); + return true; + } + + /// Writes to the eventfd to wake the event loop from a blocking wait. + [MethodImpl(MethodImplOptions.NoInlining)] + private Interop.Error ManagedWakeEventLoop() + { + return Interop.Sys.IoUringShimWriteEventFd(_managedWakeupEventFd); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private long GetManagedCompletionWaitTimeoutNanos() + { + return Volatile.Read(ref _ioUringWakeFailureConsecutiveCount) >= IoUringWakeFailureCircuitBreakerThreshold + ? IoUringConstants.WakeFailureFallbackWaitTimeoutNanos + : IoUringConstants.BoundedWaitTimeoutNanos; + } + + /// Sends a coalesced wake signal to the event loop thread. + private void WakeEventLoop() + { + if (!_ioUringCapabilities.IsCompletionMode || Volatile.Read(ref _ioUringTeardownInitiated) != 0) + { + return; + } + + // Advance the wakeup generation. The event loop compares its snapshot to detect + // wakeups that arrived during the blocking syscall. Coalescing: only the thread + // that moves the counter from even to odd actually writes the eventfd. + uint gen = Interlocked.Increment(ref _ioUringWakeupGeneration); + if ((gen & 1) == 0) + { + // Another producer already advanced the generation and is responsible + // for the eventfd write. Our increment is visible to the consumer's + // post-wake generation comparison, so the work will be drained. + return; + } + + Interop.Error error = ManagedWakeEventLoop(); + if (error == Interop.Error.SUCCESS) + { + int previousFailureCount = Interlocked.Exchange(ref _ioUringWakeFailureConsecutiveCount, 0); + if (previousFailureCount >= IoUringWakeFailureCircuitBreakerThreshold && + NetEventSource.Log.IsEnabled()) + { + LogIoUringWakeCircuitBreakerStateChanged(enabled: false, previousFailureCount); + } + + return; + } + + int consecutiveFailures = Interlocked.Increment(ref _ioUringWakeFailureConsecutiveCount); + if (consecutiveFailures == IoUringWakeFailureCircuitBreakerThreshold && + NetEventSource.Log.IsEnabled()) + { + LogIoUringWakeCircuitBreakerStateChanged(enabled: true, consecutiveFailures); + } + + // Advance generation again so the next producer can attempt an eventfd write. + // (Moves from odd back to even, re-enabling the coalescing gate.) 
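+ // Worked example of the even/odd gate, starting from generation 0 (even):
+ //   producer A: Interlocked.Increment -> 1 (odd)  => A writes the eventfd
+ //   producer B: Interlocked.Increment -> 2 (even) => B skips; A's write covers it
+ //   consumer:   wakes, re-reads the generation (2) != its pre-wait snapshot,
+ //               so it drains again before blocking.
+ // If A's write fails, the increment below restores an even value so the next
+ // producer lands on odd and retries the eventfd write.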
+ Interlocked.Increment(ref _ioUringWakeupGeneration); + if (NetEventSource.Log.IsEnabled()) + { + LogIoUringWakeFailure(error); + } + } + + /// Enqueues a cancellation request and wakes the event loop. + internal void TryRequestIoUringCancellation(ulong userData) + { + if (!TryEnqueueIoUringCancellation(userData)) + { + return; + } + + WakeEventLoop(); + } + + /// + /// Enqueues a readiness fallback event when io_uring submission is congested. + /// + internal void EnqueueReadinessFallbackEvent( + SocketAsyncContext context, + Interop.Sys.SocketEvents events, + bool countAsPrepareQueueOverflowFallback = false) + { + if (events == Interop.Sys.SocketEvents.None) + { + return; + } + + _eventQueue.Enqueue(new SocketIOEvent(context, events)); + if (countAsPrepareQueueOverflowFallback) + { + RecordIoUringPrepareQueueOverflowFallback(); + } + EnsureWorkerScheduled(); + } + + /// Drains queued cancellation requests into ASYNC_CANCEL SQEs. + private bool DrainIoUringCancellationQueue() + { + MpscQueue? cancelQueue = _ioUringCancelQueue; + if (cancelQueue is null) + { + return false; + } + + bool preparedSqe = false; + for (int drained = 0; drained < MaxIoUringCancelQueueDrainPerSubmit && + cancelQueue.TryDequeue(out ulong userData); drained++) + { + long remainingLength = Interlocked.Decrement(ref _ioUringCancelQueueLength); + Debug.Assert(remainingLength >= 0); + + // Cancellation requests can race with terminal completion/untracking. + // Skip stale requests to avoid issuing known -ENOENT async-cancel SQEs. + if (!IsTrackedIoUringOperation(userData)) + { + continue; + } + + if (TryQueueIoUringAsyncCancel(userData)) + { + preparedSqe = true; + } + } + return preparedSqe; + } + + /// Drains both prepare and cancel queues, then submits all pending SQEs. + private Interop.Error SubmitIoUringBatch() + { + if (!_ioUringCapabilities.IsIoUringPort) + { + return Interop.Error.SUCCESS; + } + + Debug.Assert(IsCurrentThreadEventLoopThread(), + "SubmitIoUringBatch must only be called from the event loop thread (SINGLE_ISSUER contract)."); + bool preparedSqe = false; + if (_ioUringCapabilities.IsCompletionMode) + { + preparedSqe |= DrainIoUringCancellationQueue(); + + MpscQueue? prepareQueue = _ioUringPrepareQueue; + if (prepareQueue is null) + { + ThrowInternalException("io_uring invariant violation: prepare queue is null while engine is in completion mode"); + } + + for (int drained = 0; drained < MaxIoUringPrepareQueueDrainPerSubmit && + prepareQueue.TryDequeue(out IoUringPrepareWorkItem workItem); drained++) + { + long remainingLength = Interlocked.Decrement(ref _ioUringPrepareQueueLength); + Debug.Assert(remainingLength >= 0); + Interop.Error prepareError = TryPrepareAndTrackIoUringOperation( + workItem.Operation, + workItem.PrepareSequence, + out bool preparedOperation); + if (prepareError != Interop.Error.SUCCESS) + { + return prepareError; + } + + preparedSqe |= preparedOperation; + if (!preparedOperation && workItem.Operation.IsInWaitingState()) + { + if (IsPotentialCompletionSlotExhaustion()) + { + int retryCount = workItem.Operation.IncrementIoUringSlotExhaustionRetryCount(); + if (retryCount < MaxSlotExhaustionRetries && + workItem.Operation.TryQueueIoUringPreparation()) + { + continue; + } + } + + workItem.Operation.ResetIoUringSlotExhaustionRetryCount(); + EmitReadinessFallbackForUnpreparedOperation(workItem.Operation); + } + } + + } + + if (!preparedSqe) + { + // Inline re-prepare paths can write SQEs outside queue drains; ensure they are submitted. 
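+ // For example, a completion handler that re-arms an operation (multishot rebind,
+ // pending-retry re-prepare) writes its SQE during CQE dispatch rather than during
+ // this drain, so _ioUringManagedPendingSubmissions can be non-zero even on a pass
+ // where both queues were empty.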
+ if (_ioUringManagedPendingSubmissions != 0) + { + return SubmitIoUringOperationsNormalized(); + } + + if ((_ioUringCancelQueue?.IsEmpty == false) || (_ioUringPrepareQueue?.IsEmpty == false)) + { + WakeEventLoop(); + } + + return Interop.Error.SUCCESS; + } + + return SubmitIoUringOperationsNormalized(); + } + + /// + /// Prepares an operation for io_uring submission and tracks it in completion-slot metadata. + /// On non-prepared paths, clears operation user_data and releases preparation resources. + /// + private Interop.Error TryPrepareAndTrackIoUringOperation( + SocketAsyncContext.AsyncOperation operation, + long prepareSequence, + out bool preparedSqe) + { + preparedSqe = false; + + bool prepared = operation.TryPrepareIoUring(operation.AssociatedContext, prepareSequence); + if (prepared) + { + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Queued, + IoUringOperationLifecycleState.Prepared); + } + + if (prepared && operation.ErrorCode == SocketError.Success) + { + preparedSqe = true; + if (!TryTrackPreparedIoUringOperation(operation)) + { + // Invariant violation: tracking collision after prepare. + // A prepared SQE may now complete without a managed owner; do not attempt best-effort recovery. + operation.ClearIoUringUserData(); + ThrowInternalException("io_uring tracking collision: prepared SQE could not be tracked by user_data"); + } + + return Interop.Error.SUCCESS; + } + + if (prepared) + { + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Prepared, + IoUringOperationLifecycleState.Detached); + } + + if (!TryUntrackIoUringOperation(operation.IoUringUserData, operation)) + { + // Mismatch indicates token ownership confusion; avoid releasing + // resources that may still be associated with another tracked op. + ThrowInternalException("io_uring untrack mismatch: token ownership confusion during prepare cleanup"); + } + + operation.ClearIoUringUserData(); + return Interop.Error.SUCCESS; + } + + /// + /// Falls back to readiness notification for an operation that remained waiting after a failed prepare attempt. + /// + private void EmitReadinessFallbackForUnpreparedOperation(SocketAsyncContext.AsyncOperation operation) + { + operation.ClearIoUringUserData(); + Interop.Sys.SocketEvents fallbackEvents = operation.GetIoUringFallbackSocketEvents(); + if (fallbackEvents == Interop.Sys.SocketEvents.None) + { + return; + } + + if (NetEventSource.Log.IsEnabled()) + { + LogIoUringPrepareFallbackToReadiness(fallbackEvents); + } + + EnqueueReadinessFallbackEvent(operation.AssociatedContext, fallbackEvents); + + [MethodImpl(MethodImplOptions.NoInlining)] + void LogIoUringPrepareFallbackToReadiness(Interop.Sys.SocketEvents events) + { + NetEventSource.Error( + this, + $"io_uring prepare fallback to readiness notification: events={events}"); + } + } + + /// Registers a prepared operation in completion-slot metadata. 
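+ // The publication protocol used below, reduced to two fields (writer on the event
+ // loop, readers on any thread; names are illustrative):
+ //
+ //     // writer                               // reader
+ //     Volatile.Write(ref gen, g);             op = Volatile.Read(ref slot);
+ //     Volatile.Write(ref slot, operation);    ok = op != null && Volatile.Read(ref gen) == g;
+ //
+ // Because the generation is released before the operation, a reader that observes
+ // the operation can never pair it with an older slot incarnation's generation.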
+ private bool TryTrackPreparedIoUringOperation(SocketAsyncContext.AsyncOperation operation)
+ {
+ Debug.Assert(IsCurrentThreadEventLoopThread(),
+ "TryTrackPreparedIoUringOperation must run on the event-loop thread.");
+ if (!TryDecodeTrackedIoUringUserData(operation.IoUringUserData, out int slotIndex, out ulong generation))
+ {
+ return false;
+ }
+
+ ref IoUringTrackedOperationState entry = ref _trackedOperations![slotIndex];
+ if (Volatile.Read(ref entry.TrackedOperationGeneration) == 0 &&
+ Volatile.Read(ref entry.TrackedOperation) is null)
+ {
+ // Publish generation before operation so readers never observe a new
+ // operation paired with a stale generation on weakly-ordered CPUs.
+ Volatile.Write(ref entry.TrackedOperationGeneration, generation);
+ Volatile.Write(ref entry.TrackedOperation, operation);
+ IncrementTrackedIoUringOperationCountOnEventLoop();
+ AssertIoUringLifecycleTransition(
+ IoUringOperationLifecycleState.Prepared,
+ IoUringOperationLifecycleState.Submitted);
+ return true;
+ }
+
+ if (Volatile.Read(ref entry.TrackedOperation) is null &&
+ Volatile.Read(ref entry.TrackedOperationGeneration) == generation)
+ {
+ Volatile.Write(ref entry.TrackedOperationGeneration, 0);
+ }
+
+ // Persistent multishot receive can rebind an existing tracked user_data to a new
+ // managed operation before this call. In that case, tracking is already satisfied.
+ return operation.IoUringUserData != 0 &&
+ TryGetTrackedIoUringOperation(operation.IoUringUserData, out SocketAsyncContext.AsyncOperation? trackedOperation) &&
+ ReferenceEquals(trackedOperation, operation);
+ }
+
+ /// Returns whether the given user_data is currently tracked.
+ private bool IsTrackedIoUringOperation(ulong userData)
+ {
+ return ContainsTrackedIoUringOperation(userData);
+ }
+
+ /// Returns whether current completion-slot usage indicates likely slot exhaustion pressure.
+ private bool IsPotentialCompletionSlotExhaustion()
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null || completionEntries.Length == 0)
+ {
+ return false;
+ }
+
+ int threshold = Math.Max(0, completionEntries.Length - 16);
+ return _completionSlotsInUse >= threshold;
+ }
+
+ /// Debug assertion that tracked completion-slot usage never exceeds pool bounds.
+ [Conditional("DEBUG")]
+ private void AssertCompletionSlotUsageBounded()
+ {
+ IoUringCompletionSlot[]? completionEntries = _completionSlots;
+ if (completionEntries is null)
+ {
+ Debug.Assert(
+ _completionSlotsInUse == 0,
+ "Completion slot usage must be zero when the slot pool is not allocated.");
+ return;
+ }
+
+ Debug.Assert(
+ _completionSlotsInUse >= 0 && _completionSlotsInUse <= completionEntries.Length,
+ $"Completion slot usage out of bounds: inUse={_completionSlotsInUse}, capacity={completionEntries.Length}.");
+ }
+
+ /// Debug assertion that completion-slot free-list topology matches _completionSlotsInUse.
+ [Conditional("DEBUG")]
+ private void AssertCompletionSlotPoolConsistency()
+ {
+ IoUringCompletionSlot[]?
completionEntries = _completionSlots; + if (completionEntries is null) + { + Debug.Assert(_completionSlotsInUse == 0, "Completion slot usage must be zero when slots are not allocated."); + Debug.Assert(_completionSlotFreeListHead == -1, "Free-list head must be reset when slots are not allocated."); + return; + } + + bool[] visited = new bool[completionEntries.Length]; + int freeCount = 0; + int current = _completionSlotFreeListHead; + while (current >= 0) + { + Debug.Assert( + (uint)current < (uint)completionEntries.Length, + $"Completion-slot free-list index out of range: {current}."); + if ((uint)current >= (uint)completionEntries.Length || visited[current]) + { + break; + } + + visited[current] = true; + freeCount++; + current = completionEntries[current].FreeListNext; + } + + int expectedInUse = completionEntries.Length - freeCount; + Debug.Assert( + expectedInUse == _completionSlotsInUse, + $"Completion-slot accounting mismatch: expected in-use={expectedInUse}, actual in-use={_completionSlotsInUse}, free={freeCount}, capacity={completionEntries.Length}."); + } + + /// Returns whether the calling thread is the event loop thread. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool IsCurrentThreadEventLoopThread() => + Volatile.Read(ref _eventLoopManagedThreadId) == Environment.CurrentManagedThreadId; + + /// Returns whether a submit error indicates an unsupported operation rather than a real failure. + private static bool IsIgnoredIoUringSubmitError(Interop.Error error) => + error == Interop.Error.ENOSYS || + error == Interop.Error.ENOTSUP || + error == Interop.Error.EOPNOTSUPP || + error == Interop.Error.EINTR || + error == Interop.Error.EPERM; + + /// Disables the registered ring fd after an EINVAL and falls back to the raw ring fd. + private void DisableRegisteredRingFd() + { + _ioUringSqRingInfo.RegisteredRingFd = -1; + if (NetEventSource.Log.IsEnabled()) + { + LogIoUringRegisteredRingFdDisabled(); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringRegisteredRingFdDisabled() => + NetEventSource.Error(this, "io_uring registered ring fd disabled after EINVAL; falling back to raw ring fd."); + + [MethodImpl(MethodImplOptions.NoInlining)] + private void LogIoUringSubmitErrorDrained(uint rejectedCount, Interop.Error error) => + NetEventSource.Error(this, $"io_uring submit returned {error}: draining {rejectedCount} rejected SQE(s) as failed completions."); + + /// + /// Completes rejected-but-published SQEs as failed completions so ignored submit + /// errors do not re-queue the same work indefinitely. 
+ /// + private unsafe void DrainRejectedManagedSqesAsFailedCompletions(uint rejectedSubmitCount, Interop.Error submitError) + { + Debug.Assert(IsCurrentThreadEventLoopThread(), + "DrainRejectedManagedSqesAsFailedCompletions must run on the event-loop thread."); + if (rejectedSubmitCount == 0) + { + return; + } + + ref Interop.Sys.IoUringSqRingInfo ringInfo = ref _ioUringSqRingInfo; + if (ringInfo.SqeBase == IntPtr.Zero || ringInfo.SqEntries == 0 || ringInfo.SqeSize < (uint)sizeof(IoUringSqe)) + { + return; + } + + int completionResult = -Interop.Sys.ConvertErrorPalToPlatform(submitError); + uint firstRejectedSqTail = _ioUringManagedSqTail - rejectedSubmitCount; + SocketEventHandler handler = new SocketEventHandler(this); + bool enqueuedFallbackEvent = false; + + for (uint i = 0; i < rejectedSubmitCount; i++) + { + uint sqTail = firstRejectedSqTail + i; + uint ringIndex = sqTail & ringInfo.SqMask; + nint sqeOffset = checked((nint)((nuint)ringIndex * ringInfo.SqeSize)); + IoUringSqe* sqe = (IoUringSqe*)((byte*)ringInfo.SqeBase + sqeOffset); + ulong sqeUserData = sqe->UserData; + byte tag = (byte)(sqeUserData >> IoUringUserDataTagShift); + + if (tag == IoUringConstants.TagReservedCompletion) + { + handler.DispatchSingleIoUringCompletion( + sqeUserData, + completionResult, + flags: 0, + socketAddressLen: 0, + controlBufferLen: 0, + auxiliaryData: 0, + hasFixedRecvBuffer: false, + fixedRecvBufferId: 0, + ref enqueuedFallbackEvent); + } + else if (tag != IoUringConstants.TagNone && tag != IoUringConstants.TagWakeupSignal) + { + Debug.Fail($"Unexpected io_uring SQE user_data tag on rejected submit drain: {tag}."); + } + } + + if (enqueuedFallbackEvent) + { + EnsureWorkerScheduled(); + } + } + + /// Returns the accepted SQE count from an io_uring_enter result, clamped to the requested submit count. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint ComputeAcceptedSubmissionCount(uint requestedSubmitCount, int enterResult) + { + if (requestedSubmitCount == 0 || enterResult <= 0) + { + return 0; + } + + uint acceptedSubmitCount = (uint)enterResult; + return acceptedSubmitCount <= requestedSubmitCount ? acceptedSubmitCount : requestedSubmitCount; + } + + /// Updates pending-submission accounting after an io_uring_enter wait call. + private void UpdateManagedPendingSubmissionCountAfterEnter(uint requestedSubmitCount, int enterResult) + { + if (_sqPollEnabled) + { + // SQPOLL consumes published SQEs asynchronously after wakeup. + _ioUringManagedPendingSubmissions = 0; + return; + } + + uint acceptedSubmitCount = ComputeAcceptedSubmissionCount(requestedSubmitCount, enterResult); + uint rejectedSubmitCount = requestedSubmitCount - acceptedSubmitCount; + Debug.Assert( + acceptedSubmitCount + rejectedSubmitCount == requestedSubmitCount, + "Partial-submit accounting mismatch in io_uring wait path."); + _ioUringManagedPendingSubmissions = rejectedSubmitCount; + } + + /// Submits the specified number of pending SQEs via io_uring_enter. 
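+ // Worked example of the accounting above: with requestedSubmitCount = 8 and an
+ // io_uring_enter result of 5, accepted = 5 and rejected = 3. The submit loop below
+ // re-enters with toSubmit = 3 instead of treating the shortfall as a failure, and
+ // surfaces EAGAIN only when the kernel accepts nothing while work remains.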
+ [MethodImpl(MethodImplOptions.NoInlining)] + private unsafe Interop.Error ManagedSubmitPendingEntries( + uint toSubmit, + out uint acceptedSubmitCount) + { + acceptedSubmitCount = 0; + if (toSubmit == 0) + { + return Interop.Error.SUCCESS; + } + + Debug.Assert(IsCurrentThreadEventLoopThread(), + "ManagedSubmitPendingEntries must only be called from the event loop thread (SINGLE_ISSUER contract)."); + if (TryConsumeDebugForcedSubmitError(out Interop.Error forcedSubmitError)) + { + return forcedSubmitError; + } + + if (_sqPollEnabled) + { + if (!SqNeedWakeup()) + { + SocketsTelemetry.Log.IoUringSqPollSubmissionSkipped(toSubmit); + acceptedSubmitCount = toSubmit; + return Interop.Error.SUCCESS; + } + + uint wakeupFlags = IoUringConstants.EnterSqWakeup; + int wakeupRingFd = _managedRingFd; + if (_ioUringSqRingInfo.RegisteredRingFd >= 0) + { + wakeupFlags |= IoUringConstants.EnterRegisteredRing; + wakeupRingFd = _ioUringSqRingInfo.RegisteredRingFd; + } + + if (NetEventSource.Log.IsEnabled()) + { + LogSqPollWakeup(this, toSubmit); + } + + // Wakeup accounting is intentionally optimistic: this counter tracks wake requests + // issued by managed code, not guaranteed kernel-side SQ consumption. + SocketsTelemetry.Log.IoUringSqPollWakeup(); + int wakeupResult; + Interop.Error wakeupError = Interop.Sys.IoUringShimEnter(wakeupRingFd, 0, 0, wakeupFlags, &wakeupResult); + if (wakeupError == Interop.Error.EINVAL && (wakeupFlags & IoUringConstants.EnterRegisteredRing) != 0) + { + DisableRegisteredRingFd(); + wakeupFlags &= ~IoUringConstants.EnterRegisteredRing; + wakeupRingFd = _managedRingFd; + wakeupError = Interop.Sys.IoUringShimEnter(wakeupRingFd, 0, 0, wakeupFlags, &wakeupResult); + } + + if (wakeupError == Interop.Error.SUCCESS) + { + acceptedSubmitCount = toSubmit; + } + + return wakeupError; + } + + uint enterFlags = 0; + int ringFd = _managedRingFd; + if (_ioUringSqRingInfo.RegisteredRingFd >= 0) + { + enterFlags |= IoUringConstants.EnterRegisteredRing; + ringFd = _ioUringSqRingInfo.RegisteredRingFd; + } + + while (toSubmit > 0) + { + int result; + Interop.Error err = Interop.Sys.IoUringShimEnter(ringFd, toSubmit, 0, enterFlags, &result); + if (err == Interop.Error.EINVAL && (enterFlags & IoUringConstants.EnterRegisteredRing) != 0) + { + DisableRegisteredRingFd(); + enterFlags &= ~IoUringConstants.EnterRegisteredRing; + ringFd = _managedRingFd; + err = Interop.Sys.IoUringShimEnter(ringFd, toSubmit, 0, enterFlags, &result); + } + + if (err != Interop.Error.SUCCESS) + return err; + + uint acceptedThisCall = ComputeAcceptedSubmissionCount(toSubmit, result); + if (acceptedThisCall == 0) + { + return Interop.Error.EAGAIN; + } + + acceptedSubmitCount += acceptedThisCall; + toSubmit -= acceptedThisCall; + } + return Interop.Error.SUCCESS; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static void LogSqPollWakeup(SocketAsyncEngine engine, uint pendingSubmissionCount) => + NetEventSource.Info(engine, $"io_uring SQPOLL wakeup requested for pending SQEs: {pendingSubmissionCount}"); + + /// Computes pending submissions and calls ManagedSubmitPendingEntries. 
+ private Interop.Error SubmitIoUringOperationsNormalized() + { + Debug.Assert(IsCurrentThreadEventLoopThread(), + "SubmitIoUringOperationsNormalized must only be called from the event loop thread (SINGLE_ISSUER contract)."); + PublishManagedSqeTail(); + uint managedPending = _ioUringManagedPendingSubmissions; + _ioUringManagedPendingSubmissions = 0; + + Interop.Error error = ManagedSubmitPendingEntries(managedPending, out uint acceptedSubmitCount); + uint rejectedSubmitCount = managedPending - acceptedSubmitCount; + Debug.Assert( + acceptedSubmitCount + rejectedSubmitCount == managedPending, + "Partial-submit accounting mismatch in io_uring submit path."); + + // EFAULT indicates corrupted SQ ring memory; propagate to FailFast. + // All other errors drain rejected SQEs as failed completions so individual + // operations receive error callbacks and the engine survives. + bool fatalSubmitError = error == Interop.Error.EFAULT; + if (error != Interop.Error.SUCCESS && rejectedSubmitCount != 0 && !fatalSubmitError) + { + if (!IsIgnoredIoUringSubmitError(error)) + { + LogIoUringSubmitErrorDrained(rejectedSubmitCount, error); + } + + DrainRejectedManagedSqesAsFailedCompletions(rejectedSubmitCount, error); + } + + return fatalSubmitError ? error : Interop.Error.SUCCESS; + } + + /// Cancels all queued-but-not-submitted operations during teardown. + private void DrainQueuedIoUringOperationsForTeardown() + { + MpscQueue? prepareQueue = _ioUringPrepareQueue; + if (prepareQueue is not null) + { + while (prepareQueue.TryDequeue(out IoUringPrepareWorkItem workItem)) + { + long remainingLength = Interlocked.Decrement(ref _ioUringPrepareQueueLength); + Debug.Assert(remainingLength >= 0); + + SocketAsyncContext.AsyncOperation operation = workItem.Operation; + operation.CancelPendingIoUringPreparation(workItem.PrepareSequence); + operation.TryCancelForTeardown(); + operation.ClearIoUringUserData(); + } + } + + MpscQueue? cancelQueue = _ioUringCancelQueue; + if (cancelQueue is not null) + { + while (cancelQueue.TryDequeue(out _)) + { + long remainingLength = Interlocked.Decrement(ref _ioUringCancelQueueLength); + Debug.Assert(remainingLength >= 0); + } + } + + // No reset needed for generation counter; teardown does not re-enter the wait loop. + } + + /// + /// Cancels all tracked in-flight operations during teardown. + /// This includes any future long-lived operations (for example multishot recv). + /// + private void DrainTrackedIoUringOperationsForTeardown(bool portClosedForTeardown) + { + Debug.Assert(IsCurrentThreadEventLoopThread(), + "DrainTrackedIoUringOperationsForTeardown must run on the event-loop thread."); + if (_completionSlots is null || IsIoUringTrackingEmpty()) + { + return; + } + + if (_cqOverflowRecoveryActive) + { + // Phase 1 spec branch (b): teardown preempts overflow-recovery ownership; + // tracked-operation drain/cancel paths become the single shutdown owner. + _cqOverflowRecoveryBranch = IoUringCqOverflowRecoveryBranch.Teardown; + _cqOverflowRecoveryActive = false; + if (NetEventSource.Log.IsEnabled()) + { + LogIoUringCqOverflowRecoveryTeardownPreempted(); + } + } + + bool queuedAsyncCancel = false; + bool canPrepareTeardownCancels = !portClosedForTeardown && IsCurrentThreadEventLoopThread(); + IoUringTrackedOperationState[]? trackedOperations = _trackedOperations; + if (trackedOperations is null) + { + return; + } + + // Teardown uses an explicit array walk to avoid iterator state-machine allocations. 
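+ // Each slot is claimed with Interlocked.Exchange rather than read-then-clear so this
+ // walk and a concurrent cancellation-path CAS (TryUntrackTrackedIoUringOperation)
+ // can never both take the same operation: exactly one observes the non-null reference.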
+ for (int i = 0; i < trackedOperations.Length; i++) + { + SocketAsyncContext.AsyncOperation? operation = Interlocked.Exchange(ref trackedOperations[i].TrackedOperation, null); + if (operation is null) + { + continue; + } + + Volatile.Write(ref trackedOperations[i].TrackedOperationGeneration, 0UL); + DecrementTrackedIoUringOperationCountOnEventLoop(); + AssertIoUringLifecycleTransition( + IoUringOperationLifecycleState.Submitted, + IoUringOperationLifecycleState.Detached); + + ulong userData = operation.IoUringUserData; + if (canPrepareTeardownCancels && + TryQueueIoUringAsyncCancel(userData)) + { + queuedAsyncCancel = true; + } + + // Teardown policy: if the port was already closed, native ownership has been + // detached and it is now safe to release operation-owned resources eagerly. + // Otherwise, queue best-effort async cancel before releasing resources. + operation.TryCancelForTeardown(); + operation.ClearIoUringUserData(); + } + + if (canPrepareTeardownCancels && queuedAsyncCancel) + { + Interop.Error submitError = SubmitIoUringOperationsNormalized(); + if (submitError != Interop.Error.SUCCESS) + { + if (NetEventSource.Log.IsEnabled()) LogIoUringAsyncCancelSubmitFailure(submitError, IoUringCancellationOrigin.Teardown); + } + } + } + + /// Increments the late-completion counter and samples to the log. + private void RecordBenignLateIoUringCompletion(ulong userData) + { + RecordIoUringCounterAndMaybeLog(ref _ioUringBenignLateCompletionCount, userData, "io_uring completion arrived after managed untrack"); + } + + /// Increments the diagnostic counter tracking pending completion retries that queued prepare work. + private void RecordIoUringPendingRetryQueuedToPrepareQueue() + { + Interlocked.Increment(ref _ioUringPendingRetryQueuedToPrepareQueueCount); + } + + /// Increments the non-pinnable prepare fallback counter for this engine instance. + internal void RecordIoUringNonPinnablePrepareFallback() + { + Interlocked.Increment(ref _ioUringNonPinnablePrepareFallbackCount); + } + + /// Increments the completion-slot exhaustion counter. + private void RecordIoUringCompletionSlotExhaustion() + { + Interlocked.Increment(ref _ioUringCompletionSlotExhaustionCount); + } + + /// Increments the completion-slot drain-recovery counter. + private void RecordIoUringCompletionSlotDrainRecovery() + { + Interlocked.Increment(ref _ioUringCompletionSlotDrainRecoveryCount); + } + + /// Increments the prepare-queue overflow fallback counter. + private void RecordIoUringPrepareQueueOverflowFallback() + { + Interlocked.Increment(ref _ioUringPrepareQueueOverflowFallbackCount); + } + + /// Increments the requeue-failure counter and samples to the log. 
+ private void RecordIoUringCompletionRequeueFailure(ulong userData) + { + RecordIoUringCounterAndMaybeLog(ref _ioUringCompletionRequeueFailureCount, userData, "io_uring completion requeue failed; queued readiness fallback"); + } + + + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs index ae9b6c9095e43f..b6f4494980fd95 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs @@ -4,20 +4,22 @@ using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Threading; namespace System.Net.Sockets { - internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem + internal sealed unsafe partial class SocketAsyncEngine : IThreadPoolWorkItem { - private const int EventBufferCount = + private const int DefaultEventBufferCount = #if DEBUG 32; #else 1024; #endif + private static readonly int s_eventBufferCount = GetEventBufferCount(); // Socket continuations are dispatched to the ThreadPool from the event thread. // This avoids continuations blocking the event handling. @@ -25,9 +27,55 @@ internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem // PreferInlineCompletions defaults to false and can be set to true using the DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS envvar. internal static readonly bool InlineSocketCompletionsEnabled = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS") == "1"; +#if DEBUG + /// + /// Central registry of DEBUG-only io_uring test environment variables. + /// These switches are intentionally unsupported for production tuning. 
+ /// + private static class IoUringTestEnvironmentVariables + { + internal const string EventBufferCount = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_EVENT_BUFFER_COUNT"; + internal const string QueueEntries = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_QUEUE_ENTRIES"; + internal const string PrepareQueueCapacity = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PREPARE_QUEUE_CAPACITY"; + internal const string DirectSqe = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE"; + internal const string ZeroCopySend = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND"; + internal const string ForceEagainOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_EAGAIN_ONCE_MASK"; + internal const string ForceEcanceledOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ECANCELED_ONCE_MASK"; + internal const string ForceSubmitEpermOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_SUBMIT_EPERM_ONCE"; + internal const string ForceEnterEintrRetryLimitOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ENTER_EINTR_RETRY_LIMIT_ONCE"; + internal const string ForceKernelVersionUnsupported = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_KERNEL_VERSION_UNSUPPORTED"; + internal const string ForceProvidedBufferRingOomOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_PROVIDED_BUFFER_RING_OOM_ONCE"; + internal const string ProvidedBufferSize = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PROVIDED_BUFFER_SIZE"; + internal const string AdaptiveBufferSizing = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ADAPTIVE_BUFFER_SIZING"; + internal const string RegisterBuffers = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS"; + } +#endif + + private static int GetEventBufferCount() + { +#if DEBUG + // Test-only knob to make wait-buffer saturation deterministic for io_uring diagnostics coverage. + // Only available in DEBUG builds so production code never reads test env vars. + if (OperatingSystem.IsLinux()) + { + string? configuredValue = Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.EventBufferCount); + if (configuredValue is not null && + int.TryParse(configuredValue, out int parsedValue) && + parsedValue >= 1 && + parsedValue <= DefaultEventBufferCount) + { + return parsedValue; + } + } +#endif + + return DefaultEventBufferCount; + } + private static int GetEngineCount() { // The responsibility of SocketAsyncEngine is to get notifications from epoll|kqueue + // (or io_uring on Linux when enabled in the native shim) // and schedule corresponding work items to ThreadPool (socket reads and writes). // // Using TechEmpower benchmarks that generate a LOT of SMALL socket reads and writes under a VERY HIGH load @@ -85,11 +133,12 @@ private static SocketAsyncEngine[] CreateEngines() private readonly IntPtr _port; private readonly Interop.Sys.SocketEvent* _buffer; + private int _eventLoopManagedThreadId; // // Queue of events generated by EventLoop() that would be processed by the thread pool // - private readonly ConcurrentQueue _eventQueue = new ConcurrentQueue(); + private readonly SocketIOEventQueue _eventQueue = new SocketIOEventQueue(); // This flag is used for communication between item enqueuing and workers that process the items. 
         private static int GetEngineCount()
         {
             // The responsibility of SocketAsyncEngine is to get notifications from epoll|kqueue
+            // (or io_uring on Linux when enabled in the native shim)
             // and schedule corresponding work items to ThreadPool (socket reads and writes).
             //
             // Using TechEmpower benchmarks that generate a LOT of SMALL socket reads and writes under a VERY HIGH load
@@ -85,11 +133,12 @@ private static SocketAsyncEngine[] CreateEngines()
         private readonly IntPtr _port;
         private readonly Interop.Sys.SocketEvent* _buffer;
+        private int _eventLoopManagedThreadId;

         //
         // Queue of events generated by EventLoop() that would be processed by the thread pool
         //
-        private readonly ConcurrentQueue<SocketIOEvent> _eventQueue = new ConcurrentQueue<SocketIOEvent>();
+        private readonly SocketIOEventQueue _eventQueue = new SocketIOEventQueue();

         // This flag is used for communication between item enqueuing and workers that process the items.
         // There are two states of this flag:
@@ -143,8 +192,20 @@ private bool TryRegisterCore(IntPtr socketHandle, SocketAsyncContext context, ou
                 context.GlobalContextIndex = index;
             }

-            error = Interop.Sys.TryChangeSocketEventRegistration(_port, socketHandle, Interop.Sys.SocketEvents.None,
-                Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write, context.GlobalContextIndex);
+            Interop.Error managedError = default;
+            bool managedHandled = false;
+            LinuxTryChangeSocketEventRegistration(socketHandle, Interop.Sys.SocketEvents.None,
+                Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write,
+                context.GlobalContextIndex, ref managedError, ref managedHandled);
+            if (managedHandled)
+            {
+                error = managedError;
+            }
+            else
+            {
+                error = Interop.Sys.TryChangeSocketEventRegistration(_port, socketHandle, Interop.Sys.SocketEvents.None,
+                    Interop.Sys.SocketEvents.Read | Interop.Sys.SocketEvents.Write, context.GlobalContextIndex);
+            }
             if (error == Interop.Error.SUCCESS)
             {
                 return true;
@@ -182,19 +243,21 @@ private SocketAsyncEngine()
                 err = Interop.Sys.CreateSocketEventPort(portPtr);
                 if (err != Interop.Error.SUCCESS)
                 {
-                    throw new InternalException(err);
+                    ThrowInternalException(err);
                 }
             }

             fixed (Interop.Sys.SocketEvent** bufferPtr = &_buffer)
             {
-                err = Interop.Sys.CreateSocketEventBuffer(EventBufferCount, bufferPtr);
+                err = Interop.Sys.CreateSocketEventBuffer(s_eventBufferCount, bufferPtr);
                 if (err != Interop.Error.SUCCESS)
                 {
-                    throw new InternalException(err);
+                    ThrowInternalException(err);
                 }
             }

+            LinuxDetectAndInitializeIoUring();
+
             var thread = new Thread(static s => ((SocketAsyncEngine)s!).EventLoop())
             {
                 IsBackground = true,
@@ -204,37 +267,92 @@
         }
         catch
         {
+            // Constructor failure path only: if construction throws, clean up immediately.
+            // This path is the sole caller of FreeNativeResources().
             FreeNativeResources();
             throw;
         }
     }

+        partial void LinuxDetectAndInitializeIoUring();
+        partial void LinuxEventLoopEnableRings();
+        partial void LinuxEventLoopBeforeWait();
+        partial void LinuxEventLoopTryCompletionWait(SocketEventHandler handler, ref int numEvents, ref int numCompletions, ref Interop.Error err, ref bool waitHandled);
+        partial void LinuxEventLoopAfterIteration();
+        partial void LinuxBeforeFreeNativeResources(ref bool closeSocketEventPort);
+        partial void LinuxFreeIoUringResources();
+        partial void LinuxTryChangeSocketEventRegistration(IntPtr socketHandle, Interop.Sys.SocketEvents currentEvents, Interop.Sys.SocketEvents newEvents, int data, ref Interop.Error error, ref bool handled);
+
+        [DoesNotReturn]
+        [StackTraceHidden]
+        private static void ThrowInternalException(Interop.Error error) =>
+            throw new InternalException(error);
+
+        [DoesNotReturn]
+        [StackTraceHidden]
+        private static void ThrowInternalException(string message) =>
+            throw new InternalException(message);
+
+        [DoesNotReturn]
+        [StackTraceHidden]
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        private static void FailFastEventLoop(Exception exception) =>
+            Environment.FailFast($"Exception thrown from SocketAsyncEngine event loop: {exception}", exception);
+
+        private void RecordAndAssertEventLoopThreadIdentity()
+        {
+            int currentThreadId = Environment.CurrentManagedThreadId;
+#if DEBUG
+            int previousThreadId = Interlocked.CompareExchange(ref _eventLoopManagedThreadId, currentThreadId, 0);
+            Debug.Assert(
+                previousThreadId == 0 || previousThreadId == currentThreadId,
+                $"SocketAsyncEngine event loop thread changed: previous={previousThreadId}, current={currentThreadId}");
+#else
+            Interlocked.CompareExchange(ref _eventLoopManagedThreadId, currentThreadId, 0);
+#endif
+        }
+
         private void EventLoop()
         {
             try
             {
+                RecordAndAssertEventLoopThreadIdentity();
+                LinuxEventLoopEnableRings();
                 SocketEventHandler handler = new SocketEventHandler(this);
                 while (true)
                 {
-                    int numEvents = EventBufferCount;
-                    Interop.Error err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents);
+                    LinuxEventLoopBeforeWait();
+
+                    int numEvents = s_eventBufferCount;
+                    int numCompletions = 0;
+                    Interop.Error err = default;
+                    bool waitHandled = false;
+                    LinuxEventLoopTryCompletionWait(handler, ref numEvents, ref numCompletions, ref err, ref waitHandled);
+                    if (!waitHandled)
+                    {
+                        err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents);
+                    }
+
                     if (err != Interop.Error.SUCCESS)
                     {
-                        throw new InternalException(err);
+                        ThrowInternalException(err);
                     }

-                    // The native shim is responsible for ensuring this condition.
-                    Debug.Assert(numEvents > 0, $"Unexpected numEvents: {numEvents}");
+                    // io_uring completion-mode wait can return with zero surfaced events/completions
+                    // when woken only to flush managed prepare/cancel queues.
+                    Debug.Assert(waitHandled || numEvents > 0 || numCompletions > 0, $"Unexpected wait result: events={numEvents}, completions={numCompletions}");

-                    if (handler.HandleSocketEvents(numEvents))
+                    if (numEvents > 0 && handler.HandleSocketEvents(numEvents))
                     {
                         EnsureWorkerScheduled();
                     }
+
+                    LinuxEventLoopAfterIteration();
                 }
             }
             catch (Exception e)
             {
-                Environment.FailFast("Exception thrown from SocketAsyncEngine event loop: " + e.ToString(), e);
+                FailFastEventLoop(e);
             }
         }
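// The Linux hooks declared above compile away entirely on other platforms: a C# "partial void"
// method with no implementing declaration is erased, along with every call to it. A reduced
// sketch of the pattern (names here are illustrative, not part of this change):
//
//     internal sealed partial class Engine
//     {
//         partial void OnBeforeWait();      // declaration only; no body required
//
//         private void Loop()
//         {
//             OnBeforeWait();               // removed by the compiler when unimplemented
//         }
//     }
//
//     // A platform-specific file opts in by supplying the body:
//     //     partial void OnBeforeWait() { /* Linux-only work */ }
//
// This is also why the hooks thread `ref` parameters (err, waitHandled) through: ref is
// permitted on such partial methods, while out parameters are not.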
@@ -257,7 +375,7 @@ void IThreadPoolWorkItem.Execute()
             // Checking for items must happen after resetting the processing state.
             Interlocked.MemoryBarrier();

-            ConcurrentQueue<SocketIOEvent> eventQueue = _eventQueue;
+            SocketIOEventQueue eventQueue = _eventQueue;
             if (!eventQueue.TryDequeue(out SocketIOEvent ev))
             {
                 return;
             }
@@ -295,11 +413,22 @@ void IThreadPoolWorkItem.Execute()
         private void FreeNativeResources()
         {
+            Debug.Assert(
+                Volatile.Read(ref _eventLoopManagedThreadId) == 0,
+                "FreeNativeResources is only used by constructor-failure cleanup; event loop thread must not have started.");
+
             bool closeSocketEventPort = true;
+            // Linux io_uring teardown may need to close the port first to ensure native
+            // ownership is detached before managed operation resources are released.
+            LinuxBeforeFreeNativeResources(ref closeSocketEventPort);
+
+            LinuxFreeIoUringResources();
+
             if (_buffer != null)
             {
                 Interop.Sys.FreeSocketEventBuffer(_buffer);
             }
-            if (_port != (IntPtr)(-1))
+
+            if (closeSocketEventPort && _port != (IntPtr)(-1))
             {
                 Interop.Sys.CloseSocketEventPort(_port);
             }
@@ -310,14 +439,16 @@ private void FreeNativeResources()
         // To avoid this, the event handling logic is delegated to a non-inlined processing method.
         // See discussion: https://github.com/dotnet/runtime/issues/37064
         // SocketEventHandler holds an on-stack cache of SocketAsyncEngine members needed by the handler method.
-        private readonly struct SocketEventHandler
+        private readonly partial struct SocketEventHandler
         {
             public Interop.Sys.SocketEvent* Buffer { get; }

-            private readonly ConcurrentQueue<SocketIOEvent> _eventQueue;
+            private readonly SocketIOEventQueue _eventQueue;
+            private readonly SocketAsyncEngine _engine;

             public SocketEventHandler(SocketAsyncEngine engine)
             {
+                _engine = engine;
                 Buffer = engine._buffer;
                 _eventQueue = engine._eventQueue;
             }
@@ -358,6 +489,25 @@ public bool HandleSocketEvents(int numEvents)
             }
         }

+        private sealed class SocketIOEventQueue
+        {
+#if TARGET_LINUX
+            private readonly MpscQueue<SocketIOEvent> _queue = new MpscQueue<SocketIOEvent>();
+#else
+            private readonly ConcurrentQueue<SocketIOEvent> _queue = new ConcurrentQueue<SocketIOEvent>();
+#endif
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            // Event delivery cannot drop entries. Use Enqueue's retrying contract here;
+            // io_uring prepare/cancel queues use TryEnqueue where fallback paths exist.
+            public void Enqueue(SocketIOEvent socketEvent) => _queue.Enqueue(socketEvent);
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public bool TryDequeue(out SocketIOEvent socketEvent) => _queue.TryDequeue(out socketEvent);
+
+            public bool IsEmpty => _queue.IsEmpty;
+        }
+
         private readonly struct SocketIOEvent
         {
             public SocketAsyncContext Context { get; }
diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs
new file mode 100644
index 00000000000000..38d7ef78334b34
--- /dev/null
+++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.IoUring.Linux.cs
@@ -0,0 +1,12 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+namespace System.Net.Sockets
+{
+    internal static partial class SocketPal
+    {
+        /// Extracts <see cref="IPPacketInformation"/> from a completed io_uring recvmsg message header.
+ internal static unsafe IPPacketInformation GetIoUringIPPacketInformation(Interop.Sys.MessageHeader* messageHeader, bool isIPv4, bool isIPv6) => + GetIPPacketInformation(messageHeader, isIPv4, isIPv6); + } +} diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs index 1171961a204351..114bcad4dc2f3a 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs @@ -14,6 +14,29 @@ internal sealed partial class SocketsTelemetry : EventSource private const string ConnectActivityName = ActivitySourceName + ".Connect"; private static readonly ActivitySource s_connectActivitySource = new ActivitySource(ActivitySourceName); + internal static class Keywords + { + // Stable operational counters are always published when the source is enabled on Linux. + // Diagnostic counters are opt-in and can evolve without name stability guarantees. + internal const EventKeywords IoUringDiagnostics = (EventKeywords)0x1; + } + + internal static class IoUringCounterNames + { + internal const string PrepareNonPinnableFallbacks = "io-uring-prepare-nonpinnable-fallbacks"; + internal const string SocketEventBufferFull = "io-uring-socket-event-buffer-full"; + internal const string CqOverflows = "io-uring-cq-overflows"; + internal const string CqOverflowRecoveries = "io-uring-cq-overflow-recoveries"; + internal const string PrepareQueueOverflows = "io-uring-prepare-queue-overflows"; + internal const string PrepareQueueOverflowFallbacks = "io-uring-prepare-queue-overflow-fallbacks"; + internal const string CompletionSlotExhaustions = "io-uring-completion-slot-exhaustions"; + internal const string CompletionSlotHighWaterMark = "io-uring-completion-slot-high-water-mark"; + internal const string CancellationQueueOverflows = "io-uring-cancellation-queue-overflows"; + internal const string ProvidedBufferDepletions = "io-uring-provided-buffer-depletions"; + internal const string SqPollWakeups = "io-uring-sqpoll-wakeups"; + internal const string SqPollSubmissionsSkipped = "io-uring-sqpoll-submissions-skipped"; + } + public static readonly SocketsTelemetry Log = new SocketsTelemetry(); private PollingCounter? _currentOutgoingConnectAttemptsCounter; @@ -23,6 +46,20 @@ internal sealed partial class SocketsTelemetry : EventSource private PollingCounter? _bytesSentCounter; private PollingCounter? _datagramsReceivedCounter; private PollingCounter? _datagramsSentCounter; + // Keep io_uring counter backing fields always present so EventCounter name contracts remain stable + // across platforms; OnEventCommand only registers these counters on Linux. + private PollingCounter? _ioUringPrepareNonPinnableFallbacksCounter; + private PollingCounter? _ioUringSocketEventBufferFullCounter; + private PollingCounter? _ioUringCqOverflowCounter; + private PollingCounter? _ioUringCqOverflowRecoveriesCounter; + private PollingCounter? _ioUringPrepareQueueOverflowsCounter; + private PollingCounter? _ioUringPrepareQueueOverflowFallbacksCounter; + private PollingCounter? _ioUringCompletionSlotExhaustionsCounter; + private PollingCounter? _ioUringCompletionSlotHighWaterMarkCounter; + private PollingCounter? _ioUringCancellationQueueOverflowsCounter; + private PollingCounter? _ioUringProvidedBufferDepletionsCounter; + private PollingCounter? _ioUringSqPollWakeupsCounter; + private PollingCounter? 
_ioUringSqPollSubmissionsSkippedCounter; private long _currentOutgoingConnectAttempts; private long _outgoingConnectionsEstablished; @@ -31,6 +68,92 @@ internal sealed partial class SocketsTelemetry : EventSource private long _bytesSent; private long _datagramsReceived; private long _datagramsSent; + // Backing fields stay cross-platform for contract stability; they are only surfaced as counters on Linux. + private long _ioUringPrepareNonPinnableFallbacks; + private long _ioUringAsyncCancelRequestCqes; + private long _ioUringSocketEventBufferFull; + private long _ioUringCqOverflow; + private long _ioUringCqOverflowRecoveries; + private long _ioUringCompletionRequeueFailures; + private long _ioUringZeroCopyNotificationPendingSlots; + private long _ioUringPrepareQueueOverflows; + private long _ioUringPrepareQueueOverflowFallbacks; + private long _ioUringPrepareQueueDepth; + private long _ioUringCompletionSlotExhaustions; + private long _ioUringCompletionSlotHighWaterMark; + private long _ioUringCancellationQueueOverflows; + private long _ioUringCompletionSlotDrainRecoveries; + private long _ioUringProvidedBufferDepletions; + private long _ioUringProvidedBufferCurrentSize; + private long _ioUringProvidedBufferRecycles; + private long _ioUringProvidedBufferResizes; + private long _ioUringRegisteredBuffersInitialSuccess; + private long _ioUringRegisteredBuffersInitialFailure; + private long _ioUringRegisteredBuffersReregistrationSuccess; + private long _ioUringRegisteredBuffersReregistrationFailure; + private long _ioUringFixedRecvSelected; + private long _ioUringFixedRecvFallbacks; + private long _ioUringSqPollWakeups; + private long _ioUringSqPollSubmissionsSkipped; + private long _ioUringPersistentMultishotRecvReuse; + private long _ioUringPersistentMultishotRecvTermination; + private long _ioUringPersistentMultishotRecvEarlyData; + + internal enum IoUringCounterFieldForTest + { + PrepareNonPinnableFallbacks, + AsyncCancelRequestCqes, + SocketEventBufferFull, + CqOverflow, + CqOverflowRecoveries, + CompletionRequeueFailures, + PrepareQueueOverflows, + PrepareQueueOverflowFallbacks, + CompletionSlotExhaustions, + CompletionSlotHighWaterMark, + CancellationQueueOverflows, + SqPollWakeups, + SqPollSubmissionsSkipped, + ProvidedBufferDepletions, + ProvidedBufferRecycles, + RegisteredBuffersInitialSuccess, + RegisteredBuffersInitialFailure, + RegisteredBuffersReregistrationSuccess, + RegisteredBuffersReregistrationFailure, + PersistentMultishotRecvReuse, + PersistentMultishotRecvTermination, + } + + internal static ulong GetIoUringCounterValueForTest(IoUringCounterFieldForTest field) + { + // Test hook used by io_uring functional tests to validate counter deltas without + // reflecting over private fields that are brittle under refactoring. 
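// For example, a delta assertion around a provoked condition might look like this
// (the hook and enum names are from this change; the arrange/act code is hypothetical):
//
//     ulong before = SocketsTelemetry.GetIoUringCounterValueForTest(
//         SocketsTelemetry.IoUringCounterFieldForTest.CqOverflow);
//     // ... drive enough unharvested completions to overflow the CQ ring ...
//     ulong after = SocketsTelemetry.GetIoUringCounterValueForTest(
//         SocketsTelemetry.IoUringCounterFieldForTest.CqOverflow);
//     Assert.True(after > before);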
+ return field switch + { + IoUringCounterFieldForTest.PrepareNonPinnableFallbacks => (ulong)Interlocked.Read(ref Log._ioUringPrepareNonPinnableFallbacks), + IoUringCounterFieldForTest.AsyncCancelRequestCqes => (ulong)Interlocked.Read(ref Log._ioUringAsyncCancelRequestCqes), + IoUringCounterFieldForTest.SocketEventBufferFull => (ulong)Interlocked.Read(ref Log._ioUringSocketEventBufferFull), + IoUringCounterFieldForTest.CqOverflow => (ulong)Interlocked.Read(ref Log._ioUringCqOverflow), + IoUringCounterFieldForTest.CqOverflowRecoveries => (ulong)Interlocked.Read(ref Log._ioUringCqOverflowRecoveries), + IoUringCounterFieldForTest.CompletionRequeueFailures => (ulong)Interlocked.Read(ref Log._ioUringCompletionRequeueFailures), + IoUringCounterFieldForTest.PrepareQueueOverflows => (ulong)Interlocked.Read(ref Log._ioUringPrepareQueueOverflows), + IoUringCounterFieldForTest.PrepareQueueOverflowFallbacks => (ulong)Interlocked.Read(ref Log._ioUringPrepareQueueOverflowFallbacks), + IoUringCounterFieldForTest.CompletionSlotExhaustions => (ulong)Interlocked.Read(ref Log._ioUringCompletionSlotExhaustions), + IoUringCounterFieldForTest.CompletionSlotHighWaterMark => (ulong)Interlocked.Read(ref Log._ioUringCompletionSlotHighWaterMark), + IoUringCounterFieldForTest.CancellationQueueOverflows => (ulong)Interlocked.Read(ref Log._ioUringCancellationQueueOverflows), + IoUringCounterFieldForTest.SqPollWakeups => (ulong)Interlocked.Read(ref Log._ioUringSqPollWakeups), + IoUringCounterFieldForTest.SqPollSubmissionsSkipped => (ulong)Interlocked.Read(ref Log._ioUringSqPollSubmissionsSkipped), + IoUringCounterFieldForTest.ProvidedBufferDepletions => (ulong)Interlocked.Read(ref Log._ioUringProvidedBufferDepletions), + IoUringCounterFieldForTest.ProvidedBufferRecycles => (ulong)Interlocked.Read(ref Log._ioUringProvidedBufferRecycles), + IoUringCounterFieldForTest.RegisteredBuffersInitialSuccess => (ulong)Interlocked.Read(ref Log._ioUringRegisteredBuffersInitialSuccess), + IoUringCounterFieldForTest.RegisteredBuffersInitialFailure => (ulong)Interlocked.Read(ref Log._ioUringRegisteredBuffersInitialFailure), + IoUringCounterFieldForTest.RegisteredBuffersReregistrationSuccess => (ulong)Interlocked.Read(ref Log._ioUringRegisteredBuffersReregistrationSuccess), + IoUringCounterFieldForTest.RegisteredBuffersReregistrationFailure => (ulong)Interlocked.Read(ref Log._ioUringRegisteredBuffersReregistrationFailure), + IoUringCounterFieldForTest.PersistentMultishotRecvReuse => (ulong)Interlocked.Read(ref Log._ioUringPersistentMultishotRecvReuse), + IoUringCounterFieldForTest.PersistentMultishotRecvTermination => (ulong)Interlocked.Read(ref Log._ioUringPersistentMultishotRecvTermination), + _ => 0UL, + }; + } [Event(1, Level = EventLevel.Informational)] private void ConnectStart(string? address) @@ -80,6 +203,33 @@ private void AcceptFailed(SocketError error, string? 
exceptionMessage) } } + [Event(7, Level = EventLevel.Informational)] + private void SocketEngineBackendSelected(string backend, int isIoUringPort, int sqPollEnabled) + { + if (IsEnabled(EventLevel.Informational, EventKeywords.All)) + { + WriteEvent(eventId: 7, backend, isIoUringPort, sqPollEnabled); + } + } + + [Event(8, Level = EventLevel.Warning)] + private void IoUringSqPollNegotiatedWarning(string message) + { + if (IsEnabled(EventLevel.Warning, EventKeywords.All)) + { + WriteEvent(eventId: 8, message); + } + } + + [Event(9, Level = EventLevel.Informational)] + private void IoUringResolvedConfiguration(string configuration) + { + if (IsEnabled(EventLevel.Informational, EventKeywords.All)) + { + WriteEvent(eventId: 9, configuration); + } + } + [NonEvent] public Activity? ConnectStart(SocketAddress address, ProtocolType protocolType, EndPoint endPoint, bool keepActivityCurrent) { @@ -189,6 +339,43 @@ public void AcceptStart(EndPoint address) } } + [NonEvent] + internal void ReportSocketEngineBackendSelected(bool isIoUringPort, bool isCompletionMode, bool sqPollEnabled) + { + if (!IsEnabled(EventLevel.Informational, EventKeywords.All)) + { + return; + } + + SocketEngineBackendSelected( + isCompletionMode ? "io_uring_completion" : "epoll", + isIoUringPort ? 1 : 0, + sqPollEnabled ? 1 : 0); + } + + [NonEvent] + internal void ReportIoUringSqPollNegotiatedWarning() + { + if (!IsEnabled(EventLevel.Warning, EventKeywords.All)) + { + return; + } + + IoUringSqPollNegotiatedWarning( + "io_uring SQPOLL negotiated: kernel polling thread is enabled and may increase privileges in containerized environments."); + } + + [NonEvent] + internal void ReportIoUringResolvedConfiguration(string configuration) + { + if (!IsEnabled(EventLevel.Informational, EventKeywords.All)) + { + return; + } + + IoUringResolvedConfiguration(configuration); + } + [NonEvent] public void AfterAccept(SocketError error, string? 
exceptionMessage = null) { @@ -231,6 +418,222 @@ public void DatagramSent() Interlocked.Increment(ref _datagramsSent); } + [NonEvent] + public void IoUringPrepareNonPinnableFallback(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringPrepareNonPinnableFallbacks, count); + } + + [NonEvent] + public void IoUringAsyncCancelRequestCqes(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringAsyncCancelRequestCqes, count); + } + + [NonEvent] + public void IoUringSocketEventBufferFull(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringSocketEventBufferFull, count); + } + + [NonEvent] + public void IoUringCqOverflow(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringCqOverflow, count); + } + + [NonEvent] + public void IoUringCqOverflowRecovery(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringCqOverflowRecoveries, count); + } + + [NonEvent] + public void IoUringCompletionRequeueFailure(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringCompletionRequeueFailures, count); + } + + [NonEvent] + public void IoUringZeroCopyNotificationPendingSlots(int count) + { + Debug.Assert(count >= 0); + Volatile.Write(ref _ioUringZeroCopyNotificationPendingSlots, count); + } + + [NonEvent] + public void IoUringPrepareQueueOverflow(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringPrepareQueueOverflows, count); + } + + [NonEvent] + public void IoUringPrepareQueueOverflowFallback(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringPrepareQueueOverflowFallbacks, count); + } + + [NonEvent] + public void IoUringPrepareQueueDepthDelta(long delta) + { + long value = Interlocked.Add(ref _ioUringPrepareQueueDepth, delta); + Debug.Assert(value >= 0, $"io_uring prepare queue depth cannot be negative: {value}"); + } + + [NonEvent] + public void IoUringCompletionSlotExhaustion(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringCompletionSlotExhaustions, count); + } + + [NonEvent] + public void IoUringCompletionSlotHighWaterMark(long count) + { + Debug.Assert(count >= 0); + while (true) + { + long observed = Volatile.Read(ref _ioUringCompletionSlotHighWaterMark); + if (count <= observed) + { + return; + } + + if (Interlocked.CompareExchange(ref _ioUringCompletionSlotHighWaterMark, count, observed) == observed) + { + return; + } + } + } + + [NonEvent] + public void IoUringCompletionSlotDrainRecovery(long count) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringCompletionSlotDrainRecoveries, count); + } + + [NonEvent] + public void IoUringCancellationQueueOverflow(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringCancellationQueueOverflows, count); + } + + [NonEvent] + public void IoUringProvidedBufferDepletion(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringProvidedBufferDepletions, count); + } + + [NonEvent] + public void IoUringProvidedBufferCurrentSize(int size) + { + Debug.Assert(size >= 0); + Volatile.Write(ref _ioUringProvidedBufferCurrentSize, size); + } + + [NonEvent] + public void IoUringProvidedBufferRecycle(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringProvidedBufferRecycles, count); + } + + [NonEvent] + public void IoUringProvidedBufferResize(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringProvidedBufferResizes, count); + } + + [NonEvent] + public void 
IoUringRegisteredBuffersResult(bool success, int bufferCount, int bufferSize) + { + Debug.Assert(bufferCount >= 0); + Debug.Assert(bufferSize >= 0); + + if (success) + { + Interlocked.Increment(ref _ioUringRegisteredBuffersInitialSuccess); + } + else + { + Interlocked.Increment(ref _ioUringRegisteredBuffersInitialFailure); + } + } + + [NonEvent] + public void IoUringRegisteredBuffersReregistration(bool success) + { + if (success) + { + Interlocked.Increment(ref _ioUringRegisteredBuffersReregistrationSuccess); + } + else + { + Interlocked.Increment(ref _ioUringRegisteredBuffersReregistrationFailure); + } + } + + [NonEvent] + public void IoUringFixedRecvSelected(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringFixedRecvSelected, count); + } + + [NonEvent] + public void IoUringFixedRecvFallback(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringFixedRecvFallbacks, count); + } + + [NonEvent] + public void IoUringSqPollWakeup(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringSqPollWakeups, count); + } + + [NonEvent] + public void IoUringSqPollSubmissionSkipped(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringSqPollSubmissionsSkipped, count); + } + + [NonEvent] + public void IoUringPersistentMultishotRecvReuse(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringPersistentMultishotRecvReuse, count); + } + + [NonEvent] + public void IoUringPersistentMultishotRecvTermination(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringPersistentMultishotRecvTermination, count); + } + + [NonEvent] + public void IoUringPersistentMultishotRecvEarlyData(long count = 1) + { + Debug.Assert(count >= 0); + Interlocked.Add(ref _ioUringPersistentMultishotRecvEarlyData, count); + } + private static string GetErrorType(SocketError socketError) => socketError switch { // Common connect() errors expected to be seen: @@ -291,6 +694,60 @@ protected override void OnEventCommand(EventCommandEventArgs command) { DisplayName = "Datagrams Sent", }; + + if (!OperatingSystem.IsLinux()) + { + return; + } + + _ioUringPrepareNonPinnableFallbacksCounter ??= new PollingCounter(IoUringCounterNames.PrepareNonPinnableFallbacks, this, () => Interlocked.Read(ref _ioUringPrepareNonPinnableFallbacks)) + { + DisplayName = "io_uring Prepare Non-Pinnable Fallbacks", + }; + _ioUringSocketEventBufferFullCounter ??= new PollingCounter(IoUringCounterNames.SocketEventBufferFull, this, () => Interlocked.Read(ref _ioUringSocketEventBufferFull)) + { + DisplayName = "io_uring Socket Event Buffer Full", + }; + _ioUringCqOverflowCounter ??= new PollingCounter(IoUringCounterNames.CqOverflows, this, () => Interlocked.Read(ref _ioUringCqOverflow)) + { + DisplayName = "io_uring Completion Queue Overflow", + }; + _ioUringCqOverflowRecoveriesCounter ??= new PollingCounter(IoUringCounterNames.CqOverflowRecoveries, this, () => Interlocked.Read(ref _ioUringCqOverflowRecoveries)) + { + DisplayName = "io_uring Completion Queue Overflow Recoveries", + }; + _ioUringPrepareQueueOverflowsCounter ??= new PollingCounter(IoUringCounterNames.PrepareQueueOverflows, this, () => Interlocked.Read(ref _ioUringPrepareQueueOverflows)) + { + DisplayName = "io_uring Prepare Queue Overflows", + }; + _ioUringPrepareQueueOverflowFallbacksCounter ??= new PollingCounter(IoUringCounterNames.PrepareQueueOverflowFallbacks, this, () => Interlocked.Read(ref _ioUringPrepareQueueOverflowFallbacks)) + { + DisplayName = "io_uring 
Prepare Queue Overflow Fallbacks", + }; + _ioUringCompletionSlotExhaustionsCounter ??= new PollingCounter(IoUringCounterNames.CompletionSlotExhaustions, this, () => Interlocked.Read(ref _ioUringCompletionSlotExhaustions)) + { + DisplayName = "io_uring Completion Slot Exhaustions", + }; + _ioUringCompletionSlotHighWaterMarkCounter ??= new PollingCounter(IoUringCounterNames.CompletionSlotHighWaterMark, this, () => Interlocked.Read(ref _ioUringCompletionSlotHighWaterMark)) + { + DisplayName = "io_uring Completion Slot High-Water Mark", + }; + _ioUringCancellationQueueOverflowsCounter ??= new PollingCounter(IoUringCounterNames.CancellationQueueOverflows, this, () => Interlocked.Read(ref _ioUringCancellationQueueOverflows)) + { + DisplayName = "io_uring Cancellation Queue Overflows", + }; + _ioUringProvidedBufferDepletionsCounter ??= new PollingCounter(IoUringCounterNames.ProvidedBufferDepletions, this, () => Interlocked.Read(ref _ioUringProvidedBufferDepletions)) + { + DisplayName = "io_uring Provided Buffer Depletions", + }; + _ioUringSqPollWakeupsCounter ??= new PollingCounter(IoUringCounterNames.SqPollWakeups, this, () => Interlocked.Read(ref _ioUringSqPollWakeups)) + { + DisplayName = "io_uring SQPOLL Wakeups", + }; + _ioUringSqPollSubmissionsSkippedCounter ??= new PollingCounter(IoUringCounterNames.SqPollSubmissionsSkipped, this, () => Interlocked.Read(ref _ioUringSqPollSubmissionsSkipped)) + { + DisplayName = "io_uring SQPOLL Submissions Skipped", + }; } } } diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/InternalTestShims.Linux.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/InternalTestShims.Linux.cs new file mode 100644 index 00000000000000..c64d942f73fa6b --- /dev/null +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/InternalTestShims.Linux.cs @@ -0,0 +1,697 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Diagnostics.CodeAnalysis; +using System.Reflection; +using System.Runtime.ExceptionServices; + +namespace System.Net.Sockets +{ + /// + /// Linux test-only shim that mirrors internal SocketAsyncEngine test hooks through reflection. + /// + internal sealed class SocketAsyncEngine + { + private const BindingFlags StaticFlags = BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic; + private const BindingFlags InstanceFlags = BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic; + + // Keep shim type initialization inert: all reflection is resolved lazily per call. 
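// The shim's reflection strategy, reduced to one fragment ("SomeHookForTest" is a
// placeholder name, not a real hook in this change):
//
//     Type engineType = typeof(Socket).Assembly.GetType(
//         "System.Net.Sockets.SocketAsyncEngine", throwOnError: true, ignoreCase: false)!;
//     MethodInfo hook = engineType.GetMethod("SomeHookForTest",
//         BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic)
//         ?? throw new MissingMethodException(engineType.FullName, "SomeHookForTest");
//     object? result = hook.Invoke(null, null);
//
// Because the trimmer cannot see string-based reflection targets, the [DynamicDependency]
// on the static constructor below pins the product type's members so trimmed test runs
// still resolve them.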
+ [DynamicDependency(DynamicallyAccessedMemberTypes.All, "System.Net.Sockets.SocketAsyncEngine", "System.Net.Sockets")] + static SocketAsyncEngine() + { + } + + private readonly object _inner; + + private SocketAsyncEngine(object inner) + { + _inner = inner; + } + + internal readonly struct IoUringNonPinnableFallbackPublicationState + { + internal IoUringNonPinnableFallbackPublicationState(long publishedCount, int publishingGate, long fallbackCount) + { + PublishedCount = publishedCount; + PublishingGate = publishingGate; + FallbackCount = fallbackCount; + } + + internal long PublishedCount { get; } + internal int PublishingGate { get; } + internal long FallbackCount { get; } + } + + internal readonly struct IoUringNativeDiagnosticsSnapshotForTest + { + internal IoUringNativeDiagnosticsSnapshotForTest( + bool hasIoUringPort, + ulong asyncCancelRequestCqeCount, + ulong asyncCancelRequestCqeEnoentCount, + ulong asyncCancelRequestCqeEalreadyCount, + ulong asyncCancelRequestCqeOtherCount, + ulong socketEventBufferFullCount, + ulong unsupportedOpcodePrepareCount, + ulong cqOverflowCount) + { + HasIoUringPort = hasIoUringPort; + AsyncCancelRequestCqeCount = asyncCancelRequestCqeCount; + AsyncCancelRequestCqeEnoentCount = asyncCancelRequestCqeEnoentCount; + AsyncCancelRequestCqeEalreadyCount = asyncCancelRequestCqeEalreadyCount; + AsyncCancelRequestCqeOtherCount = asyncCancelRequestCqeOtherCount; + SocketEventBufferFullCount = socketEventBufferFullCount; + UnsupportedOpcodePrepareCount = unsupportedOpcodePrepareCount; + CqOverflowCount = cqOverflowCount; + } + + internal bool HasIoUringPort { get; } + internal ulong AsyncCancelRequestCqeCount { get; } + internal ulong AsyncCancelRequestCqeEnoentCount { get; } + internal ulong AsyncCancelRequestCqeEalreadyCount { get; } + internal ulong AsyncCancelRequestCqeOtherCount { get; } + internal ulong SocketEventBufferFullCount { get; } + internal ulong UnsupportedOpcodePrepareCount { get; } + internal ulong CqOverflowCount { get; } + } + + internal readonly struct IoUringProvidedBufferSnapshotForTest + { + internal IoUringProvidedBufferSnapshotForTest( + bool hasIoUringPort, + bool supportsProvidedBufferRings, + bool hasProvidedBufferRing, + bool hasRegisteredBuffers, + bool adaptiveBufferSizingEnabled, + int availableCount, + int inUseCount, + int totalBufferCount, + int bufferSize, + int recommendedBufferSize, + long recycledCount, + long allocationFailureCount) + { + HasIoUringPort = hasIoUringPort; + SupportsProvidedBufferRings = supportsProvidedBufferRings; + HasProvidedBufferRing = hasProvidedBufferRing; + HasRegisteredBuffers = hasRegisteredBuffers; + AdaptiveBufferSizingEnabled = adaptiveBufferSizingEnabled; + AvailableCount = availableCount; + InUseCount = inUseCount; + TotalBufferCount = totalBufferCount; + BufferSize = bufferSize; + RecommendedBufferSize = recommendedBufferSize; + RecycledCount = recycledCount; + AllocationFailureCount = allocationFailureCount; + } + + internal bool HasIoUringPort { get; } + internal bool SupportsProvidedBufferRings { get; } + internal bool HasProvidedBufferRing { get; } + internal bool HasRegisteredBuffers { get; } + internal bool AdaptiveBufferSizingEnabled { get; } + internal int AvailableCount { get; } + internal int InUseCount { get; } + internal int TotalBufferCount { get; } + internal int BufferSize { get; } + internal int RecommendedBufferSize { get; } + internal long RecycledCount { get; } + internal long AllocationFailureCount { get; } + } + + internal readonly struct 
IoUringZeroCopySendSnapshotForTest + { + internal IoUringZeroCopySendSnapshotForTest( + bool hasIoUringPort, + bool supportsSendZc, + bool supportsSendMsgZc, + bool zeroCopySendEnabled) + { + HasIoUringPort = hasIoUringPort; + SupportsSendZc = supportsSendZc; + SupportsSendMsgZc = supportsSendMsgZc; + ZeroCopySendEnabled = zeroCopySendEnabled; + } + + internal bool HasIoUringPort { get; } + internal bool SupportsSendZc { get; } + internal bool SupportsSendMsgZc { get; } + internal bool ZeroCopySendEnabled { get; } + } + + internal readonly struct IoUringFixedRecvSnapshotForTest + { + internal IoUringFixedRecvSnapshotForTest( + bool hasIoUringPort, + bool supportsReadFixed, + bool hasRegisteredBuffers) + { + HasIoUringPort = hasIoUringPort; + SupportsReadFixed = supportsReadFixed; + HasRegisteredBuffers = hasRegisteredBuffers; + } + + internal bool HasIoUringPort { get; } + internal bool SupportsReadFixed { get; } + internal bool HasRegisteredBuffers { get; } + } + + internal readonly struct IoUringSqPollSnapshotForTest + { + internal IoUringSqPollSnapshotForTest(bool hasIoUringPort, bool sqPollEnabled, bool deferTaskrunEnabled) + { + HasIoUringPort = hasIoUringPort; + SqPollEnabled = sqPollEnabled; + DeferTaskrunEnabled = deferTaskrunEnabled; + } + + internal bool HasIoUringPort { get; } + internal bool SqPollEnabled { get; } + internal bool DeferTaskrunEnabled { get; } + } + + internal readonly struct IoUringZeroCopyPinHoldSnapshotForTest + { + internal IoUringZeroCopyPinHoldSnapshotForTest( + bool hasIoUringPort, + int activePinHolds, + int pendingNotificationCount) + { + HasIoUringPort = hasIoUringPort; + ActivePinHolds = activePinHolds; + PendingNotificationCount = pendingNotificationCount; + } + + internal bool HasIoUringPort { get; } + internal int ActivePinHolds { get; } + internal int PendingNotificationCount { get; } + } + + internal readonly struct IoUringNativeMsghdrLayoutSnapshotForTest + { + internal IoUringNativeMsghdrLayoutSnapshotForTest( + int size, + int msgNameOffset, + int msgNameLengthOffset, + int msgIovOffset, + int msgIovLengthOffset, + int msgControlOffset, + int msgControlLengthOffset, + int msgFlagsOffset) + { + Size = size; + MsgNameOffset = msgNameOffset; + MsgNameLengthOffset = msgNameLengthOffset; + MsgIovOffset = msgIovOffset; + MsgIovLengthOffset = msgIovLengthOffset; + MsgControlOffset = msgControlOffset; + MsgControlLengthOffset = msgControlLengthOffset; + MsgFlagsOffset = msgFlagsOffset; + } + + internal int Size { get; } + internal int MsgNameOffset { get; } + internal int MsgNameLengthOffset { get; } + internal int MsgIovOffset { get; } + internal int MsgIovLengthOffset { get; } + internal int MsgControlOffset { get; } + internal int MsgControlLengthOffset { get; } + internal int MsgFlagsOffset { get; } + } + + internal readonly struct IoUringCompletionSlotLayoutSnapshotForTest + { + internal IoUringCompletionSlotLayoutSnapshotForTest( + int size, + int generationOffset, + int freeListNextOffset, + int packedStateOffset, + int fixedRecvBufferIdOffset, + int testForcedResultOffset) + { + Size = size; + GenerationOffset = generationOffset; + FreeListNextOffset = freeListNextOffset; + PackedStateOffset = packedStateOffset; + FixedRecvBufferIdOffset = fixedRecvBufferIdOffset; + TestForcedResultOffset = testForcedResultOffset; + } + + internal int Size { get; } + internal int GenerationOffset { get; } + internal int FreeListNextOffset { get; } + internal int PackedStateOffset { get; } + internal int FixedRecvBufferIdOffset { get; } + internal int 
TestForcedResultOffset { get; } + } + + internal static IoUringNonPinnableFallbackPublicationState GetIoUringNonPinnableFallbackPublicationStateForTest() + { + object state = InvokeStatic("GetIoUringNonPinnableFallbackPublicationStateForTest")!; + return new IoUringNonPinnableFallbackPublicationState( + ReadProperty(state, "PublishedCount"), + ReadProperty(state, "PublishingGate"), + ReadProperty(state, "FallbackCount")); + } + + internal static void SetIoUringNonPinnableFallbackPublicationStateForTest(IoUringNonPinnableFallbackPublicationState state) + { + MethodInfo setter = GetRequiredMethod(GetEngineType(), "SetIoUringNonPinnableFallbackPublicationStateForTest", StaticFlags); + Type stateType = setter.GetParameters()[0].ParameterType; + ConstructorInfo constructor = stateType.GetConstructor( + BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic, + binder: null, + types: new[] { typeof(long), typeof(int), typeof(long) }, + modifiers: null) ?? throw new MissingMethodException(stateType.FullName, ".ctor(long,int,long)"); + + object rawState = constructor.Invoke(new object[] { state.PublishedCount, state.PublishingGate, state.FallbackCount }); + _ = setter.Invoke(null, new object[] { rawState }); + } + + internal static long GetIoUringNonPinnablePrepareFallbackDeltaForTest() => (long)InvokeStatic("GetIoUringNonPinnablePrepareFallbackDeltaForTest")!; + internal static bool IsIoUringEnabledForTest() => (bool)InvokeStatic("IsIoUringEnabledForTest")!; + internal static bool IsSqPollRequestedForTest() => (bool)InvokeStatic("IsSqPollRequestedForTest")!; + internal static bool IsIoUringDirectSqeDisabledForTest() => (bool)InvokeStatic("IsIoUringDirectSqeDisabledForTest")!; + internal static bool IsZeroCopySendOptedInForTest() => (bool)InvokeStatic("IsZeroCopySendOptedInForTest")!; + internal static bool IsIoUringRegisterBuffersEnabledForTest() => (bool)InvokeStatic("IsIoUringRegisterBuffersEnabledForTest")!; + internal static bool IsNativeMsghdrLayoutSupportedForIoUringForTest(int pointerSize, int nativeMsghdrSize) => + (bool)InvokeStatic("IsNativeMsghdrLayoutSupportedForIoUringForTest", new object?[] { pointerSize, nativeMsghdrSize })!; + internal static long GetIoUringPendingRetryQueuedToPrepareQueueCountForTest() => (long)InvokeStatic("GetIoUringPendingRetryQueuedToPrepareQueueCountForTest")!; + internal static int GetIoUringCancellationQueueCapacityForTest() => (int)InvokeStatic("GetIoUringCancellationQueueCapacityForTest")!; + internal static bool IsIoUringMultishotRecvSupportedForTest() => (bool)InvokeStatic("IsIoUringMultishotRecvSupportedForTest")!; + internal static bool IsIoUringMultishotAcceptSupportedForTest() => (bool)InvokeStatic("IsIoUringMultishotAcceptSupportedForTest")!; + internal static bool HasActiveIoUringEngineWithInitializedCqStateForTest() => (bool)InvokeStatic("HasActiveIoUringEngineWithInitializedCqStateForTest")!; + internal static int GetIoUringCompletionSlotsInUseForTest() => (int)InvokeStatic("GetIoUringCompletionSlotsInUseForTest")!; + internal static int GetIoUringTrackedOperationCountForTest() => (int)InvokeStatic("GetIoUringTrackedOperationCountForTest")!; + internal static bool IsAnyIoUringSqPollEngineNeedingWakeupForTest() => (bool)InvokeStatic("IsAnyIoUringSqPollEngineNeedingWakeupForTest")!; + internal static bool ValidateIoUringProvidedBufferTeardownOrderingForTest() => (bool)InvokeStatic("ValidateIoUringProvidedBufferTeardownOrderingForTest")!; + internal static ulong EncodeCompletionSlotUserDataForTest(int slotIndex, ulong generation) => + 
(ulong)InvokeStatic("EncodeCompletionSlotUserDataForTest", new object?[] { slotIndex, generation })!; + internal static ulong IncrementCompletionSlotGenerationForTest(ulong generation) => + (ulong)InvokeStatic("IncrementCompletionSlotGenerationForTest", new object?[] { generation })!; + + internal static bool TryDecodeCompletionSlotUserDataForTest(ulong userData, out int slotIndex, out ulong generation) + { + object?[] args = new object?[] { userData, 0, 0UL }; + bool result = (bool)InvokeStatic("TryDecodeCompletionSlotUserDataForTest", args)!; + slotIndex = (int)args[1]!; + generation = (ulong)args[2]!; + return result; + } + + internal static IoUringNativeMsghdrLayoutSnapshotForTest GetIoUringNativeMsghdrLayoutForTest() + { + object snapshot = InvokeStatic("GetIoUringNativeMsghdrLayoutForTest")!; + return new IoUringNativeMsghdrLayoutSnapshotForTest( + ReadProperty(snapshot, "Size"), + ReadProperty(snapshot, "MsgNameOffset"), + ReadProperty(snapshot, "MsgNameLengthOffset"), + ReadProperty(snapshot, "MsgIovOffset"), + ReadProperty(snapshot, "MsgIovLengthOffset"), + ReadProperty(snapshot, "MsgControlOffset"), + ReadProperty(snapshot, "MsgControlLengthOffset"), + ReadProperty(snapshot, "MsgFlagsOffset")); + } + + internal static IoUringCompletionSlotLayoutSnapshotForTest GetIoUringCompletionSlotLayoutForTest() + { + object snapshot = InvokeStatic("GetIoUringCompletionSlotLayoutForTest")!; + return new IoUringCompletionSlotLayoutSnapshotForTest( + ReadProperty(snapshot, "Size"), + ReadProperty(snapshot, "GenerationOffset"), + ReadProperty(snapshot, "FreeListNextOffset"), + ReadProperty(snapshot, "PackedStateOffset"), + ReadProperty(snapshot, "FixedRecvBufferIdOffset"), + ReadProperty(snapshot, "TestForcedResultOffset")); + } + + internal static IoUringNativeDiagnosticsSnapshotForTest GetIoUringNativeDiagnosticsSnapshotForTest() + { + object snapshot = InvokeStatic("GetIoUringNativeDiagnosticsSnapshotForTest")!; + return new IoUringNativeDiagnosticsSnapshotForTest( + ReadProperty(snapshot, "HasIoUringPort"), + ReadProperty(snapshot, "AsyncCancelRequestCqeCount"), + ReadProperty(snapshot, "AsyncCancelRequestCqeEnoentCount"), + ReadProperty(snapshot, "AsyncCancelRequestCqeEalreadyCount"), + ReadProperty(snapshot, "AsyncCancelRequestCqeOtherCount"), + ReadProperty(snapshot, "SocketEventBufferFullCount"), + ReadProperty(snapshot, "UnsupportedOpcodePrepareCount"), + ReadProperty(snapshot, "CqOverflowCount")); + } + + internal static IoUringProvidedBufferSnapshotForTest GetIoUringProvidedBufferSnapshotForTest() + { + object snapshot = InvokeStatic("GetIoUringProvidedBufferSnapshotForTest")!; + return new IoUringProvidedBufferSnapshotForTest( + ReadProperty(snapshot, "HasIoUringPort"), + ReadProperty(snapshot, "SupportsProvidedBufferRings"), + ReadProperty(snapshot, "HasProvidedBufferRing"), + ReadProperty(snapshot, "HasRegisteredBuffers"), + ReadProperty(snapshot, "AdaptiveBufferSizingEnabled"), + ReadProperty(snapshot, "AvailableCount"), + ReadProperty(snapshot, "InUseCount"), + ReadProperty(snapshot, "TotalBufferCount"), + ReadProperty(snapshot, "BufferSize"), + ReadProperty(snapshot, "RecommendedBufferSize"), + ReadProperty(snapshot, "RecycledCount"), + ReadProperty(snapshot, "AllocationFailureCount")); + } + + internal static IoUringZeroCopySendSnapshotForTest GetIoUringZeroCopySendSnapshotForTest() + { + object snapshot = InvokeStatic("GetIoUringZeroCopySendSnapshotForTest")!; + return new IoUringZeroCopySendSnapshotForTest( + ReadProperty(snapshot, "HasIoUringPort"), + ReadProperty(snapshot, 
"SupportsSendZc"), + ReadProperty(snapshot, "SupportsSendMsgZc"), + ReadProperty(snapshot, "ZeroCopySendEnabled")); + } + + internal static IoUringFixedRecvSnapshotForTest GetIoUringFixedRecvSnapshotForTest() + { + object snapshot = InvokeStatic("GetIoUringFixedRecvSnapshotForTest")!; + return new IoUringFixedRecvSnapshotForTest( + ReadProperty(snapshot, "HasIoUringPort"), + ReadProperty(snapshot, "SupportsReadFixed"), + ReadProperty(snapshot, "HasRegisteredBuffers")); + } + + internal static IoUringSqPollSnapshotForTest GetIoUringSqPollSnapshotForTest() + { + object snapshot = InvokeStatic("GetIoUringSqPollSnapshotForTest")!; + return new IoUringSqPollSnapshotForTest( + ReadProperty(snapshot, "HasIoUringPort"), + ReadProperty(snapshot, "SqPollEnabled"), + ReadProperty(snapshot, "DeferTaskrunEnabled")); + } + + internal static IoUringZeroCopyPinHoldSnapshotForTest GetIoUringZeroCopyPinHoldSnapshotForTest() + { + object snapshot = InvokeStatic("GetIoUringZeroCopyPinHoldSnapshotForTest")!; + return new IoUringZeroCopyPinHoldSnapshotForTest( + ReadProperty(snapshot, "HasIoUringPort"), + ReadProperty(snapshot, "ActivePinHolds"), + ReadProperty(snapshot, "PendingNotificationCount")); + } + + internal static bool TryInjectIoUringCqOverflowForTest(uint delta, out int injectedEngineCount) + { + object?[] args = new object?[] { delta, 0 }; + bool result = (bool)InvokeStatic("TryInjectIoUringCqOverflowForTest", args)!; + injectedEngineCount = (int)args[1]!; + return result; + } + + internal static bool TryGetIoUringRingFdForTest(out int ringFd) + { + object?[] args = new object?[] { -1 }; + bool result = (bool)InvokeStatic("TryGetIoUringRingFdForTest", args)!; + ringFd = (int)args[0]!; + return result; + } + + internal static bool TryGetIoUringWakeupEventFdForTest(out int eventFd) + { + object?[] args = new object?[] { -1 }; + bool result = (bool)InvokeStatic("TryGetIoUringWakeupEventFdForTest", args)!; + eventFd = (int)args[0]!; + return result; + } + + internal static bool TryValidateSqNeedWakeupMatchesRawSqFlagBitForTest(out bool matches) + { + object?[] args = new object?[] { false }; + bool result = (bool)InvokeStatic("TryValidateSqNeedWakeupMatchesRawSqFlagBitForTest", args)!; + matches = (bool)args[0]!; + return result; + } + + internal static bool TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount) + { + object?[] args = new object?[] { 0 }; + bool result = (bool)InvokeStatic("TryForceIoUringProvidedBufferRingExhaustionForTest", args)!; + forcedBufferCount = (int)args[0]!; + return result; + } + + internal static bool TryRecycleForcedIoUringProvidedBufferRingForTest(out int recycledBufferCount) + { + object?[] args = new object?[] { 0 }; + bool result = (bool)InvokeStatic("TryRecycleForcedIoUringProvidedBufferRingForTest", args)!; + recycledBufferCount = (int)args[0]!; + return result; + } + + internal static bool TryGetFirstIoUringEngineForTest(out SocketAsyncEngine? 
ioUringEngine)
+        {
+            object?[] args = new object?[] { null };
+            bool result = (bool)InvokeStatic("TryGetFirstIoUringEngineForTest", args)!;
+            if (!result || args[0] is null)
+            {
+                ioUringEngine = null;
+                return false;
+            }
+
+            ioUringEngine = new SocketAsyncEngine(args[0]);
+            return true;
+        }
+
+        internal static SocketAsyncEngine[] GetActiveIoUringEnginesForTest()
+        {
+            Array engines = (Array)InvokeStatic("GetActiveIoUringEnginesForTest")!;
+            var wrappers = new SocketAsyncEngine[engines.Length];
+            for (int i = 0; i < engines.Length; i++)
+            {
+                wrappers[i] = new SocketAsyncEngine(engines.GetValue(i)!);
+            }
+
+            return wrappers;
+        }
+
+        internal bool SupportsMultishotAcceptForTest
+        {
+            get => GetInstanceProperty<bool>("SupportsMultishotAcceptForTest");
+            set => SetInstanceProperty("SupportsMultishotAcceptForTest", value);
+        }
+
+        internal bool SupportsOpSendZcForTest
+        {
+            get => GetInstanceProperty<bool>("SupportsOpSendZcForTest");
+            set => SetInstanceProperty("SupportsOpSendZcForTest", value);
+        }
+
+        internal bool ZeroCopySendEnabledForTest
+        {
+            get => GetInstanceProperty<bool>("ZeroCopySendEnabledForTest");
+            set => SetInstanceProperty("ZeroCopySendEnabledForTest", value);
+        }
+
+        internal long IoUringCancelQueueLengthForTest
+        {
+            get => GetInstanceProperty<long>("IoUringCancelQueueLengthForTest");
+            set => SetInstanceProperty("IoUringCancelQueueLengthForTest", value);
+        }
+
+        internal long IoUringCancelQueueOverflowCountForTest => GetInstanceProperty<long>("IoUringCancelQueueOverflowCountForTest");
+        internal long IoUringCancelQueueWakeRetryCountForTest => GetInstanceProperty<long>("IoUringCancelQueueWakeRetryCountForTest");
+
+        internal int IoUringWakeupRequestedForTest
+        {
+            get => GetInstanceProperty<int>("IoUringWakeupRequestedForTest");
+            set => SetInstanceProperty("IoUringWakeupRequestedForTest", value);
+        }
+
+        internal bool TryEnqueueIoUringCancellationForTest(ulong userData)
+            => (bool)InvokeInstance("TryEnqueueIoUringCancellationForTest", userData)!;
+
+        internal int SubmitIoUringOperationsNormalizedForTest()
+            => Convert.ToInt32(InvokeInstance("SubmitIoUringOperationsNormalizedForTest"));
+
+        private static object? InvokeStatic(string methodName, params object?[]? args)
+        {
+            MethodInfo method = GetRequiredMethod(GetEngineType(), methodName, StaticFlags);
+            return method.Invoke(null, args);
+        }
+
+        private object? InvokeInstance(string methodName, params object?[]? args)
+        {
+            MethodInfo method = GetRequiredMethod(GetEngineType(), methodName, InstanceFlags);
+            return method.Invoke(_inner, args);
+        }
+
+        private T GetInstanceProperty<T>(string propertyName)
+        {
+            PropertyInfo property = GetRequiredProperty(GetEngineType(), propertyName, InstanceFlags);
+            return (T)property.GetValue(_inner)!;
+        }
+
+        private void SetInstanceProperty(string propertyName, object? value)
+        {
+            PropertyInfo property = GetRequiredProperty(GetEngineType(), propertyName, InstanceFlags);
+            property.SetValue(_inner, value);
+        }
+
+        private static T ReadProperty<T>(object instance, string propertyName)
+        {
+            PropertyInfo property = GetRequiredProperty(instance.GetType(), propertyName, InstanceFlags);
+            return (T)property.GetValue(instance)!;
+        }
+
+        [return: DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)]
+        private static Type GetEngineType()
+        {
+            return typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncEngine", throwOnError: true, ignoreCase: false)!;
+        }
+
+        private static MethodInfo GetRequiredMethod([DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)] Type type, string methodName, BindingFlags flags)
+        {
+            return type.GetMethod(methodName, flags) ?? throw new MissingMethodException(type.FullName, methodName);
+        }
+
+        private static PropertyInfo GetRequiredProperty([DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)] Type type, string propertyName, BindingFlags flags)
+        {
+            return type.GetProperty(propertyName, flags) ?? throw new MissingMemberException(type.FullName, propertyName);
+        }
+    }
+
+    /// <summary>
+    /// Linux test-only shim that forwards SocketAsyncContext test hooks through reflection.
+    /// </summary>
+    internal static class SocketAsyncContext
+    {
+        private const BindingFlags StaticFlags = BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic;
+
+        [DynamicDependency(DynamicallyAccessedMemberTypes.All, "System.Net.Sockets.SocketAsyncContext", "System.Net.Sockets")]
+        internal static bool IsMultishotAcceptArmedForTest(Socket socket)
+            => (bool)GetRequiredMethod(GetContextType(), "IsMultishotAcceptArmedForTest", StaticFlags).Invoke(null, new object[] { socket })!;
+
+        internal static int GetMultishotAcceptQueueCountForTest(Socket socket)
+            => (int)GetRequiredMethod(GetContextType(), "GetMultishotAcceptQueueCountForTest", StaticFlags).Invoke(null, new object[] { socket })!;
+
+        internal static bool IsPersistentMultishotRecvArmedForTest(Socket socket)
+            => (bool)GetRequiredMethod(GetContextType(), "IsPersistentMultishotRecvArmedForTest", StaticFlags).Invoke(null, new object[] { socket })!;
+
+        [return: DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)]
+        private static Type GetContextType()
+        {
+            return typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketAsyncContext", throwOnError: true, ignoreCase: false)!;
+        }
+
+        private static MethodInfo GetRequiredMethod([DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)] Type type, string methodName, BindingFlags flags)
+        {
+            return type.GetMethod(methodName, flags) ?? throw new MissingMethodException(type.FullName, methodName);
+        }
+    }
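// Reflection cannot bind C# `out`/`ref` parameters directly, so these shims follow one
// convention: pass a mutable args array and read the updated slot back after Invoke, which
// copies by-ref arguments back into the array. An illustrative fragment (the decoded method
// and values are hypothetical):
//
//     object?[] args = { 0xDEADBEEFUL, 0, 0UL };       // [in] userData, [out] slot, [out] generation
//     bool ok = (bool)decodeMethod.Invoke(null, args)!;
//     int slotIndex = (int)args[1]!;
//     ulong generation = (ulong)args[2]!;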
+    /// <summary>
+    /// Linux test-only shim that forwards SocketsTelemetry test hook access through reflection.
+    /// </summary>
+    internal static class SocketsTelemetry
+    {
+        private const BindingFlags StaticFlags = BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic;
+
+        internal enum IoUringCounterFieldForTest
+        {
+            PrepareNonPinnableFallbacks,
+            AsyncCancelRequestCqes,
+            SocketEventBufferFull,
+            CqOverflow,
+            CqOverflowRecoveries,
+            CompletionRequeueFailures,
+            PrepareQueueOverflows,
+            PrepareQueueOverflowFallbacks,
+            CompletionSlotExhaustions,
+            CompletionSlotHighWaterMark,
+            CancellationQueueOverflows,
+            SqPollWakeups,
+            SqPollSubmissionsSkipped,
+            ProvidedBufferDepletions,
+            ProvidedBufferRecycles,
+            RegisteredBuffersInitialSuccess,
+            RegisteredBuffersInitialFailure,
+            RegisteredBuffersReregistrationSuccess,
+            RegisteredBuffersReregistrationFailure,
+            PersistentMultishotRecvReuse,
+            PersistentMultishotRecvTermination,
+        }
+
+        [DynamicDependency(DynamicallyAccessedMemberTypes.All, "System.Net.Sockets.SocketsTelemetry", "System.Net.Sockets")]
+        internal static ulong GetIoUringCounterValueForTest(IoUringCounterFieldForTest counter)
+        {
+            Type telemetryType = GetTelemetryType();
+            MethodInfo method = GetRequiredMethod(telemetryType, "GetIoUringCounterValueForTest", StaticFlags);
+            Type counterType = method.GetParameters()[0].ParameterType;
+            object counterValue = Enum.ToObject(counterType, (int)counter);
+            return (ulong)method.Invoke(null, new object[] { counterValue })!;
+        }
+
+        [return: DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)]
+        private static Type GetTelemetryType()
+        {
+            return typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketsTelemetry", throwOnError: true, ignoreCase: false)!;
+        }
+
+        private static MethodInfo GetRequiredMethod([DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)] Type type, string methodName, BindingFlags flags)
+        {
+            return type.GetMethod(methodName, flags) ?? throw new MissingMethodException(type.FullName, methodName);
+        }
+    }
+
+    /// <summary>
+    /// Linux test-only shim that mirrors internal MpscQueue{T} through reflection.
+    /// </summary>
+    internal sealed class MpscQueue<T>
+    {
+        private const BindingFlags InstanceFlags = BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic;
+        private readonly object _inner;
+        private readonly MethodInfo _enqueueMethod;
+        private readonly MethodInfo _tryDequeueMethod;
+        private readonly PropertyInfo _isEmptyProperty;
+
+        [DynamicDependency(DynamicallyAccessedMemberTypes.All, "System.Net.Sockets.MpscQueue`1", "System.Net.Sockets")]
+        internal MpscQueue(int segmentSize)
+        {
+            Type queueType = GetQueueType();
+            ConstructorInfo constructor = queueType.GetConstructor(
+                BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic,
+                binder: null,
+                types: new[] { typeof(int) },
+                modifiers: null) ?? throw new MissingMethodException(queueType.FullName, ".ctor(int)");
+
+            _enqueueMethod = queueType.GetMethod("Enqueue", InstanceFlags) ?? throw new MissingMethodException(queueType.FullName, "Enqueue");
+            _tryDequeueMethod = queueType.GetMethod("TryDequeue", InstanceFlags) ?? throw new MissingMethodException(queueType.FullName, "TryDequeue");
+            _isEmptyProperty = queueType.GetProperty("IsEmpty", InstanceFlags) ??
throw new MissingMemberException(queueType.FullName, "IsEmpty"); + try + { + _inner = constructor.Invoke(new object[] { segmentSize }); + } + catch (TargetInvocationException tie) when (tie.InnerException is Exception inner) + { + ExceptionDispatchInfo.Capture(inner).Throw(); + throw; + } + } + + internal void Enqueue(T item) + { + _ = _enqueueMethod.Invoke(_inner, new object?[] { item }); + } + + internal bool TryDequeue(out T item) + { + object?[] args = new object?[] { null }; + bool dequeued = (bool)_tryDequeueMethod.Invoke(_inner, args)!; + item = dequeued ? (T)args[0]! : default!; + return dequeued; + } + + internal bool IsEmpty => (bool)_isEmptyProperty.GetValue(_inner)!; + + [return: DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.All)] + private static Type GetQueueType() + { + Type genericType = typeof(Socket).Assembly.GetType("System.Net.Sockets.MpscQueue`1", throwOnError: true, ignoreCase: false)!; + return genericType.MakeGenericType(typeof(T)); + } + } +} diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs new file mode 100644 index 00000000000000..4ff32891c1622d --- /dev/null +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs @@ -0,0 +1,7256 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Buffers; +using System.Collections.Generic; +using System.Diagnostics; +using System.Net; +using System.Runtime.InteropServices; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.DotNet.RemoteExecutor; +using Xunit; +using IoUringFixedRecvSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringFixedRecvSnapshotForTest; +using IoUringNativeDiagnosticsSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringNativeDiagnosticsSnapshotForTest; +using IoUringProvidedBufferSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringProvidedBufferSnapshotForTest; +using IoUringSqPollSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringSqPollSnapshotForTest; +using IoUringZeroCopyPinHoldSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringZeroCopyPinHoldSnapshotForTest; +using IoUringZeroCopySendSnapshot = System.Net.Sockets.SocketAsyncEngine.IoUringZeroCopySendSnapshotForTest; + +namespace System.Net.Sockets.Tests +{ + // io_uring internals and reflection-based test hooks are currently validated on CoreCLR. 
+ [ConditionalClass(typeof(PlatformDetection), nameof(PlatformDetection.IsNotMonoRuntime))] + public partial class IoUring + { + private const int F_GETFD = 1; + private const int F_GETFL = 3; + private const int FD_CLOEXEC = 1; + private const int O_NONBLOCK = 0x800; + private const int RLIMIT_NOFILE = 7; + + [StructLayout(LayoutKind.Sequential)] + private struct RLimit + { + public nuint Current; + public nuint Maximum; + } + + private static class IoUringEnvironmentVariables + { + public const string Enabled = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING"; + public const string ProvidedBufferSize = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PROVIDED_BUFFER_SIZE"; + public const string AdaptiveBufferSizing = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ADAPTIVE_BUFFER_SIZING"; + public const string RegisterBuffers = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_REGISTER_BUFFERS"; + public const string SqPoll = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_SQPOLL"; + public const string ZeroCopySend = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_ZERO_COPY_SEND"; + public const string DirectSqe = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_DIRECT_SQE"; + public const string ForceEagainOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_EAGAIN_ONCE_MASK"; + public const string ForceEcanceledOnceMask = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ECANCELED_ONCE_MASK"; + public const string ForceSubmitEpermOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_SUBMIT_EPERM_ONCE"; + public const string ForceEnterEintrRetryLimitOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ENTER_EINTR_RETRY_LIMIT_ONCE"; + public const string ForceKernelVersionUnsupported = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_KERNEL_VERSION_UNSUPPORTED"; + public const string ForceProvidedBufferRingOomOnce = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_PROVIDED_BUFFER_RING_OOM_ONCE"; + public const string TestEventBufferCount = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_EVENT_BUFFER_COUNT"; + public const string PrepareQueueCapacity = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_PREPARE_QUEUE_CAPACITY"; + public const string QueueEntries = "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_QUEUE_ENTRIES"; + public const string ThreadCount = "DOTNET_SYSTEM_NET_SOCKETS_THREAD_COUNT"; + } + + // fcntl uses C int for fd/cmd/return on Linux ABIs. + [LibraryImport("libc", EntryPoint = "fcntl", SetLastError = true)] + private static partial int Fcntl(int fd, int cmd); + + [LibraryImport("libc", EntryPoint = "getrlimit", SetLastError = true)] + private static partial int GetRLimit(int resource, out RLimit limit); + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // Uses Linux-only io_uring publication internals. 
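+ // Hedged sketch: with the constants and libc imports above, a test can probe descriptor
+ // contracts, e.g. for a ring fd obtained via TryGetIoUringRingFdForTest:
+ //   int fdFlags = Fcntl(ringFd, F_GETFD);                       // -1 on failure
+ //   Assert.True(fdFlags >= 0 && (fdFlags & FD_CLOEXEC) != 0);   // close-on-exec is set
+ //   int statusFlags = Fcntl(ringFd, F_GETFL);
+ //   bool nonBlocking = statusFlags >= 0 && (statusFlags & O_NONBLOCK) != 0;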
+ public static async Task IoUringNonPinnableFallbackPublication_ConcurrentPublishers_EmitSingleDelta() + { + await RemoteExecutor.Invoke(static () => + { + SocketAsyncEngine.IoUringNonPinnableFallbackPublicationState originalState = + SocketAsyncEngine.GetIoUringNonPinnableFallbackPublicationStateForTest(); + + try + { + const long firstFallbackCount = 17; + const int publisherCount = 16; + long[] deltas = new long[publisherCount]; + using var start = new ManualResetEventSlim(initialState: false); + var tasks = new Task[publisherCount]; + + SocketAsyncEngine.SetIoUringNonPinnableFallbackPublicationStateForTest( + new SocketAsyncEngine.IoUringNonPinnableFallbackPublicationState( + publishedCount: 0L, + publishingGate: 0, + fallbackCount: firstFallbackCount)); + + for (int i = 0; i < publisherCount; i++) + { + int capturedIndex = i; + tasks[i] = Task.Run(() => + { + start.Wait(); + deltas[capturedIndex] = SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackDeltaForTest(); + }); + } + + start.Set(); + Task.WaitAll(tasks); + + long deltaTotal = 0; + int nonZeroCount = 0; + long nonZeroValue = 0; + foreach (long delta in deltas) + { + deltaTotal += delta; + if (delta != 0) + { + nonZeroCount++; + nonZeroValue = delta; + } + } + + Assert.Equal(firstFallbackCount, deltaTotal); + Assert.Equal(1, nonZeroCount); + Assert.Equal(firstFallbackCount, nonZeroValue); + + const long secondFallbackCount = 23; + SocketAsyncEngine.SetIoUringNonPinnableFallbackPublicationStateForTest( + new SocketAsyncEngine.IoUringNonPinnableFallbackPublicationState( + publishedCount: firstFallbackCount, + publishingGate: 0, + fallbackCount: secondFallbackCount)); + Assert.Equal(secondFallbackCount - firstFallbackCount, SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackDeltaForTest()); + Assert.Equal(0, SocketAsyncEngine.GetIoUringNonPinnablePrepareFallbackDeltaForTest()); + } + finally + { + SocketAsyncEngine.SetIoUringNonPinnableFallbackPublicationStateForTest(originalState); + } + }).DisposeAsync(); + } + + private static RemoteInvokeOptions CreateSocketEngineOptions( + string? ioUringValue = "1", + string? forceEagainOnceMask = null, + string? forceEcanceledOnceMask = null, + bool? forceSubmitEpermOnce = null, + bool? forceEnterEintrRetryLimitOnce = null, + bool? forceKernelVersionUnsupported = null, + bool? forceProvidedBufferRingOomOnce = null, + int? testEventBufferCount = null, + string? testEventBufferCountRaw = null, + int? prepareQueueCapacity = null, + int? queueEntries = null, + int? threadCount = null, + int? providedBufferSize = null, + bool? adaptiveBufferSizingEnabled = null, + bool? registerBuffersEnabled = null, + bool? sqPollEnabled = null, + bool? directSqeEnabled = null, + bool? zeroCopySendEnabled = null) + { + static void SetOrRemoveEnvironmentVariable(RemoteInvokeOptions options, string name, string? value) + { + if (value is null) + { + options.StartInfo.EnvironmentVariables.Remove(name); + } + else + { + options.StartInfo.EnvironmentVariables[name] = value; + } + } + + static void ValidateSocketEngineOptionCombination(int? configuredEventBufferCount, string? 
configuredEventBufferCountRaw)
+ {
+ if (configuredEventBufferCount.HasValue && configuredEventBufferCountRaw is not null)
+ {
+ throw new ArgumentException(
+ "Specify either testEventBufferCount or testEventBufferCountRaw, not both.",
+ nameof(configuredEventBufferCountRaw));
+ }
+ }
+
+ ValidateSocketEngineOptionCombination(testEventBufferCount, testEventBufferCountRaw);
+
+ RemoteInvokeOptions options = new RemoteInvokeOptions();
+ string? configuredEventBufferCount =
+ testEventBufferCountRaw ?? (testEventBufferCount.HasValue ? testEventBufferCount.Value.ToString() : null);
+ (string Name, string? Value)[] ioUringEnvironmentAssignments =
+ {
+ (IoUringEnvironmentVariables.Enabled, ioUringValue),
+ (IoUringEnvironmentVariables.ProvidedBufferSize, providedBufferSize?.ToString()),
+ (IoUringEnvironmentVariables.AdaptiveBufferSizing, adaptiveBufferSizingEnabled.HasValue ? (adaptiveBufferSizingEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.RegisterBuffers, registerBuffersEnabled.HasValue ? (registerBuffersEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.SqPoll, sqPollEnabled.HasValue ? (sqPollEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.DirectSqe, directSqeEnabled.HasValue ? (directSqeEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ZeroCopySend, zeroCopySendEnabled.HasValue ? (zeroCopySendEnabled.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ForceEagainOnceMask, string.IsNullOrEmpty(forceEagainOnceMask) ? null : forceEagainOnceMask),
+ (IoUringEnvironmentVariables.ForceEcanceledOnceMask, string.IsNullOrEmpty(forceEcanceledOnceMask) ? null : forceEcanceledOnceMask),
+ (IoUringEnvironmentVariables.ForceSubmitEpermOnce, forceSubmitEpermOnce.HasValue ? (forceSubmitEpermOnce.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ForceEnterEintrRetryLimitOnce, forceEnterEintrRetryLimitOnce.HasValue ? (forceEnterEintrRetryLimitOnce.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ForceKernelVersionUnsupported, forceKernelVersionUnsupported.HasValue ? (forceKernelVersionUnsupported.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.ForceProvidedBufferRingOomOnce, forceProvidedBufferRingOomOnce.HasValue ? (forceProvidedBufferRingOomOnce.Value ? "1" : "0") : null),
+ (IoUringEnvironmentVariables.TestEventBufferCount, configuredEventBufferCount),
+ (IoUringEnvironmentVariables.PrepareQueueCapacity, prepareQueueCapacity?.ToString()),
+ (IoUringEnvironmentVariables.QueueEntries, queueEntries?.ToString()),
+ (IoUringEnvironmentVariables.ThreadCount, threadCount?.ToString()),
+ };
+
+ foreach ((string Name, string? Value) assignment in ioUringEnvironmentAssignments)
+ {
+ SetOrRemoveEnvironmentVariable(options, assignment.Name, assignment.Value);
+ }
+
+ options.TimeOut = (int)TimeSpan.FromMinutes(10).TotalMilliseconds;
+ return options;
+ }
+
+ private static Task<T> ToTask<T>(Task<T> task) => task;
+ private static Task<T> ToTask<T>(ValueTask<T> task) => task.AsTask();
+
+ private static async Task<T> AwaitWithTimeoutAsync<T>(Task<T> task, string operationName)
+ {
+ Task completed = await Task.WhenAny(task, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.True(ReferenceEquals(task, completed), $"Timed out waiting for {operationName}");
+ return await task;
+ }
+
+ private static void AssertCanceledOrInterrupted(Exception? ex)
+ {
+ Assert.NotNull(ex);
+ Assert.True(
+ ex is OperationCanceledException ||
+ ex is SocketException socketException &&
+ (socketException.SocketErrorCode == SocketError.OperationAborted ||
+ socketException.SocketErrorCode == SocketError.Interrupted),
+ $"Unexpected exception: {ex}");
+ }
+
+ private static void AssertCanceledDisposedOrInterrupted(Exception? ex)
+ {
+ if (ex is null)
+ {
+ return;
+ }
+
+ Assert.True(
+ ex is ObjectDisposedException ||
+ ex is OperationCanceledException ||
+ ex is SocketException socketException &&
+ (socketException.SocketErrorCode == SocketError.OperationAborted ||
+ socketException.SocketErrorCode == SocketError.Interrupted),
+ $"Unexpected exception: {ex}");
+ }
+
+ private static bool IsProvidedBufferSnapshotUsable(IoUringProvidedBufferSnapshot snapshot) =>
+ snapshot.HasIoUringPort &&
+ snapshot.SupportsProvidedBufferRings &&
+ snapshot.HasProvidedBufferRing &&
+ snapshot.TotalBufferCount > 0;
+
+ private static bool IsAdaptiveSizingUsable(IoUringProvidedBufferSnapshot snapshot) =>
+ IsProvidedBufferSnapshotUsable(snapshot) && snapshot.AdaptiveBufferSizingEnabled;
+
+ private static bool IsFixedRecvEnabled(IoUringFixedRecvSnapshot snapshot) =>
+ snapshot.SupportsReadFixed && snapshot.HasRegisteredBuffers;
+
+ private static bool IsSqPollActive(IoUringSqPollSnapshot snapshot) =>
+ snapshot.HasIoUringPort && snapshot.SqPollEnabled;
+
+ private sealed class NonPinnableMemoryManager : MemoryManager<byte>
+ {
+ private readonly byte[] _buffer;
+
+ public NonPinnableMemoryManager(byte[] buffer)
+ {
+ _buffer = buffer;
+ }
+
+ public override Span<byte> GetSpan() => _buffer;
+
+ public override MemoryHandle Pin(int elementIndex = 0)
+ {
+ _ = elementIndex;
+ throw new NotSupportedException("Non-pinnable test memory.");
+ }
+
+ public override void Unpin()
+ {
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ }
+ }
+
+ private sealed unsafe class TrackingPinnableMemoryManager : MemoryManager<byte>
+ {
+ private readonly byte[] _buffer;
+ private int _pinCount;
+ private int _unpinCount;
+
+ public TrackingPinnableMemoryManager(byte[] buffer)
+ {
+ _buffer = buffer;
+ }
+
+ public int PinCount => Volatile.Read(ref _pinCount);
+ public int UnpinCount => Volatile.Read(ref _unpinCount);
+
+ public override Span<byte> GetSpan() => _buffer;
+
+ public override MemoryHandle Pin(int elementIndex = 0)
+ {
+ if ((uint)elementIndex > (uint)_buffer.Length)
+ {
+ throw new ArgumentOutOfRangeException(nameof(elementIndex));
+ }
+
+ Interlocked.Increment(ref _pinCount);
+ GCHandle handle = GCHandle.Alloc(_buffer, GCHandleType.Pinned);
+ byte* pointer = (byte*)handle.AddrOfPinnedObject() + elementIndex;
+ return new MemoryHandle(pointer, handle, this);
+ }
+
+ public override void Unpin()
+ {
+ Interlocked.Increment(ref _unpinCount);
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ }
+ }
+
+#if DEBUG
+ private sealed class ThrowingTraceListener : TraceListener
+ {
+ public override void Write(string? message)
+ {
+ }
+
+ public override void WriteLine(string? message)
+ {
+ }
+
+ public override void Fail(string? message, string? 
detailMessage) + { + throw new InvalidOperationException($"{message} {detailMessage}"); + } + } +#endif + + private static long GetIoUringPrepareNonPinnableFallbackCounterValue() + => (long)SocketsTelemetry.GetIoUringCounterValueForTest(SocketsTelemetry.IoUringCounterFieldForTest.PrepareNonPinnableFallbacks); + + private static long GetIoUringCompletionRequeueFailureCounterValue() + => (long)SocketsTelemetry.GetIoUringCounterValueForTest(SocketsTelemetry.IoUringCounterFieldForTest.CompletionRequeueFailures); + + private static bool InvokeSocketAsyncEngineBoolMethod(string methodName) + { + return methodName switch + { + "IsIoUringEnabled" => SocketAsyncEngine.IsIoUringEnabledForTest(), + "IsSqPollRequested" => SocketAsyncEngine.IsSqPollRequestedForTest(), + "IsIoUringDirectSqeDisabled" => SocketAsyncEngine.IsIoUringDirectSqeDisabledForTest(), + "IsZeroCopySendOptedIn" => SocketAsyncEngine.IsZeroCopySendOptedInForTest(), + "IsIoUringRegisterBuffersEnabled" => SocketAsyncEngine.IsIoUringRegisterBuffersEnabledForTest(), + _ => throw new ArgumentOutOfRangeException(nameof(methodName), methodName, "Unknown SocketAsyncEngine bool selector."), + }; + } + + private static void AssertBooleanAppContextSwitch( + string switchName, + string methodName, + bool expectedWhenSwitchTrue, + bool expectedWhenSwitchFalse) + { + AppContext.SetSwitch(switchName, true); + Assert.Equal(expectedWhenSwitchTrue, InvokeSocketAsyncEngineBoolMethod(methodName)); + + AppContext.SetSwitch(switchName, false); + Assert.Equal(expectedWhenSwitchFalse, InvokeSocketAsyncEngineBoolMethod(methodName)); + } + + private static ulong GetIoUringTelemetryCounterValue(string fieldName) + { + SocketsTelemetry.IoUringCounterFieldForTest counter = fieldName switch + { + "_ioUringPrepareNonPinnableFallbacks" => SocketsTelemetry.IoUringCounterFieldForTest.PrepareNonPinnableFallbacks, + "_ioUringAsyncCancelRequestCqes" => SocketsTelemetry.IoUringCounterFieldForTest.AsyncCancelRequestCqes, + "_ioUringSocketEventBufferFull" => SocketsTelemetry.IoUringCounterFieldForTest.SocketEventBufferFull, + "_ioUringCqOverflow" => SocketsTelemetry.IoUringCounterFieldForTest.CqOverflow, + "_ioUringCqOverflowRecoveries" => SocketsTelemetry.IoUringCounterFieldForTest.CqOverflowRecoveries, + "_ioUringCompletionRequeueFailures" => SocketsTelemetry.IoUringCounterFieldForTest.CompletionRequeueFailures, + "_ioUringPrepareQueueOverflows" => SocketsTelemetry.IoUringCounterFieldForTest.PrepareQueueOverflows, + "_ioUringPrepareQueueOverflowFallbacks" => SocketsTelemetry.IoUringCounterFieldForTest.PrepareQueueOverflowFallbacks, + "_ioUringCompletionSlotExhaustions" => SocketsTelemetry.IoUringCounterFieldForTest.CompletionSlotExhaustions, + "_ioUringCompletionSlotHighWaterMark" => SocketsTelemetry.IoUringCounterFieldForTest.CompletionSlotHighWaterMark, + "_ioUringSqPollWakeups" => SocketsTelemetry.IoUringCounterFieldForTest.SqPollWakeups, + "_ioUringSqPollSubmissionsSkipped" => SocketsTelemetry.IoUringCounterFieldForTest.SqPollSubmissionsSkipped, + "_ioUringProvidedBufferDepletions" => SocketsTelemetry.IoUringCounterFieldForTest.ProvidedBufferDepletions, + "_ioUringProvidedBufferRecycles" => SocketsTelemetry.IoUringCounterFieldForTest.ProvidedBufferRecycles, + "_ioUringRegisteredBuffersInitialSuccess" => SocketsTelemetry.IoUringCounterFieldForTest.RegisteredBuffersInitialSuccess, + "_ioUringRegisteredBuffersInitialFailure" => SocketsTelemetry.IoUringCounterFieldForTest.RegisteredBuffersInitialFailure, + "_ioUringRegisteredBuffersReregistrationSuccess" => 
SocketsTelemetry.IoUringCounterFieldForTest.RegisteredBuffersReregistrationSuccess, + "_ioUringRegisteredBuffersReregistrationFailure" => SocketsTelemetry.IoUringCounterFieldForTest.RegisteredBuffersReregistrationFailure, + "_ioUringPersistentMultishotRecvReuse" => SocketsTelemetry.IoUringCounterFieldForTest.PersistentMultishotRecvReuse, + "_ioUringPersistentMultishotRecvTermination" => SocketsTelemetry.IoUringCounterFieldForTest.PersistentMultishotRecvTermination, + _ => throw new ArgumentOutOfRangeException(nameof(fieldName), fieldName, "Unknown io_uring telemetry counter selector."), + }; + + return SocketsTelemetry.GetIoUringCounterValueForTest(counter); + } + + private static long GetIoUringPendingRetryQueuedToPrepareQueueCount() + => SocketAsyncEngine.GetIoUringPendingRetryQueuedToPrepareQueueCountForTest(); + + private static void AssertNativeMsghdrLayoutContractForIoUring() + { + SocketAsyncEngine.IoUringNativeMsghdrLayoutSnapshotForTest layout = + SocketAsyncEngine.GetIoUringNativeMsghdrLayoutForTest(); + + Assert.Equal(56, layout.Size); + Assert.Equal(0, layout.MsgNameOffset); + Assert.Equal(8, layout.MsgNameLengthOffset); + Assert.Equal(16, layout.MsgIovOffset); + Assert.Equal(24, layout.MsgIovLengthOffset); + Assert.Equal(32, layout.MsgControlOffset); + Assert.Equal(40, layout.MsgControlLengthOffset); + Assert.Equal(48, layout.MsgFlagsOffset); + } + + private static void AssertNativeMsghdr32BitRejectionPathForIoUring() + { + Assert.True(SocketAsyncEngine.IsNativeMsghdrLayoutSupportedForIoUringForTest(pointerSize: 8, nativeMsghdrSize: 56)); + Assert.False(SocketAsyncEngine.IsNativeMsghdrLayoutSupportedForIoUringForTest(pointerSize: 4, nativeMsghdrSize: 56)); + Assert.False(SocketAsyncEngine.IsNativeMsghdrLayoutSupportedForIoUringForTest(pointerSize: 8, nativeMsghdrSize: 48)); + } + + private static void AssertIoUringCompletionSlotLayoutContractForIoUring() + { + SocketAsyncEngine.IoUringCompletionSlotLayoutSnapshotForTest layout = + SocketAsyncEngine.GetIoUringCompletionSlotLayoutForTest(); + + Assert.Equal(24, layout.Size); + Assert.Equal(0, layout.GenerationOffset); + Assert.Equal(8, layout.FreeListNextOffset); + Assert.Equal(12, layout.PackedStateOffset); + Assert.Equal(16, layout.FixedRecvBufferIdOffset); + if (layout.TestForcedResultOffset >= 0) + { + Assert.Equal(20, layout.TestForcedResultOffset); + } + } + + private static bool TryInjectIoUringCqOverflowForTest(uint delta, out int injectedEngineCount) + => SocketAsyncEngine.TryInjectIoUringCqOverflowForTest(delta, out injectedEngineCount); + + private static bool AssertIoUringCqReflectionTargetsStableForTest() + => SocketAsyncEngine.HasActiveIoUringEngineWithInitializedCqStateForTest(); + + private static int GetIoUringCompletionSlotsInUseForTest() + => SocketAsyncEngine.GetIoUringCompletionSlotsInUseForTest(); + + private static int GetIoUringTrackedOperationCountForTest() + => SocketAsyncEngine.GetIoUringTrackedOperationCountForTest(); + + private static ulong EncodeCompletionSlotUserDataForTest(int slotIndex, ulong generation) + => SocketAsyncEngine.EncodeCompletionSlotUserDataForTest(slotIndex, generation); + + private static bool TryDecodeCompletionSlotUserDataForTest(ulong userData, out int slotIndex, out ulong generation) + => SocketAsyncEngine.TryDecodeCompletionSlotUserDataForTest(userData, out slotIndex, out generation); + + private static ulong IncrementCompletionSlotGenerationForTest(ulong generation) + => SocketAsyncEngine.IncrementCompletionSlotGenerationForTest(generation); + + private static bool 
IsTrackedIoUringUserDataForTest(ulong userData)
+ => SocketAsyncEngine.IsTrackedIoUringUserDataForTest(userData);
+
+ private static bool TryGetIoUringRingFdForTest(out int ringFd)
+ => SocketAsyncEngine.TryGetIoUringRingFdForTest(out ringFd);
+
+ private static bool TryGetIoUringWakeupEventFdForTest(out int eventFd)
+ => SocketAsyncEngine.TryGetIoUringWakeupEventFdForTest(out eventFd);
+
+ private static bool TryGetFirstIoUringEngineForTest(out SocketAsyncEngine? ioUringEngine)
+ {
+ return SocketAsyncEngine.TryGetFirstIoUringEngineForTest(out ioUringEngine);
+ }
+
+ private static void AssertCompletionSlotUserDataEncodingBoundaryContractForIoUring()
+ {
+ const int MaxSlotIndex = 8191;
+ const ulong MaxGeneration = (1UL << 43) - 1;
+
+ ulong encoded = EncodeCompletionSlotUserDataForTest(MaxSlotIndex, MaxGeneration);
+ Assert.True(TryDecodeCompletionSlotUserDataForTest(encoded, out int decodedSlotIndex, out ulong decodedGeneration));
+ Assert.Equal(MaxSlotIndex, decodedSlotIndex);
+ Assert.Equal(MaxGeneration, decodedGeneration);
+
+ ulong wrappedGeneration = IncrementCompletionSlotGenerationForTest(MaxGeneration);
+ Assert.Equal(1UL, wrappedGeneration);
+
+ ulong wrappedEncoded = EncodeCompletionSlotUserDataForTest(MaxSlotIndex, wrappedGeneration);
+ Assert.True(TryDecodeCompletionSlotUserDataForTest(wrappedEncoded, out int wrappedSlotIndex, out ulong wrappedDecodedGeneration));
+ Assert.Equal(MaxSlotIndex, wrappedSlotIndex);
+ Assert.Equal(1UL, wrappedDecodedGeneration);
+ }
+
+ private static async Task<bool> WaitForIoUringTelemetryCounterAtLeastAsync(string counterFieldName, ulong targetValue, int timeoutMilliseconds = 30000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (GetIoUringTelemetryCounterValue(counterFieldName) >= targetValue)
+ {
+ return true;
+ }
+
+ await Task.Delay(25);
+ }
+
+ return GetIoUringTelemetryCounterValue(counterFieldName) >= targetValue;
+ }
+
+ private static async Task<bool> WaitForIoUringCompletionSlotsInUseAtMostAsync(int maxValue, int timeoutMilliseconds = 10000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (GetIoUringCompletionSlotsInUseForTest() <= maxValue)
+ {
+ return true;
+ }
+
+ await Task.Delay(25);
+ }
+
+ return GetIoUringCompletionSlotsInUseForTest() <= maxValue;
+ }
+
+ private static async Task<bool> WaitForIoUringCompletionSlotsInUseAboveAsync(int baselineValue, int minimumDelta, int timeoutMilliseconds = 10000)
+ {
+ int threshold = baselineValue + minimumDelta;
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (GetIoUringCompletionSlotsInUseForTest() > threshold)
+ {
+ return true;
+ }
+
+ await Task.Delay(25);
+ }
+
+ return GetIoUringCompletionSlotsInUseForTest() > threshold;
+ }
+
+ private static async Task<bool> WaitForIoUringTrackedOperationsAtMostAsync(int maxValue, int timeoutMilliseconds = 10000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (GetIoUringTrackedOperationCountForTest() <= maxValue)
+ {
+ return true;
+ }
+
+ await Task.Delay(25);
+ }
+
+ return GetIoUringTrackedOperationCountForTest() <= maxValue;
+ }
+
+ private static bool IsIoUringMultishotRecvSupported()
+ => SocketAsyncEngine.IsIoUringMultishotRecvSupportedForTest();
+
+ private static bool IsIoUringMultishotAcceptSupported()
+ => SocketAsyncEngine.IsIoUringMultishotAcceptSupportedForTest();
+
+ private static bool IsListenerMultishotAcceptArmed(Socket listener)
+ => SocketAsyncContext.IsMultishotAcceptArmedForTest(listener);
+
+ private static int GetListenerMultishotAcceptQueueCount(Socket listener)
+ => SocketAsyncContext.GetMultishotAcceptQueueCountForTest(listener);
+
+ private static async Task<bool> WaitForMultishotAcceptArmedStateAsync(Socket listener, bool expectedArmed, int timeoutMilliseconds = 5000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (IsListenerMultishotAcceptArmed(listener) == expectedArmed)
+ {
+ return true;
+ }
+
+ await Task.Delay(20);
+ }
+
+ return IsListenerMultishotAcceptArmed(listener) == expectedArmed;
+ }
+
+ private static bool IsPersistentMultishotRecvArmed(Socket socket)
+ => SocketAsyncContext.IsPersistentMultishotRecvArmedForTest(socket);
+
+ private static ulong GetPersistentMultishotRecvUserData(Socket socket)
+ => SocketAsyncContext.GetPersistentMultishotRecvUserDataForTest(socket);
+
+ private static int GetPersistentMultishotRecvBufferedCount(Socket socket)
+ => SocketAsyncContext.GetPersistentMultishotRecvBufferedCountForTest(socket);
+
+ private static async Task<bool> WaitForPersistentMultishotRecvArmedStateAsync(Socket socket, bool expectedArmed, int timeoutMilliseconds = 5000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ while (DateTime.UtcNow < deadline)
+ {
+ if (IsPersistentMultishotRecvArmed(socket) == expectedArmed)
+ {
+ return true;
+ }
+
+ await Task.Delay(20);
+ }
+
+ return IsPersistentMultishotRecvArmed(socket) == expectedArmed;
+ }
+
+ private static bool HasSufficientFileDescriptorLimit(int requiredDescriptorCount)
+ {
+ if (requiredDescriptorCount <= 0)
+ {
+ return true;
+ }
+
+ if (GetRLimit(RLIMIT_NOFILE, out RLimit limit) != 0)
+ {
+ return true;
+ }
+
+ return limit.Current >= (nuint)requiredDescriptorCount;
+ }
+
+ private static bool DoesExecChildObserveFileDescriptor(int fd)
+ {
+ if (fd < 0)
+ {
+ return false;
+ }
+
+ using Process process = Process.Start(
+ new ProcessStartInfo
+ {
+ FileName = "/bin/sh",
+ Arguments = $"-c \"[ -e /proc/self/fd/{fd} ]\"",
+ UseShellExecute = false,
+ })!;
+
+ process.WaitForExit();
+ return process.ExitCode == 0;
+ }
+
+ private static async Task<IoUringZeroCopyPinHoldSnapshot> WaitForZeroCopyPinHoldSnapshotAsync(
+ Func<IoUringZeroCopyPinHoldSnapshot, bool> predicate,
+ int timeoutMilliseconds = 5000)
+ {
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds);
+ IoUringZeroCopyPinHoldSnapshot snapshot = GetIoUringZeroCopyPinHoldSnapshot();
+ while (DateTime.UtcNow < deadline)
+ {
+ if (predicate(snapshot))
+ {
+ return snapshot;
+ }
+
+ await Task.Delay(20);
+ snapshot = GetIoUringZeroCopyPinHoldSnapshot();
+ }
+
+ return snapshot;
+ }
+
+ private static async Task AssertConnectedPairRoundTripAsync(Socket client, Socket server, byte marker)
+ {
+ byte[] payload = new byte[] { marker };
+ byte[] receiveBuffer = new byte[1];
+ Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ Assert.Equal(marker, receiveBuffer[0]);
+ }
+
+ private static async Task AssertPinsReleasedAsync(TrackingPinnableMemoryManager manager)
+ {
+ DateTime start = DateTime.UtcNow;
+ while (manager.PinCount != manager.UnpinCount)
+ {
+ if (DateTime.UtcNow - start > TimeSpan.FromSeconds(10))
+ {
+ break;
+ }
+
+ await Task.Delay(20);
+ }
+
+ 
Assert.True(manager.PinCount > 0, "Expected at least one pin."); + Assert.Equal(manager.PinCount, manager.UnpinCount); + } + + private static IoUringNativeDiagnosticsSnapshot GetIoUringNativeDiagnosticsSnapshot() + { + return SocketAsyncEngine.GetIoUringNativeDiagnosticsSnapshotForTest(); + } + + private static IoUringProvidedBufferSnapshot GetIoUringProvidedBufferSnapshot() + { + return SocketAsyncEngine.GetIoUringProvidedBufferSnapshotForTest(); + } + + private static IoUringZeroCopySendSnapshot GetIoUringZeroCopySendSnapshot() + { + return SocketAsyncEngine.GetIoUringZeroCopySendSnapshotForTest(); + } + + private static IoUringFixedRecvSnapshot GetIoUringFixedRecvSnapshot() + { + return SocketAsyncEngine.GetIoUringFixedRecvSnapshotForTest(); + } + + private static IoUringSqPollSnapshot GetIoUringSqPollSnapshot() + { + return SocketAsyncEngine.GetIoUringSqPollSnapshotForTest(); + } + + private static bool IsAnyIoUringSqPollEngineNeedingWakeup() + => SocketAsyncEngine.IsAnyIoUringSqPollEngineNeedingWakeupForTest(); + + private static bool ValidateSqNeedWakeupMatchesRawSqFlagBit() + { + if (!SocketAsyncEngine.TryValidateSqNeedWakeupMatchesRawSqFlagBitForTest(out bool matches)) + { + return false; + } + + Assert.True(matches, "SqNeedWakeup should match the SQ_NEED_WAKEUP bit contract."); + return true; + } + + private static void EnableSqPollAppContextOptIn() => + AppContext.SetSwitch("System.Net.Sockets.UseIoUringSqPoll", true); + + private static IoUringZeroCopyPinHoldSnapshot GetIoUringZeroCopyPinHoldSnapshot() + { + return SocketAsyncEngine.GetIoUringZeroCopyPinHoldSnapshotForTest(); + } + + private static bool TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount) + => SocketAsyncEngine.TryForceIoUringProvidedBufferRingExhaustionForTest(out forcedBufferCount); + + private static bool TryRecycleForcedIoUringProvidedBufferRingForTest(out int recycledBufferCount) + => SocketAsyncEngine.TryRecycleForcedIoUringProvidedBufferRingForTest(out recycledBufferCount); + + private static ulong CounterDelta(ulong before, ulong after) + { + Assert.True(after >= before, $"Expected monotonic io_uring counter. 
before={before}, after={after}");
+ return after - before;
+ }
+
+ private static async Task WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+ Func<Task> scenario,
+ Action<IoUringNativeDiagnosticsSnapshot, IoUringNativeDiagnosticsSnapshot> validateDelta,
+ int settleDelayMilliseconds = 0,
+ bool skipScenarioWhenIoUringUnavailable = false)
+ {
+ IoUringNativeDiagnosticsSnapshot diagnosticsBefore = GetIoUringNativeDiagnosticsSnapshot();
+ if (skipScenarioWhenIoUringUnavailable && !diagnosticsBefore.HasIoUringPort)
+ {
+ return;
+ }
+
+ await scenario();
+
+ if (settleDelayMilliseconds > 0)
+ {
+ await Task.Delay(settleDelayMilliseconds);
+ }
+
+ IoUringNativeDiagnosticsSnapshot diagnosticsAfter = GetIoUringNativeDiagnosticsSnapshot();
+ if (!diagnosticsBefore.HasIoUringPort && !diagnosticsAfter.HasIoUringPort)
+ {
+ return;
+ }
+
+ validateDelta(diagnosticsBefore, diagnosticsAfter);
+ }
+
+ private static Task<SocketAsyncEventArgs> StartReceiveMessageFromAsync(Socket socket, SocketAsyncEventArgs eventArgs)
+ => StartSocketAsyncEventArgsOperation(socket, eventArgs, static (s, args) => s.ReceiveMessageFromAsync(args));
+
+ private static Task<SocketAsyncEventArgs> StartSocketAsyncEventArgsOperation(
+ Socket socket,
+ SocketAsyncEventArgs eventArgs,
+ Func<Socket, SocketAsyncEventArgs, bool> startOperation)
+ {
+ var tcs = new TaskCompletionSource<SocketAsyncEventArgs>(TaskCreationOptions.RunContinuationsAsynchronously);
+ EventHandler<SocketAsyncEventArgs> handler = null!;
+ handler = (_, completedArgs) =>
+ {
+ eventArgs.Completed -= handler;
+ tcs.TrySetResult(completedArgs);
+ };
+
+ eventArgs.Completed += handler;
+ if (!startOperation(socket, eventArgs))
+ {
+ eventArgs.Completed -= handler;
+ tcs.TrySetResult(eventArgs);
+ }
+
+ return tcs.Task;
+ }
+
+ private static async Task<(Socket Listener, Socket Client, Socket Server)> CreateConnectedTcpSocketTrioAsync(int listenBacklog = 1)
+ {
+ Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ try
+ {
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(listenBacklog);
+
+ Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ try
+ {
+ Task<Socket> acceptTask = listener.AcceptAsync();
+ await client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!);
+ Socket server = await acceptTask;
+ return (listener, client, server);
+ }
+ catch
+ {
+ client.Dispose();
+ throw;
+ }
+ }
+ catch
+ {
+ listener.Dispose();
+ throw;
+ }
+ }
+
+ private static async Task<(Socket Client, Socket Server)> AcceptConnectedTcpPairAsync(Socket listener, IPEndPoint endpoint)
+ {
+ Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ try
+ {
+ Task<Socket> acceptTask = listener.AcceptAsync();
+ await client.ConnectAsync(endpoint);
+ Socket server = await acceptTask;
+ return (client, server);
+ }
+ catch
+ {
+ client.Dispose();
+ throw;
+ }
+ }
+
+ private static async Task RunTcpRoundTripAsync(int iterations)
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] sendBuffer = new byte[] { 1 };
+ byte[] receiveBuffer = new byte[1];
+
+ for (int i = 0; i < iterations; i++)
+ {
+ var serverReceiveTask = server.ReceiveAsync(receiveBuffer, SocketFlags.None);
+ await Task.Yield();
+
+ int clientSent = await client.SendAsync(sendBuffer, SocketFlags.None);
+ Assert.Equal(1, clientSent);
+
+ int serverReceived = await serverReceiveTask;
+ Assert.Equal(1, serverReceived);
+ Assert.Equal(sendBuffer[0], receiveBuffer[0]);
+
+ var clientReceiveTask = client.ReceiveAsync(receiveBuffer, SocketFlags.None);
+ await Task.Yield();
+
+ int serverSent = await server.SendAsync(sendBuffer, SocketFlags.None);
+ Assert.Equal(1, serverSent);
+
+ int clientReceived = await clientReceiveTask;
+ Assert.Equal(1, clientReceived);
+ Assert.Equal(sendBuffer[0], receiveBuffer[0]);
+
+ unchecked
+ {
+ sendBuffer[0]++;
+ }
+ }
+ }
+
+ private static async Task RunUnixDomainSocketRoundTripAsync()
+ {
+ if (!Socket.OSSupportsUnixDomainSockets)
+ {
+ return;
+ }
+
+ string path = UnixDomainSocketTest.GetRandomNonExistingFilePath();
+ var endpoint = new UnixDomainSocketEndPoint(path);
+ try
+ {
+ using Socket listener = new Socket(AddressFamily.Unix, SocketType.Stream, ProtocolType.Unspecified);
+ listener.Bind(endpoint);
+ listener.Listen(1);
+
+ using Socket client = new Socket(AddressFamily.Unix, SocketType.Stream, ProtocolType.Unspecified);
+ Task<Socket> acceptTask = listener.AcceptAsync();
+ await client.ConnectAsync(endpoint);
+
+ using Socket server = await acceptTask;
+ await AssertConnectedPairRoundTripAsync(client, server, 0x31);
+ await AssertConnectedPairRoundTripAsync(server, client, 0x32);
+ }
+ finally
+ {
+ try
+ {
+ System.IO.File.Delete(path);
+ }
+ catch
+ {
+ }
+ }
+ }
+
+ private static async Task RunHybridIoUringAndEpollEngineScenarioAsync()
+ {
+ await RunTcpRoundTripAsync(4);
+
+ // With DOTNET_SYSTEM_NET_SOCKETS_THREAD_COUNT=2, one io_uring engine indicates a hybrid mix.
+ if (SocketAsyncEngine.GetActiveIoUringEnginesForTest().Length != 1)
+ {
+ return;
+ }
+
+ const int ConnectionCount = 32;
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(ConnectionCount);
+ IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+ var acceptTasks = new Task<Socket>[ConnectionCount];
+ var clients = new Socket[ConnectionCount];
+ var connectTasks = new Task[ConnectionCount];
+
+ for (int i = 0; i < ConnectionCount; i++)
+ {
+ acceptTasks[i] = listener.AcceptAsync();
+ }
+
+ for (int i = 0; i < ConnectionCount; i++)
+ {
+ clients[i] = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ connectTasks[i] = clients[i].ConnectAsync(endpoint);
+ }
+
+ await Task.WhenAll(connectTasks);
+ Socket[] servers = await Task.WhenAll(acceptTasks);
+
+ try
+ {
+ var work = new Task[ConnectionCount];
+ for (int i = 0; i < ConnectionCount; i++)
+ {
+ Socket client = clients[i];
+ Socket server = servers[i];
+ byte value = (byte)(i + 1);
+
+ work[i] = Task.Run(async () =>
+ {
+ byte[] tx = new byte[] { value };
+ byte[] rx = new byte[1];
+
+ int sent = await client.SendAsync(tx, SocketFlags.None);
+ Assert.Equal(1, sent);
+
+ int received = await server.ReceiveAsync(rx, SocketFlags.None);
+ Assert.Equal(1, received);
+ Assert.Equal(value, rx[0]);
+
+ sent = await server.SendAsync(tx, SocketFlags.None);
+ Assert.Equal(1, sent);
+
+ received = await client.ReceiveAsync(rx, SocketFlags.None);
+ Assert.Equal(1, received);
+ Assert.Equal(value, rx[0]);
+ });
+ }
+
+ await Task.WhenAll(work);
+ }
+ finally
+ {
+ for (int i = 0; i < ConnectionCount; i++)
+ {
+ servers[i].Dispose();
+ clients[i].Dispose();
+ }
+ }
+ }
+
+ private static async Task RunThreadCountTwoCancellationRoutingScenarioAsync()
+ {
+ await RunHybridIoUringAndEpollEngineScenarioAsync();
+
+ SocketAsyncEngine[] ioUringEngines = SocketAsyncEngine.GetActiveIoUringEnginesForTest();
+ if (ioUringEngines.Length != 1)
+ {
+ return;
+ }
+
+ SocketAsyncEngine ioUringEngine = ioUringEngines[0];
+ long queueLengthBefore = ioUringEngine.IoUringCancelQueueLengthForTest;
+ long wakeRetryBefore = ioUringEngine.IoUringCancelQueueWakeRetryCountForTest;
+
+ await RunCancellationSubmitContentionScenarioAsync(connectionCount: 8, cancellationsPerConnection: 64);
+
+ Assert.True(queueLengthBefore >= 0);
+ Assert.True(ioUringEngine.IoUringCancelQueueLengthForTest >= 0);
+ Assert.True(
+ ioUringEngine.IoUringCancelQueueLengthForTest <= SocketAsyncEngine.GetIoUringCancellationQueueCapacityForTest());
+ Assert.True(ioUringEngine.IoUringCancelQueueWakeRetryCountForTest >= wakeRetryBefore);
+ }
+
+ private static async Task RunKernelVersionUnsupportedFallbackScenarioAsync()
+ {
+ await RunTcpRoundTripAsync(4);
+ Assert.Equal(0, SocketAsyncEngine.GetActiveIoUringEnginesForTest().Length);
+ }
+
+ private static async Task RunTrackedOperationGenerationTransitionStressScenarioAsync(int connectionCount, int iterationsPerConnection)
+ {
+ if (!PlatformDetection.IsArm64Process)
+ {
+ return;
+ }
+
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(connectionCount);
+ IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+ int baselineCompletionSlotsInUse = GetIoUringCompletionSlotsInUseForTest();
+ int baselineTrackedOperations = GetIoUringTrackedOperationCountForTest();
+
+ var clients = new List<Socket>(connectionCount);
+ var servers = new List<Socket>(connectionCount);
+ try
+ {
+ for (int i = 0; i < connectionCount; i++)
+ {
+ (Socket client, Socket server) = await AcceptConnectedTcpPairAsync(listener, endpoint);
+ clients.Add(client);
+ servers.Add(server);
+ }
+
+ var workers = new Task[connectionCount];
+ for (int i = 0; i < connectionCount; i++)
+ {
+ Socket client = clients[i];
+ Socket server = servers[i];
+ workers[i] = Task.Run(async () =>
+ {
+ byte[] sendBuffer = new byte[1];
+ byte[] receiveBuffer = new byte[1];
+ for (int iteration = 0; iteration < iterationsPerConnection; iteration++)
+ {
+ // Stress rapid slot reuse so generation mismatches surface as stuck operations
+ // rather than silently passing under low churn.
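+ // (Hedged note: the user_data encoding exercised in
+ // AssertCompletionSlotUserDataEncodingBoundaryContractForIoUring appears to pack a
+ // 13-bit slot index (MaxSlotIndex = 8191) with a 43-bit generation, so a reused
+ // slot only dispatches when the CQE generation matches the tracked one.)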
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ int sent = await client.SendAsync(sendBuffer, SocketFlags.None);
+ Assert.Equal(1, sent);
+
+ int received = await receiveTask;
+ Assert.Equal(1, received);
+ Assert.Equal(sendBuffer[0], receiveBuffer[0]);
+
+ unchecked
+ {
+ sendBuffer[0]++;
+ }
+ }
+ });
+ }
+
+ Task workerTask = Task.WhenAll(workers);
+ Task completed = await Task.WhenAny(workerTask, Task.Delay(TimeSpan.FromSeconds(60)));
+ Assert.Same(workerTask, completed);
+ await workerTask;
+ }
+ finally
+ {
+ foreach (Socket server in servers)
+ {
+ server.Dispose();
+ }
+
+ foreach (Socket client in clients)
+ {
+ client.Dispose();
+ }
+ }
+
+ Assert.True(
+ await WaitForIoUringCompletionSlotsInUseAtMostAsync(baselineCompletionSlotsInUse + 2, timeoutMilliseconds: 15000),
+ "Completion-slot usage remained elevated after ARM64 generation-transition stress.");
+ Assert.True(
+ await WaitForIoUringTrackedOperationsAtMostAsync(baselineTrackedOperations + 2, timeoutMilliseconds: 15000),
+ "Tracked-operation count remained elevated after ARM64 generation-transition stress.");
+ }
+
+ private static async Task RunGenerationWrapAroundDispatchScenarioAsync()
+ {
+ if (!IsIoUringMultishotRecvSupported())
+ {
+ return;
+ }
+
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] receiveBuffer = new byte[1];
+ Task<int> armReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(1, await client.SendAsync(new byte[] { 0x5C }, SocketFlags.None));
+ Assert.Equal(1, await armReceive);
+ Assert.True(
+ await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true),
+ "Expected persistent multishot recv to arm before generation-wrap dispatch validation.");
+
+ ulong activeUserData = GetPersistentMultishotRecvUserData(server);
+ Assert.NotEqual(0UL, activeUserData);
+ Assert.True(IsTrackedIoUringUserDataForTest(activeUserData), "Active multishot user_data should be tracked.");
+ Assert.True(TryDecodeCompletionSlotUserDataForTest(activeUserData, out int slotIndex, out ulong generation));
+
+ // Derive max generation from encoding mask and verify helper wrap contract.
+ ulong maxEncodedUserData = EncodeCompletionSlotUserDataForTest(slotIndex, ulong.MaxValue);
+ Assert.True(TryDecodeCompletionSlotUserDataForTest(maxEncodedUserData, out _, out ulong maxGeneration));
+ Assert.Equal(1UL, IncrementCompletionSlotGenerationForTest(maxGeneration));
+
+ ulong staleGeneration = IncrementCompletionSlotGenerationForTest(generation);
+ ulong staleUserData = EncodeCompletionSlotUserDataForTest(slotIndex, staleGeneration);
+ if (staleUserData == activeUserData)
+ {
+ staleUserData = EncodeCompletionSlotUserDataForTest(slotIndex, generation == 1UL ? 2UL : 1UL);
+ }
+
+ Assert.NotEqual(activeUserData, staleUserData);
+ Assert.False(
+ IsTrackedIoUringUserDataForTest(staleUserData),
+ "Stale wrapped-generation user_data should be rejected during dispatch lookup.");
+ Assert.True(IsTrackedIoUringUserDataForTest(activeUserData));
+ }
+
+ private static async Task RunBufferListSendRoundTripAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] payload = new byte[] { 0x11, 0x22, 0x33, 0x44, 0x55 };
+ var sendBuffers = new List<ArraySegment<byte>>
+ {
+ new ArraySegment<byte>(payload, 0, 2),
+ new ArraySegment<byte>(payload, 2, 1),
+ new ArraySegment<byte>(payload, 3, 2)
+ };
+
+ byte[] receiveBuffer = new byte[payload.Length];
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ int sent = await client.SendAsync(sendBuffers, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ Assert.Equal(payload.Length, await receiveTask);
+ Assert.Equal(payload, receiveBuffer);
+ }
+
+ private static async Task RunReceiveMessageFromRoundTripAsync()
+ {
+ using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+
+ receiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true);
+ receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ byte[] payload = new byte[] { 0x91, 0x92, 0x93 };
+ byte[] receiveBuffer = new byte[payload.Length];
+ EndPoint remoteEndPoint = new IPEndPoint(IPAddress.Any, 0);
+
+ var receiveTask = receiver.ReceiveMessageFromAsync(receiveBuffer, SocketFlags.None, remoteEndPoint);
+ await Task.Yield();
+
+ int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+ Assert.Equal(payload.Length, sent);
+
+ SocketReceiveMessageFromResult result = await receiveTask;
+ Assert.Equal(payload.Length, result.ReceivedBytes);
+ Assert.Equal(payload, receiveBuffer);
+ Assert.Equal(sender.LocalEndPoint, result.RemoteEndPoint);
+ }
+
+ private static async Task RunReceiveMessageFromPacketInformationRoundTripAsync(bool useIpv6)
+ {
+ if (useIpv6 && !Socket.OSSupportsIPv6)
+ {
+ return;
+ }
+
+ AddressFamily addressFamily = useIpv6 ? AddressFamily.InterNetworkV6 : AddressFamily.InterNetwork;
+ SocketOptionLevel optionLevel = useIpv6 ? SocketOptionLevel.IPv6 : SocketOptionLevel.IP;
+ IPAddress loopbackAddress = useIpv6 ? IPAddress.IPv6Loopback : IPAddress.Loopback;
+ IPAddress anyAddress = useIpv6 ? IPAddress.IPv6Any : IPAddress.Any;
+
+ using Socket receiver = new Socket(addressFamily, SocketType.Dgram, ProtocolType.Udp);
+ using Socket sender = new Socket(addressFamily, SocketType.Dgram, ProtocolType.Udp);
+
+ receiver.SetSocketOption(optionLevel, SocketOptionName.PacketInformation, true);
+ receiver.Bind(new IPEndPoint(loopbackAddress, 0));
+ sender.Bind(new IPEndPoint(loopbackAddress, 0));
+
+ byte[] payload = useIpv6 ?
+ new byte[] { 0xA1, 0xA2, 0xA3 } :
+ new byte[] { 0x90, 0x91, 0x92, 0x93 };
+ byte[] receiveBuffer = new byte[payload.Length];
+ EndPoint remoteEndPoint = new IPEndPoint(anyAddress, 0);
+
+ Task<SocketReceiveMessageFromResult> receiveTask =
+ ToTask(receiver.ReceiveMessageFromAsync(receiveBuffer, SocketFlags.None, remoteEndPoint));
+ await Task.Yield();
+
+ int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+ Assert.Equal(payload.Length, sent);
+
+ SocketReceiveMessageFromResult result = await receiveTask;
+ Assert.Equal(payload.Length, result.ReceivedBytes);
+ Assert.Equal(payload, receiveBuffer);
+ Assert.Equal(sender.LocalEndPoint, result.RemoteEndPoint);
+ Assert.Equal(((IPEndPoint)sender.LocalEndPoint!).Address, result.PacketInformation.Address);
+ }
+
+ private static async Task RunNonPinnableMemorySendFallbackScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] payload = new byte[] { 0x71, 0x72, 0x73, 0x74 };
+ using var nonPinnableMemory = new NonPinnableMemoryManager(payload);
+ byte[] receiveBuffer = new byte[payload.Length];
+
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+ int sent = await client.SendAsync(nonPinnableMemory.Memory, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+ Assert.Equal(payload.Length, await receiveTask);
+ Assert.Equal(payload, receiveBuffer);
+ }
+
+ private static async Task RunNonPinnableMemoryReceiveFallbackScenarioAsync()
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] receiveBuffer = new byte[4];
+ using var nonPinnableMemory = new NonPinnableMemoryManager(receiveBuffer);
+ byte[] payload = new byte[] { 0x81, 0x82, 0x83, 0x84 };
+
+ Task<int> receiveTask = ToTask(server.ReceiveAsync(nonPinnableMemory.Memory, SocketFlags.None));
+ await Task.Yield();
+ Assert.Equal(payload.Length, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(payload.Length, await receiveTask);
+ Assert.Equal(payload, receiveBuffer);
+ }
+
+ private static Task RunNonPinnableMemoryFallbackScenarioAsync(bool receivePath) =>
+ receivePath ? RunNonPinnableMemoryReceiveFallbackScenarioAsync() : RunNonPinnableMemorySendFallbackScenarioAsync();
+
+ private static async Task RunNonPinnableFallbackTelemetryScenarioAsync()
+ {
+ long before = 0;
+ long after = 0;
+
+ await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+ async () =>
+ {
+ before = GetIoUringPrepareNonPinnableFallbackCounterValue();
+ await RunNonPinnableMemorySendFallbackScenarioAsync();
+ await RunNonPinnableMemoryReceiveFallbackScenarioAsync();
+ after = GetIoUringPrepareNonPinnableFallbackCounterValue();
+ },
+ (_, _) =>
+ {
+ Assert.True(
+ after > before,
+ $"Expected io_uring non-pinnable fallback telemetry to increase. before={before}, after={after}");
+ },
+ skipScenarioWhenIoUringUnavailable: true);
+ }
+
+ private static async Task RunPinnableMemoryPinReleaseLifecycleScenarioAsync()
+ {
+ await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+ async () =>
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ // Completion path: receive completes with data and must release pin.
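+ // (The cancellation and teardown paths below follow the same pattern; in each case
+ // AssertPinsReleasedAsync waits until the manager's Pin and Unpin counts reach parity.)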
+ byte[] completionPayload = new byte[] { 0x91 }; + using var completionMemory = new TrackingPinnableMemoryManager(new byte[completionPayload.Length]); + Task completionReceive = ToTask(server.ReceiveAsync(completionMemory.Memory, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(completionPayload, SocketFlags.None)); + Assert.Equal(1, await completionReceive); + Assert.Equal(completionPayload, completionMemory.GetSpan().ToArray()); + await AssertPinsReleasedAsync(completionMemory); + + // Cancellation path: pending receive canceled by token must release pin. + using var cancellationMemory = new TrackingPinnableMemoryManager(new byte[16]); + using (var cts = new CancellationTokenSource()) + { + Task canceledReceive = ToTask(server.ReceiveAsync(cancellationMemory.Memory, SocketFlags.None, cts.Token)); + await Task.Delay(20); + cts.Cancel(); + + Exception? canceledException = await Record.ExceptionAsync(async () => await canceledReceive); + AssertCanceledOrInterrupted(canceledException); + } + + await AssertPinsReleasedAsync(cancellationMemory); + + // Teardown/abort path: pending receive interrupted by close must release pin. + using var teardownMemory = new TrackingPinnableMemoryManager(new byte[16]); + Task teardownReceive = ToTask(server.ReceiveAsync(teardownMemory.Memory, SocketFlags.None)); + await Task.Yield(); + client.Dispose(); + server.Dispose(); + + Exception? teardownException = await Record.ExceptionAsync(async () => await teardownReceive); + AssertCanceledDisposedOrInterrupted(teardownException); + await AssertPinsReleasedAsync(teardownMemory); + }, + static (_, _) => { }, + skipScenarioWhenIoUringUnavailable: true); + } + + private static async Task RunProvidedBufferRegistrationLifecycleScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] receiveBuffer = new byte[1]; + Task initialReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0xA1 }, SocketFlags.None)); + Assert.Equal(1, await initialReceive); + + IoUringProvidedBufferSnapshot initialSnapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsProvidedBufferSnapshotUsable(initialSnapshot)) + { + return; + } + + Assert.Equal(initialSnapshot.TotalBufferCount, initialSnapshot.AvailableCount + initialSnapshot.InUseCount); + Assert.Equal(0, initialSnapshot.InUseCount); + + using (var cts = new CancellationTokenSource()) + { + Task canceledReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None, cts.Token)); + await Task.Yield(); + cts.Cancel(); + + Exception? 
canceledException = await Record.ExceptionAsync(async () => await canceledReceive); + AssertCanceledOrInterrupted(canceledException); + } + + await Task.Delay(50); + IoUringProvidedBufferSnapshot postCancellationSnapshot = GetIoUringProvidedBufferSnapshot(); + Assert.Equal(initialSnapshot.TotalBufferCount, postCancellationSnapshot.TotalBufferCount); + Assert.Equal(postCancellationSnapshot.TotalBufferCount, postCancellationSnapshot.AvailableCount + postCancellationSnapshot.InUseCount); + Assert.Equal(0, postCancellationSnapshot.InUseCount); + } + + private static async Task RunProvidedBufferSelectReceiveScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsProvidedBufferSnapshotUsable(beforeSnapshot)) + { + return; + } + + ulong recycleBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles"); + ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions"); + + byte[] receiveBuffer = new byte[1]; + Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + + Assert.Equal(1, await client.SendAsync(new byte[] { 0xB2 }, SocketFlags.None)); + Assert.Equal(1, await receiveTask); + Assert.Equal(0xB2, receiveBuffer[0]); + + IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot(); + ulong recycleAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles"); + ulong depletionAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions"); + + Assert.True(recycleAfter > recycleBefore, "Expected provided-buffer recycle counter to increase after a completion."); + Assert.Equal(depletionBefore, depletionAfter); + Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount + afterSnapshot.InUseCount); + Assert.Equal(0, afterSnapshot.InUseCount); + } + + private static async Task RunProvidedBufferRecycleReuseScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsProvidedBufferSnapshotUsable(beforeSnapshot)) + { + return; + } + + ulong recycleBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles"); + ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions"); + long allocationFailuresBefore = beforeSnapshot.AllocationFailureCount; + + int iterations = Math.Max(beforeSnapshot.TotalBufferCount + 64, 512); + byte[] receiveBuffer = new byte[1]; + byte[] payload = new byte[1]; + + for (int i = 0; i < iterations; i++) + { + Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + + payload[0] = unchecked((byte)i); + Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None)); + Assert.Equal(1, await receiveTask); + Assert.Equal(payload[0], receiveBuffer[0]); + } + + IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot(); + ulong recycleAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles"); + ulong depletionAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions"); + + Assert.True( + recycleAfter >= recycleBefore + (ulong)iterations, + $"Expected at 
least {iterations} provided-buffer recycle increments. before={recycleBefore}, after={recycleAfter}"); + Assert.Equal(depletionBefore, depletionAfter); + Assert.Equal(allocationFailuresBefore, afterSnapshot.AllocationFailureCount); + Assert.Equal(beforeSnapshot.TotalBufferCount, afterSnapshot.TotalBufferCount); + Assert.Equal(0, afterSnapshot.InUseCount); + Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount); + } + + private static async Task RunProvidedBufferExhaustionScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] warmupBuffer = new byte[1]; + Task warmupReceive = ToTask(server.ReceiveAsync(warmupBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0xC1 }, SocketFlags.None)); + Assert.Equal(1, await warmupReceive); + + IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsProvidedBufferSnapshotUsable(snapshot)) + { + return; + } + + ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions"); + Assert.True(TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount)); + Assert.True(forcedBufferCount > 0); + + byte[] receiveBuffer = new byte[1]; + Task receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + + Assert.Equal(1, await client.SendAsync(new byte[] { 0xC2 }, SocketFlags.None)); + Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(receiveTask, completed); + + Exception? receiveException = await Record.ExceptionAsync(async () => await receiveTask); + SocketException socketException = Assert.IsType(receiveException); + Assert.Equal(SocketError.NoBufferSpaceAvailable, socketException.SocketErrorCode); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions") > depletionBefore, + "Expected provided-buffer depletion counter to increase when ring buffers are forced unavailable."); + } + + private static async Task RunProvidedBufferMixedWorkloadScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsProvidedBufferSnapshotUsable(beforeSnapshot)) + { + return; + } + + using Socket udpReceiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + using Socket udpSender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + udpReceiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + udpSender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + ulong recycleBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles"); + ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions"); + + byte[] tcpReceiveBuffer = new byte[1]; + byte[] udpReceiveBuffer = new byte[2]; + + Task tcpReceive = ToTask(server.ReceiveAsync(tcpReceiveBuffer, SocketFlags.None)); + Task udpReceive = ToTask( + udpReceiver.ReceiveFromAsync( + udpReceiveBuffer, + SocketFlags.None, + new IPEndPoint(IPAddress.Any, 0))); + await Task.Yield(); + + Assert.Equal(1, await client.SendAsync(new byte[] { 0xD1 }, SocketFlags.None)); + Assert.Equal(2, await udpSender.SendToAsync(new byte[] { 0xE1, 0xE2 }, 
SocketFlags.None, udpReceiver.LocalEndPoint!)); + + Assert.Equal(1, await tcpReceive); + Assert.Equal(0xD1, tcpReceiveBuffer[0]); + + SocketReceiveFromResult udpResult = await udpReceive; + Assert.Equal(2, udpResult.ReceivedBytes); + Assert.Equal(0xE1, udpReceiveBuffer[0]); + Assert.Equal(0xE2, udpReceiveBuffer[1]); + + IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot(); + ulong recycleAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferRecycles"); + ulong depletionAfter = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions"); + + Assert.True(recycleAfter > recycleBefore, "Expected provided-buffer recycle counter to increase in mixed workload."); + Assert.Equal(depletionBefore, depletionAfter); + Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount + afterSnapshot.InUseCount); + Assert.Equal(0, afterSnapshot.InUseCount); + } + + private static async Task SendExactlyAsync(Socket socket, ReadOnlyMemory<byte> buffer) + { + int totalSent = 0; + while (totalSent < buffer.Length) + { + int sent = await socket.SendAsync(buffer.Slice(totalSent), SocketFlags.None); + Assert.True(sent > 0, "Socket.SendAsync returned 0 before sending all bytes."); + totalSent += sent; + } + } + + private static async Task ReceiveExactlyAsync(Socket socket, Memory<byte> buffer) + { + int totalReceived = 0; + while (totalReceived < buffer.Length) + { + int received = await socket.ReceiveAsync(buffer.Slice(totalReceived), SocketFlags.None); + Assert.True(received > 0, "Socket.ReceiveAsync returned 0 before receiving all expected bytes."); + totalReceived += received; + } + } + + private static async Task<IoUringProvidedBufferSnapshot> WaitForProvidedBufferSnapshotAsync( + Func<IoUringProvidedBufferSnapshot, bool> predicate, + int timeoutMilliseconds = 10000) + { + DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMilliseconds); + IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot(); + while (DateTime.UtcNow < deadline) + { + if (predicate(snapshot)) + { + return snapshot; + } + + await Task.Delay(50); + snapshot = GetIoUringProvidedBufferSnapshot(); + } + + return snapshot; + } + + private static async Task RunAdaptiveProvidedBufferSmallMessageShrinkScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsAdaptiveSizingUsable(beforeSnapshot)) + { + return; + } + + int initialBufferSize = beforeSnapshot.BufferSize; + Assert.True(initialBufferSize > 0); + + const int payloadSize = 64; + byte[] sendBuffer = new byte[payloadSize]; + byte[] receiveBuffer = new byte[payloadSize]; + + for (int i = 0; i < 320; i++) + { + sendBuffer.AsSpan().Fill(unchecked((byte)i)); + Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer); + await SendExactlyAsync(client, sendBuffer); + await receiveTask; + Assert.Equal(sendBuffer, receiveBuffer); + } + + IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync( + snapshot => IsAdaptiveSizingUsable(snapshot) && + (snapshot.RecommendedBufferSize < initialBufferSize || snapshot.BufferSize < initialBufferSize)); + + Assert.True( + afterSnapshot.RecommendedBufferSize < initialBufferSize || afterSnapshot.BufferSize < initialBufferSize, + $"Expected adaptive recommendation to shrink from {initialBufferSize}. 
" + + $"actual buffer={afterSnapshot.BufferSize}, recommended={afterSnapshot.RecommendedBufferSize}"); + } + + private static async Task RunAdaptiveProvidedBufferLargeMessageGrowScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsAdaptiveSizingUsable(beforeSnapshot)) + { + return; + } + + int initialBufferSize = beforeSnapshot.BufferSize; + Assert.True(initialBufferSize > 0); + + int payloadSize = initialBufferSize; + byte[] sendBuffer = new byte[payloadSize]; + byte[] receiveBuffer = new byte[payloadSize]; + sendBuffer.AsSpan().Fill(0x5A); + + for (int i = 0; i < 320; i++) + { + Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer); + await SendExactlyAsync(client, sendBuffer); + await receiveTask; + Assert.Equal(sendBuffer, receiveBuffer); + } + + IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync( + snapshot => IsAdaptiveSizingUsable(snapshot) && + (snapshot.RecommendedBufferSize > initialBufferSize || snapshot.BufferSize > initialBufferSize)); + + Assert.True( + afterSnapshot.RecommendedBufferSize > initialBufferSize || afterSnapshot.BufferSize > initialBufferSize, + $"Expected adaptive recommendation to grow from {initialBufferSize}. " + + $"actual buffer={afterSnapshot.BufferSize}, recommended={afterSnapshot.RecommendedBufferSize}"); + } + + private static async Task RunAdaptiveProvidedBufferMixedWorkloadStableScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsAdaptiveSizingUsable(beforeSnapshot)) + { + return; + } + + int initialBufferSize = beforeSnapshot.BufferSize; + Assert.True(initialBufferSize > 0); + + byte[] smallSend = new byte[64]; + byte[] smallReceive = new byte[64]; + byte[] largeSend = new byte[initialBufferSize]; + byte[] largeReceive = new byte[initialBufferSize]; + smallSend.AsSpan().Fill(0x11); + largeSend.AsSpan().Fill(0x77); + + for (int i = 0; i < 320; i++) + { + bool useLarge = (i & 1) == 1; + byte[] send = useLarge ? largeSend : smallSend; + byte[] receive = useLarge ? 
largeReceive : smallReceive; + + Task receiveTask = ReceiveExactlyAsync(server, receive); + await SendExactlyAsync(client, send); + await receiveTask; + Assert.Equal(send, receive); + } + + await Task.Delay(250); + IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot(); + Assert.True(IsAdaptiveSizingUsable(afterSnapshot)); + Assert.Equal(initialBufferSize, afterSnapshot.RecommendedBufferSize); + } + + private static async Task RunAdaptiveProvidedBufferResizeSwapNoDataLossScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsAdaptiveSizingUsable(beforeSnapshot)) + { + return; + } + + int initialBufferSize = beforeSnapshot.BufferSize; + Assert.True(initialBufferSize > 0); + + const int payloadSize = 64; + byte[] sendBuffer = new byte[payloadSize]; + byte[] receiveBuffer = new byte[payloadSize]; + for (int i = 0; i < 384; i++) + { + sendBuffer.AsSpan().Fill(unchecked((byte)i)); + Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer); + await SendExactlyAsync(client, sendBuffer); + await receiveTask; + Assert.Equal(sendBuffer, receiveBuffer); + } + + IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync( + snapshot => IsAdaptiveSizingUsable(snapshot) && snapshot.BufferSize < initialBufferSize, + timeoutMilliseconds: 15000); + + Assert.True( + afterSnapshot.BufferSize < initialBufferSize, + $"Expected adaptive resize swap to shrink active ring. initial={initialBufferSize}, current={afterSnapshot.BufferSize}"); + } + + private static async Task RunAdaptiveProvidedBufferResizeSwapConcurrentInFlightNoDataLossScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsAdaptiveSizingUsable(beforeSnapshot)) + { + return; + } + + int initialBufferSize = beforeSnapshot.BufferSize; + Assert.True(initialBufferSize > 0); + + const int batchSize = 64; + const int rounds = 24; + + // Keep many receives in flight while driving enough completions to trigger adaptive + // resize; this exercises ring-swap safety under concurrent tracked receive activity. 
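+ // Each round below posts batchSize tracked receives, yields so they reach the ring, sends + // batchSize single-byte payloads, then awaits the whole batch, so any swap lands mid-flight.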
+ for (int round = 0; round < rounds; round++) + { + Task<int>[] receiveTasks = new Task<int>[batchSize]; + byte[][] receiveBuffers = new byte[batchSize][]; + for (int i = 0; i < batchSize; i++) + { + byte[] receiveBuffer = new byte[1]; + receiveBuffers[i] = receiveBuffer; + receiveTasks[i] = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + } + + await Task.Yield(); + + for (int i = 0; i < batchSize; i++) + { + byte expected = unchecked((byte)(round + i + 1)); + Assert.Equal(1, await client.SendAsync(new[] { expected }, SocketFlags.None)); + } + + int[] completed = await Task.WhenAll(receiveTasks); + for (int i = 0; i < batchSize; i++) + { + Assert.Equal(1, completed[i]); + Assert.NotEqual(0, receiveBuffers[i][0]); + } + } + + IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync( + snapshot => IsAdaptiveSizingUsable(snapshot) && snapshot.BufferSize < initialBufferSize, + timeoutMilliseconds: 15000); + + Assert.True( + afterSnapshot.BufferSize < initialBufferSize, + $"Expected adaptive resize swap to shrink active ring under in-flight receive stress. initial={initialBufferSize}, current={afterSnapshot.BufferSize}"); + Assert.Equal(0, afterSnapshot.InUseCount); + Assert.Equal(afterSnapshot.TotalBufferCount, afterSnapshot.AvailableCount + afterSnapshot.InUseCount); + } + + private static async Task RunAdaptiveProvidedBufferDisabledScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsProvidedBufferSnapshotUsable(beforeSnapshot)) + { + return; + } + + Assert.False(beforeSnapshot.AdaptiveBufferSizingEnabled); + + int initialBufferSize = beforeSnapshot.BufferSize; + int initialRecommendedSize = beforeSnapshot.RecommendedBufferSize; + + const int payloadSize = 64; + byte[] sendBuffer = new byte[payloadSize]; + byte[] receiveBuffer = new byte[payloadSize]; + sendBuffer.AsSpan().Fill(0xA5); + + for (int i = 0; i < 320; i++) + { + Task receiveTask = ReceiveExactlyAsync(server, receiveBuffer); + await SendExactlyAsync(client, sendBuffer); + await receiveTask; + Assert.Equal(sendBuffer, receiveBuffer); + } + + await Task.Delay(250); + IoUringProvidedBufferSnapshot afterSnapshot = GetIoUringProvidedBufferSnapshot(); + Assert.True(IsProvidedBufferSnapshotUsable(afterSnapshot)); + Assert.False(afterSnapshot.AdaptiveBufferSizingEnabled); + Assert.Equal(initialBufferSize, afterSnapshot.BufferSize); + Assert.Equal(initialRecommendedSize, afterSnapshot.RecommendedBufferSize); + } + + private static async Task RunAdaptiveProvidedBufferSizingStateScenarioAsync(bool expectedEnabled) + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + // Warm up receive path so io_uring provided-buffer ring state is initialized. 
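+ // (The provided-buffer ring is created lazily on the first tracked receive, so one one-byte + // round trip is enough for the snapshot below to reflect live state rather than defaults.)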
+ byte[] receiveBuffer = new byte[1]; + Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None)); + Assert.Equal(1, await receiveTask); + + IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsProvidedBufferSnapshotUsable(snapshot)) + { + return; + } + + Assert.Equal(expectedEnabled, snapshot.AdaptiveBufferSizingEnabled); + } + + private static async Task RunProvidedBufferKernelRegistrationDisabledScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + // Warm up receive path so io_uring provided-buffer ring state is initialized. + byte[] receiveBuffer = new byte[1]; + Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None)); + Assert.Equal(1, await receiveTask); + + IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsProvidedBufferSnapshotUsable(snapshot)) + { + return; + } + + Assert.False(snapshot.HasRegisteredBuffers); + } + + private static async Task RunProvidedBufferKernelRegistrationSuccessScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + // Warm up receive path so io_uring provided-buffer ring state and telemetry are initialized. + byte[] receiveBuffer = new byte[1]; + Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None)); + Assert.Equal(1, await receiveTask); + + IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsProvidedBufferSnapshotUsable(snapshot)) + { + return; + } + + ulong successCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialSuccess"); + ulong failureCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialFailure"); + Assert.True( + successCount + failureCount > 0, + "Expected at least one registered-buffer initialization attempt."); + + // Best-effort success-path assertion: only enforce when registration succeeded on this machine. + if (!snapshot.HasRegisteredBuffers) + { + return; + } + + Assert.True(successCount > 0, "Expected success telemetry when registered buffers are active."); + } + + private static async Task RunProvidedBufferKernelRegistrationFailureNonFatalScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + // Warm up receive path so io_uring provided-buffer ring state and telemetry are initialized. + byte[] receiveBuffer = new byte[1]; + Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0x42 }, SocketFlags.None)); + Assert.Equal(1, await receiveTask); + + IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsProvidedBufferSnapshotUsable(snapshot) || snapshot.HasRegisteredBuffers) + { + // No observed registration failure in this environment. 
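+ // Nothing to assert in that case: the fallback contract below is only observable in + // environments where registered-buffer setup actually failed.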
+ return; + } + + // Registration is not active: verify provided-buffer receive path still works. + byte[] payload = new byte[4096]; + byte[] received = new byte[payload.Length]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = unchecked((byte)(i + 31)); + } + + Task receiveAllTask = ReceiveExactlyAsync(server, received); + await SendExactlyAsync(client, payload); + await receiveAllTask; + Assert.Equal(payload, received); + + ulong failureCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialFailure"); + Assert.True(failureCount > 0, "Expected failure telemetry when registered buffers are inactive."); + } + + private static async Task RunProvidedBufferKernelReregistrationOnResizeScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + IoUringProvidedBufferSnapshot beforeSnapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsAdaptiveSizingUsable(beforeSnapshot)) + { + return; + } + + ulong reregSuccessBefore = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationSuccess"); + ulong reregFailureBefore = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationFailure"); + + int initialBufferSize = beforeSnapshot.BufferSize; + Assert.True(initialBufferSize > 0); + + const int payloadSize = 64; + byte[] sendBuffer = new byte[payloadSize]; + byte[] receiveBuffer = new byte[payloadSize]; + for (int i = 0; i < 384; i++) + { + sendBuffer.AsSpan().Fill(unchecked((byte)(i + 1))); + Task receivePayloadTask = ReceiveExactlyAsync(server, receiveBuffer); + await SendExactlyAsync(client, sendBuffer); + await receivePayloadTask; + Assert.Equal(sendBuffer, receiveBuffer); + } + + IoUringProvidedBufferSnapshot afterSnapshot = await WaitForProvidedBufferSnapshotAsync( + snapshot => IsAdaptiveSizingUsable(snapshot) && snapshot.BufferSize < initialBufferSize, + timeoutMilliseconds: 15000); + + Assert.True(afterSnapshot.BufferSize < initialBufferSize); + + ulong reregSuccessAfter = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationSuccess"); + ulong reregFailureAfter = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersReregistrationFailure"); + Assert.True( + (reregSuccessAfter + reregFailureAfter) > (reregSuccessBefore + reregFailureBefore), + "Expected at least one registered-buffer re-registration attempt after adaptive resize."); + } + + private static async Task RunProvidedBufferRegisteredBuffersDataCorrectnessScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsProvidedBufferSnapshotUsable(snapshot) || !snapshot.HasRegisteredBuffers) + { + return; + } + + // Reuse the mixed workload profile to validate payload correctness with registered buffers active. 
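+ // Each iteration round-trips a 64-byte payload and then one at least as large as a single + // provided buffer, checking short and buffer-spanning receives byte-for-byte.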
+ byte[] smallSend = new byte[64]; + byte[] largeSend = new byte[Math.Max(snapshot.BufferSize, 4096)]; + byte[] smallReceive = new byte[smallSend.Length]; + byte[] largeReceive = new byte[largeSend.Length]; + + for (int i = 0; i < 64; i++) + { + smallSend.AsSpan().Fill(unchecked((byte)(i + 5))); + largeSend.AsSpan().Fill(unchecked((byte)(i + 11))); + + Task smallReceiveTask = ReceiveExactlyAsync(server, smallReceive); + await SendExactlyAsync(client, smallSend); + await smallReceiveTask; + Assert.Equal(smallSend, smallReceive); + + Task largeReceiveTask = ReceiveExactlyAsync(server, largeReceive); + await SendExactlyAsync(client, largeSend); + await largeReceiveTask; + Assert.Equal(largeSend, largeReceive); + } + } + + private static async Task RunProvidedBufferRegistrationMemoryPressureScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot(); + if (!IsProvidedBufferSnapshotUsable(snapshot)) + { + return; + } + + int payloadSize = Math.Min(snapshot.BufferSize, 16 * 1024); + payloadSize = Math.Max(payloadSize, 1024); + byte[] payload = new byte[payloadSize]; + byte[] received = new byte[payloadSize]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = unchecked((byte)(i + 41)); + } + + Task receiveTask = ReceiveExactlyAsync(server, received); + await SendExactlyAsync(client, payload); + await receiveTask; + Assert.Equal(payload, received); + + ulong successCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialSuccess"); + ulong failureCount = GetIoUringTelemetryCounterValue("_ioUringRegisteredBuffersInitialFailure"); + if (snapshot.HasRegisteredBuffers) + { + Assert.True(successCount > 0, "Expected successful registration telemetry when buffers are registered."); + } + else + { + Assert.True(failureCount > 0, "Expected failure telemetry when registration falls back under pressure."); + } + } + + private static async Task RunProvidedBufferRingForcedAllocationFailureFallbackScenarioAsync() + { + await RunTcpRoundTripAsync(4); + + IoUringProvidedBufferSnapshot snapshot = GetIoUringProvidedBufferSnapshot(); + if (!snapshot.HasIoUringPort) + { + return; + } + + Assert.False(snapshot.HasProvidedBufferRing, "Provided-buffer ring should be disabled after forced allocation failure."); + Assert.False(snapshot.SupportsProvidedBufferRings, "Capability should remain disabled when provided-buffer ring creation fails."); + + // Ensure sockets continue to function after provided-buffer OOM fallback. 
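+ // A second round trip after the forced failure shows the engine kept servicing sockets + // without a provided-buffer ring.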
+ await RunTcpRoundTripAsync(4); + } + + private static Task RunProvidedBufferTeardownOrderingContractScenarioAsync() + { + Assert.True( + SocketAsyncEngine.ValidateIoUringProvidedBufferTeardownOrderingForTest(), + "Expected teardown to unregister/dispose provided buffers before ring unmap/close."); + + return Task.CompletedTask; + } + + private static async Task RunZeroCopySendStateScenarioAsync(bool expectedEnabledWhenSupported) + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] sendBuffer = new byte[64]; + byte[] receiveBuffer = new byte[sendBuffer.Length]; + Assert.Equal(sendBuffer.Length, await client.SendAsync(sendBuffer, SocketFlags.None)); + await ReceiveExactlyAsync(server, receiveBuffer); + + IoUringZeroCopySendSnapshot snapshot = GetIoUringZeroCopySendSnapshot(); + if (!snapshot.HasIoUringPort) + { + return; + } + + if (!snapshot.SupportsSendZc) + { + Assert.False(snapshot.ZeroCopySendEnabled); + return; + } + + Assert.Equal(expectedEnabledWhenSupported, snapshot.ZeroCopySendEnabled); + } + + private static async Task RunFixedRecvStateScenarioAsync(bool expectedEnabled) + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] sendBuffer = new byte[64]; + byte[] receiveBuffer = new byte[sendBuffer.Length]; + Assert.Equal(sendBuffer.Length, await client.SendAsync(sendBuffer, SocketFlags.None)); + await ReceiveExactlyAsync(server, receiveBuffer); + + IoUringFixedRecvSnapshot snapshot = GetIoUringFixedRecvSnapshot(); + if (!snapshot.HasIoUringPort) + { + return; + } + + Assert.Equal(expectedEnabled, IsFixedRecvEnabled(snapshot)); + } + + private static async Task RunFixedRecvActivationFollowsRuntimeCapabilitiesScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] sendBuffer = new byte[64]; + byte[] receiveBuffer = new byte[sendBuffer.Length]; + Assert.Equal(sendBuffer.Length, await client.SendAsync(sendBuffer, SocketFlags.None)); + await ReceiveExactlyAsync(server, receiveBuffer); + + IoUringFixedRecvSnapshot snapshot = GetIoUringFixedRecvSnapshot(); + if (!snapshot.HasIoUringPort) + { + return; + } + + Assert.Equal(snapshot.SupportsReadFixed && snapshot.HasRegisteredBuffers, IsFixedRecvEnabled(snapshot)); + } + + private static async Task RunFixedRecvDataCorrectnessScenarioAsync() + { + IoUringFixedRecvSnapshot snapshot = GetIoUringFixedRecvSnapshot(); + if (!snapshot.HasIoUringPort || !IsFixedRecvEnabled(snapshot) || !snapshot.SupportsReadFixed || !snapshot.HasRegisteredBuffers) + { + return; + } + + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket listener = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + _ = listener; + + byte[] payload = new byte[32 * 1024]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = unchecked((byte)(i * 13)); + } + + byte[] received = new byte[payload.Length]; + Task receiveTask = ReceiveExactlyAsync(server, received); + Assert.Equal(payload.Length, await client.SendAsync(payload, SocketFlags.None)); + await receiveTask; + Assert.Equal(payload, received); + } + + private static async Task RunSqPollBasicSendReceiveScenarioAsync() + { + EnableSqPollAppContextOptIn(); + await 
RunTcpRoundTripAsync(8); + + IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot(); + if (!IsSqPollActive(snapshot)) + { + return; + } + + await RunTcpRoundTripAsync(16); + } + + private static async Task RunDeferTaskrunEventLoopInitScenarioAsync() + { + // TCP round-trips exercise io_uring_enter from the event loop thread. + // With DEFER_TASKRUN + SINGLE_ISSUER, the kernel checks that io_uring_enter + // is called from the same thread that called io_uring_setup (submitter_task). + // If initialization ran on the wrong thread, io_uring_enter would return + // EEXIST and all operations would fail. + await RunTcpRoundTripAsync(8); + + IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot(); + if (!snapshot.HasIoUringPort) + { + return; + } + + // Non-SQPOLL engines negotiate DEFER_TASKRUN by default. + Assert.False( + snapshot.SqPollEnabled, + "SQPOLL should be disabled in the default DEFER_TASKRUN initialization scenario."); + Assert.True( + snapshot.DeferTaskrunEnabled, + "Non-SQPOLL io_uring engines should negotiate DEFER_TASKRUN."); + + // Additional round-trips after the assertion to confirm ongoing stability. + await RunTcpRoundTripAsync(8); + } + + private static async Task RunSqPollRequestedScenarioAsync() + { + EnableSqPollAppContextOptIn(); + await RunTcpRoundTripAsync(8); + + IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot(); + // Some Helix legs can run without an active io_uring port (kernel/config/runtime gating). + // In that case this SQPOLL-request scenario is not applicable. + if (!snapshot.HasIoUringPort) + { + return; + } + + if (!snapshot.SqPollEnabled) + { + // SQPOLL wasn't active on this leg, but socket operations must continue to succeed. + await RunTcpRoundTripAsync(16); + return; + } + + Assert.False( + snapshot.DeferTaskrunEnabled, + "SQPOLL and DEFER_TASKRUN must be mutually exclusive in negotiated io_uring setup flags."); + } + + private static async Task RunSqPollWakeupAfterIdleScenarioAsync() + { + EnableSqPollAppContextOptIn(); + await RunTcpRoundTripAsync(4); + + IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot(); + if (!IsSqPollActive(snapshot)) + { + return; + } + + ulong wakeupsBefore = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups"); + + // Let the kernel SQPOLL thread go idle and set SQ_NEED_WAKEUP. + bool observedNeedWakeup = false; + for (int i = 0; i < 25; i++) + { + await Task.Delay(100); + if (IsAnyIoUringSqPollEngineNeedingWakeup()) + { + observedNeedWakeup = true; + break; + } + } + + if (!observedNeedWakeup) + { + return; + } + + await RunTcpRoundTripAsync(2); + + ulong wakeupsAfter = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups"); + Assert.True( + wakeupsAfter > wakeupsBefore, + $"Expected SQPOLL wakeups to increase after idle wake path. 
before={wakeupsBefore}, after={wakeupsAfter}"); + } + + private static async Task RunSqPollMultishotRecvScenarioAsync() + { + EnableSqPollAppContextOptIn(); + await RunTcpRoundTripAsync(4); + + IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot(); + if (!IsSqPollActive(snapshot)) + { + return; + } + + await RunMultishotRecvBasicScenarioAsync(iterations: 32); + } + + private static async Task RunSqPollZeroCopySendScenarioAsync() + { + EnableSqPollAppContextOptIn(); + await RunTcpRoundTripAsync(4); + + IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot(); + if (!IsSqPollActive(snapshot)) + { + return; + } + + await RunZeroCopySendLargeBufferRoundTripScenarioAsync(); + } + + private static async Task RunSqPollTelemetryCountersScenarioAsync() + { + EnableSqPollAppContextOptIn(); + await RunTcpRoundTripAsync(4); + + IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot(); + if (!IsSqPollActive(snapshot)) + { + return; + } + + ulong skippedBefore = GetIoUringTelemetryCounterValue("_ioUringSqPollSubmissionsSkipped"); + ulong wakeupsBefore = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups"); + + await RunTcpRoundTripAsync(32); + ulong skippedAfterBurst = GetIoUringTelemetryCounterValue("_ioUringSqPollSubmissionsSkipped"); + Assert.True( + skippedAfterBurst > skippedBefore, + $"Expected SQPOLL submission-skipped counter to increase. before={skippedBefore}, after={skippedAfterBurst}"); + + await Task.Delay(1500); + await RunTcpRoundTripAsync(2); + + ulong wakeupsAfter = GetIoUringTelemetryCounterValue("_ioUringSqPollWakeups"); + Assert.True( + wakeupsAfter >= wakeupsBefore, + $"Expected SQPOLL wakeup counter to be readable/nondecreasing. before={wakeupsBefore}, after={wakeupsAfter}"); + } + + private static async Task RunSqPollNeedWakeupContractScenarioAsync() + { + EnableSqPollAppContextOptIn(); + await RunTcpRoundTripAsync(4); + + IoUringSqPollSnapshot snapshot = GetIoUringSqPollSnapshot(); + if (!IsSqPollActive(snapshot)) + { + return; + } + + Assert.True( + ValidateSqNeedWakeupMatchesRawSqFlagBit(), + "Expected at least one active SQPOLL io_uring engine for SqNeedWakeup contract validation."); + } + + private static bool IsZeroCopySendEnabledAndSupported(out IoUringZeroCopySendSnapshot snapshot) + { + snapshot = GetIoUringZeroCopySendSnapshot(); + return snapshot.HasIoUringPort && snapshot.SupportsSendZc && snapshot.ZeroCopySendEnabled; + } + + private static bool IsZeroCopySendMessageEnabledAndSupported(out IoUringZeroCopySendSnapshot snapshot) + { + snapshot = GetIoUringZeroCopySendSnapshot(); + return snapshot.HasIoUringPort && snapshot.SupportsSendMsgZc && snapshot.ZeroCopySendEnabled; + } + + private static async Task RunZeroCopySendLargeBufferRoundTripScenarioAsync() + { + if (!IsZeroCopySendEnabledAndSupported(out _)) + { + return; + } + + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket listener = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + _ = listener; + + byte[] payload = new byte[64 * 1024]; + byte[] received = new byte[payload.Length]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = unchecked((byte)i); + } + + Task receiveTask = ReceiveExactlyAsync(server, received); + int sent = await client.SendAsync(payload, SocketFlags.None); + Assert.Equal(payload.Length, sent); + await receiveTask; + Assert.Equal(payload, received); + } + + private static async Task RunZeroCopySendSmallBufferUsesRegularSendWithForcedSendErrorScenarioAsync() + { + if (!IsZeroCopySendEnabledAndSupported(out 
_)) + { + return; + } + + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket listener = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + _ = listener; + + byte[] smallPayload = new byte[1024]; + // forceEcanceledOnceMask: "send" is set by the caller. Small payloads should use regular SEND, + // so the first send is expected to observe the injected cancellation/interruption. + Exception? sendException = await Record.ExceptionAsync(async () => await client.SendAsync(smallPayload, SocketFlags.None)); + AssertCanceledOrInterrupted(sendException); + + byte[] verificationPayload = new byte[] { 0x5A }; + byte[] verificationReceive = new byte[1]; + Task<int> verificationReceiveTask = ToTask(server.ReceiveAsync(verificationReceive, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(verificationPayload, SocketFlags.None)); + Assert.Equal(1, await verificationReceiveTask); + Assert.Equal(verificationPayload[0], verificationReceive[0]); + } + + private static async Task RunZeroCopySendNotifCqeReleasesPinHoldsScenarioAsync() + { + if (!IsZeroCopySendEnabledAndSupported(out _)) + { + return; + } + + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket listener = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + _ = listener; + + byte[] payload = new byte[128 * 1024]; + byte[] received = new byte[payload.Length]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = unchecked((byte)(i + 1)); + } + + const int iterations = 8; + for (int i = 0; i < iterations; i++) + { + Task receiveTask = ReceiveExactlyAsync(server, received); + int sent = await client.SendAsync(payload, SocketFlags.None); + Assert.Equal(payload.Length, sent); + await receiveTask; + Assert.Equal(payload, received); + } + + IoUringZeroCopyPinHoldSnapshot releasedSnapshot = await WaitForZeroCopyPinHoldSnapshotAsync( + static snapshot => !snapshot.HasIoUringPort || (snapshot.ActivePinHolds == 0 && snapshot.PendingNotificationCount == 0)); + if (!releasedSnapshot.HasIoUringPort) + { + return; + } + + Assert.Equal(0, releasedSnapshot.ActivePinHolds); + Assert.Equal(0, releasedSnapshot.PendingNotificationCount); + } + + private static async Task RunZeroCopySendResetStormSlotRecoveryScenarioAsync() + { + if (!IsZeroCopySendEnabledAndSupported(out _)) + { + return; + } + + const int ConcurrentSendCount = 512; + const int SlotPressureDelta = 32; + TimeSpan runDuration = TimeSpan.FromSeconds(60); + + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(ConcurrentSendCount); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + int baselineSlotsInUse = GetIoUringCompletionSlotsInUseForTest(); + IoUringZeroCopyPinHoldSnapshot baselineSnapshot = GetIoUringZeroCopyPinHoldSnapshot(); + + byte[] payload = new byte[64 * 1024]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = unchecked((byte)(i + 11)); + } + + DateTime deadline = DateTime.UtcNow + runDuration; + int rounds = 0; + int roundsWithConnectionReset = 0; + bool observedPendingNotifications = false; + // Long-running reset churn is intentional: leaked pending-NOTIF slots tend to show + // up only after repeated mid-flight resets, not short happy-path bursts. 
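+ // Round shape: accept a fresh pair, flood ConcurrentSendCount large sends, wait for + // completion-slot pressure, then dispose the receiver with linger=0 so in-flight sends race a hard RST.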
+ while (DateTime.UtcNow < deadline) + { + (Socket client, Socket server) = await AcceptConnectedTcpPairAsync(listener, endpoint); + using (client) + using (server) + { + server.LingerState = new LingerOption(enable: true, seconds: 0); + var sendTasks = new Task[ConcurrentSendCount]; + for (int i = 0; i < sendTasks.Length; i++) + { + sendTasks[i] = ToTask(client.SendAsync(payload, SocketFlags.None)); + } + + // Wait for slot pressure rather than sleeping arbitrarily so the test + // only resets once a meaningful in-flight SEND_ZC wave exists. + Assert.True( + await WaitForIoUringCompletionSlotsInUseAboveAsync(baselineSlotsInUse, SlotPressureDelta, timeoutMilliseconds: 2_000), + $"Expected completion slots to exceed baseline {baselineSlotsInUse} by at least {SlotPressureDelta}, observed {GetIoUringCompletionSlotsInUseForTest()}."); + + if (GetIoUringZeroCopyPinHoldSnapshot().PendingNotificationCount > 0) + { + observedPendingNotifications = true; + } + + server.Dispose(); + + bool roundSawConnectionReset = false; + for (int i = 0; i < sendTasks.Length; i++) + { + Exception? ex = await Record.ExceptionAsync(async () => await sendTasks[i]); + if (ex is null) + { + continue; + } + + if (ex is SocketException socketException) + { + if (socketException.SocketErrorCode == SocketError.ConnectionReset) + { + roundSawConnectionReset = true; + } + + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.ConnectionAborted || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted || + socketException.SocketErrorCode == SocketError.Shutdown, + $"Unexpected socket error during reset-churn SEND_ZC stress: {socketException.SocketErrorCode}"); + } + else + { + Assert.True( + ex is ObjectDisposedException || ex is OperationCanceledException, + $"Unexpected exception during reset-churn SEND_ZC stress: {ex}"); + } + } + + if (roundSawConnectionReset) + { + roundsWithConnectionReset++; + } + } + + rounds++; + } + + Assert.True(rounds > 0, "Expected at least one reset-churn round in the SEND_ZC recovery scenario."); + Assert.True( + observedPendingNotifications, + "Expected to observe at least one in-flight pending SEND_ZC notification during reset-churn stress."); + Assert.True( + (double)roundsWithConnectionReset / rounds >= 0.10, + $"Expected at least 10% of reset-churn rounds to include ConnectionReset; observed {roundsWithConnectionReset}/{rounds}."); + + IoUringZeroCopyPinHoldSnapshot settledSnapshot = await WaitForZeroCopyPinHoldSnapshotAsync( + snapshot => !snapshot.HasIoUringPort || + (snapshot.ActivePinHolds == baselineSnapshot.ActivePinHolds && + snapshot.PendingNotificationCount == baselineSnapshot.PendingNotificationCount), + timeoutMilliseconds: 30_000); + if (!settledSnapshot.HasIoUringPort) + { + return; + } + + Assert.Equal(baselineSnapshot.ActivePinHolds, settledSnapshot.ActivePinHolds); + Assert.Equal(baselineSnapshot.PendingNotificationCount, settledSnapshot.PendingNotificationCount); + Assert.True( + await WaitForIoUringCompletionSlotsInUseAtMostAsync(baselineSlotsInUse, timeoutMilliseconds: 30_000), + $"Expected completion slots to recover to baseline {baselineSlotsInUse}, observed {GetIoUringCompletionSlotsInUseForTest()}."); + + ulong completionSlotExhaustionsAfterStress = GetIoUringTelemetryCounterValue("_ioUringCompletionSlotExhaustions"); + await RunZeroCopySendLargeBufferRoundTripScenarioAsync(); + ulong 
completionSlotExhaustionsAfterRecovery = GetIoUringTelemetryCounterValue("_ioUringCompletionSlotExhaustions"); + Assert.Equal( + completionSlotExhaustionsAfterStress, + completionSlotExhaustionsAfterRecovery); + } + + private static async Task RunZeroCopySendPartialSendResubmissionScenarioAsync() + { + if (!IsZeroCopySendEnabledAndSupported(out _)) + { + return; + } + + await RunLargeSendWithBackpressureAsync(useBufferListSend: false); + } + + private static async Task RunZeroCopySendCompletionPinLifetimeScenarioAsync() + { + if (!IsZeroCopySendEnabledAndSupported(out _)) + { + return; + } + + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket listener = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + _ = listener; + + byte[] payload = new byte[96 * 1024]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = unchecked((byte)(i + 3)); + } + + using var trackingMemory = new TrackingPinnableMemoryManager(payload); + byte[] received = new byte[payload.Length]; + Task receiveTask = ReceiveExactlyAsync(server, received); + int sent = await client.SendAsync(trackingMemory.Memory, SocketFlags.None); + Assert.Equal(payload.Length, sent); + await receiveTask; + await AssertPinsReleasedAsync(trackingMemory); + Assert.Equal(payload, received); + } + + private static async Task RunZeroCopySendUnsupportedOpcodeFallbackScenarioAsync() + { + SocketAsyncEngine[] engines = SocketAsyncEngine.GetActiveIoUringEnginesForTest(); + var overrides = new List<(SocketAsyncEngine Engine, bool SupportsSendZc, bool ZeroCopyEnabled)>(engines.Length); + foreach (SocketAsyncEngine engine in engines) + { + overrides.Add((engine, engine.SupportsOpSendZcForTest, engine.ZeroCopySendEnabledForTest)); + engine.SupportsOpSendZcForTest = false; + engine.ZeroCopySendEnabledForTest = false; + } + + if (engines.Length == 0) + { + return; + } + + try + { + IoUringZeroCopySendSnapshot snapshot = GetIoUringZeroCopySendSnapshot(); + Assert.False(snapshot.SupportsSendZc); + Assert.False(snapshot.ZeroCopySendEnabled); + + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] payload = new byte[64 * 1024]; + byte[] received = new byte[payload.Length]; + Task receiveTask = ReceiveExactlyAsync(server, received); + int sent = await client.SendAsync(payload, SocketFlags.None); + Assert.Equal(payload.Length, sent); + await receiveTask; + Assert.Equal(payload, received); + } + finally + { + foreach ((SocketAsyncEngine engine, bool supports, bool enabled) in overrides) + { + engine.SupportsOpSendZcForTest = supports; + engine.ZeroCopySendEnabledForTest = enabled; + } + } + } + + private static async Task RunZeroCopySendBufferListSegmentThresholdScenarioAsync() + { + if (!IsZeroCopySendMessageEnabledAndSupported(out _)) + { + return; + } + + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket listener = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + _ = listener; + + const int segmentCount = 8; + const int segmentSize = 4 * 1024; + int payloadLength = segmentCount * segmentSize; + byte[] payload = new byte[payloadLength]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = unchecked((byte)(i + 17)); + } + + var sendBuffers = new List<ArraySegment<byte>>(segmentCount); + for (int i = 0; i < segmentCount; i++) + { + sendBuffers.Add(new ArraySegment<byte>(payload, i * segmentSize, segmentSize)); + } + + byte[] received = 
new byte[payload.Length]; + Task receiveTask = ReceiveExactlyAsync(server, received); + int sent = await client.SendAsync(sendBuffers, SocketFlags.None); + Assert.Equal(payload.Length, sent); + await receiveTask; + Assert.Equal(payload, received); + } + + private static async Task RunZeroCopySendToAboveThresholdScenarioAsync() + { + if (!IsZeroCopySendMessageEnabledAndSupported(out _)) + { + return; + } + + using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + sender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + + byte[] payload = new byte[20 * 1024]; + for (int i = 0; i < payload.Length; i++) + { + payload[i] = unchecked((byte)(i + 23)); + } + + byte[] receiveBuffer = new byte[payload.Length]; + Task<SocketReceiveFromResult> receiveTask = + ToTask(receiver.ReceiveFromAsync(receiveBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0))); + await Task.Yield(); + + int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!); + Assert.Equal(payload.Length, sent); + + SocketReceiveFromResult receiveResult = await receiveTask; + Assert.Equal(payload.Length, receiveResult.ReceivedBytes); + Assert.Equal(payload, receiveBuffer); + Assert.Equal(sender.LocalEndPoint, receiveResult.RemoteEndPoint); + } + + private static async Task RunMultishotRecvBasicScenarioAsync(int iterations) + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + if (!IsIoUringMultishotRecvSupported()) + { + return; + } + + ulong reuseBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvReuse"); + ulong asyncCancelBefore = GetIoUringTelemetryCounterValue("_ioUringAsyncCancelRequestCqes"); + byte[] receiveBuffer = new byte[1]; + byte[] payload = new byte[1]; + for (int i = 0; i < iterations; i++) + { + Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + + payload[0] = unchecked((byte)(i + 1)); + Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None)); + Assert.Equal(1, await receiveTask); + Assert.Equal(payload[0], receiveBuffer[0]); + } + + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true), + "Expected persistent multishot recv to remain armed after repeated ReceiveAsync calls."); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvReuse") > reuseBefore, + "Expected ReceiveAsync calls to reuse an armed multishot recv (TryReplace path)."); + Assert.Equal( + asyncCancelBefore, + GetIoUringTelemetryCounterValue("_ioUringAsyncCancelRequestCqes")); + } + + private static async Task RunMultishotRecvCancellationScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket listener = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + _ = listener; + _ = client; + + if (!IsIoUringMultishotRecvSupported()) + { + return; + } + + ulong terminationBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination"); + byte[] receiveBuffer = new byte[16]; + using var cts = new CancellationTokenSource(); + Task<int> pendingReceive = ToTask(server.ReceiveAsync(receiveBuffer.AsMemory(), SocketFlags.None, cts.Token)); + await Task.Yield(); + Assert.True( + await 
WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true), + "Expected persistent multishot recv to arm before cancellation."); + + cts.Cancel(); + Task completed = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(pendingReceive, completed); + Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive); + AssertCanceledOrInterrupted(ex); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: false), + "Expected persistent multishot recv to disarm after cancellation."); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination") > terminationBefore, + "Expected cancellation to produce a terminal persistent multishot recv completion."); + } + + private static async Task RunMultishotRecvPeerCloseScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + if (!IsIoUringMultishotRecvSupported()) + { + return; + } + + ulong terminationBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination"); + byte[] receiveBuffer = new byte[8]; + Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + + client.Shutdown(SocketShutdown.Both); + client.Dispose(); + + Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(receiveTask, completed); + + Exception? ex = await Record.ExceptionAsync(async () => await receiveTask); + if (ex is null) + { + Assert.Equal(0, await receiveTask); + } + else + { + SocketException socketException = Assert.IsType<SocketException>(ex); + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error after multishot peer close: {socketException.SocketErrorCode}"); + } + + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: false), + "Expected persistent multishot recv to disarm after terminal peer-close completion."); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination") > terminationBefore, + "Expected terminal completion to increment persistent multishot recv termination telemetry."); + } + + private static async Task RunPersistentMultishotRecvProvidedBufferExhaustionScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + if (!IsIoUringMultishotRecvSupported()) + { + return; + } + + byte[] receiveBuffer = new byte[1]; + Task<int> armReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0xC3 }, SocketFlags.None)); + Assert.Equal(1, await armReceive); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true), + "Expected persistent multishot recv to arm before forced provided-buffer exhaustion."); + + ulong depletionBefore = GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions"); + ulong terminationBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination"); + + Assert.True(TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount)); + 
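// forcedBufferCount is how many buffers the test hook checked out to starve the ring; + // the assertion below requires at least one for the forced exhaustion to be meaningful. + 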
Assert.True(forcedBufferCount > 0); + + Task<int> exhaustedReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0xC4 }, SocketFlags.None)); + Task exhaustedCompleted = await Task.WhenAny(exhaustedReceive, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(exhaustedReceive, exhaustedCompleted); + + Exception? exhaustedException = await Record.ExceptionAsync(async () => await exhaustedReceive); + SocketException exhaustedSocketException = Assert.IsType<SocketException>(exhaustedException); + Assert.Equal(SocketError.NoBufferSpaceAvailable, exhaustedSocketException.SocketErrorCode); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: false), + "Expected persistent multishot recv to disarm after ENOBUFS terminal completion."); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringProvidedBufferDepletions") > depletionBefore, + "Expected provided-buffer depletion counter to increase after forced exhaustion."); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination") > terminationBefore, + "Expected persistent multishot recv termination counter to increase after ENOBUFS."); + + Assert.True(TryRecycleForcedIoUringProvidedBufferRingForTest(out int recycledBufferCount)); + Assert.True(recycledBufferCount > 0, "Expected forced checked-out provided buffers to be recycled for recovery."); + + Task<int> recoveredReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0xC5 }, SocketFlags.None)); + Assert.Equal(1, await recoveredReceive); + Assert.Equal(0xC5, receiveBuffer[0]); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true), + "Expected persistent multishot recv to re-arm after provided buffers were recycled."); + } + + private static async Task RunPersistentMultishotRecvShapeChangeScenarioAsync() + { + if (!IsIoUringMultishotRecvSupported()) + { + return; + } + + using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + sender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + receiver.Connect(sender.LocalEndPoint!); + sender.Connect(receiver.LocalEndPoint!); + + byte[] receiveBuffer = new byte[1]; + Task<int> armReceive = ToTask(receiver.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await sender.SendAsync(new byte[] { 0xD1 }, SocketFlags.None)); + Assert.Equal(1, await armReceive); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(receiver, expectedArmed: true), + "Expected persistent multishot recv to arm before shape-change scenario."); + + ulong terminationBefore = GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination"); + + byte[] receiveFromBuffer = new byte[1]; + Task<SocketReceiveFromResult> receiveFromTask = ToTask( + receiver.ReceiveFromAsync(receiveFromBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0))); + await Task.Yield(); + Assert.Equal(1, await sender.SendAsync(new byte[] { 0xD2 }, SocketFlags.None)); + SocketReceiveFromResult receiveFromResult = await receiveFromTask; + Assert.Equal(1, receiveFromResult.ReceivedBytes); + Assert.Equal(0xD2, receiveFromBuffer[0]); + + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(receiver, 
expectedArmed: false), + "Expected persistent multishot recv to disarm when receive shape switches to ReceiveFromAsync."); + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringPersistentMultishotRecvTermination") > terminationBefore, + "Expected shape-change cancellation to increment persistent multishot recv terminations."); + + Task<int> rearmReceive = ToTask(receiver.ReceiveAsync(receiveBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await sender.SendAsync(new byte[] { 0xD3 }, SocketFlags.None)); + Assert.Equal(1, await rearmReceive); + Assert.Equal(0xD3, receiveBuffer[0]); + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(receiver, expectedArmed: true), + "Expected persistent multishot recv to re-arm after shape-change operation completed."); + } + + private static async Task RunPersistentMultishotRecvConcurrentCloseRaceScenarioAsync(int iterations) + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(Math.Max(4, iterations)); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + if (!IsIoUringMultishotRecvSupported()) + { + return; + } + + for (int i = 0; i < iterations; i++) + { + var pair = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket client = pair.Client; + using Socket server = pair.Server; + + byte[] armBuffer = new byte[1]; + Task<int> armReceive = ToTask(server.ReceiveAsync(armBuffer, SocketFlags.None)); + await Task.Yield(); + Assert.Equal(1, await client.SendAsync(new byte[] { 0xE1 }, SocketFlags.None)); + Assert.Equal(1, await armReceive); + + Assert.True( + await WaitForPersistentMultishotRecvArmedStateAsync(server, expectedArmed: true), + "Expected persistent multishot recv to arm before concurrent close race."); + + Task<int> pendingReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None)); + await Task.Yield(); + + _ = Task.Run(() => + { + server.Dispose(); + client.Dispose(); + }); + + Task completed = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15))); + Assert.Same(pendingReceive, completed); + + Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive); + if (ex is SocketException socketException) + { + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error from persistent multishot recv close race: {socketException.SocketErrorCode}"); + } + else if (ex is not ObjectDisposedException and not null) + { + throw ex; + } + } + } + + private static Task RunPersistentMultishotRecvQueueSaturationScenarioAsync() + { + using Socket socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + Assert.True(SocketAsyncContext.TryGetSocketAsyncContextForTest(socket, out SocketAsyncContext? 
context)); + Assert.NotNull(context); + + byte[] payload = new byte[] { 0xE7 }; + for (int i = 0; i < 16; i++) + { + Assert.True(context!.TryBufferEarlyPersistentMultishotRecvData(payload)); + } + + Assert.False(context!.TryBufferEarlyPersistentMultishotRecvData(payload)); + Assert.Equal(16, GetPersistentMultishotRecvBufferedCount(socket)); + return Task.CompletedTask; + } + + private static async Task RunNetworkStreamReadAsyncCancellationTokenScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + using var networkStream = new NetworkStream(server, ownsSocket: false); + + byte[] readBuffer = new byte[1]; + using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(15)); + ValueTask<int> readTask = networkStream.ReadAsync(readBuffer, cts.Token); + await Task.Yield(); + + Assert.Equal(1, await client.SendAsync(new byte[] { 0xF1 }, SocketFlags.None)); + Assert.Equal(1, await readTask); + Assert.Equal(0xF1, readBuffer[0]); + } + + private static async Task RunReceiveAsyncSocketAsyncEventArgsBufferListScenarioAsync() + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + byte[] receiveBuffer = new byte[1]; + using var receiveEventArgs = new SocketAsyncEventArgs + { + BufferList = new List<ArraySegment<byte>> + { + new ArraySegment<byte>(receiveBuffer) + } + }; + + Task<SocketAsyncEventArgs> receiveTask = StartSocketAsyncEventArgsOperation( + server, + receiveEventArgs, + static (s, args) => s.ReceiveAsync(args)); + await Task.Yield(); + + Assert.Equal(1, await client.SendAsync(new byte[] { 0xF2 }, SocketFlags.None)); + SocketAsyncEventArgs completedReceive = await receiveTask; + Assert.Equal(SocketError.Success, completedReceive.SocketError); + Assert.Equal(1, completedReceive.BytesTransferred); + Assert.Equal(0xF2, receiveBuffer[0]); + Assert.False( + IsPersistentMultishotRecvArmed(server), + "SAEA BufferList receive path should not arm persistent multishot recv state."); + } + + private static async Task RunMultishotAcceptBasicScenarioAsync(int connectionCount) + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(connectionCount); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + if (!IsIoUringMultishotAcceptSupported()) + { + return; + } + + Task<Socket> firstAcceptTask = listener.AcceptAsync(); + Assert.True( + await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true), + "Multishot accept was not armed while first accept was pending."); + + using (Socket firstClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp)) + { + await firstClient.ConnectAsync(endpoint); + using Socket firstServer = await AwaitWithTimeoutAsync(firstAcceptTask, "first multishot accept"); + await AssertConnectedPairRoundTripAsync(firstClient, firstServer, 0x41); + } + + for (int i = 1; i < connectionCount; i++) + { + (Socket clientSocket, Socket serverSocket) = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket client = clientSocket; + using Socket server = serverSocket; + await AssertConnectedPairRoundTripAsync(client, server, unchecked((byte)(0x41 + i))); + } + } + + private static async Task RunMultishotAcceptPrequeueScenarioAsync(int prequeuedConnectionCount) + { + using Socket listener = new 
+
+        private static async Task RunMultishotAcceptPrequeueScenarioAsync(int prequeuedConnectionCount)
+        {
+            using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            listener.Listen(prequeuedConnectionCount + 2);
+            IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+            if (!IsIoUringMultishotAcceptSupported())
+            {
+                return;
+            }
+
+            // Arm multishot accept once, then connect a burst of clients before issuing
+            // subsequent AcceptAsync calls to create a pre-queue opportunity.
+            Task<Socket> armingAcceptTask = listener.AcceptAsync();
+            Assert.True(
+                await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true),
+                "Multishot accept was not armed while arming accept was pending.");
+
+            var connectedClients = new List<Socket>(prequeuedConnectionCount + 1);
+            try
+            {
+                var connectTasks = new List<Task>(prequeuedConnectionCount + 1);
+                for (int i = 0; i < prequeuedConnectionCount + 1; i++)
+                {
+                    var client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+                    connectedClients.Add(client);
+                    connectTasks.Add(client.ConnectAsync(endpoint));
+                }
+
+                await Task.WhenAll(connectTasks);
+                using Socket armingServer = await AwaitWithTimeoutAsync(armingAcceptTask, "arming multishot accept");
+
+                DateTime deadline = DateTime.UtcNow + TimeSpan.FromSeconds(5);
+                int queueCount = 0;
+                while (DateTime.UtcNow < deadline)
+                {
+                    queueCount = GetListenerMultishotAcceptQueueCount(listener);
+                    if (queueCount > 0)
+                    {
+                        break;
+                    }
+
+                    await Task.Delay(25);
+                }
+
+                Assert.True(queueCount > 0, "Expected at least one pre-accepted connection to be queued.");
+
+                for (int i = 0; i < prequeuedConnectionCount; i++)
+                {
+                    using Socket _ = await AwaitWithTimeoutAsync(listener.AcceptAsync(), "prequeued accept completion");
+                }
+            }
+            finally
+            {
+                foreach (Socket client in connectedClients)
+                {
+                    client.Dispose();
+                }
+            }
+        }
+
+        private static async Task RunMultishotAcceptListenerCloseScenarioAsync()
+        {
+            using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            listener.Listen(4);
+            IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+            if (!IsIoUringMultishotAcceptSupported())
+            {
+                return;
+            }
+
+            Task<Socket> firstAcceptTask = listener.AcceptAsync();
+            Assert.True(
+                await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true),
+                "Multishot accept was not armed while first accept was pending.");
+
+            using (Socket firstClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp))
+            {
+                await firstClient.ConnectAsync(endpoint);
+                using Socket firstServer = await AwaitWithTimeoutAsync(firstAcceptTask, "first accept before listener close");
+                await AssertConnectedPairRoundTripAsync(firstClient, firstServer, 0x71);
+            }
+
+            Task<Socket> pendingAccept = listener.AcceptAsync();
+            await Task.Yield();
+            listener.Dispose();
+
+            Task completed = await Task.WhenAny(pendingAccept, Task.Delay(TimeSpan.FromSeconds(15)));
+            Assert.Same(pendingAccept, completed);
+
+            Exception? acceptException = await Record.ExceptionAsync(async () => await pendingAccept);
+            Assert.NotNull(acceptException);
+            Assert.True(
+                acceptException is ObjectDisposedException ||
+                acceptException is SocketException,
+                $"Unexpected pending-accept exception after listener close: {acceptException}");
+
+            Assert.Equal(0, GetListenerMultishotAcceptQueueCount(listener));
+            Assert.False(IsListenerMultishotAcceptArmed(listener));
+        }
+
+        private static async Task RunMultishotAcceptTeardownRaceScenarioAsync(int iterations)
+        {
+            if (!IsIoUringMultishotAcceptSupported())
+            {
+                return;
+            }
+
+            for (int i = 0; i < iterations; i++)
+            {
+                await RunMultishotAcceptListenerCloseScenarioAsync();
+            }
+        }
+
+        private static async Task RunMultishotAcceptDisposeDuringArmingRaceScenarioAsync(int iterations)
+        {
+            if (!IsIoUringMultishotAcceptSupported())
+            {
+                return;
+            }
+
+            for (int i = 0; i < iterations; i++)
+            {
+                using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+                listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+                listener.Listen(1);
+
+                Task<Socket> pendingAccept = listener.AcceptAsync();
+                Task disposeTask = Task.Run(listener.Dispose);
+
+                Task completed = await Task.WhenAny(pendingAccept, Task.Delay(TimeSpan.FromSeconds(15)));
+                Assert.Same(pendingAccept, completed);
+                await disposeTask;
+
+                Exception? acceptException = await Record.ExceptionAsync(async () => await pendingAccept);
+                Assert.NotNull(acceptException);
+                Assert.True(
+                    acceptException is ObjectDisposedException || acceptException is SocketException,
+                    $"Unexpected accept exception during dispose/arm race at iteration {i}: {acceptException}");
+            }
+        }
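The accept scenarios above bound every await with AwaitWithTimeoutAsync, whose definition is outside this hunk. A plausible sketch, assuming it simply fails the test with the supplied description when a fixed deadline passes:

        // Hypothetical shape of the AwaitWithTimeoutAsync helper used above; the timeout
        // value and failure message are assumptions, and the real helper lives elsewhere in this file.
        private static async Task<TResult> AwaitWithTimeoutAsync<TResult>(Task<TResult> task, string description)
        {
            Task completed = await Task.WhenAny(task, Task.Delay(TimeSpan.FromSeconds(15)));
            Assert.True(ReferenceEquals(task, completed), $"Timed out awaiting {description}.");
            return await task;
        }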
+
+        private static async Task RunMultishotAcceptPrepareUnsupportedOneShotFallbackScenarioAsync()
+        {
+            // Prime socket engine initialization so s_engines contains any active io_uring engines.
+            await RunTcpRoundTripAsync(4);
+
+            SocketAsyncEngine[] engines = SocketAsyncEngine.GetActiveIoUringEnginesForTest();
+            var overrides = new List<(SocketAsyncEngine Engine, bool SupportsMultishotAccept)>(engines.Length);
+            foreach (SocketAsyncEngine engine in engines)
+            {
+                overrides.Add((engine, engine.SupportsMultishotAcceptForTest));
+                engine.SupportsMultishotAcceptForTest = false;
+            }
+
+            if (engines.Length == 0)
+            {
+                return;
+            }
+
+            try
+            {
+                using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+                listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+                listener.Listen(2);
+                IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+                Task<Socket> acceptTask = listener.AcceptAsync();
+                await Task.Yield();
+                Assert.False(IsListenerMultishotAcceptArmed(listener), "Listener should remain in one-shot accept mode.");
+
+                using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+                await client.ConnectAsync(endpoint);
+                using Socket server = await AwaitWithTimeoutAsync(acceptTask, "one-shot accept fallback");
+                await AssertConnectedPairRoundTripAsync(client, server, 0x7A);
+            }
+            finally
+            {
+                foreach ((SocketAsyncEngine engine, bool supports) in overrides)
+                {
+                    engine.SupportsMultishotAcceptForTest = supports;
+                }
+            }
+        }
+
+        private static async Task RunMultishotAcceptRearmAfterTerminalCqeScenarioAsync()
+        {
+            using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            listener.Listen(4);
+            IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+            if (!IsIoUringMultishotAcceptSupported())
+            {
+                return;
+            }
+
+            Task<Socket> firstAcceptTask = listener.AcceptAsync();
+            Assert.True(
+                await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true),
+                "Multishot accept was not armed before forced terminal CQE.");
+
+            using (Socket firstClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp))
+            {
+                await firstClient.ConnectAsync(endpoint);
+                Exception? firstAcceptException = await Record.ExceptionAsync(async () => await firstAcceptTask);
+                Assert.NotNull(firstAcceptException);
+                Assert.True(
+                    firstAcceptException is SocketException ||
+                    firstAcceptException is ObjectDisposedException,
+                    $"Unexpected forced-accept exception type: {firstAcceptException}");
+            }
+
+            Assert.True(
+                await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: false),
+                "Expected multishot accept to disarm after terminal CQE.");
+
+            Task<Socket> secondAcceptTask = listener.AcceptAsync();
+            Assert.True(
+                await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true),
+                "Expected multishot accept to re-arm on subsequent accept.");
+
+            using Socket secondClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            await secondClient.ConnectAsync(endpoint);
+            using Socket secondServer = await AwaitWithTimeoutAsync(secondAcceptTask, "re-armed multishot accept");
+            await AssertConnectedPairRoundTripAsync(secondClient, secondServer, 0x33);
+        }
+
+        private static async Task RunMultishotAcceptHighConnectionRateScenarioAsync(int connectionCount)
+        {
+            using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            listener.Listen(connectionCount);
+            IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+            if (!IsIoUringMultishotAcceptSupported())
+            {
+                return;
+            }
+
+            var acceptTasks = new Task<Socket>[connectionCount];
+            var clients = new Socket?[connectionCount];
+            var connectTasks = new Task[connectionCount];
+
+            for (int i = 0; i < connectionCount; i++)
+            {
+                acceptTasks[i] = listener.AcceptAsync();
+            }
+
+            try
+            {
+                for (int i = 0; i < connectionCount; i++)
+                {
+                    Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+                    clients[i] = client;
+                    connectTasks[i] = client.ConnectAsync(endpoint);
+                }
+
+                await Task.WhenAll(connectTasks);
+                Socket[] servers = await Task.WhenAll(acceptTasks);
+
+                try
+                {
+                    var verificationTasks = new List<Task>(connectionCount);
+                    for (int i = 0; i < connectionCount; i++)
+                    {
+                        Socket client = Assert.IsType<Socket>(clients[i]);
+                        Socket server = servers[i];
+                        byte marker = unchecked((byte)i);
+                        verificationTasks.Add(AssertConnectedPairRoundTripAsync(client, server, marker));
+                    }
+
+                    await Task.WhenAll(verificationTasks);
+                }
+                finally
+                {
+                    foreach (Socket server in servers)
+                    {
+                        server.Dispose();
+                    }
+                }
+            }
+            finally
+            {
+                foreach (Socket? client in clients)
+                {
+                    client?.Dispose();
+                }
+            }
+        }
+
+        private static async Task RunSlotCapacityStressScenarioAsync(int connectionCount)
+        {
+            if (!IsIoUringMultishotAcceptSupported())
+            {
+                return;
+            }
+
+            int requiredDescriptorCount = checked((connectionCount * 2) + 256);
+            if (!HasSufficientFileDescriptorLimit(requiredDescriptorCount))
+            {
+                return;
+            }
+
+            ulong highWaterBefore = GetIoUringTelemetryCounterValue("_ioUringCompletionSlotHighWaterMark");
+            await RunMultishotAcceptHighConnectionRateScenarioAsync(connectionCount);
+            ulong highWaterAfter = GetIoUringTelemetryCounterValue("_ioUringCompletionSlotHighWaterMark");
+
+            Assert.True(highWaterAfter >= highWaterBefore, "Completion-slot high-water mark should be monotonic.");
+            Assert.True(highWaterAfter >= 4000, $"Expected completion-slot high-water mark to reach >= 4000; observed {highWaterAfter}.");
+        }
+
+        private static async Task RunLargeSendWithBackpressureAsync(bool useBufferListSend)
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            client.SendBufferSize = 1024;
+            server.ReceiveBufferSize = 1024;
+
+            const int PayloadLength = 2 * 1024 * 1024;
+            byte[] payload = new byte[PayloadLength];
+            for (int i = 0; i < payload.Length; i++)
+            {
+                payload[i] = (byte)i;
+            }
+
+            Task<int> sendTask;
+            if (useBufferListSend)
+            {
+                const int SegmentSize = 1024;
+                var sendBuffers = new List<ArraySegment<byte>>();
+                for (int offset = 0; offset < payload.Length; offset += SegmentSize)
+                {
+                    int count = Math.Min(SegmentSize, payload.Length - offset);
+                    sendBuffers.Add(new ArraySegment<byte>(payload, offset, count));
+                }
+
+                sendTask = ToTask(client.SendAsync(sendBuffers, SocketFlags.None));
+            }
+            else
+            {
+                sendTask = ToTask(client.SendAsync(payload, SocketFlags.None));
+            }
+
+            await Task.Delay(20);
+
+            byte[] received = new byte[payload.Length];
+            int totalReceived = 0;
+            while (totalReceived < payload.Length)
+            {
+                int receivedNow = await ToTask(server.ReceiveAsync(received.AsMemory(totalReceived), SocketFlags.None));
+                Assert.True(receivedNow > 0);
+                totalReceived += receivedNow;
+                if ((totalReceived & 0x3FFF) == 0)
+                {
+                    await Task.Delay(1);
+                }
+            }
+
+            Assert.Equal(payload.Length, await sendTask);
+            Assert.Equal(payload.Length, totalReceived);
+            Assert.Equal(payload, received);
+        }
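The diagnostics callbacks in the scenarios that follow compare counter snapshots through CounterDelta. For unsigned counters the natural implementation is unchecked subtraction, which stays correct modulo 2^64 even if a counter wraps between snapshots; a sketch, with the helper body being an assumption:

        // Sketch of a wrap-safe counter delta: unchecked ulong subtraction yields the
        // number of increments between two snapshots even across a wraparound.
        private static ulong CounterDelta(ulong before, ulong after) => unchecked(after - before);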
+
+        private static async Task RunAsyncCancelRequestIsolationScenarioAsync(int iterations)
+        {
+            await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+                async () =>
+                {
+                    using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+                    listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+                    listener.Listen(2);
+                    IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+                    var cancelPair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+                    using Socket cancelClient = cancelPair.Client;
+                    using Socket cancelServer = cancelPair.Server;
+
+                    var activePair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+                    using Socket activeClient = activePair.Client;
+                    using Socket activeServer = activePair.Server;
+
+                    byte[] cancelBuffer = new byte[1];
+                    byte[] activeBuffer = new byte[1];
+                    for (int i = 0; i < iterations; i++)
+                    {
+                        using var cts = new CancellationTokenSource();
+                        Task<int> canceledReceive = ToTask(cancelServer.ReceiveAsync(cancelBuffer, SocketFlags.None, cts.Token));
+                        Task<int> activeReceive = ToTask(activeServer.ReceiveAsync(activeBuffer, SocketFlags.None));
+                        await Task.Yield();
+
+                        cts.Cancel();
+                        byte expected = unchecked((byte)(i + 1));
+                        Assert.Equal(1, await activeClient.SendAsync(new byte[] { expected }, SocketFlags.None));
+
+                        Assert.Equal(1, await activeReceive);
+                        Assert.Equal(expected, activeBuffer[0]);
+
+                        Exception? cancelException = await Record.ExceptionAsync(async () => await canceledReceive);
+                        AssertCanceledOrInterrupted(cancelException);
+                    }
+                },
+                (diagnosticsBefore, diagnosticsAfter) =>
+                {
+                    ulong asyncCancelRequestCqeDelta = CounterDelta(
+                        diagnosticsBefore.AsyncCancelRequestCqeCount,
+                        diagnosticsAfter.AsyncCancelRequestCqeCount);
+                    if (asyncCancelRequestCqeDelta == 0)
+                    {
+                        return;
+                    }
+
+                    ulong asyncCancelRequestCqeEnoentDelta = CounterDelta(
+                        diagnosticsBefore.AsyncCancelRequestCqeEnoentCount,
+                        diagnosticsAfter.AsyncCancelRequestCqeEnoentCount);
+                    ulong asyncCancelRequestCqeEalreadyDelta = CounterDelta(
+                        diagnosticsBefore.AsyncCancelRequestCqeEalreadyCount,
+                        diagnosticsAfter.AsyncCancelRequestCqeEalreadyCount);
+                    ulong asyncCancelRequestCqeOtherDelta = CounterDelta(
+                        diagnosticsBefore.AsyncCancelRequestCqeOtherCount,
+                        diagnosticsAfter.AsyncCancelRequestCqeOtherCount);
+
+                    Assert.True(
+                        asyncCancelRequestCqeEnoentDelta + asyncCancelRequestCqeEalreadyDelta + asyncCancelRequestCqeOtherDelta <= asyncCancelRequestCqeDelta,
+                        $"Unexpected async-cancel accounting for isolation scenario: enoent_delta={asyncCancelRequestCqeEnoentDelta}, ealready_delta={asyncCancelRequestCqeEalreadyDelta}, other_delta={asyncCancelRequestCqeOtherDelta}, total_delta={asyncCancelRequestCqeDelta}");
+                },
+                settleDelayMilliseconds: 200);
+        }
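The enoent/ealready/other buckets asserted above follow how the kernel reports IORING_OP_ASYNC_CANCEL completions: the cancel request's own CQE carries res 0 when a matching operation was cancelled, -ENOENT when no in-flight operation matched, and -EALREADY when the target was already executing. A hedged sketch of bucketing those results (the method and the counter plumbing here are illustrative, not the engine's actual code):

        // Illustrative bucketing of async-cancel CQE results; counter names mirror the
        // deltas asserted by the test, everything else in this sketch is assumed.
        private static void OnAsyncCancelCqe(int cqeResult, ref ulong enoent, ref ulong ealready, ref ulong other)
        {
            const int ENOENT = 2;     // Linux errno values; a CQE reports failures as negative res.
            const int EALREADY = 114;
            switch (-cqeResult)
            {
                case 0: break;                    // target found, cancellation queued
                case ENOENT: enoent++; break;     // no matching in-flight operation
                case EALREADY: ealready++; break; // target already running; it may still complete normally
                default: other++; break;
            }
        }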
+
+        private static async Task RunReceiveMessageFromCancellationAndDisposeScenariosAsync()
+        {
+            using Socket cancelReceiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            cancelReceiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true);
+            cancelReceiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+            EndPoint cancelRemoteEndPoint = new IPEndPoint(IPAddress.Any, 0);
+            using var cts = new CancellationTokenSource();
+            Task<SocketReceiveMessageFromResult> canceledReceive = ToTask(
+                cancelReceiver.ReceiveMessageFromAsync(new byte[64], SocketFlags.None, cancelRemoteeEndPoint: cancelRemoteEndPoint, cts.Token));
+            await Task.Yield();
+            cts.Cancel();
+
+            Task cancelCompleted = await Task.WhenAny(canceledReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+            Assert.Same(canceledReceive, cancelCompleted);
+            Exception? cancelException = await Record.ExceptionAsync(async () => await canceledReceive);
+            AssertCanceledOrInterrupted(cancelException);
+
+            using Socket disposeReceiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            disposeReceiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true);
+            disposeReceiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+            byte[] receiveBuffer = new byte[32];
+            using var receiveEventArgs = new SocketAsyncEventArgs
+            {
+                BufferList = new List<ArraySegment<byte>>
+                {
+                    new ArraySegment<byte>(receiveBuffer, 0, 16),
+                    new ArraySegment<byte>(receiveBuffer, 16, 16)
+                },
+                RemoteEndPoint = new IPEndPoint(IPAddress.Any, 0)
+            };
+
+            Task<SocketAsyncEventArgs> pendingReceive = StartReceiveMessageFromAsync(disposeReceiver, receiveEventArgs);
+            await Task.Yield();
+            disposeReceiver.Dispose();
+
+            Task disposeCompleted = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+            Assert.Same(pendingReceive, disposeCompleted);
+            SocketAsyncEventArgs completedArgs = await pendingReceive;
+            Assert.True(
+                completedArgs.SocketError == SocketError.OperationAborted ||
+                completedArgs.SocketError == SocketError.Interrupted);
+        }
+
+        private static async Task RunReceiveMessageFromCancelThenReceiveScenarioAsync()
+        {
+            using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            receiver.SetSocketOption(SocketOptionLevel.IP, SocketOptionName.PacketInformation, true);
+            receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+            using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+            EndPoint initialRemoteEndPoint = new IPEndPoint(IPAddress.Any, 0);
+            using var cts = new CancellationTokenSource();
+            Task<SocketReceiveMessageFromResult> canceledReceive = ToTask(
+                receiver.ReceiveMessageFromAsync(new byte[64], SocketFlags.None, initialRemoteEndPoint, cts.Token));
+            await Task.Yield();
+            cts.Cancel();
+
+            Task canceledCompleted = await Task.WhenAny(canceledReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+            Assert.Same(canceledReceive, canceledCompleted);
+            Exception? cancelException = await Record.ExceptionAsync(async () => await canceledReceive);
+            AssertCanceledOrInterrupted(cancelException);
+
+            byte[] payload = new byte[] { 0x10, 0x20, 0x30, 0x40 };
+            Assert.Equal(
+                payload.Length,
+                await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!));
+
+            byte[] receiveBuffer = new byte[64];
+            EndPoint remoteEndPoint = new IPEndPoint(IPAddress.Any, 0);
+            SocketReceiveMessageFromResult received = await ToTask(
+                receiver.ReceiveMessageFromAsync(receiveBuffer, SocketFlags.None, remoteEndPoint, CancellationToken.None));
+
+            Assert.Equal(payload.Length, received.ReceivedBytes);
+            Assert.True(payload.AsSpan().SequenceEqual(receiveBuffer.AsSpan(0, payload.Length)));
+        }
+
+        private static async Task RunReceiveMessageFromCancellationAndDisposeScenariosWithGcPressureAsync(int iterations)
+        {
+            for (int i = 0; i < iterations; i++)
+            {
+                await RunReceiveMessageFromCancellationAndDisposeScenariosAsync();
+                if ((i & 0x3) == 0)
+                {
+                    GC.Collect();
+                    GC.WaitForPendingFinalizers();
+                    GC.Collect();
+                }
+            }
+        }
+
+        private static async Task RunTeardownDrainTrackedOperationsScenarioAsync(int iterations)
+        {
+            using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            listener.Listen(8);
+            IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+            for (int i = 0; i < iterations; i++)
+            {
+                var pair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+                using Socket client = pair.Client;
+                using Socket server = pair.Server;
+
+                Task<int> pendingReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None));
+                await Task.Yield();
+
+                client.Dispose();
+                server.Dispose();
+
+                Task completed = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+                Assert.Same(pendingReceive, completed);
+                Exception? receiveException = await Record.ExceptionAsync(async () => await pendingReceive);
+                AssertCanceledDisposedOrInterrupted(receiveException);
+            }
+        }
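Probes such as WaitForPersistentMultishotRecvArmedStateAsync, WaitForMultishotAcceptArmedStateAsync, and WaitForIoUringTrackedOperationsAtMostAsync are polling helpers defined elsewhere in the file. A generic sketch of that polling shape, with the sampling interval and deadline as assumptions:

        // Hypothetical polling loop in the style of the WaitFor* helpers used above:
        // repeatedly sample a state probe until it reports the expected value or a deadline passes.
        private static async Task<bool> WaitForStateAsync(Func<bool> probe, bool expected, TimeSpan timeout)
        {
            DateTime deadline = DateTime.UtcNow + timeout;
            while (DateTime.UtcNow < deadline)
            {
                if (probe() == expected)
                {
                    return true;
                }

                await Task.Delay(25);
            }

            return probe() == expected; // final sample so a late transition still counts
        }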
+
+        private static async Task RunTeardownCancellationDuplicateGuardScenarioAsync(int iterations)
+        {
+            await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+                async () =>
+                {
+                    using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+                    listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+                    listener.Listen(8);
+                    IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+                    for (int i = 0; i < iterations; i++)
+                    {
+                        var pair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+                        using Socket client = pair.Client;
+                        using Socket server = pair.Server;
+
+                        using var cts = new CancellationTokenSource();
+                        Task<int> pendingReceive = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None, cts.Token));
+                        await Task.Yield();
+                        cts.Cancel();
+
+                        server.Dispose();
+                        client.Dispose();
+
+                        Task completed = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+                        Assert.Same(pendingReceive, completed);
+                        Exception? receiveException = await Record.ExceptionAsync(async () => await pendingReceive);
+                        AssertCanceledDisposedOrInterrupted(receiveException);
+                    }
+                },
+                (diagnosticsBefore, diagnosticsAfter) =>
+                {
+                    ulong asyncCancelRequestCqeDelta = CounterDelta(
+                        diagnosticsBefore.AsyncCancelRequestCqeCount,
+                        diagnosticsAfter.AsyncCancelRequestCqeCount);
+                    if (asyncCancelRequestCqeDelta == 0)
+                    {
+                        return;
+                    }
+
+                    ulong asyncCancelRequestCqeEnoentDelta = CounterDelta(
+                        diagnosticsBefore.AsyncCancelRequestCqeEnoentCount,
+                        diagnosticsAfter.AsyncCancelRequestCqeEnoentCount);
+                    ulong asyncCancelRequestCqeEalreadyDelta = CounterDelta(
+                        diagnosticsBefore.AsyncCancelRequestCqeEalreadyCount,
+                        diagnosticsAfter.AsyncCancelRequestCqeEalreadyCount);
+                    ulong asyncCancelRequestCqeOtherDelta = CounterDelta(
+                        diagnosticsBefore.AsyncCancelRequestCqeOtherCount,
+                        diagnosticsAfter.AsyncCancelRequestCqeOtherCount);
+
+                    // Guardrail: one operation per iteration should not devolve into persistent multi-request cancellation churn.
+                    ulong maxExpectedCancelRequestCqes = (ulong)(iterations + (iterations / 2) + 8);
+                    Assert.True(
+                        asyncCancelRequestCqeDelta <= maxExpectedCancelRequestCqes,
+                        $"Unexpected async-cancel CQE inflation: delta={asyncCancelRequestCqeDelta}, max={maxExpectedCancelRequestCqes}, iterations={iterations}");
+                    Assert.True(
+                        asyncCancelRequestCqeEnoentDelta + asyncCancelRequestCqeEalreadyDelta + asyncCancelRequestCqeOtherDelta <= asyncCancelRequestCqeDelta,
+                        $"Unexpected async-cancel accounting: enoent_delta={asyncCancelRequestCqeEnoentDelta}, ealready_delta={asyncCancelRequestCqeEalreadyDelta}, other_delta={asyncCancelRequestCqeOtherDelta}, total_delta={asyncCancelRequestCqeDelta}");
+                },
+                settleDelayMilliseconds: 200);
+        }
+
+        private static async Task RunCancellationSubmitContentionScenarioAsync(int connectionCount, int cancellationsPerConnection)
+        {
+            await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+                async () =>
+                {
+                    using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+                    listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+                    listener.Listen(connectionCount);
+                    IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+                    var clients = new List<Socket>(connectionCount);
+                    var servers = new List<Socket>(connectionCount);
+                    try
+                    {
+                        for (int i = 0; i < connectionCount; i++)
+                        {
+                            var pair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+                            clients.Add(pair.Client);
+                            servers.Add(pair.Server);
+                        }
+
+                        Task[] churnTasks = new Task[connectionCount];
+                        for (int index = 0; index < connectionCount; index++)
+                        {
+                            Socket server = servers[index];
+                            churnTasks[index] = Task.Run(async () =>
+                            {
+                                byte[] receiveBuffer = new byte[1];
+                                for (int i = 0; i < cancellationsPerConnection; i++)
+                                {
+                                    using var cts = new CancellationTokenSource();
+                                    Task<int> pendingReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None, cts.Token));
+                                    cts.Cancel();
+
+                                    Exception? receiveException = await Record.ExceptionAsync(async () => await pendingReceive);
+                                    AssertCanceledOrInterrupted(receiveException);
+                                }
+                            });
+                        }
+
+                        await Task.WhenAll(churnTasks);
+
+                        // Ensure the cancellation churn does not stall normal completion progress afterward.
+                        for (int i = 0; i < connectionCount; i++)
+                        {
+                            byte expected = unchecked((byte)(i + 1));
+                            byte[] receiveBuffer = new byte[1];
+                            Task<int> receiveTask = ToTask(servers[i].ReceiveAsync(receiveBuffer, SocketFlags.None));
+                            await Task.Yield();
+
+                            Assert.Equal(1, await clients[i].SendAsync(new byte[] { expected }, SocketFlags.None));
+                            Assert.Equal(1, await receiveTask);
+                            Assert.Equal(expected, receiveBuffer[0]);
+                        }
+                    }
+                    finally
+                    {
+                        foreach (Socket server in servers)
+                        {
+                            server.Dispose();
+                        }
+
+                        foreach (Socket client in clients)
+                        {
+                            client.Dispose();
+                        }
+                    }
+                },
+                (diagnosticsBefore, diagnosticsAfter) =>
+                {
+                    ulong asyncCancelRequestCqeDelta = CounterDelta(
+                        diagnosticsBefore.AsyncCancelRequestCqeCount,
+                        diagnosticsAfter.AsyncCancelRequestCqeCount);
+                    if (asyncCancelRequestCqeDelta == 0)
+                    {
+                        // On kernels without async-cancel opcode support this path may fall back without cancel-request CQEs.
+                        return;
+                    }
+
+                    ulong asyncCancelRequestCqeEnoentDelta = CounterDelta(
+                        diagnosticsBefore.AsyncCancelRequestCqeEnoentCount,
+                        diagnosticsAfter.AsyncCancelRequestCqeEnoentCount);
+                    ulong asyncCancelRequestCqeEalreadyDelta = CounterDelta(
+                        diagnosticsBefore.AsyncCancelRequestCqeEalreadyCount,
+                        diagnosticsAfter.AsyncCancelRequestCqeEalreadyCount);
+                    ulong asyncCancelRequestCqeOtherDelta = CounterDelta(
+                        diagnosticsBefore.AsyncCancelRequestCqeOtherCount,
+                        diagnosticsAfter.AsyncCancelRequestCqeOtherCount);
+
+                    ulong maxExpectedCancelRequestCqes = (ulong)(connectionCount * cancellationsPerConnection * 2) + 64;
+                    Assert.True(
+                        asyncCancelRequestCqeDelta <= maxExpectedCancelRequestCqes,
+                        $"Unexpected async-cancel request CQE inflation under contention: delta={asyncCancelRequestCqeDelta}, max={maxExpectedCancelRequestCqes}, connections={connectionCount}, cancels_per_connection={cancellationsPerConnection}");
+                    Assert.True(
+                        asyncCancelRequestCqeEnoentDelta + asyncCancelRequestCqeEalreadyDelta + asyncCancelRequestCqeOtherDelta <= asyncCancelRequestCqeDelta,
+                        $"Unexpected async-cancel accounting under contention: enoent_delta={asyncCancelRequestCqeEnoentDelta}, ealready_delta={asyncCancelRequestCqeEalreadyDelta}, other_delta={asyncCancelRequestCqeOtherDelta}, total_delta={asyncCancelRequestCqeDelta}");
+                },
+                settleDelayMilliseconds: 200);
+        }
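The DEBUG-only scenario below pins the cancel-queue contract: an enqueue that observes a full queue must first record a wake retry and nudge the event loop, and only a still-full queue may count an overflow and reject the request. One plausible shape consistent with those counters (every name in this sketch is an assumption, not the engine's code):

        using System.Collections.Concurrent;
        using System.Threading;

        // Hypothetical wake-then-retry enqueue consistent with the counters the test asserts:
        // a full queue first wakes the draining event loop and retries; only a still-full
        // queue counts as an overflow and fails the enqueue.
        internal sealed class BoundedCancelQueue
        {
            private readonly ConcurrentQueue<ulong> _queue = new();
            private readonly int _capacity;
            private readonly Action _wakeEventLoop;
            private long _wakeRetryCount;
            private long _overflowCount;

            public BoundedCancelQueue(int capacity, Action wakeEventLoop)
            {
                _capacity = capacity;
                _wakeEventLoop = wakeEventLoop;
            }

            public bool TryEnqueue(ulong userData)
            {
                if (_queue.Count >= _capacity)
                {
                    // Queue looks full: record a wake retry and nudge the event loop to drain
                    // before giving up, so transient fullness never becomes a dropped cancel.
                    Interlocked.Increment(ref _wakeRetryCount);
                    _wakeEventLoop();
                    if (_queue.Count >= _capacity)
                    {
                        Interlocked.Increment(ref _overflowCount);
                        return false;
                    }
                }

                _queue.Enqueue(userData);
                return true;
            }
        }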
+
+        private static async Task RunCancellationQueueWakeBeforeOverflowScenarioAsync()
+        {
+#if DEBUG
+            await RunTcpRoundTripAsync(4);
+
+            if (!TryGetFirstIoUringEngineForTest(out SocketAsyncEngine? ioUringEngine) || ioUringEngine is null)
+            {
+                return;
+            }
+
+            long configuredCapacity = SocketAsyncEngine.GetIoUringCancellationQueueCapacityForTest();
+            Assert.True(configuredCapacity > 0, "Cancellation queue capacity must be positive for wake-before-overflow verification.");
+
+            long originalQueueLength = ioUringEngine.IoUringCancelQueueLengthForTest;
+            int originalWakeupRequested = ioUringEngine.IoUringWakeupRequestedForTest;
+            try
+            {
+                long overflowBefore = ioUringEngine.IoUringCancelQueueOverflowCountForTest;
+                long wakeRetryBefore = ioUringEngine.IoUringCancelQueueWakeRetryCountForTest;
+
+                // Force queue-full path deterministically by priming the observed queue length
+                // to capacity before enqueue; this must trigger wake-and-retry before overflow.
+                ioUringEngine.IoUringCancelQueueLengthForTest = configuredCapacity;
+                ioUringEngine.IoUringWakeupRequestedForTest = 0;
+
+                bool enqueueResult = ioUringEngine.TryEnqueueIoUringCancellationForTest((ulong)1);
+
+                long overflowAfter = ioUringEngine.IoUringCancelQueueOverflowCountForTest;
+                long wakeRetryAfter = ioUringEngine.IoUringCancelQueueWakeRetryCountForTest;
+
+                Assert.False(enqueueResult);
+                Assert.Equal(overflowBefore + 1, overflowAfter);
+                Assert.True(
+                    wakeRetryAfter > wakeRetryBefore,
+                    $"Expected cancel queue full path to record a wake-before-retry. before={wakeRetryBefore}, after={wakeRetryAfter}");
+            }
+            finally
+            {
+                ioUringEngine.IoUringCancelQueueLengthForTest = originalQueueLength;
+                ioUringEngine.IoUringWakeupRequestedForTest = originalWakeupRequested;
+            }
+#else
+            await Task.CompletedTask;
+#endif
+        }
+
+        private static async Task RunMixedModeReadinessCompletionStressScenarioAsync(int iterations)
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] completionBuffer = new byte[1];
+            byte[] payload = new byte[1];
+
+            for (int i = 0; i < iterations; i++)
+            {
+                Task<int> completionReceive = ToTask(server.ReceiveAsync(completionBuffer, SocketFlags.None));
+                Task<int> readinessProbe = ToTask(server.ReceiveAsync(Memory<byte>.Empty, SocketFlags.None));
+                await Task.Yield();
+
+                payload[0] = unchecked((byte)(i + 1));
+                Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+                Assert.Equal(1, await completionReceive);
+                Assert.Equal(payload[0], completionBuffer[0]);
+
+                Task completed = await Task.WhenAny(readinessProbe, Task.Delay(TimeSpan.FromSeconds(15)));
+                Assert.Same(readinessProbe, completed);
+                Assert.Equal(0, await readinessProbe);
+            }
+        }
+
+        private static async Task RunSameSocketReadinessCompletionBacklogScenarioAsync(int iterations, int completionBatchSize)
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] sendPayload = new byte[completionBatchSize];
+            for (int iteration = 0; iteration < iterations; iteration++)
+            {
+                var receiveBuffers = new byte[completionBatchSize][];
+                var completionReceives = new Task<int>[completionBatchSize];
+                for (int i = 0; i < completionBatchSize; i++)
+                {
+                    byte expected = unchecked((byte)((iteration + i + 1) & 0xFF));
+                    sendPayload[i] = expected;
+                    byte[] receiveBuffer = new byte[1];
+                    receiveBuffers[i] = receiveBuffer;
+                    completionReceives[i] = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+                }
+
+                Task<int> readinessProbe = ToTask(server.ReceiveAsync(Memory<byte>.Empty, SocketFlags.None));
+                await Task.Yield();
+
+                int sent = 0;
+                while (sent < sendPayload.Length)
+                {
+                    sent += await client.SendAsync(sendPayload.AsMemory(sent), SocketFlags.None);
+                }
+
+                Assert.Equal(sendPayload.Length, sent);
+
+                Task readinessCompleted = await Task.WhenAny(readinessProbe, Task.Delay(TimeSpan.FromSeconds(15)));
+                Assert.Same(readinessProbe, readinessCompleted);
+                Assert.Equal(0, await readinessProbe);
+
+                int[] receivedCounts = await Task.WhenAll(completionReceives);
+                for (int i = 0; i < completionBatchSize; i++)
+                {
+                    Assert.Equal(1, receivedCounts[i]);
+                    Assert.Equal(sendPayload[i], receiveBuffers[i][0]);
+                }
+            }
+        }
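Both readiness/completion scenarios above lean on the zero-byte-receive convention: a ReceiveAsync over Memory<byte>.Empty pends until the socket is readable, then completes with 0 without consuming any data, which is how readiness-style probes coexist with completion-style receives on the same socket. A standalone illustration of that probe (hypothetical demo code, not part of this change):

        using System;
        using System.Net;
        using System.Net.Sockets;
        using System.Threading.Tasks;

        // Standalone illustration of a zero-byte readiness probe: the empty receive
        // completes with 0 once data arrives but leaves the byte in the buffer.
        internal static class ZeroByteProbeDemo
        {
            public static async Task Main()
            {
                using var listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
                listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
                listener.Listen(1);

                using var client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
                await client.ConnectAsync(listener.LocalEndPoint!);
                using Socket server = await listener.AcceptAsync();

                ValueTask<int> probe = server.ReceiveAsync(Memory<byte>.Empty, SocketFlags.None);
                await client.SendAsync(new byte[] { 0x01 }, SocketFlags.None);
                Console.WriteLine(await probe);                                  // 0: readable, nothing consumed

                byte[] buffer = new byte[1];
                Console.WriteLine(await server.ReceiveAsync(buffer, SocketFlags.None)); // 1: the payload byte
            }
        }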
+
+        private static async Task RunPureCompletionScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] tcpSendPayload = new byte[] { 0x11 };
+            byte[] tcpReceiveBuffer = new byte[1];
+
+            Task<int> tcpReceive = ToTask(server.ReceiveAsync(tcpReceiveBuffer, SocketFlags.None));
+            await Task.Yield();
+            Assert.Equal(1, await client.SendAsync(tcpSendPayload, SocketFlags.None));
+            Assert.Equal(1, await AwaitWithTimeoutAsync(tcpReceive, nameof(tcpReceive)));
+            Assert.Equal(tcpSendPayload[0], tcpReceiveBuffer[0]);
+
+            Task<int> tcpZeroByteReceive = ToTask(server.ReceiveAsync(Memory<byte>.Empty, SocketFlags.None));
+            await Task.Yield();
+
+            byte[] tcpPayloadAfterProbe = new byte[] { 0x22 };
+            Assert.Equal(1, await client.SendAsync(tcpPayloadAfterProbe, SocketFlags.None));
+            Task completed = await Task.WhenAny(tcpZeroByteReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+            Assert.Same(tcpZeroByteReceive, completed);
+            Assert.Equal(0, await tcpZeroByteReceive);
+
+            byte[] tcpDataAfterZeroByte = new byte[1];
+            Task<int> tcpTailReceive = ToTask(server.ReceiveAsync(tcpDataAfterZeroByte, SocketFlags.None));
+            await Task.Yield();
+            Assert.Equal(1, await AwaitWithTimeoutAsync(tcpTailReceive, nameof(tcpTailReceive)));
+            Assert.Equal(tcpPayloadAfterProbe[0], tcpDataAfterZeroByte[0]);
+
+            using Socket connectListener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            connectListener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            connectListener.Listen(1);
+            IPEndPoint connectEndPoint = (IPEndPoint)connectListener.LocalEndPoint!;
+
+            using Socket connectClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            Task<Socket> acceptTask = connectListener.AcceptAsync();
+            await connectClient.ConnectAsync(connectEndPoint);
+            using Socket connectServer = await AwaitWithTimeoutAsync(acceptTask, nameof(acceptTask));
+
+            byte[] connectPayload = new byte[] { 0x33 };
+            Assert.Equal(1, await connectClient.SendAsync(connectPayload, SocketFlags.None));
+            byte[] connectReceiveBuffer = new byte[1];
+            Assert.Equal(1, await connectServer.ReceiveAsync(connectReceiveBuffer, SocketFlags.None));
+            Assert.Equal(connectPayload[0], connectReceiveBuffer[0]);
+
+            using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+            using Socket udpSender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+            udpSender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+            byte[] udpPayload = new byte[] { 0x33, 0x44, 0x55 };
+            byte[] udpReceiveBuffer = new byte[udpPayload.Length];
+
+            Task<SocketReceiveFromResult> receiveFromTask =
+                ToTask(receiver.ReceiveFromAsync(udpReceiveBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0)));
+            await Task.Yield();
+            Assert.Equal(udpPayload.Length, await udpSender.SendToAsync(udpPayload, SocketFlags.None, receiver.LocalEndPoint!));
+
+            SocketReceiveFromResult receiveFromResult = await receiveFromTask;
+            Assert.Equal(udpPayload.Length, receiveFromResult.ReceivedBytes);
+            Assert.Equal(udpPayload, udpReceiveBuffer);
+            Assert.Equal(udpSender.LocalEndPoint, receiveFromResult.RemoteEndPoint);
+        }
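CreateConnectedTcpSocketTrioAsync, used throughout these scenarios, hands back the listener together with a connected client/server pair; the tuple member names are visible at the call sites. A plausible sketch, with the real helper defined elsewhere in the file:

        // Hypothetical sketch of the connected-trio helper consumed above; only the
        // tuple shape (Listener/Client/Server) is inferred from the call sites.
        private static async Task<(Socket Listener, Socket Client, Socket Server)> CreateConnectedTcpSocketTrioAsync()
        {
            var listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
            listener.Listen(1);

            var client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
            Task<Socket> acceptTask = listener.AcceptAsync();
            await client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!);
            Socket server = await acceptTask;
            return (listener, client, server);
        }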
+
+        private static async Task RunBoundedWaitBufferPressureScenarioAsync(int connectionCount)
+        {
+            await WithIoUringNativeDiagnosticsSnapshotDeltaAsync(
+                async () =>
+                {
+                    using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+                    listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+                    listener.Listen(connectionCount);
+                    IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+                    var clients = new List<Socket>(connectionCount);
+                    var servers = new List<Socket>(connectionCount);
+                    var receiveBuffers = new List<byte[]>(connectionCount);
+                    var receiveTasks = new List<Task<int>>(connectionCount);
+                    var sendTasks = new List<Task<int>>(connectionCount);
+
+                    try
+                    {
+                        for (int i = 0; i < connectionCount; i++)
+                        {
+                            var pair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+                            clients.Add(pair.Client);
+                            servers.Add(pair.Server);
+
+                            byte[] receiveBuffer = new byte[1];
+                            receiveBuffers.Add(receiveBuffer);
+                            receiveTasks.Add(ToTask(pair.Server.ReceiveAsync(receiveBuffer, SocketFlags.None)));
+                        }
+
+                        await Task.Yield();
+
+                        for (int i = 0; i < clients.Count; i++)
+                        {
+                            byte payload = unchecked((byte)(i + 1));
+                            sendTasks.Add(ToTask(clients[i].SendAsync(new byte[] { payload }, SocketFlags.None)));
+                        }
+
+                        int[] sentBytes = await Task.WhenAll(sendTasks);
+                        int[] receivedBytes = await Task.WhenAll(receiveTasks);
+
+                        for (int i = 0; i < connectionCount; i++)
+                        {
+                            Assert.Equal(1, sentBytes[i]);
+                            Assert.Equal(1, receivedBytes[i]);
+                            Assert.Equal(unchecked((byte)(i + 1)), receiveBuffers[i][0]);
+                        }
+                    }
+                    finally
+                    {
+                        foreach (Socket server in servers)
+                        {
+                            server.Dispose();
+                        }
+
+                        foreach (Socket client in clients)
+                        {
+                            client.Dispose();
+                        }
+                    }
+                },
+                (diagnosticsBefore, diagnosticsAfter) =>
+                {
+                    ulong socketEventBufferFullDelta = CounterDelta(
+                        diagnosticsBefore.SocketEventBufferFullCount,
+                        diagnosticsAfter.SocketEventBufferFullCount);
+
+                    Assert.True(
+                        socketEventBufferFullDelta != 0,
+                        $"Expected io_uring wait-buffer pressure counter to increase. socket_delta={socketEventBufferFullDelta}");
+                },
+                skipScenarioWhenIoUringUnavailable: true);
+        }
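GetIoUringTelemetryCounterValue resolves counters by their private field names, hence the underscore-prefixed strings such as "_ioUringPrepareQueueOverflows" in the scenario below. A hedged sketch of a reflection-based reader in that spirit (the declaring type, storage type, and static-vs-instance layout are all assumptions):

        // Hypothetical reflection reader for a private telemetry counter; the real
        // helper's target type and field storage may differ.
        private static ulong ReadPrivateCounter(object engine, string fieldName)
        {
            System.Reflection.FieldInfo? field = engine.GetType().GetField(
                fieldName,
                System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic);
            Assert.NotNull(field);
            return Convert.ToUInt64(field!.GetValue(engine));
        }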
+
+        private static async Task RunPrepareQueueOverflowFallbackScenarioAsync(int connectionCount)
+        {
+            ulong overflowBefore = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflows");
+            ulong fallbackBefore = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflowFallbacks");
+            bool observedOverflow = false;
+
+            for (int round = 0; round < 4; round++)
+            {
+                using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+                listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+                listener.Listen(connectionCount);
+                IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+                var clients = new List<Socket>(connectionCount);
+                var servers = new List<Socket>(connectionCount);
+                var receiveTasks = new List<Task<int>>(connectionCount);
+                try
+                {
+                    for (int i = 0; i < connectionCount; i++)
+                    {
+                        var pair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+                        clients.Add(pair.Client);
+                        servers.Add(pair.Server);
+
+                        receiveTasks.Add(ToTask(pair.Server.ReceiveAsync(new byte[1], SocketFlags.None)));
+                    }
+
+                    await Task.Yield();
+
+                    for (int i = 0; i < connectionCount; i++)
+                    {
+                        Assert.Equal(1, await clients[i].SendAsync(new byte[] { 0x5A }, SocketFlags.None));
+                    }
+
+                    for (int i = 0; i < receiveTasks.Count; i++)
+                    {
+                        Assert.Equal(1, await AwaitWithTimeoutAsync(receiveTasks[i], $"overflow_receive_{round}_{i}"));
+                    }
+                }
+                finally
+                {
+                    foreach (Socket server in servers)
+                    {
+                        server.Dispose();
+                    }
+
+                    foreach (Socket client in clients)
+                    {
+                        client.Dispose();
+                    }
+                }
+
+                ulong overflowAfterRound = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflows");
+                ulong fallbackAfterRound = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflowFallbacks");
+                if (overflowAfterRound > overflowBefore)
+                {
+                    observedOverflow = true;
+                    Assert.True(
+                        fallbackAfterRound > fallbackBefore,
+                        $"Expected prepare queue overflow fallback counter to increase once overflow is observed. before={fallbackBefore}, after={fallbackAfterRound}");
+                    return;
+                }
+            }
+
+            if (!observedOverflow)
+            {
+                // With very fast event-loop draining, queue overflow can be scheduler-dependent even at capacity=1.
+                // The scenario still validates that completion-mode operations make progress without hangs.
+                ulong fallbackAfter = GetIoUringTelemetryCounterValue("_ioUringPrepareQueueOverflowFallbacks");
+                Assert.True(
+                    fallbackAfter >= fallbackBefore,
+                    $"Prepare queue overflow fallback counter should be nondecreasing. before={fallbackBefore}, after={fallbackAfter}");
+            }
+        }
+
+        private static async Task RunConnectQueueOverflowFallbackScenarioAsync(int connectionCount)
+        {
+            using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            listener.Listen(connectionCount);
+            IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+            var clients = new List<Socket>(connectionCount);
+            var connectTasks = new List<Task>(connectionCount);
+            var acceptTasks = new List<Task<Socket>>(connectionCount);
+            var acceptedSockets = new List<Socket>(connectionCount);
+
+            try
+            {
+                for (int i = 0; i < connectionCount; i++)
+                {
+                    Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+                    clients.Add(client);
+                    acceptTasks.Add(listener.AcceptAsync());
+                    connectTasks.Add(client.ConnectAsync(endpoint));
+                }
+
+                Task connectAll = Task.WhenAll(connectTasks);
+                Task connectCompleted = await Task.WhenAny(connectAll, Task.Delay(TimeSpan.FromSeconds(15)));
+                Assert.Same(connectAll, connectCompleted);
+                await connectAll;
+
+                foreach (Task<Socket> acceptTask in acceptTasks)
+                {
+                    acceptedSockets.Add(await AwaitWithTimeoutAsync(acceptTask, nameof(RunConnectQueueOverflowFallbackScenarioAsync)));
+                }
+            }
+            finally
+            {
+                foreach (Socket acceptedSocket in acceptedSockets)
+                {
+                    acceptedSocket.Dispose();
+                }
+
+                foreach (Socket client in clients)
+                {
+                    client.Dispose();
+                }
+            }
+        }
+
+        private static async Task RunCqOverflowRecoveryScenarioAsync()
+        {
+            await RunTcpRoundTripAsync(8);
+
+            int baselineCompletionSlotsInUse = GetIoUringCompletionSlotsInUseForTest();
+            int baselineTrackedOperations = GetIoUringTrackedOperationCountForTest();
+            ulong overflowBefore = GetIoUringTelemetryCounterValue("_ioUringCqOverflow");
+            ulong recoveryBefore = GetIoUringTelemetryCounterValue("_ioUringCqOverflowRecoveries");
+
+            if (!TryInjectIoUringCqOverflowForTest(delta: 1, out int injectedEngineCount) || injectedEngineCount == 0)
+            {
+                return;
+            }
+
+            await RunTcpRoundTripAsync(32);
+
+            Assert.True(
+                await WaitForIoUringTelemetryCounterAtLeastAsync("_ioUringCqOverflowRecoveries", recoveryBefore + 1),
+                "Expected io_uring CQ overflow recovery counter to increment after injected overflow.");
+            Assert.True(
+                GetIoUringTelemetryCounterValue("_ioUringCqOverflow") > overflowBefore,
+                "Expected io_uring CQ overflow counter to increment after injected overflow.");
+            Assert.True(
+                await WaitForIoUringCompletionSlotsInUseAtMostAsync(baselineCompletionSlotsInUse + 1),
+                "Completion-slot usage did not return near baseline after CQ overflow recovery.");
+            Assert.True(
+                await WaitForIoUringTrackedOperationsAtMostAsync(baselineTrackedOperations + 1),
+                "Tracked io_uring operation count did not return near baseline after CQ overflow recovery.");
+        }
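The recovery scenarios above and below rely on the kernel's CQ-overflow protocol: when the completion ring fills, the kernel sets IORING_SQ_CQ_OVERFLOW in the SQ-ring flags word and buffers the overflowed CQEs until an io_uring_enter with IORING_ENTER_GETEVENTS flushes them back into the ring. A sketch of the detection side (the pointer plumbing is an assumption; the flag values match the Linux uapi header):

        // Kernel uapi flag values from <linux/io_uring.h>; the surrounding accessors are
        // hypothetical, but checking the SQ-ring flags word is the documented way to
        // learn that CQEs are being buffered kernel-side.
        private const uint IORING_SQ_CQ_OVERFLOW = 1u << 1;
        private const uint IORING_ENTER_GETEVENTS = 1u << 0; // pass to io_uring_enter to flush buffered CQEs

        private static unsafe bool CqOverflowPending(uint* sqRingFlags) =>
            (Volatile.Read(ref *sqRingFlags) & IORING_SQ_CQ_OVERFLOW) != 0;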
"Expected zero tracked io_uring operations before injected CQ overflow."); + + ulong recoveryBefore = GetIoUringTelemetryCounterValue("_ioUringCqOverflowRecoveries"); + if (!TryInjectIoUringCqOverflowForTest(delta: 1, out int injectedEngineCount) || injectedEngineCount == 0) + { + return; + } + + await RunTcpRoundTripAsync(4); + + Assert.True( + await WaitForIoUringTelemetryCounterAtLeastAsync("_ioUringCqOverflowRecoveries", recoveryBefore + 1), + "Expected io_uring CQ overflow recovery counter to increment for zero-tracked-operations recovery."); + Assert.True( + await WaitForIoUringTrackedOperationsAtMostAsync(0), + "Tracked io_uring operation count should remain at zero after zero-tracked-operations recovery."); + } + + private static async Task RunCqOverflowRecoveryWithSmallRingScenarioAsync() + { + await RunTcpRoundTripAsync(8); + + int baselineCompletionSlotsInUse = GetIoUringCompletionSlotsInUseForTest(); + int baselineTrackedOperations = GetIoUringTrackedOperationCountForTest(); + ulong overflowBefore = GetIoUringTelemetryCounterValue("_ioUringCqOverflow"); + ulong recoveryBefore = GetIoUringTelemetryCounterValue("_ioUringCqOverflowRecoveries"); + + // Small CQ-ring overflow is timing-sensitive even with tiny rings; run multiple bursts. + for (int round = 0; round < 6; round++) + { + await RunTcpRoundTripAsync(256); + ulong overflowAfterRound = GetIoUringTelemetryCounterValue("_ioUringCqOverflow"); + if (overflowAfterRound > overflowBefore) + { + Assert.True( + await WaitForIoUringTelemetryCounterAtLeastAsync("_ioUringCqOverflowRecoveries", recoveryBefore + 1), + "Expected CQ overflow recovery counter to increment after real CQ overflow."); + Assert.True( + await WaitForIoUringCompletionSlotsInUseAtMostAsync(baselineCompletionSlotsInUse + 2, timeoutMilliseconds: 15000), + "Completion-slot usage did not settle after real CQ overflow recovery."); + Assert.True( + await WaitForIoUringTrackedOperationsAtMostAsync(baselineTrackedOperations + 2, timeoutMilliseconds: 15000), + "Tracked-operation count did not settle after real CQ overflow recovery."); + return; + } + } + + // Non-deterministic fallback: on very fast machines the ring may drain without overflow. + Assert.True( + GetIoUringTelemetryCounterValue("_ioUringCqOverflow") >= overflowBefore, + "Real CQ overflow was not observed; counter should still be nondecreasing."); + } + + private static async Task RunMultishotAcceptOverflowArmingScenarioAsync() + { + if (!IsIoUringMultishotAcceptSupported()) + { + return; + } + + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(4); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + Task acceptTask = listener.AcceptAsync(); + Assert.True( + await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true), + "Multishot accept was not armed before injected CQ overflow."); + + int baselineCompletionSlotsInUse = GetIoUringCompletionSlotsInUseForTest(); + int baselineTrackedOperations = GetIoUringTrackedOperationCountForTest(); + ulong recoveryBefore = GetIoUringTelemetryCounterValue("_ioUringCqOverflowRecoveries"); + + if (!TryInjectIoUringCqOverflowForTest(delta: 1, out int injectedEngineCount) || injectedEngineCount == 0) + { + return; + } + + using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + await client.ConnectAsync(endpoint); + + Socket? acceptedSocket = null; + Exception? 
+            Exception? acceptException = await Record.ExceptionAsync(async () =>
+            {
+                acceptedSocket = await AwaitWithTimeoutAsync(acceptTask, "multishot accept after CQ overflow");
+            });
+
+            if (acceptException is null)
+            {
+                using Socket server = Assert.IsType<Socket>(acceptedSocket);
+                await AssertConnectedPairRoundTripAsync(client, server, 0xA1);
+            }
+            else
+            {
+                Assert.True(
+                    acceptException is SocketException || acceptException is ObjectDisposedException,
+                    $"Unexpected accept completion after CQ overflow: {acceptException}");
+            }
+
+            Task<Socket> nextAcceptTask = listener.AcceptAsync();
+            using Socket nextClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            await nextClient.ConnectAsync(endpoint);
+            using Socket nextServer = await AwaitWithTimeoutAsync(nextAcceptTask, "post-overflow multishot accept follow-up");
+            await AssertConnectedPairRoundTripAsync(nextClient, nextServer, 0xA2);
+
+            Assert.True(
+                await WaitForIoUringTelemetryCounterAtLeastAsync("_ioUringCqOverflowRecoveries", recoveryBefore + 1),
+                "Expected CQ overflow recovery counter to increment after multishot-accept overflow scenario.");
+            Assert.True(
+                await WaitForIoUringCompletionSlotsInUseAtMostAsync(baselineCompletionSlotsInUse + 2),
+                "Completion-slot usage remained unexpectedly elevated after multishot-accept overflow scenario.");
+            Assert.True(
+                await WaitForIoUringTrackedOperationsAtMostAsync(baselineTrackedOperations + 2),
+                "Tracked-operation count remained unexpectedly elevated after multishot-accept overflow scenario.");
+        }
+
+        private static async Task RunTeardownUnderOverflowScenarioAsync()
+        {
+            using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+            listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+            listener.Listen(2);
+
+            Task<Socket> pendingAccept = listener.AcceptAsync();
+            await Task.Yield();
+
+            int baselineCompletionSlotsInUse = GetIoUringCompletionSlotsInUseForTest();
+            int baselineTrackedOperations = GetIoUringTrackedOperationCountForTest();
+            _ = TryInjectIoUringCqOverflowForTest(delta: 1, out _);
+
+            Task disposeTask = Task.Run(listener.Dispose);
+
+            Task completed = await Task.WhenAny(pendingAccept, Task.Delay(TimeSpan.FromSeconds(60)));
+            Assert.Same(pendingAccept, completed);
+            await disposeTask;
+
+            Exception? acceptException = await Record.ExceptionAsync(async () => await pendingAccept);
+            Assert.NotNull(acceptException);
+            Assert.True(
+                acceptException is ObjectDisposedException || acceptException is SocketException,
+                $"Unexpected pending-accept exception under teardown+overflow race: {acceptException}");
+
+            Assert.True(
+                await WaitForIoUringCompletionSlotsInUseAtMostAsync(baselineCompletionSlotsInUse + 1, timeoutMilliseconds: 15000),
+                "Completion-slot usage did not settle after teardown-under-overflow scenario.");
+            Assert.True(
+                await WaitForIoUringTrackedOperationsAtMostAsync(baselineTrackedOperations + 1, timeoutMilliseconds: 15000),
+                "Tracked-operation count did not settle after teardown-under-overflow scenario.");
+        }
+
+        private static async Task RunSustainedOverflowReentrancyScenarioAsync()
+        {
+            await RunTcpRoundTripAsync(4);
+
+            int baselineCompletionSlotsInUse = GetIoUringCompletionSlotsInUseForTest();
+            int baselineTrackedOperations = GetIoUringTrackedOperationCountForTest();
+            ulong recoveryBefore = GetIoUringTelemetryCounterValue("_ioUringCqOverflowRecoveries");
+
+            if (!TryInjectIoUringCqOverflowForTest(delta: 1, out int injectedEngineCount) || injectedEngineCount == 0)
+            {
+                return;
+            }
+
+            DateTime end = DateTime.UtcNow + TimeSpan.FromSeconds(10);
+            Task injectorTask = Task.Run(async () =>
+            {
+                while (DateTime.UtcNow < end)
+                {
+                    _ = TryInjectIoUringCqOverflowForTest(delta: 1, out _);
+                    await Task.Delay(5);
+                }
+            });
+
+            Task workloadTask = Task.Run(async () =>
+            {
+                while (DateTime.UtcNow < end)
+                {
+                    await RunTcpRoundTripAsync(2);
+                }
+            });
+
+            Task combined = Task.WhenAll(injectorTask, workloadTask);
+            Task completed = await Task.WhenAny(combined, Task.Delay(TimeSpan.FromSeconds(30)));
+            Assert.Same(combined, completed);
+            await combined;
+
+            Assert.True(
+                await WaitForIoUringTelemetryCounterAtLeastAsync("_ioUringCqOverflowRecoveries", recoveryBefore + 1),
+                "Expected CQ overflow recovery counter to increment during sustained overflow scenario.");
+            Assert.True(
+                await WaitForIoUringCompletionSlotsInUseAtMostAsync(baselineCompletionSlotsInUse + 2, timeoutMilliseconds: 15000),
+                "Completion-slot usage did not settle after sustained overflow scenario.");
+            Assert.True(
+                await WaitForIoUringTrackedOperationsAtMostAsync(baselineTrackedOperations + 2, timeoutMilliseconds: 15000),
+                "Tracked-operation count did not settle after sustained overflow scenario.");
+        }
+
+        private static async Task RunCqOverflowReflectionTargetStabilityScenarioAsync()
+        {
+            await RunTcpRoundTripAsync(4);
+            bool hasIoUringPort = AssertIoUringCqReflectionTargetsStableForTest();
+            if (!hasIoUringPort)
+            {
+                return;
+            }
+
+            Assert.True(hasIoUringPort, "Expected at least one active io_uring engine when io_uring mode is enabled.");
+        }
+
+        private static Task RunNativeMsghdrLayoutContractScenarioAsync()
+        {
+            AssertNativeMsghdrLayoutContractForIoUring();
+            return Task.CompletedTask;
+        }
+
+        private static Task RunNativeMsghdr32BitRejectionScenarioAsync()
+        {
+            AssertNativeMsghdr32BitRejectionPathForIoUring();
+            return Task.CompletedTask;
+        }
+
+        private static Task RunCompletionSlotLayoutContractScenarioAsync()
+        {
+            AssertIoUringCompletionSlotLayoutContractForIoUring();
+            return Task.CompletedTask;
+        }
+
+        private static Task RunCompletionSlotUserDataBoundaryScenarioAsync()
+        {
+            AssertCompletionSlotUserDataEncodingBoundaryContractForIoUring();
+            return Task.CompletedTask;
+        }
+
+        private static async Task RunCloseOnExecForkExecScenarioAsync()
+        {
+            await RunTcpRoundTripAsync(4);
+
+            if (!TryGetIoUringRingFdForTest(out int ringFd) ||
+                !TryGetIoUringWakeupEventFdForTest(out int wakeupEventFd))
+            {
+                return;
+            }
+
+            Assert.False(
+                DoesExecChildObserveFileDescriptor(ringFd),
+                $"Ring fd {ringFd} unexpectedly survived exec.");
+            Assert.False(
+                DoesExecChildObserveFileDescriptor(wakeupEventFd),
+                $"Wakeup eventfd {wakeupEventFd} unexpectedly survived exec.");
+        }
+
+        private static async Task RunDebugNonEventLoopSingleIssuerAssertionScenarioAsync()
+        {
+#if DEBUG
+            await RunTcpRoundTripAsync(4);
+
+            SocketAsyncEngine[] engines = SocketAsyncEngine.GetActiveIoUringEnginesForTest();
+            if (engines.Length == 0)
+            {
+                return;
+            }
+
+            SocketAsyncEngine ioUringEngine = engines[0];
+
+            using var listener = new ThrowingTraceListener();
+            Trace.Listeners.Add(listener);
+            try
+            {
+                Exception? ex = await Record.ExceptionAsync(async () =>
+                {
+                    await Task.Run(() => ioUringEngine.SubmitIoUringOperationsNormalizedForTest());
+                });
+
+                Assert.NotNull(ex);
+                Assert.Contains("SINGLE_ISSUER", ex.ToString(), StringComparison.OrdinalIgnoreCase);
+            }
+            finally
+            {
+                Trace.Listeners.Remove(listener);
+            }
+#else
+            await Task.CompletedTask;
+#endif
+        }
+
+        private static async Task RunCompletionCancellationRaceAsync(int iterations)
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] receiveBuffer = new byte[1];
+            int completedCount = 0;
+            int canceledCount = 0;
+            for (int i = 0; i < iterations; i++)
+            {
+                while (server.Available > 0)
+                {
+                    int drainLength = Math.Min(server.Available, 256);
+                    byte[] drainBuffer = new byte[drainLength];
+                    int drained = await ToTask(server.ReceiveAsync(drainBuffer, SocketFlags.None));
+                    if (drained == 0)
+                    {
+                        break;
+                    }
+                }
+
+                using var cts = new CancellationTokenSource();
+                Task<int> receiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None, cts.Token));
+                Task<int> sendTask;
+
+                if ((i & 1) == 0)
+                {
+                    cts.Cancel();
+                    sendTask = ToTask(client.SendAsync(new byte[] { unchecked((byte)(i + 1)) }, SocketFlags.None));
+                }
+                else
+                {
+                    sendTask = ToTask(client.SendAsync(new byte[] { unchecked((byte)(i + 1)) }, SocketFlags.None));
+                    await Task.Yield();
+                    cts.Cancel();
+                }
+
+                Exception? receiveException = await Record.ExceptionAsync(async () => await receiveTask);
+                if (receiveException is null)
+                {
+                    completedCount++;
+                    Assert.Equal(1, receiveTask.Result);
+                }
+                else
+                {
+                    canceledCount++;
+                    AssertCanceledOrInterrupted(receiveException);
+                }
+
+                Assert.Equal(1, await sendTask);
+            }
+
+            Assert.True(completedCount > 0);
+            Assert.True(canceledCount > 0);
+        }
+
+        private static async Task DrainAvailableBytesAsync(Socket socket)
+        {
+            while (socket.Available > 0)
+            {
+                int bytesToRead = Math.Min(socket.Available, 256);
+                byte[] drainBuffer = new byte[bytesToRead];
+                int read = await ToTask(socket.ReceiveAsync(drainBuffer, SocketFlags.None));
+                if (read <= 0)
+                {
+                    return;
+                }
+            }
+        }
+
+        private static async Task RunForcedEagainReceiveScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] firstReceiveBuffer = new byte[1];
+            Task<int> receiveTask = ToTask(server.ReceiveAsync(firstReceiveBuffer, SocketFlags.None));
+            await Task.Yield();
+
+            byte sendByte = 0x31;
+            for (int i = 0; i < 6 && !receiveTask.IsCompleted; i++)
+            {
+                Assert.Equal(1, await client.SendAsync(new byte[] { sendByte }, SocketFlags.None));
+                sendByte++;
+                await Task.Delay(10);
+            }
+
+            Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
+            Assert.Same(receiveTask, completed);
+            Assert.True(await receiveTask > 0);
+            await DrainAvailableBytesAsync(server);
+
+            byte[] secondReceiveBuffer = new byte[1];
+            Task<int> followUpReceiveTask = ToTask(server.ReceiveAsync(secondReceiveBuffer, SocketFlags.None));
+            await Task.Yield();
+            Assert.Equal(1, await client.SendAsync(new byte[] { 0x40 }, SocketFlags.None));
+            Task followUpCompleted = await Task.WhenAny(followUpReceiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
+            Assert.Same(followUpReceiveTask, followUpCompleted);
+            Assert.True(await followUpReceiveTask > 0);
+        }
+
+        private static async Task RunForcedEcanceledReceiveScenarioAsync()
+        {
+            var trio = await CreateConnectedTcpSocketTrioAsync();
+            using Socket _ = trio.Listener;
+            using Socket client = trio.Client;
+            using Socket server = trio.Server;
+
+            byte[] receiveBuffer = new byte[1];
+            Task<int> forcedReceiveTask = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+            await Task.Yield();
+            Assert.Equal(1, await client.SendAsync(new byte[] { 0x44 }, SocketFlags.None));
+
+            Task completed = await Task.WhenAny(forcedReceiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
+            Assert.Same(forcedReceiveTask, completed);
+            Exception? forcedReceiveException = await Record.ExceptionAsync(async () => await forcedReceiveTask);
+            if (forcedReceiveException is null)
+            {
+                Assert.True(forcedReceiveTask.Result > 0);
+            }
+            else
+            {
+                AssertCanceledOrInterrupted(forcedReceiveException);
+            }
+
+            await DrainAvailableBytesAsync(server);
+
+            byte[] followUpReceiveBuffer = new byte[1];
+            Task<int> followUpReceiveTask = ToTask(server.ReceiveAsync(followUpReceiveBuffer, SocketFlags.None));
+            await Task.Yield();
+            Assert.Equal(1, await client.SendAsync(new byte[] { 0x45 }, SocketFlags.None));
+            Task followUpCompleted = await Task.WhenAny(followUpReceiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
+            Assert.Same(followUpReceiveTask, followUpCompleted);
+            Assert.True(await followUpReceiveTask > 0);
+        }
+
+        private static Task RunForcedReceiveScenarioAsync(bool forceEcanceled) =>
+            forceEcanceled ? RunForcedEcanceledReceiveScenarioAsync() : RunForcedEagainReceiveScenarioAsync();
+
+        private static async Task RunForcedEnterEintrRetryLimitScenarioAsync()
+        {
+            Exception? firstFailure = await Record.ExceptionAsync(async () => await RunTcpRoundTripAsync(4));
+            if (firstFailure is not null)
+            {
+                Assert.True(
+                    firstFailure is SocketException ||
+                    firstFailure is OperationCanceledException ||
+                    firstFailure is ObjectDisposedException,
+                    $"Unexpected exception after forced io_uring_enter EINTR-limit error: {firstFailure}");
+            }
+
+            await RunTcpRoundTripAsync(4);
+        }
+
+        [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+        [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+        public static async Task IoUringOptIn_DoesNotBreakAsyncSocketWorkflows()
+        {
+            await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(64), CreateSocketEngineOptions()).DisposeAsync();
+        }
+
+        [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+        [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+        public static async Task IoUringAndEpollEngines_MixedProcessWorkload_Completes()
+        {
+            await RemoteExecutor.Invoke(
+                static () => RunHybridIoUringAndEpollEngineScenarioAsync(),
+                CreateSocketEngineOptions(
+                    ioUringValue: "1",
+                    threadCount: 2)).DisposeAsync();
+        }
+
+        [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+        [PlatformSpecific(TestPlatforms.Linux)] // kernel-version fallback behavior is Linux-specific.
+        public static async Task IoUringCompletionMode_ForcedKernelVersionUnsupported_FallsBackToEpoll()
+        {
+            await RemoteExecutor.Invoke(
+                static () => RunKernelVersionUnsupportedFallbackScenarioAsync(),
+                CreateSocketEngineOptions(
+                    ioUringValue: "1",
+                    forceKernelVersionUnsupported: true)).DisposeAsync();
+        }
+
+        [OuterLoop]
+        [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+        [PlatformSpecific(TestPlatforms.Linux)] // hybrid routing behavior is Linux-specific.
+        public static async Task IoUringCompletionMode_CancellationRouting_ThreadCount2_Progresses()
+        {
+            await RemoteExecutor.Invoke(
+                static () => RunThreadCountTwoCancellationRoutingScenarioAsync(),
+                CreateSocketEngineOptions(
+                    ioUringValue: "1",
+                    threadCount: 2)).DisposeAsync();
+        }
+
+        [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+        [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+        public static async Task IoUringCompletionMode_UnixDomainSockets_ConnectSendReceive_Works()
+        {
+            await RemoteExecutor.Invoke(static () => RunUnixDomainSocketRoundTripAsync(), CreateSocketEngineOptions()).DisposeAsync();
+        }
+
+        [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+        [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+        public static async Task SocketEngine_DefaultOptOut_DoesNotBreakAsyncSocketWorkflows()
+        {
+            await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(ioUringValue: null)).DisposeAsync();
+        }
+
+        [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+        [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task SocketEngine_KillSwitchZero_DoesNotBreakAsyncSocketWorkflows() + { + await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(ioUringValue: "0")).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringConfig_AppContextSwitches_HonoredWhenEnvUnset() + { + await RemoteExecutor.Invoke( + static () => + { + AssertBooleanAppContextSwitch( + switchName: "System.Net.Sockets.UseIoUring", + methodName: "IsIoUringEnabled", + expectedWhenSwitchTrue: true, + expectedWhenSwitchFalse: false); + AppContext.SetSwitch("System.Net.Sockets.UseIoUringSqPoll", true); + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested")); + AppContext.SetSwitch("System.Net.Sockets.UseIoUringSqPoll", false); + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested")); + }, + CreateSocketEngineOptions(ioUringValue: null)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringConfig_SqPoll_DualOptIn_RequiresAppContextAndEnvironment() + { + await RemoteExecutor.Invoke( + static () => + { + AppContext.SetSwitch("System.Net.Sockets.UseIoUring", true); + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsIoUringEnabled")); + + // SQPOLL request requires both AppContext opt-in and env var opt-in. + AppContext.SetSwitch("System.Net.Sockets.UseIoUringSqPoll", true); + Assert.True(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested")); + AppContext.SetSwitch("System.Net.Sockets.UseIoUringSqPoll", false); + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested")); + }, + CreateSocketEngineOptions( + ioUringValue: "0", + sqPollEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringConfig_ContradictoryPrimaryInputs_EnvironmentWinsOverAppContext() + { + // Env=0 must disable io_uring even when AppContext switch is true. + await RemoteExecutor.Invoke( + static () => + { + AppContext.SetSwitch("System.Net.Sockets.UseIoUring", true); + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsIoUringEnabled")); + }, + CreateSocketEngineOptions(ioUringValue: "0")).DisposeAsync(); + + // Env=1 must enable io_uring even when AppContext switch is false. + await RemoteExecutor.Invoke( + static () => + { + AppContext.SetSwitch("System.Net.Sockets.UseIoUring", false); + Assert.True(InvokeSocketAsyncEngineBoolMethod("IsIoUringEnabled")); + }, + CreateSocketEngineOptions(ioUringValue: "1")).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringConfig_SqPoll_ContradictoryInputs_RequireDualOptInAndOneValue() + { + // AppContext=true + env=0 must stay disabled. + await RemoteExecutor.Invoke( + static () => + { + AppContext.SetSwitch("System.Net.Sockets.UseIoUringSqPoll", true); + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested")); + }, + CreateSocketEngineOptions(ioUringValue: null, sqPollEnabled: false)).DisposeAsync(); + + // AppContext=false + env=1 must stay disabled. 
+ await RemoteExecutor.Invoke( + static () => + { + AppContext.SetSwitch("System.Net.Sockets.UseIoUringSqPoll", false); + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsSqPollRequested")); + }, + CreateSocketEngineOptions(ioUringValue: null, sqPollEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringConfig_RemovedProductionKnobs_DefaultEnabled() + { + await RemoteExecutor.Invoke( + static () => + { + Assert.False(InvokeSocketAsyncEngineBoolMethod("IsIoUringDirectSqeDisabled")); + Assert.True(InvokeSocketAsyncEngineBoolMethod("IsZeroCopySendOptedIn")); + Assert.True(InvokeSocketAsyncEngineBoolMethod("IsIoUringRegisterBuffersEnabled")); + }, + CreateSocketEngineOptions(ioUringValue: null)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringOptIn_UdpSendReceive_Works() + { + await RemoteExecutor.Invoke(static async () => + { + using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + IPEndPoint receiverEndpoint = (IPEndPoint)receiver.LocalEndPoint!; + + using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + sender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + IPEndPoint senderEndpoint = (IPEndPoint)sender.LocalEndPoint!; + sender.Connect(receiverEndpoint); + + byte[] sendBuffer = new byte[] { 7 }; + byte[] receiveBuffer = new byte[1]; + + for (int i = 0; i < 64; i++) + { + int sent = await sender.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, sent); + + EndPoint remote = new IPEndPoint(IPAddress.Any, 0); + SocketReceiveFromResult receiveFrom = await receiver.ReceiveFromAsync(receiveBuffer, SocketFlags.None, remote); + Assert.Equal(1, receiveFrom.ReceivedBytes); + Assert.Equal(sendBuffer[0], receiveBuffer[0]); + Assert.Equal(senderEndpoint, receiveFrom.RemoteEndPoint); + + int echoed = await receiver.SendToAsync(sendBuffer, SocketFlags.None, receiveFrom.RemoteEndPoint); + Assert.Equal(1, echoed); + + int received = await sender.ReceiveAsync(receiveBuffer, SocketFlags.None); + Assert.Equal(1, received); + Assert.Equal(sendBuffer[0], receiveBuffer[0]); + + unchecked + { + sendBuffer[0]++; + } + } + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
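Taken together, the configuration tests above pin down a simple precedence rule: the environment variable, when set, always wins over the AppContext switch, and SQPOLL additionally requires both knobs to agree. A minimal sketch of that decision logic; the method names are illustrative, not the engine's:

// Illustrative only: encodes the precedence these tests assert, not the engine's actual code.
private static bool ComputeIoUringEnabledSketch(string? envValue, bool appContextSwitch) =>
    envValue is not null ? envValue == "1" : appContextSwitch; // env var, when present, wins

private static bool ComputeSqPollRequestedSketch(string? envValue, bool appContextSwitch) =>
    envValue == "1" && appContextSwitch; // SQPOLL is dual opt-in: both knobs must be set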
+ public static async Task IoUringOptIn_MultipleConcurrentConnections_Work()
+ {
+     await RemoteExecutor.Invoke(static async () =>
+     {
+         const int ConnectionCount = 32;
+
+         using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+         listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+         listener.Listen(ConnectionCount);
+
+         var acceptTasks = new Task<Socket>[ConnectionCount];
+         var clients = new Socket[ConnectionCount];
+
+         for (int i = 0; i < ConnectionCount; i++)
+         {
+             acceptTasks[i] = listener.AcceptAsync();
+         }
+
+         var connectTasks = new Task[ConnectionCount];
+         IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+         for (int i = 0; i < ConnectionCount; i++)
+         {
+             clients[i] = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+             connectTasks[i] = clients[i].ConnectAsync(endpoint);
+         }
+
+         await Task.WhenAll(connectTasks);
+         Socket[] servers = await Task.WhenAll(acceptTasks);
+
+         var roundTripTasks = new List<Task>(ConnectionCount);
+         for (int i = 0; i < ConnectionCount; i++)
+         {
+             Socket client = clients[i];
+             Socket server = servers[i];
+             byte value = (byte)(i + 1);
+             roundTripTasks.Add(Task.Run(async () =>
+             {
+                 byte[] tx = new byte[] { value };
+                 byte[] rx = new byte[1];
+
+                 int sent = await client.SendAsync(tx, SocketFlags.None);
+                 Assert.Equal(1, sent);
+
+                 int received = await server.ReceiveAsync(rx, SocketFlags.None);
+                 Assert.Equal(1, received);
+                 Assert.Equal(value, rx[0]);
+
+                 sent = await server.SendAsync(tx, SocketFlags.None);
+                 Assert.Equal(1, sent);
+
+                 received = await client.ReceiveAsync(rx, SocketFlags.None);
+                 Assert.Equal(1, received);
+                 Assert.Equal(value, rx[0]);
+             }));
+         }
+
+         await Task.WhenAll(roundTripTasks);
+
+         for (int i = 0; i < ConnectionCount; i++)
+         {
+             servers[i].Dispose();
+             clients[i].Dispose();
+         }
+     }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringOptIn_DisconnectReconnectAndCancellation_Work()
+ {
+     await RemoteExecutor.Invoke(static async () =>
+     {
+         using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+         listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+         listener.Listen(2);
+         IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+         // First connection lifecycle: block scope ensures disposal before reconnect.
+         {
+             var firstPair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+             using Socket firstClient = firstPair.Client;
+             using Socket firstServer = firstPair.Server;
+         }
+
+         // Reconnect and validate cancellation + subsequent data flow.
+         var secondPair = await AcceptConnectedTcpPairAsync(listener, endpoint);
+         using Socket secondClient = secondPair.Client;
+         using Socket secondServer = secondPair.Server;
+
+         byte[] receiveBuffer = new byte[1];
+         using (var cts = new CancellationTokenSource())
+         {
+             var pendingReceive = secondServer.ReceiveAsync(receiveBuffer.AsMemory(), SocketFlags.None, cts.Token);
+             cts.Cancel();
+
+             Exception?
ex = await Record.ExceptionAsync(async () => await pendingReceive);
+             Assert.NotNull(ex);
+             Assert.True(
+                 ex is OperationCanceledException ||
+                 ex is SocketException socketException &&
+                 (socketException.SocketErrorCode == SocketError.OperationAborted || socketException.SocketErrorCode == SocketError.Interrupted),
+                 $"Unexpected exception: {ex}");
+         }
+
+         byte[] sendBuffer = new byte[] { 42 };
+         int sent = await secondClient.SendAsync(sendBuffer, SocketFlags.None);
+         Assert.Equal(1, sent);
+
+         int received = await secondServer.ReceiveAsync(receiveBuffer, SocketFlags.None);
+         Assert.Equal(1, received);
+         Assert.Equal(sendBuffer[0], receiveBuffer[0]);
+     }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_QueuedZeroByteReceive_DoesNotStall()
+ {
+     await RemoteExecutor.Invoke(static async () =>
+     {
+         var trio = await CreateConnectedTcpSocketTrioAsync();
+         using Socket _ = trio.Listener;
+         using Socket client = trio.Client;
+         using Socket server = trio.Server;
+
+         byte[] firstReceiveBuffer = new byte[1];
+         Task<int> firstReceive = ToTask(server.ReceiveAsync(firstReceiveBuffer, SocketFlags.None));
+         await Task.Yield();
+
+         Task<int> zeroByteReceive = ToTask(server.ReceiveAsync(Memory<byte>.Empty, SocketFlags.None));
+         await Task.Yield();
+
+         byte[] firstPayload = new byte[] { 0x11 };
+         Assert.Equal(1, await client.SendAsync(firstPayload, SocketFlags.None));
+         Assert.Equal(1, await firstReceive);
+         Assert.Equal(firstPayload[0], firstReceiveBuffer[0]);
+
+         Task completed = await Task.WhenAny(zeroByteReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+         Assert.Same(zeroByteReceive, completed);
+         Assert.Equal(0, await zeroByteReceive);
+
+         byte[] secondReceiveBuffer = new byte[1];
+         Task<int> secondReceive = ToTask(server.ReceiveAsync(secondReceiveBuffer, SocketFlags.None));
+         await Task.Yield();
+
+         byte[] secondPayload = new byte[] { 0x22 };
+         Assert.Equal(1, await client.SendAsync(secondPayload, SocketFlags.None));
+         Assert.Equal(1, await secondReceive);
+         Assert.Equal(secondPayload[0], secondReceiveBuffer[0]);
+     }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_PureCompletionMode_MixesTcpAndUdp()
+ {
+     await RemoteExecutor.Invoke(
+         static () => RunPureCompletionScenarioAsync(),
+         CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_CancelWithoutTraffic_CompletesPromptly()
+ {
+     await RemoteExecutor.Invoke(static async () =>
+     {
+         var trio = await CreateConnectedTcpSocketTrioAsync();
+         using Socket _ = trio.Listener;
+         using Socket client = trio.Client;
+         using Socket server = trio.Server;
+
+         byte[] receiveBuffer = new byte[16];
+         using var cts = new CancellationTokenSource();
+         Task<int> pendingReceive = ToTask(server.ReceiveAsync(receiveBuffer, SocketFlags.None, cts.Token));
+
+         cts.Cancel();
+         Task completed = await Task.WhenAny(pendingReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+         Assert.Same(pendingReceive, completed);
+
+         Exception?
ex = await Record.ExceptionAsync(async () => await pendingReceive); + AssertCanceledOrInterrupted(ex); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ReceiveMessageFrom_CancellationAndDispose_DoNotHang() + { + await RemoteExecutor.Invoke(static () => RunReceiveMessageFromCancellationAndDisposeScenariosAsync(), CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ReceiveMessageFrom_Cancellation_DoesNotPoisonNextReceive() + { + await RemoteExecutor.Invoke( + static () => RunReceiveMessageFromCancelThenReceiveScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ReceiveMessageFrom_CancellationAndDispose_GcPressure_DoNotHang() + { + await RemoteExecutor.Invoke( + static () => RunReceiveMessageFromCancellationAndDisposeScenariosWithGcPressureAsync(iterations: 32), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_TeardownDrainTrackedOperations_CancelsPendingReceives() + { + await RemoteExecutor.Invoke( + static () => RunTeardownDrainTrackedOperationsScenarioAsync(iterations: 64), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_TeardownCancellationDuplicateGuard_DoesNotInflateAsyncCancelRequestCqes() + { + await RemoteExecutor.Invoke( + static () => RunTeardownCancellationDuplicateGuardScenarioAsync(iterations: 96), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_MixedReadinessAndCompletion_NoStarvation() + { + await RemoteExecutor.Invoke( + static () => RunMixedModeReadinessCompletionStressScenarioAsync(iterations: 128), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_SameSocketReadinessCompletionBacklog_NoStarvation() + { + await RemoteExecutor.Invoke( + static () => RunSameSocketReadinessCompletionBacklogScenarioAsync(iterations: 64, completionBatchSize: 8), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
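Many scenarios above funnel ValueTask<int> results through a ToTask helper so that pending receives can race Task.Delay inside Task.WhenAny. Its assumed shape, not shown in this hunk, is a one-liner:

// Assumed shape of the ToTask helper used throughout these scenarios.
private static Task<int> ToTask(ValueTask<int> valueTask) => valueTask.AsTask();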
+ public static async Task IoUringCompletionMode_BoundedWaitBufferPressure_NoLossAndCountersIncrease() + { + await RemoteExecutor.Invoke( + static () => RunBoundedWaitBufferPressureScenarioAsync(connectionCount: 32), + CreateSocketEngineOptions(testEventBufferCount: 1)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_InvalidTestEventBufferCount_FallsBackToDefault() + { + await RemoteExecutor.Invoke( + static () => RunTcpRoundTripAsync(32), + CreateSocketEngineOptions(testEventBufferCountRaw: "not-a-number")).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_PrepareQueueOverflow_FallsBackAndCompletes() + { + await RemoteExecutor.Invoke( + static () => RunPrepareQueueOverflowFallbackScenarioAsync(connectionCount: 32), + CreateSocketEngineOptions(prepareQueueCapacity: 1, directSqeEnabled: false)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ConnectQueueOverflow_FallsBackAndCompletes() + { + await RemoteExecutor.Invoke( + static () => RunConnectQueueOverflowFallbackScenarioAsync(connectionCount: 32), + CreateSocketEngineOptions(prepareQueueCapacity: 1, directSqeEnabled: false)).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_PrepareQueueOverflow_Stress_NoHangs() + { + await RemoteExecutor.Invoke( + static () => RunPrepareQueueOverflowFallbackScenarioAsync(connectionCount: 96), + CreateSocketEngineOptions(prepareQueueCapacity: 2, directSqeEnabled: false)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_CqOverflow_Recovery_InjectAndCompletes() + { + await RemoteExecutor.Invoke( + static () => RunCqOverflowRecoveryScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_CqOverflow_Recovery_ZeroTrackedOperations_Completes() + { + await RemoteExecutor.Invoke( + static () => RunCqOverflowRecoveryWithZeroTrackedOperationsScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
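The queue-overflow tests above fix the contract for a full prepare queue: the operation must take a fallback path and still complete, never drop or hang. A schematic of that enqueue-or-fall-back shape; the types and names are illustrative, since the engine's actual queue is internal:

// Schematic of the contract the overflow tests exercise; not the engine's implementation.
private static bool TrySchedulePrepareSketch<T>(
    Queue<T> prepareQueue, int capacity, T operation, Action<T> fallbackPath)
{
    if (prepareQueue.Count >= capacity)
    {
        fallbackPath(operation); // bounded queue is full: complete via the fallback path instead
        return false;
    }

    prepareQueue.Enqueue(operation);
    return true;
}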
+ public static async Task IoUringCompletionMode_CqOverflow_Recovery_SmallRing_RealKernelOverflow() + { + await RemoteExecutor.Invoke( + static () => RunCqOverflowRecoveryWithSmallRingScenarioAsync(), + CreateSocketEngineOptions( + queueEntries: 8, + threadCount: 1, + directSqeEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringMultishotAccept_CqOverflowDuringArming_RecoversWithoutSilentLoss() + { + await RemoteExecutor.Invoke( + static () => RunMultishotAcceptOverflowArmingScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_CqOverflow_TeardownRace_DoesNotHang() + { + await RemoteExecutor.Invoke( + static () => RunTeardownUnderOverflowScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_CqOverflow_SustainedReentrancy_NoDeadlock() + { + await RemoteExecutor.Invoke( + static () => RunSustainedOverflowReentrancyScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_CqOverflow_ReflectionTargets_Stable() + { + await RemoteExecutor.Invoke( + static () => RunCqOverflowReflectionTargetStabilityScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_NativeMsghdrLayoutContract_IsStable() + { + await RemoteExecutor.Invoke( + static () => RunNativeMsghdrLayoutContractScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_NativeMsghdrLayout_Rejects32BitPath() + { + await RemoteExecutor.Invoke( + static () => RunNativeMsghdr32BitRejectionScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_CompletionSlotLayoutContract_IsStable() + { + await RemoteExecutor.Invoke( + static () => RunCompletionSlotLayoutContractScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_CompletionSlotUserData_BoundaryEncoding_IsStable() + { + await RemoteExecutor.Invoke( + static () => RunCompletionSlotUserDataBoundaryScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring ring fd + fcntl are Linux-specific. + public static async Task IoUringCompletionMode_RingFd_HasCloseOnExecSet() + { + await RemoteExecutor.Invoke(static () => + { + return Task.Run(async () => + { + await RunTcpRoundTripAsync(4); + + if (!TryGetIoUringRingFdForTest(out int ringFd)) + { + return; + } + + int descriptorFlags = Fcntl(ringFd, F_GETFD); + Assert.True(descriptorFlags >= 0, $"fcntl(F_GETFD) failed with errno {Marshal.GetLastPInvokeError()}."); + Assert.NotEqual(0, descriptorFlags & FD_CLOEXEC); + }); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring wakeup eventfd + fcntl are Linux-specific. + public static async Task IoUringCompletionMode_WakeupEventFd_HasCloseOnExecSet() + { + await RemoteExecutor.Invoke(static () => + { + return Task.Run(async () => + { + await RunTcpRoundTripAsync(4); + + if (!TryGetIoUringWakeupEventFdForTest(out int wakeupEventFd)) + { + return; + } + + int descriptorFlags = Fcntl(wakeupEventFd, F_GETFD); + Assert.True(descriptorFlags >= 0, $"fcntl(F_GETFD) failed with errno {Marshal.GetLastPInvokeError()}."); + Assert.NotEqual(0, descriptorFlags & FD_CLOEXEC); + }); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // fork/exec descriptor inheritance is Linux-specific. + public static async Task IoUringCompletionMode_RingAndWakeupEventFd_DoNotLeakAcrossExec() + { + await RemoteExecutor.Invoke( + static () => RunCloseOnExecForkExecScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring accept flags are Linux-specific. 
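The descriptor-flag assertions in the surrounding close-on-exec tests depend on a small fcntl shim plus the usual Linux constants, presumably defined elsewhere in this file. A sketch of what they would look like; the constant values match Linux headers, while the P/Invoke shape is an assumption:

// Sketch of the fcntl test helpers; constant values are Linux's, the P/Invoke shape is assumed.
private const int F_GETFD = 1;
private const int F_GETFL = 3;
private const int FD_CLOEXEC = 1;
private const int O_NONBLOCK = 0x800; // Linux on x86-64/arm64

[DllImport("libc", EntryPoint = "fcntl", SetLastError = true)]
private static extern int Fcntl(int fd, int cmd);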
+ public static async Task IoUringCompletionMode_AcceptedSocket_HasCloseOnExecAndNonBlockingSet()
+ {
+     await RemoteExecutor.Invoke(static () =>
+     {
+         return Task.Run(async () =>
+         {
+             using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+             listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+             listener.Listen(1);
+
+             using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+             Task<Socket> acceptTask = listener.AcceptAsync();
+             await client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!);
+             using Socket server = await acceptTask;
+
+             if (!TryGetIoUringRingFdForTest(out _))
+             {
+                 return;
+             }
+
+             int acceptedFd = checked((int)server.SafeHandle.DangerousGetHandle());
+
+             int descriptorFlags = Fcntl(acceptedFd, F_GETFD);
+             Assert.True(descriptorFlags >= 0, $"fcntl(F_GETFD) failed with errno {Marshal.GetLastPInvokeError()}.");
+             Assert.NotEqual(0, descriptorFlags & FD_CLOEXEC);
+
+             int statusFlags = Fcntl(acceptedFd, F_GETFL);
+             Assert.True(statusFlags >= 0, $"fcntl(F_GETFL) failed with errno {Marshal.GetLastPInvokeError()}.");
+             Assert.NotEqual(0, statusFlags & O_NONBLOCK);
+         });
+     }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring seccomp/submit-error behavior is Linux-specific.
+ public static async Task IoUringCompletionMode_ForcedSubmitEperm_DoesNotCrashProcess()
+ {
+     await RemoteExecutor.Invoke(static () =>
+     {
+         return Task.Run(async () =>
+         {
+             Exception? firstFailure = await Record.ExceptionAsync(async () => await RunTcpRoundTripAsync(4));
+             if (firstFailure is not null)
+             {
+                 Assert.True(
+                     firstFailure is SocketException ||
+                     firstFailure is OperationCanceledException ||
+                     firstFailure is ObjectDisposedException,
+                     $"Unexpected exception after forced submit EPERM: {firstFailure}");
+             }
+
+             // Ensure the engine remains usable after the forced EPERM submit rejection.
+             await RunTcpRoundTripAsync(4);
+         });
+     }, CreateSocketEngineOptions(forceSubmitEpermOnce: true)).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring enter retry-limit behavior is Linux-specific.
+ public static async Task IoUringCompletionMode_ForcedEnterEintrRetryLimit_DoesNotCrashProcess()
+ {
+     await RemoteExecutor.Invoke(
+         static () => RunForcedEnterEintrRetryLimitScenarioAsync(),
+         CreateSocketEngineOptions(forceEnterEintrRetryLimitOnce: true)).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_SingleIssuer_DebugAssertion_FiresOnNonEventLoopCall()
+ {
+     await RemoteExecutor.Invoke(
+         static () => RunDebugNonEventLoopSingleIssuerAssertionScenarioAsync(),
+         CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ [InlineData(false)] + [InlineData(true)] + public static async Task IoUringCompletionMode_NonPinnableMemory_FallsBackAndCompletes(bool receivePath) + { + await RemoteExecutor.Invoke( + static arg => RunNonPinnableMemoryFallbackScenarioAsync(receivePath: bool.Parse(arg)), + receivePath.ToString(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_NonPinnableFallbackTelemetryCounter_Increments() + { + await RemoteExecutor.Invoke( + static () => RunNonPinnableFallbackTelemetryScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_PinnableMemory_PinReleaseLifecycle_Works() + { + await RemoteExecutor.Invoke( + static () => RunPinnableMemoryPinReleaseLifecycleScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegistrationLifecycle_IsStable() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferRegistrationLifecycleScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_BufferSelectReceive_RecyclesBuffer() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferSelectReceiveScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RecyclesBeyondRingCapacity() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferRecycleReuseScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_ForcedExhaustion_ReportsNoBufferSpace() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferExhaustionScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_MixedWithRecvFrom_Works() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferMixedWorkloadScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_SmallMessages_Shrinks() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferSmallMessageShrinkScenarioAsync(), + CreateSocketEngineOptions( + providedBufferSize: 4096, + adaptiveBufferSizingEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_LargeMessages_Grows() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferLargeMessageGrowScenarioAsync(), + CreateSocketEngineOptions( + providedBufferSize: 4096, + adaptiveBufferSizingEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_MixedWorkload_Stable() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferMixedWorkloadStableScenarioAsync(), + CreateSocketEngineOptions( + providedBufferSize: 4096, + adaptiveBufferSizingEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_ResizeSwap_NoDataLoss() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferResizeSwapNoDataLossScenarioAsync(), + CreateSocketEngineOptions( + providedBufferSize: 4096, + adaptiveBufferSizingEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_ResizeSwap_ConcurrentInFlight_NoDataLoss() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferResizeSwapConcurrentInFlightNoDataLossScenarioAsync(), + CreateSocketEngineOptions( + providedBufferSize: 4096, + adaptiveBufferSizingEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_Disabled_StaysFixed() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferDisabledScenarioAsync(), + CreateSocketEngineOptions( + providedBufferSize: 4096, + adaptiveBufferSizingEnabled: false)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_Default_IsDisabled() + { + await RemoteExecutor.Invoke( + static () => RunAdaptiveProvidedBufferSizingStateScenarioAsync(expectedEnabled: false), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ [InlineData(false)] + [InlineData(true)] + public static async Task IoUringCompletionMode_ProvidedBuffer_AdaptiveSizing_Switch_HonorsBothValues(bool enabled) + { + await RemoteExecutor.Invoke( + static arg => RunAdaptiveProvidedBufferSizingStateScenarioAsync(bool.Parse(arg)), + enabled.ToString(), + CreateSocketEngineOptions(adaptiveBufferSizingEnabled: enabled)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_DisabledByEnvVar() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferKernelRegistrationDisabledScenarioAsync(), + CreateSocketEngineOptions(registerBuffersEnabled: false)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_SuccessState_VisibleWhenAvailable() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferKernelRegistrationSuccessScenarioAsync(), + CreateSocketEngineOptions(registerBuffersEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_FailureWhenObserved_IsNonFatal() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferKernelRegistrationFailureNonFatalScenarioAsync(), + CreateSocketEngineOptions( + registerBuffersEnabled: true, + providedBufferSize: 65536)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_AdaptiveResize_TriggersReregistration() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferKernelReregistrationOnResizeScenarioAsync(), + CreateSocketEngineOptions( + registerBuffersEnabled: true, + adaptiveBufferSizingEnabled: true, + providedBufferSize: 4096)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_DataCorrectness_WithRegisteredBuffers() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferRegisteredBuffersDataCorrectnessScenarioAsync(), + CreateSocketEngineOptions(registerBuffersEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_MemoryPressure_GracefulFallbackOrSuccess() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferRegistrationMemoryPressureScenarioAsync(), + CreateSocketEngineOptions( + registerBuffersEnabled: true, + providedBufferSize: 65536)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // provided-buffer OOM fallback is Linux-specific. 
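The register-buffers tests above ultimately drive the shim's io_uring_register wrapper. Schematically, registration passes an iovec array with the IORING_REGISTER_BUFFERS opcode (0 in the io_uring ABI); the helper below is an illustrative wrapper over the shim call, not code from the PR:

// Illustrative wrapper; IORING_REGISTER_BUFFERS is 0 in the io_uring ABI.
// The iovec targets must stay pinned for the lifetime of the registration.
private static unsafe Interop.Error RegisterBuffersSketch(int ringFd, void* iovecs, uint iovecCount)
{
    const uint IORING_REGISTER_BUFFERS = 0;
    int result;
    return Interop.Sys.IoUringShimRegister(ringFd, IORING_REGISTER_BUFFERS, iovecs, iovecCount, &result);
}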
+ public static async Task IoUringCompletionMode_ProvidedBuffer_ForcedRingAllocationFailure_FallsBackGracefully() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferRingForcedAllocationFailureFallbackScenarioAsync(), + CreateSocketEngineOptions(forceProvidedBufferRingOomOnce: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring teardown ordering contract is Linux-specific. + public static async Task IoUringCompletionMode_ProvidedBuffer_RegisterBuffers_TeardownOrdering_UnregisterBeforeRingClose() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferTeardownOrderingContractScenarioAsync(), + CreateSocketEngineOptions(registerBuffersEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_FixedRecv_Default_IsDisabled() + { + await RemoteExecutor.Invoke( + static () => RunFixedRecvStateScenarioAsync(expectedEnabled: false), + CreateSocketEngineOptions(registerBuffersEnabled: false)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_FixedRecv_Activation_FollowsRuntimeCapabilities() + { + await RemoteExecutor.Invoke( + static () => RunFixedRecvActivationFollowsRuntimeCapabilitiesScenarioAsync(), + CreateSocketEngineOptions(registerBuffersEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_FixedRecv_Enabled_DataCorrectness_WithRegisteredBuffers() + { + await RemoteExecutor.Invoke( + static () => RunFixedRecvDataCorrectnessScenarioAsync(), + CreateSocketEngineOptions( + registerBuffersEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL is Linux io_uring-specific. + public static async Task IoUringCompletionMode_SqPoll_BasicSendReceive() + { + await RemoteExecutor.Invoke( + static () => RunSqPollBasicSendReceiveScenarioAsync(), + CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL request behavior is Linux io_uring-specific. + public static async Task IoUringCompletionMode_SqPoll_Requested_DoesNotBreakSocketOperations() + { + await RemoteExecutor.Invoke( + static () => RunSqPollRequestedScenarioAsync(), + CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // DEFER_TASKRUN submitter_task is Linux io_uring-specific. + public static async Task IoUringCompletionMode_DeferTaskrun_InitializesOnEventLoopThread() + { + await RemoteExecutor.Invoke( + static () => RunDeferTaskrunEventLoopInitScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL wakeup path is Linux io_uring-specific. 
+ public static async Task IoUringCompletionMode_SqPoll_IdleWakeupPath_IncrementsWakeupCounterWhenObserved() + { + await RemoteExecutor.Invoke( + static () => RunSqPollWakeupAfterIdleScenarioAsync(), + CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL + multishot recv is Linux io_uring-specific. + public static async Task IoUringCompletionMode_SqPoll_MultishotRecv_Works() + { + await RemoteExecutor.Invoke( + static () => RunSqPollMultishotRecvScenarioAsync(), + CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL + zero-copy send is Linux io_uring-specific. + public static async Task IoUringCompletionMode_SqPoll_ZeroCopySend_Works() + { + await RemoteExecutor.Invoke( + static () => RunSqPollZeroCopySendScenarioAsync(), + CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL telemetry counters are Linux io_uring-specific. + public static async Task IoUringCompletionMode_SqPoll_TelemetryCounters_Emitted() + { + await RemoteExecutor.Invoke( + static () => RunSqPollTelemetryCountersScenarioAsync(), + CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // SQPOLL SQ flags contract is Linux io_uring-specific. + public static async Task IoUringCompletionMode_SqPoll_SqNeedWakeup_ContractMatchesSqFlagBit() + { + await RemoteExecutor.Invoke( + static () => RunSqPollNeedWakeupContractScenarioAsync(), + CreateSocketEngineOptions(sqPollEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ZeroCopySend_Default_IsEnabledWhenSupported() + { + await RemoteExecutor.Invoke( + static () => RunZeroCopySendStateScenarioAsync(expectedEnabledWhenSupported: true), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + [InlineData(false)] + [InlineData(true)] + public static async Task IoUringCompletionMode_ZeroCopySend_Switch_HonorsBothValues(bool enabled) + { + await RemoteExecutor.Invoke( + static arg => RunZeroCopySendStateScenarioAsync(bool.Parse(arg)), + enabled.ToString(), + CreateSocketEngineOptions(zeroCopySendEnabled: enabled)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ZeroCopySend_LargeBuffer_CompletesCorrectly() + { + await RemoteExecutor.Invoke( + static () => RunZeroCopySendLargeBufferRoundTripScenarioAsync(), + CreateSocketEngineOptions(zeroCopySendEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
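The small-versus-large buffer split in the zero-copy tests that follow reflects a size gate: below some threshold, the extra notification CQE of SEND_ZC is not worth it and the regular send path is used instead. A hedged sketch of such a gate; the cutoff value is illustrative, since the PR's actual threshold is not stated in this hunk:

// Illustrative gate only; the real threshold and opcode selection live in the engine.
private const int ZeroCopySendThresholdSketch = 16 * 1024;

private static bool ShouldUseZeroCopySendSketch(int byteCount, bool zeroCopyEnabled) =>
    zeroCopyEnabled && byteCount >= ZeroCopySendThresholdSketch;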
+ public static async Task IoUringCompletionMode_ZeroCopySend_SmallBuffer_UsesRegularSendFallbackPath_ForcedSendErrorObserved() + { + await RemoteExecutor.Invoke( + static () => RunZeroCopySendSmallBufferUsesRegularSendWithForcedSendErrorScenarioAsync(), + CreateSocketEngineOptions( + zeroCopySendEnabled: true, + forceEcanceledOnceMask: "send")).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ZeroCopySend_NotifCqe_ReleasesPinHolds() + { + await RemoteExecutor.Invoke( + static () => RunZeroCopySendNotifCqeReleasesPinHoldsScenarioAsync(), + CreateSocketEngineOptions(zeroCopySendEnabled: true)).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ZeroCopySend_ResetStorm_RecoversPendingNotificationSlots() + { + await RemoteExecutor.Invoke( + static () => RunZeroCopySendResetStormSlotRecoveryScenarioAsync(), + CreateSocketEngineOptions(zeroCopySendEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ZeroCopySend_PartialSendResubmission_CompletesFully() + { + await RemoteExecutor.Invoke( + static () => RunZeroCopySendPartialSendResubmissionScenarioAsync(), + CreateSocketEngineOptions(zeroCopySendEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ZeroCopySend_TaskCompletion_ReleasesPins() + { + await RemoteExecutor.Invoke( + static () => RunZeroCopySendCompletionPinLifetimeScenarioAsync(), + CreateSocketEngineOptions(zeroCopySendEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ZeroCopySend_UnsupportedOpcode_FallsBackGracefully() + { + await RemoteExecutor.Invoke( + static () => RunZeroCopySendUnsupportedOpcodeFallbackScenarioAsync(), + CreateSocketEngineOptions(zeroCopySendEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ZeroCopySend_BufferList4KSegments_AboveThreshold_UsesSendMsgZc() + { + await RemoteExecutor.Invoke( + static () => RunZeroCopySendBufferListSegmentThresholdScenarioAsync(), + CreateSocketEngineOptions( + zeroCopySendEnabled: true, + forceEcanceledOnceMask: "sendmsg")).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_ZeroCopySend_SendToAboveThreshold_UsesSendMsgZc() + { + await RemoteExecutor.Invoke( + static () => RunZeroCopySendToAboveThresholdScenarioAsync(), + CreateSocketEngineOptions( + zeroCopySendEnabled: true, + forceEcanceledOnceMask: "sendmsg")).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringMultishotRecv_Basic_CompletesAcrossIterations() + { + await RemoteExecutor.Invoke( + static () => RunMultishotRecvBasicScenarioAsync(iterations: 64), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringMultishotRecv_Cancellation_Completes() + { + await RemoteExecutor.Invoke( + static () => RunMultishotRecvCancellationScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringMultishotRecv_PeerClose_Terminates() + { + await RemoteExecutor.Invoke( + static () => RunMultishotRecvPeerCloseScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringMultishotRecv_ProvidedBufferExhaustion_FollowsPolicy() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferExhaustionScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringPersistentMultishotRecv_ProvidedBufferExhaustion_TerminatesAndRecovers() + { + await RemoteExecutor.Invoke( + static () => RunPersistentMultishotRecvProvidedBufferExhaustionScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringMultishotRecv_MixedWithOneShot_Coexists() + { + await RemoteExecutor.Invoke( + static () => RunProvidedBufferMixedWorkloadScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringPersistentMultishotRecv_ShapeChange_CancelsAndRearms() + { + await RemoteExecutor.Invoke( + static () => RunPersistentMultishotRecvShapeChangeScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringPersistentMultishotRecv_ConcurrentCloseRace_DoesNotHang() + { + await RemoteExecutor.Invoke( + static () => RunPersistentMultishotRecvConcurrentCloseRaceScenarioAsync(iterations: 32), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring queue saturation behavior is Linux-specific. + public static async Task IoUringPersistentMultishotRecv_DataQueueSaturation_CapsAtSixteenBufferedCompletions() + { + await RemoteExecutor.Invoke( + static () => RunPersistentMultishotRecvQueueSaturationScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // direct SQE preparation race is Linux-specific. + public static async Task IoUringPersistentMultishotRecv_ConcurrentCloseRace_DirectSqeEnabled_DoesNotHang() + { + await RemoteExecutor.Invoke( + static () => RunPersistentMultishotRecvConcurrentCloseRaceScenarioAsync(iterations: 32), + CreateSocketEngineOptions(directSqeEnabled: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringMultishotAccept_Basic_CompletesAcrossIterations() + { + await RemoteExecutor.Invoke( + static () => RunMultishotAcceptBasicScenarioAsync(connectionCount: 10), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringMultishotAccept_PrequeuesConnections_BeforeSubsequentAcceptAsync() + { + await RemoteExecutor.Invoke( + static () => RunMultishotAcceptPrequeueScenarioAsync(prequeuedConnectionCount: 5), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringMultishotAccept_ListenerClose_CompletesPendingAcceptAndDrainsQueue() + { + await RemoteExecutor.Invoke( + static () => RunMultishotAcceptListenerCloseScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // teardown race behavior is Linux-specific. + public static async Task IoUringMultishotAccept_TeardownRace_InFlightAcceptDelivery_DoesNotHang() + { + await RemoteExecutor.Invoke( + static () => RunMultishotAcceptTeardownRaceScenarioAsync(iterations: 32), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringMultishotAccept_DisposeDuringArmingRace_DoesNotHang() + { + await RemoteExecutor.Invoke( + static () => RunMultishotAcceptDisposeDuringArmingRaceScenarioAsync(iterations: 64), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringMultishotAccept_PrepareUnsupported_UsesOneShotFallback() + { + await RemoteExecutor.Invoke( + static () => RunMultishotAcceptPrepareUnsupportedOneShotFallbackScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringMultishotAccept_TerminalCompletion_RearmsOnNextAccept() + { + await RemoteExecutor.Invoke( + static () => RunMultishotAcceptRearmAfterTerminalCqeScenarioAsync(), + CreateSocketEngineOptions(forceEcanceledOnceMask: "accept")).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringMultishotAccept_HighConnectionRate_NoLoss() + { + await RemoteExecutor.Invoke( + static () => RunMultishotAcceptHighConnectionRateScenarioAsync(connectionCount: 256), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // slot-capacity pressure behavior is Linux-specific. + public static async Task IoUringCompletionMode_SlotCapacityStress_4000Connections_Completes() + { + await RemoteExecutor.Invoke( + static () => RunSlotCapacityStressScenarioAsync(connectionCount: 4000), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_NetworkStream_ReadAsync_CancellationToken_Works() + { + await RemoteExecutor.Invoke( + static () => RunNetworkStreamReadAsyncCancellationTokenScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_ReceiveAsync_SocketAsyncEventArgs_BufferList_Unaffected() + { + await RemoteExecutor.Invoke( + static () => RunReceiveAsyncSocketAsyncEventArgsBufferListScenarioAsync(), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_BufferListSendReceive_Works()
+ {
+ await RemoteExecutor.Invoke(static async () =>
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] payload = new byte[] { 0x01, 0x11, 0x21, 0x31, 0x41, 0x51, 0x61 };
+ var sendBuffers = new List<ArraySegment<byte>>
+ {
+ new ArraySegment<byte>(payload, 0, 2),
+ new ArraySegment<byte>(payload, 2, 1),
+ new ArraySegment<byte>(payload, 3, 4)
+ };
+
+ byte[] receiveBuffer1 = new byte[3];
+ byte[] receiveBuffer2 = new byte[4];
+ var receiveBuffers = new List<ArraySegment<byte>>
+ {
+ new ArraySegment<byte>(receiveBuffer1),
+ new ArraySegment<byte>(receiveBuffer2)
+ };
+
+ Task<int> receiveTask = server.ReceiveAsync(receiveBuffers, SocketFlags.None);
+ await Task.Yield();
+
+ int sent = await client.SendAsync(sendBuffers, SocketFlags.None);
+ Assert.Equal(payload.Length, sent);
+
+ int received = await receiveTask;
+ Assert.Equal(payload.Length, received);
+
+ byte[] combined = new byte[payload.Length];
+ Buffer.BlockCopy(receiveBuffer1, 0, combined, 0, receiveBuffer1.Length);
+ Buffer.BlockCopy(receiveBuffer2, 0, combined, receiveBuffer1.Length, receiveBuffer2.Length);
+ Assert.Equal(payload, combined);
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_BufferListReceive_WithPeek_PreservesData()
+ {
+ await RemoteExecutor.Invoke(static async () =>
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] payload = new byte[] { 0x0A, 0x1A, 0x2A, 0x3A };
+ Assert.Equal(payload.Length, await client.SendAsync(payload, SocketFlags.None));
+
+ byte[] peekBuffer1 = new byte[2];
+ byte[] peekBuffer2 = new byte[2];
+ int peeked = await server.ReceiveAsync(
+ new List<ArraySegment<byte>>
+ {
+ new ArraySegment<byte>(peekBuffer1),
+ new ArraySegment<byte>(peekBuffer2)
+ },
+ SocketFlags.Peek);
+ Assert.Equal(payload.Length, peeked);
+
+ byte[] peekCombined = new byte[payload.Length];
+ Buffer.BlockCopy(peekBuffer1, 0, peekCombined, 0, peekBuffer1.Length);
+ Buffer.BlockCopy(peekBuffer2, 0, peekCombined, peekBuffer1.Length, peekBuffer2.Length);
+ Assert.Equal(payload, peekCombined);
+
+ byte[] receiveBuffer1 = new byte[1];
+ byte[] receiveBuffer2 = new byte[3];
+ int received = await server.ReceiveAsync(
+ new List<ArraySegment<byte>>
+ {
+ new ArraySegment<byte>(receiveBuffer1),
+ new ArraySegment<byte>(receiveBuffer2)
+ },
+ SocketFlags.None);
+ Assert.Equal(payload.Length, received);
+
+ byte[] receiveCombined = new byte[payload.Length];
+ Buffer.BlockCopy(receiveBuffer1, 0, receiveCombined, 0, receiveBuffer1.Length);
+ Buffer.BlockCopy(receiveBuffer2, 0, receiveCombined, receiveBuffer1.Length, receiveBuffer2.Length);
+ Assert.Equal(payload, receiveCombined);
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
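+ // ReceiveFromAsync with a BufferList exercises the recvmsg msg_name path: the kernel fills in the
+ // sender's sockaddr, and the completion must surface it as RemoteEndPoint alongside the scattered payload.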
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_BufferListReceiveFrom_WritesRemoteEndPoint()
+ {
+ await RemoteExecutor.Invoke(static async () =>
+ {
+ using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ byte[] receiveBuffer1 = new byte[3];
+ byte[] receiveBuffer2 = new byte[4];
+ using var receiveEventArgs = new SocketAsyncEventArgs
+ {
+ BufferList = new List<ArraySegment<byte>>
+ {
+ new ArraySegment<byte>(receiveBuffer1),
+ new ArraySegment<byte>(receiveBuffer2)
+ },
+ RemoteEndPoint = new IPEndPoint(IPAddress.Any, 0)
+ };
+
+ Task<SocketAsyncEventArgs> receiveTask = StartSocketAsyncEventArgsOperation(
+ receiver,
+ receiveEventArgs,
+ static (s, args) => s.ReceiveFromAsync(args));
+ await Task.Yield();
+
+ byte[] payload = new byte[] { 0xA0, 0xB0, 0xC0, 0xD0, 0xE0, 0xF0, 0x01 };
+ int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+ Assert.Equal(payload.Length, sent);
+
+ SocketAsyncEventArgs completedReceive = await receiveTask;
+ Assert.Equal(SocketError.Success, completedReceive.SocketError);
+ Assert.Equal(payload.Length, completedReceive.BytesTransferred);
+ Assert.Equal(SocketFlags.None, completedReceive.SocketFlags);
+
+ IPEndPoint expectedRemoteEndPoint = (IPEndPoint)sender.LocalEndPoint!;
+ IPEndPoint actualRemoteEndPoint = Assert.IsType<IPEndPoint>(completedReceive.RemoteEndPoint);
+ Assert.Equal(expectedRemoteEndPoint, actualRemoteEndPoint);
+
+ byte[] combined = new byte[payload.Length];
+ Buffer.BlockCopy(receiveBuffer1, 0, combined, 0, receiveBuffer1.Length);
+ Buffer.BlockCopy(receiveBuffer2, 0, combined, receiveBuffer1.Length, receiveBuffer2.Length);
+ Assert.Equal(payload, combined);
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
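+ // The gather-send counterpart: SendToAsync with a BufferList must coalesce all segments into a
+ // single datagram, and the receiver should observe the complete payload plus the sender's endpoint.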
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_BufferListSendTo_WritesPayloadAndEndpoint()
+ {
+ await RemoteExecutor.Invoke(static async () =>
+ {
+ using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ byte[] payload = new byte[] { 0x33, 0x44, 0x55, 0x66, 0x77, 0x88 };
+ byte[] receiveBuffer = new byte[payload.Length];
+
+ Task<SocketReceiveFromResult> receiveTask =
+ ToTask(receiver.ReceiveFromAsync(receiveBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0)));
+
+ using var sendEventArgs = new SocketAsyncEventArgs
+ {
+ BufferList = new List<ArraySegment<byte>>
+ {
+ new ArraySegment<byte>(payload, 0, 2),
+ new ArraySegment<byte>(payload, 2, 1),
+ new ArraySegment<byte>(payload, 3, 3)
+ },
+ RemoteEndPoint = receiver.LocalEndPoint
+ };
+
+ SocketAsyncEventArgs completedSend = await StartSocketAsyncEventArgsOperation(
+ sender,
+ sendEventArgs,
+ static (s, args) => s.SendToAsync(args));
+ Assert.Equal(SocketError.Success, completedSend.SocketError);
+ Assert.Equal(payload.Length, completedSend.BytesTransferred);
+
+ SocketReceiveFromResult receiveResult = await receiveTask;
+ Assert.Equal(payload.Length, receiveResult.ReceivedBytes);
+ Assert.Equal(payload, receiveBuffer);
+
+ IPEndPoint expectedRemoteEndPoint = (IPEndPoint)sender.LocalEndPoint!;
+ IPEndPoint actualRemoteEndPoint = Assert.IsType<IPEndPoint>(receiveResult.RemoteEndPoint);
+ Assert.Equal(expectedRemoteEndPoint, actualRemoteEndPoint);
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_AcceptConnect_SocketAsyncEventArgs_Works()
+ {
+ await RemoteExecutor.Invoke(static async () =>
+ {
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(1);
+
+ using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ using var acceptEventArgs = new SocketAsyncEventArgs();
+ Task<SocketAsyncEventArgs> acceptTask = StartSocketAsyncEventArgsOperation(
+ listener,
+ acceptEventArgs,
+ static (s, args) => s.AcceptAsync(args));
+ await Task.Yield();
+
+ using var connectEventArgs = new SocketAsyncEventArgs
+ {
+ RemoteEndPoint = listener.LocalEndPoint
+ };
+
+ SocketAsyncEventArgs completedConnect = await StartSocketAsyncEventArgsOperation(
+ client,
+ connectEventArgs,
+ static (s, args) => s.ConnectAsync(args));
+ Assert.Equal(SocketError.Success, completedConnect.SocketError);
+
+ SocketAsyncEventArgs completedAccept = await acceptTask;
+ Assert.Equal(SocketError.Success, completedAccept.SocketError);
+
+ Socket accepted = Assert.IsType<Socket>(completedAccept.AcceptSocket);
+ completedAccept.AcceptSocket = null;
+ using Socket server = accepted;
+
+ // Validates accept address-length handling: the endpoint must match the connecting socket exactly.
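+ // (For io_uring accept, the kernel writes the peer sockaddr and its length back through pointers
+ // carried in the SQE, so a stale or short length would corrupt the endpoint reconstructed here.)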
+ IPEndPoint expectedRemoteEndPoint = (IPEndPoint)client.LocalEndPoint!;
+ IPEndPoint actualRemoteEndPoint = Assert.IsType<IPEndPoint>(server.RemoteEndPoint);
+ Assert.Equal(expectedRemoteEndPoint, actualRemoteEndPoint);
+
+ byte[] payload = new byte[] { 0x5A };
+ byte[] receiveBuffer = new byte[1];
+ Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ Assert.Equal(payload[0], receiveBuffer[0]);
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_AcceptAsync_CancellationToken_Works()
+ {
+ await RemoteExecutor.Invoke(static async () =>
+ {
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(1);
+
+ using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(15));
+ Task<Socket> acceptTask = ToTask(listener.AcceptAsync(cts.Token));
+
+ using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ await client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!);
+ using Socket server = await acceptTask;
+
+ byte[] payload = new byte[] { 0x4D };
+ byte[] receiveBuffer = new byte[1];
+ Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ Assert.Equal(payload[0], receiveBuffer[0]);
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_AcceptAsync_SocketAsyncEventArgs_PrecreatedAcceptSocket_Works()
+ {
+ await RemoteExecutor.Invoke(static async () =>
+ {
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(4);
+ IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!;
+
+ if (!IsIoUringMultishotAcceptSupported())
+ {
+ return;
+ }
+
+ // Arm multishot accept and leave one connection queued for pre-accept dequeue.
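+ // A multishot accept SQE keeps posting one CQE per incoming connection without rearming, so the
+ // second connection should land in the managed pre-accept queue before the next AcceptAsync call.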
+ Task<Socket> armingAcceptTask = listener.AcceptAsync();
+ Assert.True(
+ await WaitForMultishotAcceptArmedStateAsync(listener, expectedArmed: true),
+ "Expected multishot accept to arm before precreated AcceptSocket test.");
+
+ using Socket firstClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ using Socket secondClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ await Task.WhenAll(firstClient.ConnectAsync(endpoint), secondClient.ConnectAsync(endpoint));
+ using Socket firstServer = await armingAcceptTask;
+
+ DateTime deadline = DateTime.UtcNow + TimeSpan.FromSeconds(5);
+ while (DateTime.UtcNow < deadline && GetListenerMultishotAcceptQueueCount(listener) == 0)
+ {
+ await Task.Delay(25);
+ }
+
+ Assert.True(GetListenerMultishotAcceptQueueCount(listener) > 0, "Expected a queued pre-accepted connection.");
+
+ using Socket precreatedAcceptSocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ using var acceptEventArgs = new SocketAsyncEventArgs
+ {
+ AcceptSocket = precreatedAcceptSocket
+ };
+
+ SocketAsyncEventArgs completedAccept = await StartSocketAsyncEventArgsOperation(
+ listener,
+ acceptEventArgs,
+ static (s, args) => s.AcceptAsync(args));
+ Assert.Equal(SocketError.Success, completedAccept.SocketError);
+ Assert.Same(precreatedAcceptSocket, completedAccept.AcceptSocket);
+
+ byte[] payload = new byte[] { 0x3F };
+ byte[] receiveBuffer = new byte[1];
+ Assert.Equal(1, await secondClient.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await precreatedAcceptSocket.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ Assert.Equal(payload[0], receiveBuffer[0]);
+
+ // Keep ownership of the accepted socket out of event-args disposal.
+ completedAccept.AcceptSocket = null;
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_TcpListener_AcceptTcpClientAsync_Works()
+ {
+ await RemoteExecutor.Invoke(static async () =>
+ {
+ using var listener = new TcpListener(IPAddress.Loopback, 0);
+ listener.Start();
+ IPEndPoint endpoint = (IPEndPoint)listener.LocalEndpoint;
+
+ Task<TcpClient> acceptTask = listener.AcceptTcpClientAsync();
+ using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ await client.ConnectAsync(endpoint);
+
+ using TcpClient acceptedClient = await acceptTask;
+ using Socket server = acceptedClient.Client;
+
+ byte[] payload = new byte[] { 0x2A };
+ byte[] receiveBuffer = new byte[1];
+ Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ Assert.Equal(payload[0], receiveBuffer[0]);
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
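+ // Regression coverage for sockaddr offset/length handling in ConnectAsync (per the test name):
+ // running both address families catches family-specific sizing mistakes in the connect path.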
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_ConnectAsync_OffLenRegression_Ipv4AndIpv6_Works()
+ {
+ await RemoteExecutor.Invoke(static async () =>
+ {
+ await VerifyConnectAsync(AddressFamily.InterNetwork, IPAddress.Loopback);
+
+ if (Socket.OSSupportsIPv6)
+ {
+ await VerifyConnectAsync(AddressFamily.InterNetworkV6, IPAddress.IPv6Loopback);
+ }
+
+ static async Task VerifyConnectAsync(AddressFamily addressFamily, IPAddress loopback)
+ {
+ using Socket listener = new Socket(addressFamily, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(loopback, 0));
+ listener.Listen(1);
+
+ using Socket client = new Socket(addressFamily, SocketType.Stream, ProtocolType.Tcp);
+ Task<Socket> acceptTask = listener.AcceptAsync();
+
+ using var connectEventArgs = new SocketAsyncEventArgs
+ {
+ RemoteEndPoint = listener.LocalEndPoint
+ };
+
+ SocketAsyncEventArgs completedConnect = await StartSocketAsyncEventArgsOperation(
+ client,
+ connectEventArgs,
+ static (s, args) => s.ConnectAsync(args));
+ Assert.Equal(SocketError.Success, completedConnect.SocketError);
+
+ using Socket server = await acceptTask;
+ Assert.Equal(client.LocalEndPoint, server.RemoteEndPoint);
+
+ byte[] payload = new byte[] { 0x3C };
+ byte[] receiveBuffer = new byte[1];
+ Assert.Equal(1, await client.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await server.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ Assert.Equal(payload[0], receiveBuffer[0]);
+ }
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_ConnectAsync_WithInitialData_Success_ServerReceivesPayload()
+ {
+ await RemoteExecutor.Invoke(static async () =>
+ {
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(1);
+
+ using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ using var connectEventArgs = new SocketAsyncEventArgs
+ {
+ RemoteEndPoint = listener.LocalEndPoint
+ };
+
+ byte[] initialPayload = new byte[256];
+ for (int i = 0; i < initialPayload.Length; i++)
+ {
+ initialPayload[i] = unchecked((byte)(0xA0 + i));
+ }
+
+ connectEventArgs.SetBuffer(initialPayload, 0, initialPayload.Length);
+
+ Task<Socket> acceptTask = listener.AcceptAsync();
+ SocketAsyncEventArgs completedConnect = await StartSocketAsyncEventArgsOperation(
+ client,
+ connectEventArgs,
+ static (s, args) => s.ConnectAsync(args));
+ Assert.Equal(SocketError.Success, completedConnect.SocketError);
+
+ using Socket server = await acceptTask;
+ byte[] receivedPayload = new byte[initialPayload.Length];
+ await ReceiveExactlyAsync(server, receivedPayload);
+ Assert.Equal(initialPayload, receivedPayload);
+
+ await AssertConnectedPairRoundTripAsync(client, server, 0xAB);
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_ConnectAsync_WithInitialData_ForcedSendFailure_PropagatesError()
+ {
+ await RemoteExecutor.Invoke(static async () =>
+ {
+ using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ listener.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ listener.Listen(2);
+
+ using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ client.SendBufferSize = 1024;
+ using var connectEventArgs = new SocketAsyncEventArgs
+ {
+ RemoteEndPoint = listener.LocalEndPoint
+ };
+
+ byte[] initialPayload = new byte[8 * 1024 * 1024];
+ for (int i = 0; i < initialPayload.Length; i++)
+ {
+ initialPayload[i] = unchecked((byte)i);
+ }
+ connectEventArgs.SetBuffer(initialPayload, 0, initialPayload.Length);
+
+ Task<Socket> firstAcceptTask = listener.AcceptAsync();
+ Task<SocketAsyncEventArgs> connectTask = StartSocketAsyncEventArgsOperation(
+ client,
+ connectEventArgs,
+ static (s, args) => s.ConnectAsync(args));
+ using (Socket firstServer = await firstAcceptTask)
+ {
+ firstServer.LingerState = new LingerOption(enable: true, seconds: 0);
+ }
+
+ Task completed = await Task.WhenAny(connectTask, Task.Delay(TimeSpan.FromSeconds(30)));
+ Assert.Same(connectTask, completed);
+ SocketAsyncEventArgs completedConnect = await connectTask;
+ Assert.NotEqual(SocketError.Success, completedConnect.SocketError);
+
+ using Socket secondClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
+ Task<Socket> secondAcceptTask = listener.AcceptAsync();
+ await secondClient.ConnectAsync((IPEndPoint)listener.LocalEndPoint!);
+ using Socket secondServer = await secondAcceptTask;
+
+ byte[] payload = new byte[] { 0x9A };
+ byte[] receiveBuffer = new byte[1];
+ Assert.Equal(1, await secondClient.SendAsync(payload, SocketFlags.None));
+ Assert.Equal(1, await secondServer.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ Assert.Equal(payload[0], receiveBuffer[0]);
+ }, CreateSocketEngineOptions(forceEcanceledOnceMask: "send")).DisposeAsync();
+ }
+
+ [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ [InlineData(false)]
+ [InlineData(true)]
+ public static async Task IoUringCompletionMode_ReceiveMessageFrom_PacketInformation_Works(bool useIpv6)
+ {
+ await RemoteExecutor.Invoke(
+ static arg => RunReceiveMessageFromPacketInformationRoundTripAsync(useIpv6: bool.Parse(arg)),
+ useIpv6.ToString(),
+ CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ [InlineData(false)]
+ [InlineData(true)]
+ public static async Task IoUringCompletionMode_ReceiveMessageFrom_BufferList_PacketInformation_Works(bool useIpv6)
+ {
+ await RemoteExecutor.Invoke(static async arg =>
+ {
+ bool useIpv6 = bool.Parse(arg);
+ if (useIpv6 && !Socket.OSSupportsIPv6)
+ {
+ return;
+ }
+
+ AddressFamily family = useIpv6 ? AddressFamily.InterNetworkV6 : AddressFamily.InterNetwork;
+ IPAddress loopback = useIpv6 ? IPAddress.IPv6Loopback : IPAddress.Loopback;
+ IPAddress anyAddress = useIpv6 ? IPAddress.IPv6Any : IPAddress.Any;
+ SocketOptionLevel packetInfoLevel = useIpv6 ? SocketOptionLevel.IPv6 : SocketOptionLevel.IP;
+
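+ // PacketInformation maps to IP_PKTINFO / IPV6_RECVPKTINFO on Linux, so recvmsg reports the
+ // destination address via a control message that must survive the buffer-list receive path.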
+ using Socket receiver = new Socket(family, SocketType.Dgram, ProtocolType.Udp);
+ using Socket sender = new Socket(family, SocketType.Dgram, ProtocolType.Udp);
+
+ receiver.SetSocketOption(packetInfoLevel, SocketOptionName.PacketInformation, true);
+ receiver.Bind(new IPEndPoint(loopback, 0));
+ sender.Bind(new IPEndPoint(loopback, 0));
+
+ byte[] payload = new byte[] { 0x70, 0x71, 0x72, 0x73, 0x74 };
+ byte[] receiveBuffer = new byte[payload.Length];
+
+ using var receiveEventArgs = new SocketAsyncEventArgs
+ {
+ BufferList = new List<ArraySegment<byte>>
+ {
+ new ArraySegment<byte>(receiveBuffer, 0, 2),
+ new ArraySegment<byte>(receiveBuffer, 2, 3)
+ },
+ RemoteEndPoint = new IPEndPoint(anyAddress, 0)
+ };
+
+ Task<SocketAsyncEventArgs> receiveTask = StartReceiveMessageFromAsync(receiver, receiveEventArgs);
+ await Task.Yield();
+
+ int sent = await sender.SendToAsync(payload, SocketFlags.None, receiver.LocalEndPoint!);
+ Assert.Equal(payload.Length, sent);
+
+ SocketAsyncEventArgs completedReceive = await receiveTask;
+ Assert.Equal(SocketError.Success, completedReceive.SocketError);
+ Assert.Equal(payload.Length, completedReceive.BytesTransferred);
+ Assert.Equal(payload, receiveBuffer);
+ Assert.Equal(sender.LocalEndPoint, completedReceive.RemoteEndPoint);
+ Assert.Equal(((IPEndPoint)sender.LocalEndPoint!).Address, completedReceive.ReceiveMessageFromPacketInfo.Address);
+ }, useIpv6.ToString(), CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ [InlineData(false)]
+ [InlineData(true)]
+ public static async Task IoUringCompletionMode_SendAsync_PartialSendResubmission_CompletesFully(bool useBufferListSend)
+ {
+ await RemoteExecutor.Invoke(
+ static (arg) => RunLargeSendWithBackpressureAsync(useBufferListSend: bool.Parse(arg)),
+ useBufferListSend.ToString(), CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ [InlineData(false)]
+ [InlineData(true)]
+ public static async Task IoUringCompletionMode_ForcedReceiveResultOnce_RecoversAndNextOperationStillWorks(bool forceEcanceled)
+ {
+ await RemoteExecutor.Invoke(
+ static arg => RunForcedReceiveScenarioAsync(forceEcanceled: bool.Parse(arg)),
+ forceEcanceled.ToString(),
+ CreateSocketEngineOptions(
+ forceEagainOnceMask: forceEcanceled ? null : "recv",
+ forceEcanceledOnceMask: forceEcanceled ? "recv" : null)).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_ForcedEagain_Recv_RequeuesViaCompletionPath()
+ {
+ await RemoteExecutor.Invoke(
+ static () =>
+ {
+ long requeueFailureBefore = GetIoUringCompletionRequeueFailureCounterValue();
+ long queuedRetryBefore = GetIoUringPendingRetryQueuedToPrepareQueueCount();
+
+ return Task.Run(async () =>
+ {
+ await RunForcedReceiveScenarioAsync(forceEcanceled: false);
+
+ Assert.Equal(requeueFailureBefore, GetIoUringCompletionRequeueFailureCounterValue());
+ Assert.Equal(queuedRetryBefore, GetIoUringPendingRetryQueuedToPrepareQueueCount());
+ });
+ },
+ CreateSocketEngineOptions(
+ forceEagainOnceMask: "recv")).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_ZeroByteReceive_OnPeerClose_ReturnsZeroOrCloseError()
+ {
+ await RemoteExecutor.Invoke(static () =>
+ {
+ return Task.Run(async () =>
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ Task<int> zeroByteReceive = ToTask(server.ReceiveAsync(Memory<byte>.Empty, SocketFlags.None));
+ await Task.Yield();
+
+ client.Shutdown(SocketShutdown.Both);
+ client.Dispose();
+
+ Task completed = await Task.WhenAny(zeroByteReceive, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.Same(zeroByteReceive, completed);
+
+ Exception? ex = await Record.ExceptionAsync(async () => await zeroByteReceive);
+ if (ex is null)
+ {
+ Assert.Equal(0, await zeroByteReceive);
+ }
+ else
+ {
+ SocketException socketException = Assert.IsType<SocketException>(ex);
+ Assert.True(
+ socketException.SocketErrorCode == SocketError.ConnectionReset ||
+ socketException.SocketErrorCode == SocketError.OperationAborted ||
+ socketException.SocketErrorCode == SocketError.Interrupted,
+ $"Unexpected socket error while waiting for peer-close zero-byte receive completion: {socketException.SocketErrorCode}");
+ }
+ });
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_SendAsync_PeerClose_DoesNotReturnZeroByteSuccess()
+ {
+ await RemoteExecutor.Invoke(static () =>
+ {
+ return Task.Run(async () =>
+ {
+ var trio = await CreateConnectedTcpSocketTrioAsync();
+ using Socket _ = trio.Listener;
+ using Socket client = trio.Client;
+ using Socket server = trio.Server;
+
+ byte[] payload = new byte[64 * 1024];
+ Task<int> sendTask = ToTask(server.SendAsync(payload, SocketFlags.None));
+ await Task.Yield();
+
+ client.Shutdown(SocketShutdown.Both);
+ client.Dispose();
+
+ Task completed = await Task.WhenAny(sendTask, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.Same(sendTask, completed);
+
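+ // A non-empty send racing a peer close may legitimately succeed or fail, but success with zero
+ // bytes would mean silently dropped data; both outcomes are validated below.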
+ Exception? ex = await Record.ExceptionAsync(async () => await sendTask);
+ if (ex is null)
+ {
+ int sent = await sendTask;
+ Assert.True(sent > 0, "Non-empty send must not complete with success and zero bytes transferred.");
+ return;
+ }
+
+ SocketException socketException = Assert.IsType<SocketException>(ex);
+ Assert.True(
+ socketException.SocketErrorCode == SocketError.ConnectionReset ||
+ socketException.SocketErrorCode == SocketError.ConnectionAborted ||
+ socketException.SocketErrorCode == SocketError.Shutdown ||
+ socketException.SocketErrorCode == SocketError.OperationAborted ||
+ socketException.SocketErrorCode == SocketError.Interrupted,
+ $"Unexpected socket error for peer-close send completion: {socketException.SocketErrorCode}");
+ });
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_ReceiveFrom_TruncatedPayload_ReturnsTruncatedLengthOrMessageSizeError()
+ {
+ await RemoteExecutor.Invoke(static () =>
+ {
+ return Task.Run(async () =>
+ {
+ using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ byte[] sendPayload = new byte[] { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06 };
+ EndPoint senderEndpoint = sender.LocalEndPoint!;
+ byte[] receiveBuffer = new byte[2];
+
+ Task<SocketReceiveFromResult> receiveTask =
+ ToTask(receiver.ReceiveFromAsync(receiveBuffer, SocketFlags.None, new IPEndPoint(IPAddress.Any, 0)));
+ await Task.Yield();
+
+ int sent = await sender.SendToAsync(sendPayload, SocketFlags.None, receiver.LocalEndPoint!);
+ Assert.Equal(sendPayload.Length, sent);
+
+ Task completed = await Task.WhenAny(receiveTask, Task.Delay(TimeSpan.FromSeconds(15)));
+ Assert.Same(receiveTask, completed);
+
+ Exception? ex = await Record.ExceptionAsync(async () => await receiveTask);
+ if (ex is not null)
+ {
+ SocketException socketException = Assert.IsType<SocketException>(ex);
+ Assert.Equal(SocketError.MessageSize, socketException.SocketErrorCode);
+ return;
+ }
+
+ SocketReceiveFromResult receiveResult = await receiveTask;
+ Assert.True(receiveResult.ReceivedBytes > 0 && receiveResult.ReceivedBytes <= receiveBuffer.Length);
+ for (int i = 0; i < receiveResult.ReceivedBytes; i++)
+ {
+ Assert.Equal(sendPayload[i], receiveBuffer[i]);
+ }
+ Assert.Equal(senderEndpoint, receiveResult.RemoteEndPoint);
+ });
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_DatagramReceive_OversizedPayload_DoesNotArmPersistentMultishotRecv()
+ {
+ await RemoteExecutor.Invoke(static () =>
+ {
+ return Task.Run(async () =>
+ {
+ using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+
+ receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ receiver.Connect(sender.LocalEndPoint!);
+ sender.Connect(receiver.LocalEndPoint!);
+
+ // Multishot recv uses IORING_OP_RECV and cannot observe MSG_TRUNC; datagram sockets
+ // must stay on one-shot receive paths where truncation semantics remain explicit.
+ Assert.False(
+ IsPersistentMultishotRecvArmed(receiver),
+ "Datagram receive should not arm persistent multishot state.");
+
+ byte[] receiveBuffer = new byte[2];
+ byte[] sendPayload = new byte[] { 0x31, 0x32, 0x33, 0x34, 0x35, 0x36 };
+
+ Task<int> receiveTask = ToTask(receiver.ReceiveAsync(receiveBuffer, SocketFlags.None));
+ await Task.Yield();
+
+ int sent = await sender.SendAsync(sendPayload, SocketFlags.None);
+ Assert.Equal(sendPayload.Length, sent);
+
+ Exception? ex = await Record.ExceptionAsync(async () => await receiveTask);
+ if (ex is null)
+ {
+ int received = await receiveTask;
+ Assert.True(received > 0 && received <= receiveBuffer.Length);
+ }
+ else
+ {
+ SocketException socketException = Assert.IsType<SocketException>(ex);
+ Assert.Equal(SocketError.MessageSize, socketException.SocketErrorCode);
+ }
+
+ Assert.False(
+ IsPersistentMultishotRecvArmed(receiver),
+ "Datagram receive should remain outside persistent multishot state.");
+ });
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_ReceiveFrom_OversizedDatagram_ZeroLengthBuffer_CompletesOrMessageSize()
+ {
+ await RemoteExecutor.Invoke(static () =>
+ {
+ return Task.Run(async () =>
+ {
+ using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+
+ receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+ sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ byte[] sendPayload = new byte[60 * 1024];
+ EndPoint senderEndpoint = sender.LocalEndPoint!;
+
+ Task<SocketReceiveFromResult> receiveTask =
+ ToTask(receiver.ReceiveFromAsync(Array.Empty<byte>(), SocketFlags.None, new IPEndPoint(IPAddress.Any, 0)));
+ await Task.Yield();
+
+ int sent = await sender.SendToAsync(sendPayload, SocketFlags.None, receiver.LocalEndPoint!);
+ Assert.Equal(sendPayload.Length, sent);
+
+ Exception? ex = await Record.ExceptionAsync(async () => await receiveTask);
+ if (ex is not null)
+ {
+ SocketException socketException = Assert.IsType<SocketException>(ex);
+ Assert.Equal(SocketError.MessageSize, socketException.SocketErrorCode);
+ return;
+ }
+
+ SocketReceiveFromResult receiveResult = await receiveTask;
+ Assert.Equal(0, receiveResult.ReceivedBytes);
+ Assert.Equal(senderEndpoint, receiveResult.RemoteEndPoint);
+ });
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
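+ // 192.0.2.1 below is in TEST-NET-1 (RFC 5737), reserved for documentation and not globally routed,
+ // so the datagram either vanishes (fire-and-forget success) or fails fast depending on local routing.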
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_SendTo_UnreachableEndpoint_CompletesOrFailsWithExpectedError()
+ {
+ await RemoteExecutor.Invoke(static () =>
+ {
+ return Task.Run(async () =>
+ {
+ using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp);
+ sender.Bind(new IPEndPoint(IPAddress.Loopback, 0));
+
+ byte[] payload = new byte[] { 0xAA };
+ EndPoint destination = new IPEndPoint(IPAddress.Parse("192.0.2.1"), 9);
+
+ try
+ {
+ int sent = await sender.SendToAsync(payload, SocketFlags.None, destination);
+ Assert.Equal(payload.Length, sent);
+ // UDP sendto may succeed on some Linux/network configurations even for TEST-NET destinations.
+ return;
+ }
+ catch (Exception ex)
+ {
+ SocketException socketException = Assert.IsType<SocketException>(ex);
+ Assert.True(
+ socketException.SocketErrorCode == SocketError.NetworkUnreachable ||
+ socketException.SocketErrorCode == SocketError.HostUnreachable ||
+ socketException.SocketErrorCode == SocketError.HostNotFound ||
+ socketException.SocketErrorCode == SocketError.NetworkDown ||
+ socketException.SocketErrorCode == SocketError.AccessDenied ||
+ socketException.SocketErrorCode == SocketError.InvalidArgument,
+ $"Unexpected socket error for unreachable send: {socketException.SocketErrorCode}");
+ }
+ });
+ }, CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_AsyncCancelRequestCqe_IsolatedFromManagedOperationDispatch()
+ {
+ await RemoteExecutor.Invoke(static () => RunAsyncCancelRequestIsolationScenarioAsync(64), CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // generation-dispatch behavior is Linux-specific.
+ public static async Task IoUringCompletionMode_CompletionDispatch_StaleWrappedGeneration_IsDiscarded()
+ {
+ await RemoteExecutor.Invoke(
+ static () => RunGenerationWrapAroundDispatchScenarioAsync(),
+ CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring cancel queue/wakeup path is Linux-specific.
+ public static async Task IoUringCompletionMode_CancelQueueFull_WakesBeforeOverflow()
+ {
+ await RemoteExecutor.Invoke(
+ static () => RunCancellationQueueWakeBeforeOverflowScenarioAsync(),
+ CreateSocketEngineOptions(prepareQueueCapacity: 1)).DisposeAsync();
+ }
+
+ [OuterLoop]
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_SlotGenerationTransitions_Arm64Stress_NoHangsOrLeaks()
+ {
+ await RemoteExecutor.Invoke(
+ static () => RunTrackedOperationGenerationTransitionStressScenarioAsync(connectionCount: 8, iterationsPerConnection: 1024),
+ CreateSocketEngineOptions()).DisposeAsync();
+ }
+
+ [OuterLoop]
+ [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
+ [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific.
+ public static async Task IoUringCompletionMode_CancellationSubmitContention_ProgressesUnderLoad() + { + await RemoteExecutor.Invoke( + static () => RunCancellationSubmitContentionScenarioAsync(connectionCount: 8, cancellationsPerConnection: 96), + CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_CompletionCancellationRace_CompletesExactlyOnce() + { + await RemoteExecutor.Invoke(static () => RunCompletionCancellationRaceAsync(128), CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_RapidCancelWhileEnqueued_DoesNotCorruptState() + { + await RemoteExecutor.Invoke(static async () => + { + var trio = await CreateConnectedTcpSocketTrioAsync(); + using Socket _ = trio.Listener; + using Socket client = trio.Client; + using Socket server = trio.Server; + + const int WorkerCount = 8; + const int IterationsPerWorker = 128; + var tasks = new Task[WorkerCount]; + + for (int worker = 0; worker < WorkerCount; worker++) + { + tasks[worker] = Task.Run(async () => + { + byte[] receiveBuffer = new byte[1]; + for (int i = 0; i < IterationsPerWorker; i++) + { + using var cts = new CancellationTokenSource(); + var receiveTask = server.ReceiveAsync(receiveBuffer.AsMemory(), SocketFlags.None, cts.Token); + cts.Cancel(); + + Exception? ex = await Record.ExceptionAsync(async () => await receiveTask); + AssertCanceledOrInterrupted(ex); + } + }); + } + + await Task.WhenAll(tasks); + + // Ensure socket state still allows normal async flow after rapid cancellation churn. + byte[] payload = new byte[] { 0xA5 }; + int sent = await client.SendAsync(payload, SocketFlags.None); + Assert.Equal(1, sent); + int received = await server.ReceiveAsync(payload, SocketFlags.None); + Assert.Equal(1, received); + Assert.Equal(0xA5, payload[0]); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_CloseDisposeStress_DoesNotHang() + { + await RemoteExecutor.Invoke(static async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(32); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + for (int i = 0; i < 64; i++) + { + var pair = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket client = pair.Client; + using Socket server = pair.Server; + + Task[] receives = new Task[16]; + for (int r = 0; r < receives.Length; r++) + { + receives[r] = ToTask(server.ReceiveAsync(new byte[1], SocketFlags.None)); + } + + client.Dispose(); + server.Dispose(); + + for (int r = 0; r < receives.Length; r++) + { + Exception? 
ex = await Record.ExceptionAsync(async () => await receives[r]); + if (ex is SocketException socketException) + { + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error: {socketException.SocketErrorCode}"); + } + else if (ex is not ObjectDisposedException and not null) + { + throw ex; + } + } + } + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringOptIn_ConcurrentCloseWithPendingReceive_DoesNotHang() + { + await RemoteExecutor.Invoke(static async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(16); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + byte[] receiveBuffer = new byte[1]; + for (int i = 0; i < 64; i++) + { + var pair = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket client = pair.Client; + using Socket server = pair.Server; + + var pendingReceive = server.ReceiveAsync(receiveBuffer, SocketFlags.None); + + // Force teardown while an async receive is pending. + client.Dispose(); + + Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive); + if (ex is SocketException socketException) + { + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error: {socketException.SocketErrorCode}"); + } + else if (ex is not null) + { + throw ex; + } + } + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringOptIn_ConcurrentRegistrationChurn_DoesNotHang() + { + await RemoteExecutor.Invoke(static async () => + { + const int WorkerCount = 8; + const int IterationsPerWorker = 64; + + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(WorkerCount * 2); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + var workers = new Task[WorkerCount]; + for (int worker = 0; worker < WorkerCount; worker++) + { + workers[worker] = Task.Run(async () => + { + byte[] sendBuffer = new byte[] { 0x5A }; + byte[] receiveBuffer = new byte[1]; + + for (int i = 0; i < IterationsPerWorker; i++) + { + var pair = await AcceptConnectedTcpPairAsync(listener, endpoint); + using Socket client = pair.Client; + using Socket server = pair.Server; + + var pendingReceive = server.ReceiveAsync(receiveBuffer, SocketFlags.None); + await Task.Yield(); + + if ((i & 1) == 0) + { + int sent = await client.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, sent); + } + else + { + client.Dispose(); + } + + Exception? 
ex = await Record.ExceptionAsync(async () => await pendingReceive); + if (ex is SocketException socketException) + { + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error: {socketException.SocketErrorCode}"); + } + else if (ex is not ObjectDisposedException and not null) + { + throw ex; + } + } + }); + } + + await Task.WhenAll(workers); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringOptIn_RepeatedRunStabilityGate() + { + await RemoteExecutor.Invoke(static async () => + { + const int Iterations = 50; + for (int i = 0; i < Iterations; i++) + { + await RunTcpRoundTripAsync(8); + } + }, CreateSocketEngineOptions()).DisposeAsync(); + } + } +} diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUringTestInfrastructure/SocketAsyncEngine.IoUringTestAccessors.Linux.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUringTestInfrastructure/SocketAsyncEngine.IoUringTestAccessors.Linux.cs new file mode 100644 index 00000000000000..168bd099a7677e --- /dev/null +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUringTestInfrastructure/SocketAsyncEngine.IoUringTestAccessors.Linux.cs @@ -0,0 +1,1022 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Buffers; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Reflection; +using System.Runtime.InteropServices; +using System.Threading; + +namespace System.Net.Sockets +{ + internal sealed unsafe partial class SocketAsyncEngine + { + internal readonly struct IoUringNonPinnableFallbackPublicationState + { + internal IoUringNonPinnableFallbackPublicationState(long publishedCount, int publishingGate, long fallbackCount) + { + PublishedCount = publishedCount; + PublishingGate = publishingGate; + FallbackCount = fallbackCount; + } + + internal long PublishedCount { get; } + internal int PublishingGate { get; } + internal long FallbackCount { get; } + } + + internal readonly struct IoUringNativeDiagnosticsSnapshotForTest + { + internal IoUringNativeDiagnosticsSnapshotForTest( + bool hasIoUringPort, + ulong asyncCancelRequestCqeCount, + ulong asyncCancelRequestCqeEnoentCount, + ulong asyncCancelRequestCqeEalreadyCount, + ulong asyncCancelRequestCqeOtherCount, + ulong socketEventBufferFullCount, + ulong unsupportedOpcodePrepareCount, + ulong cqOverflowCount) + { + HasIoUringPort = hasIoUringPort; + AsyncCancelRequestCqeCount = asyncCancelRequestCqeCount; + AsyncCancelRequestCqeEnoentCount = asyncCancelRequestCqeEnoentCount; + AsyncCancelRequestCqeEalreadyCount = asyncCancelRequestCqeEalreadyCount; + AsyncCancelRequestCqeOtherCount = asyncCancelRequestCqeOtherCount; + SocketEventBufferFullCount = socketEventBufferFullCount; + UnsupportedOpcodePrepareCount = unsupportedOpcodePrepareCount; + CqOverflowCount = cqOverflowCount; + } + + internal bool HasIoUringPort { get; } + internal ulong AsyncCancelRequestCqeCount { get; } + internal ulong AsyncCancelRequestCqeEnoentCount { get; } + internal ulong AsyncCancelRequestCqeEalreadyCount { get; } + internal ulong 
AsyncCancelRequestCqeOtherCount { get; } + internal ulong SocketEventBufferFullCount { get; } + internal ulong UnsupportedOpcodePrepareCount { get; } + internal ulong CqOverflowCount { get; } + } + + internal readonly struct IoUringProvidedBufferSnapshotForTest + { + internal IoUringProvidedBufferSnapshotForTest( + bool hasIoUringPort, + bool supportsProvidedBufferRings, + bool hasProvidedBufferRing, + bool hasRegisteredBuffers, + bool adaptiveBufferSizingEnabled, + int availableCount, + int inUseCount, + int totalBufferCount, + int bufferSize, + int recommendedBufferSize, + long recycledCount, + long allocationFailureCount) + { + HasIoUringPort = hasIoUringPort; + SupportsProvidedBufferRings = supportsProvidedBufferRings; + HasProvidedBufferRing = hasProvidedBufferRing; + HasRegisteredBuffers = hasRegisteredBuffers; + AdaptiveBufferSizingEnabled = adaptiveBufferSizingEnabled; + AvailableCount = availableCount; + InUseCount = inUseCount; + TotalBufferCount = totalBufferCount; + BufferSize = bufferSize; + RecommendedBufferSize = recommendedBufferSize; + RecycledCount = recycledCount; + AllocationFailureCount = allocationFailureCount; + } + + internal bool HasIoUringPort { get; } + internal bool SupportsProvidedBufferRings { get; } + internal bool HasProvidedBufferRing { get; } + internal bool HasRegisteredBuffers { get; } + internal bool AdaptiveBufferSizingEnabled { get; } + internal int AvailableCount { get; } + internal int InUseCount { get; } + internal int TotalBufferCount { get; } + internal int BufferSize { get; } + internal int RecommendedBufferSize { get; } + internal long RecycledCount { get; } + internal long AllocationFailureCount { get; } + } + + internal readonly struct IoUringZeroCopySendSnapshotForTest + { + internal IoUringZeroCopySendSnapshotForTest( + bool hasIoUringPort, + bool supportsSendZc, + bool supportsSendMsgZc, + bool zeroCopySendEnabled) + { + HasIoUringPort = hasIoUringPort; + SupportsSendZc = supportsSendZc; + SupportsSendMsgZc = supportsSendMsgZc; + ZeroCopySendEnabled = zeroCopySendEnabled; + } + + internal bool HasIoUringPort { get; } + internal bool SupportsSendZc { get; } + internal bool SupportsSendMsgZc { get; } + internal bool ZeroCopySendEnabled { get; } + } + + internal readonly struct IoUringFixedRecvSnapshotForTest + { + internal IoUringFixedRecvSnapshotForTest( + bool hasIoUringPort, + bool supportsReadFixed, + bool hasRegisteredBuffers) + { + HasIoUringPort = hasIoUringPort; + SupportsReadFixed = supportsReadFixed; + HasRegisteredBuffers = hasRegisteredBuffers; + } + + internal bool HasIoUringPort { get; } + internal bool SupportsReadFixed { get; } + internal bool HasRegisteredBuffers { get; } + } + + internal readonly struct IoUringSqPollSnapshotForTest + { + internal IoUringSqPollSnapshotForTest(bool hasIoUringPort, bool sqPollEnabled, bool deferTaskrunEnabled) + { + HasIoUringPort = hasIoUringPort; + SqPollEnabled = sqPollEnabled; + DeferTaskrunEnabled = deferTaskrunEnabled; + } + + internal bool HasIoUringPort { get; } + internal bool SqPollEnabled { get; } + internal bool DeferTaskrunEnabled { get; } + } + + internal readonly struct IoUringZeroCopyPinHoldSnapshotForTest + { + internal IoUringZeroCopyPinHoldSnapshotForTest( + bool hasIoUringPort, + int activePinHolds, + int pendingNotificationCount) + { + HasIoUringPort = hasIoUringPort; + ActivePinHolds = activePinHolds; + PendingNotificationCount = pendingNotificationCount; + } + + internal bool HasIoUringPort { get; } + internal int ActivePinHolds { get; } + internal int 
PendingNotificationCount { get; } + } + + internal readonly struct IoUringNativeMsghdrLayoutSnapshotForTest + { + internal IoUringNativeMsghdrLayoutSnapshotForTest( + int size, + int msgNameOffset, + int msgNameLengthOffset, + int msgIovOffset, + int msgIovLengthOffset, + int msgControlOffset, + int msgControlLengthOffset, + int msgFlagsOffset) + { + Size = size; + MsgNameOffset = msgNameOffset; + MsgNameLengthOffset = msgNameLengthOffset; + MsgIovOffset = msgIovOffset; + MsgIovLengthOffset = msgIovLengthOffset; + MsgControlOffset = msgControlOffset; + MsgControlLengthOffset = msgControlLengthOffset; + MsgFlagsOffset = msgFlagsOffset; + } + + internal int Size { get; } + internal int MsgNameOffset { get; } + internal int MsgNameLengthOffset { get; } + internal int MsgIovOffset { get; } + internal int MsgIovLengthOffset { get; } + internal int MsgControlOffset { get; } + internal int MsgControlLengthOffset { get; } + internal int MsgFlagsOffset { get; } + } + + internal readonly struct IoUringCompletionSlotLayoutSnapshotForTest + { + internal IoUringCompletionSlotLayoutSnapshotForTest( + int size, + int generationOffset, + int freeListNextOffset, + int packedStateOffset, + int fixedRecvBufferIdOffset, + int testForcedResultOffset) + { + Size = size; + GenerationOffset = generationOffset; + FreeListNextOffset = freeListNextOffset; + PackedStateOffset = packedStateOffset; + FixedRecvBufferIdOffset = fixedRecvBufferIdOffset; + TestForcedResultOffset = testForcedResultOffset; + } + + internal int Size { get; } + internal int GenerationOffset { get; } + internal int FreeListNextOffset { get; } + internal int PackedStateOffset { get; } + internal int FixedRecvBufferIdOffset { get; } + internal int TestForcedResultOffset { get; } + } + + internal static IoUringNonPinnableFallbackPublicationState GetIoUringNonPinnableFallbackPublicationStateForTest() => + new IoUringNonPinnableFallbackPublicationState( + GetPrimaryIoUringEnginePublishedNonPinnableFallbackCountForTest(), + publishingGate: 0, + SocketAsyncContext.GetIoUringNonPinnablePrepareFallbackCount()); + + internal static void SetIoUringNonPinnableFallbackPublicationStateForTest(IoUringNonPinnableFallbackPublicationState state) + { +#if DEBUG + // Test-only publication-state control keeps concurrent publisher tests deterministic. + SetPrimaryIoUringEnginePublishedNonPinnableFallbackCountForTest(state.PublishedCount); + SocketAsyncContext.SetIoUringNonPinnablePrepareFallbackCountForTest(state.FallbackCount); +#else + _ = state; +#endif + } + + internal static long GetIoUringNonPinnablePrepareFallbackDeltaForTest() + { + if (TryGetFirstIoUringEngineForTest(out SocketAsyncEngine? ioUringEngine) && + ioUringEngine is not null && + ioUringEngine.TryPublishIoUringNonPinnablePrepareFallbackDelta(out long delta)) + { + return delta; + } + + return 0; + } + + private static long GetPrimaryIoUringEnginePublishedNonPinnableFallbackCountForTest() + { + if (TryGetFirstIoUringEngineForTest(out SocketAsyncEngine? ioUringEngine) && + ioUringEngine is not null) + { + return Interlocked.Read(ref ioUringEngine._ioUringPublishedNonPinnablePrepareFallbackCount); + } + + return 0; + } + + private static void SetPrimaryIoUringEnginePublishedNonPinnableFallbackCountForTest(long value) + { + if (TryGetFirstIoUringEngineForTest(out SocketAsyncEngine? 
ioUringEngine) &&
+ ioUringEngine is not null)
+ {
+ Interlocked.Exchange(ref ioUringEngine._ioUringPublishedNonPinnablePrepareFallbackCount, value);
+ }
+ }
+
+ internal static bool IsIoUringEnabledForTest() => IsIoUringEnabled();
+ internal static bool IsSqPollRequestedForTest() => IsSqPollRequested();
+ internal static bool IsIoUringDirectSqeDisabledForTest() => IsIoUringDirectSqeDisabled();
+ internal static bool IsZeroCopySendOptedInForTest() => IsZeroCopySendOptedIn();
+ internal static bool IsIoUringRegisterBuffersEnabledForTest() => IsIoUringRegisterBuffersEnabled();
+ internal static bool IsNativeMsghdrLayoutSupportedForIoUringForTest(int pointerSize, int nativeMsghdrSize) =>
+ IsNativeMsghdrLayoutSupportedForIoUring(pointerSize, nativeMsghdrSize);
+ internal static long GetIoUringPendingRetryQueuedToPrepareQueueCountForTest() => GetIoUringPendingRetryQueuedToPrepareQueueCount();
+ internal static int GetIoUringCancellationQueueCapacityForTest() => s_ioUringCancellationQueueCapacity;
+
+ internal static SocketAsyncEngine[] GetActiveIoUringEnginesForTest()
+ {
+ var engines = new List<SocketAsyncEngine>(s_engines.Length);
+ foreach (SocketAsyncEngine engine in s_engines)
+ {
+ if (engine.IsIoUringCompletionModeEnabled)
+ {
+ engines.Add(engine);
+ }
+ }
+
+ return engines.ToArray();
+ }
+
+ internal bool SupportsMultishotRecvForTest => _supportsMultishotRecv;
+ internal bool SupportsMultishotAcceptForTest
+ {
+ get => _supportsMultishotAccept;
+ set
+ {
+#if DEBUG
+ _supportsMultishotAccept = value;
+#else
+ _ = value;
+#endif
+ }
+ }
+
+ internal bool SupportsProvidedBufferRingsForTest => _ioUringCapabilities.SupportsProvidedBufferRings;
+ internal bool HasProvidedBufferRingForTest => _ioUringProvidedBufferRing is not null;
+ internal bool IoUringBuffersRegisteredForTest => _ioUringCapabilities.HasRegisteredBuffers;
+ internal bool AdaptiveBufferSizingEnabledForTest => _adaptiveBufferSizingEnabled;
+ internal bool SupportsOpSendZcForTest
+ {
+ get => _supportsOpSendZc;
+ set
+ {
+#if DEBUG
+ _supportsOpSendZc = value;
+#else
+ _ = value;
+#endif
+ }
+ }
+
+ internal bool SupportsOpSendMsgZcForTest => _supportsOpSendMsgZc;
+ internal bool ZeroCopySendEnabledForTest
+ {
+ get => _zeroCopySendEnabled;
+ set
+ {
+#if DEBUG
+ _zeroCopySendEnabled = value;
+#else
+ _ = value;
+#endif
+ }
+ }
+
+ internal bool SupportsOpReadFixedForTest => _supportsOpReadFixed;
+ internal bool SqPollEnabledForTest => _sqPollEnabled;
+ internal IntPtr PortForTest => _port;
+ internal long IoUringCancelQueueLengthForTest
+ {
+ get => Interlocked.Read(ref _ioUringCancelQueueLength);
+ set
+ {
+#if DEBUG
+ Interlocked.Exchange(ref _ioUringCancelQueueLength, value);
+#else
+ _ = value;
+#endif
+ }
+ }
+
+ internal long IoUringCancelQueueOverflowCountForTest => Interlocked.Read(ref _ioUringCancelQueueOverflowCount);
+ internal long IoUringCancelQueueWakeRetryCountForTest
+ {
+ get
+ {
+#if DEBUG
+ return Interlocked.Read(ref _testCancelQueueWakeRetryCount);
+#else
+ _ = _ioUringInitialized;
+ return 0;
+#endif
+ }
+ }
+ internal int IoUringWakeupRequestedForTest
+ {
+ get => Volatile.Read(ref _ioUringWakeupRequested);
+ set
+ {
+#if DEBUG
+ Volatile.Write(ref _ioUringWakeupRequested, value);
+#else
+ _ = value;
+#endif
+ }
+ }
+
+ internal bool TryEnqueueIoUringCancellationForTest(ulong userData) => TryEnqueueIoUringCancellation(userData);
+ internal Interop.Error SubmitIoUringOperationsNormalizedForTest() => SubmitIoUringOperationsNormalized();
+
+ internal bool SqNeedWakeupForTest() => SqNeedWakeup();
+
+ internal unsafe uint*
GetManagedSqFlagsPointerForTest() => _managedSqFlagsPtr; + + internal static bool IsIoUringMultishotRecvSupportedForTest() + { + foreach (SocketAsyncEngine engine in s_engines) + { + if (engine.IsIoUringCompletionModeEnabled && engine._supportsMultishotRecv) + { + return true; + } + } + + return false; + } + + internal static bool IsIoUringMultishotAcceptSupportedForTest() + { + foreach (SocketAsyncEngine engine in s_engines) + { + if (engine.IsIoUringCompletionModeEnabled && engine._supportsMultishotAccept) + { + return true; + } + } + + return false; + } + + internal static IoUringNativeDiagnosticsSnapshotForTest GetIoUringNativeDiagnosticsSnapshotForTest() + { + bool hasIoUringPort = false; + foreach (SocketAsyncEngine engine in s_engines) + { + if (engine.IsIoUringCompletionModeEnabled) + { + hasIoUringPort = true; + break; + } + } + + if (!hasIoUringPort) + { + return new IoUringNativeDiagnosticsSnapshotForTest( + hasIoUringPort: false, + asyncCancelRequestCqeCount: 0, + asyncCancelRequestCqeEnoentCount: 0, + asyncCancelRequestCqeEalreadyCount: 0, + asyncCancelRequestCqeOtherCount: 0, + socketEventBufferFullCount: 0, + unsupportedOpcodePrepareCount: 0, + cqOverflowCount: 0); + } + + // Native io_uring diagnostics are not exposed through managed interop in this test build. + // Use telemetry-backed counters for the managed subset and zero the native-only breakdowns. + return new IoUringNativeDiagnosticsSnapshotForTest( + hasIoUringPort: true, + asyncCancelRequestCqeCount: SocketsTelemetry.GetIoUringCounterValueForTest(SocketsTelemetry.IoUringCounterFieldForTest.AsyncCancelRequestCqes), + asyncCancelRequestCqeEnoentCount: 0, + asyncCancelRequestCqeEalreadyCount: 0, + asyncCancelRequestCqeOtherCount: 0, + socketEventBufferFullCount: SocketsTelemetry.GetIoUringCounterValueForTest(SocketsTelemetry.IoUringCounterFieldForTest.SocketEventBufferFull), + unsupportedOpcodePrepareCount: 0, + cqOverflowCount: SocketsTelemetry.GetIoUringCounterValueForTest(SocketsTelemetry.IoUringCounterFieldForTest.CqOverflow)); + } + + internal static IoUringProvidedBufferSnapshotForTest GetIoUringProvidedBufferSnapshotForTest() + { + bool hasIoUringPort = false; + bool supportsProvidedBufferRings = false; + bool hasProvidedBufferRing = false; + bool hasRegisteredBuffers = false; + bool adaptiveBufferSizingEnabled = false; + int availableCount = 0; + int inUseCount = 0; + int totalBufferCount = 0; + int bufferSize = 0; + int recommendedBufferSize = 0; + long recycledCount = 0; + long allocationFailureCount = 0; + + foreach (SocketAsyncEngine engine in s_engines) + { + if (!engine.IsIoUringCompletionModeEnabled) + { + continue; + } + + hasIoUringPort = true; + if (!engine._ioUringCapabilities.SupportsProvidedBufferRings) + { + continue; + } + + supportsProvidedBufferRings = true; + adaptiveBufferSizingEnabled |= engine._adaptiveBufferSizingEnabled; + hasRegisteredBuffers |= engine._ioUringCapabilities.HasRegisteredBuffers; + + IoUringProvidedBufferRing? 
providedBufferRing = engine._ioUringProvidedBufferRing; + if (providedBufferRing is null) + { + continue; + } + + hasProvidedBufferRing = true; + availableCount += providedBufferRing.AvailableCount; + inUseCount += providedBufferRing.InUseCount; + recycledCount += providedBufferRing.RecycledCount; + allocationFailureCount += providedBufferRing.AllocationFailureCount; + bufferSize = Math.Max(bufferSize, providedBufferRing.BufferSize); + recommendedBufferSize = Math.Max(recommendedBufferSize, providedBufferRing.RecommendedBufferSize); + totalBufferCount += providedBufferRing.TotalBufferCountForTest; + } + + return new IoUringProvidedBufferSnapshotForTest( + hasIoUringPort, + supportsProvidedBufferRings, + hasProvidedBufferRing, + hasRegisteredBuffers, + adaptiveBufferSizingEnabled, + availableCount, + inUseCount, + totalBufferCount, + bufferSize, + recommendedBufferSize, + recycledCount, + allocationFailureCount); + } + + internal static IoUringZeroCopySendSnapshotForTest GetIoUringZeroCopySendSnapshotForTest() + { + bool hasIoUringPort = false; + bool supportsSendZc = false; + bool supportsSendMsgZc = false; + bool zeroCopySendEnabled = false; + + foreach (SocketAsyncEngine engine in s_engines) + { + if (!engine.IsIoUringCompletionModeEnabled) + { + continue; + } + + hasIoUringPort = true; + supportsSendZc |= engine._supportsOpSendZc; + supportsSendMsgZc |= engine._supportsOpSendMsgZc; + zeroCopySendEnabled |= engine._zeroCopySendEnabled; + } + + return new IoUringZeroCopySendSnapshotForTest( + hasIoUringPort, + supportsSendZc, + supportsSendMsgZc, + zeroCopySendEnabled); + } + + internal static IoUringFixedRecvSnapshotForTest GetIoUringFixedRecvSnapshotForTest() + { + bool hasIoUringPort = false; + bool supportsReadFixed = false; + bool hasRegisteredBuffers = false; + + foreach (SocketAsyncEngine engine in s_engines) + { + if (!engine.IsIoUringCompletionModeEnabled) + { + continue; + } + + hasIoUringPort = true; + supportsReadFixed |= engine._supportsOpReadFixed; + hasRegisteredBuffers |= engine._ioUringCapabilities.HasRegisteredBuffers; + } + + return new IoUringFixedRecvSnapshotForTest( + hasIoUringPort, + supportsReadFixed, + hasRegisteredBuffers); + } + + internal static IoUringSqPollSnapshotForTest GetIoUringSqPollSnapshotForTest() + { + bool hasIoUringPort = false; + bool sqPollEnabled = false; + bool deferTaskrunEnabled = false; + + foreach (SocketAsyncEngine engine in s_engines) + { + if (!engine.IsIoUringCompletionModeEnabled) + { + continue; + } + + hasIoUringPort = true; + sqPollEnabled |= engine._sqPollEnabled; + deferTaskrunEnabled |= (engine._managedNegotiatedFlags & IoUringConstants.SetupDeferTaskrun) != 0; + } + + return new IoUringSqPollSnapshotForTest(hasIoUringPort, sqPollEnabled, deferTaskrunEnabled); + } + + internal static bool IsAnyIoUringSqPollEngineNeedingWakeupForTest() + { + foreach (SocketAsyncEngine engine in s_engines) + { + if (engine.IsIoUringCompletionModeEnabled && + engine._sqPollEnabled && + engine.SqNeedWakeup()) + { + return true; + } + } + + return false; + } + + internal static bool TryValidateSqNeedWakeupMatchesRawSqFlagBitForTest(out bool matches) + { + foreach (SocketAsyncEngine engine in s_engines) + { + if (!engine.IsIoUringCompletionModeEnabled || !engine._sqPollEnabled) + { + continue; + } + + bool methodValue = engine.SqNeedWakeup(); + if (engine._managedSqFlagsPtr == null) + { + matches = methodValue; + return true; + } + + bool rawValue = (Volatile.Read(ref *engine._managedSqFlagsPtr) & IoUringConstants.SqNeedWakeup) != 0; + matches = 
rawValue == methodValue; + return true; + } + + matches = true; + return false; + } + + internal static IoUringZeroCopyPinHoldSnapshotForTest GetIoUringZeroCopyPinHoldSnapshotForTest() + { + bool hasIoUringPort = false; + int activePinHolds = 0; + int pendingNotificationCount = 0; + + foreach (SocketAsyncEngine engine in s_engines) + { + if (!engine.IsIoUringCompletionModeEnabled) + { + continue; + } + + hasIoUringPort = true; + MemoryHandle[]? pinHolds = engine._zeroCopyPinHolds; + if (pinHolds is not null) + { + for (int i = 0; i < pinHolds.Length; i++) + { + if (!pinHolds[i].Equals(default(MemoryHandle))) + { + activePinHolds++; + } + } + } + + pendingNotificationCount += engine.CountZeroCopyNotificationPendingSlots(); + } + + return new IoUringZeroCopyPinHoldSnapshotForTest( + hasIoUringPort, + activePinHolds, + pendingNotificationCount); + } + + internal static bool TryForceIoUringProvidedBufferRingExhaustionForTest(out int forcedBufferCount) + { +#if DEBUG + foreach (SocketAsyncEngine engine in s_engines) + { + if (TryForceIoUringProvidedBufferRingExhaustionForCurrentEngineForTest(engine, out forcedBufferCount)) + { + return true; + } + } + + forcedBufferCount = 0; + return false; +#else + forcedBufferCount = 0; + return false; +#endif + } + + internal static bool TryRecycleForcedIoUringProvidedBufferRingForTest(out int recycledBufferCount) + { +#if DEBUG + foreach (SocketAsyncEngine engine in s_engines) + { + if (TryRecycleForcedIoUringProvidedBufferRingForCurrentEngineForTest(engine, out recycledBufferCount)) + { + return true; + } + } + + recycledBufferCount = 0; + return false; +#else + recycledBufferCount = 0; + return false; +#endif + } + + private static bool TryForceIoUringProvidedBufferRingExhaustionForCurrentEngineForTest( + SocketAsyncEngine engine, + out int forcedBufferCount) + { +#if DEBUG + IoUringProvidedBufferRing? providedBufferRing = engine._ioUringProvidedBufferRing; + if (!engine.IsIoUringCompletionModeEnabled || + !engine._ioUringCapabilities.SupportsProvidedBufferRings || + providedBufferRing is null) + { + forcedBufferCount = 0; + return false; + } + + // Test-only deterministic exhaustion setup to force provided-buffer depletion paths. + providedBufferRing.ForceAllBuffersCheckedOutForTest(); + forcedBufferCount = providedBufferRing.TotalBufferCountForTest; + return true; +#else + forcedBufferCount = 0; + return false; +#endif + } + + private static bool TryRecycleForcedIoUringProvidedBufferRingForCurrentEngineForTest( + SocketAsyncEngine engine, + out int recycledBufferCount) + { +#if DEBUG + IoUringProvidedBufferRing? 
providedBufferRing = engine._ioUringProvidedBufferRing; + if (!engine.IsIoUringCompletionModeEnabled || + !engine._ioUringCapabilities.SupportsProvidedBufferRings || + providedBufferRing is null) + { + recycledBufferCount = 0; + return false; + } + + recycledBufferCount = providedBufferRing.RecycleCheckedOutBuffersForTeardown(); + return true; +#else + recycledBufferCount = 0; + return false; +#endif + } + + internal static IoUringNativeMsghdrLayoutSnapshotForTest GetIoUringNativeMsghdrLayoutForTest() + { + NativeMsghdr layout = default; + byte* basePtr = (byte*)&layout; + + return new IoUringNativeMsghdrLayoutSnapshotForTest( + size: sizeof(NativeMsghdr), + msgNameOffset: (int)((byte*)&layout.MsgName - basePtr), + msgNameLengthOffset: (int)((byte*)&layout.MsgNameLen - basePtr), + msgIovOffset: (int)((byte*)&layout.MsgIov - basePtr), + msgIovLengthOffset: (int)((byte*)&layout.MsgIovLen - basePtr), + msgControlOffset: (int)((byte*)&layout.MsgControl - basePtr), + msgControlLengthOffset: (int)((byte*)&layout.MsgControlLen - basePtr), + msgFlagsOffset: (int)((byte*)&layout.MsgFlags - basePtr)); + } + + internal static IoUringCompletionSlotLayoutSnapshotForTest GetIoUringCompletionSlotLayoutForTest() + { + IoUringCompletionSlot slot = default; + byte* basePtr = (byte*)&slot; + + return new IoUringCompletionSlotLayoutSnapshotForTest( + size: sizeof(IoUringCompletionSlot), + generationOffset: (int)((byte*)&slot.Generation - basePtr), + freeListNextOffset: (int)((byte*)&slot.FreeListNext - basePtr), + packedStateOffset: (int)Marshal.OffsetOf(typeof(IoUringCompletionSlot), "_packedState"), + fixedRecvBufferIdOffset: (int)((byte*)&slot.FixedRecvBufferId - basePtr), +#if DEBUG + testForcedResultOffset: (int)((byte*)&slot.TestForcedResult - basePtr) +#else + testForcedResultOffset: -1 +#endif + ); + } + + internal static ulong EncodeCompletionSlotUserDataForTest(int slotIndex, ulong generation) => + EncodeCompletionSlotUserData(slotIndex, generation); + + internal static bool TryDecodeCompletionSlotUserDataForTest(ulong userData, out int slotIndex, out ulong generation) + { + slotIndex = 0; + generation = 0; + + if ((byte)(userData >> IoUringUserDataTagShift) != IoUringConstants.TagReservedCompletion) + { + return false; + } + + ulong payload = userData & IoUringUserDataPayloadMask; + slotIndex = DecodeCompletionSlotIndex(payload); + generation = (payload >> IoUringConstants.SlotIndexBits) & IoUringConstants.GenerationMask; + return true; + } + + internal static ulong IncrementCompletionSlotGenerationForTest(ulong generation) + { + ulong nextGeneration = (generation + 1UL) & IoUringConstants.GenerationMask; + return nextGeneration == 0 ? 1UL : nextGeneration; + } + + internal static bool IsTrackedIoUringUserDataForTest(ulong userData) + { + if (!TryGetFirstIoUringEngineForTest(out SocketAsyncEngine? 
ioUringEngine) || + ioUringEngine is null || + !ioUringEngine.IsIoUringCompletionModeEnabled || + ioUringEngine._trackedOperations is null) + { + return false; + } + + if (!TryDecodeCompletionSlotUserDataForTest(userData, out int slotIndex, out ulong generation)) + { + return false; + } + + IoUringTrackedOperationState[] trackedOperations = ioUringEngine._trackedOperations; + if ((uint)slotIndex >= (uint)trackedOperations.Length) + { + return false; + } + + ref IoUringTrackedOperationState trackedState = ref trackedOperations[slotIndex]; + return Volatile.Read(ref trackedState.TrackedOperationGeneration) == generation && + Volatile.Read(ref trackedState.TrackedOperation) is not null; + } + + [RequiresUnreferencedCode("Uses MethodBase.GetMethodBody() for test-only IL ordering validation.")] + internal static bool ValidateIoUringProvidedBufferTeardownOrderingForTest() + { + MethodInfo teardownMethod = typeof(SocketAsyncEngine).GetMethod("LinuxFreeIoUringResources", BindingFlags.NonPublic | BindingFlags.Instance)!; + MethodInfo freeProvidedBufferRingMethod = typeof(SocketAsyncEngine).GetMethod("FreeIoUringProvidedBufferRing", BindingFlags.NonPublic | BindingFlags.Instance)!; + MethodInfo cleanupManagedRingsMethod = typeof(SocketAsyncEngine).GetMethod("CleanupManagedRings", BindingFlags.NonPublic | BindingFlags.Instance)!; + +#if DEBUG + byte[] ilBytes = teardownMethod.GetMethodBody()?.GetILAsByteArray() ?? Array.Empty<byte>(); + if (ilBytes.Length == 0) + { + return false; + } + + ReadOnlySpan<byte> il = ilBytes; + int freeProvidedBufferRingOffset = FindCallMethodTokenOffset(il, freeProvidedBufferRingMethod.MetadataToken); + int cleanupManagedRingsOffset = FindCallMethodTokenOffset(il, cleanupManagedRingsMethod.MetadataToken); + + return freeProvidedBufferRingOffset >= 0 && + cleanupManagedRingsOffset >= 0 && + freeProvidedBufferRingOffset < cleanupManagedRingsOffset; +#else + return true; +#endif + } + + private static int FindCallMethodTokenOffset(ReadOnlySpan<byte> il, int targetMetadataToken) + { + Span<byte> tokenBytes = stackalloc byte[sizeof(int)]; + System.Buffers.Binary.BinaryPrimitives.WriteInt32LittleEndian(tokenBytes, targetMetadataToken); + + for (int i = 1; i <= il.Length - sizeof(int); i++) + { + // call (0x28) and callvirt (0x6F) are followed by a 4-byte method token. 
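+ // Note: this scan matches raw bytes without decoding the IL stream, so a 4-byte immediate that + // happens to follow a 0x28/0x6F byte could in principle false-positive; acceptable for this DEBUG-only ordering check.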
+ byte opcode = il[i - 1]; + if (opcode != 0x28 && opcode != 0x6F) + { + continue; + } + + if (il.Slice(i, sizeof(int)).SequenceEqual(tokenBytes)) + { + return i - 1; + } + } + + return -1; + } + + internal static bool TryInjectIoUringCqOverflowForTest(uint delta, out int injectedEngineCount) + { +#if DEBUG + injectedEngineCount = 0; + if (delta == 0) + { + return false; + } + + foreach (SocketAsyncEngine engine in s_engines) + { + if (!engine.IsIoUringCompletionModeEnabled || engine._managedCqOverflowPtr == null) + { + continue; + } + + Volatile.Write( + ref *engine._managedCqOverflowPtr, + unchecked(Volatile.Read(ref *engine._managedCqOverflowPtr) + delta)); + injectedEngineCount++; + } + + return injectedEngineCount != 0; +#else + injectedEngineCount = 0; + return false; +#endif + } + + internal static bool HasActiveIoUringEngineWithInitializedCqStateForTest() + { + foreach (SocketAsyncEngine engine in s_engines) + { + if (!engine.IsIoUringCompletionModeEnabled) + { + continue; + } + + return engine._managedCqRingPtr != null && + engine._managedCqOverflowPtr != null && + engine._completionSlots is not null && + engine._trackedOperations is not null && + engine._completionSlotStorage is not null; + } + + return false; + } + + internal static int GetIoUringCompletionSlotsInUseForTest() + { + int totalInUse = 0; + foreach (SocketAsyncEngine engine in s_engines) + { + if (engine.IsIoUringCompletionModeEnabled) + { + totalInUse += Volatile.Read(ref engine._completionSlotsInUse); + } + } + + return totalInUse; + } + + internal static int GetIoUringTrackedOperationCountForTest() + { + int totalTracked = 0; + foreach (SocketAsyncEngine engine in s_engines) + { + if (!engine.IsIoUringCompletionModeEnabled || engine._trackedOperations is null) + { + continue; + } + + IoUringTrackedOperationState[] storage = engine._trackedOperations; + for (int i = 0; i < storage.Length; i++) + { + if (Volatile.Read(ref storage[i].TrackedOperation) is not null) + { + totalTracked++; + } + } + } + + return totalTracked; + } + + internal static bool TryGetIoUringRingFdForTest(out int ringFd) + { + foreach (SocketAsyncEngine engine in s_engines) + { + if (engine.IsIoUringCompletionModeEnabled && engine._managedRingFd >= 0) + { + ringFd = engine._managedRingFd; + return true; + } + } + + ringFd = -1; + return false; + } + + internal static bool TryGetIoUringWakeupEventFdForTest(out int eventFd) + { + foreach (SocketAsyncEngine engine in s_engines) + { + if (engine.IsIoUringCompletionModeEnabled && engine._managedWakeupEventFd >= 0) + { + eventFd = engine._managedWakeupEventFd; + return true; + } + } + + eventFd = -1; + return false; + } + + internal static bool TryGetFirstIoUringEngineForTest(out SocketAsyncEngine? ioUringEngine) + { + foreach (SocketAsyncEngine engine in s_engines) + { + if (engine.IsIoUringCompletionModeEnabled) + { + ioUringEngine = engine; + return true; + } + } + + ioUringEngine = null; + return false; + } + } +} diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUringTestInfrastructure/SocketAsyncEngine.IoUringTestHooks.Linux.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUringTestInfrastructure/SocketAsyncEngine.IoUringTestHooks.Linux.cs new file mode 100644 index 00000000000000..9361c9128facad --- /dev/null +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUringTestInfrastructure/SocketAsyncEngine.IoUringTestHooks.Linux.cs @@ -0,0 +1,235 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace System.Net.Sockets +{ + internal sealed unsafe partial class SocketAsyncEngine + { +#if DEBUG + // Raw Linux errno value used by forced-completion test hooks. + private const int ErrnoECANCELED = 125; +#endif + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ResetDebugTestForcedResult(ref IoUringCompletionSlot slot) + { +#if DEBUG + slot.HasTestForcedResult = false; + slot.TestForcedResult = 0; +#endif + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ResolveDebugTestForcedResult(ref IoUringCompletionSlot slot, ref int result) + { +#if DEBUG + if (slot.HasTestForcedResult) + { + result = slot.TestForcedResult; + slot.HasTestForcedResult = false; + } +#endif + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ApplyDebugTestForcedResult(ref IoUringCompletionSlot slot, byte opcode) + { +#if DEBUG + if ((_testForceEagainOnceMask | _testForceEcanceledOnceMask) == 0) + { + return; + } + + if (TryConsumeTestForcedResult(opcode, out int forced)) + { + slot.HasTestForcedResult = true; + slot.TestForcedResult = forced; + } +#else + _ = _ioUringInitialized; +#endif + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void RestoreDebugTestForcedResultIfNeeded(int slotIndex, byte opcode) + { +#if DEBUG + Debug.Assert(_completionSlots is not null); + ref IoUringCompletionSlot slot = ref _completionSlots![slotIndex]; + if (slot.HasTestForcedResult) + { + RestoreTestForcedResult(slot.TestForcedResult, opcode); + } +#else + _ = _ioUringInitialized; +#endif + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private void InitializeDebugTestHooksFromEnvironment() + { +#if DEBUG + // Mirrors native pal_io_uring.c test hooks. + _testForceEagainOnceMask = ParseTestOpcodeMask( + Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.ForceEagainOnceMask)); + _testForceEcanceledOnceMask = ParseTestOpcodeMask( + Environment.GetEnvironmentVariable(IoUringTestEnvironmentVariables.ForceEcanceledOnceMask)); + string? forceSubmitEperm = Environment.GetEnvironmentVariable( + IoUringTestEnvironmentVariables.ForceSubmitEpermOnce); + _testForceSubmitEpermOnce = string.Equals(forceSubmitEperm, "1", StringComparison.Ordinal) ? 1 : 0; +#else + _ = _ioUringInitialized; +#endif + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool TryConsumeDebugForcedSubmitError(out Interop.Error forcedError) + { + _ = _ioUringInitialized; + +#if DEBUG + if (Interlocked.Exchange(ref _testForceSubmitEpermOnce, 0) != 0) + { + forcedError = Interop.Error.EPERM; + return true; + } +#endif + + forcedError = Interop.Error.SUCCESS; + return false; + } + +#if DEBUG + /// + /// Parses a comma-separated list of opcode names (e.g. "send,recv,accept") into a + /// bitmask of TestOpcodeMask* values. + /// Mirrors GetIoUringTestOpcodeMaskFromOpcodeNameList in pal_io_uring.c. + /// + private static byte ParseTestOpcodeMask(string? 
opcodeNameList) + { + if (string.IsNullOrEmpty(opcodeNameList)) + { + return IoUringConstants.TestOpcodeMaskNone; + } + + byte mask = IoUringConstants.TestOpcodeMaskNone; + foreach (var name in opcodeNameList.Split(',', StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries)) + { + if (name.Equals("send", StringComparison.OrdinalIgnoreCase)) + { + mask |= IoUringConstants.TestOpcodeMaskSend; + } + else if (name.Equals("recv", StringComparison.OrdinalIgnoreCase)) + { + mask |= IoUringConstants.TestOpcodeMaskRecv; + } + else if (name.Equals("sendmsg", StringComparison.OrdinalIgnoreCase)) + { + mask |= IoUringConstants.TestOpcodeMaskSendMsg; + } + else if (name.Equals("recvmsg", StringComparison.OrdinalIgnoreCase)) + { + mask |= IoUringConstants.TestOpcodeMaskRecvMsg; + } + else if (name.Equals("accept", StringComparison.OrdinalIgnoreCase)) + { + mask |= IoUringConstants.TestOpcodeMaskAccept; + } + else if (name.Equals("connect", StringComparison.OrdinalIgnoreCase)) + { + mask |= IoUringConstants.TestOpcodeMaskConnect; + } + else if (name.Equals("sendzc", StringComparison.OrdinalIgnoreCase) || name.Equals("send_zc", StringComparison.OrdinalIgnoreCase)) + { + mask |= IoUringConstants.TestOpcodeMaskSendZc; + } + else if (name.Equals("sendmsgzc", StringComparison.OrdinalIgnoreCase) || name.Equals("sendmsg_zc", StringComparison.OrdinalIgnoreCase)) + { + mask |= IoUringConstants.TestOpcodeMaskSendMsgZc; + } + } + return mask; + } + + /// + /// Maps an io_uring opcode to its corresponding test opcode mask bit. + /// Mirrors GetIoUringTestOpcodeMaskFromOpcode in pal_io_uring.c. + /// + private static byte GetTestOpcodeMaskFromOpcode(byte opcode) + { + return opcode switch + { + IoUringOpcodes.Send => IoUringConstants.TestOpcodeMaskSend, + IoUringOpcodes.Recv => IoUringConstants.TestOpcodeMaskRecv, + IoUringOpcodes.SendMsg => IoUringConstants.TestOpcodeMaskSendMsg, + IoUringOpcodes.RecvMsg => IoUringConstants.TestOpcodeMaskRecvMsg, + IoUringOpcodes.Accept => IoUringConstants.TestOpcodeMaskAccept, + IoUringOpcodes.Connect => IoUringConstants.TestOpcodeMaskConnect, + IoUringOpcodes.SendZc => IoUringConstants.TestOpcodeMaskSendZc, + IoUringOpcodes.SendMsgZc => IoUringConstants.TestOpcodeMaskSendMsgZc, + _ => IoUringConstants.TestOpcodeMaskNone, + }; + } + + /// + /// Tries to consume a forced test result for the given opcode. + /// EAGAIN takes priority over ECANCELED when both are set. + /// Mirrors TryConsumeIoUringForcedCompletionResultLocked in pal_io_uring.c. + /// + private bool TryConsumeTestForcedResult(byte opcode, out int forcedResult) + { + forcedResult = 0; + byte opcodeMask = GetTestOpcodeMaskFromOpcode(opcode); + if (opcodeMask == IoUringConstants.TestOpcodeMaskNone) + { + return false; + } + + if ((_testForceEagainOnceMask & opcodeMask) != 0) + { + _testForceEagainOnceMask &= (byte)~opcodeMask; + forcedResult = -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.EAGAIN); + return true; + } + + if ((_testForceEcanceledOnceMask & opcodeMask) != 0) + { + _testForceEcanceledOnceMask &= (byte)~opcodeMask; + forcedResult = -ErrnoECANCELED; + return true; + } + + return false; + } + + /// + /// Restores a previously consumed forced test result mask bit. + /// Called when SQE acquisition fails after the forced result was consumed, + /// so the test hook can fire on the next attempt. + /// Mirrors RestoreIoUringForcedCompletionResultLocked in pal_io_uring.c. 
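+ /// The consumed result is mapped back to its mask bit by comparing against the negated EAGAIN/ECANCELED errno values.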
+ /// + private void RestoreTestForcedResult(int forcedResult, byte opcode) + { + byte opcodeMask = GetTestOpcodeMaskFromOpcode(opcode); + if (opcodeMask == IoUringConstants.TestOpcodeMaskNone) + { + return; + } + + if (forcedResult == -Interop.Sys.ConvertErrorPalToPlatform(Interop.Error.EAGAIN)) + { + _testForceEagainOnceMask |= opcodeMask; + } + else if (forcedResult == -ErrnoECANCELED) + { + _testForceEcanceledOnceMask |= opcodeMask; + } + } +#endif + } +} diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/MpscQueueTests.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/MpscQueueTests.cs new file mode 100644 index 00000000000000..b49ff6d7b30747 --- /dev/null +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/MpscQueueTests.cs @@ -0,0 +1,310 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.DotNet.XUnitExtensions; +using Xunit; + +namespace System.Net.Sockets.Tests +{ + [PlatformSpecific(TestPlatforms.Linux)] // MPSC queue is used by Linux io_uring paths. + public class MpscQueueTests + { + private sealed class QueueProxy<T> + { + private readonly MpscQueue<T> _queue; + + public QueueProxy(int segmentSize) + { + _queue = new MpscQueue<T>(segmentSize); + } + + public void Enqueue(T item) => _queue.Enqueue(item); + public bool TryEnqueue(T item) => _queue.TryEnqueue(item); + public bool TryDequeue(out T item) => _queue.TryDequeue(out item); + public bool IsEmpty => _queue.IsEmpty; + } + + [Fact] + public void MpscQueue_SingleProducerSingleConsumer_PreservesOrder() + { + const int count = 1024; + var queue = new QueueProxy<int>(segmentSize: 16); + + for (int i = 0; i < count; i++) + { + queue.Enqueue(i); + } + + for (int i = 0; i < count; i++) + { + Assert.True(queue.TryDequeue(out int value)); + Assert.Equal(i, value); + } + + Assert.True(queue.IsEmpty); + Assert.False(queue.TryDequeue(out _)); + } + + [Fact] + public async Task MpscQueue_MultiProducerSingleConsumer_ReceivesAllItems() + { + const int producerCount = 4; + const int itemsPerProducer = 2000; + const int totalItems = producerCount * itemsPerProducer; + var queue = new QueueProxy<int>(segmentSize: 32); + + Task[] producers = new Task[producerCount]; + for (int p = 0; p < producerCount; p++) + { + int producerIndex = p; + producers[p] = Task.Run(() => + { + int baseValue = producerIndex * itemsPerProducer; + for (int i = 0; i < itemsPerProducer; i++) + { + queue.Enqueue(baseValue + i); + } + }); + } + + var seen = new bool[totalItems]; + int received = 0; + var spin = new SpinWait(); + while (received < totalItems) + { + if (queue.TryDequeue(out int value)) + { + Assert.InRange(value, 0, totalItems - 1); + Assert.False(seen[value], $"duplicate dequeue value: {value}"); + seen[value] = true; + received++; + } + else + { + spin.SpinOnce(); + } + } + + await Task.WhenAll(producers); + Assert.All(seen, Assert.True); + Assert.True(queue.IsEmpty); + } + + [Fact] + public void MpscQueue_EmptyQueue_ReportsEmptyAndTryDequeueFalse() + { + var queue = new QueueProxy<int>(segmentSize: 8); + + Assert.True(queue.IsEmpty); + Assert.False(queue.TryDequeue(out _)); + } + + [Theory] + [InlineData(0)] + [InlineData(-1)] + public void MpscQueue_Ctor_InvalidSegmentSize_Throws(int segmentSize) + { + Assert.Throws<ArgumentOutOfRangeException>(() => new QueueProxy<int>(segmentSize)); + } + + [Fact] + public void 
MpscQueue_SegmentCrossing_WorksAcrossMultipleSegments() + { + const int count = 37; + var queue = new QueueProxy<int>(segmentSize: 2); + + for (int i = 0; i < count; i++) + { + queue.Enqueue(i); + } + + for (int i = 0; i < count; i++) + { + Assert.True(queue.TryDequeue(out int value)); + Assert.Equal(i, value); + } + + Assert.True(queue.IsEmpty); + } + + [Fact] + public async Task MpscQueue_SegmentSizeOne_MultiProducerSingleConsumer_ReceivesAllItems() + { + const int producerCount = 3; + const int itemsPerProducer = 1000; + const int totalItems = producerCount * itemsPerProducer; + var queue = new QueueProxy<int>(segmentSize: 1); + + Task[] producers = new Task[producerCount]; + for (int p = 0; p < producerCount; p++) + { + int producerIndex = p; + producers[p] = Task.Run(() => + { + int baseValue = producerIndex * itemsPerProducer; + for (int i = 0; i < itemsPerProducer; i++) + { + queue.Enqueue(baseValue + i); + } + }); + } + + var seen = new bool[totalItems]; + int received = 0; + var spin = new SpinWait(); + while (received < totalItems) + { + if (queue.TryDequeue(out int value)) + { + Assert.InRange(value, 0, totalItems - 1); + Assert.False(seen[value], $"duplicate dequeue value: {value}"); + seen[value] = true; + received++; + } + else + { + spin.SpinOnce(); + } + } + + await Task.WhenAll(producers); + Assert.All(seen, Assert.True); + Assert.True(queue.IsEmpty); + } + + [Fact] + public async Task MpscQueue_Stress_NoLossAndNoDeadlock() + { + const int producerCount = 6; + const int itemsPerProducer = 4000; + const int totalItems = producerCount * itemsPerProducer; + var queue = new QueueProxy<int>(segmentSize: 32); + + Task[] producers = new Task[producerCount]; + for (int p = 0; p < producerCount; p++) + { + int producerIndex = p; + producers[p] = Task.Run(() => + { + int baseValue = producerIndex * itemsPerProducer; + for (int i = 0; i < itemsPerProducer; i++) + { + queue.Enqueue(baseValue + i); + } + }); + } + + var seen = new HashSet<int>(); + int received = 0; + var timeout = Stopwatch.StartNew(); + while (received < totalItems) + { + if (timeout.Elapsed > TimeSpan.FromSeconds(30)) + { + throw new TimeoutException($"Timed out draining MPSC queue. received={received}, expected={totalItems}"); + } + + if (queue.TryDequeue(out int value)) + { + Assert.True(seen.Add(value), $"duplicate dequeue value: {value}"); + received++; + } + else + { + await Task.Yield(); + } + } + + await Task.WhenAll(producers); + Assert.Equal(totalItems, seen.Count); + Assert.True(queue.IsEmpty); + } + + [Fact] + public async Task MpscQueue_Arm64_ConcurrentStress_NoLossOrDeadlock() + { + if (!PlatformDetection.IsArm64Process) + { + return; + } + + const int producerCount = 8; + const int itemsPerProducer = 20000; + const int totalItems = producerCount * itemsPerProducer; + var queue = new QueueProxy<int>(segmentSize: 4); + + Task[] producers = new Task[producerCount]; + for (int p = 0; p < producerCount; p++) + { + int producerIndex = p; + producers[p] = Task.Run(() => + { + int baseValue = producerIndex * itemsPerProducer; + for (int i = 0; i < itemsPerProducer; i++) + { + queue.Enqueue(baseValue + i); + } + }); + } + + var seen = new bool[totalItems]; + int received = 0; + var timeout = Stopwatch.StartNew(); + while (received < totalItems) + { + if (timeout.Elapsed > TimeSpan.FromSeconds(90)) + { + throw new TimeoutException($"Timed out draining ARM64 MPSC stress queue. 
received={received}, expected={totalItems}"); + } + + if (queue.TryDequeue(out int value)) + { + Assert.InRange(value, 0, totalItems - 1); + Assert.False(seen[value], $"duplicate dequeue value: {value}"); + seen[value] = true; + received++; + continue; + } + + await Task.Yield(); + } + + await Task.WhenAll(producers); + Assert.All(seen, Assert.True); + Assert.True(queue.IsEmpty); + } + + [Fact] + public void MpscQueue_TryEnqueue_RecoversAfterSegmentAllocationOom() + { +#if DEBUG + var queue = new QueueProxy<int>(segmentSize: 1); + queue.Enqueue(1); + MpscQueue.SetSegmentAllocationFailuresForTest(1); + try + { + Assert.False(queue.TryEnqueue(2)); + + Assert.True(queue.TryDequeue(out int first)); + Assert.Equal(1, first); + + Assert.True(queue.TryEnqueue(2)); + Assert.True(queue.TryDequeue(out int second)); + Assert.Equal(2, second); + Assert.True(queue.IsEmpty); + } + finally + { + MpscQueue.SetSegmentAllocationFailuresForTest(0); + } +#else + Assert.True(true); +#endif + } + } +} diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/System.Net.Sockets.Tests.csproj b/src/libraries/System.Net.Sockets/tests/FunctionalTests/System.Net.Sockets.Tests.csproj index 43844aea397681..c8a14decece008 100644 --- a/src/libraries/System.Net.Sockets/tests/FunctionalTests/System.Net.Sockets.Tests.csproj +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/System.Net.Sockets.Tests.csproj @@ -6,7 +6,90 @@ true true true + + default + <_SocketsIoUringTestModeSupported Condition="'$(SocketsIoUringTestMode)' == 'default' or '$(SocketsIoUringTestMode)' == 'enabled' or '$(SocketsIoUringTestMode)' == 'disabled'">true + true + + + + + + + + + + + + + + + + + <_IoUringVariantsRoot>$([MSBuild]::NormalizeDirectory('$(IntermediateOutputPath)', 'io_uring_variants')) + <_IoUringEnabledDir>$([MSBuild]::NormalizeDirectory('$(_IoUringVariantsRoot)', 'enabled')) + <_IoUringDisabledDir>$([MSBuild]::NormalizeDirectory('$(_IoUringVariantsRoot)', 'disabled')) + <_RunScriptName>RunTests.sh + <_EnabledArchivePath>$([MSBuild]::NormalizePath('$(TestArchiveTestsDir)', '$(TestProjectName).io_uring_enabled.zip')) + <_DisabledArchivePath>$([MSBuild]::NormalizePath('$(TestArchiveTestsDir)', '$(TestProjectName).io_uring_disabled.zip')) + + + + + + + <_OutDirFiles Include="$(OutDir)**/*" /> + + + + + + + + + + <_EnabledRunScriptLines Include="#!/usr/bin/env bash" /> + <_EnabledRunScriptLines Include="set -euo pipefail" /> + <_EnabledRunScriptLines Include="export DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1" /> + <_EnabledRunScriptLines Include="exec ./RunTests.base.sh "$@"" /> + <_DisabledRunScriptLines Include="#!/usr/bin/env bash" /> + <_DisabledRunScriptLines Include="set -euo pipefail" /> + <_DisabledRunScriptLines Include="export DOTNET_SYSTEM_NET_SOCKETS_IO_URING=0" /> + <_DisabledRunScriptLines Include="exec ./RunTests.base.sh "$@"" /> + + + + + + + + + + + @@ -22,6 +105,9 @@ + + + diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/TelemetryTest.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/TelemetryTest.cs index 69f61fc180a49c..c44eec0d8394fa 100644 --- a/src/libraries/System.Net.Sockets/tests/FunctionalTests/TelemetryTest.cs +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/TelemetryTest.cs @@ -7,6 +7,8 @@ using System.Diagnostics.Tracing; using System.Linq; using System.Net.Test.Common; +using System.Reflection; +using System.Runtime.InteropServices; using System.Threading; using System.Threading.Tasks; using Microsoft.DotNet.RemoteExecutor; @@ -20,6 +22,18 @@ public class 
TelemetryTest { private const string ActivitySourceName = "Experimental.System.Net.Sockets"; private const string ActivityName = ActivitySourceName + ".Connect"; + private static readonly string[] s_ioUringCounterNames = GetIoUringCounterNames(); + private static readonly string[] s_expectedIoUringCounterNames = new[] + { + "io-uring-completion-slot-exhaustions", + "io-uring-cq-overflows", + "io-uring-prepare-nonpinnable-fallbacks", + "io-uring-prepare-queue-overflow-fallbacks", + "io-uring-prepare-queue-overflows", + "io-uring-socket-event-buffer-full", + "io-uring-sqpoll-submissions-skipped", + "io-uring-sqpoll-wakeups" + }; private static readonly Lazy<Task<bool>> s_remoteServerIsReachable = new Lazy<Task<bool>>(() => Task.Run(async () => { @@ -46,6 +60,24 @@ public TelemetryTest(ITestOutputHelper output) _output = output; } + private static string[] GetIoUringCounterNames() + { + Type? counterNamesType = + typeof(Socket).Assembly.GetType("System.Net.Sockets.SocketsTelemetry+IoUringCounterNames", throwOnError: false); + + if (counterNamesType is null) + { + return Array.Empty<string>(); + } + + return counterNamesType + .GetFields(BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Static) + .Where(field => field.IsLiteral && !field.IsInitOnly && field.FieldType == typeof(string)) + .Select(field => (string)field.GetRawConstantValue()!) + .OrderBy(name => name, StringComparer.Ordinal) + .ToArray(); + } + [Fact] [ActiveIssue("https://github.com/dotnet/runtime/issues/107981", TestPlatforms.Wasi)] public static void EventSource_ExistsWithCorrectId() @@ -59,6 +91,160 @@ public static void EventSource_ExistsWithCorrectId() Assert.NotEmpty(EventSource.GenerateManifest(esType, esType.Assembly.Location)); } + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // Socket engine backend event is emitted by Linux engine initialization. + [ActiveIssue("https://github.com/dotnet/runtime/issues/107981", TestPlatforms.Wasi)] + public async Task EventSource_SocketEngineBackendSelected_Emitted() + { + await RemoteExecutor.Invoke(async () => + { + using var listener = new TestEventListener("System.Net.Sockets", EventLevel.Verbose, 0.1); + listener.AddActivityTracking(); + + var events = new ConcurrentQueue<(EventWrittenEventArgs Event, Guid ActivityId)>(); + await listener.RunWithCallbackAsync(e => events.Enqueue((e, e.ActivityId)), async () => + { + using var server = new Socket(SocketType.Stream, ProtocolType.Tcp); + server.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + server.Listen(); + + using var client = new Socket(SocketType.Stream, ProtocolType.Tcp); + Task connectTask = client.ConnectAsync(server.LocalEndPoint); + using var accepted = await server.AcceptAsync(); + await connectTask; + + await WaitForEventAsync(events, "SocketEngineBackendSelected"); + }); + + EventWrittenEventArgs[] backendEvents = events + .Where(e => e.Event.EventName == "SocketEngineBackendSelected") + .Select(e => e.Event) + .ToArray(); + + Assert.NotEmpty(backendEvents); + foreach (EventWrittenEventArgs backendEvent in backendEvents) + { + Assert.Equal(3, backendEvent.Payload?.Count ?? 0); + string backend = Assert.IsType<string>(backendEvent.Payload![0]); + int isIoUringPort = Convert.ToInt32(backendEvent.Payload[1]); + int sqPollEnabled = Convert.ToInt32(backendEvent.Payload[2]); + + Assert.True( + backend == "epoll" || backend == "io_uring_completion", + $"Unexpected backend payload: {backend}"); + Assert.Equal(backend == "io_uring_completion" ? 
1 : 0, isIoUringPort); + Assert.True(sqPollEnabled == 0 || sqPollEnabled == 1, $"Unexpected sqpoll payload: {sqPollEnabled}"); + if (backend == "epoll") + { + Assert.Equal(0, sqPollEnabled); + } + } + }).DisposeAsync(); + } + + [Fact] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring interop types are Linux-only. + [ActiveIssue("https://github.com/dotnet/runtime/issues/107981", TestPlatforms.Wasi)] + public static void IoUringSocketEventPortDiagnostics_LayoutContract() + { + Type? type = typeof(Socket).Assembly.GetType("Interop+Sys+IoUringSocketEventPortDiagnostics", throwOnError: false, ignoreCase: false); + if (type is null) + { + return; + } + + Assert.True(type.IsLayoutSequential); + + Assert.Equal(0, Marshal.OffsetOf(type, "AsyncCancelRequestCqeCount").ToInt32()); + Assert.Equal(8, Marshal.OffsetOf(type, "AsyncCancelRequestCqeEnoentCount").ToInt32()); + Assert.Equal(16, Marshal.OffsetOf(type, "AsyncCancelRequestCqeEalreadyCount").ToInt32()); + Assert.Equal(24, Marshal.OffsetOf(type, "AsyncCancelRequestCqeOtherCount").ToInt32()); + Assert.Equal(32, Marshal.OffsetOf(type, "SocketEventBufferFullCount").ToInt32()); + + if (type.GetField("CompletionBufferFullCount", BindingFlags.Public | BindingFlags.Instance) is not null) + { + Assert.Equal(40, Marshal.OffsetOf(type, "CompletionBufferFullCount").ToInt32()); + Assert.Equal(48, Marshal.OffsetOf(type, "UnsupportedOpcodePrepareCount").ToInt32()); + Assert.Equal(56, Marshal.OffsetOf(type, "CqOverflowCount").ToInt32()); + Assert.Equal(64, Marshal.SizeOf(type)); + } + else + { + Assert.Equal(40, Marshal.OffsetOf(type, "UnsupportedOpcodePrepareCount").ToInt32()); + Assert.Equal(48, Marshal.OffsetOf(type, "CqOverflowCount").ToInt32()); + Assert.Equal(56, Marshal.SizeOf(type)); + } + } + + [Fact] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring interop types are Linux-only. 
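+ // IoUringBuf, IoUringBufRingHeader, and IoUringBufReg mirror the kernel's io_uring_buf, + // io_uring_buf_ring, and io_uring_buf_reg structs used by provided buffer rings.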
+ [ActiveIssue("https://github.com/dotnet/runtime/issues/107981", TestPlatforms.Wasi)] + public static void IoUringProvidedBufferInterop_LayoutContract() + { + Type ioUringBufType = GetInteropSysNestedType("IoUringBuf"); + Assert.True(ioUringBufType.IsExplicitLayout); + Assert.Equal(0, Marshal.OffsetOf(ioUringBufType, "Address").ToInt32()); + Assert.Equal(8, Marshal.OffsetOf(ioUringBufType, "Length").ToInt32()); + Assert.Equal(12, Marshal.OffsetOf(ioUringBufType, "BufferId").ToInt32()); + Assert.Equal(14, Marshal.OffsetOf(ioUringBufType, "Reserved").ToInt32()); + Assert.Equal(16, Marshal.SizeOf(ioUringBufType)); + + Type ioUringBufRingHeaderType = GetInteropSysNestedType("IoUringBufRingHeader"); + Assert.True(ioUringBufRingHeaderType.IsExplicitLayout); + Assert.Equal(0, Marshal.OffsetOf(ioUringBufRingHeaderType, "Reserved1").ToInt32()); + Assert.Equal(8, Marshal.OffsetOf(ioUringBufRingHeaderType, "Reserved2").ToInt32()); + Assert.Equal(12, Marshal.OffsetOf(ioUringBufRingHeaderType, "Reserved3").ToInt32()); + Assert.Equal(14, Marshal.OffsetOf(ioUringBufRingHeaderType, "Tail").ToInt32()); + Assert.Equal(16, Marshal.SizeOf(ioUringBufRingHeaderType)); + + Type ioUringBufRegType = GetInteropSysNestedType("IoUringBufReg"); + Assert.True(ioUringBufRegType.IsExplicitLayout); + Assert.Equal(0, Marshal.OffsetOf(ioUringBufRegType, "RingAddress").ToInt32()); + Assert.Equal(8, Marshal.OffsetOf(ioUringBufRegType, "RingEntries").ToInt32()); + Assert.Equal(12, Marshal.OffsetOf(ioUringBufRegType, "BufferGroupId").ToInt32()); + Assert.Equal(14, Marshal.OffsetOf(ioUringBufRegType, "Padding").ToInt32()); + Assert.Equal(16, Marshal.OffsetOf(ioUringBufRegType, "Reserved0").ToInt32()); + Assert.Equal(24, Marshal.OffsetOf(ioUringBufRegType, "Reserved1").ToInt32()); + Assert.Equal(32, Marshal.OffsetOf(ioUringBufRegType, "Reserved2").ToInt32()); + Assert.Equal(40, Marshal.SizeOf(ioUringBufRegType)); + } + + [Fact] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring interop types are Linux-only. + [ActiveIssue("https://github.com/dotnet/runtime/issues/107981", TestPlatforms.Wasi)] + public static void IoUringCompletionInteropType_IsAbsent() + { + Type? 
type = typeof(Socket).Assembly.GetType("Interop+Sys+IoUringCompletion", throwOnError: false, ignoreCase: false); + Assert.Null(type); + } + + [Fact] + [PlatformSpecific(TestPlatforms.AnyUnix)] + [ActiveIssue("https://github.com/dotnet/runtime/issues/107981", TestPlatforms.Wasi)] + public static void MessageHeaderAndIoVector_LayoutContract() + { + Type messageHeaderType = GetInteropSysNestedType("MessageHeader"); + Type ioVectorType = GetInteropSysNestedType("IOVector"); + + Assert.True(messageHeaderType.IsLayoutSequential); + Assert.True(ioVectorType.IsLayoutSequential); + + int pointerSize = IntPtr.Size; + + Assert.Equal(0, Marshal.OffsetOf(ioVectorType, "Base").ToInt32()); + Assert.Equal(pointerSize, Marshal.OffsetOf(ioVectorType, "Count").ToInt32()); + Assert.Equal(pointerSize * 2, Marshal.SizeOf(ioVectorType)); + + Assert.Equal(0, Marshal.OffsetOf(messageHeaderType, "SocketAddress").ToInt32()); + Assert.Equal(pointerSize, Marshal.OffsetOf(messageHeaderType, "IOVectors").ToInt32()); + Assert.Equal(pointerSize * 2, Marshal.OffsetOf(messageHeaderType, "ControlBuffer").ToInt32()); + Assert.Equal(pointerSize * 3, Marshal.OffsetOf(messageHeaderType, "SocketAddressLen").ToInt32()); + Assert.Equal(pointerSize * 3 + sizeof(int), Marshal.OffsetOf(messageHeaderType, "IOVectorCount").ToInt32()); + Assert.Equal(pointerSize * 3 + sizeof(int) * 2, Marshal.OffsetOf(messageHeaderType, "ControlBufferLen").ToInt32()); + Assert.Equal(pointerSize * 3 + sizeof(int) * 3, Marshal.OffsetOf(messageHeaderType, "Flags").ToInt32()); + Assert.Equal(pointerSize * 3 + sizeof(int) * 4, Marshal.SizeOf(messageHeaderType)); + } + public static IEnumerable<object[]> SocketMethods_MemberData() { if (!OperatingSystem.IsWasi()) yield return new[] { "Sync" }; @@ -110,6 +296,13 @@ private static SocketHelperBase GetHelperBase(string socketMethod) }; } + private static Type GetInteropSysNestedType(string nestedTypeName) + { + Type? type = typeof(Socket).Assembly.GetType($"Interop+Sys+{nestedTypeName}", throwOnError: false, ignoreCase: false); + Assert.NotNull(type); + return type!; + } + [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] [MemberData(nameof(SocketMethods_WithBools_MemberData))] public async Task Connect_Success_ActivityRecorded(string connectMethod, bool ipv6) @@ -664,6 +857,20 @@ private static void VerifyEventCounters(ConcurrentQueue<(EventWrittenEventArgs E { Assert.True(datagramsSent[^1] > 0); } + + // Guard against telemetry drift: verify every canonical io_uring counter name exists. + // io_uring counters are only registered on Linux (OnEventCommand returns early on non-Linux). 
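+ // Only presence and non-negativity are asserted: EventCounter values are cumulative, and + // exact deltas depend on which io_uring paths the preceding operations exercised.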
+ if (OperatingSystem.IsLinux()) + { + Assert.Equal(s_expectedIoUringCounterNames, s_ioUringCounterNames); + foreach (string counterName in s_ioUringCounterNames) + { + Assert.True( + eventCounters.TryGetValue(counterName, out double[] ioUringCounterValues), + $"Missing io_uring EventCounter '{counterName}'."); + Assert.True(ioUringCounterValues[^1] >= 0, $"Unexpected negative counter value for '{counterName}'."); + } + } } } } diff --git a/src/native/libs/Common/pal_config.h.in b/src/native/libs/Common/pal_config.h.in index abc93358e69f6c..7f6475eeff8f0b 100644 --- a/src/native/libs/Common/pal_config.h.in +++ b/src/native/libs/Common/pal_config.h.in @@ -56,6 +56,7 @@ #cmakedefine01 HAVE_ETHTOOL_H #cmakedefine01 HAVE_SYS_POLL_H #cmakedefine01 HAVE_EPOLL +#cmakedefine01 HAVE_LINUX_IO_URING_H #cmakedefine01 HAVE_GETHOSTNAME #cmakedefine01 HAVE_GETNAMEINFO #cmakedefine01 HAVE_SOCKADDR_UN_SUN_PATH diff --git a/src/native/libs/System.Native/CMakeLists.txt b/src/native/libs/System.Native/CMakeLists.txt index 0b0078492eae5f..75dd5bd12bd4bc 100644 --- a/src/native/libs/System.Native/CMakeLists.txt +++ b/src/native/libs/System.Native/CMakeLists.txt @@ -9,6 +9,7 @@ set(NATIVE_SOURCES pal_io.c pal_maphardwaretype.c pal_memory.c + pal_io_uring_shim.c pal_random.c pal_runtimeinformation.c pal_string.c diff --git a/src/native/libs/System.Native/entrypoints.c b/src/native/libs/System.Native/entrypoints.c index 8414814970ea5c..03f11424e6603f 100644 --- a/src/native/libs/System.Native/entrypoints.c +++ b/src/native/libs/System.Native/entrypoints.c @@ -20,6 +20,7 @@ #include "pal_networkchange.h" #include "pal_networking.h" #include "pal_networkstatistics.h" +#include "pal_io_uring_shim.h" #include "pal_process.h" #include "pal_random.h" #include "pal_runtimeinformation.h" @@ -191,6 +192,16 @@ static const Entry s_sysNative[] = DllImportEntry(SystemNative_FreeSocketEventBuffer) DllImportEntry(SystemNative_TryChangeSocketEventRegistration) DllImportEntry(SystemNative_WaitForSocketEvents) + DllImportEntry(SystemNative_IoUringShimSetup) + DllImportEntry(SystemNative_IoUringShimEnter) + DllImportEntry(SystemNative_IoUringShimEnterExt) + DllImportEntry(SystemNative_IoUringShimRegister) + DllImportEntry(SystemNative_IoUringShimMmap) + DllImportEntry(SystemNative_IoUringShimMunmap) + DllImportEntry(SystemNative_IoUringShimCreateEventFd) + DllImportEntry(SystemNative_IoUringShimWriteEventFd) + DllImportEntry(SystemNative_IoUringShimReadEventFd) + DllImportEntry(SystemNative_IoUringShimCloseFd) DllImportEntry(SystemNative_GetWasiSocketDescriptor) DllImportEntry(SystemNative_PlatformSupportsDualModeIPv4PacketInfo) DllImportEntry(SystemNative_GetDomainSocketSizes) diff --git a/src/native/libs/System.Native/pal_io_uring_shim.c b/src/native/libs/System.Native/pal_io_uring_shim.c new file mode 100644 index 00000000000000..1e091928f0aacb --- /dev/null +++ b/src/native/libs/System.Native/pal_io_uring_shim.c @@ -0,0 +1,529 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#include "pal_config.h" +#include "pal_io_uring_shim.h" +#include "pal_errno.h" + +#include <errno.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> + +#if HAVE_LINUX_IO_URING_H && HAVE_SYS_POLL_H +#include <linux/io_uring.h> +#include <poll.h> +#include <sys/eventfd.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <unistd.h> +#endif + +#include <stdatomic.h> + +// Mirror the syscall-number defines from pal_io_uring.c for setup and enter. // Register is gated separately because __NR_io_uring_register may not exist. 
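+// When either syscall number is unavailable at build time, SHIM_HAVE_IO_URING stays 0 and the +// ENOSYS stubs at the bottom of this file are compiled instead, letting managed code fall back to epoll.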
+#if HAVE_LINUX_IO_URING_H && HAVE_SYS_POLL_H && \ + (defined(__NR_io_uring_setup) || defined(SYS_io_uring_setup)) && \ + (defined(__NR_io_uring_enter) || defined(SYS_io_uring_enter)) +#define SHIM_HAVE_IO_URING 1 +#else +#define SHIM_HAVE_IO_URING 0 +#endif + +#if SHIM_HAVE_IO_URING + +#define SHIM_EINTR_RETRY_LIMIT 1024 +#define SHIM_TEST_FORCE_ENTER_EINTR_RETRY_LIMIT_ONCE_ENV "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_ENTER_EINTR_RETRY_LIMIT_ONCE" + +#if defined(IORING_SETUP_CLOEXEC) +_Static_assert(IORING_SETUP_CLOEXEC == (1U << 19), "Unexpected IORING_SETUP_CLOEXEC value"); +#endif + +#if defined(__NR_io_uring_setup) +#define IO_URING_SYSCALL_SETUP __NR_io_uring_setup +#else +#define IO_URING_SYSCALL_SETUP SYS_io_uring_setup +#endif + +#if defined(__NR_io_uring_enter) +#define IO_URING_SYSCALL_ENTER __NR_io_uring_enter +#else +#define IO_URING_SYSCALL_ENTER SYS_io_uring_enter +#endif + +#if defined(__NR_io_uring_register) || defined(SYS_io_uring_register) +#define SHIM_HAVE_IO_URING_REGISTER 1 +#if defined(__NR_io_uring_register) +#define IO_URING_SYSCALL_REGISTER __NR_io_uring_register +#else +#define IO_URING_SYSCALL_REGISTER SYS_io_uring_register +#endif +#else +#define SHIM_HAVE_IO_URING_REGISTER 0 +#endif + +// The io_uring_getevents_arg struct for IORING_ENTER_EXT_ARG. +// Defined locally to avoid dependency on kernel header version. +typedef struct ShimIoUringGeteventsArg +{ + uint64_t sigmask; + uint32_t sigmask_sz; + uint32_t min_wait_usec; + uint64_t ts; +} ShimIoUringGeteventsArg; + +static int32_t ConsumeForceEnterEintrRetryLimitOnce(void) +{ + static atomic_int s_forceEnterEintrRetryLimitOnce = ATOMIC_VAR_INIT(-1); + + int32_t state = atomic_load_explicit(&s_forceEnterEintrRetryLimitOnce, memory_order_relaxed); + if (state < 0) + { + const char* configuredValue = getenv(SHIM_TEST_FORCE_ENTER_EINTR_RETRY_LIMIT_ONCE_ENV); + int32_t initializedState = configuredValue != NULL && strcmp(configuredValue, "1") == 0 ? 1 : 0; + int expected = -1; + if (!atomic_compare_exchange_strong_explicit( + &s_forceEnterEintrRetryLimitOnce, + &expected, + initializedState, + memory_order_relaxed, + memory_order_relaxed)) + { + initializedState = expected; + } + + state = initializedState; + } + + if (state == 0) + { + return 0; + } + + int expected = 1; + return atomic_compare_exchange_strong_explicit( + &s_forceEnterEintrRetryLimitOnce, + &expected, + 0, + memory_order_relaxed, + memory_order_relaxed) + ? 
1 + : 0; +} + +int32_t SystemNative_IoUringShimSetup(uint32_t entries, void* params, int32_t* ringFd) +{ + if (params == NULL || ringFd == NULL) + { + return Error_EFAULT; + } + + int fd = (int)syscall(IO_URING_SYSCALL_SETUP, entries, params); + if (fd < 0) + { + return SystemNative_ConvertErrorPlatformToPal(errno); + } + + *ringFd = fd; + return Error_SUCCESS; +} + +int32_t SystemNative_IoUringShimEnter(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags, int32_t* result) +{ + if (result == NULL) + { + return Error_EFAULT; + } + + if (ringFd < 0) + { + return Error_EBADF; + } + + if (toSubmit != 0 && ConsumeForceEnterEintrRetryLimitOnce() != 0) + { + return SystemNative_ConvertErrorPlatformToPal(EINTR); + } + + int ret; + int retryCount = 0; + while ((ret = (int)syscall(IO_URING_SYSCALL_ENTER, ringFd, toSubmit, minComplete, flags, NULL, 0)) < 0 && errno == EINTR) + { + if (++retryCount >= SHIM_EINTR_RETRY_LIMIT) + { + return SystemNative_ConvertErrorPlatformToPal(EINTR); + } + } + + if (ret < 0) + { + return SystemNative_ConvertErrorPlatformToPal(errno); + } + + *result = ret; + return Error_SUCCESS; +} + +int32_t SystemNative_IoUringShimEnterExt(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags, void* arg, int32_t* result) +{ + if (result == NULL) + { + return Error_EFAULT; + } + + if (ringFd < 0) + { + return Error_EBADF; + } + + if (toSubmit != 0 && ConsumeForceEnterEintrRetryLimitOnce() != 0) + { + return SystemNative_ConvertErrorPlatformToPal(EINTR); + } + + int ret; + int retryCount = 0; + while ((ret = (int)syscall(IO_URING_SYSCALL_ENTER, ringFd, toSubmit, minComplete, flags, arg, arg == NULL ? 0 : sizeof(ShimIoUringGeteventsArg))) < 0 && errno == EINTR) + { + if (++retryCount >= SHIM_EINTR_RETRY_LIMIT) + { + return SystemNative_ConvertErrorPlatformToPal(EINTR); + } + } + + if (ret < 0) + { + return SystemNative_ConvertErrorPlatformToPal(errno); + } + + *result = ret; + return Error_SUCCESS; +} + +int32_t SystemNative_IoUringShimRegister(int32_t ringFd, uint32_t opcode, void* arg, uint32_t nrArgs, int32_t* result) +{ + if (result == NULL) + { + return Error_EFAULT; + } + + if (ringFd < 0) + { + return Error_EBADF; + } + +#if SHIM_HAVE_IO_URING_REGISTER + int ret; + int retryCount = 0; + while ((ret = (int)syscall(IO_URING_SYSCALL_REGISTER, ringFd, opcode, arg, nrArgs)) < 0 && errno == EINTR) + { + if (++retryCount >= SHIM_EINTR_RETRY_LIMIT) + { + return SystemNative_ConvertErrorPlatformToPal(EINTR); + } + } + + if (ret < 0) + { + return SystemNative_ConvertErrorPlatformToPal(errno); + } + + *result = ret; + return Error_SUCCESS; +#else + (void)ringFd; + (void)opcode; + (void)arg; + (void)nrArgs; + (void)result; + return Error_ENOSYS; +#endif +} + +int32_t SystemNative_IoUringShimMmap(int32_t ringFd, uint64_t size, uint64_t offset, void** mappedPtr) +{ + if (mappedPtr == NULL) + { + return Error_EFAULT; + } + + if (ringFd < 0) + { + return Error_EBADF; + } + + void* ptr = mmap(0, (size_t)size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, ringFd, (off_t)offset); + if (ptr == MAP_FAILED) + { + return SystemNative_ConvertErrorPlatformToPal(errno); + } + + *mappedPtr = ptr; + return Error_SUCCESS; +} + +int32_t SystemNative_IoUringShimMunmap(void* addr, uint64_t size) +{ + if (addr == NULL) + { + return Error_EFAULT; + } + + if (munmap(addr, (size_t)size) != 0) + { + return SystemNative_ConvertErrorPlatformToPal(errno); + } + + return Error_SUCCESS; +} + +int32_t SystemNative_IoUringShimCreateEventFd(int32_t* eventFd) +{ + if (eventFd == 
NULL) + { + return Error_EFAULT; + } + + int fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (fd < 0) + { + return SystemNative_ConvertErrorPlatformToPal(errno); + } + + *eventFd = fd; + return Error_SUCCESS; +} + +int32_t SystemNative_IoUringShimWriteEventFd(int32_t eventFd) +{ + uint64_t val = 1; + ssize_t written; + int retryCount = 0; + while ((written = write(eventFd, &val, sizeof(val))) < 0 && errno == EINTR) + { + if (++retryCount >= SHIM_EINTR_RETRY_LIMIT) + { + return SystemNative_ConvertErrorPlatformToPal(EINTR); + } + } + + if (written < 0) + { + return SystemNative_ConvertErrorPlatformToPal(errno); + } + + if (written != (ssize_t)sizeof(val)) + { + return Error_EIO; + } + + return Error_SUCCESS; +} + +int32_t SystemNative_IoUringShimReadEventFd(int32_t eventFd, uint64_t* value) +{ + if (value == NULL) + { + return Error_EFAULT; + } + + ssize_t bytesRead; + int retryCount = 0; + while ((bytesRead = read(eventFd, value, sizeof(*value))) < 0 && errno == EINTR) + { + if (++retryCount >= SHIM_EINTR_RETRY_LIMIT) + { + return SystemNative_ConvertErrorPlatformToPal(EINTR); + } + } + + if (bytesRead < 0) + { + return SystemNative_ConvertErrorPlatformToPal(errno); + } + + if ((size_t)bytesRead != sizeof(*value)) + { + return Error_EIO; + } + + return Error_SUCCESS; +} + +int32_t SystemNative_IoUringShimCloseFd(int32_t fd) +{ + // Linux close(2) closes the descriptor even when interrupted (EINTR). + // Retrying risks closing a reused descriptor opened by another thread. + if (close(fd) != 0) + { + return SystemNative_ConvertErrorPlatformToPal(errno); + } + + return Error_SUCCESS; +} + +// Layout assertions for managed interop structs (kernel struct mirrors). +c_static_assert(sizeof(size_t) >= 8); +c_static_assert(sizeof(size_t) == sizeof(void*)); +c_static_assert(sizeof(struct io_uring_cqe) == 16); +c_static_assert(offsetof(struct io_uring_cqe, user_data) == 0); +c_static_assert(offsetof(struct io_uring_cqe, res) == 8); +c_static_assert(offsetof(struct io_uring_cqe, flags) == 12); + +c_static_assert(sizeof(struct io_uring_params) == 120); +c_static_assert(offsetof(struct io_uring_params, sq_entries) == 0); +c_static_assert(offsetof(struct io_uring_params, cq_entries) == 4); +c_static_assert(offsetof(struct io_uring_params, flags) == 8); +c_static_assert(offsetof(struct io_uring_params, features) == 20); +c_static_assert(offsetof(struct io_uring_params, sq_off) == 40); +c_static_assert(offsetof(struct io_uring_params, cq_off) == 80); + +c_static_assert(sizeof(struct io_sqring_offsets) == 40); +c_static_assert(offsetof(struct io_sqring_offsets, head) == 0); +c_static_assert(offsetof(struct io_sqring_offsets, tail) == 4); +c_static_assert(offsetof(struct io_sqring_offsets, ring_mask) == 8); +c_static_assert(offsetof(struct io_sqring_offsets, ring_entries) == 12); +c_static_assert(offsetof(struct io_sqring_offsets, flags) == 16); +c_static_assert(offsetof(struct io_sqring_offsets, dropped) == 20); +c_static_assert(offsetof(struct io_sqring_offsets, array) == 24); + +c_static_assert(sizeof(struct io_cqring_offsets) == 40); +c_static_assert(offsetof(struct io_cqring_offsets, head) == 0); +c_static_assert(offsetof(struct io_cqring_offsets, tail) == 4); +c_static_assert(offsetof(struct io_cqring_offsets, overflow) == 16); +c_static_assert(offsetof(struct io_cqring_offsets, cqes) == 20); + +#else // !SHIM_HAVE_IO_URING + +// Stub implementations when io_uring is not available. 
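+// The stubs keep the real functions' argument validation (EFAULT for null out-parameters, +// EBADF for negative ring fds) before reporting ENOSYS, so callers observe consistent error ordering.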
+ +int32_t SystemNative_IoUringShimSetup(uint32_t entries, void* params, int32_t* ringFd) +{ + (void)entries; + if (params == NULL || ringFd == NULL) + { + return Error_EFAULT; + } + + *ringFd = -1; + return Error_ENOSYS; +} + +int32_t SystemNative_IoUringShimEnter(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags, int32_t* result) +{ + (void)ringFd; (void)toSubmit; (void)minComplete; (void)flags; + if (result == NULL) + { + return Error_EFAULT; + } + + if (ringFd < 0) + { + return Error_EBADF; + } + + *result = 0; + return Error_ENOSYS; +} + +int32_t SystemNative_IoUringShimEnterExt(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags, void* arg, int32_t* result) +{ + (void)ringFd; (void)toSubmit; (void)minComplete; (void)flags; (void)arg; + if (result == NULL) + { + return Error_EFAULT; + } + + if (ringFd < 0) + { + return Error_EBADF; + } + + *result = 0; + return Error_ENOSYS; +} + +int32_t SystemNative_IoUringShimRegister(int32_t ringFd, uint32_t opcode, void* arg, uint32_t nrArgs, int32_t* result) +{ + (void)ringFd; (void)opcode; (void)arg; (void)nrArgs; + if (result == NULL) + { + return Error_EFAULT; + } + + if (ringFd < 0) + { + return Error_EBADF; + } + + *result = 0; + return Error_ENOSYS; +} + +int32_t SystemNative_IoUringShimMmap(int32_t ringFd, uint64_t size, uint64_t offset, void** mappedPtr) +{ + (void)ringFd; (void)size; (void)offset; + if (mappedPtr == NULL) + { + return Error_EFAULT; + } + + if (ringFd < 0) + { + return Error_EBADF; + } + + *mappedPtr = NULL; + return Error_ENOSYS; +} + +int32_t SystemNative_IoUringShimMunmap(void* addr, uint64_t size) +{ + (void)size; + if (addr == NULL) + { + return Error_EFAULT; + } + + return Error_ENOSYS; +} + +int32_t SystemNative_IoUringShimCreateEventFd(int32_t* eventFd) +{ + if (eventFd == NULL) + { + return Error_EFAULT; + } + + *eventFd = -1; + return Error_ENOSYS; +} + +int32_t SystemNative_IoUringShimWriteEventFd(int32_t eventFd) +{ + (void)eventFd; + return Error_ENOSYS; +} + +int32_t SystemNative_IoUringShimReadEventFd(int32_t eventFd, uint64_t* value) +{ + (void)eventFd; + if (value == NULL) + { + return Error_EFAULT; + } + + *value = 0; + return Error_ENOSYS; +} + +int32_t SystemNative_IoUringShimCloseFd(int32_t fd) +{ + (void)fd; + return Error_ENOSYS; +} + +#endif // SHIM_HAVE_IO_URING diff --git a/src/native/libs/System.Native/pal_io_uring_shim.h b/src/native/libs/System.Native/pal_io_uring_shim.h new file mode 100644 index 00000000000000..2f7a07888827d2 --- /dev/null +++ b/src/native/libs/System.Native/pal_io_uring_shim.h @@ -0,0 +1,27 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
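+// Declares the raw io_uring shim surface: every wrapper returns a PAL Error code and reports +// syscall results through out-parameters rather than errno. Typical call order (sketch): +// Setup -> Mmap (ring regions) -> Enter/EnterExt submission loop -> Munmap -> CloseFd.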
+ +#pragma once + +#include "pal_compiler.h" +#include "pal_types.h" + +PALEXPORT int32_t SystemNative_IoUringShimSetup(uint32_t entries, void* params, int32_t* ringFd); + +PALEXPORT int32_t SystemNative_IoUringShimEnter(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags, int32_t* result); + +PALEXPORT int32_t SystemNative_IoUringShimEnterExt(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags, void* arg, int32_t* result); + +PALEXPORT int32_t SystemNative_IoUringShimRegister(int32_t ringFd, uint32_t opcode, void* arg, uint32_t nrArgs, int32_t* result); + +PALEXPORT int32_t SystemNative_IoUringShimMmap(int32_t ringFd, uint64_t size, uint64_t offset, void** mappedPtr); + +PALEXPORT int32_t SystemNative_IoUringShimMunmap(void* addr, uint64_t size); + +PALEXPORT int32_t SystemNative_IoUringShimCreateEventFd(int32_t* eventFd); + +PALEXPORT int32_t SystemNative_IoUringShimWriteEventFd(int32_t eventFd); + +PALEXPORT int32_t SystemNative_IoUringShimReadEventFd(int32_t eventFd, uint64_t* value); + +PALEXPORT int32_t SystemNative_IoUringShimCloseFd(int32_t fd); diff --git a/src/native/libs/configure.cmake b/src/native/libs/configure.cmake index 4da74e115c6db8..b2701be5757173 100644 --- a/src/native/libs/configure.cmake +++ b/src/native/libs/configure.cmake @@ -470,6 +470,10 @@ check_symbol_exists( sys/epoll.h HAVE_EPOLL) +check_include_files( + "linux/io_uring.h;sys/syscall.h" + HAVE_LINUX_IO_URING_H) + check_symbol_exists( gethostname unistd.h