From 20433c664704eb796ea733250dcdcbed4714277d Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 2026 04:13:56 +0000 Subject: [PATCH 001/258] Add experimental io_uring socket event engine for Linux Introduce an opt-in io_uring-based socket async engine as an alternative to epoll on Linux kernels >= 5.10. The backend is enabled by setting DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1 and transparently falls back to epoll when unavailable or unsupported. This uses raw io_uring syscalls (no liburing dependency) with IORING_OP_POLL_ADD/REMOVE for readiness notifications, keeping the same semantics as the existing epoll path. Includes a functional smoke test, CMake header detection, and a validation guide. Contributes to #753 --- .../testing-linux-sockets-io-uring.md | 80 ++ docs/workflow/testing/libraries/testing.md | 4 + .../Net/Sockets/SocketAsyncEngine.Unix.cs | 1 + .../tests/FunctionalTests/IoUring.Unix.cs | 67 ++ .../System.Net.Sockets.Tests.csproj | 1 + src/native/libs/Common/pal_config.h.in | 2 + .../libs/System.Native/pal_networking.c | 793 +++++++++++++++++- src/native/libs/configure.cmake | 10 + 8 files changed, 954 insertions(+), 4 deletions(-) create mode 100644 docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md create mode 100644 src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs diff --git a/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md b/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md new file mode 100644 index 00000000000000..883582873d19af --- /dev/null +++ b/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md @@ -0,0 +1,80 @@ +# Linux Sockets io_uring Validation + +This checklist is for validating the experimental Linux sockets io_uring backend enabled by: + +`DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1` + +The backend is opt-in and must safely fall back to epoll when io_uring is unavailable. 
+ +## Prerequisites + +- Linux machine (x64 or arm64) +- .NET runtime repo built for `clr+libs` +- For direct io_uring path validation: Linux kernel `>= 5.10` + +## Functional Validation + +Run the targeted sockets functional test added for the opt-in path: + +```bash +./build.sh -subset libs.tests -test \ + /p:XunitMethodName=System.Net.Sockets.Tests.IoUring.IoUringOptIn_DoesNotBreakAsyncSocketWorkflows +``` + +Expected: + +1. Test passes with `DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1`. +2. Test passes on kernels `< 5.10` (fallback to epoll). +3. No hangs or intermittent failures across multiple runs. + +Suggested stress rerun: + +```bash +for i in $(seq 1 20); do + ./build.sh -subset libs.tests -test \ + /p:XunitMethodName=System.Net.Sockets.Tests.IoUring.IoUringOptIn_DoesNotBreakAsyncSocketWorkflows || break +done +``` + +## Backend Verification (Optional) + +Confirm io_uring syscalls are used on supported kernels: + +```bash +strace -f -e trace=io_uring_setup,io_uring_enter,epoll_create1 <test command> +``` + +Expected on kernel `>= 5.10` with opt-in enabled: + +1. `io_uring_setup` is present. +2. `io_uring_enter` is present. +3. `epoll_create1` is not used by the sockets event-port path. + +## Performance Checklist + +Use the same workload for baseline and experiment. + +Baseline: + +- `DOTNET_SYSTEM_NET_SOCKETS_IO_URING=0` (or unset) + +Experiment: + +- `DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1` + +Measure: + +1. Throughput (MB/s or req/s) +2. Process CPU% +3. `dotnet-counters`: + - `System.Runtime` CPU Usage + - ThreadPool Thread Count + - ThreadPool Completed Work Item Count +4. Error/stability signals (timeouts, socket exceptions, reconnect loops) + +Acceptance for initial PR: + +1. No functional regressions. +2. No clear throughput regression in steady state. +3. No sustained CPU regression versus epoll baseline for the same workload. +4. Fallback behavior remains correct on unsupported kernels/configurations. 
diff --git a/docs/workflow/testing/libraries/testing.md b/docs/workflow/testing/libraries/testing.md index ca7eb206c03ee8..32569dbc9e60d8 100644 --- a/docs/workflow/testing/libraries/testing.md +++ b/docs/workflow/testing/libraries/testing.md @@ -1,5 +1,9 @@ # Testing Libraries +Additional guides: + +- [Linux sockets io_uring validation](testing-linux-sockets-io-uring.md) + ## Full Build and Test Run These example commands automate the test run and all pre-requisite build steps in a single command from a clean enlistment. diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs index ae9b6c9095e43f..d2ffeb9eec45d2 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs @@ -28,6 +28,7 @@ internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem private static int GetEngineCount() { // The responsibility of SocketAsyncEngine is to get notifications from epoll|kqueue + // (or io_uring on Linux when enabled in the native shim) // and schedule corresponding work items to ThreadPool (socket reads and writes). // // Using TechEmpower benchmarks that generate a LOT of SMALL socket reads and writes under a VERY HIGH load diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs new file mode 100644 index 00000000000000..bbe8a515939cbc --- /dev/null +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs @@ -0,0 +1,67 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Net; +using System.Threading.Tasks; +using Microsoft.DotNet.RemoteExecutor; +using Xunit; + +namespace System.Net.Sockets.Tests +{ + public class IoUring + { + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringOptIn_DoesNotBreakAsyncSocketWorkflows() + { + RemoteInvokeOptions options = new RemoteInvokeOptions(); + options.StartInfo.EnvironmentVariables.Add("DOTNET_SYSTEM_NET_SOCKETS_IO_URING", "1"); + options.TimeOut = (int)TimeSpan.FromMinutes(2).TotalMilliseconds; + + await RemoteExecutor.Invoke(static async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(1); + + using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + Task acceptTask = listener.AcceptAsync(); + + await client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!); + using Socket server = await acceptTask; + + byte[] sendBuffer = new byte[] { 1 }; + byte[] receiveBuffer = new byte[1]; + + for (int i = 0; i < 64; i++) + { + ValueTask serverReceiveTask = server.ReceiveAsync(receiveBuffer, SocketFlags.None); + await Task.Yield(); + + int clientSent = await client.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, clientSent); + + int serverReceived = await serverReceiveTask; + Assert.Equal(1, serverReceived); + Assert.Equal(sendBuffer[0], receiveBuffer[0]); + + ValueTask clientReceiveTask = client.ReceiveAsync(receiveBuffer, SocketFlags.None); + await Task.Yield(); + + int serverSent = await server.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, serverSent); + + int clientReceived = await clientReceiveTask; + Assert.Equal(1, clientReceived); + Assert.Equal(sendBuffer[0], receiveBuffer[0]); + + unchecked + { + sendBuffer[0]++; + } + } + }, 
options).DisposeAsync(); + } + } +} diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/System.Net.Sockets.Tests.csproj b/src/libraries/System.Net.Sockets/tests/FunctionalTests/System.Net.Sockets.Tests.csproj index 43844aea397681..b4fe9f6d079c9f 100644 --- a/src/libraries/System.Net.Sockets/tests/FunctionalTests/System.Net.Sockets.Tests.csproj +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/System.Net.Sockets.Tests.csproj @@ -22,6 +22,7 @@ + diff --git a/src/native/libs/Common/pal_config.h.in b/src/native/libs/Common/pal_config.h.in index abc93358e69f6c..24370db1672c8e 100644 --- a/src/native/libs/Common/pal_config.h.in +++ b/src/native/libs/Common/pal_config.h.in @@ -56,6 +56,8 @@ #cmakedefine01 HAVE_ETHTOOL_H #cmakedefine01 HAVE_SYS_POLL_H #cmakedefine01 HAVE_EPOLL +#cmakedefine01 HAVE_LINUX_IO_URING_H +#cmakedefine01 HAVE_IO_URING_POLL32_EVENTS #cmakedefine01 HAVE_GETHOSTNAME #cmakedefine01 HAVE_GETNAMEINFO #cmakedefine01 HAVE_SOCKADDR_UN_SUN_PATH diff --git a/src/native/libs/System.Native/pal_networking.c b/src/native/libs/System.Native/pal_networking.c index 3b460d4c4e8296..8b0cf98f0f60d7 100644 --- a/src/native/libs/System.Native/pal_networking.c +++ b/src/native/libs/System.Native/pal_networking.c @@ -26,6 +26,18 @@ #include #include #endif +#if HAVE_LINUX_IO_URING_H && HAVE_SYS_POLL_H +#include +#include +#include +#include +#endif + +#if HAVE_LINUX_IO_URING_H && HAVE_SYS_POLL_H && defined(__NR_io_uring_setup) && defined(__NR_io_uring_enter) +#define HAVE_LINUX_IO_URING_SOCKET_ENGINE 1 +#else +#define HAVE_LINUX_IO_URING_SOCKET_ENGINE 0 +#endif #if HAVE_SYS_PROCINFO_H #include #include @@ -3102,7 +3114,7 @@ static uint32_t GetEPollEvents(SocketEvents events) (((events & SocketEvents_SA_ERROR) != 0) ? 
EPOLLERR : 0); } -static int32_t CreateSocketEventPortInner(int32_t* port) +static int32_t CreateSocketEventPortInnerEpoll(int32_t* port) { assert(port != NULL); @@ -3117,13 +3129,13 @@ static int32_t CreateSocketEventPortInner(int32_t* port) return Error_SUCCESS; } -static int32_t CloseSocketEventPortInner(int32_t port) +static int32_t CloseSocketEventPortInnerEpoll(int32_t port) { int err = close(port); return err == 0 || (err < 0 && errno == EINTR) ? Error_SUCCESS : SystemNative_ConvertErrorPlatformToPal(errno); } -static int32_t TryChangeSocketEventRegistrationInner( +static int32_t TryChangeSocketEventRegistrationInnerEpoll( int32_t port, int32_t socket, SocketEvents currentEvents, SocketEvents newEvents, uintptr_t data) { assert(currentEvents != newEvents); @@ -3166,7 +3178,7 @@ static void ConvertEventEPollToSocketAsync(SocketEvent* sae, struct epoll_event* sae->Events = GetSocketEvents(events); } -static int32_t WaitForSocketEventsInner(int32_t port, SocketEvent* buffer, int32_t* count) +static int32_t WaitForSocketEventsInnerEpoll(int32_t port, SocketEvent* buffer, int32_t* count) { assert(buffer != NULL); assert(count != NULL); @@ -3213,6 +3225,779 @@ static int32_t WaitForSocketEventsInner(int32_t port, SocketEvent* buffer, int32 return Error_SUCCESS; } +#if HAVE_LINUX_IO_URING_SOCKET_ENGINE + +enum +{ + IoUringMinKernelMajor = 5, + IoUringMinKernelMinor = 10, + IoUringQueueEntries = 1024, + IoUringPortNotFound = INT32_MIN + 1 +}; + +typedef struct SocketEventRegistration +{ + int32_t Socket; + SocketEvents Events; + uintptr_t Data; + uint64_t RequestId; + struct SocketEventRegistration* Next; +} SocketEventRegistration; + +typedef struct IoUringSocketEventPortState +{ + int32_t RingFd; + int8_t UsesSingleMmap; + int8_t IsClosing; + int32_t RefCount; + void* SqRingPtr; + size_t SqRingSize; + void* CqRingPtr; + size_t CqRingSize; + struct io_uring_sqe* Sqes; + size_t SqesSize; + uint32_t* SqHead; + uint32_t* SqTail; + uint32_t* SqMask; + uint32_t* 
SqEntries; + uint32_t* SqArray; + uint32_t* CqHead; + uint32_t* CqTail; + uint32_t* CqMask; + struct io_uring_cqe* Cqes; + uint32_t PendingSubmissions; + uint64_t NextRequestId; + SocketEventRegistration* Registrations; + pthread_mutex_t Lock; + struct IoUringSocketEventPortState* Next; +} IoUringSocketEventPortState; + +static pthread_mutex_t g_ioUringSocketEventPortsLock = PTHREAD_MUTEX_INITIALIZER; +static IoUringSocketEventPortState* g_ioUringSocketEventPorts = NULL; + +static void FreeIoUringState(IoUringSocketEventPortState* state); + +static int IoUringSetup(uint32_t entries, struct io_uring_params* params) +{ + return (int)syscall(__NR_io_uring_setup, entries, params); +} + +static int IoUringEnter(int32_t ringFd, uint32_t toSubmit, uint32_t minComplete, uint32_t flags) +{ + int result; + while ((result = (int)syscall(__NR_io_uring_enter, ringFd, toSubmit, minComplete, flags, NULL, 0)) < 0 && errno == EINTR); + return result; +} + +static bool IsIoUringSocketEngineEnabled(void) +{ + // Keep epoll as the default path; io_uring is currently opt-in. 
+ const char* value = getenv("DOTNET_SYSTEM_NET_SOCKETS_IO_URING"); + return value != NULL && strcmp(value, "1") == 0; +} + +static bool IsIoUringKernelVersionSupported(void) +{ + struct utsname uts; + if (uname(&uts) != 0) + { + return false; + } + + uint32_t major; + uint32_t minor; + if (sscanf(uts.release, "%u.%u", &major, &minor) != 2) + { + return false; + } + + return major > IoUringMinKernelMajor || (major == IoUringMinKernelMajor && minor >= IoUringMinKernelMinor); +} + +static IoUringSocketEventPortState* FindIoUringSocketEventPortStateNoLock(int32_t ringFd) +{ + IoUringSocketEventPortState* state = g_ioUringSocketEventPorts; + while (state != NULL) + { + if (state->RingFd == ringFd) + { + return state; + } + + state = state->Next; + } + + return NULL; +} + +static IoUringSocketEventPortState* AcquireIoUringSocketEventPortState(int32_t ringFd) +{ + pthread_mutex_lock(&g_ioUringSocketEventPortsLock); + IoUringSocketEventPortState* state = FindIoUringSocketEventPortStateNoLock(ringFd); + if (state != NULL && !state->IsClosing) + { + state->RefCount++; + pthread_mutex_lock(&state->Lock); + } + else + { + state = NULL; + } + pthread_mutex_unlock(&g_ioUringSocketEventPortsLock); + + return state; +} + +static void ReleaseIoUringSocketEventPortState(IoUringSocketEventPortState* state) +{ + assert(state != NULL); + pthread_mutex_unlock(&state->Lock); + + int8_t shouldFree = 0; + pthread_mutex_lock(&g_ioUringSocketEventPortsLock); + assert(state->RefCount > 0); + state->RefCount--; + shouldFree = state->IsClosing && state->RefCount == 0; + pthread_mutex_unlock(&g_ioUringSocketEventPortsLock); + + if (shouldFree) + { + FreeIoUringState(state); + } +} + +static SocketEventRegistration* FindRegistrationBySocketLocked(IoUringSocketEventPortState* state, int32_t socket, SocketEventRegistration** previous) +{ + assert(state != NULL); + + SocketEventRegistration* prev = NULL; + SocketEventRegistration* current = state->Registrations; + while (current != NULL) + { + if 
(current->Socket == socket) + { + if (previous != NULL) + { + *previous = prev; + } + return current; + } + + prev = current; + current = current->Next; + } + + if (previous != NULL) + { + *previous = NULL; + } + return NULL; +} + +static SocketEventRegistration* FindRegistrationByRequestIdLocked(IoUringSocketEventPortState* state, uint64_t requestId) +{ + assert(state != NULL); + + SocketEventRegistration* current = state->Registrations; + while (current != NULL) + { + if (current->RequestId == requestId) + { + return current; + } + + current = current->Next; + } + + return NULL; +} + +static void RemoveRegistrationLocked(IoUringSocketEventPortState* state, SocketEventRegistration* registration, SocketEventRegistration* previous) +{ + assert(state != NULL); + assert(registration != NULL); + + if (previous == NULL) + { + state->Registrations = registration->Next; + } + else + { + previous->Next = registration->Next; + } + + free(registration); +} + +static uint32_t GetIoUringPollEvents(SocketEvents events) +{ + uint32_t pollEvents = (((events & SocketEvents_SA_READ) != 0) ? POLLIN : 0) | (((events & SocketEvents_SA_WRITE) != 0) ? POLLOUT : 0) | + POLLERR | POLLHUP; + +#ifdef POLLRDHUP + if ((events & SocketEvents_SA_READCLOSE) != 0) + { + pollEvents |= POLLRDHUP; + } +#endif + + return pollEvents; +} + +static SocketEvents GetSocketEventsFromIoUringPollResult(int32_t result) +{ + uint32_t events = (uint32_t)result; + if ((events & POLLHUP) != 0) + { + events = (events & ((uint32_t)~POLLHUP)) | POLLIN | POLLOUT; + } + + int32_t asyncEvents = (((events & POLLIN) != 0) ? SocketEvents_SA_READ : 0) | (((events & POLLOUT) != 0) ? SocketEvents_SA_WRITE : 0) | +#ifdef POLLRDHUP + (((events & POLLRDHUP) != 0) ? SocketEvents_SA_READCLOSE : 0) | +#else + 0 | +#endif + (((events & POLLERR) != 0) ? 
SocketEvents_SA_ERROR : 0); + return (SocketEvents)asyncEvents; +} + +static int32_t SubmitIoUringPendingEntriesLocked(IoUringSocketEventPortState* state) +{ + assert(state != NULL); + + while (state->PendingSubmissions > 0) + { + int result = IoUringEnter(state->RingFd, state->PendingSubmissions, 0, 0); + if (result < 0) + { + return SystemNative_ConvertErrorPlatformToPal(errno); + } + if (result == 0) + { + return Error_EAGAIN; + } + + state->PendingSubmissions -= (uint32_t)result; + } + + return Error_SUCCESS; +} + +static struct io_uring_sqe* GetIoUringSqeLocked(IoUringSocketEventPortState* state, int32_t* error) +{ + assert(state != NULL); + assert(error != NULL); + + while (true) + { + uint32_t head = __atomic_load_n(state->SqHead, __ATOMIC_ACQUIRE); + uint32_t tail = __atomic_load_n(state->SqTail, __ATOMIC_RELAXED); + uint32_t sqEntries = *state->SqEntries; + + if (tail - head < sqEntries) + { + uint32_t index = tail & *state->SqMask; + struct io_uring_sqe* sqe = &state->Sqes[index]; + memset(sqe, 0, sizeof(struct io_uring_sqe)); + state->SqArray[index] = index; + __atomic_store_n(state->SqTail, tail + 1, __ATOMIC_RELEASE); + state->PendingSubmissions++; + *error = Error_SUCCESS; + return sqe; + } + + *error = SubmitIoUringPendingEntriesLocked(state); + if (*error != Error_SUCCESS) + { + return NULL; + } + } +} + +static int32_t QueueIoUringPollRemoveLocked(IoUringSocketEventPortState* state, uint64_t requestId) +{ + if (requestId == 0) + { + return Error_SUCCESS; + } + + int32_t error; + struct io_uring_sqe* sqe = GetIoUringSqeLocked(state, &error); + if (sqe == NULL) + { + return error; + } + + sqe->opcode = IORING_OP_POLL_REMOVE; + sqe->addr = requestId; + sqe->user_data = 0; + + return Error_SUCCESS; +} + +static int32_t QueueIoUringPollAddLocked(IoUringSocketEventPortState* state, SocketEventRegistration* registration) +{ + assert(state != NULL); + assert(registration != NULL); + + int32_t error; + struct io_uring_sqe* sqe = GetIoUringSqeLocked(state, 
&error); + if (sqe == NULL) + { + return error; + } + + uint64_t requestId = ++state->NextRequestId; + if (requestId == 0) + { + requestId = ++state->NextRequestId; + } + + sqe->opcode = IORING_OP_POLL_ADD; + sqe->fd = registration->Socket; + uint32_t pollEvents = GetIoUringPollEvents(registration->Events); +#if HAVE_IO_URING_POLL32_EVENTS + sqe->poll32_events = pollEvents; +#else + // The poll flags currently used by socket readiness notifications fit in the legacy 16-bit field. + sqe->poll_events = (uint16_t)pollEvents; +#endif + sqe->user_data = requestId; + registration->RequestId = requestId; + + return Error_SUCCESS; +} + +static void FreeIoUringState(IoUringSocketEventPortState* state) +{ + if (state == NULL) + { + return; + } + + SocketEventRegistration* registration = state->Registrations; + while (registration != NULL) + { + SocketEventRegistration* next = registration->Next; + free(registration); + registration = next; + } + + if (state->Sqes != NULL && state->Sqes != MAP_FAILED) + { + munmap(state->Sqes, state->SqesSize); + } + + if (state->SqRingPtr != NULL && state->SqRingPtr != MAP_FAILED) + { + munmap(state->SqRingPtr, state->SqRingSize); + } + + if (!state->UsesSingleMmap && state->CqRingPtr != NULL && state->CqRingPtr != MAP_FAILED) + { + munmap(state->CqRingPtr, state->CqRingSize); + } + + if (state->RingFd != -1) + { + close(state->RingFd); + } + + pthread_mutex_destroy(&state->Lock); + free(state); +} + +static int32_t TryCreateSocketEventPortInnerIoUring(int32_t* port) +{ + assert(port != NULL); + + if (!IsIoUringSocketEngineEnabled() || !IsIoUringKernelVersionSupported()) + { + return Error_ENOSYS; + } + + IoUringSocketEventPortState* state = (IoUringSocketEventPortState*)calloc(1, sizeof(IoUringSocketEventPortState)); + if (state == NULL) + { + return Error_ENOMEM; + } + + state->RingFd = -1; + state->SqRingPtr = MAP_FAILED; + state->CqRingPtr = MAP_FAILED; + state->Sqes = MAP_FAILED; + state->NextRequestId = 1; + state->RefCount = 1; + + if 
(pthread_mutex_init(&state->Lock, NULL) != 0) + { + free(state); + return Error_EINVAL; + } + + struct io_uring_params params; + memset(&params, 0, sizeof(params)); + int ringFd = IoUringSetup(IoUringQueueEntries, &params); + if (ringFd < 0) + { + int32_t error = SystemNative_ConvertErrorPlatformToPal(errno); + FreeIoUringState(state); + return error; + } + + state->RingFd = ringFd; + state->SqRingSize = params.sq_off.array + params.sq_entries * sizeof(uint32_t); + state->CqRingSize = params.cq_off.cqes + params.cq_entries * sizeof(struct io_uring_cqe); + + if ((params.features & IORING_FEAT_SINGLE_MMAP) != 0) + { + state->UsesSingleMmap = 1; + size_t ringSize = state->SqRingSize > state->CqRingSize ? state->SqRingSize : state->CqRingSize; + state->SqRingPtr = mmap(NULL, ringSize, PROT_READ | PROT_WRITE, MAP_SHARED, ringFd, IORING_OFF_SQ_RING); + if (state->SqRingPtr == MAP_FAILED) + { + int32_t error = SystemNative_ConvertErrorPlatformToPal(errno); + FreeIoUringState(state); + return error; + } + + state->SqRingSize = ringSize; + state->CqRingSize = ringSize; + state->CqRingPtr = state->SqRingPtr; + } + else + { + state->SqRingPtr = mmap(NULL, state->SqRingSize, PROT_READ | PROT_WRITE, MAP_SHARED, ringFd, IORING_OFF_SQ_RING); + if (state->SqRingPtr == MAP_FAILED) + { + int32_t error = SystemNative_ConvertErrorPlatformToPal(errno); + FreeIoUringState(state); + return error; + } + + state->CqRingPtr = mmap(NULL, state->CqRingSize, PROT_READ | PROT_WRITE, MAP_SHARED, ringFd, IORING_OFF_CQ_RING); + if (state->CqRingPtr == MAP_FAILED) + { + int32_t error = SystemNative_ConvertErrorPlatformToPal(errno); + FreeIoUringState(state); + return error; + } + } + + state->SqesSize = params.sq_entries * sizeof(struct io_uring_sqe); + state->Sqes = mmap(NULL, state->SqesSize, PROT_READ | PROT_WRITE, MAP_SHARED, ringFd, IORING_OFF_SQES); + if (state->Sqes == MAP_FAILED) + { + int32_t error = SystemNative_ConvertErrorPlatformToPal(errno); + FreeIoUringState(state); + return error; + } + + 
state->SqHead = (uint32_t*)((uint8_t*)state->SqRingPtr + params.sq_off.head); + state->SqTail = (uint32_t*)((uint8_t*)state->SqRingPtr + params.sq_off.tail); + state->SqMask = (uint32_t*)((uint8_t*)state->SqRingPtr + params.sq_off.ring_mask); + state->SqEntries = (uint32_t*)((uint8_t*)state->SqRingPtr + params.sq_off.ring_entries); + state->SqArray = (uint32_t*)((uint8_t*)state->SqRingPtr + params.sq_off.array); + state->CqHead = (uint32_t*)((uint8_t*)state->CqRingPtr + params.cq_off.head); + state->CqTail = (uint32_t*)((uint8_t*)state->CqRingPtr + params.cq_off.tail); + state->CqMask = (uint32_t*)((uint8_t*)state->CqRingPtr + params.cq_off.ring_mask); + state->Cqes = (struct io_uring_cqe*)((uint8_t*)state->CqRingPtr + params.cq_off.cqes); + + pthread_mutex_lock(&g_ioUringSocketEventPortsLock); + state->Next = g_ioUringSocketEventPorts; + g_ioUringSocketEventPorts = state; + pthread_mutex_unlock(&g_ioUringSocketEventPortsLock); + + *port = ringFd; + return Error_SUCCESS; +} + +static int32_t CloseSocketEventPortInnerIoUring(int32_t port) +{ + pthread_mutex_lock(&g_ioUringSocketEventPortsLock); + + IoUringSocketEventPortState* previous = NULL; + IoUringSocketEventPortState* state = g_ioUringSocketEventPorts; + while (state != NULL) + { + if (state->RingFd == port) + { + break; + } + + previous = state; + state = state->Next; + } + + if (state == NULL) + { + pthread_mutex_unlock(&g_ioUringSocketEventPortsLock); + return IoUringPortNotFound; + } + + pthread_mutex_lock(&state->Lock); + state->IsClosing = 1; + if (previous == NULL) + { + g_ioUringSocketEventPorts = state->Next; + } + else + { + previous->Next = state->Next; + } + + assert(state->RefCount > 0); + state->RefCount--; + int8_t shouldFree = state->RefCount == 0; + + pthread_mutex_unlock(&g_ioUringSocketEventPortsLock); + pthread_mutex_unlock(&state->Lock); + + if (shouldFree) + { + FreeIoUringState(state); + } + return Error_SUCCESS; +} + +static int32_t TryChangeSocketEventRegistrationInnerIoUring( + 
int32_t port, int32_t socket, SocketEvents currentEvents, SocketEvents newEvents, uintptr_t data) +{ + (void)currentEvents; + + IoUringSocketEventPortState* state = AcquireIoUringSocketEventPortState(port); + if (state == NULL) + { + return IoUringPortNotFound; + } + + int32_t error = Error_SUCCESS; + + SocketEventRegistration* previous = NULL; + SocketEventRegistration* registration = FindRegistrationBySocketLocked(state, socket, &previous); + + if (registration != NULL && registration->RequestId != 0) + { + error = QueueIoUringPollRemoveLocked(state, registration->RequestId); + if (error != Error_SUCCESS) + { + ReleaseIoUringSocketEventPortState(state); + return error; + } + + registration->RequestId = 0; + } + + if (newEvents == SocketEvents_SA_NONE) + { + if (registration != NULL) + { + RemoveRegistrationLocked(state, registration, previous); + } + } + else + { + if (registration == NULL) + { + registration = (SocketEventRegistration*)calloc(1, sizeof(SocketEventRegistration)); + if (registration == NULL) + { + ReleaseIoUringSocketEventPortState(state); + return Error_ENOMEM; + } + + registration->Socket = socket; + registration->Next = state->Registrations; + state->Registrations = registration; + } + + registration->Events = newEvents; + registration->Data = data; + + error = QueueIoUringPollAddLocked(state, registration); + if (error != Error_SUCCESS) + { + ReleaseIoUringSocketEventPortState(state); + return error; + } + } + + error = SubmitIoUringPendingEntriesLocked(state); + ReleaseIoUringSocketEventPortState(state); + return error; +} + +static int32_t WaitForSocketEventsInnerIoUring(int32_t port, SocketEvent* buffer, int32_t* count) +{ + assert(buffer != NULL); + assert(count != NULL); + assert(*count >= 0); + + IoUringSocketEventPortState* state = AcquireIoUringSocketEventPortState(port); + if (state == NULL) + { + return IoUringPortNotFound; + } + + int32_t maxEvents = *count; + while (true) + { + int32_t produced = 0; + uint32_t cqHead = 
__atomic_load_n(state->CqHead, __ATOMIC_ACQUIRE); + uint32_t cqTail = __atomic_load_n(state->CqTail, __ATOMIC_ACQUIRE); + + while (cqHead != cqTail && produced < maxEvents) + { + struct io_uring_cqe cqe = state->Cqes[cqHead & *state->CqMask]; + cqHead++; + + if (cqe.user_data == 0) + { + continue; + } + + SocketEventRegistration* registration = FindRegistrationByRequestIdLocked(state, cqe.user_data); + if (registration == NULL) + { + continue; + } + + uintptr_t registrationData = registration->Data; + registration->RequestId = 0; + int8_t removeRegistration = 0; + + SocketEvents events = SocketEvents_SA_NONE; + if (cqe.res >= 0) + { + events = GetSocketEventsFromIoUringPollResult(cqe.res); + } + else if (cqe.res != -ECANCELED && cqe.res != -ENOENT) + { + events = SocketEvents_SA_ERROR; + } + + if (cqe.res == -EBADF || cqe.res == -ENOENT || cqe.res == -EINVAL) + { + removeRegistration = 1; + } + + if (events != SocketEvents_SA_NONE) + { + memset(&buffer[produced], 0, sizeof(SocketEvent)); + buffer[produced].Data = registrationData; + buffer[produced].Events = events; + produced++; + } + + if (!removeRegistration && + registration->Events != SocketEvents_SA_NONE && + (cqe.res >= 0 || (cqe.res != -EBADF && cqe.res != -ENOENT && cqe.res != -EINVAL))) + { + int32_t error = QueueIoUringPollAddLocked(state, registration); + if (error != Error_SUCCESS) + { + __atomic_store_n(state->CqHead, cqHead, __ATOMIC_RELEASE); + ReleaseIoUringSocketEventPortState(state); + *count = 0; + return error; + } + } + else if (removeRegistration) + { + SocketEventRegistration* previous = NULL; + SocketEventRegistration* current = FindRegistrationBySocketLocked(state, registration->Socket, &previous); + if (current == registration) + { + RemoveRegistrationLocked(state, registration, previous); + } + } + } + + __atomic_store_n(state->CqHead, cqHead, __ATOMIC_RELEASE); + + int32_t submitError = SubmitIoUringPendingEntriesLocked(state); + if (submitError != Error_SUCCESS) + { + 
ReleaseIoUringSocketEventPortState(state); + *count = 0; + return submitError; + } + + if (produced > 0) + { + ReleaseIoUringSocketEventPortState(state); + *count = produced; + return Error_SUCCESS; + } + + pthread_mutex_unlock(&state->Lock); + int result = IoUringEnter(port, 0, 1, IORING_ENTER_GETEVENTS); + pthread_mutex_lock(&state->Lock); + if (result < 0) + { + int32_t error = SystemNative_ConvertErrorPlatformToPal(errno); + ReleaseIoUringSocketEventPortState(state); + *count = 0; + return error; + } + } +} + +#endif // HAVE_LINUX_IO_URING_SOCKET_ENGINE + +static int32_t CreateSocketEventPortInner(int32_t* port) +{ +#if HAVE_LINUX_IO_URING_SOCKET_ENGINE + if (TryCreateSocketEventPortInnerIoUring(port) == Error_SUCCESS) + { + return Error_SUCCESS; + } +#endif + + return CreateSocketEventPortInnerEpoll(port); +} + +static int32_t CloseSocketEventPortInner(int32_t port) +{ +#if HAVE_LINUX_IO_URING_SOCKET_ENGINE + int32_t ioUringError = CloseSocketEventPortInnerIoUring(port); + if (ioUringError != IoUringPortNotFound) + { + return ioUringError; + } +#endif + + return CloseSocketEventPortInnerEpoll(port); +} + +static int32_t TryChangeSocketEventRegistrationInner( + int32_t port, int32_t socket, SocketEvents currentEvents, SocketEvents newEvents, uintptr_t data) +{ +#if HAVE_LINUX_IO_URING_SOCKET_ENGINE + int32_t ioUringError = TryChangeSocketEventRegistrationInnerIoUring(port, socket, currentEvents, newEvents, data); + if (ioUringError != IoUringPortNotFound) + { + return ioUringError; + } +#endif + + return TryChangeSocketEventRegistrationInnerEpoll(port, socket, currentEvents, newEvents, data); +} + +static int32_t WaitForSocketEventsInner(int32_t port, SocketEvent* buffer, int32_t* count) +{ +#if HAVE_LINUX_IO_URING_SOCKET_ENGINE + int32_t ioUringError = WaitForSocketEventsInnerIoUring(port, buffer, count); + if (ioUringError != IoUringPortNotFound) + { + return ioUringError; + } +#endif + + return WaitForSocketEventsInnerEpoll(port, buffer, count); +} + #elif 
HAVE_KQUEUE c_static_assert(sizeof(SocketEvent) <= sizeof(struct kevent)); diff --git a/src/native/libs/configure.cmake b/src/native/libs/configure.cmake index 4da74e115c6db8..5a0367ef6ac09d 100644 --- a/src/native/libs/configure.cmake +++ b/src/native/libs/configure.cmake @@ -470,6 +470,16 @@ check_symbol_exists( sys/epoll.h HAVE_EPOLL) +check_include_files( + "linux/io_uring.h;sys/syscall.h" + HAVE_LINUX_IO_URING_H) + +check_struct_has_member( + "struct io_uring_sqe" + poll32_events + "linux/io_uring.h" + HAVE_IO_URING_POLL32_EVENTS) + check_symbol_exists( gethostname unistd.h From 6554822a2a1934baad1d94540250f02bf997cf28 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 2026 04:49:26 +0000 Subject: [PATCH 002/258] Harden io_uring socket engine: scalable lookups, multishot poll, tracing, expanded tests - Replace linear registration linked-list with dual hash tables (socket-fd and request-id indexed) for O(1) lookups at scale. - Narrow lock scope in CQE processing to yield between entries, reducing contention with concurrent registration changes. - Close the ring fd during teardown to wake threads blocked in io_uring_enter(GETEVENTS). - Add IORING_POLL_ADD_MULTI support (kernel 5.13+) with runtime feature probe and fallback to one-shot rearm. - Add collision-safe NextRequestId allocation (skip in-use IDs). - Add EAGAIN retry with bounded backoff in submit path. - Add native diagnostics counters (rearm, submit retry/error, CQE error, max pending) logged on port close. - Add DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TRACE=1 trace mode. - Expand test suite: UDP, multi-connection, disconnect/reconnect, cancellation, forced fallback, concurrent close, stability gate. - Update validation guide with seccomp and counter sections. 
--- .../testing-linux-sockets-io-uring.md | 43 ++ .../tests/FunctionalTests/IoUring.Unix.cs | 313 +++++++++++- .../libs/System.Native/pal_networking.c | 468 ++++++++++++++---- 3 files changed, 700 insertions(+), 124 deletions(-) diff --git a/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md b/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md index 883582873d19af..2b87ce73be1483 100644 --- a/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md +++ b/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md @@ -12,6 +12,14 @@ The backend is opt-in and must safely fall back to epoll when io_uring is unavai - .NET runtime repo built for `clr+libs` - For direct io_uring path validation: Linux kernel `>= 5.10` +## Operational Caveats + +1. The backend is disabled by default and only enabled with `DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1`. +2. Kernels `< 5.10` do not meet the runtime gate and automatically use epoll. +3. Container seccomp policies may block `io_uring_setup` and force fallback to epoll. +4. `IORING_POLL_ADD_MULTI` is only used when both kernel and headers support it (typically kernel `>= 5.13`); otherwise one-shot poll rearm remains active. +5. Debug traces can be enabled with `DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TRACE=1`. + ## Functional Validation Run the targeted sockets functional test added for the opt-in path: @@ -50,6 +58,41 @@ Expected on kernel `>= 5.10` with opt-in enabled: 2. `io_uring_enter` is present. 3. `epoll_create1` is not used by the sockets event-port path. +Optional trace-assisted verification: + +```bash +DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1 DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TRACE=1 <test command> +``` + +Expected: + +1. Startup line indicates io_uring port creation and whether poll multishot is enabled. +2. Shutdown line includes lightweight counters (max pending SQEs, rearm count, submit retries/errors, CQE error count). 
+ +## Container Seccomp Validation + +Default Docker seccomp profile usually blocks io_uring: + +```bash +docker run --rm -it <image> /bin/bash -lc 'DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1 <test command>' +``` + +Permissive profile (or unconfined) should allow io_uring: + +```bash +docker run --rm -it --security-opt seccomp=unconfined <image> /bin/bash -lc 'DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1 <test command>' +``` + +Validate fallback/allow behavior with either `strace` or trace mode output. + +## Rollback Guidance + +If production issues are observed after deployment: + +1. Disable io_uring immediately by unsetting `DOTNET_SYSTEM_NET_SOCKETS_IO_URING` (or setting it to `0`). +2. Re-run targeted socket workflows and confirm behavior is back on epoll. +3. Keep trace mode disabled unless actively diagnosing (`DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TRACE=1`). + ## Performance Checklist Use the same workload for baseline and experiment. diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs index bbe8a515939cbc..5f6b961714f976 100644 --- a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs @@ -2,7 +2,9 @@ // The .NET Foundation licenses this file to you under the MIT license. using System; +using System.Collections.Generic; using System.Net; +using System.Threading; using System.Threading.Tasks; using Microsoft.DotNet.RemoteExecutor; using Xunit; @@ -11,49 +13,126 @@ namespace System.Net.Sockets.Tests { public class IoUring { + private static RemoteInvokeOptions CreateSocketEngineOptions(string? 
ioUringValue = "1", bool forceFallback = false) + { + RemoteInvokeOptions options = new RemoteInvokeOptions(); + if (ioUringValue is not null) + { + options.StartInfo.EnvironmentVariables.Add("DOTNET_SYSTEM_NET_SOCKETS_IO_URING", ioUringValue); + } + + if (forceFallback) + { + options.StartInfo.EnvironmentVariables.Add("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_FALLBACK", "1"); + } + + options.TimeOut = (int)TimeSpan.FromMinutes(2).TotalMilliseconds; + return options; + } + + private static async Task RunTcpRoundTripAsync(int iterations) + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(1); + + using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + Task acceptTask = listener.AcceptAsync(); + + await client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!); + using Socket server = await acceptTask; + + byte[] sendBuffer = new byte[] { 1 }; + byte[] receiveBuffer = new byte[1]; + + for (int i = 0; i < iterations; i++) + { + ValueTask serverReceiveTask = server.ReceiveAsync(receiveBuffer, SocketFlags.None); + await Task.Yield(); + + int clientSent = await client.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, clientSent); + + int serverReceived = await serverReceiveTask; + Assert.Equal(1, serverReceived); + Assert.Equal(sendBuffer[0], receiveBuffer[0]); + + ValueTask clientReceiveTask = client.ReceiveAsync(receiveBuffer, SocketFlags.None); + await Task.Yield(); + + int serverSent = await server.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, serverSent); + + int clientReceived = await clientReceiveTask; + Assert.Equal(1, clientReceived); + Assert.Equal(sendBuffer[0], receiveBuffer[0]); + + unchecked + { + sendBuffer[0]++; + } + } + } + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] [PlatformSpecific(TestPlatforms.Linux)] // io_uring is 
Linux-specific. public static async Task IoUringOptIn_DoesNotBreakAsyncSocketWorkflows() { - RemoteInvokeOptions options = new RemoteInvokeOptions(); - options.StartInfo.EnvironmentVariables.Add("DOTNET_SYSTEM_NET_SOCKETS_IO_URING", "1"); - options.TimeOut = (int)TimeSpan.FromMinutes(2).TotalMilliseconds; + await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(64), CreateSocketEngineOptions()).DisposeAsync(); + } - await RemoteExecutor.Invoke(static async () => + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task SocketEngine_DefaultOptOut_DoesNotBreakAsyncSocketWorkflows() + { + await RemoteExecutor.Invoke(static () => { - using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); - listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); - listener.Listen(1); + Environment.SetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING", null); + return RunTcpRoundTripAsync(32); + }, CreateSocketEngineOptions(ioUringValue: null)).DisposeAsync(); + } - using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); - Task acceptTask = listener.AcceptAsync(); + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task SocketEngine_KillSwitchZero_DoesNotBreakAsyncSocketWorkflows() + { + await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(ioUringValue: "0")).DisposeAsync(); + } - await client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!); - using Socket server = await acceptTask; + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringOptIn_UdpSendReceive_Works() + { + await RemoteExecutor.Invoke(static async () => + { + using Socket receiver = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + receiver.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + IPEndPoint receiverEndpoint = (IPEndPoint)receiver.LocalEndPoint!; - byte[] sendBuffer = new byte[] { 1 }; + using Socket sender = new Socket(AddressFamily.InterNetwork, SocketType.Dgram, ProtocolType.Udp); + sender.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + IPEndPoint senderEndpoint = (IPEndPoint)sender.LocalEndPoint!; + sender.Connect(receiverEndpoint); + + byte[] sendBuffer = new byte[] { 7 }; byte[] receiveBuffer = new byte[1]; for (int i = 0; i < 64; i++) { - ValueTask serverReceiveTask = server.ReceiveAsync(receiveBuffer, SocketFlags.None); - await Task.Yield(); - - int clientSent = await client.SendAsync(sendBuffer, SocketFlags.None); - Assert.Equal(1, clientSent); + int sent = await sender.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, sent); - int serverReceived = await serverReceiveTask; - Assert.Equal(1, serverReceived); + EndPoint remote = new IPEndPoint(IPAddress.Any, 0); + SocketReceiveFromResult receiveFrom = await receiver.ReceiveFromAsync(receiveBuffer, SocketFlags.None, remote); + Assert.Equal(1, receiveFrom.ReceivedBytes); Assert.Equal(sendBuffer[0], receiveBuffer[0]); + Assert.Equal(senderEndpoint, receiveFrom.RemoteEndPoint); - ValueTask clientReceiveTask = client.ReceiveAsync(receiveBuffer, SocketFlags.None); - await Task.Yield(); + int echoed = await receiver.SendToAsync(sendBuffer, SocketFlags.None, receiveFrom.RemoteEndPoint); + Assert.Equal(1, echoed); - int serverSent = await server.SendAsync(sendBuffer, SocketFlags.None); - Assert.Equal(1, serverSent); - - int clientReceived = await clientReceiveTask; - Assert.Equal(1, clientReceived); + int received = await sender.ReceiveAsync(receiveBuffer, SocketFlags.None); + Assert.Equal(1, received); 
Assert.Equal(sendBuffer[0], receiveBuffer[0]); unchecked @@ -61,7 +140,187 @@ await RemoteExecutor.Invoke(static async () => sendBuffer[0]++; } } - }, options).DisposeAsync(); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringOptIn_MultipleConcurrentConnections_Work() + { + await RemoteExecutor.Invoke(static async () => + { + const int ConnectionCount = 32; + + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(ConnectionCount); + + var acceptTasks = new Task[ConnectionCount]; + var clients = new Socket[ConnectionCount]; + + for (int i = 0; i < ConnectionCount; i++) + { + acceptTasks[i] = listener.AcceptAsync(); + } + + var connectTasks = new Task[ConnectionCount]; + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + for (int i = 0; i < ConnectionCount; i++) + { + clients[i] = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + connectTasks[i] = clients[i].ConnectAsync(endpoint); + } + + await Task.WhenAll(connectTasks); + Socket[] servers = await Task.WhenAll(acceptTasks); + + var roundTripTasks = new List(ConnectionCount); + for (int i = 0; i < ConnectionCount; i++) + { + Socket client = clients[i]; + Socket server = servers[i]; + byte value = (byte)(i + 1); + roundTripTasks.Add(Task.Run(async () => + { + byte[] tx = new byte[] { value }; + byte[] rx = new byte[1]; + + int sent = await client.SendAsync(tx, SocketFlags.None); + Assert.Equal(1, sent); + + int received = await server.ReceiveAsync(rx, SocketFlags.None); + Assert.Equal(1, received); + Assert.Equal(value, rx[0]); + + sent = await server.SendAsync(tx, SocketFlags.None); + Assert.Equal(1, sent); + + received = await client.ReceiveAsync(rx, 
SocketFlags.None); + Assert.Equal(1, received); + Assert.Equal(value, rx[0]); + })); + } + + await Task.WhenAll(roundTripTasks); + + for (int i = 0; i < ConnectionCount; i++) + { + servers[i].Dispose(); + clients[i].Dispose(); + } + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringOptIn_DisconnectReconnectAndCancellation_Work() + { + await RemoteExecutor.Invoke(static async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(2); + + // First connection lifecycle. + using (Socket firstClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp)) + { + Task firstAcceptTask = listener.AcceptAsync(); + await firstClient.ConnectAsync((IPEndPoint)listener.LocalEndPoint!); + using Socket firstServer = await firstAcceptTask; + } + + // Reconnect and validate cancellation + subsequent data flow. + using Socket secondClient = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + Task secondAcceptTask = listener.AcceptAsync(); + await secondClient.ConnectAsync((IPEndPoint)listener.LocalEndPoint!); + using Socket secondServer = await secondAcceptTask; + + byte[] receiveBuffer = new byte[1]; + using (var cts = new CancellationTokenSource()) + { + ValueTask pendingReceive = secondServer.ReceiveAsync(receiveBuffer.AsMemory(), SocketFlags.None, cts.Token); + cts.Cancel(); + + Exception? 
ex = await Record.ExceptionAsync(async () => await pendingReceive); + Assert.NotNull(ex); + Assert.True( + ex is OperationCanceledException || + ex is SocketException socketException && + (socketException.SocketErrorCode == SocketError.OperationAborted || socketException.SocketErrorCode == SocketError.Interrupted), + $"Unexpected exception: {ex}"); + } + + byte[] sendBuffer = new byte[] { 42 }; + int sent = await secondClient.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, sent); + + int received = await secondServer.ReceiveAsync(receiveBuffer, SocketFlags.None); + Assert.Equal(1, received); + Assert.Equal(sendBuffer[0], receiveBuffer[0]); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringOptIn_ForcedFallbackToEpoll_StillWorks() + { + await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(forceFallback: true)).DisposeAsync(); + } + + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringOptIn_ConcurrentCloseWithPendingReceive_DoesNotHang() + { + await RemoteExecutor.Invoke(static async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(16); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + byte[] receiveBuffer = new byte[1]; + for (int i = 0; i < 64; i++) + { + using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + Task acceptTask = listener.AcceptAsync(); + await client.ConnectAsync(endpoint); + using Socket server = await acceptTask; + + ValueTask pendingReceive = server.ReceiveAsync(receiveBuffer, SocketFlags.None); + + // Force teardown while an async receive is pending. + client.Dispose(); + + Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive); + if (ex is SocketException socketException) + { + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error: {socketException.SocketErrorCode}"); + } + else if (ex is not null) + { + throw ex; + } + } + }, CreateSocketEngineOptions()).DisposeAsync(); + } + + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringOptIn_RepeatedRunStabilityGate() + { + await RemoteExecutor.Invoke(static async () => + { + const int Iterations = 50; + for (int i = 0; i < Iterations; i++) + { + await RunTcpRoundTripAsync(8); + } + }, CreateSocketEngineOptions()).DisposeAsync(); } } } diff --git a/src/native/libs/System.Native/pal_networking.c b/src/native/libs/System.Native/pal_networking.c index 8b0cf98f0f60d7..2ac601c8562c2d 100644 --- a/src/native/libs/System.Native/pal_networking.c +++ b/src/native/libs/System.Native/pal_networking.c @@ -31,6 +31,7 @@ #include #include #include +#include #endif #if HAVE_LINUX_IO_URING_H && HAVE_SYS_POLL_H && defined(__NR_io_uring_setup) && defined(__NR_io_uring_enter) @@ -38,6 +39,12 @@ #else #define HAVE_LINUX_IO_URING_SOCKET_ENGINE 0 #endif + +#if HAVE_LINUX_IO_URING_SOCKET_ENGINE && defined(IORING_POLL_ADD_MULTI) && defined(IORING_CQE_F_MORE) +#define HAVE_IO_URING_POLL_ADD_MULTI 1 +#else +#define HAVE_IO_URING_POLL_ADD_MULTI 0 +#endif #if HAVE_SYS_PROCINFO_H #include #include @@ -3231,7 +3238,10 @@ enum { IoUringMinKernelMajor = 5, IoUringMinKernelMinor = 10, + IoUringPollAddMultiMinKernelMajor = 5, + IoUringPollAddMultiMinKernelMinor = 13, IoUringQueueEntries = 1024, + IoUringRegistrationBucketCount = 2048, IoUringPortNotFound = INT32_MIN + 1 }; @@ -3241,7 +3251,8 @@ typedef struct SocketEventRegistration SocketEvents Events; uintptr_t Data; uint64_t RequestId; - struct SocketEventRegistration* Next; + struct SocketEventRegistration* NextBySocket; + struct SocketEventRegistration* NextByRequestId; } SocketEventRegistration; typedef struct IoUringSocketEventPortState @@ -3249,6 +3260,8 @@ typedef struct IoUringSocketEventPortState int32_t RingFd; int8_t UsesSingleMmap; int8_t IsClosing; + int8_t TraceEnabled; + int8_t UsesPollAddMulti; int32_t RefCount; void* SqRingPtr; size_t SqRingSize; @@ -3266,8 +3279,16 @@ typedef struct IoUringSocketEventPortState uint32_t* CqMask; struct io_uring_cqe* Cqes; uint32_t 
PendingSubmissions; + uint32_t MaxPendingSubmissions; uint64_t NextRequestId; - SocketEventRegistration* Registrations; + uint64_t PollRearmCount; + uint64_t SubmitRetryCount; + uint64_t SubmitErrorCount; + uint64_t CqeErrorCount; + uint32_t SocketRegistrationBucketMask; + uint32_t RequestIdRegistrationBucketMask; + SocketEventRegistration** RegistrationsBySocket; + SocketEventRegistration** RegistrationsByRequestId; pthread_mutex_t Lock; struct IoUringSocketEventPortState* Next; } IoUringSocketEventPortState; @@ -3296,22 +3317,72 @@ static bool IsIoUringSocketEngineEnabled(void) return value != NULL && strcmp(value, "1") == 0; } -static bool IsIoUringKernelVersionSupported(void) +static bool IsIoUringSocketEngineForcedFallbackForTests(void) +{ + // Test hook used by System.Net.Sockets functional tests to validate epoll fallback behavior. + const char* value = getenv("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_FALLBACK"); + return value != NULL && strcmp(value, "1") == 0; +} + +static bool IsIoUringSocketEngineTraceEnabled(void) +{ + const char* value = getenv("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TRACE"); + return value != NULL && strcmp(value, "1") == 0; +} + +static bool TryGetIoUringKernelVersion(uint32_t* major, uint32_t* minor) { + assert(major != NULL); + assert(minor != NULL); + struct utsname uts; if (uname(&uts) != 0) { return false; } + unsigned int parsedMajor; + unsigned int parsedMinor; + if (sscanf(uts.release, "%u.%u", &parsedMajor, &parsedMinor) != 2) + { + return false; + } + + *major = (uint32_t)parsedMajor; + *minor = (uint32_t)parsedMinor; + return true; +} + +static bool IsIoUringKernelVersionAtLeast(uint32_t requiredMajor, uint32_t requiredMinor) +{ uint32_t major; uint32_t minor; - if (sscanf(uts.release, "%u.%u", &major, &minor) != 2) + if (!TryGetIoUringKernelVersion(&major, &minor)) { return false; } - return major > IoUringMinKernelMajor || (major == IoUringMinKernelMajor && minor >= IoUringMinKernelMinor); + return major > 
requiredMajor || (major == requiredMajor && minor >= requiredMinor); +} + +static bool IsIoUringKernelVersionSupported(void) +{ + return IsIoUringKernelVersionAtLeast(IoUringMinKernelMajor, IoUringMinKernelMinor); +} + +static void IoUringTrace(const IoUringSocketEventPortState* state, const char* format, ...) +{ + if (state == NULL || !state->TraceEnabled) + { + return; + } + + va_list args; + va_start(args, format); + fprintf(stderr, "[System.Native io_uring] "); + vfprintf(stderr, format, args); + fputc('\n', stderr); + va_end(args); } static IoUringSocketEventPortState* FindIoUringSocketEventPortStateNoLock(int32_t ringFd) @@ -3366,31 +3437,44 @@ static void ReleaseIoUringSocketEventPortState(IoUringSocketEventPortState* stat } } -static SocketEventRegistration* FindRegistrationBySocketLocked(IoUringSocketEventPortState* state, int32_t socket, SocketEventRegistration** previous) +static uint32_t GetSocketRegistrationBucketIndex(const IoUringSocketEventPortState* state, int32_t socket) { assert(state != NULL); - SocketEventRegistration* prev = NULL; - SocketEventRegistration* current = state->Registrations; + return ((uint32_t)socket) & state->SocketRegistrationBucketMask; +} + +static uint32_t GetRequestIdRegistrationBucketIndex(const IoUringSocketEventPortState* state, uint64_t requestId) +{ + assert(state != NULL); + + // 64-bit mix to keep request-id buckets evenly distributed under long-running wraps. 
+ uint64_t hash = requestId; + hash ^= hash >> 33; + hash *= 0xff51afd7ed558ccdULL; + hash ^= hash >> 33; + hash *= 0xc4ceb9fe1a85ec53ULL; + hash ^= hash >> 33; + + return ((uint32_t)hash) & state->RequestIdRegistrationBucketMask; +} + +static SocketEventRegistration* FindRegistrationBySocketLocked(IoUringSocketEventPortState* state, int32_t socket) +{ + assert(state != NULL); + + uint32_t bucketIndex = GetSocketRegistrationBucketIndex(state, socket); + SocketEventRegistration* current = state->RegistrationsBySocket[bucketIndex]; while (current != NULL) { if (current->Socket == socket) { - if (previous != NULL) - { - *previous = prev; - } return current; } - prev = current; - current = current->Next; + current = current->NextBySocket; } - if (previous != NULL) - { - *previous = NULL; - } return NULL; } @@ -3398,7 +3482,8 @@ static SocketEventRegistration* FindRegistrationByRequestIdLocked(IoUringSocketE { assert(state != NULL); - SocketEventRegistration* current = state->Registrations; + uint32_t bucketIndex = GetRequestIdRegistrationBucketIndex(state, requestId); + SocketEventRegistration* current = state->RegistrationsByRequestId[bucketIndex]; while (current != NULL) { if (current->RequestId == requestId) @@ -3406,26 +3491,111 @@ static SocketEventRegistration* FindRegistrationByRequestIdLocked(IoUringSocketE return current; } - current = current->Next; + current = current->NextByRequestId; } return NULL; } -static void RemoveRegistrationLocked(IoUringSocketEventPortState* state, SocketEventRegistration* registration, SocketEventRegistration* previous) +static void AddRegistrationBySocketLocked(IoUringSocketEventPortState* state, SocketEventRegistration* registration) { assert(state != NULL); assert(registration != NULL); + assert(registration->Socket >= 0); - if (previous == NULL) + uint32_t bucketIndex = GetSocketRegistrationBucketIndex(state, registration->Socket); + registration->NextBySocket = state->RegistrationsBySocket[bucketIndex]; + 
state->RegistrationsBySocket[bucketIndex] = registration; +} + +static void RemoveRegistrationBySocketLocked(IoUringSocketEventPortState* state, SocketEventRegistration* registration) +{ + assert(state != NULL); + assert(registration != NULL); + + uint32_t bucketIndex = GetSocketRegistrationBucketIndex(state, registration->Socket); + SocketEventRegistration* previous = NULL; + SocketEventRegistration* current = state->RegistrationsBySocket[bucketIndex]; + while (current != NULL) { - state->Registrations = registration->Next; + if (current == registration) + { + if (previous == NULL) + { + state->RegistrationsBySocket[bucketIndex] = current->NextBySocket; + } + else + { + previous->NextBySocket = current->NextBySocket; + } + + registration->NextBySocket = NULL; + return; + } + + previous = current; + current = current->NextBySocket; } - else +} + +static void ClearRegistrationRequestIdLocked(IoUringSocketEventPortState* state, SocketEventRegistration* registration) +{ + assert(state != NULL); + assert(registration != NULL); + + if (registration->RequestId == 0) { - previous->Next = registration->Next; + return; } + uint32_t bucketIndex = GetRequestIdRegistrationBucketIndex(state, registration->RequestId); + SocketEventRegistration* previous = NULL; + SocketEventRegistration* current = state->RegistrationsByRequestId[bucketIndex]; + while (current != NULL) + { + if (current == registration) + { + if (previous == NULL) + { + state->RegistrationsByRequestId[bucketIndex] = current->NextByRequestId; + } + else + { + previous->NextByRequestId = current->NextByRequestId; + } + + break; + } + + previous = current; + current = current->NextByRequestId; + } + + registration->RequestId = 0; + registration->NextByRequestId = NULL; +} + +static void AssignRegistrationRequestIdLocked( + IoUringSocketEventPortState* state, SocketEventRegistration* registration, uint64_t requestId) +{ + assert(state != NULL); + assert(registration != NULL); + assert(requestId != 0); + 
assert(registration->RequestId == 0); + + registration->RequestId = requestId; + uint32_t bucketIndex = GetRequestIdRegistrationBucketIndex(state, requestId); + registration->NextByRequestId = state->RegistrationsByRequestId[bucketIndex]; + state->RegistrationsByRequestId[bucketIndex] = registration; +} + +static void RemoveRegistrationLocked(IoUringSocketEventPortState* state, SocketEventRegistration* registration) +{ + assert(state != NULL); + assert(registration != NULL); + + ClearRegistrationRequestIdLocked(state, registration); + RemoveRegistrationBySocketLocked(state, registration); free(registration); } @@ -3466,18 +3636,33 @@ static int32_t SubmitIoUringPendingEntriesLocked(IoUringSocketEventPortState* st { assert(state != NULL); + int zeroSubmitRetries = 0; while (state->PendingSubmissions > 0) { int result = IoUringEnter(state->RingFd, state->PendingSubmissions, 0, 0); if (result < 0) { + state->SubmitErrorCount++; + IoUringTrace(state, "submit failed: errno=%d pending=%u", errno, state->PendingSubmissions); return SystemNative_ConvertErrorPlatformToPal(errno); } if (result == 0) { + // Retry transient zero-submit responses a few times before surfacing EAGAIN. 
+ if (++zeroSubmitRetries <= 4) + { + state->SubmitRetryCount++; + usleep(50); + continue; + } + + state->SubmitRetryCount++; + state->SubmitErrorCount++; + IoUringTrace(state, "submit returned zero repeatedly; surfacing EAGAIN pending=%u", state->PendingSubmissions); return Error_EAGAIN; } + zeroSubmitRetries = 0; state->PendingSubmissions -= (uint32_t)result; } @@ -3503,6 +3688,10 @@ static struct io_uring_sqe* GetIoUringSqeLocked(IoUringSocketEventPortState* sta state->SqArray[index] = index; __atomic_store_n(state->SqTail, tail + 1, __ATOMIC_RELEASE); state->PendingSubmissions++; + if (state->PendingSubmissions > state->MaxPendingSubmissions) + { + state->MaxPendingSubmissions = state->PendingSubmissions; + } *error = Error_SUCCESS; return sqe; } @@ -3540,6 +3729,7 @@ static int32_t QueueIoUringPollAddLocked(IoUringSocketEventPortState* state, Soc { assert(state != NULL); assert(registration != NULL); + assert(registration->RequestId == 0); int32_t error; struct io_uring_sqe* sqe = GetIoUringSqeLocked(state, &error); @@ -3548,11 +3738,12 @@ static int32_t QueueIoUringPollAddLocked(IoUringSocketEventPortState* state, Soc return error; } - uint64_t requestId = ++state->NextRequestId; - if (requestId == 0) + uint64_t requestId; + do { requestId = ++state->NextRequestId; } + while (requestId == 0 || FindRegistrationByRequestIdLocked(state, requestId) != NULL); sqe->opcode = IORING_OP_POLL_ADD; sqe->fd = registration->Socket; @@ -3562,9 +3753,15 @@ static int32_t QueueIoUringPollAddLocked(IoUringSocketEventPortState* state, Soc #else // The poll flags currently used by socket readiness notifications fit in the legacy 16-bit field. 
sqe->poll_events = (uint16_t)pollEvents; +#endif +#if HAVE_IO_URING_POLL_ADD_MULTI + if (state->UsesPollAddMulti) + { + sqe->len = IORING_POLL_ADD_MULTI; + } #endif sqe->user_data = requestId; - registration->RequestId = requestId; + AssignRegistrationRequestIdLocked(state, registration, requestId); return Error_SUCCESS; } @@ -3576,12 +3773,27 @@ static void FreeIoUringState(IoUringSocketEventPortState* state) return; } - SocketEventRegistration* registration = state->Registrations; - while (registration != NULL) + if (state->RegistrationsBySocket != NULL) + { + for (uint32_t i = 0; i <= state->SocketRegistrationBucketMask; i++) + { + SocketEventRegistration* registration = state->RegistrationsBySocket[i]; + while (registration != NULL) + { + SocketEventRegistration* next = registration->NextBySocket; + free(registration); + registration = next; + } + } + + free(state->RegistrationsBySocket); + state->RegistrationsBySocket = NULL; + } + + if (state->RegistrationsByRequestId != NULL) { - SocketEventRegistration* next = registration->Next; - free(registration); - registration = next; + free(state->RegistrationsByRequestId); + state->RegistrationsByRequestId = NULL; } if (state->Sqes != NULL && state->Sqes != MAP_FAILED) @@ -3612,7 +3824,7 @@ static int32_t TryCreateSocketEventPortInnerIoUring(int32_t* port) { assert(port != NULL); - if (!IsIoUringSocketEngineEnabled() || !IsIoUringKernelVersionSupported()) + if (!IsIoUringSocketEngineEnabled() || IsIoUringSocketEngineForcedFallbackForTests() || !IsIoUringKernelVersionSupported()) { return Error_ENOSYS; } @@ -3627,8 +3839,16 @@ static int32_t TryCreateSocketEventPortInnerIoUring(int32_t* port) state->SqRingPtr = MAP_FAILED; state->CqRingPtr = MAP_FAILED; state->Sqes = MAP_FAILED; + state->TraceEnabled = IsIoUringSocketEngineTraceEnabled() ? 1 : 0; +#if HAVE_IO_URING_POLL_ADD_MULTI + state->UsesPollAddMulti = IsIoUringKernelVersionAtLeast(IoUringPollAddMultiMinKernelMajor, IoUringPollAddMultiMinKernelMinor) ? 
1 : 0; +#else + state->UsesPollAddMulti = 0; +#endif state->NextRequestId = 1; state->RefCount = 1; + state->SocketRegistrationBucketMask = IoUringRegistrationBucketCount - 1; + state->RequestIdRegistrationBucketMask = IoUringRegistrationBucketCount - 1; if (pthread_mutex_init(&state->Lock, NULL) != 0) { @@ -3636,6 +3856,15 @@ static int32_t TryCreateSocketEventPortInnerIoUring(int32_t* port) return Error_EINVAL; } + assert((IoUringRegistrationBucketCount & (IoUringRegistrationBucketCount - 1)) == 0); + state->RegistrationsBySocket = (SocketEventRegistration**)calloc(IoUringRegistrationBucketCount, sizeof(SocketEventRegistration*)); + state->RegistrationsByRequestId = (SocketEventRegistration**)calloc(IoUringRegistrationBucketCount, sizeof(SocketEventRegistration*)); + if (state->RegistrationsBySocket == NULL || state->RegistrationsByRequestId == NULL) + { + FreeIoUringState(state); + return Error_ENOMEM; + } + struct io_uring_params params; memset(&params, 0, sizeof(params)); int ringFd = IoUringSetup(IoUringQueueEntries, &params); @@ -3709,6 +3938,7 @@ static int32_t TryCreateSocketEventPortInnerIoUring(int32_t* port) g_ioUringSocketEventPorts = state; pthread_mutex_unlock(&g_ioUringSocketEventPortsLock); + IoUringTrace(state, "port created fd=%d poll_add_multi=%s", ringFd, state->UsesPollAddMulti ? "enabled" : "disabled"); *port = ringFd; return Error_SUCCESS; } @@ -3738,6 +3968,21 @@ static int32_t CloseSocketEventPortInnerIoUring(int32_t port) pthread_mutex_lock(&state->Lock); state->IsClosing = 1; + + // Closing the ring fd here explicitly wakes threads blocked in io_uring_enter(GETEVENTS). 
+ int32_t closeError = Error_SUCCESS; + if (state->RingFd != -1) + { + int closeResult; + while ((closeResult = close(state->RingFd)) < 0 && errno == EINTR); + if (closeResult != 0) + { + closeError = SystemNative_ConvertErrorPlatformToPal(errno); + } + + state->RingFd = -1; + } + if (previous == NULL) { g_ioUringSocketEventPorts = state->Next; @@ -3751,6 +3996,17 @@ static int32_t CloseSocketEventPortInnerIoUring(int32_t port) state->RefCount--; int8_t shouldFree = state->RefCount == 0; + IoUringTrace( + state, + "port closing fd=%d close_error=%d max_pending=%u rearm=%llu submit_retry=%llu submit_error=%llu cqe_error=%llu", + port, + closeError, + state->MaxPendingSubmissions, + (unsigned long long)state->PollRearmCount, + (unsigned long long)state->SubmitRetryCount, + (unsigned long long)state->SubmitErrorCount, + (unsigned long long)state->CqeErrorCount); + pthread_mutex_unlock(&g_ioUringSocketEventPortsLock); pthread_mutex_unlock(&state->Lock); @@ -3758,7 +4014,8 @@ static int32_t CloseSocketEventPortInnerIoUring(int32_t port) { FreeIoUringState(state); } - return Error_SUCCESS; + + return closeError; } static int32_t TryChangeSocketEventRegistrationInnerIoUring( @@ -3774,8 +4031,7 @@ static int32_t TryChangeSocketEventRegistrationInnerIoUring( int32_t error = Error_SUCCESS; - SocketEventRegistration* previous = NULL; - SocketEventRegistration* registration = FindRegistrationBySocketLocked(state, socket, &previous); + SocketEventRegistration* registration = FindRegistrationBySocketLocked(state, socket); if (registration != NULL && registration->RequestId != 0) { @@ -3786,14 +4042,14 @@ static int32_t TryChangeSocketEventRegistrationInnerIoUring( return error; } - registration->RequestId = 0; + ClearRegistrationRequestIdLocked(state, registration); } if (newEvents == SocketEvents_SA_NONE) { if (registration != NULL) { - RemoveRegistrationLocked(state, registration, previous); + RemoveRegistrationLocked(state, registration); } } else @@ -3808,8 +4064,7 @@ static 
int32_t TryChangeSocketEventRegistrationInnerIoUring( } registration->Socket = socket; - registration->Next = state->Registrations; - state->Registrations = registration; + AddRegistrationBySocketLocked(state, registration); } registration->Events = newEvents; @@ -3840,82 +4095,92 @@ static int32_t WaitForSocketEventsInnerIoUring(int32_t port, SocketEvent* buffer return IoUringPortNotFound; } + int32_t produced = 0; int32_t maxEvents = *count; + if (maxEvents == 0) + { + ReleaseIoUringSocketEventPortState(state); + return Error_SUCCESS; + } + while (true) { - int32_t produced = 0; uint32_t cqHead = __atomic_load_n(state->CqHead, __ATOMIC_ACQUIRE); uint32_t cqTail = __atomic_load_n(state->CqTail, __ATOMIC_ACQUIRE); - while (cqHead != cqTail && produced < maxEvents) + if (cqHead != cqTail && produced < maxEvents) { struct io_uring_cqe cqe = state->Cqes[cqHead & *state->CqMask]; cqHead++; + __atomic_store_n(state->CqHead, cqHead, __ATOMIC_RELEASE); - if (cqe.user_data == 0) - { - continue; - } - - SocketEventRegistration* registration = FindRegistrationByRequestIdLocked(state, cqe.user_data); - if (registration == NULL) + if (cqe.user_data != 0) { - continue; - } + SocketEventRegistration* registration = FindRegistrationByRequestIdLocked(state, cqe.user_data); + if (registration != NULL) + { + uintptr_t registrationData = registration->Data; + int8_t removeRegistration = 0; + int8_t pollStillArmed = 0; - uintptr_t registrationData = registration->Data; - registration->RequestId = 0; - int8_t removeRegistration = 0; + SocketEvents events = SocketEvents_SA_NONE; + if (cqe.res >= 0) + { + events = GetSocketEventsFromIoUringPollResult(cqe.res); +#if HAVE_IO_URING_POLL_ADD_MULTI + if (state->UsesPollAddMulti && (cqe.flags & IORING_CQE_F_MORE) != 0) + { + pollStillArmed = 1; + } +#endif + } + else if (cqe.res != -ECANCELED && cqe.res != -ENOENT) + { + events = SocketEvents_SA_ERROR; + state->CqeErrorCount++; + IoUringTrace(state, "cqe error res=%d request=%llu", cqe.res, 
(unsigned long long)cqe.user_data); + } - SocketEvents events = SocketEvents_SA_NONE; - if (cqe.res >= 0) - { - events = GetSocketEventsFromIoUringPollResult(cqe.res); - } - else if (cqe.res != -ECANCELED && cqe.res != -ENOENT) - { - events = SocketEvents_SA_ERROR; - } + if (cqe.res == -EBADF || cqe.res == -ENOENT || cqe.res == -EINVAL) + { + removeRegistration = 1; + } - if (cqe.res == -EBADF || cqe.res == -ENOENT || cqe.res == -EINVAL) - { - removeRegistration = 1; - } + if (!pollStillArmed || removeRegistration) + { + ClearRegistrationRequestIdLocked(state, registration); + } - if (events != SocketEvents_SA_NONE) - { - memset(&buffer[produced], 0, sizeof(SocketEvent)); - buffer[produced].Data = registrationData; - buffer[produced].Events = events; - produced++; - } + if (events != SocketEvents_SA_NONE) + { + memset(&buffer[produced], 0, sizeof(SocketEvent)); + buffer[produced].Data = registrationData; + buffer[produced].Events = events; + produced++; + } - if (!removeRegistration && - registration->Events != SocketEvents_SA_NONE && - (cqe.res >= 0 || (cqe.res != -EBADF && cqe.res != -ENOENT && cqe.res != -EINVAL))) - { - int32_t error = QueueIoUringPollAddLocked(state, registration); - if (error != Error_SUCCESS) - { - __atomic_store_n(state->CqHead, cqHead, __ATOMIC_RELEASE); - ReleaseIoUringSocketEventPortState(state); - *count = 0; - return error; - } - } - else if (removeRegistration) - { - SocketEventRegistration* previous = NULL; - SocketEventRegistration* current = FindRegistrationBySocketLocked(state, registration->Socket, &previous); - if (current == registration) - { - RemoveRegistrationLocked(state, registration, previous); + if (!removeRegistration && + !pollStillArmed && + registration->Events != SocketEvents_SA_NONE && + (cqe.res >= 0 || (cqe.res != -EBADF && cqe.res != -ENOENT && cqe.res != -EINVAL))) + { + state->PollRearmCount++; + int32_t error = QueueIoUringPollAddLocked(state, registration); + if (error != Error_SUCCESS) + { + 
ReleaseIoUringSocketEventPortState(state); + *count = 0; + return error; + } + } + else if (removeRegistration) + { + RemoveRegistrationLocked(state, registration); + } } } } - __atomic_store_n(state->CqHead, cqHead, __ATOMIC_RELEASE); - int32_t submitError = SubmitIoUringPendingEntriesLocked(state); if (submitError != Error_SUCCESS) { @@ -3931,12 +4196,21 @@ static int32_t WaitForSocketEventsInnerIoUring(int32_t port, SocketEvent* buffer return Error_SUCCESS; } + if (cqHead != cqTail) + { + // Allow registration updates to progress between CQE units of work. + pthread_mutex_unlock(&state->Lock); + pthread_mutex_lock(&state->Lock); + continue; + } + pthread_mutex_unlock(&state->Lock); int result = IoUringEnter(port, 0, 1, IORING_ENTER_GETEVENTS); pthread_mutex_lock(&state->Lock); if (result < 0) { int32_t error = SystemNative_ConvertErrorPlatformToPal(errno); + IoUringTrace(state, "getevents failed: errno=%d", errno); ReleaseIoUringSocketEventPortState(state); *count = 0; return error; From 4dc22d9d2e5a33b2b9b11d0210181aab7127e8c1 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 2026 05:00:02 +0000 Subject: [PATCH 003/258] Add tagged user-data encoding, evidence tooling, and expanded docs - Add IoUringUserDataTag encoding (8-bit tag in upper byte, 56-bit payload) to distinguish poll-readiness CQEs from future completion- model CQEs, with ReservedCompletion tag for Phase 2. - Add CQE dispatch by tag in WaitForSocketEventsInnerIoUring with trace logging for unexpected tags. - Mask NextRequestId to 56-bit payload range to stay within encoding. - Add evidence collection script for automated validation and perf capture (eng/testing/io-uring/collect-sockets-io-uring-evidence.sh). - Add PR evidence template with structured result tables. - Expand validation guide with operational caveats, trace-mode verification, seccomp container testing, and rollback guidance. 
--- .../io-uring-pr-evidence-template.md | 80 +++++++ .../testing-linux-sockets-io-uring.md | 5 + docs/workflow/testing/libraries/testing.md | 1 + .../collect-sockets-io-uring-evidence.sh | 217 ++++++++++++++++++ .../libs/System.Native/pal_networking.c | 56 ++++- 5 files changed, 353 insertions(+), 6 deletions(-) create mode 100644 docs/workflow/testing/libraries/io-uring-pr-evidence-template.md create mode 100644 eng/testing/io-uring/collect-sockets-io-uring-evidence.sh diff --git a/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md b/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md new file mode 100644 index 00000000000000..7af5a389e94c8e --- /dev/null +++ b/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md @@ -0,0 +1,80 @@ +# Linux Sockets io_uring PR Evidence Template + +Use this template in the PR description after collecting Linux artifacts. + +Recommended collection script: + +`eng/testing/io-uring/collect-sockets-io-uring-evidence.sh` + +## Scope + +- Branch/commit: `` +- Kernel(s): `` +- Container runtime/profile: `` +- Test command: `` +- Workload command: `` + +## Targeted Functional Results + +| Scenario | Status | Artifact | +| --- | --- | --- | +| Baseline (`DOTNET_SYSTEM_NET_SOCKETS_IO_URING=0`) | `` | `` | +| Opt-in (`DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1`) | `` | `` | +| Forced fallback (`DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_FALLBACK=1`) | `` | `` | +| Repeated stability gate | `` | `` | +| UDP coverage | `` | `` | +| Multi-connection coverage | `` | `` | +| Disconnect/reconnect/cancellation | `` | `` | + +## Fallback Verification + +| Environment | Expected behavior | Observed behavior | Artifact | +| --- | --- | --- | --- | +| Kernel `< 5.10` | Fall back to epoll | `` | `` | +| Docker default seccomp | Fall back to epoll | `` | `` | +| Docker permissive/unconfined seccomp | io_uring allowed | `` | `` | + +`strace` verification summary: + +- `io_uring_setup` seen: `` +- `io_uring_enter` seen: `` 
+- `epoll_create1` seen for sockets path: `` +- Artifact: `` + +Trace-mode summary (`DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TRACE=1`): + +- Poll multishot enabled: `` +- Max pending submissions: `` +- Rearm count: `` +- Submit retries/errors: `/` +- CQE error count: `` +- Artifact: `` + +## Perf Comparison (Baseline vs Opt-in) + +| Metric | Baseline | Opt-in | Delta | +| --- | --- | --- | --- | +| Throughput (req/s or MB/s) | `` | `` | `` | +| CPU % | `` | `` | `` | +| ThreadPool thread count | `` | `` | `` | +| ThreadPool completed work items/s | `` | `` | `` | +| Error rate/timeouts | `` | `` | `` | + +Artifacts: + +- Baseline logs: `` +- Opt-in logs: `` +- Counter captures: `` + +## Risk and Rollback + +- Feature remains opt-in (`DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1`). +- Immediate kill switch: unset env var or set to `0`. +- Fallback conditions: kernel/version gate and unsupported seccomp/container policies. +- Reversion strategy: disable env var first; revert native io_uring changes if needed. + +## Known Limitations + +- Current implementation is readiness-driven (`POLL_ADD`) and does not yet use completion ops (`RECV`/`SEND`/`ACCEPT`/`CONNECT`). +- Large performance gains expected from completion model are deferred to Phase 2. +- Completion-path interop/cancellation/partial-send-receive work remains in Phase 2 backlog. diff --git a/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md b/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md index 2b87ce73be1483..1df26eb7ee2740 100644 --- a/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md +++ b/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md @@ -6,6 +6,11 @@ This checklist is for validating the experimental Linux sockets io_uring backend The backend is opt-in and must safely fall back to epoll when io_uring is unavailable. 
+For evidence packaging, use: + +- Script: `eng/testing/io-uring/collect-sockets-io-uring-evidence.sh` +- PR template: `docs/workflow/testing/libraries/io-uring-pr-evidence-template.md` + ## Prerequisites - Linux machine (x64 or arm64) diff --git a/docs/workflow/testing/libraries/testing.md b/docs/workflow/testing/libraries/testing.md index 32569dbc9e60d8..59d3bac8d79cf4 100644 --- a/docs/workflow/testing/libraries/testing.md +++ b/docs/workflow/testing/libraries/testing.md @@ -3,6 +3,7 @@ Additional guides: - [Linux sockets io_uring validation](testing-linux-sockets-io-uring.md) +- [Linux sockets io_uring PR evidence template](io-uring-pr-evidence-template.md) ## Full Build and Test Run diff --git a/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh b/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh new file mode 100644 index 00000000000000..0565726cc64f28 --- /dev/null +++ b/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh @@ -0,0 +1,217 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Collect Linux sockets io_uring validation and perf evidence. 
+ +Required: + --output-dir Root folder for evidence artifacts + --test-command Functional test command to execute + +Optional: + --workload-command Perf workload command (baseline vs opt-in) + --docker-image Docker image for seccomp validation + --perf-iterations Workload repetitions per mode (default: 3) + +Example: + eng/testing/io-uring/collect-sockets-io-uring-evidence.sh \ + --output-dir artifacts/io-uring \ + --test-command "./build.sh -subset libs.tests -test /p:XunitMethodName=System.Net.Sockets.Tests.IoUring.IoUringOptIn_DoesNotBreakAsyncSocketWorkflows" \ + --workload-command "dotnet run --project /src/MySocketBench/MySocketBench.csproj -c Release" \ + --docker-image mcr.microsoft.com/dotnet/runtime-deps:9.0 +EOF +} + +output_dir="" +test_command="" +workload_command="" +docker_image="" +perf_iterations=3 + +while [[ $# -gt 0 ]]; do + case "$1" in + --output-dir) + output_dir="${2:-}" + shift 2 + ;; + --test-command) + test_command="${2:-}" + shift 2 + ;; + --workload-command) + workload_command="${2:-}" + shift 2 + ;; + --docker-image) + docker_image="${2:-}" + shift 2 + ;; + --perf-iterations) + perf_iterations="${2:-}" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ -z "$output_dir" || -z "$test_command" ]]; then + usage + exit 1 +fi + +timestamp="$(date -u +%Y%m%dT%H%M%SZ)" +run_dir="${output_dir%/}/${timestamp}" +mkdir -p "$run_dir" + +summary_file="$run_dir/summary.md" +touch "$summary_file" + +log() { + printf '[%s] %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*" +} + +write_summary_line() { + printf '%s\n' "$*" >> "$summary_file" +} + +run_test_case() { + local name="$1" + shift + local log_file="$run_dir/${name}.log" + + log "Running test case: $name" + { + printf '# %s\n' "$name" + printf '# UTC: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" + printf '$ env %s bash -lc "%s"\n' "$*" "$test_command" + env "$@" bash -lc "$test_command" + } > "$log_file" 2>&1 && { + 
write_summary_line "- [x] ${name} (\`${log_file}\`)" + return 0 + } + + write_summary_line "- [ ] ${name} (\`${log_file}\`)" + return 1 +} + +run_strace_case() { + local log_file="$run_dir/backend_verification_strace.log" + local trace_file="$run_dir/backend_verification_strace.txt" + + if ! command -v strace >/dev/null 2>&1; then + write_summary_line "- [ ] backend_verification_strace (strace not found)" + return 0 + fi + + log "Running backend verification with strace" + { + printf '# backend_verification_strace\n' + printf '# UTC: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" + printf '$ strace -f -e trace=io_uring_setup,io_uring_enter,epoll_create1 -o "%s" env DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1 bash -lc "%s"\n' "$trace_file" "$test_command" + strace -f -e trace=io_uring_setup,io_uring_enter,epoll_create1 \ + -o "$trace_file" \ + env DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1 bash -lc "$test_command" + printf '\n# trace saved to %s\n' "$trace_file" + } > "$log_file" 2>&1 && { + write_summary_line "- [x] backend_verification_strace (\`${log_file}\`)" + return 0 + } + + write_summary_line "- [ ] backend_verification_strace (\`${log_file}\`)" + return 1 +} + +run_docker_case() { + local name="$1" + local seccomp_opt="$2" + local log_file="$run_dir/${name}.log" + + if [[ -z "$docker_image" ]]; then + write_summary_line "- [ ] ${name} (docker image not configured)" + return 0 + fi + + if !
command -v docker >/dev/null 2>&1; then + write_summary_line "- [ ] ${name} (docker not found)" + return 0 + fi + + log "Running docker seccomp case: $name" + { + printf '# %s\n' "$name" + printf '# UTC: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" + printf '$ docker run --rm %s "%s" /bin/bash -lc %q\n' "$seccomp_opt" "$docker_image" "DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1 DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TRACE=1 ${test_command}" + if [[ -n "$seccomp_opt" ]]; then + docker run --rm "$seccomp_opt" "$docker_image" /bin/bash -lc "DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1 DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TRACE=1 ${test_command}" + else + docker run --rm "$docker_image" /bin/bash -lc "DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1 DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TRACE=1 ${test_command}" + fi + } > "$log_file" 2>&1 && { + write_summary_line "- [x] ${name} (\`${log_file}\`)" + return 0 + } + + write_summary_line "- [ ] ${name} (\`${log_file}\`)" + return 1 +} + +run_perf_case() { + local mode_name="$1" + local io_uring_value="$2" + + if [[ -z "$workload_command" ]]; then + write_summary_line "- [ ] perf_${mode_name} (workload command not configured)" + return 0 + fi + + for ((i = 1; i <= perf_iterations; i++)); do + local log_file="$run_dir/perf_${mode_name}_${i}.log" + log "Running perf case: ${mode_name} iteration ${i}" + { + printf '# perf_%s_%d\n' "$mode_name" "$i" + printf '# UTC: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" + printf '$ env DOTNET_SYSTEM_NET_SOCKETS_IO_URING=%s bash -lc "%s"\n' "$io_uring_value" "$workload_command" + if [[ -x /usr/bin/time ]]; then + /usr/bin/time -v env DOTNET_SYSTEM_NET_SOCKETS_IO_URING="$io_uring_value" bash -lc "$workload_command" + else + env DOTNET_SYSTEM_NET_SOCKETS_IO_URING="$io_uring_value" bash -lc "$workload_command" + fi + } > "$log_file" 2>&1 && { + write_summary_line "- [x] perf_${mode_name}_${i} (\`${log_file}\`)" + continue + } + + write_summary_line "- [ ] perf_${mode_name}_${i} (\`${log_file}\`)" + done +} + +{ + printf '# io_uring Evidence
Summary\n\n' + printf '- UTC timestamp: `%s`\n' "$timestamp" + printf '- Host: `%s`\n' "$(uname -a)" + if [[ -f /etc/os-release ]]; then + printf '- OS:\n' + sed 's/^/ - /' /etc/os-release + fi + printf '\n## Results\n' +} > "$summary_file" + +run_test_case "functional_baseline_env0" "DOTNET_SYSTEM_NET_SOCKETS_IO_URING=0" || true +run_test_case "functional_optin_trace" "DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1" "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TRACE=1" || true +run_test_case "functional_forced_fallback" "DOTNET_SYSTEM_NET_SOCKETS_IO_URING=1" "DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_FALLBACK=1" || true +run_strace_case || true +run_docker_case "docker_default_seccomp" "" || true +run_docker_case "docker_unconfined_seccomp" "--security-opt=seccomp=unconfined" || true +run_perf_case "baseline" "0" || true +run_perf_case "optin" "1" || true + +log "Evidence collection completed. Summary: $summary_file" diff --git a/src/native/libs/System.Native/pal_networking.c b/src/native/libs/System.Native/pal_networking.c index 2ac601c8562c2d..e6ffcb4cdc6928 100644 --- a/src/native/libs/System.Native/pal_networking.c +++ b/src/native/libs/System.Native/pal_networking.c @@ -3245,6 +3245,17 @@ enum IoUringPortNotFound = INT32_MIN + 1 }; +#define IO_URING_USER_DATA_TAG_SHIFT 56 +#define IO_URING_USER_DATA_TAG_MASK 0xFFULL +#define IO_URING_USER_DATA_PAYLOAD_MASK 0x00FFFFFFFFFFFFFFULL + +typedef enum IoUringUserDataTag +{ + IoUringUserDataTag_None = 0, + IoUringUserDataTag_PollReadiness = 1, + IoUringUserDataTag_ReservedCompletion = 2 +} IoUringUserDataTag; + typedef struct SocketEventRegistration { int32_t Socket; @@ -3385,6 +3396,25 @@ static void IoUringTrace(const IoUringSocketEventPortState* state, const char* f va_end(args); } +static uint64_t EncodeIoUringUserData(IoUringUserDataTag tag, uint64_t payload) +{ + assert(tag != IoUringUserDataTag_None); + assert((payload & ~IO_URING_USER_DATA_PAYLOAD_MASK) == 0); + + return (((uint64_t)tag & IO_URING_USER_DATA_TAG_MASK) << 
IO_URING_USER_DATA_TAG_SHIFT) | payload; +} + +static IoUringUserDataTag GetIoUringUserDataTag(uint64_t userData) +{ + uint64_t tag = (userData >> IO_URING_USER_DATA_TAG_SHIFT) & IO_URING_USER_DATA_TAG_MASK; + return (IoUringUserDataTag)tag; +} + +static uint64_t GetIoUringUserDataPayload(uint64_t userData) +{ + return userData & IO_URING_USER_DATA_PAYLOAD_MASK; +} + static IoUringSocketEventPortState* FindIoUringSocketEventPortStateNoLock(int32_t ringFd) { IoUringSocketEventPortState* state = g_ioUringSocketEventPorts; @@ -3581,6 +3611,7 @@ static void AssignRegistrationRequestIdLocked( assert(state != NULL); assert(registration != NULL); assert(requestId != 0); + assert((requestId & ~IO_URING_USER_DATA_PAYLOAD_MASK) == 0); assert(registration->RequestId == 0); registration->RequestId = requestId; @@ -3719,7 +3750,7 @@ static int32_t QueueIoUringPollRemoveLocked(IoUringSocketEventPortState* state, } sqe->opcode = IORING_OP_POLL_REMOVE; - sqe->addr = requestId; + sqe->addr = EncodeIoUringUserData(IoUringUserDataTag_PollReadiness, requestId); sqe->user_data = 0; return Error_SUCCESS; @@ -3741,7 +3772,8 @@ static int32_t QueueIoUringPollAddLocked(IoUringSocketEventPortState* state, Soc uint64_t requestId; do { - requestId = ++state->NextRequestId; + state->NextRequestId++; + requestId = state->NextRequestId & IO_URING_USER_DATA_PAYLOAD_MASK; } while (requestId == 0 || FindRegistrationByRequestIdLocked(state, requestId) != NULL); @@ -3760,7 +3792,7 @@ static int32_t QueueIoUringPollAddLocked(IoUringSocketEventPortState* state, Soc sqe->len = IORING_POLL_ADD_MULTI; } #endif - sqe->user_data = requestId; + sqe->user_data = EncodeIoUringUserData(IoUringUserDataTag_PollReadiness, requestId); AssignRegistrationRequestIdLocked(state, registration, requestId); return Error_SUCCESS; @@ -3845,7 +3877,7 @@ static int32_t TryCreateSocketEventPortInnerIoUring(int32_t* port) #else state->UsesPollAddMulti = 0; #endif - state->NextRequestId = 1; + state->NextRequestId = 0; 
state->RefCount = 1; state->SocketRegistrationBucketMask = IoUringRegistrationBucketCount - 1; state->RequestIdRegistrationBucketMask = IoUringRegistrationBucketCount - 1; @@ -4116,7 +4148,19 @@ static int32_t WaitForSocketEventsInnerIoUring(int32_t port, SocketEvent* buffer if (cqe.user_data != 0) { - SocketEventRegistration* registration = FindRegistrationByRequestIdLocked(state, cqe.user_data); + IoUringUserDataTag userDataTag = GetIoUringUserDataTag(cqe.user_data); + uint64_t userDataPayload = GetIoUringUserDataPayload(cqe.user_data); + if (userDataTag != IoUringUserDataTag_PollReadiness) + { + IoUringTrace( + state, + "unexpected cqe user_data tag=%u payload=%llu", + (unsigned int)userDataTag, + (unsigned long long)userDataPayload); + continue; + } + + SocketEventRegistration* registration = FindRegistrationByRequestIdLocked(state, userDataPayload); if (registration != NULL) { uintptr_t registrationData = registration->Data; @@ -4138,7 +4182,7 @@ static int32_t WaitForSocketEventsInnerIoUring(int32_t port, SocketEvent* buffer { events = SocketEvents_SA_ERROR; state->CqeErrorCount++; - IoUringTrace(state, "cqe error res=%d request=%llu", cqe.res, (unsigned long long)cqe.user_data); + IoUringTrace(state, "cqe error res=%d request=%llu", cqe.res, (unsigned long long)userDataPayload); } if (cqe.res == -EBADF || cqe.res == -ENOENT || cqe.res == -EINVAL) From eaf664e9b054d7c706f41841456d214567b3d850 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 2026 05:16:12 +0000 Subject: [PATCH 004/258] Add Phase 2 completion-model scaffolding across native and managed layers Native: - Add exported stub functions for PrepareIoUringSend/Recv/Accept/Connect with full parameter validation, returning ENOSYS until wired to real SQE opcodes. - Add SubmitIoUringOperations (working) and IsIoUringSocketEventPort (working) exported functions. - Define IoUringCompletion struct in pal_networking.h. 
Managed interop: - Add LibraryImport declarations for all new native functions with both SafeHandle and IntPtr overloads. - Define IoUringCompletion managed struct and IsIoUringPort helper. Managed engine (SocketAsyncEngine): - Detect io_uring-backed port once at construction; cache in field. - Call SubmitIoUringBatch() at top of EventLoop to flush pending SQEs. Managed context (SocketAsyncContext): - Add IoUringPrepare() virtual on AsyncOperation with overrides for BufferMemorySendOperation (connected-socket), BufferMemoryReceiveOperation (single-buffer), AcceptOperation, and ConnectOperation. - Add atomic user-data token allocation and IoUringUserData field. Managed PAL (SocketPal.Unix.cs): - Add TryPrepareIoUringSend/Recv/Accept/Connect with fallback-on-ENOSYS convention via ConvertIoUringPrepareResult helper. Evidence script: - Add --phase2-metrics flag for completion-path counter extraction. --- .../collect-sockets-io-uring-evidence.sh | 48 ++++++ .../Unix/System.Native/Interop.SocketEvent.cs | 47 ++++++ .../Net/Sockets/SocketAsyncContext.Unix.cs | 66 ++++++++ .../Net/Sockets/SocketAsyncEngine.Unix.cs | 35 ++++ .../src/System/Net/Sockets/SocketPal.Unix.cs | 93 +++++++++++ .../libs/System.Native/pal_networking.c | 155 ++++++++++++++++++ .../libs/System.Native/pal_networking.h | 23 +++ 7 files changed, 467 insertions(+) diff --git a/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh b/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh index 0565726cc64f28..6c11888d457f97 100644 --- a/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh +++ b/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh @@ -13,6 +13,7 @@ Optional: --workload-command Perf workload command (baseline vs opt-in) --docker-image Docker image for seccomp validation --perf-iterations Workload repetitions per mode (default: 3) + --phase2-metrics Extract Phase 2 completion-path metrics from logs (if present) Example: eng/testing/io-uring/collect-sockets-io-uring-evidence.sh 
\ @@ -28,6 +29,7 @@ test_command="" workload_command="" docker_image="" perf_iterations=3 +phase2_metrics=0 while [[ $# -gt 0 ]]; do case "$1" in @@ -51,6 +53,10 @@ while [[ $# -gt 0 ]]; do perf_iterations="${2:-}" shift 2 ;; + --phase2-metrics) + phase2_metrics=1 + shift + ;; -h|--help) usage exit 0 @@ -194,6 +200,47 @@ run_perf_case() { done } +extract_phase2_metrics() { + if [[ "$phase2_metrics" != "1" ]]; then + write_summary_line "- [ ] phase2_metrics_extraction (not requested)" + return 0 + fi + + local metrics_file="$run_dir/phase2-metrics.txt" + { + printf '# phase2_metrics\n' + printf '# UTC: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" + printf '# Grepping known completion-path counters from captured logs.\n' + } > "$metrics_file" + + local found=0 + local patterns=( + "completion_cqe_count" + "completion_submit_count" + "completion_cancel_count" + "completion_partial_resubmit_count" + "buffer_pin_ns" + "buffer_pin_us" + "buffer_pin_ms" + ) + + for pattern in "${patterns[@]}"; do + if grep -R -n --include='*.log' -- "$pattern" "$run_dir" >> "$metrics_file" 2>/dev/null; then + found=1 + fi + done + + if [[ "$found" -eq 1 ]]; then + write_summary_line "- [x] phase2_metrics_extraction (\`${metrics_file}\`)" + else + { + printf '\n# No known Phase 2 metrics found in current logs.\n' + printf '# Add trace output for completion counters/buffer pin durations and rerun with --phase2-metrics.\n' + } >> "$metrics_file" + write_summary_line "- [ ] phase2_metrics_extraction (\`${metrics_file}\`)" + fi +} + { printf '# io_uring Evidence Summary\n\n' printf '- UTC timestamp: `%s`\n' "$timestamp" @@ -213,5 +260,6 @@ run_docker_case "docker_default_seccomp" "" || true run_docker_case "docker_unconfined_seccomp" "--security-opt=seccomp=unconfined" || true run_perf_case "baseline" "0" || true run_perf_case "optin" "1" || true +extract_phase2_metrics || true log "Evidence collection completed.
Summary: $summary_file" diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.cs index ef14ce83a11713..95bcc45b5f72a9 100644 --- a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.cs +++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System; +using System.Net.Sockets; using System.Runtime.InteropServices; internal static partial class Interop @@ -27,6 +28,14 @@ internal struct SocketEvent private int _padding; } + [StructLayout(LayoutKind.Sequential)] + internal struct IoUringCompletion + { + public UIntPtr UserData; + public int Result; + public uint Flags; + } + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_CreateSocketEventPort")] internal static unsafe partial Error CreateSocketEventPort(IntPtr* port); @@ -47,5 +56,43 @@ internal struct SocketEvent [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_WaitForSocketEvents")] internal static unsafe partial Error WaitForSocketEvents(IntPtr port, SocketEvent* buffer, int* count); + + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_PrepareIoUringSend")] + internal static unsafe partial Error PrepareIoUringSend(IntPtr port, SafeHandle socket, byte* buffer, int bufferLen, SocketFlags flags, UIntPtr userData); + + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_PrepareIoUringSend")] + internal static unsafe partial Error PrepareIoUringSend(IntPtr port, IntPtr socket, byte* buffer, int bufferLen, SocketFlags flags, UIntPtr userData); + + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_PrepareIoUringRecv")] + internal static unsafe partial Error PrepareIoUringRecv(IntPtr port, SafeHandle socket, byte* buffer, int bufferLen, SocketFlags flags, UIntPtr userData); + + 
[LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_PrepareIoUringRecv")] + internal static unsafe partial Error PrepareIoUringRecv(IntPtr port, IntPtr socket, byte* buffer, int bufferLen, SocketFlags flags, UIntPtr userData); + + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_PrepareIoUringAccept")] + internal static unsafe partial Error PrepareIoUringAccept(IntPtr port, SafeHandle socket, byte* socketAddress, int* socketAddressLen, UIntPtr userData); + + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_PrepareIoUringAccept")] + internal static unsafe partial Error PrepareIoUringAccept(IntPtr port, IntPtr socket, byte* socketAddress, int* socketAddressLen, UIntPtr userData); + + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_PrepareIoUringConnect")] + internal static unsafe partial Error PrepareIoUringConnect(IntPtr port, SafeHandle socket, byte* socketAddress, int socketAddressLen, UIntPtr userData); + + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_PrepareIoUringConnect")] + internal static unsafe partial Error PrepareIoUringConnect(IntPtr port, IntPtr socket, byte* socketAddress, int socketAddressLen, UIntPtr userData); + + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_SubmitIoUringOperations")] + internal static partial Error SubmitIoUringOperations(IntPtr port); + + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_IsIoUringSocketEventPort")] + internal static unsafe partial Error IsIoUringSocketEventPort(IntPtr port, int* isIoUringPort); + + internal static unsafe Error IsIoUringPort(IntPtr port, out bool isIoUringPort) + { + int isIoUring = 0; + Error error = IsIoUringSocketEventPort(port, &isIoUring); + isIoUringPort = isIoUring != 0; + return error; + } } } diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs 
b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs index 4e2e117984084c..1e98996875ac1a 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs @@ -43,6 +43,18 @@ internal sealed partial class SocketAsyncContext private BufferListReceiveOperation? _cachedBufferListReceiveOperation; private BufferMemorySendOperation? _cachedBufferMemorySendOperation; private BufferListSendOperation? _cachedBufferListSendOperation; + private static long s_nextIoUringUserData; + + private static UIntPtr AllocateIoUringUserDataToken() + { + ulong userData = unchecked((ulong)Interlocked.Increment(ref s_nextIoUringUserData)); + if (userData == 0) + { + userData = unchecked((ulong)Interlocked.Increment(ref s_nextIoUringUserData)); + } + + return (UIntPtr)userData; + } private void ReturnOperation(AcceptOperation operation) { @@ -130,6 +142,7 @@ private enum State public SocketError ErrorCode; public Memory SocketAddress; public CancellationTokenRegistration CancellationRegistration; + public UIntPtr IoUringUserData; public ManualResetEventSlim? 
Event { get; set; } @@ -144,6 +157,7 @@ public void Reset() _state = State.Waiting; Event = null; Next = this; + IoUringUserData = UIntPtr.Zero; #if DEBUG _callbackQueued = false; #endif @@ -316,6 +330,19 @@ public void DoAbort() protected abstract bool DoTryComplete(SocketAsyncContext context); + public bool TryPrepareIoUring(SocketAsyncContext context, IntPtr port) + { + UIntPtr userData = AllocateIoUringUserDataToken(); + IoUringUserData = userData; + return IoUringPrepare(context, port, userData); + } + + protected virtual bool IoUringPrepare(SocketAsyncContext context, IntPtr port, UIntPtr userData) + { + ErrorCode = SocketError.OperationNotSupported; + return true; + } + public abstract void InvokeCallback(bool allowPooling); [Conditional("SOCKETASYNCCONTEXT_TRACE")] @@ -374,6 +401,18 @@ protected override bool DoTryComplete(SocketAsyncContext context) return SocketPal.TryCompleteSendTo(context._socket, Buffer.Span, null, ref bufferIndex, ref Offset, ref Count, Flags, SocketAddress.Span, ref BytesTransferred, out ErrorCode); } + protected override bool IoUringPrepare(SocketAsyncContext context, IntPtr port, UIntPtr userData) + { + if (SocketAddress.Length != 0) + { + // SEND completion prep currently supports connected sockets only; SendTo follows readiness path. + ErrorCode = SocketError.Success; + return false; + } + + return SocketPal.TryPrepareIoUringSend(port, context._socket, Buffer.Span.Slice(Offset, Count), Flags, userData, out ErrorCode); + } + public override void InvokeCallback(bool allowPooling) { var cb = Callback!; @@ -485,6 +524,27 @@ protected override bool DoTryComplete(SocketAsyncContext context) } } + protected override bool IoUringPrepare(SocketAsyncContext context, IntPtr port, UIntPtr userData) + { + // Zero-byte readiness probes are intentionally kept on the readiness path. 
+ if (Buffer.Length == 0 && Flags == SocketFlags.None && SocketAddress.Length == 0) + { + BytesTransferred = 0; + ReceivedFlags = SocketFlags.None; + ErrorCode = SocketError.Success; + return true; + } + + if (SetReceivedFlags || SocketAddress.Length != 0) + { + // RecvFrom/recvmsg completion prep is not wired yet; fallback to readiness path. + ErrorCode = SocketError.Success; + return false; + } + + return SocketPal.TryPrepareIoUringRecv(port, context._socket, Buffer.Span, Flags, userData, out ErrorCode); + } + public override void InvokeCallback(bool allowPooling) { var cb = Callback!; @@ -632,6 +692,9 @@ protected override bool DoTryComplete(SocketAsyncContext context) return completed; } + protected override bool IoUringPrepare(SocketAsyncContext context, IntPtr port, UIntPtr userData) => + SocketPal.TryPrepareIoUringAccept(port, context._socket, SocketAddress, userData, out ErrorCode); + public override void InvokeCallback(bool allowPooling) { var cb = Callback!; @@ -668,6 +731,9 @@ protected override bool DoTryComplete(SocketAsyncContext context) return result; } + protected override bool IoUringPrepare(SocketAsyncContext context, IntPtr port, UIntPtr userData) => + SocketPal.TryPrepareIoUringConnect(port, context._socket, SocketAddress, userData, out ErrorCode); + public override void InvokeCallback(bool allowPooling) { var cb = Callback!; diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs index d2ffeb9eec45d2..02f95a23d1837c 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs @@ -86,6 +86,7 @@ private static SocketAsyncEngine[] CreateEngines() private readonly IntPtr _port; private readonly Interop.Sys.SocketEvent* _buffer; + private readonly bool _isIoUringPort; // // Queue of events generated by 
EventLoop() that would be processed by the thread pool @@ -172,6 +173,7 @@ public static void UnregisterSocket(SocketAsyncContext context) private SocketAsyncEngine() { _port = (IntPtr)(-1); + _isIoUringPort = false; try { // @@ -196,6 +198,16 @@ private SocketAsyncEngine() } } + bool isIoUringPort; + err = Interop.Sys.IsIoUringPort(_port, out isIoUringPort); + if (err != Interop.Error.SUCCESS) + { + throw new InternalException(err); + } + + // Cached once for future completion-model path selection. + _isIoUringPort = isIoUringPort; + var thread = new Thread(static s => ((SocketAsyncEngine)s!).EventLoop()) { IsBackground = true, @@ -217,6 +229,12 @@ private void EventLoop() SocketEventHandler handler = new SocketEventHandler(this); while (true) { + Interop.Error submitError = SubmitIoUringBatch(); + if (submitError != Interop.Error.SUCCESS) + { + throw new InternalException(submitError); + } + int numEvents = EventBufferCount; Interop.Error err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents); if (err != Interop.Error.SUCCESS) @@ -239,6 +257,23 @@ private void EventLoop() } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private Interop.Error SubmitIoUringBatch() + { + if (!_isIoUringPort) + { + return Interop.Error.SUCCESS; + } + + Interop.Error error = Interop.Sys.SubmitIoUringOperations(_port); + if (error == Interop.Error.ENOSYS || error == Interop.Error.ENOTSUP || error == Interop.Error.EOPNOTSUPP) + { + return Interop.Error.SUCCESS; + } + + return error; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private void EnsureWorkerScheduled() { diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.Unix.cs index f2d91f5bdd6b9e..a38560979e718c 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketPal.Unix.cs @@ -764,6 +764,99 @@ 
public static unsafe bool TryCompleteConnect(SafeSocketHandle socket, out Socket return true; } + private static bool ConvertIoUringPrepareResult(Interop.Error error, out SocketError errorCode) + { + if (error == Interop.Error.SUCCESS) + { + errorCode = SocketError.Success; + return true; + } + + // Transient queue pressure and unsupported completion mode both signal caller fallback/retry. + if (error == Interop.Error.EAGAIN || error == Interop.Error.EWOULDBLOCK || + error == Interop.Error.ENOSYS || error == Interop.Error.ENOTSUP || error == Interop.Error.EOPNOTSUPP) + { + errorCode = SocketError.Success; + return false; + } + + errorCode = GetSocketErrorForErrorCode(error); + return true; + } + + public static unsafe bool TryPrepareIoUringSend( + IntPtr port, SafeSocketHandle socket, ReadOnlySpan buffer, SocketFlags flags, UIntPtr userData, out SocketError errorCode) + { + try + { + fixed (byte* rawBuffer = buffer) + { + Interop.Error error = Interop.Sys.PrepareIoUringSend(port, socket, rawBuffer, buffer.Length, flags, userData); + return ConvertIoUringPrepareResult(error, out errorCode); + } + } + catch (ObjectDisposedException) + { + errorCode = SocketError.OperationAborted; + return true; + } + } + + public static unsafe bool TryPrepareIoUringRecv( + IntPtr port, SafeSocketHandle socket, Span buffer, SocketFlags flags, UIntPtr userData, out SocketError errorCode) + { + try + { + fixed (byte* rawBuffer = buffer) + { + Interop.Error error = Interop.Sys.PrepareIoUringRecv(port, socket, rawBuffer, buffer.Length, flags, userData); + return ConvertIoUringPrepareResult(error, out errorCode); + } + } + catch (ObjectDisposedException) + { + errorCode = SocketError.OperationAborted; + return true; + } + } + + public static unsafe bool TryPrepareIoUringAccept( + IntPtr port, SafeSocketHandle socket, Memory socketAddress, UIntPtr userData, out SocketError errorCode) + { + try + { + fixed (byte* rawSocketAddress = socketAddress.Span) + { + int socketAddressLen = 
socketAddress.Length; + Interop.Error error = Interop.Sys.PrepareIoUringAccept(port, socket, rawSocketAddress, &socketAddressLen, userData); + return ConvertIoUringPrepareResult(error, out errorCode); + } + } + catch (ObjectDisposedException) + { + errorCode = SocketError.OperationAborted; + return true; + } + } + + public static unsafe bool TryPrepareIoUringConnect( + IntPtr port, SafeSocketHandle socket, Memory socketAddress, UIntPtr userData, out SocketError errorCode) + { + try + { + fixed (byte* rawSocketAddress = socketAddress.Span) + { + Interop.Error error = Interop.Sys.PrepareIoUringConnect(port, socket, rawSocketAddress, socketAddress.Length, userData); + return ConvertIoUringPrepareResult(error, out errorCode); + } + } + catch (ObjectDisposedException) + { + errorCode = SocketError.OperationAborted; + return true; + } + } + public static bool TryCompleteReceiveFrom(SafeSocketHandle socket, Span buffer, SocketFlags flags, Span socketAddress, out int socketAddressLen, out int bytesReceived, out SocketFlags receivedFlags, out SocketError errorCode) => TryCompleteReceiveFrom(socket, buffer, null, flags, socketAddress, out socketAddressLen, out bytesReceived, out receivedFlags, out errorCode); diff --git a/src/native/libs/System.Native/pal_networking.c b/src/native/libs/System.Native/pal_networking.c index e6ffcb4cdc6928..6d52f0cfdac83c 100644 --- a/src/native/libs/System.Native/pal_networking.c +++ b/src/native/libs/System.Native/pal_networking.c @@ -4606,6 +4606,161 @@ int32_t SystemNative_WaitForSocketEvents(intptr_t port, SocketEvent* buffer, int return WaitForSocketEventsInner(fd, buffer, count); } +static int32_t ValidateIoUringPrepareParameters(int32_t portFd, int32_t socketFd, uintptr_t userData) +{ + if (portFd < 0 || socketFd < 0 || userData == 0) + { + return Error_EINVAL; + } + + return Error_SUCCESS; +} + +static int32_t PrepareIoUringOperationNotSupported(int32_t portFd, const char* operationName, uintptr_t userData) +{ +#if 
HAVE_LINUX_IO_URING_SOCKET_ENGINE + IoUringSocketEventPortState* state = AcquireIoUringSocketEventPortState(portFd); + if (state != NULL) + { + IoUringTrace( + state, + "%s preparation requested but completion-mode operations are not enabled yet user_data=%llu", + operationName, + (unsigned long long)userData); + ReleaseIoUringSocketEventPortState(state); + } +#else + (void)portFd; + (void)operationName; + (void)userData; +#endif + + return Error_ENOSYS; +} + +int32_t +SystemNative_PrepareIoUringSend(intptr_t port, intptr_t socket, void* buffer, int32_t bufferLen, int32_t flags, uintptr_t userData) +{ + (void)flags; + + int32_t portFd = ToFileDescriptor(port); + int32_t socketFd = ToFileDescriptor(socket); + int32_t validationError = ValidateIoUringPrepareParameters(portFd, socketFd, userData); + if (validationError != Error_SUCCESS) + { + return validationError; + } + + if (bufferLen < 0 || (buffer == NULL && bufferLen > 0)) + { + return Error_EINVAL; + } + + return PrepareIoUringOperationNotSupported(portFd, "send", userData); +} + +int32_t +SystemNative_PrepareIoUringRecv(intptr_t port, intptr_t socket, void* buffer, int32_t bufferLen, int32_t flags, uintptr_t userData) +{ + (void)flags; + + int32_t portFd = ToFileDescriptor(port); + int32_t socketFd = ToFileDescriptor(socket); + int32_t validationError = ValidateIoUringPrepareParameters(portFd, socketFd, userData); + if (validationError != Error_SUCCESS) + { + return validationError; + } + + if (bufferLen < 0 || (buffer == NULL && bufferLen > 0)) + { + return Error_EINVAL; + } + + return PrepareIoUringOperationNotSupported(portFd, "recv", userData); +} + +int32_t +SystemNative_PrepareIoUringAccept(intptr_t port, intptr_t socket, uint8_t* socketAddress, int32_t* socketAddressLen, uintptr_t userData) +{ + int32_t portFd = ToFileDescriptor(port); + int32_t socketFd = ToFileDescriptor(socket); + int32_t validationError = ValidateIoUringPrepareParameters(portFd, socketFd, userData); + if (validationError != 
Error_SUCCESS) + { + return validationError; + } + + if (socketAddressLen == NULL || *socketAddressLen < 0 || (socketAddress == NULL && *socketAddressLen > 0)) + { + return Error_EINVAL; + } + + return PrepareIoUringOperationNotSupported(portFd, "accept", userData); +} + +int32_t +SystemNative_PrepareIoUringConnect(intptr_t port, intptr_t socket, uint8_t* socketAddress, int32_t socketAddressLen, uintptr_t userData) +{ + int32_t portFd = ToFileDescriptor(port); + int32_t socketFd = ToFileDescriptor(socket); + int32_t validationError = ValidateIoUringPrepareParameters(portFd, socketFd, userData); + if (validationError != Error_SUCCESS) + { + return validationError; + } + + if (socketAddress == NULL || socketAddressLen < 0) + { + return Error_EINVAL; + } + + return PrepareIoUringOperationNotSupported(portFd, "connect", userData); +} + +int32_t SystemNative_SubmitIoUringOperations(intptr_t port) +{ + int32_t portFd = ToFileDescriptor(port); +#if HAVE_LINUX_IO_URING_SOCKET_ENGINE + IoUringSocketEventPortState* state = AcquireIoUringSocketEventPortState(portFd); + if (state == NULL) + { + return Error_ENOSYS; + } + + int32_t error = SubmitIoUringPendingEntriesLocked(state); + ReleaseIoUringSocketEventPortState(state); + return error; +#else + (void)portFd; + return Error_ENOSYS; +#endif +} + +int32_t SystemNative_IsIoUringSocketEventPort(intptr_t port, int32_t* isIoUringPort) +{ + if (isIoUringPort == NULL) + { + return Error_EFAULT; + } + + *isIoUringPort = 0; + +#if HAVE_LINUX_IO_URING_SOCKET_ENGINE + int32_t portFd = ToFileDescriptor(port); + IoUringSocketEventPortState* state = AcquireIoUringSocketEventPortState(portFd); + if (state != NULL) + { + *isIoUringPort = 1; + ReleaseIoUringSocketEventPortState(state); + } +#else + (void)port; +#endif + + return Error_SUCCESS; +} + int32_t SystemNative_PlatformSupportsDualModeIPv4PacketInfo(void) { #if HAVE_SUPPORT_FOR_DUAL_MODE_IPV4_PACKET_INFO diff --git a/src/native/libs/System.Native/pal_networking.h 
b/src/native/libs/System.Native/pal_networking.h index c393a3dbad4619..a49b97dd4ea3b8 100644 --- a/src/native/libs/System.Native/pal_networking.h +++ b/src/native/libs/System.Native/pal_networking.h @@ -298,6 +298,13 @@ typedef struct uint32_t Padding; // Pad out to 8-byte alignment } SocketEvent; +typedef struct +{ + uintptr_t UserData; // Operation token supplied during SQE preparation + int32_t Result; // CQE result (>= 0 bytes/handle, < 0 negated errno) + uint32_t Flags; // CQE flags from the kernel +} IoUringCompletion; + PALEXPORT int32_t SystemNative_GetHostEntryForName(const uint8_t* address, int32_t addressFamily, HostEntry* entry); PALEXPORT void SystemNative_FreeHostEntry(HostEntry* entry); @@ -419,6 +426,22 @@ PALEXPORT int32_t SystemNative_TryChangeSocketEventRegistration( PALEXPORT int32_t SystemNative_WaitForSocketEvents(intptr_t port, SocketEvent* buffer, int32_t* count); +PALEXPORT int32_t +SystemNative_PrepareIoUringSend(intptr_t port, intptr_t socket, void* buffer, int32_t bufferLen, int32_t flags, uintptr_t userData); + +PALEXPORT int32_t +SystemNative_PrepareIoUringRecv(intptr_t port, intptr_t socket, void* buffer, int32_t bufferLen, int32_t flags, uintptr_t userData); + +PALEXPORT int32_t +SystemNative_PrepareIoUringAccept(intptr_t port, intptr_t socket, uint8_t* socketAddress, int32_t* socketAddressLen, uintptr_t userData); + +PALEXPORT int32_t +SystemNative_PrepareIoUringConnect(intptr_t port, intptr_t socket, uint8_t* socketAddress, int32_t socketAddressLen, uintptr_t userData); + +PALEXPORT int32_t SystemNative_SubmitIoUringOperations(intptr_t port); + +PALEXPORT int32_t SystemNative_IsIoUringSocketEventPort(intptr_t port, int32_t* isIoUringPort); + PALEXPORT int32_t SystemNative_PlatformSupportsDualModeIPv4PacketInfo(void); PALEXPORT void SystemNative_GetDomainSocketSizes(int32_t* pathOffset, int32_t* pathSize, int32_t* addressSize); From 9244c5176083113ce6881267b0c8c5ba443cd58f Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 
2026 05:24:22 +0000 Subject: [PATCH 005/258] Add cross-thread SQE staging and Phase 2 completion-mode opt-in gate - Add DOTNET_SYSTEM_NET_SOCKETS_IO_URING_COMPLETION=1 env var as a separate opt-in for Phase 2 completion mode, layered on top of the existing Phase 1 io_uring opt-in. - Add ConcurrentQueue<AsyncOperation> _ioUringPrepareQueue on SocketAsyncEngine, drained in SubmitIoUringBatch() before flushing pending SQEs to the kernel. - Add TryEnqueueIoUringPreparation() on SocketAsyncContext to enqueue async operations for engine-thread preparation. - Enqueue from StartAsyncOperation for non-synchronous operations when completion mode is enabled. - Change AsyncOperation visibility from private to internal so the engine can reference it for the prepare queue. - Add explicit BufferListSendOperation and BufferListReceiveOperation IoUringPrepare() overrides returning false (readiness-path fallback). --- .../Net/Sockets/SocketAsyncContext.Unix.cs | 29 ++++++++++++++++++- .../Net/Sockets/SocketAsyncEngine.Unix.cs | 26 +++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs index 1e98996875ac1a..6119c4ff1b0fe0 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs @@ -120,7 +120,7 @@ private BufferListSendOperation RentBufferListSendOperation() => Interlocked.Exchange(ref _cachedBufferListSendOperation, null) ?? 
new BufferListSendOperation(this); - private abstract class AsyncOperation : IThreadPoolWorkItem + internal abstract class AsyncOperation : IThreadPoolWorkItem { private enum State { @@ -441,6 +441,13 @@ protected override bool DoTryComplete(SocketAsyncContext context) return SocketPal.TryCompleteSendTo(context._socket, default(ReadOnlySpan), Buffers, ref BufferIndex, ref Offset, ref Count, Flags, SocketAddress.Span, ref BytesTransferred, out ErrorCode); } + protected override bool IoUringPrepare(SocketAsyncContext context, IntPtr port, UIntPtr userData) + { + // Vectored completion-mode preparation is tracked separately; keep readiness fallback for now. + ErrorCode = SocketError.Success; + return false; + } + public override void InvokeCallback(bool allowPooling) { var cb = Callback!; @@ -578,6 +585,13 @@ protected override bool DoTryComplete(SocketAsyncContext context) return completed; } + protected override bool IoUringPrepare(SocketAsyncContext context, IntPtr port, UIntPtr userData) + { + // Vectored completion-mode preparation is tracked separately; keep readiness fallback for now. + ErrorCode = SocketError.Success; + return false; + } + public override void InvokeCallback(bool allowPooling) { var cb = Callback!; @@ -956,6 +970,13 @@ public bool StartAsyncOperation(SocketAsyncContext context, TOperation operation operation.CancellationRegistration = cancellationToken.UnsafeRegister(s => ((TOperation)s!).TryCancel(), operation); } + // Completion-mode staging is opt-in and currently disabled by default. + // When enabled, the engine thread will pick up queued operations and run IoUringPrepare. + if (operation.Event == null) + { + context.TryEnqueueIoUringPreparation(operation); + } + return true; case QueueState.Stopped: @@ -1349,6 +1370,12 @@ public bool PreferInlineCompletions get => _socket.PreferInlineCompletions; } + private bool TryEnqueueIoUringPreparation(AsyncOperation operation) + { + SocketAsyncEngine? 
engine = Volatile.Read(ref _asyncEngine); + return engine != null && engine.TryEnqueueIoUringPreparation(operation); + } + private bool TryRegister(out Interop.Error error) { Debug.Assert(_isHandleNonBlocking); diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs index 02f95a23d1837c..561427e65ab174 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs @@ -24,6 +24,8 @@ internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem // Setting PreferInlineCompletions allows continuations to run directly on the event thread. // PreferInlineCompletions defaults to false and can be set to true using the DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS envvar. internal static readonly bool InlineSocketCompletionsEnabled = Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_INLINE_COMPLETIONS") == "1"; + private static readonly bool s_ioUringCompletionModeEnabled = + Environment.GetEnvironmentVariable("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_COMPLETION") == "1"; private static int GetEngineCount() { @@ -87,11 +89,14 @@ private static SocketAsyncEngine[] CreateEngines() private readonly IntPtr _port; private readonly Interop.Sys.SocketEvent* _buffer; private readonly bool _isIoUringPort; + private readonly bool _ioUringCompletionModeEnabled; // // Queue of events generated by EventLoop() that would be processed by the thread pool // private readonly ConcurrentQueue _eventQueue = new ConcurrentQueue(); + private readonly ConcurrentQueue _ioUringPrepareQueue = + new ConcurrentQueue(); // This flag is used for communication between item enqueuing and workers that process the items. 
// There are two states of this flag: @@ -170,10 +175,22 @@ public static void UnregisterSocket(SocketAsyncContext context) context.GlobalContextIndex = -1; } + internal bool TryEnqueueIoUringPreparation(SocketAsyncContext.AsyncOperation operation) + { + if (!_ioUringCompletionModeEnabled) + { + return false; + } + + _ioUringPrepareQueue.Enqueue(operation); + return true; + } + private SocketAsyncEngine() { _port = (IntPtr)(-1); _isIoUringPort = false; + _ioUringCompletionModeEnabled = false; try { // @@ -207,6 +224,7 @@ private SocketAsyncEngine() // Cached once for future completion-model path selection. _isIoUringPort = isIoUringPort; + _ioUringCompletionModeEnabled = _isIoUringPort && s_ioUringCompletionModeEnabled; var thread = new Thread(static s => ((SocketAsyncEngine)s!).EventLoop()) { @@ -265,6 +283,14 @@ private Interop.Error SubmitIoUringBatch() return Interop.Error.SUCCESS; } + if (_ioUringCompletionModeEnabled) + { + while (_ioUringPrepareQueue.TryDequeue(out SocketAsyncContext.AsyncOperation? operation)) + { + operation.TryPrepareIoUring(operation.AssociatedContext, _port); + } + } + Interop.Error error = Interop.Sys.SubmitIoUringOperations(_port); if (error == Interop.Error.ENOSYS || error == Interop.Error.ENOTSUP || error == Interop.Error.EOPNOTSUPP) { From 92f509b876bf33221fb74d8695fa22b965094de5 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 2026 05:51:35 +0000 Subject: [PATCH 006/258] Add dual-buffer WaitForSocketEventsAndCompletions and cancellation-safe prepare staging Native: refactor WaitForSocketEventsInnerIoUring to dispatch CQEs into separate SocketEvent and IoUringCompletion buffers by user_data tag, with per-buffer saturation detection that preserves unconsumed CQEs in the ring. Add WaitForSocketEventsAndCompletions exported function with kqueue and no-backend fallback stubs. 
Engine: allocate IoUringCompletion buffer when completion mode is enabled, use dual-buffer wait in EventLoop, add HandleIoUringCompletions stub for CQE draining. Replace raw AsyncOperation in prepare queue with IoUringPrepareWorkItem carrying a PrepareSequence for ABA-safe staging. Context: add _ioUringPrepareSequence and _ioUringPrepareQueued fields on AsyncOperation with MarkReadyForIoUringPreparation, CancelPendingIoUringPreparation, and sequence-validated TryPrepareIoUring to prevent stale/cancelled operations from being prepared after reset or cancellation. Tests: add completion-mode coverage for fallback-when-unsupported, without-Phase1-opt-in, and forced-epoll-fallback scenarios. --- .../Unix/System.Native/Interop.SocketEvent.cs | 8 + .../Net/Sockets/SocketAsyncContext.Unix.cs | 46 ++- .../Net/Sockets/SocketAsyncEngine.Unix.cs | 78 +++- .../tests/FunctionalTests/IoUring.Unix.cs | 31 +- .../libs/System.Native/pal_networking.c | 333 ++++++++++++++---- .../libs/System.Native/pal_networking.h | 7 + 6 files changed, 417 insertions(+), 86 deletions(-) diff --git a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.cs b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.cs index 95bcc45b5f72a9..634b2ea6258c08 100644 --- a/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.cs +++ b/src/libraries/Common/src/Interop/Unix/System.Native/Interop.SocketEvent.cs @@ -57,6 +57,14 @@ internal struct IoUringCompletion [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_WaitForSocketEvents")] internal static unsafe partial Error WaitForSocketEvents(IntPtr port, SocketEvent* buffer, int* count); + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_WaitForSocketEventsAndCompletions")] + internal static unsafe partial Error WaitForSocketEventsAndCompletions( + IntPtr port, + SocketEvent* socketEventBuffer, + int* socketEventCount, + IoUringCompletion* completionBuffer, + int* 
completionCount); + [LibraryImport(Libraries.SystemNative, EntryPoint = "SystemNative_PrepareIoUringSend")] internal static unsafe partial Error PrepareIoUringSend(IntPtr port, SafeHandle socket, byte* buffer, int bufferLen, SocketFlags flags, UIntPtr userData); diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs index 6119c4ff1b0fe0..f856d342440d4a 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs @@ -132,6 +132,8 @@ private enum State } private volatile AsyncOperation.State _state; + private uint _ioUringPrepareSequence; + private int _ioUringPrepareQueued; #if DEBUG private bool _callbackQueued; // When true, the callback has been queued. @@ -158,6 +160,14 @@ public void Reset() Event = null; Next = this; IoUringUserData = UIntPtr.Zero; + uint nextPrepareSequence = unchecked(_ioUringPrepareSequence + 1); + if (nextPrepareSequence == 0) + { + nextPrepareSequence = 1; + } + + Volatile.Write(ref _ioUringPrepareSequence, nextPrepareSequence); + Volatile.Write(ref _ioUringPrepareQueued, 0); #if DEBUG _callbackQueued = false; #endif @@ -330,8 +340,32 @@ public void DoAbort() protected abstract bool DoTryComplete(SocketAsyncContext context); - public bool TryPrepareIoUring(SocketAsyncContext context, IntPtr port) + internal uint MarkReadyForIoUringPreparation() + { + uint prepareSequence = Volatile.Read(ref _ioUringPrepareSequence); + Debug.Assert(prepareSequence != 0); + Volatile.Write(ref _ioUringPrepareQueued, 1); + return prepareSequence; + } + + internal void CancelPendingIoUringPreparation(uint prepareSequence) { + if (Volatile.Read(ref _ioUringPrepareSequence) == prepareSequence) + { + Volatile.Write(ref _ioUringPrepareQueued, 0); + } + } + + public bool TryPrepareIoUring(SocketAsyncContext context, 
IntPtr port, uint prepareSequence) + { + if (prepareSequence == 0 || + Volatile.Read(ref _ioUringPrepareSequence) != prepareSequence || + Interlocked.Exchange(ref _ioUringPrepareQueued, 0) == 0 || + _state != State.Waiting) + { + return false; + } + UIntPtr userData = AllocateIoUringUserDataToken(); IoUringUserData = userData; return IoUringPrepare(context, port, userData); @@ -974,7 +1008,11 @@ public bool StartAsyncOperation(SocketAsyncContext context, TOperation operation // When enabled, the engine thread will pick up queued operations and run IoUringPrepare. if (operation.Event == null) { - context.TryEnqueueIoUringPreparation(operation); + uint prepareSequence = operation.MarkReadyForIoUringPreparation(); + if (!context.TryEnqueueIoUringPreparation(operation, prepareSequence)) + { + operation.CancelPendingIoUringPreparation(prepareSequence); + } } return true; @@ -1370,10 +1408,10 @@ public bool PreferInlineCompletions get => _socket.PreferInlineCompletions; } - private bool TryEnqueueIoUringPreparation(AsyncOperation operation) + private bool TryEnqueueIoUringPreparation(AsyncOperation operation, uint prepareSequence) { SocketAsyncEngine? 
engine = Volatile.Read(ref _asyncEngine); - return engine != null && engine.TryEnqueueIoUringPreparation(operation); + return engine != null && engine.TryEnqueueIoUringPreparation(operation, prepareSequence); } private bool TryRegister(out Interop.Error error) diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs index 561427e65ab174..f722c9380cfe9b 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs @@ -88,6 +88,7 @@ private static SocketAsyncEngine[] CreateEngines() private readonly IntPtr _port; private readonly Interop.Sys.SocketEvent* _buffer; + private readonly Interop.Sys.IoUringCompletion* _completionBuffer; private readonly bool _isIoUringPort; private readonly bool _ioUringCompletionModeEnabled; @@ -95,8 +96,8 @@ private static SocketAsyncEngine[] CreateEngines() // Queue of events generated by EventLoop() that would be processed by the thread pool // private readonly ConcurrentQueue _eventQueue = new ConcurrentQueue(); - private readonly ConcurrentQueue _ioUringPrepareQueue = - new ConcurrentQueue(); + private readonly ConcurrentQueue _ioUringPrepareQueue = + new ConcurrentQueue(); // This flag is used for communication between item enqueuing and workers that process the items. 
// There are two states of this flag: @@ -175,14 +176,26 @@ public static void UnregisterSocket(SocketAsyncContext context) context.GlobalContextIndex = -1; } - internal bool TryEnqueueIoUringPreparation(SocketAsyncContext.AsyncOperation operation) + private readonly struct IoUringPrepareWorkItem + { + public readonly SocketAsyncContext.AsyncOperation Operation; + public readonly uint PrepareSequence; + + public IoUringPrepareWorkItem(SocketAsyncContext.AsyncOperation operation, uint prepareSequence) + { + Operation = operation; + PrepareSequence = prepareSequence; + } + } + + internal bool TryEnqueueIoUringPreparation(SocketAsyncContext.AsyncOperation operation, uint prepareSequence) { if (!_ioUringCompletionModeEnabled) { return false; } - _ioUringPrepareQueue.Enqueue(operation); + _ioUringPrepareQueue.Enqueue(new IoUringPrepareWorkItem(operation, prepareSequence)); return true; } @@ -225,6 +238,15 @@ private SocketAsyncEngine() // Cached once for future completion-model path selection. 
_isIoUringPort = isIoUringPort; _ioUringCompletionModeEnabled = _isIoUringPort && s_ioUringCompletionModeEnabled; + if (_ioUringCompletionModeEnabled) + { + _completionBuffer = (Interop.Sys.IoUringCompletion*)NativeMemory.Alloc( + checked((nuint)EventBufferCount * (nuint)sizeof(Interop.Sys.IoUringCompletion))); + if (_completionBuffer == null) + { + throw new OutOfMemoryException(); + } + } var thread = new Thread(static s => ((SocketAsyncEngine)s!).EventLoop()) { @@ -254,16 +276,32 @@ private void EventLoop() } int numEvents = EventBufferCount; - Interop.Error err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents); + int numCompletions = 0; + Interop.Error err; + if (_ioUringCompletionModeEnabled) + { + numCompletions = EventBufferCount; + err = Interop.Sys.WaitForSocketEventsAndCompletions(_port, handler.Buffer, &numEvents, handler.CompletionBuffer, &numCompletions); + } + else + { + err = Interop.Sys.WaitForSocketEvents(_port, handler.Buffer, &numEvents); + } + if (err != Interop.Error.SUCCESS) { throw new InternalException(err); } - // The native shim is responsible for ensuring this condition. - Debug.Assert(numEvents > 0, $"Unexpected numEvents: {numEvents}"); + // The native shim should return at least one readiness event or completion. + Debug.Assert(numEvents > 0 || numCompletions > 0, $"Unexpected wait result: events={numEvents}, completions={numCompletions}"); + + if (numCompletions > 0) + { + handler.HandleIoUringCompletions(numCompletions); + } - if (handler.HandleSocketEvents(numEvents)) + if (numEvents > 0 && handler.HandleSocketEvents(numEvents)) { EnsureWorkerScheduled(); } @@ -285,9 +323,10 @@ private Interop.Error SubmitIoUringBatch() if (_ioUringCompletionModeEnabled) { - while (_ioUringPrepareQueue.TryDequeue(out SocketAsyncContext.AsyncOperation? 
operation)) + while (_ioUringPrepareQueue.TryDequeue(out IoUringPrepareWorkItem workItem)) { - operation.TryPrepareIoUring(operation.AssociatedContext, _port); + SocketAsyncContext.AsyncOperation operation = workItem.Operation; + operation.TryPrepareIoUring(operation.AssociatedContext, _port, workItem.PrepareSequence); } } @@ -361,6 +400,12 @@ private void FreeNativeResources() { Interop.Sys.FreeSocketEventBuffer(_buffer); } + + if (_completionBuffer != null) + { + NativeMemory.Free(_completionBuffer); + } + if (_port != (IntPtr)(-1)) { Interop.Sys.CloseSocketEventPort(_port); @@ -375,15 +420,28 @@ private void FreeNativeResources() private readonly struct SocketEventHandler { public Interop.Sys.SocketEvent* Buffer { get; } + public Interop.Sys.IoUringCompletion* CompletionBuffer { get; } private readonly ConcurrentQueue _eventQueue; public SocketEventHandler(SocketAsyncEngine engine) { Buffer = engine._buffer; + CompletionBuffer = engine._completionBuffer; _eventQueue = engine._eventQueue; } + [MethodImpl(MethodImplOptions.NoInlining)] + public void HandleIoUringCompletions(int numCompletions) + { + Debug.Assert(numCompletions > 0); + Debug.Assert(CompletionBuffer != null); + + // Completion dispatch to AsyncOperation callbacks is tracked separately. + // For now, CQEs are drained in native and observed here to keep mixed-mode waits stable. 
+ _ = new ReadOnlySpan(CompletionBuffer, numCompletions); + } + [MethodImpl(MethodImplOptions.NoInlining)] public bool HandleSocketEvents(int numEvents) { diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs index 5f6b961714f976..918f31523656d2 100644 --- a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs @@ -13,7 +13,10 @@ namespace System.Net.Sockets.Tests { public class IoUring { - private static RemoteInvokeOptions CreateSocketEngineOptions(string? ioUringValue = "1", bool forceFallback = false) + private static RemoteInvokeOptions CreateSocketEngineOptions( + string? ioUringValue = "1", + bool forceFallback = false, + bool completionMode = false) { RemoteInvokeOptions options = new RemoteInvokeOptions(); if (ioUringValue is not null) @@ -21,6 +24,11 @@ private static RemoteInvokeOptions CreateSocketEngineOptions(string? ioUringValu options.StartInfo.EnvironmentVariables.Add("DOTNET_SYSTEM_NET_SOCKETS_IO_URING", ioUringValue); } + if (completionMode) + { + options.StartInfo.EnvironmentVariables.Add("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_COMPLETION", "1"); + } + if (forceFallback) { options.StartInfo.EnvironmentVariables.Add("DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TEST_FORCE_FALLBACK", "1"); @@ -81,6 +89,13 @@ public static async Task IoUringOptIn_DoesNotBreakAsyncSocketWorkflows() await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(64), CreateSocketEngineOptions()).DisposeAsync(); } + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionModeOptIn_FallsBackWhenOpsAreUnsupported() + { + await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(64), CreateSocketEngineOptions(completionMode: true)).DisposeAsync(); + } + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. public static async Task SocketEngine_DefaultOptOut_DoesNotBreakAsyncSocketWorkflows() @@ -99,6 +114,13 @@ public static async Task SocketEngine_KillSwitchZero_DoesNotBreakAsyncSocketWork await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(ioUringValue: "0")).DisposeAsync(); } + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_WithoutPhase1OptIn_DoesNotChangeBehavior() + { + await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(ioUringValue: null, completionMode: true)).DisposeAsync(); + } + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. public static async Task IoUringOptIn_UdpSendReceive_Works() @@ -267,6 +289,13 @@ public static async Task IoUringOptIn_ForcedFallbackToEpoll_StillWorks() await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(forceFallback: true)).DisposeAsync(); } + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_ForcedFallbackToEpoll_StillWorks() + { + await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(forceFallback: true, completionMode: true)).DisposeAsync(); + } + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. public static async Task IoUringOptIn_ConcurrentCloseWithPendingReceive_DoesNotHang() diff --git a/src/native/libs/System.Native/pal_networking.c b/src/native/libs/System.Native/pal_networking.c index 6d52f0cfdac83c..8e1cbd18dd247a 100644 --- a/src/native/libs/System.Native/pal_networking.c +++ b/src/native/libs/System.Native/pal_networking.c @@ -4115,11 +4115,18 @@ static int32_t TryChangeSocketEventRegistrationInnerIoUring( return error; } -static int32_t WaitForSocketEventsInnerIoUring(int32_t port, SocketEvent* buffer, int32_t* count) -{ - assert(buffer != NULL); - assert(count != NULL); - assert(*count >= 0); +static int32_t +WaitForSocketEventsInnerIoUring( + int32_t port, + SocketEvent* socketEventBuffer, + int32_t* socketEventCount, + IoUringCompletion* completionBuffer, + int32_t* completionCount) +{ + assert(socketEventBuffer != NULL); + assert(socketEventCount != NULL); + assert(*socketEventCount >= 0); + assert(completionCount == NULL || *completionCount >= 0); IoUringSocketEventPortState* state = AcquireIoUringSocketEventPortState(port); if (state == NULL) @@ -4127,10 +4134,17 @@ static int32_t WaitForSocketEventsInnerIoUring(int32_t port, SocketEvent* buffer return IoUringPortNotFound; } - int32_t produced = 0; - int32_t maxEvents = *count; - if (maxEvents == 0) + int32_t producedSocketEvents = 0; + int32_t producedCompletions = 0; + int32_t maxSocketEvents = *socketEventCount; + int32_t maxCompletions = completionCount != NULL ? 
*completionCount : 0; + if (maxSocketEvents == 0 && maxCompletions == 0) { + if (completionCount != NULL) + { + *completionCount = 0; + } + ReleaseIoUringSocketEventPortState(state); return Error_SUCCESS; } @@ -4140,88 +4154,169 @@ static int32_t WaitForSocketEventsInnerIoUring(int32_t port, SocketEvent* buffer uint32_t cqHead = __atomic_load_n(state->CqHead, __ATOMIC_ACQUIRE); uint32_t cqTail = __atomic_load_n(state->CqTail, __ATOMIC_ACQUIRE); - if (cqHead != cqTail && produced < maxEvents) + if (cqHead != cqTail) { struct io_uring_cqe cqe = state->Cqes[cqHead & *state->CqMask]; - cqHead++; - __atomic_store_n(state->CqHead, cqHead, __ATOMIC_RELEASE); + IoUringUserDataTag userDataTag = IoUringUserDataTag_None; + uint64_t userDataPayload = 0; if (cqe.user_data != 0) { - IoUringUserDataTag userDataTag = GetIoUringUserDataTag(cqe.user_data); - uint64_t userDataPayload = GetIoUringUserDataPayload(cqe.user_data); - if (userDataTag != IoUringUserDataTag_PollReadiness) + userDataTag = GetIoUringUserDataTag(cqe.user_data); + userDataPayload = GetIoUringUserDataPayload(cqe.user_data); + } + + int8_t completionBufferFullForTag = + (userDataTag == IoUringUserDataTag_PollReadiness && + maxSocketEvents > 0 && + producedSocketEvents >= maxSocketEvents) || + (userDataTag == IoUringUserDataTag_ReservedCompletion && + completionCount != NULL && + maxCompletions > 0 && + producedCompletions >= maxCompletions); + + if (!completionBufferFullForTag) + { + cqHead++; + __atomic_store_n(state->CqHead, cqHead, __ATOMIC_RELEASE); + } + + if (completionBufferFullForTag) + { + // Caller-provided buffer for this CQE type is full. Return what we have so far. + // Keep the CQE in the ring for a follow-up call with more capacity. 
+ int32_t submitError = SubmitIoUringPendingEntriesLocked(state); + if (submitError != Error_SUCCESS) { - IoUringTrace( - state, - "unexpected cqe user_data tag=%u payload=%llu", - (unsigned int)userDataTag, - (unsigned long long)userDataPayload); - continue; + ReleaseIoUringSocketEventPortState(state); + *socketEventCount = 0; + if (completionCount != NULL) + { + *completionCount = 0; + } + + return submitError; + } + + if (producedSocketEvents > 0 || producedCompletions > 0) + { + ReleaseIoUringSocketEventPortState(state); + *socketEventCount = producedSocketEvents; + if (completionCount != NULL) + { + *completionCount = producedCompletions; + } + + return Error_SUCCESS; } - SocketEventRegistration* registration = FindRegistrationByRequestIdLocked(state, userDataPayload); - if (registration != NULL) + ReleaseIoUringSocketEventPortState(state); + *socketEventCount = producedSocketEvents; + if (completionCount != NULL) { - uintptr_t registrationData = registration->Data; - int8_t removeRegistration = 0; - int8_t pollStillArmed = 0; + *completionCount = producedCompletions; + } + + return Error_SUCCESS; + } - SocketEvents events = SocketEvents_SA_NONE; - if (cqe.res >= 0) + if (cqe.user_data != 0) + { + if (userDataTag == IoUringUserDataTag_PollReadiness) + { + SocketEventRegistration* registration = FindRegistrationByRequestIdLocked(state, userDataPayload); + if (registration != NULL) { - events = GetSocketEventsFromIoUringPollResult(cqe.res); + uintptr_t registrationData = registration->Data; + int8_t removeRegistration = 0; + int8_t pollStillArmed = 0; + + SocketEvents events = SocketEvents_SA_NONE; + if (cqe.res >= 0) + { + events = GetSocketEventsFromIoUringPollResult(cqe.res); #if HAVE_IO_URING_POLL_ADD_MULTI - if (state->UsesPollAddMulti && (cqe.flags & IORING_CQE_F_MORE) != 0) + if (state->UsesPollAddMulti && (cqe.flags & IORING_CQE_F_MORE) != 0) + { + pollStillArmed = 1; + } +#endif + } + else if (cqe.res != -ECANCELED && cqe.res != -ENOENT) { - 
pollStillArmed = 1; + events = SocketEvents_SA_ERROR; + state->CqeErrorCount++; + IoUringTrace(state, "cqe error res=%d request=%llu", cqe.res, (unsigned long long)userDataPayload); } -#endif - } - else if (cqe.res != -ECANCELED && cqe.res != -ENOENT) - { - events = SocketEvents_SA_ERROR; - state->CqeErrorCount++; - IoUringTrace(state, "cqe error res=%d request=%llu", cqe.res, (unsigned long long)userDataPayload); - } - if (cqe.res == -EBADF || cqe.res == -ENOENT || cqe.res == -EINVAL) - { - removeRegistration = 1; - } + if (cqe.res == -EBADF || cqe.res == -ENOENT || cqe.res == -EINVAL) + { + removeRegistration = 1; + } - if (!pollStillArmed || removeRegistration) - { - ClearRegistrationRequestIdLocked(state, registration); - } + if (!pollStillArmed || removeRegistration) + { + ClearRegistrationRequestIdLocked(state, registration); + } - if (events != SocketEvents_SA_NONE) - { - memset(&buffer[produced], 0, sizeof(SocketEvent)); - buffer[produced].Data = registrationData; - buffer[produced].Events = events; - produced++; - } + if (events != SocketEvents_SA_NONE && producedSocketEvents < maxSocketEvents) + { + memset(&socketEventBuffer[producedSocketEvents], 0, sizeof(SocketEvent)); + socketEventBuffer[producedSocketEvents].Data = registrationData; + socketEventBuffer[producedSocketEvents].Events = events; + producedSocketEvents++; + } - if (!removeRegistration && - !pollStillArmed && - registration->Events != SocketEvents_SA_NONE && - (cqe.res >= 0 || (cqe.res != -EBADF && cqe.res != -ENOENT && cqe.res != -EINVAL))) - { - state->PollRearmCount++; - int32_t error = QueueIoUringPollAddLocked(state, registration); - if (error != Error_SUCCESS) + if (!removeRegistration && + !pollStillArmed && + registration->Events != SocketEvents_SA_NONE && + (cqe.res >= 0 || (cqe.res != -EBADF && cqe.res != -ENOENT && cqe.res != -EINVAL))) + { + state->PollRearmCount++; + int32_t error = QueueIoUringPollAddLocked(state, registration); + if (error != Error_SUCCESS) + { + 
ReleaseIoUringSocketEventPortState(state); + *socketEventCount = 0; + if (completionCount != NULL) + { + *completionCount = 0; + } + + return error; + } + } + else if (removeRegistration) { - ReleaseIoUringSocketEventPortState(state); - *count = 0; - return error; + RemoveRegistrationLocked(state, registration); } } - else if (removeRegistration) + } + else if (userDataTag == IoUringUserDataTag_ReservedCompletion) + { + if (completionCount != NULL && completionBuffer != NULL && producedCompletions < maxCompletions) + { + completionBuffer[producedCompletions].UserData = (uintptr_t)userDataPayload; + completionBuffer[producedCompletions].Result = cqe.res; + completionBuffer[producedCompletions].Flags = cqe.flags; + producedCompletions++; + } + else { - RemoveRegistrationLocked(state, registration); + IoUringTrace( + state, + "dropping reserved completion user_data=%llu because completion buffer is unavailable", + (unsigned long long)userDataPayload); } } + else + { + IoUringTrace( + state, + "unexpected cqe user_data tag=%u payload=%llu", + (unsigned int)userDataTag, + (unsigned long long)userDataPayload); + } } } @@ -4229,14 +4324,24 @@ static int32_t WaitForSocketEventsInnerIoUring(int32_t port, SocketEvent* buffer if (submitError != Error_SUCCESS) { ReleaseIoUringSocketEventPortState(state); - *count = 0; + *socketEventCount = 0; + if (completionCount != NULL) + { + *completionCount = 0; + } + return submitError; } - if (produced > 0) + if (producedSocketEvents > 0 || producedCompletions > 0) { ReleaseIoUringSocketEventPortState(state); - *count = produced; + *socketEventCount = producedSocketEvents; + if (completionCount != NULL) + { + *completionCount = producedCompletions; + } + return Error_SUCCESS; } @@ -4256,7 +4361,12 @@ static int32_t WaitForSocketEventsInnerIoUring(int32_t port, SocketEvent* buffer int32_t error = SystemNative_ConvertErrorPlatformToPal(errno); IoUringTrace(state, "getevents failed: errno=%d", errno); 
ReleaseIoUringSocketEventPortState(state); - *count = 0; + *socketEventCount = 0; + if (completionCount != NULL) + { + *completionCount = 0; + } + return error; } } @@ -4306,7 +4416,7 @@ static int32_t TryChangeSocketEventRegistrationInner( static int32_t WaitForSocketEventsInner(int32_t port, SocketEvent* buffer, int32_t* count) { #if HAVE_LINUX_IO_URING_SOCKET_ENGINE - int32_t ioUringError = WaitForSocketEventsInnerIoUring(port, buffer, count); + int32_t ioUringError = WaitForSocketEventsInnerIoUring(port, buffer, count, NULL, NULL); if (ioUringError != IoUringPortNotFound) { return ioUringError; @@ -4316,6 +4426,31 @@ static int32_t WaitForSocketEventsInner(int32_t port, SocketEvent* buffer, int32 return WaitForSocketEventsInnerEpoll(port, buffer, count); } +static int32_t +WaitForSocketEventsAndCompletionsInner( + int32_t port, + SocketEvent* socketEventBuffer, + int32_t* socketEventCount, + IoUringCompletion* completionBuffer, + int32_t* completionCount) +{ +#if HAVE_LINUX_IO_URING_SOCKET_ENGINE + int32_t ioUringError = + WaitForSocketEventsInnerIoUring(port, socketEventBuffer, socketEventCount, completionBuffer, completionCount); + if (ioUringError != IoUringPortNotFound) + { + return ioUringError; + } +#endif + + if (completionCount != NULL) + { + *completionCount = 0; + } + + return WaitForSocketEventsInnerEpoll(port, socketEventBuffer, socketEventCount); +} + #elif HAVE_KQUEUE c_static_assert(sizeof(SocketEvent) <= sizeof(struct kevent)); @@ -4477,6 +4612,23 @@ static int32_t WaitForSocketEventsInner(int32_t port, SocketEvent* buffer, int32 return Error_SUCCESS; } +static int32_t +WaitForSocketEventsAndCompletionsInner( + int32_t port, + SocketEvent* socketEventBuffer, + int32_t* socketEventCount, + IoUringCompletion* completionBuffer, + int32_t* completionCount) +{ + (void)completionBuffer; + if (completionCount != NULL) + { + *completionCount = 0; + } + + return WaitForSocketEventsInner(port, socketEventBuffer, socketEventCount); +} + #else // 
!HAVE_KQUEUE !HAVE_EPOLL static const size_t SocketEventBufferElementSize = 0; @@ -4499,6 +4651,23 @@ static int32_t WaitForSocketEventsInner(int32_t port, SocketEvent* buffer, int32 { return Error_ENOSYS; } + +static int32_t +WaitForSocketEventsAndCompletionsInner( + int32_t port, + SocketEvent* socketEventBuffer, + int32_t* socketEventCount, + IoUringCompletion* completionBuffer, + int32_t* completionCount) +{ + (void)completionBuffer; + if (completionCount != NULL) + { + *completionCount = 0; + } + + return WaitForSocketEventsInner(port, socketEventBuffer, socketEventCount); +} #endif // !HAVE_KQUEUE !HAVE_EPOLL #if defined(TARGET_WASI) @@ -4606,6 +4775,28 @@ int32_t SystemNative_WaitForSocketEvents(intptr_t port, SocketEvent* buffer, int return WaitForSocketEventsInner(fd, buffer, count); } +int32_t +SystemNative_WaitForSocketEventsAndCompletions( + intptr_t port, + SocketEvent* socketEventBuffer, + int32_t* socketEventCount, + IoUringCompletion* completionBuffer, + int32_t* completionCount) +{ + if (socketEventBuffer == NULL || + socketEventCount == NULL || + *socketEventCount < 0 || + completionCount == NULL || + *completionCount < 0 || + (completionBuffer == NULL && *completionCount > 0)) + { + return Error_EFAULT; + } + + int fd = ToFileDescriptor(port); + return WaitForSocketEventsAndCompletionsInner(fd, socketEventBuffer, socketEventCount, completionBuffer, completionCount); +} + static int32_t ValidateIoUringPrepareParameters(int32_t portFd, int32_t socketFd, uintptr_t userData) { if (portFd < 0 || socketFd < 0 || userData == 0) diff --git a/src/native/libs/System.Native/pal_networking.h b/src/native/libs/System.Native/pal_networking.h index a49b97dd4ea3b8..33cd14ab986160 100644 --- a/src/native/libs/System.Native/pal_networking.h +++ b/src/native/libs/System.Native/pal_networking.h @@ -425,6 +425,13 @@ PALEXPORT int32_t SystemNative_TryChangeSocketEventRegistration( intptr_t port, intptr_t socket, int32_t currentEvents, int32_t newEvents, uintptr_t 
data); PALEXPORT int32_t SystemNative_WaitForSocketEvents(intptr_t port, SocketEvent* buffer, int32_t* count); +PALEXPORT int32_t +SystemNative_WaitForSocketEventsAndCompletions( + intptr_t port, + SocketEvent* socketEventBuffer, + int32_t* socketEventCount, + IoUringCompletion* completionBuffer, + int32_t* completionCount); PALEXPORT int32_t SystemNative_PrepareIoUringSend(intptr_t port, intptr_t socket, void* buffer, int32_t bufferLen, int32_t flags, uintptr_t userData); From e5661aff4bf8c813839b18e69e40e2802732136a Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 2026 06:05:29 +0000 Subject: [PATCH 007/258] Add user_data lifecycle map, fix zero-capacity CQE loss, and add cancel-while-enqueued stress test Native: return EINVAL when ReservedCompletion CQE arrives with no completion buffer (old API safety); remove maxEvents > 0 guards so zero-capacity buffers trigger immediate buffer-full instead of consuming and dropping CQEs; replace unreachable dead-code return with assert; reject non-positive buffer capacities in WaitForSocketEventsAndCompletions. Engine: add ConcurrentDictionary lifecycle map for tracking prepared completion operations by user_data token. Insert on successful prepare in SubmitIoUringBatch, remove on CQE arrival in HandleIoUringCompletions, and remove on prepare failure with cleanup. TryUntrackIoUringOperation supports expectedOperation guard to prevent ABA removal. Context: wire TryUntrackIoUringOperation into ProcessCancellation and DoAbort paths so tracked operations are removed from the lifecycle map on cancel/abort. Add ClearIoUringUserData helper. Tests: add IoUringCompletionMode_RapidCancelWhileEnqueued stress test (8 workers x 128 cancel iterations) validating ABA-safe prepare sequencing and post-cancellation socket integrity. 
--- .../Net/Sockets/SocketAsyncContext.Unix.cs | 21 +++++ .../Net/Sockets/SocketAsyncEngine.Unix.cs | 78 ++++++++++++++++++- .../tests/FunctionalTests/IoUring.Unix.cs | 55 +++++++++++++ .../libs/System.Native/pal_networking.c | 43 +++++----- 4 files changed, 173 insertions(+), 24 deletions(-) diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs index f856d342440d4a..49504a93228eca 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncContext.Unix.cs @@ -269,6 +269,8 @@ public void ProcessCancellation() Debug.Assert(_state == State.Canceled); + AssociatedContext.TryUntrackIoUringOperation(this); + ErrorCode = SocketError.OperationAborted; ManualResetEventSlim? e = Event; @@ -335,6 +337,7 @@ void IThreadPoolWorkItem.Execute() // Called when op is not in the queue yet, so can't be otherwise executing public void DoAbort() { + AssociatedContext.TryUntrackIoUringOperation(this); ErrorCode = SocketError.OperationAborted; } @@ -371,6 +374,11 @@ public bool TryPrepareIoUring(SocketAsyncContext context, IntPtr port, uint prep return IoUringPrepare(context, port, userData); } + public void ClearIoUringUserData() + { + IoUringUserData = UIntPtr.Zero; + } + protected virtual bool IoUringPrepare(SocketAsyncContext context, IntPtr port, UIntPtr userData) { ErrorCode = SocketError.OperationNotSupported; @@ -1414,6 +1422,19 @@ private bool TryEnqueueIoUringPreparation(AsyncOperation operation, uint prepare return engine != null && engine.TryEnqueueIoUringPreparation(operation, prepareSequence); } + internal void TryUntrackIoUringOperation(AsyncOperation operation) + { + UIntPtr userData = operation.IoUringUserData; + if (userData == UIntPtr.Zero) + { + return; + } + + SocketAsyncEngine? 
engine = Volatile.Read(ref _asyncEngine); + engine?.TryUntrackIoUringOperation(userData, operation); + operation.ClearIoUringUserData(); + } + private bool TryRegister(out Interop.Error error) { Debug.Assert(_isHandleNonBlocking); diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs index f722c9380cfe9b..04b3e10b80752e 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs @@ -98,6 +98,7 @@ private static SocketAsyncEngine[] CreateEngines() private readonly ConcurrentQueue _eventQueue = new ConcurrentQueue(); private readonly ConcurrentQueue _ioUringPrepareQueue = new ConcurrentQueue(); + private readonly ConcurrentDictionary? _ioUringOperationsByUserData; // This flag is used for communication between item enqueuing and workers that process the items. 
// There are two states of this flag: @@ -240,6 +241,7 @@ private SocketAsyncEngine() _ioUringCompletionModeEnabled = _isIoUringPort && s_ioUringCompletionModeEnabled; if (_ioUringCompletionModeEnabled) { + _ioUringOperationsByUserData = new ConcurrentDictionary(); _completionBuffer = (Interop.Sys.IoUringCompletion*)NativeMemory.Alloc( checked((nuint)EventBufferCount * (nuint)sizeof(Interop.Sys.IoUringCompletion))); if (_completionBuffer == null) @@ -326,7 +328,20 @@ private Interop.Error SubmitIoUringBatch() while (_ioUringPrepareQueue.TryDequeue(out IoUringPrepareWorkItem workItem)) { SocketAsyncContext.AsyncOperation operation = workItem.Operation; - operation.TryPrepareIoUring(operation.AssociatedContext, _port, workItem.PrepareSequence); + bool prepared = operation.TryPrepareIoUring(operation.AssociatedContext, _port, workItem.PrepareSequence); + if (prepared && operation.ErrorCode == SocketError.Success) + { + if (!TryTrackPreparedIoUringOperation(operation)) + { + operation.ClearIoUringUserData(); + return Interop.Error.EINVAL; + } + } + else + { + TryUntrackIoUringOperation(operation.IoUringUserData, operation); + operation.ClearIoUringUserData(); + } } } @@ -339,6 +354,46 @@ private Interop.Error SubmitIoUringBatch() return error; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool TryTrackPreparedIoUringOperation(SocketAsyncContext.AsyncOperation operation) + { + ConcurrentDictionary? map = _ioUringOperationsByUserData; + if (map == null) + { + return false; + } + + UIntPtr userData = operation.IoUringUserData; + return userData != UIntPtr.Zero && map.TryAdd((nuint)userData, operation); + } + + internal void TryUntrackIoUringOperation(UIntPtr userData, SocketAsyncContext.AsyncOperation? expectedOperation = null) + { + if (userData == UIntPtr.Zero) + { + return; + } + + ConcurrentDictionary? 
map = _ioUringOperationsByUserData; + if (map == null) + { + return; + } + + nuint key = (nuint)userData; + if (expectedOperation == null) + { + map.TryRemove(key, out _); + return; + } + + if (map.TryGetValue(key, out SocketAsyncContext.AsyncOperation? trackedOperation) && + ReferenceEquals(trackedOperation, expectedOperation)) + { + map.TryRemove(key, out _); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private void EnsureWorkerScheduled() { @@ -423,12 +478,14 @@ private readonly struct SocketEventHandler public Interop.Sys.IoUringCompletion* CompletionBuffer { get; } private readonly ConcurrentQueue _eventQueue; + private readonly ConcurrentDictionary? _ioUringOperationsByUserData; public SocketEventHandler(SocketAsyncEngine engine) { Buffer = engine._buffer; CompletionBuffer = engine._completionBuffer; _eventQueue = engine._eventQueue; + _ioUringOperationsByUserData = engine._ioUringOperationsByUserData; } [MethodImpl(MethodImplOptions.NoInlining)] @@ -437,9 +494,22 @@ public void HandleIoUringCompletions(int numCompletions) Debug.Assert(numCompletions > 0); Debug.Assert(CompletionBuffer != null); - // Completion dispatch to AsyncOperation callbacks is tracked separately. - // For now, CQEs are drained in native and observed here to keep mixed-mode waits stable. - _ = new ReadOnlySpan(CompletionBuffer, numCompletions); + ReadOnlySpan completions = + new ReadOnlySpan(CompletionBuffer, numCompletions); + + ConcurrentDictionary? 
map = _ioUringOperationsByUserData; + if (map == null) + { + return; + } + + foreach (Interop.Sys.IoUringCompletion completion in completions) + { + if (completion.UserData != UIntPtr.Zero) + { + map.TryRemove((nuint)completion.UserData, out _); + } + } } [MethodImpl(MethodImplOptions.NoInlining)] diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs index 918f31523656d2..cebfcf05522106 100644 --- a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs @@ -296,6 +296,61 @@ public static async Task IoUringCompletionMode_ForcedFallbackToEpoll_StillWorks( await RemoteExecutor.Invoke(static () => RunTcpRoundTripAsync(32), CreateSocketEngineOptions(forceFallback: true, completionMode: true)).DisposeAsync(); } + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringCompletionMode_RapidCancelWhileEnqueued_DoesNotCorruptState() + { + await RemoteExecutor.Invoke(static async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(1); + + using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + Task acceptTask = listener.AcceptAsync(); + await client.ConnectAsync((IPEndPoint)listener.LocalEndPoint!); + using Socket server = await acceptTask; + + const int WorkerCount = 8; + const int IterationsPerWorker = 128; + var tasks = new Task[WorkerCount]; + + for (int worker = 0; worker < WorkerCount; worker++) + { + tasks[worker] = Task.Run(async () => + { + byte[] receiveBuffer = new byte[1]; + for (int i = 0; i < IterationsPerWorker; i++) + { + using var cts = new CancellationTokenSource(); + ValueTask receiveTask = server.ReceiveAsync(receiveBuffer.AsMemory(), SocketFlags.None, cts.Token); + cts.Cancel(); + + Exception? ex = await Record.ExceptionAsync(async () => await receiveTask); + Assert.NotNull(ex); + Assert.True( + ex is OperationCanceledException || + ex is SocketException socketException && + (socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted), + $"Unexpected exception: {ex}"); + } + }); + } + + await Task.WhenAll(tasks); + + // Ensure socket state still allows normal async flow after rapid cancellation churn. 
+ byte[] payload = new byte[] { 0xA5 }; + int sent = await client.SendAsync(payload, SocketFlags.None); + Assert.Equal(1, sent); + int received = await server.ReceiveAsync(payload, SocketFlags.None); + Assert.Equal(1, received); + Assert.Equal(0xA5, payload[0]); + }, CreateSocketEngineOptions(completionMode: true)).DisposeAsync(); + } + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. public static async Task IoUringOptIn_ConcurrentCloseWithPendingReceive_DoesNotHang() diff --git a/src/native/libs/System.Native/pal_networking.c b/src/native/libs/System.Native/pal_networking.c index 8e1cbd18dd247a..31688d980f49f6 100644 --- a/src/native/libs/System.Native/pal_networking.c +++ b/src/native/libs/System.Native/pal_networking.c @@ -4166,13 +4166,22 @@ WaitForSocketEventsInnerIoUring( userDataPayload = GetIoUringUserDataPayload(cqe.user_data); } + if (userDataTag == IoUringUserDataTag_ReservedCompletion && completionCount == NULL) + { + IoUringTrace( + state, + "reserved completion user_data=%llu cannot be handled without completion buffer", + (unsigned long long)userDataPayload); + ReleaseIoUringSocketEventPortState(state); + *socketEventCount = 0; + return Error_EINVAL; + } + int8_t completionBufferFullForTag = (userDataTag == IoUringUserDataTag_PollReadiness && - maxSocketEvents > 0 && producedSocketEvents >= maxSocketEvents) || (userDataTag == IoUringUserDataTag_ReservedCompletion && completionCount != NULL && - maxCompletions > 0 && producedCompletions >= maxCompletions); if (!completionBufferFullForTag) @@ -4198,18 +4207,7 @@ WaitForSocketEventsInnerIoUring( return submitError; } - if (producedSocketEvents > 0 || producedCompletions > 0) - { - ReleaseIoUringSocketEventPortState(state); - *socketEventCount = producedSocketEvents; - if (completionCount != NULL) - { - *completionCount = producedCompletions; - } - - return Error_SUCCESS; - } - + 
assert(producedSocketEvents > 0 || producedCompletions > 0); ReleaseIoUringSocketEventPortState(state); *socketEventCount = producedSocketEvents; if (completionCount != NULL) @@ -4783,12 +4781,17 @@ SystemNative_WaitForSocketEventsAndCompletions( IoUringCompletion* completionBuffer, int32_t* completionCount) { - if (socketEventBuffer == NULL || - socketEventCount == NULL || - *socketEventCount < 0 || - completionCount == NULL || - *completionCount < 0 || - (completionBuffer == NULL && *completionCount > 0)) + if (socketEventBuffer == NULL || socketEventCount == NULL || completionCount == NULL) + { + return Error_EFAULT; + } + + if (*socketEventCount <= 0 || *completionCount <= 0) + { + return Error_EINVAL; + } + + if (completionBuffer == NULL) { return Error_EFAULT; } From c913e745a72c405f5ea0f0e6e74c1e1dfe5f0c45 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 2026 06:16:52 +0000 Subject: [PATCH 008/258] Add Debug.Fail invariant checks for lifecycle map and close/dispose stress test Engine: add Debug.Fail on tracking collision in SubmitIoUringBatch (fail-fast policy for orphaned SQEs), untrack mismatch in TryUntrackIoUringOperation, and missing CQE entry in HandleIoUringCompletions. Add Debug.Assert in FreeNativeResources verifying lifecycle map is empty on teardown. Tests: add IoUringCompletionMode_CloseDisposeStress_DoesNotHang (64 iterations, 16 concurrent receives per iteration with immediate client+server dispose, verifying expected exception types and no hangs). 
--- .../Net/Sockets/SocketAsyncEngine.Unix.cs | 18 ++++++- .../tests/FunctionalTests/IoUring.Unix.cs | 48 +++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs index 04b3e10b80752e..cfe5a89a8e2e1f 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs @@ -333,6 +333,9 @@ private Interop.Error SubmitIoUringBatch() { if (!TryTrackPreparedIoUringOperation(operation)) { + // Policy: treat a tracking collision as a fatal internal invariant violation. + // If we cannot track a prepared operation, its SQE may complete without a managed owner. + Debug.Fail("io_uring prepared operation could not be tracked by user_data."); operation.ClearIoUringUserData(); return Interop.Error.EINVAL; } @@ -392,6 +395,10 @@ internal void TryUntrackIoUringOperation(UIntPtr userData, SocketAsyncContext.As { map.TryRemove(key, out _); } + else + { + Debug.Fail("io_uring tracked operation mismatch while untracking user_data."); + } } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -456,6 +463,12 @@ private void FreeNativeResources() Interop.Sys.FreeSocketEventBuffer(_buffer); } + if (_ioUringOperationsByUserData != null) + { + Debug.Assert(_ioUringOperationsByUserData.IsEmpty, $"Leaked tracked io_uring operations: {_ioUringOperationsByUserData.Count}"); + _ioUringOperationsByUserData.Clear(); + } + if (_completionBuffer != null) { NativeMemory.Free(_completionBuffer); @@ -507,7 +520,10 @@ public void HandleIoUringCompletions(int numCompletions) { if (completion.UserData != UIntPtr.Zero) { - map.TryRemove((nuint)completion.UserData, out _); + if (!map.TryRemove((nuint)completion.UserData, out _)) + { + Debug.Fail("io_uring CQE user_data had no tracked managed operation."); 
+ } } } } diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs index cebfcf05522106..103d9a0addcee4 100644 --- a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs @@ -351,6 +351,54 @@ ex is SocketException socketException && }, CreateSocketEngineOptions(completionMode: true)).DisposeAsync(); } + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. + public static async Task IoUringCompletionMode_CloseDisposeStress_DoesNotHang() + { + await RemoteExecutor.Invoke(static async () => + { + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(32); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + for (int i = 0; i < 64; i++) + { + using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + Task acceptTask = listener.AcceptAsync(); + await client.ConnectAsync(endpoint); + using Socket server = await acceptTask; + + Task[] receives = new Task[16]; + for (int r = 0; r < receives.Length; r++) + { + receives[r] = server.ReceiveAsync(new byte[1], SocketFlags.None).AsTask(); + } + + client.Dispose(); + server.Dispose(); + + for (int r = 0; r < receives.Length; r++) + { + Exception? 
ex = await Record.ExceptionAsync(async () => await receives[r]); + if (ex is SocketException socketException) + { + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error: {socketException.SocketErrorCode}"); + } + else if (ex is not ObjectDisposedException and not null) + { + throw ex; + } + } + } + }, CreateSocketEngineOptions(completionMode: true)).DisposeAsync(); + } + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. public static async Task IoUringOptIn_ConcurrentCloseWithPendingReceive_DoesNotHang() From 614472aaefc0a9c9fa39587b57e70c599ae97cac Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 2026 06:25:18 +0000 Subject: [PATCH 009/258] Fix false-positive Debug.Fail in lifecycle map and add teardown drain TryUntrackIoUringOperation: treat key-not-found as benign (prepare fallback or cancel-after-CQE race), retain Debug.Fail only for real invariant violations (key found but maps to different operation). HandleIoUringCompletions: remove Debug.Fail on missing tracked entry (cancel/abort may have already untracked); capture removed operation and clear its IoUringUserData on successful removal. FreeNativeResources: add DrainTrackedIoUringOperationsForTeardown that iterates remaining tracked operations and performs best-effort TryCancel + ClearIoUringUserData before the leak-detection assert. 
--- .../Net/Sockets/SocketAsyncEngine.Unix.cs | 35 ++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs index cfe5a89a8e2e1f..ddf8f972da4727 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs @@ -390,8 +390,13 @@ internal void TryUntrackIoUringOperation(UIntPtr userData, SocketAsyncContext.As return; } - if (map.TryGetValue(key, out SocketAsyncContext.AsyncOperation? trackedOperation) && - ReferenceEquals(trackedOperation, expectedOperation)) + if (!map.TryGetValue(key, out SocketAsyncContext.AsyncOperation? trackedOperation)) + { + // Benign race: CQE path or cancellation path may have already removed this entry. + return; + } + + if (ReferenceEquals(trackedOperation, expectedOperation)) { map.TryRemove(key, out _); } @@ -465,6 +470,7 @@ private void FreeNativeResources() if (_ioUringOperationsByUserData != null) { + DrainTrackedIoUringOperationsForTeardown(); Debug.Assert(_ioUringOperationsByUserData.IsEmpty, $"Leaked tracked io_uring operations: {_ioUringOperationsByUserData.Count}"); _ioUringOperationsByUserData.Clear(); } @@ -480,6 +486,26 @@ private void FreeNativeResources() } } + private void DrainTrackedIoUringOperationsForTeardown() + { + ConcurrentDictionary? map = _ioUringOperationsByUserData; + if (map == null || map.IsEmpty) + { + return; + } + + foreach (KeyValuePair tracked in map) + { + if (map.TryRemove(tracked.Key, out SocketAsyncContext.AsyncOperation? operation)) + { + // Teardown policy: best-effort cancellation of still-tracked operations + // before releasing native resources. 
+ operation.TryCancel(); + operation.ClearIoUringUserData(); + } + } + } + // The JIT is allowed to arbitrarily extend the lifetime of locals, which may retain SocketAsyncContext references, // indirectly preventing Socket instances to be finalized, despite being no longer referenced by user code. // To avoid this, the event handling logic is delegated to a non-inlined processing method. @@ -520,9 +546,10 @@ public void HandleIoUringCompletions(int numCompletions) { if (completion.UserData != UIntPtr.Zero) { - if (!map.TryRemove((nuint)completion.UserData, out _)) + // Benign race: cancellation/abort paths may have already removed this tracked entry. + if (map.TryRemove((nuint)completion.UserData, out SocketAsyncContext.AsyncOperation? operation)) { - Debug.Fail("io_uring CQE user_data had no tracked managed operation."); + operation.ClearIoUringUserData(); } } } From afd4382805304ff8b7f4260c2f5d5ef1e078d893 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 2026 06:39:59 +0000 Subject: [PATCH 010/258] Extract TryRemoveTrackedOperationCore helper, add late-CQE diagnostics and contention tests Extract IoUringTrackedOperationMapHelpers with generic TryRemoveTrackedOperationCore and RemoveResult enum (Removed/NotFound/ Mismatch) used by both TryUntrackIoUringOperation and HandleIoUringCompletions, enabling reflection-based unit testing. Add _ioUringBenignLateCompletionCount counter with rate-limited NetEventSource.Info logging (every 64th occurrence) for CQEs arriving after the tracked entry was already removed by cancel/abort. Report total on teardown. Tests: add IoUringTrackedMapLifecycle_RemoveOrdering_IsDeterministic covering all three RemoveResult paths via reflection. Add IoUringOptIn_ConcurrentRegistrationChurn_DoesNotHang (8 workers x 64 iterations, alternating send/dispose with pending receive) validating concurrent TryChangeSocketEventRegistration contention. 
--- .../Net/Sockets/SocketAsyncEngine.Unix.cs | 91 ++++++++++--- .../tests/FunctionalTests/IoUring.Unix.cs | 125 ++++++++++++++++++ 2 files changed, 198 insertions(+), 18 deletions(-) diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs index ddf8f972da4727..831eaafd75794b 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs @@ -10,6 +10,48 @@ namespace System.Net.Sockets { + internal static class IoUringTrackedOperationMapHelpers + { + internal enum RemoveResult + { + Removed, + NotFound, + Mismatch + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static RemoveResult TryRemoveTrackedOperationCore( + ConcurrentDictionary map, + nuint key, + TOperation? expectedOperation, + out TOperation? removedOperation) + where TOperation : class + { + if (expectedOperation == null) + { + return map.TryRemove(key, out removedOperation) ? + RemoveResult.Removed : + RemoveResult.NotFound; + } + + if (!map.TryGetValue(key, out TOperation? trackedOperation)) + { + removedOperation = null; + return RemoveResult.NotFound; + } + + if (!ReferenceEquals(trackedOperation, expectedOperation)) + { + removedOperation = null; + return RemoveResult.Mismatch; + } + + return map.TryRemove(key, out removedOperation) ? + RemoveResult.Removed : + RemoveResult.NotFound; + } + } + internal sealed unsafe class SocketAsyncEngine : IThreadPoolWorkItem { private const int EventBufferCount = @@ -99,6 +141,7 @@ private static SocketAsyncEngine[] CreateEngines() private readonly ConcurrentQueue _ioUringPrepareQueue = new ConcurrentQueue(); private readonly ConcurrentDictionary? 
_ioUringOperationsByUserData; + private long _ioUringBenignLateCompletionCount; // This flag is used for communication between item enqueuing and workers that process the items. // There are two states of this flag: @@ -384,23 +427,9 @@ internal void TryUntrackIoUringOperation(UIntPtr userData, SocketAsyncContext.As } nuint key = (nuint)userData; - if (expectedOperation == null) - { - map.TryRemove(key, out _); - return; - } - - if (!map.TryGetValue(key, out SocketAsyncContext.AsyncOperation? trackedOperation)) - { - // Benign race: CQE path or cancellation path may have already removed this entry. - return; - } - - if (ReferenceEquals(trackedOperation, expectedOperation)) - { - map.TryRemove(key, out _); - } - else + IoUringTrackedOperationMapHelpers.RemoveResult removeResult = + IoUringTrackedOperationMapHelpers.TryRemoveTrackedOperationCore(map, key, expectedOperation, out _); + if (removeResult == IoUringTrackedOperationMapHelpers.RemoveResult.Mismatch) { Debug.Fail("io_uring tracked operation mismatch while untracking user_data."); } @@ -473,6 +502,12 @@ private void FreeNativeResources() DrainTrackedIoUringOperationsForTeardown(); Debug.Assert(_ioUringOperationsByUserData.IsEmpty, $"Leaked tracked io_uring operations: {_ioUringOperationsByUserData.Count}"); _ioUringOperationsByUserData.Clear(); + + long lateCompletionCount = Interlocked.Read(ref _ioUringBenignLateCompletionCount); + if (lateCompletionCount > 0 && NetEventSource.Log.IsEnabled()) + { + NetEventSource.Info(this, $"io_uring benign late-completion total={lateCompletionCount}"); + } } if (_completionBuffer != null) @@ -506,6 +541,16 @@ private void DrainTrackedIoUringOperationsForTeardown() } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void RecordBenignLateIoUringCompletion(UIntPtr userData) + { + long count = Interlocked.Increment(ref _ioUringBenignLateCompletionCount); + if (NetEventSource.Log.IsEnabled() && (count & 0x3F) == 1) + { + NetEventSource.Info(this, $"io_uring 
completion arrived after managed untrack: count={count}, user_data=0x{(nuint)userData:x}"); + } + } + // The JIT is allowed to arbitrarily extend the lifetime of locals, which may retain SocketAsyncContext references, // indirectly preventing Socket instances to be finalized, despite being no longer referenced by user code. // To avoid this, the event handling logic is delegated to a non-inlined processing method. @@ -518,9 +563,11 @@ private readonly struct SocketEventHandler private readonly ConcurrentQueue _eventQueue; private readonly ConcurrentDictionary? _ioUringOperationsByUserData; + private readonly SocketAsyncEngine _engine; public SocketEventHandler(SocketAsyncEngine engine) { + _engine = engine; Buffer = engine._buffer; CompletionBuffer = engine._completionBuffer; _eventQueue = engine._eventQueue; @@ -547,10 +594,18 @@ public void HandleIoUringCompletions(int numCompletions) if (completion.UserData != UIntPtr.Zero) { // Benign race: cancellation/abort paths may have already removed this tracked entry. - if (map.TryRemove((nuint)completion.UserData, out SocketAsyncContext.AsyncOperation? operation)) + if (IoUringTrackedOperationMapHelpers.TryRemoveTrackedOperationCore( + map, + (nuint)completion.UserData, + expectedOperation: null, + out SocketAsyncContext.AsyncOperation? operation) == IoUringTrackedOperationMapHelpers.RemoveResult.Removed) { operation.ClearIoUringUserData(); } + else + { + _engine.RecordBenignLateIoUringCompletion(completion.UserData); + } } } } diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs index 103d9a0addcee4..5465006fa2e484 100644 --- a/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/IoUring.Unix.cs @@ -2,8 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. 
using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Net; +using System.Reflection; using System.Threading; using System.Threading.Tasks; using Microsoft.DotNet.RemoteExecutor; @@ -13,6 +15,65 @@ namespace System.Net.Sockets.Tests { public class IoUring { + [Fact] + [PlatformSpecific(TestPlatforms.Linux)] // Uses Unix SocketAsyncEngine tracked-map internals. + public static void IoUringTrackedMapLifecycle_RemoveOrdering_IsDeterministic() + { + Type helperType = typeof(Socket).Assembly.GetType("System.Net.Sockets.IoUringTrackedOperationMapHelpers", throwOnError: true)!; + MethodInfo removeCoreDefinition = helperType.GetMethod( + "TryRemoveTrackedOperationCore", + BindingFlags.NonPublic | BindingFlags.Static)!; + MethodInfo removeCore = removeCoreDefinition.MakeGenericMethod(typeof(object)); + Type removeResultType = helperType.GetNestedType("RemoveResult", BindingFlags.NonPublic)!; + + object removed = Enum.Parse(removeResultType, "Removed"); + object notFound = Enum.Parse(removeResultType, "NotFound"); + object mismatch = Enum.Parse(removeResultType, "Mismatch"); + + var map = new ConcurrentDictionary(); + object operationA = new object(); + object operationB = new object(); + nuint userData = 42; + + static object? InvokeRemoveCore( + MethodInfo method, + ConcurrentDictionary dictionary, + nuint userDataToken, + object? expectedOperation, + out object? removedOperation) + { + object?[] args = new object?[] { dictionary, userDataToken, expectedOperation, null }; + object? result = method.Invoke(null, args); + removedOperation = args[3]; + return result; + } + + object? result = InvokeRemoveCore(removeCore, map, userData, operationA, out object? 
removedOperation); + Assert.Equal(notFound, result); + Assert.Null(removedOperation); + + map[userData] = operationA; + result = InvokeRemoveCore(removeCore, map, userData, operationA, out removedOperation); + Assert.Equal(removed, result); + Assert.Same(operationA, removedOperation); + + map[userData] = operationA; + result = InvokeRemoveCore(removeCore, map, userData, operationA, out removedOperation); + Assert.Equal(removed, result); + Assert.Same(operationA, removedOperation); + + result = InvokeRemoveCore(removeCore, map, userData, operationA, out removedOperation); + Assert.Equal(notFound, result); + Assert.Null(removedOperation); + + map[userData] = operationA; + result = InvokeRemoveCore(removeCore, map, userData, operationB, out removedOperation); + Assert.Equal(mismatch, result); + Assert.Null(removedOperation); + Assert.True(map.TryGetValue(userData, out object? remainingOperation)); + Assert.Same(operationA, remainingOperation); + } + private static RemoteInvokeOptions CreateSocketEngineOptions( string? ioUringValue = "1", bool forceFallback = false, @@ -440,6 +501,70 @@ await RemoteExecutor.Invoke(static async () => }, CreateSocketEngineOptions()).DisposeAsync(); } + [OuterLoop] + [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] + [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
+ public static async Task IoUringOptIn_ConcurrentRegistrationChurn_DoesNotHang() + { + await RemoteExecutor.Invoke(static async () => + { + const int WorkerCount = 8; + const int IterationsPerWorker = 64; + + using Socket listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + listener.Bind(new IPEndPoint(IPAddress.Loopback, 0)); + listener.Listen(WorkerCount * 2); + IPEndPoint endpoint = (IPEndPoint)listener.LocalEndPoint!; + + var workers = new Task[WorkerCount]; + for (int worker = 0; worker < WorkerCount; worker++) + { + workers[worker] = Task.Run(async () => + { + byte[] sendBuffer = new byte[] { 0x5A }; + byte[] receiveBuffer = new byte[1]; + + for (int i = 0; i < IterationsPerWorker; i++) + { + using Socket client = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + Task acceptTask = listener.AcceptAsync(); + await client.ConnectAsync(endpoint); + using Socket server = await acceptTask; + + ValueTask pendingReceive = server.ReceiveAsync(receiveBuffer, SocketFlags.None); + await Task.Yield(); + + if ((i & 1) == 0) + { + int sent = await client.SendAsync(sendBuffer, SocketFlags.None); + Assert.Equal(1, sent); + } + else + { + client.Dispose(); + } + + Exception? ex = await Record.ExceptionAsync(async () => await pendingReceive); + if (ex is SocketException socketException) + { + Assert.True( + socketException.SocketErrorCode == SocketError.ConnectionReset || + socketException.SocketErrorCode == SocketError.OperationAborted || + socketException.SocketErrorCode == SocketError.Interrupted, + $"Unexpected socket error: {socketException.SocketErrorCode}"); + } + else if (ex is not ObjectDisposedException and not null) + { + throw ex; + } + } + }); + } + + await Task.WhenAll(workers); + }, CreateSocketEngineOptions()).DisposeAsync(); + } + [OuterLoop] [ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] [PlatformSpecific(TestPlatforms.Linux)] // io_uring is Linux-specific. 
From b604a7346508881dc4b969b736d703d20c2aed19 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 2026 06:52:23 +0000 Subject: [PATCH 011/258] Add SocketsTelemetry late-CQE counter, atomic tracked-map removal, and envelope criteria Plumb benign late-CQE occurrences into SocketsTelemetry as a PollingCounter (io-uring-benign-late-completions) for structured monitoring via dotnet-counters and EventPipe. Replace the TOCTOU-prone TryGetValue/TryRemove sequence in TryRemoveTrackedOperationCore with atomic ICollection.Remove as the primary path. Add late-CQE envelope acceptance criteria to the PR evidence template and validation guide, and update the evidence script to capture the new counter. --- .../libraries/io-uring-pr-evidence-template.md | 11 +++++++++++ .../libraries/testing-linux-sockets-io-uring.md | 5 +++++ .../io-uring/collect-sockets-io-uring-evidence.sh | 3 +++ .../System/Net/Sockets/SocketAsyncEngine.Unix.cs | 14 +++++++++++--- .../src/System/Net/Sockets/SocketsTelemetry.cs | 12 ++++++++++++ 5 files changed, 42 insertions(+), 3 deletions(-) diff --git a/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md b/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md index 7af5a389e94c8e..4a88fccab65a7c 100644 --- a/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md +++ b/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md @@ -48,6 +48,7 @@ Trace-mode summary (`DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TRACE=1`): - Rearm count: `` - Submit retries/errors: `/` - CQE error count: `` +- Managed benign late CQE completions (`io-uring-benign-late-completions`): `` - Artifact: `` ## Perf Comparison (Baseline vs Opt-in) @@ -60,6 +61,16 @@ Trace-mode summary (`DOTNET_SYSTEM_NET_SOCKETS_IO_URING_TRACE=1`): | ThreadPool completed work items/s | `` | `` | `` | | Error rate/timeouts | `` | `` | `` | +Late-CQE envelope (measured from `io-uring-benign-late-completions` and workload operation counts): + +- Acceptance criteria: + - 
Absolute late-CQE count remains low enough that functionality is unaffected (no hangs, no duplicate callbacks, no leak assertions). + - Opt-in late-CQE rate does not exceed `2x` baseline rate for the same workload shape. + - If baseline rate is zero, opt-in late-CQE count should remain near-zero; investigate any sustained non-zero trend. +- Observed baseline late-CQE count/rate: `` +- Observed opt-in late-CQE count/rate: `` +- Envelope verdict: `` + Artifacts: - Baseline logs: `` diff --git a/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md b/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md index 1df26eb7ee2740..6177869a1f6569 100644 --- a/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md +++ b/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md @@ -118,7 +118,11 @@ Measure: - `System.Runtime` CPU Usage - ThreadPool Thread Count - ThreadPool Completed Work Item Count + - `System.Net.Sockets` `io-uring-benign-late-completions` 4. Error/stability signals (timeouts, socket exceptions, reconnect loops) +5. Late-CQE race envelope: + - Opt-in late-CQE rate should not exceed `2x` baseline for the same workload shape. + - If baseline is zero, opt-in should remain near-zero; investigate sustained non-zero trends. Acceptance for initial PR: @@ -126,3 +130,4 @@ Acceptance for initial PR: 2. No clear throughput regression in steady state. 3. No sustained CPU regression versus epoll baseline for the same workload. 4. Fallback behavior remains correct on unsupported kernels/configurations. +5. Late-CQE metric remains within the agreed envelope and shows no runaway growth. 
diff --git a/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh b/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh index 6c11888d457f97..bcc3f96b07070f 100644 --- a/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh +++ b/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh @@ -222,6 +222,9 @@ extract_phase2_metrics() { "buffer_pin_ns" "buffer_pin_us" "buffer_pin_ms" + "io-uring-benign-late-completions" + "io_uring_benign_late_completions" + "io_uring benign late-completion total=" ) for pattern in "${patterns[@]}"; do diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs index 831eaafd75794b..f19ff00b7f8a26 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketAsyncEngine.Unix.cs @@ -34,6 +34,13 @@ internal static RemoveResult TryRemoveTrackedOperationCore( RemoveResult.NotFound; } + // Remove atomically by key+value to avoid a TOCTOU window between lookup and remove. + if (((ICollection>)map).Remove(new KeyValuePair(key, expectedOperation))) + { + removedOperation = expectedOperation; + return RemoveResult.Removed; + } + if (!map.TryGetValue(key, out TOperation? trackedOperation)) { removedOperation = null; @@ -46,9 +53,9 @@ internal static RemoveResult TryRemoveTrackedOperationCore( return RemoveResult.Mismatch; } - return map.TryRemove(key, out removedOperation) ? - RemoveResult.Removed : - RemoveResult.NotFound; + // Entry was removed concurrently after the failed key+value remove. 
+ removedOperation = null; + return RemoveResult.NotFound; } } @@ -545,6 +552,7 @@ private void DrainTrackedIoUringOperationsForTeardown() private void RecordBenignLateIoUringCompletion(UIntPtr userData) { long count = Interlocked.Increment(ref _ioUringBenignLateCompletionCount); + SocketsTelemetry.Log.IoUringBenignLateCompletion(); if (NetEventSource.Log.IsEnabled() && (count & 0x3F) == 1) { NetEventSource.Info(this, $"io_uring completion arrived after managed untrack: count={count}, user_data=0x{(nuint)userData:x}"); diff --git a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs index 1171961a204351..0e8547621cc4d8 100644 --- a/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs +++ b/src/libraries/System.Net.Sockets/src/System/Net/Sockets/SocketsTelemetry.cs @@ -23,6 +23,7 @@ internal sealed partial class SocketsTelemetry : EventSource private PollingCounter? _bytesSentCounter; private PollingCounter? _datagramsReceivedCounter; private PollingCounter? _datagramsSentCounter; + private PollingCounter? _ioUringBenignLateCompletionsCounter; private long _currentOutgoingConnectAttempts; private long _outgoingConnectionsEstablished; @@ -31,6 +32,7 @@ internal sealed partial class SocketsTelemetry : EventSource private long _bytesSent; private long _datagramsReceived; private long _datagramsSent; + private long _ioUringBenignLateCompletions; [Event(1, Level = EventLevel.Informational)] private void ConnectStart(string? 
address) @@ -231,6 +233,12 @@ public void DatagramSent() Interlocked.Increment(ref _datagramsSent); } + [NonEvent] + public void IoUringBenignLateCompletion() + { + Interlocked.Increment(ref _ioUringBenignLateCompletions); + } + private static string GetErrorType(SocketError socketError) => socketError switch { // Common connect() errors expected to be seen: @@ -291,6 +299,10 @@ protected override void OnEventCommand(EventCommandEventArgs command) { DisplayName = "Datagrams Sent", }; + _ioUringBenignLateCompletionsCounter ??= new PollingCounter("io-uring-benign-late-completions", this, () => Interlocked.Read(ref _ioUringBenignLateCompletions)) + { + DisplayName = "io_uring Benign Late Completions", + }; } } } From d9f8b225e43ae9796d2222a230bbf8a050dd17b5 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 2026 07:05:37 +0000 Subject: [PATCH 012/258] Add structured counter capture, late-CQE envelope automation, and telemetry contract test Add dotnet-counters/EventPipe structured capture to evidence script with run_structured_counter_capture (JSON output), extract_counter_last_value (awk parser), and evaluate_late_cqe_envelope (dual-source ratio-based pass/fail with configurable thresholds). Add CLI flags for counter duration and envelope tuning. Add counter name contract test in TelemetryTest.cs to guard against tooling breakage from renames. Update evidence template and validation guide with artifact references and recommended script flags. 
--- .../io-uring-pr-evidence-template.md | 2 + .../testing-linux-sockets-io-uring.md | 7 + .../collect-sockets-io-uring-evidence.sh | 245 ++++++++++++++++++ .../tests/FunctionalTests/TelemetryTest.cs | 4 + 4 files changed, 258 insertions(+) diff --git a/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md b/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md index 4a88fccab65a7c..7513b1671833f8 100644 --- a/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md +++ b/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md @@ -76,6 +76,8 @@ Artifacts: - Baseline logs: `` - Opt-in logs: `` - Counter captures: `` +- Structured counters (`dotnet-counters`/EventPipe): ``, `` +- Envelope verdict artifact: `` ## Risk and Rollback diff --git a/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md b/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md index 6177869a1f6569..51c796414dc005 100644 --- a/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md +++ b/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md @@ -11,6 +11,12 @@ For evidence packaging, use: - Script: `eng/testing/io-uring/collect-sockets-io-uring-evidence.sh` - PR template: `docs/workflow/testing/libraries/io-uring-pr-evidence-template.md` +Recommended script flags for structured late-CQE analysis: + +- `--phase2-metrics` +- `--counters-duration 00:00:20` (or longer for slower workloads) +- Optional tuning: `--late-cqe-max-ratio `, `--late-cqe-near-zero ` + ## Prerequisites - Linux machine (x64 or arm64) @@ -123,6 +129,7 @@ Measure: 5. Late-CQE race envelope: - Opt-in late-CQE rate should not exceed `2x` baseline for the same workload shape. - If baseline is zero, opt-in should remain near-zero; investigate sustained non-zero trends. + - Use script artifacts `counters_baseline.json`, `counters_optin.json`, and `late-cqe-envelope.txt` for structured pass/fail evidence. 
Acceptance for initial PR: diff --git a/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh b/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh index bcc3f96b07070f..56b309e8811fd9 100644 --- a/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh +++ b/eng/testing/io-uring/collect-sockets-io-uring-evidence.sh @@ -14,6 +14,10 @@ Optional: --docker-image Docker image for seccomp validation --perf-iterations Workload repetitions per mode (default: 3) --phase2-metrics Extract Phase 2 completion-path metrics from logs (if present) + --no-structured-counters Skip dotnet-counters/EventPipe structured counter capture + --counters-duration dotnet-counters collection duration (default: 00:00:20) + --late-cqe-max-ratio Envelope threshold for opt-in/baseline late-CQE ratio (default: 2.0) + --late-cqe-near-zero Envelope threshold when baseline late-CQE is zero (default: 5) Example: eng/testing/io-uring/collect-sockets-io-uring-evidence.sh \ @@ -30,6 +34,10 @@ workload_command="" docker_image="" perf_iterations=3 phase2_metrics=0 +structured_counters=1 +counters_duration="00:00:20" +late_cqe_max_ratio="2.0" +late_cqe_near_zero="5" while [[ $# -gt 0 ]]; do case "$1" in @@ -57,6 +65,22 @@ while [[ $# -gt 0 ]]; do phase2_metrics=1 shift ;; + --no-structured-counters) + structured_counters=0 + shift + ;; + --counters-duration) + counters_duration="${2:-}" + shift 2 + ;; + --late-cqe-max-ratio) + late_cqe_max_ratio="${2:-}" + shift 2 + ;; + --late-cqe-near-zero) + late_cqe_near_zero="${2:-}" + shift 2 + ;; -h|--help) usage exit 0 @@ -200,6 +224,224 @@ run_perf_case() { done } +run_structured_counter_capture() { + local mode_name="$1" + local io_uring_value="$2" + local counter_name="io-uring-benign-late-completions" + local counters_file="$run_dir/counters_${mode_name}.json" + local collector_log="$run_dir/counters_${mode_name}.log" + local workload_log="$run_dir/counters_${mode_name}_workload.log" + + if [[ -z "$workload_command" ]]; then + write_summary_line "- [ 
] structured_counters_${mode_name} (workload command not configured)" + return 0 + fi + + if [[ "$structured_counters" != "1" ]]; then + write_summary_line "- [ ] structured_counters_${mode_name} (disabled by --no-structured-counters)" + return 0 + fi + + if ! command -v dotnet-counters >/dev/null 2>&1; then + write_summary_line "- [ ] structured_counters_${mode_name} (dotnet-counters not found)" + return 0 + fi + + log "Running structured counter capture: ${mode_name}" + { + printf '# structured_counters_%s\n' "$mode_name" + printf '# UTC: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" + printf '# Counter: %s\n' "$counter_name" + printf '$ env DOTNET_SYSTEM_NET_SOCKETS_IO_URING=%s bash -lc "exec %s"\n' "$io_uring_value" "$workload_command" + printf '$ dotnet-counters collect --process-id --duration %s --refresh-interval 1 --format json --counters System.Net.Sockets --output %s\n' "$counters_duration" "$counters_file" + } > "$collector_log" + + env DOTNET_SYSTEM_NET_SOCKETS_IO_URING="$io_uring_value" \ + bash -lc "exec ${workload_command}" > "$workload_log" 2>&1 & + local workload_pid=$! + + sleep 1 + if ! kill -0 "$workload_pid" >/dev/null 2>&1; then + wait "$workload_pid" || true + { + printf '\n# Workload process exited before counter collection could attach.\n' + printf '# Workload log: %s\n' "$workload_log" + } >> "$collector_log" + write_summary_line "- [ ] structured_counters_${mode_name} (`${collector_log}`)" + return 1 + fi + + set +e + dotnet-counters collect \ + --process-id "$workload_pid" \ + --duration "$counters_duration" \ + --refresh-interval 1 \ + --format json \ + --counters System.Net.Sockets \ + --output "$counters_file" >> "$collector_log" 2>&1 + local collector_status=$? + wait "$workload_pid" + local workload_status=$? 
+ set -e + + if [[ "$workload_status" -eq 0 && -s "$counters_file" ]]; then + write_summary_line "- [x] structured_counters_${mode_name} (`${counters_file}`, `${collector_log}`, `${workload_log}`)" + return 0 + fi + + { + printf '\n# structured capture status\n' + printf 'collector_exit=%s\n' "$collector_status" + printf 'workload_exit=%s\n' "$workload_status" + if [[ ! -s "$counters_file" ]]; then + printf 'counter_file=missing_or_empty\n' + fi + } >> "$collector_log" + write_summary_line "- [ ] structured_counters_${mode_name} (`${collector_log}`, `${workload_log}`)" + return 1 +} + +extract_counter_last_value() { + local counter_file="$1" + local counter_name="$2" + + if [[ ! -s "$counter_file" ]]; then + return 1 + fi + + awk -v counter_name="$counter_name" ' + BEGIN { in_counter = 0; value = "" } + { + if ($0 ~ "\"Name\"[[:space:]]*:[[:space:]]*\"" counter_name "\"") { + in_counter = 1 + } + + if (in_counter == 1 && ($0 ~ "\"Mean\"[[:space:]]*:" || $0 ~ "\"Increment\"[[:space:]]*:")) { + if (match($0, /"Mean"[[:space:]]*:[[:space:]]*[-0-9.eE+]+/)) { + candidate = substr($0, RSTART, RLENGTH) + sub(/.*:[[:space:]]*/, "", candidate) + value = candidate + } else if (match($0, /"Increment"[[:space:]]*:[[:space:]]*[-0-9.eE+]+/)) { + candidate = substr($0, RSTART, RLENGTH) + sub(/.*:[[:space:]]*/, "", candidate) + value = candidate + } + in_counter = 0 + } + } + END { + if (value != "") { + print value + } + } + ' "$counter_file" +} + +extract_late_cqe_average_from_perf_logs() { + local mode_name="$1" + shopt -s nullglob + local logs=( "$run_dir"/perf_"$mode_name"_*.log ) + shopt -u nullglob + + if [[ "${#logs[@]}" -eq 0 ]]; then + return 1 + fi + + awk ' + { + if (match($0, /io_uring benign late-completion total=[0-9]+/)) { + value = substr($0, RSTART, RLENGTH) + sub(/.*=/, "", value) + sum += value + count += 1 + } + } + END { + if (count > 0) { + printf "%.6f\n", sum / count + } + } + ' "${logs[@]}" +} + +evaluate_late_cqe_envelope() { + local 
counter_name="io-uring-benign-late-completions" + local envelope_file="$run_dir/late-cqe-envelope.txt" + local baseline_counter_file="$run_dir/counters_baseline.json" + local optin_counter_file="$run_dir/counters_optin.json" + + local baseline_value="" + local optin_value="" + local baseline_source="structured-counter" + local optin_source="structured-counter" + + baseline_value="$(extract_counter_last_value "$baseline_counter_file" "$counter_name" || true)" + optin_value="$(extract_counter_last_value "$optin_counter_file" "$counter_name" || true)" + + if [[ -z "$baseline_value" ]]; then + baseline_value="$(extract_late_cqe_average_from_perf_logs baseline || true)" + baseline_source="perf-log-fallback" + fi + + if [[ -z "$optin_value" ]]; then + optin_value="$(extract_late_cqe_average_from_perf_logs optin || true)" + optin_source="perf-log-fallback" + fi + + if [[ -z "$baseline_value" && -z "$optin_value" ]]; then + { + printf '# late_cqe_envelope\n' + printf '# UTC: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" + printf '# Not enough data to evaluate envelope.\n' + printf '# Expected either structured counters files or perf logs with late-CQE totals.\n' + } > "$envelope_file" + write_summary_line "- [ ] late_cqe_envelope (`${envelope_file}`)" + return 0 + fi + + # Missing values are treated as zero for envelope comparison, but source is preserved. + baseline_value="${baseline_value:-0}" + optin_value="${optin_value:-0}" + + local ratio + ratio="$(awk -v baseline="$baseline_value" -v optin="$optin_value" 'BEGIN { if (baseline > 0) printf "%.6f", optin / baseline; else print "inf" }')" + + local verdict="pass" + if awk -v baseline="$baseline_value" 'BEGIN { exit !(baseline > 0) }'; then + if ! awk -v baseline="$baseline_value" -v optin="$optin_value" -v max_ratio="$late_cqe_max_ratio" \ + 'BEGIN { exit !(optin <= baseline * max_ratio) }' + then + verdict="fail" + fi + else + if ! 
awk -v optin="$optin_value" -v near_zero="$late_cqe_near_zero" \ + 'BEGIN { exit !(optin <= near_zero) }' + then + verdict="fail" + fi + fi + + { + printf '# late_cqe_envelope\n' + printf '# UTC: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" + printf 'counter_name=%s\n' "$counter_name" + printf 'baseline_value=%s\n' "$baseline_value" + printf 'baseline_source=%s\n' "$baseline_source" + printf 'optin_value=%s\n' "$optin_value" + printf 'optin_source=%s\n' "$optin_source" + printf 'ratio_optin_over_baseline=%s\n' "$ratio" + printf 'max_ratio_threshold=%s\n' "$late_cqe_max_ratio" + printf 'near_zero_threshold=%s\n' "$late_cqe_near_zero" + printf 'verdict=%s\n' "$verdict" + } > "$envelope_file" + + if [[ "$verdict" == "pass" ]]; then + write_summary_line "- [x] late_cqe_envelope (`${envelope_file}`)" + else + write_summary_line "- [ ] late_cqe_envelope (`${envelope_file}`)" + fi +} + extract_phase2_metrics() { if [[ "$phase2_metrics" != "1" ]]; then write_summary_line "- [ ] phase2_metrics_extraction (not requested)" @@ -263,6 +505,9 @@ run_docker_case "docker_default_seccomp" "" || true run_docker_case "docker_unconfined_seccomp" "--security-opt=seccomp=unconfined" || true run_perf_case "baseline" "0" || true run_perf_case "optin" "1" || true +run_structured_counter_capture "baseline" "0" || true +run_structured_counter_capture "optin" "1" || true +evaluate_late_cqe_envelope || true extract_phase2_metrics || true log "Evidence collection completed. 
Summary: $summary_file" diff --git a/src/libraries/System.Net.Sockets/tests/FunctionalTests/TelemetryTest.cs b/src/libraries/System.Net.Sockets/tests/FunctionalTests/TelemetryTest.cs index 69f61fc180a49c..881c5cdf12a7ca 100644 --- a/src/libraries/System.Net.Sockets/tests/FunctionalTests/TelemetryTest.cs +++ b/src/libraries/System.Net.Sockets/tests/FunctionalTests/TelemetryTest.cs @@ -664,6 +664,10 @@ private static void VerifyEventCounters(ConcurrentQueue<(EventWrittenEventArgs E { Assert.True(datagramsSent[^1] > 0); } + + // Keep an explicit contract test for this counter name because io_uring evidence tooling depends on it. + Assert.True(eventCounters.TryGetValue("io-uring-benign-late-completions", out double[] ioUringBenignLateCompletions)); + Assert.True(ioUringBenignLateCompletions[^1] >= 0); } } } From 7125c688f3d4f9f883f9e464b28c070cae60cf3d Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 13 Feb 2026 07:29:09 +0000 Subject: [PATCH 013/258] Add operation-level buffer pinning, perf comparison report, and evidence tooling hardening Add MemoryHandle-based buffer pinning on AsyncOperation (PinIoUringBuffer, TryPinIoUringBuffer, ReleasePinnedIoUringBuffer) with pin retained from prepare through ClearIoUringUserData/Reset. Add pointer-based SocketPal TryPrepareIoUring overloads; Span/Memory overloads now delegate via fixed. Harden evidence script: fix backtick command substitution in all summary lines, add retry loop for structured counter capture, expand counter spec to System.Net.Sockets+System.Runtime, add workload-unit normalization to envelope evaluation, add generate_perf_comparison_report for baseline vs opt-in metric tables, and add smoke-validation script. 
--- .../io-uring-pr-evidence-template.md | 4 + .../testing-linux-sockets-io-uring.md | 8 +- .../collect-sockets-io-uring-evidence.sh | 465 +++++++++++++++--- ...collect-sockets-io-uring-evidence-smoke.sh | 167 +++++++ .../Net/Sockets/SocketAsyncContext.Unix.cs | 96 +++- .../src/System/Net/Sockets/SocketPal.Unix.cs | 73 ++- 6 files changed, 737 insertions(+), 76 deletions(-) create mode 100644 eng/testing/io-uring/validate-collect-sockets-io-uring-evidence-smoke.sh diff --git a/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md b/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md index 7513b1671833f8..a9ad2b7a710df5 100644 --- a/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md +++ b/docs/workflow/testing/libraries/io-uring-pr-evidence-template.md @@ -66,15 +66,19 @@ Late-CQE envelope (measured from `io-uring-benign-late-completions` and workload - Acceptance criteria: - Absolute late-CQE count remains low enough that functionality is unaffected (no hangs, no duplicate callbacks, no leak assertions). - Opt-in late-CQE rate does not exceed `2x` baseline rate for the same workload shape. + - When workload-unit normalization is configured, normalized opt-in/baseline ratio should remain within agreed threshold. - If baseline rate is zero, opt-in late-CQE count should remain near-zero; investigate any sustained non-zero trend. 
- Observed baseline late-CQE count/rate: `` - Observed opt-in late-CQE count/rate: `` +- Observed baseline normalized late-CQE (``): `` +- Observed opt-in normalized late-CQE (``): `` - Envelope verdict: `` Artifacts: - Baseline logs: `` - Opt-in logs: `` +- Perf comparison table artifact: `` - Counter captures: `` - Structured counters (`dotnet-counters`/EventPipe): ``, `` - Envelope verdict artifact: `` diff --git a/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md b/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md index 51c796414dc005..7bd9114de55c13 100644 --- a/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md +++ b/docs/workflow/testing/libraries/testing-linux-sockets-io-uring.md @@ -9,13 +9,17 @@ The backend is opt-in and must safely fall back to epoll when io_uring is unavai For evidence packaging, use: - Script: `eng/testing/io-uring/collect-sockets-io-uring-evidence.sh` +- Smoke validator: `eng/testing/io-uring/validate-collect-sockets-io-uring-evidence-smoke.sh` - PR template: `docs/workflow/testing/libraries/io-uring-pr-evidence-template.md` Recommended script flags for structured late-CQE analysis: - `--phase2-metrics` - `--counters-duration 00:00:20` (or longer for slower workloads) -- Optional tuning: `--late-cqe-max-ratio `, `--late-cqe-near-zero ` +- Reliability tuning: `--structured-capture-attempts ` for short-lived workloads +- Optional perf extraction: `--throughput-pattern `, `--throughput-label