diff --git a/src/inc/clrconfigvalues.h b/src/inc/clrconfigvalues.h index 4da57f51d618..7b096b438f7b 100644 --- a/src/inc/clrconfigvalues.h +++ b/src/inc/clrconfigvalues.h @@ -943,7 +943,12 @@ RETAIL_CONFIG_DWORD_INFO(INTERNAL_ThreadPool_ForceMaxWorkerThreads, W("ThreadPoo RETAIL_CONFIG_DWORD_INFO(INTERNAL_ThreadPool_DisableStarvationDetection, W("ThreadPool_DisableStarvationDetection"), 0, "Disables the ThreadPool feature that forces new threads to be added when workitems run for too long") RETAIL_CONFIG_DWORD_INFO(INTERNAL_ThreadPool_DebugBreakOnWorkerStarvation, W("ThreadPool_DebugBreakOnWorkerStarvation"), 0, "Breaks into the debugger if the ThreadPool detects work queue starvation") RETAIL_CONFIG_DWORD_INFO(INTERNAL_ThreadPool_EnableWorkerTracking, W("ThreadPool_EnableWorkerTracking"), 0, "Enables extra expensive tracking of how many workers threads are working simultaneously") -RETAIL_CONFIG_DWORD_INFO(INTERNAL_ThreadPool_UnfairSemaphoreSpinLimit, W("ThreadPool_UnfairSemaphoreSpinLimit"), 50, "Per processor limit used when calculating spin duration in UnfairSemaphore::Wait") +#ifdef _TARGET_ARM64_ +// Spinning scheme is currently different on ARM64, see CLRLifoSemaphore::Wait(DWORD, UINT32, UINT32) +RETAIL_CONFIG_DWORD_INFO(INTERNAL_ThreadPool_UnfairSemaphoreSpinLimit, W("ThreadPool_UnfairSemaphoreSpinLimit"), 0x32, "Maximum number of spins per processor a thread pool worker thread performs before waiting for work") +#else // !_TARGET_ARM64_ +RETAIL_CONFIG_DWORD_INFO(INTERNAL_ThreadPool_UnfairSemaphoreSpinLimit, W("ThreadPool_UnfairSemaphoreSpinLimit"), 0x46, "Maximum number of spins a thread pool worker thread performs before waiting for work") +#endif // _TARGET_ARM64_ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_Thread_UseAllCpuGroups, W("Thread_UseAllCpuGroups"), 0, "Specifies if to automatically distribute thread across CPU Groups") CONFIG_DWORD_INFO(INTERNAL_ThreadpoolTickCountAdjustment, W("ThreadpoolTickCountAdjustment"), 0, "") diff --git a/src/pal/inc/pal.h 
b/src/pal/inc/pal.h index 4ac91db0cfca..4ae2187b69f3 100644 --- a/src/pal/inc/pal.h +++ b/src/pal/inc/pal.h @@ -1464,6 +1464,13 @@ WaitForSingleObject( IN HANDLE hHandle, IN DWORD dwMilliseconds); +PALIMPORT +DWORD +PALAPI +PAL_WaitForSingleObjectPrioritized( + IN HANDLE hHandle, + IN DWORD dwMilliseconds); + PALIMPORT DWORD PALAPI diff --git a/src/pal/src/include/pal/corunix.hpp b/src/pal/src/include/pal/corunix.hpp index bfdfb6c16764..e35c5b056e4c 100644 --- a/src/pal/src/include/pal/corunix.hpp +++ b/src/pal/src/include/pal/corunix.hpp @@ -773,7 +773,8 @@ namespace CorUnix RegisterWaitingThread( WaitType eWaitType, DWORD dwIndex, - bool fAltertable + bool fAltertable, + bool fPrioritize ) = 0; // diff --git a/src/pal/src/include/pal/synchobjects.hpp b/src/pal/src/include/pal/synchobjects.hpp index 62f401749205..1ee4f1c57b59 100644 --- a/src/pal/src/include/pal/synchobjects.hpp +++ b/src/pal/src/include/pal/synchobjects.hpp @@ -40,7 +40,8 @@ namespace CorUnix CONST HANDLE *lpHandles, BOOL bWaitAll, DWORD dwMilliseconds, - BOOL bAlertable); + BOOL bAlertable, + BOOL bPrioritize = FALSE); PAL_ERROR InternalSleepEx( CPalThread * pthrCurrent, diff --git a/src/pal/src/synchmgr/synchcontrollers.cpp b/src/pal/src/synchmgr/synchcontrollers.cpp index 68fe42946202..6eae9187d97d 100644 --- a/src/pal/src/synchmgr/synchcontrollers.cpp +++ b/src/pal/src/synchmgr/synchcontrollers.cpp @@ -262,7 +262,8 @@ namespace CorUnix PAL_ERROR CSynchWaitController::RegisterWaitingThread( WaitType wtWaitType, DWORD dwIndex, - bool fAlertable) + bool fAlertable, + bool fPrioritize) { VALIDATEOBJECT(m_psdSynchData); @@ -421,12 +422,12 @@ namespace CorUnix // Add new node to queue if (fSharedObject) { - m_psdSynchData->SharedWaiterEnqueue(shridNewNode); + m_psdSynchData->SharedWaiterEnqueue(shridNewNode, fPrioritize); ptwiWaitInfo->lSharedObjCount += 1; } else { - m_psdSynchData->WaiterEnqueue(pwtlnNewNode); + m_psdSynchData->WaiterEnqueue(pwtlnNewNode, fPrioritize); } // Succeeded: update 
object count @@ -1821,7 +1822,7 @@ namespace CorUnix Note: this method must be called while holding the local process synchronization lock. --*/ - void CSynchData::WaiterEnqueue(WaitingThreadsListNode * pwtlnNewNode) + void CSynchData::WaiterEnqueue(WaitingThreadsListNode * pwtlnNewNode, bool fPrioritize) { VALIDATEOBJECT(this); VALIDATEOBJECT(pwtlnNewNode); @@ -1833,26 +1834,55 @@ namespace CorUnix "Trying to add a WaitingThreadsListNode marked as shared " "as it was a local one\n"); - WaitingThreadsListNode * pwtlnCurrLast = m_ptrWTLTail.ptr; - - pwtlnNewNode->ptrNext.ptr = NULL; - if (NULL == pwtlnCurrLast) + if (!fPrioritize) { - _ASSERT_MSG(NULL == m_ptrWTLHead.ptr, - "Corrupted waiting list on local CSynchData @ %p\n", - this); + // Enqueue normally to the end of the queue + WaitingThreadsListNode * pwtlnCurrLast = m_ptrWTLTail.ptr; + + pwtlnNewNode->ptrNext.ptr = NULL; + if (NULL == pwtlnCurrLast) + { + _ASSERT_MSG(NULL == m_ptrWTLHead.ptr, + "Corrupted waiting list on local CSynchData @ %p\n", + this); - pwtlnNewNode->ptrPrev.ptr = NULL; - m_ptrWTLHead.ptr = pwtlnNewNode; - m_ptrWTLTail.ptr = pwtlnNewNode; + pwtlnNewNode->ptrPrev.ptr = NULL; + m_ptrWTLHead.ptr = pwtlnNewNode; + m_ptrWTLTail.ptr = pwtlnNewNode; + } + else + { + VALIDATEOBJECT(pwtlnCurrLast); + + pwtlnNewNode->ptrPrev.ptr = pwtlnCurrLast; + pwtlnCurrLast->ptrNext.ptr = pwtlnNewNode; + m_ptrWTLTail.ptr = pwtlnNewNode; + } } else { - VALIDATEOBJECT(pwtlnCurrLast); + // The wait is prioritized, enqueue to the beginning of the queue + WaitingThreadsListNode * pwtlnCurrFirst = m_ptrWTLHead.ptr; + + pwtlnNewNode->ptrPrev.ptr = NULL; + if (NULL == pwtlnCurrFirst) + { + _ASSERT_MSG(NULL == m_ptrWTLTail.ptr, + "Corrupted waiting list on local CSynchData @ %p\n", + this); + + pwtlnNewNode->ptrNext.ptr = NULL; + m_ptrWTLHead.ptr = pwtlnNewNode; + m_ptrWTLTail.ptr = pwtlnNewNode; + } + else + { + VALIDATEOBJECT(pwtlnCurrFirst); - pwtlnNewNode->ptrPrev.ptr = pwtlnCurrLast; - pwtlnCurrLast->ptrNext.ptr = 
pwtlnNewNode; - m_ptrWTLTail.ptr = pwtlnNewNode; + pwtlnNewNode->ptrNext.ptr = pwtlnCurrFirst; + pwtlnCurrFirst->ptrPrev.ptr = pwtlnNewNode; + m_ptrWTLHead.ptr = pwtlnNewNode; + } } m_ulcWaitingThreads += 1; @@ -1872,7 +1902,7 @@ namespace CorUnix Note: this method must be called while holding both local and shared synchronization locks. --*/ - void CSynchData::SharedWaiterEnqueue(SharedID shridNewNode) + void CSynchData::SharedWaiterEnqueue(SharedID shridNewNode, bool fPrioritize) { VALIDATEOBJECT(this); @@ -1880,37 +1910,77 @@ namespace CorUnix "Trying to enqueue a WaitingThreadsListNode as shared " "on a local object\n"); - SharedID shridCurrLast; - WaitingThreadsListNode * pwtlnCurrLast, * pwtlnNewNode; + if (!fPrioritize) + { + // Enqueue normally to the end of the queue + SharedID shridCurrLast; + WaitingThreadsListNode * pwtlnCurrLast, * pwtlnNewNode; - shridCurrLast = m_ptrWTLTail.shrid; - pwtlnCurrLast = SharedIDToTypePointer(WaitingThreadsListNode, shridCurrLast); - pwtlnNewNode = SharedIDToTypePointer(WaitingThreadsListNode, shridNewNode); + shridCurrLast = m_ptrWTLTail.shrid; + pwtlnCurrLast = SharedIDToTypePointer(WaitingThreadsListNode, shridCurrLast); + pwtlnNewNode = SharedIDToTypePointer(WaitingThreadsListNode, shridNewNode); - _ASSERT_MSG(1 == (WTLN_FLAG_OWNER_OBJECT_IS_SHARED & pwtlnNewNode->dwFlags), - "Trying to add a WaitingThreadsListNode marked as local " - "as it was a shared one\n"); + _ASSERT_MSG(1 == (WTLN_FLAG_OWNER_OBJECT_IS_SHARED & pwtlnNewNode->dwFlags), + "Trying to add a WaitingThreadsListNode marked as local " + "as it was a shared one\n"); - VALIDATEOBJECT(pwtlnNewNode); + VALIDATEOBJECT(pwtlnNewNode); - pwtlnNewNode->ptrNext.shrid = NULL; - if (NULL == pwtlnCurrLast) - { - _ASSERT_MSG(NULL == m_ptrWTLHead.shrid, - "Corrupted waiting list on shared CSynchData at " - "{shrid=%p, p=%p}\n", m_shridThis, this); + pwtlnNewNode->ptrNext.shrid = NULL; + if (NULL == pwtlnCurrLast) + { + _ASSERT_MSG(NULL == m_ptrWTLHead.shrid, + 
"Corrupted waiting list on shared CSynchData at " + "{shrid=%p, p=%p}\n", m_shridThis, this); - pwtlnNewNode->ptrPrev.shrid = NULL; - m_ptrWTLHead.shrid = shridNewNode; - m_ptrWTLTail.shrid = shridNewNode; + pwtlnNewNode->ptrPrev.shrid = NULL; + m_ptrWTLHead.shrid = shridNewNode; + m_ptrWTLTail.shrid = shridNewNode; + } + else + { + VALIDATEOBJECT(pwtlnCurrLast); + + pwtlnNewNode->ptrPrev.shrid = shridCurrLast; + pwtlnCurrLast->ptrNext.shrid = shridNewNode; + m_ptrWTLTail.shrid = shridNewNode; + } } else { - VALIDATEOBJECT(pwtlnCurrLast); + // The wait is prioritized, enqueue to the beginning of the queue + SharedID shridCurrFirst; + WaitingThreadsListNode * pwtlnCurrFirst, * pwtlnNewNode; + + shridCurrFirst = m_ptrWTLHead.shrid; + pwtlnCurrFirst = SharedIDToTypePointer(WaitingThreadsListNode, shridCurrFirst); + pwtlnNewNode = SharedIDToTypePointer(WaitingThreadsListNode, shridNewNode); + + _ASSERT_MSG(1 == (WTLN_FLAG_OWNER_OBJECT_IS_SHARED & pwtlnNewNode->dwFlags), + "Trying to add a WaitingThreadsListNode marked as local " + "as it was a shared one\n"); + + VALIDATEOBJECT(pwtlnNewNode); + + pwtlnNewNode->ptrPrev.shrid = NULL; + if (NULL == pwtlnCurrFirst) + { + _ASSERT_MSG(NULL == m_ptrWTLTail.shrid, + "Corrupted waiting list on shared CSynchData at " + "{shrid=%p, p=%p}\n", m_shridThis, this); - pwtlnNewNode->ptrPrev.shrid = shridCurrLast; - pwtlnCurrLast->ptrNext.shrid = shridNewNode; - m_ptrWTLTail.shrid = shridNewNode; + pwtlnNewNode->ptrNext.shrid = NULL; + m_ptrWTLHead.shrid = shridNewNode; + m_ptrWTLTail.shrid = shridNewNode; + } + else + { + VALIDATEOBJECT(pwtlnCurrFirst); + + pwtlnNewNode->ptrNext.shrid = shridCurrFirst; + pwtlnCurrFirst->ptrPrev.shrid = shridNewNode; + m_ptrWTLHead.shrid = shridNewNode; + } } m_ulcWaitingThreads += 1; diff --git a/src/pal/src/synchmgr/synchmanager.cpp b/src/pal/src/synchmgr/synchmanager.cpp index a683255a3e48..048ea3ee7dc8 100644 --- a/src/pal/src/synchmgr/synchmanager.cpp +++ b/src/pal/src/synchmgr/synchmanager.cpp @@ 
-3867,7 +3867,7 @@ namespace CorUnix pwtlnNew->shridWaitingState = pwtlnOld->shridWaitingState; pwtlnNew->ptwiWaitInfo = pwtlnOld->ptwiWaitInfo; - psdShared->SharedWaiterEnqueue(rgshridWTLNodes[i]); + psdShared->SharedWaiterEnqueue(rgshridWTLNodes[i], false); psdShared->AddRef(); _ASSERTE(pwtlnOld = pwtlnOld->ptwiWaitInfo->rgpWTLNodes[pwtlnOld->dwObjIndex]); diff --git a/src/pal/src/synchmgr/synchmanager.hpp b/src/pal/src/synchmgr/synchmanager.hpp index b0cc2e762257..89e1d13568e6 100644 --- a/src/pal/src/synchmgr/synchmanager.hpp +++ b/src/pal/src/synchmgr/synchmanager.hpp @@ -206,8 +206,8 @@ namespace CorUnix CPalThread * pthrCurrent, CPalThread * pthrTarget); - void WaiterEnqueue(WaitingThreadsListNode * pwtlnNewNode); - void SharedWaiterEnqueue(SharedID shridNewNode); + void WaiterEnqueue(WaitingThreadsListNode * pwtlnNewNode, bool fPrioritize); + void SharedWaiterEnqueue(SharedID shridNewNode, bool fPrioritize); // Object Domain accessor methods ObjectDomain GetObjectDomain(void) @@ -464,7 +464,8 @@ namespace CorUnix virtual PAL_ERROR RegisterWaitingThread( WaitType wtWaitType, DWORD dwIndex, - bool fAlertable); + bool fAlertable, + bool fPrioritize); virtual void ReleaseController(void); diff --git a/src/pal/src/synchmgr/wait.cpp b/src/pal/src/synchmgr/wait.cpp index 8ef65aaa0193..fc5bb674dbfb 100644 --- a/src/pal/src/synchmgr/wait.cpp +++ b/src/pal/src/synchmgr/wait.cpp @@ -73,6 +73,35 @@ WaitForSingleObject(IN HANDLE hHandle, } +/*++ +Function: + WaitForSingleObjectPrioritized + +Similar to WaitForSingleObject, except uses a LIFO release policy for waiting threads by prioritizing new waiters (registering +them at the beginning of the wait queue rather than at the end). 
+--*/ +DWORD +PALAPI +PAL_WaitForSingleObjectPrioritized(IN HANDLE hHandle, + IN DWORD dwMilliseconds) +{ + DWORD dwRet; + + PERF_ENTRY(PAL_WaitForSingleObjectPrioritized); + ENTRY("PAL_WaitForSingleObjectPrioritized(hHandle=%p, dwMilliseconds=%u)\n", + hHandle, dwMilliseconds); + + CPalThread * pThread = InternalGetCurrentThread(); + + dwRet = InternalWaitForMultipleObjectsEx(pThread, 1, &hHandle, FALSE, + dwMilliseconds, FALSE, TRUE /* bPrioritize */); + + LOGEXIT("PAL_WaitForSingleObjectPrioritized returns DWORD %u\n", dwRet); + PERF_EXIT(PAL_WaitForSingleObjectPrioritized); + return dwRet; +} + + /*++ Function: WaitForSingleObjectEx @@ -285,7 +314,8 @@ DWORD CorUnix::InternalWaitForMultipleObjectsEx( CONST HANDLE *lpHandles, BOOL bWaitAll, DWORD dwMilliseconds, - BOOL bAlertable) + BOOL bAlertable, + BOOL bPrioritize) { DWORD dwRet = WAIT_FAILED; PAL_ERROR palErr = NO_ERROR; @@ -530,7 +560,8 @@ DWORD CorUnix::InternalWaitForMultipleObjectsEx( palErr = ppISyncWaitCtrlrs[i]->RegisterWaitingThread( wtWaitType, i, - (TRUE == bAlertable)); + (TRUE == bAlertable), + bPrioritize != FALSE); if (NO_ERROR != palErr) { ERROR("RegisterWaitingThread() failed for %d-th object " diff --git a/src/vm/synch.cpp b/src/vm/synch.cpp index e159b7813a73..e4fae65855de 100644 --- a/src/vm/synch.cpp +++ b/src/vm/synch.cpp @@ -590,6 +590,418 @@ DWORD CLRSemaphore::Wait(DWORD dwMilliseconds, BOOL alertable) } } +void CLRLifoSemaphore::Create(INT32 initialSignalCount, INT32 maximumSignalCount) +{ + CONTRACTL + { + THROWS; + GC_NOTRIGGER; + SO_TOLERANT; + } + CONTRACTL_END; + + _ASSERTE(maximumSignalCount > 0); + _ASSERTE(initialSignalCount <= maximumSignalCount); + _ASSERTE(m_handle == nullptr); + +#ifdef FEATURE_PAL + HANDLE h = UnsafeCreateSemaphore(nullptr, initialSignalCount, maximumSignalCount, nullptr); +#else // !FEATURE_PAL + HANDLE h = CreateIoCompletionPort(INVALID_HANDLE_VALUE, nullptr, 0, maximumSignalCount); +#endif // FEATURE_PAL + if (h == nullptr) + { + ThrowOutOfMemory(); 
+ } + + m_handle = h; + m_counts.signalCount = initialSignalCount; + INDEBUG(m_maximumSignalCount = maximumSignalCount); +} + +void CLRLifoSemaphore::Close() +{ + LIMITED_METHOD_CONTRACT; + + if (m_handle == nullptr) + { + return; + } + + CloseHandle(m_handle); + m_handle = nullptr; +} + +bool CLRLifoSemaphore::WaitForSignal(DWORD timeoutMs) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + SO_TOLERANT; + } + CONTRACTL_END; + + _ASSERTE(timeoutMs != 0); + _ASSERTE(m_handle != nullptr); + _ASSERTE(m_counts.waiterCount != (UINT16)0); + + while (true) + { + // Wait for a signal + BOOL waitSuccessful; + { +#ifdef FEATURE_PAL + // Do a prioritized wait to get LIFO waiter release order + DWORD waitResult = PAL_WaitForSingleObjectPrioritized(m_handle, timeoutMs); + _ASSERTE(waitResult == WAIT_OBJECT_0 || waitResult == WAIT_TIMEOUT); + waitSuccessful = waitResult == WAIT_OBJECT_0; +#else // !FEATURE_PAL + // I/O completion ports release waiters in LIFO order, see + // https://msdn.microsoft.com/en-us/library/windows/desktop/aa365198(v=vs.85).aspx + DWORD numberOfBytes; + ULONG_PTR completionKey; + LPOVERLAPPED overlapped; + waitSuccessful = GetQueuedCompletionStatus(m_handle, &numberOfBytes, &completionKey, &overlapped, timeoutMs); + _ASSERTE(waitSuccessful || GetLastError() == WAIT_TIMEOUT); + _ASSERTE(overlapped == nullptr); +#endif // FEATURE_PAL + } + + // Unregister the waiter if this thread will not be waiting anymore, and try to acquire the semaphore + Counts counts = m_counts.VolatileLoad(); + while (true) + { + _ASSERTE(counts.waiterCount != (UINT16)0); + Counts newCounts = counts; + if (counts.signalCount != 0) + { + --newCounts.signalCount; + --newCounts.waiterCount; + } + else if (!waitSuccessful) + { + --newCounts.waiterCount; + } + + // This waiter has woken up and this needs to be reflected in the count of waiters signaled to wake. 
Since we don't + // have thread-specific signal state, there is not enough information to tell whether this thread woke up because it + // was signaled. For instance, this thread may have timed out and then we don't know whether this thread also got + // signaled. So in any woken case, decrement the count if possible. As such, timeouts could cause more waiters to + // wake than necessary. + if (counts.countOfWaitersSignaledToWake != (UINT8)0) + { + --newCounts.countOfWaitersSignaledToWake; + } + + Counts countsBeforeUpdate = m_counts.CompareExchange(newCounts, counts); + if (countsBeforeUpdate == counts) + { + if (counts.signalCount != 0) + { + return true; + } + break; + } + + counts = countsBeforeUpdate; + } + + if (!waitSuccessful) + { + return false; + } + } +} + +bool CLRLifoSemaphore::Wait(DWORD timeoutMs) +{ + WRAPPER_NO_CONTRACT; + + _ASSERTE(m_handle != nullptr); + + // Acquire the semaphore or register as a waiter + Counts counts = m_counts.VolatileLoad(); + while (true) + { + _ASSERTE(counts.signalCount <= m_maximumSignalCount); + Counts newCounts = counts; + if (counts.signalCount != 0) + { + --newCounts.signalCount; + } + else if (timeoutMs != 0) + { + ++newCounts.waiterCount; + _ASSERTE(newCounts.waiterCount != (UINT16)0); // overflow check, this many waiters is currently not supported + } + + Counts countsBeforeUpdate = m_counts.CompareExchange(newCounts, counts); + if (countsBeforeUpdate == counts) + { + return counts.signalCount != 0 || (timeoutMs != 0 && WaitForSignal(timeoutMs)); + } + + counts = countsBeforeUpdate; + } +} + +bool CLRLifoSemaphore::Wait(DWORD timeoutMs, UINT32 spinCount, UINT32 processorCount) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + SO_TOLERANT; + } + CONTRACTL_END; + + _ASSERTE(m_handle != nullptr); + + if (timeoutMs == 0 || spinCount == 0) + { + return Wait(timeoutMs); + } + + // Try to acquire the semaphore or register as a spinner + Counts counts = m_counts.VolatileLoad(); + while (true) + { + Counts newCounts = 
counts; + if (counts.signalCount != 0) + { + --newCounts.signalCount; + } + else + { + ++newCounts.spinnerCount; + if (newCounts.spinnerCount == (UINT8)0) + { + // Maximum number of spinners reached, register as a waiter instead + --newCounts.spinnerCount; + ++newCounts.waiterCount; + _ASSERTE(newCounts.waiterCount != (UINT16)0); // overflow check, this many waiters is currently not supported + } + } + + Counts countsBeforeUpdate = m_counts.CompareExchange(newCounts, counts); + if (countsBeforeUpdate == counts) + { + if (counts.signalCount != 0) + { + return true; + } + if (newCounts.waiterCount != counts.waiterCount) + { + return WaitForSignal(timeoutMs); + } + break; + } + + counts = countsBeforeUpdate; + } + +#ifdef _TARGET_ARM64_ + // For now, the spinning changes are disabled on ARM64. The spin loop below replicates how UnfairSemaphore used to spin. + // Once more tuning is done on ARM64, it should be possible to come up with a spinning scheme that works well everywhere. + int spinCountPerProcessor = spinCount; + for (UINT32 i = 1; ; ++i) + { + // Wait + ClrSleepEx(0, false); + + // Try to acquire the semaphore and unregister as a spinner + counts = m_counts.VolatileLoad(); + while (true) + { + _ASSERTE(counts.spinnerCount != (UINT8)0); + if (counts.signalCount == 0) + { + break; + } + + Counts newCounts = counts; + --newCounts.signalCount; + --newCounts.spinnerCount; + + Counts countsBeforeUpdate = m_counts.CompareExchange(newCounts, counts); + if (countsBeforeUpdate == counts) + { + return true; + } + + counts = countsBeforeUpdate; + } + + // Determine whether to spin further + double spinnersPerProcessor = (double)counts.spinnerCount / processorCount; + UINT32 spinLimit = (UINT32)(spinCountPerProcessor / spinnersPerProcessor + 0.5); + if (i >= spinLimit) + { + break; + } + } +#else // !_TARGET_ARM64_ + const UINT32 Sleep0Threshold = 10; + YieldProcessorWithBackOffNormalizationInfo normalizationInfo; +#ifdef FEATURE_PAL + // The PAL's wait subsystem is quite 
slow, spin more to compensate for the more expensive wait + spinCount *= 2; +#endif // FEATURE_PAL + for (UINT32 i = 0; i < spinCount; ++i) + { + // Wait + // + // (i - Sleep0Threshold) % 2 != 0: The purpose of this check is to interleave Thread.Yield/Sleep(0) with + // Thread.SpinWait. Otherwise, the following issues occur: + // - When there are no threads to switch to, Yield and Sleep(0) become no-op and it turns the spin loop into a + // busy-spin that may quickly reach the max spin count and cause the thread to enter a wait state. Completing the + // spin loop too early can cause excessive context switching from the wait. + // - If there are multiple threads doing Yield and Sleep(0) (typically from the same spin loop due to contention), + // they may switch between one another, delaying work that can make progress. + if (i < Sleep0Threshold || (i - Sleep0Threshold) % 2 != 0) + { + YieldProcessorWithBackOffNormalized(normalizationInfo, i); + } + else + { + // Not doing SwitchToThread(), it does not seem to have any benefit over Sleep(0) + ClrSleepEx(0, false); + } + + // Try to acquire the semaphore and unregister as a spinner + counts = m_counts.VolatileLoad(); + while (true) + { + _ASSERTE(counts.spinnerCount != (UINT8)0); + if (counts.signalCount == 0) + { + break; + } + + Counts newCounts = counts; + --newCounts.signalCount; + --newCounts.spinnerCount; + + Counts countsBeforeUpdate = m_counts.CompareExchange(newCounts, counts); + if (countsBeforeUpdate == counts) + { + return true; + } + + counts = countsBeforeUpdate; + } + } +#endif // _TARGET_ARM64_ + + // Unregister as a spinner, and acquire the semaphore or register as a waiter + counts = m_counts.VolatileLoad(); + while (true) + { + _ASSERTE(counts.spinnerCount != (UINT8)0); + Counts newCounts = counts; + --newCounts.spinnerCount; + if (counts.signalCount != 0) + { + --newCounts.signalCount; + } + else + { + ++newCounts.waiterCount; + _ASSERTE(newCounts.waiterCount != (UINT16)0); // overflow check, this 
many waiters is currently not supported + } + + Counts countsBeforeUpdate = m_counts.CompareExchange(newCounts, counts); + if (countsBeforeUpdate == counts) + { + return counts.signalCount != 0 || WaitForSignal(timeoutMs); + } + + counts = countsBeforeUpdate; + } +} + +void CLRLifoSemaphore::Release(INT32 releaseCount) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + SO_TOLERANT; + } + CONTRACTL_END; + + _ASSERTE(releaseCount > 0); + _ASSERTE((UINT32)releaseCount <= m_maximumSignalCount); + _ASSERTE(m_handle != INVALID_HANDLE_VALUE); + + INT32 countOfWaitersToWake; + Counts counts = m_counts.VolatileLoad(); + while (true) + { + Counts newCounts = counts; + + // Increase the signal count. The addition doesn't overflow because of the limit on the max signal count in Create. + newCounts.signalCount += releaseCount; + _ASSERTE(newCounts.signalCount > counts.signalCount); + + // Determine how many waiters to wake, taking into account how many spinners and waiters there are and how many waiters + // have previously been signaled to wake but have not yet woken + countOfWaitersToWake = + (INT32)min(newCounts.signalCount, (UINT32)newCounts.waiterCount + newCounts.spinnerCount) - + newCounts.spinnerCount - + newCounts.countOfWaitersSignaledToWake; + if (countOfWaitersToWake > 0) + { + // Ideally, limiting to a maximum of releaseCount would not be necessary and could be an assert instead, but since + // WaitForSignal() does not have enough information to tell whether a woken thread was signaled, and due to the cap + // below, it's possible for countOfWaitersSignaledToWake to be less than the number of threads that have actually + // been signaled to wake. + if (countOfWaitersToWake > releaseCount) + { + countOfWaitersToWake = releaseCount; + } + + // Cap countOfWaitersSignaledToWake to its max value. It's ok to ignore some woken threads in this count, it just + // means some more threads will be woken next time. Typically, it won't reach the max anyway. 
+ newCounts.countOfWaitersSignaledToWake += (UINT8)min(countOfWaitersToWake, (INT32)UINT8_MAX); + if (newCounts.countOfWaitersSignaledToWake <= counts.countOfWaitersSignaledToWake) + { + newCounts.countOfWaitersSignaledToWake = UINT8_MAX; + } + } + + Counts countsBeforeUpdate = m_counts.CompareExchange(newCounts, counts); + if (countsBeforeUpdate == counts) + { + _ASSERTE((UINT32)releaseCount <= m_maximumSignalCount - counts.signalCount); + _ASSERTE(newCounts.countOfWaitersSignaledToWake <= newCounts.waiterCount); + if (countOfWaitersToWake <= 0) + { + return; + } + break; + } + + counts = countsBeforeUpdate; + } + + // Wake waiters +#ifdef FEATURE_PAL + BOOL released = UnsafeReleaseSemaphore(m_handle, countOfWaitersToWake, nullptr); + _ASSERTE(released); +#else // !FEATURE_PAL + while (--countOfWaitersToWake >= 0) + { + while (!PostQueuedCompletionStatus(m_handle, 0, 0, nullptr)) + { + // Probably out of memory. It's not valid to stop and throw here, so try again after a delay. + ClrSleepEx(1, false); + } + } +#endif // FEATURE_PAL +} + void CLRMutex::Create(LPSECURITY_ATTRIBUTES lpMutexAttributes, BOOL bInitialOwner, LPCTSTR lpName) { CONTRACTL diff --git a/src/vm/synch.h b/src/vm/synch.h index d88ec4634280..c8e9baf481b7 100644 --- a/src/vm/synch.h +++ b/src/vm/synch.h @@ -177,6 +177,99 @@ class CLRSemaphore { HANDLE m_handle; }; +class CLRLifoSemaphore +{ +private: + struct Counts + { + union + { + struct + { + UINT32 signalCount; + UINT16 waiterCount; + UINT8 spinnerCount; + UINT8 countOfWaitersSignaledToWake; + }; + UINT64 data; + }; + + Counts(UINT64 data = 0) : data(data) + { + LIMITED_METHOD_CONTRACT; + } + + operator UINT64() const + { + LIMITED_METHOD_CONTRACT; + return data; + } + + Counts &operator =(UINT64 data) + { + LIMITED_METHOD_CONTRACT; + + this->data = data; + return *this; + } + + Counts VolatileLoad() const + { + LIMITED_METHOD_CONTRACT; + return ::VolatileLoad(&data); + } + + Counts CompareExchange(Counts toCounts, Counts fromCounts) + { + 
LIMITED_METHOD_CONTRACT; + return (UINT64)InterlockedCompareExchange64((LONG64 *)&data, (LONG64)toCounts, (LONG64)fromCounts); + } + }; + +public: + CLRLifoSemaphore() : m_handle(nullptr) + { + LIMITED_METHOD_CONTRACT; + } + + ~CLRLifoSemaphore() + { + WRAPPER_NO_CONTRACT; + Close(); + } + +public: + void Create(INT32 initialSignalCount, INT32 maximumSignalCount); + void Close(); + +public: + BOOL IsValid() const + { + LIMITED_METHOD_CONTRACT; + return m_handle != nullptr; + } + +private: + bool WaitForSignal(DWORD timeoutMs); +public: + bool Wait(DWORD timeoutMs); + bool Wait(DWORD timeoutMs, UINT32 spinCount, UINT32 processorCount); + void Release(INT32 releaseCount); + +private: + BYTE __padding1[MAX_CACHE_LINE_SIZE]; // padding to ensure that m_counts gets its own cache line + Counts m_counts; + BYTE __padding2[MAX_CACHE_LINE_SIZE]; // padding to ensure that m_counts gets its own cache line + +#if defined(DEBUG) + UINT32 m_maximumSignalCount; +#endif // _DEBUG && !FEATURE_PAL + + // When FEATURE_PAL is defined, this is a handle to an instance of the PAL's LIFO semaphore. When FEATURE_PAL is not + // defined, this is a handle to an I/O completion port. 
+ HANDLE m_handle; +}; + class CLRMutex { public: CLRMutex() diff --git a/src/vm/win32threadpool.cpp b/src/vm/win32threadpool.cpp index eabbcb93aed1..97c020a4b6eb 100644 --- a/src/vm/win32threadpool.cpp +++ b/src/vm/win32threadpool.cpp @@ -103,6 +103,7 @@ DWORD ThreadpoolMgr::NextCompletedWorkRequestsTime; LARGE_INTEGER ThreadpoolMgr::CurrentSampleStartTime; +unsigned int ThreadpoolMgr::WorkerThreadSpinLimit; int ThreadpoolMgr::ThreadAdjustmentInterval; #define INVALID_HANDLE ((HANDLE) -1) @@ -136,8 +137,8 @@ CLREvent * ThreadpoolMgr::RetiredCPWakeupEvent; // wakeup event for comple CrstStatic ThreadpoolMgr::WaitThreadsCriticalSection; ThreadpoolMgr::LIST_ENTRY ThreadpoolMgr::WaitThreadsHead; -ThreadpoolMgr::UnfairSemaphore* ThreadpoolMgr::WorkerSemaphore; -CLRSemaphore* ThreadpoolMgr::RetiredWorkerSemaphore; +CLRLifoSemaphore* ThreadpoolMgr::WorkerSemaphore; +CLRLifoSemaphore* ThreadpoolMgr::RetiredWorkerSemaphore; CrstStatic ThreadpoolMgr::TimerQueueCriticalSection; HANDLE ThreadpoolMgr::TimerThread=NULL; @@ -353,6 +354,7 @@ BOOL ThreadpoolMgr::Initialize() EX_TRY { + WorkerThreadSpinLimit = CLRConfig::GetConfigValue(CLRConfig::INTERNAL_ThreadPool_UnfairSemaphoreSpinLimit); ThreadAdjustmentInterval = CLRConfig::GetConfigValue(CLRConfig::INTERNAL_HillClimbing_SampleIntervalLow); pADTPCount->InitResources(); @@ -370,26 +372,26 @@ BOOL ThreadpoolMgr::Initialize() RetiredCPWakeupEvent->CreateAutoEvent(FALSE); _ASSERTE(RetiredCPWakeupEvent->IsValid()); - int spinLimitPerProcessor = CLRConfig::GetConfigValue(CLRConfig::INTERNAL_ThreadPool_UnfairSemaphoreSpinLimit); - WorkerSemaphore = new UnfairSemaphore(ThreadCounter::MaxPossibleCount, spinLimitPerProcessor); + WorkerSemaphore = new CLRLifoSemaphore(); + WorkerSemaphore->Create(0, ThreadCounter::MaxPossibleCount); - RetiredWorkerSemaphore = new CLRSemaphore(); + RetiredWorkerSemaphore = new CLRLifoSemaphore(); RetiredWorkerSemaphore->Create(0, ThreadCounter::MaxPossibleCount); - //ThreadPool_CPUGroup - if 
(CPUGroupInfo::CanEnableGCCPUGroups() && CPUGroupInfo::CanEnableThreadUseAllCpuGroups()) + //ThreadPool_CPUGroup + if (CPUGroupInfo::CanEnableGCCPUGroups() && CPUGroupInfo::CanEnableThreadUseAllCpuGroups()) RecycledLists.Initialize( CPUGroupInfo::GetNumActiveProcessors() ); else - RecycledLists.Initialize( g_SystemInfo.dwNumberOfProcessors ); - /* - { - SYSTEM_INFO sysInfo; + RecycledLists.Initialize( g_SystemInfo.dwNumberOfProcessors ); + /* + { + SYSTEM_INFO sysInfo; - ::GetSystemInfo( &sysInfo ); + ::GetSystemInfo( &sysInfo ); - RecycledLists.Initialize( sysInfo.dwNumberOfProcessors ); - } - */ + RecycledLists.Initialize( sysInfo.dwNumberOfProcessors ); + } + */ } EX_CATCH { @@ -1034,9 +1036,7 @@ void ThreadpoolMgr::MaybeAddWorkingWorker() if (toUnretire > 0) { - LONG previousCount; - INDEBUG(BOOL success =) RetiredWorkerSemaphore->Release((LONG)toUnretire, &previousCount); - _ASSERTE(success); + RetiredWorkerSemaphore->Release(toUnretire); } if (toRelease > 0) @@ -2055,10 +2055,7 @@ DWORD WINAPI ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) while (true) { RetryRetire: - DWORD result = RetiredWorkerSemaphore->Wait(AppX::IsAppXProcess() ? WorkerTimeoutAppX : WorkerTimeout, FALSE); - _ASSERTE(WAIT_OBJECT_0 == result || WAIT_TIMEOUT == result); - - if (WAIT_OBJECT_0 == result) + if (RetiredWorkerSemaphore->Wait(AppX::IsAppXProcess() ? WorkerTimeoutAppX : WorkerTimeout)) { foundWork = true; @@ -2134,59 +2131,57 @@ DWORD WINAPI ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) FireEtwThreadPoolWorkerThreadWait(counts.NumActive, counts.NumRetired, GetClrInstanceId()); RetryWaitForWork: - if (!WorkerSemaphore->Wait(AppX::IsAppXProcess() ? WorkerTimeoutAppX : WorkerTimeout)) + if (WorkerSemaphore->Wait(AppX::IsAppXProcess() ? WorkerTimeoutAppX : WorkerTimeout, WorkerThreadSpinLimit, NumberOfProcessors)) { - if (!IsIoPending()) - { - // - // We timed out, and are about to exit. 
This puts us in a very similar situation to the - // retirement case above - someone may think we're still waiting, and go ahead and: - // - // 1) Increment NumWorking - // 2) Signal WorkerSemaphore - // - // The solution is much like retirement; when we're decrementing NumActive, we need to make - // sure it doesn't drop below NumWorking. If it would, then we need to go back and wait - // again. - // + foundWork = true; + goto Work; + } - DangerousNonHostedSpinLockHolder tal(&ThreadAdjustmentLock); + if (!IsIoPending()) + { + // + // We timed out, and are about to exit. This puts us in a very similar situation to the + // retirement case above - someone may think we're still waiting, and go ahead and: + // + // 1) Increment NumWorking + // 2) Signal WorkerSemaphore + // + // The solution is much like retirement; when we're decrementing NumActive, we need to make + // sure it doesn't drop below NumWorking. If it would, then we need to go back and wait + // again. + // - // counts volatile read paired with CompareExchangeCounts loop set - counts = WorkerCounter.DangerousGetDirtyCounts(); - while (true) - { - if (counts.NumActive == counts.NumWorking) - { - goto RetryWaitForWork; - } + DangerousNonHostedSpinLockHolder tal(&ThreadAdjustmentLock); - newCounts = counts; - newCounts.NumActive--; + // counts volatile read paired with CompareExchangeCounts loop set + counts = WorkerCounter.DangerousGetDirtyCounts(); + while (true) + { + if (counts.NumActive == counts.NumWorking) + { + goto RetryWaitForWork; + } - // if we timed out while active, then Hill Climbing needs to be told that we need fewer threads - newCounts.MaxWorking = max(MinLimitTotalWorkerThreads, min(newCounts.NumActive, newCounts.MaxWorking)); + newCounts = counts; + newCounts.NumActive--; - oldCounts = WorkerCounter.CompareExchangeCounts(newCounts, counts); + // if we timed out while active, then Hill Climbing needs to be told that we need fewer threads + newCounts.MaxWorking = 
max(MinLimitTotalWorkerThreads, min(newCounts.NumActive, newCounts.MaxWorking)); - if (oldCounts == counts) - { - HillClimbingInstance.ForceChange(newCounts.MaxWorking, ThreadTimedOut); - goto Exit; - } + oldCounts = WorkerCounter.CompareExchangeCounts(newCounts, counts); - counts = oldCounts; + if (oldCounts == counts) + { + HillClimbingInstance.ForceChange(newCounts.MaxWorking, ThreadTimedOut); + goto Exit; } - } - else - { - goto RetryWaitForWork; + + counts = oldCounts; } } else { - foundWork = true; - goto Work; + goto RetryWaitForWork; } Exit: diff --git a/src/vm/win32threadpool.h b/src/vm/win32threadpool.h index fc5742b4946a..764c65efdcb8 100644 --- a/src/vm/win32threadpool.h +++ b/src/vm/win32threadpool.h @@ -105,245 +105,6 @@ class ThreadpoolMgr friend class HillClimbing; friend struct _DacGlobals; - // - // UnfairSemaphore is a more scalable semaphore than CLRSemaphore. It prefers to release threads that have more recently begun waiting, - // to preserve locality. Additionally, very recently-waiting threads can be released without an addition kernel transition to unblock - // them, which reduces latency. - // - // UnfairSemaphore is only appropriate in scenarios where the order of unblocking threads is not important, and where threads frequently - // need to be woken. This is true of the ThreadPool's "worker semaphore", but not, for example, of the "retired worker semaphore" which is - // only rarely signalled. - // - // A further optimization that could be done here would be to replace CLRSemaphore with a Win32 IO Completion Port. Completion ports - // unblock threads in LIFO order, unlike the roughly-FIFO ordering of ordinary semaphores, and that would help to keep the "warm" threads warm. - // We did not do this in CLR 4.0 because hosts currently have no way of intercepting calls to IO Completion Ports (other than THE completion port - // behind the I/O thread pool), and we did not have time to explore the implications of this. 
Also, completion ports are not available on the Mac, - // though Snow Leopard has something roughly similar (and a regular Semaphore would do on the Mac in a pinch). - // - class UnfairSemaphore - { - private: - // padding to ensure we get our own cache line - BYTE padding1[MAX_CACHE_LINE_SIZE]; - - // - // We track everything we care about in a single 64-bit struct to allow us to - // do CompareExchanges on this for atomic updates. - // - union Counts - { - struct - { - int spinners : 16; //how many threads are currently spin-waiting for this semaphore? - int countForSpinners : 16; //how much of the semaphore's count is availble to spinners? - int waiters : 16; //how many threads are blocked in the OS waiting for this semaphore? - int countForWaiters : 16; //how much count is available to waiters? - }; - - LONGLONG asLongLong; - - } m_counts; - - private: - // padding to ensure we get our own cache line - BYTE padding2[MAX_CACHE_LINE_SIZE]; - - const int m_spinLimitPerProcessor; //used when calculating max spin duration - CLRSemaphore m_sem; //waiters wait on this - - INDEBUG(int m_maxCount;) - - bool UpdateCounts(Counts newCounts, Counts currentCounts) - { - LIMITED_METHOD_CONTRACT; - Counts oldCounts; - oldCounts.asLongLong = FastInterlockCompareExchangeLong(&m_counts.asLongLong, newCounts.asLongLong, currentCounts.asLongLong); - if (oldCounts.asLongLong == currentCounts.asLongLong) - { - // we succesfully updated the counts. Now validate what we put in. - // Note: we can't validate these unless the CompareExchange succeeds, because - // on x86 a VolatileLoad of m_counts is not atomic; we could end up getting inconsistent - // values. It's not until we've successfully stored the new values that we know for sure - // that the old values were correct (because if they were not, the CompareExchange would have - // failed. 
- _ASSERTE(newCounts.spinners >= 0); - _ASSERTE(newCounts.countForSpinners >= 0); - _ASSERTE(newCounts.waiters >= 0); - _ASSERTE(newCounts.countForWaiters >= 0); - _ASSERTE(newCounts.countForSpinners + newCounts.countForWaiters <= m_maxCount); - - return true; - } - else - { - // we lost a race with some other thread, and will need to try again. - return false; - } - } - - public: - - UnfairSemaphore(int maxCount, int spinLimitPerProcessor) - : m_spinLimitPerProcessor(spinLimitPerProcessor) - { - CONTRACTL - { - THROWS; - GC_NOTRIGGER; - SO_TOLERANT; - MODE_ANY; - } - CONTRACTL_END; - _ASSERTE(maxCount <= 0x7fff); //counts need to fit in signed 16-bit ints - INDEBUG(m_maxCount = maxCount;) - - m_counts.asLongLong = 0; - m_sem.Create(0, maxCount); - } - - // - // no destructor - CLRSemaphore will close itself in its own destructor. - // - //~UnfairSemaphore() - //{ - //} - - - void Release(int countToRelease) - { - while (true) - { - Counts currentCounts, newCounts; - currentCounts.asLongLong = VolatileLoad(&m_counts.asLongLong); - newCounts = currentCounts; - - int remainingCount = countToRelease; - - // First, prefer to release existing spinners, - // because a) they're hot, and b) we don't need a kernel - // transition to release them. 
- int spinnersToRelease = max(0, min(remainingCount, currentCounts.spinners - currentCounts.countForSpinners)); - newCounts.countForSpinners += spinnersToRelease; - remainingCount -= spinnersToRelease; - - // Next, prefer to release existing waiters - int waitersToRelease = max(0, min(remainingCount, currentCounts.waiters - currentCounts.countForWaiters)); - newCounts.countForWaiters += waitersToRelease; - remainingCount -= waitersToRelease; - - // Finally, release any future spinners that might come our way - newCounts.countForSpinners += remainingCount; - - // Try to commit the transaction - if (UpdateCounts(newCounts, currentCounts)) - { - // Now we need to release the waiters we promised to release - if (waitersToRelease > 0) - { - LONG previousCount; - INDEBUG(BOOL success =) m_sem.Release((LONG)waitersToRelease, &previousCount); - _ASSERTE(success); - } - break; - } - } - } - - - bool Wait(DWORD timeout) - { - while (true) - { - Counts currentCounts, newCounts; - currentCounts.asLongLong = VolatileLoad(&m_counts.asLongLong); - newCounts = currentCounts; - - // First, just try to grab some count. - if (currentCounts.countForSpinners > 0) - { - newCounts.countForSpinners--; - if (UpdateCounts(newCounts, currentCounts)) - return true; - } - else - { - // No count available, become a spinner - newCounts.spinners++; - if (UpdateCounts(newCounts, currentCounts)) - break; - } - } - - // - // Now we're a spinner. 
- // - int numSpins = 0; - while (true) - { - Counts currentCounts, newCounts; - - currentCounts.asLongLong = VolatileLoad(&m_counts.asLongLong); - newCounts = currentCounts; - - if (currentCounts.countForSpinners > 0) - { - newCounts.countForSpinners--; - newCounts.spinners--; - if (UpdateCounts(newCounts, currentCounts)) - return true; - } - else - { - double spinnersPerProcessor = (double)currentCounts.spinners / ThreadpoolMgr::NumberOfProcessors; - int spinLimit = (int)((m_spinLimitPerProcessor / spinnersPerProcessor) + 0.5); - if (numSpins >= spinLimit) - { - newCounts.spinners--; - newCounts.waiters++; - if (UpdateCounts(newCounts, currentCounts)) - break; - } - else - { - // - // We yield to other threads using SleepEx rather than the more traditional SwitchToThread. - // This is because SwitchToThread does not yield to threads currently scheduled to run on other - // processors. On a 4-core machine, for example, this means that SwitchToThread is only ~25% likely - // to yield to the correct thread in some scenarios. - // SleepEx has the disadvantage of not yielding to lower-priority threads. However, this is ok because - // once we've called this a few times we'll become a "waiter" and wait on the CLRSemaphore, and that will - // yield to anything that is runnable. 
- // - ClrSleepEx(0, FALSE); - numSpins++; - } - } - } - - // - // Now we're a waiter - // - DWORD result = m_sem.Wait(timeout, FALSE); - _ASSERTE(WAIT_OBJECT_0 == result || WAIT_TIMEOUT == result); - - while (true) - { - Counts currentCounts, newCounts; - - currentCounts.asLongLong = VolatileLoad(&m_counts.asLongLong); - newCounts = currentCounts; - - newCounts.waiters--; - - if (result == WAIT_OBJECT_0) - newCounts.countForWaiters--; - - if (UpdateCounts(newCounts, currentCounts)) - return (result == WAIT_OBJECT_0); - } - } - }; - public: struct ThreadCounter { @@ -1258,6 +1019,7 @@ class ThreadpoolMgr static LARGE_INTEGER CurrentSampleStartTime; + static unsigned int WorkerThreadSpinLimit; static int ThreadAdjustmentInterval; SPTR_DECL(WorkRequest,WorkRequestHead); // Head of work request queue @@ -1286,7 +1048,7 @@ class ThreadpoolMgr // 2) There is no functional reason why any particular thread should be preferred when waking workers. This only impacts performance, // and un-fairness helps performance in this case. // - static UnfairSemaphore* WorkerSemaphore; + static CLRLifoSemaphore* WorkerSemaphore; // // RetiredWorkerSemaphore is a regular CLRSemaphore, not an UnfairSemaphore, because if a thread waits on this semaphore is it almost certainly @@ -1295,7 +1057,7 @@ class ThreadpoolMgr // down, by constantly re-using the same small set of retired workers rather than round-robining between all of them as CLRSemaphore will do. // If we go that route, we should add a "no-spin" option to UnfairSemaphore.Wait to avoid wasting CPU. // - static CLRSemaphore* RetiredWorkerSemaphore; + static CLRLifoSemaphore* RetiredWorkerSemaphore; static CLREvent * RetiredCPWakeupEvent;