diff --git a/ACKNOWLEDGEMENTS.txt b/ACKNOWLEDGEMENTS.txt new file mode 100644 index 0000000..48dfada --- /dev/null +++ b/ACKNOWLEDGEMENTS.txt @@ -0,0 +1,30 @@ +=========================================================================== +moodycamel::ConcurrentQueue (https://github.com/cameron314/concurrentqueue) +=========================================================================== + +Simplified BSD License: + +Copyright (c) 2013-2016, Cameron Desrochers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + diff --git a/CMakeLists.txt b/CMakeLists.txt index 87b7181..d9cacba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,7 @@ if(NOT LibJpegTurbo_FOUND) list(APPEND COMMON_FIND_PACKAGE_DEFINES DEFLECT_USE_LEGACY_LIBJPEGTURBO) endif() common_find_package(OpenGL) +common_find_package(OpenMP) common_find_package(Qt5Concurrent REQUIRED SYSTEM) common_find_package(Qt5Core REQUIRED) if(APPLE) @@ -48,6 +49,8 @@ if(NOT Qt5Quick_VERSION VERSION_LESS 5.5) option(DEFLECT_QMLSTREAMER_MULTITHREADED "Use multithreaded-rendering in QMLStreamer" ON) endif() +set(LCOV_EXCLUDE "deflect/moodycamel/*") + add_subdirectory(deflect) add_subdirectory(apps) if(Boost_FOUND) diff --git a/deflect/ImageSegmenter.cpp b/deflect/ImageSegmenter.cpp index ea18ee6..38dbae8 100644 --- a/deflect/ImageSegmenter.cpp +++ b/deflect/ImageSegmenter.cpp @@ -76,7 +76,7 @@ Segment ImageSegmenter::compressSingleSegment(const ImageWrapper& image) if (segments.size() > 1) throw std::runtime_error( "compressSingleSegment only works for small images"); - ImageSegmenter::_computeJpeg(segments[0], false); + _computeJpeg(segments[0], false); return segments[0]; #else throw std::runtime_error( @@ -234,7 +234,7 @@ SegmentParametersList ImageSegmenter::_makeSegmentParameters( p.y = image.y + j * info.height; p.width = (i < info.countX - 1) ? info.width : info.lastWidth; p.height = (j < info.countY - 1) ? 
info.height : info.lastHeight; - parameters.push_back(p); + parameters.emplace_back(p); } } return parameters; diff --git a/deflect/ImageSegmenter.h b/deflect/ImageSegmenter.h index 9f1de90..ed100f7 100644 --- a/deflect/ImageSegmenter.h +++ b/deflect/ImageSegmenter.h @@ -98,6 +98,7 @@ class ImageSegmenter * * @param image The image to be compressed * @return the compressed segment + * @threadsafe */ DEFLECT_API Segment compressSingleSegment(const ImageWrapper& image); diff --git a/deflect/MessageHeader.cpp b/deflect/MessageHeader.cpp index 4db2d64..657ade4 100644 --- a/deflect/MessageHeader.cpp +++ b/deflect/MessageHeader.cpp @@ -69,9 +69,7 @@ MessageHeader::MessageHeader(const MessageType type_, const uint32_t size_, QDataStream& operator<<(QDataStream& out, const deflect::MessageHeader& header) { out << (qint32)header.type << (quint32)header.size; - - for (size_t i = 0; i < MESSAGE_HEADER_URI_LENGTH; ++i) - out << (quint8)header.uri[i]; + out.writeRawData(header.uri, MESSAGE_HEADER_URI_LENGTH); return out; } diff --git a/deflect/Socket.cpp b/deflect/Socket.cpp index 0a66608..b224c01 100644 --- a/deflect/Socket.cpp +++ b/deflect/Socket.cpp @@ -1,5 +1,7 @@ /*********************************************************************/ -/* Copyright (c) 2011 - 2012, The University of Texas at Austin. */ +/* Copyright (c) 2015-2017, EPFL/Blue Brain Project */ +/* Raphael Dumusc */ +/* Daniel Nachbaur */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -33,7 +35,7 @@ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ +/* or implied, of Ecole polytechnique federale de Lausanne. */ /*********************************************************************/ #include "Socket.h" @@ -108,7 +110,8 @@ bool Socket::hasMessage(const size_t messageSize) const (int)(MessageHeader::serializedSize + messageSize); } -bool Socket::send(const MessageHeader& messageHeader, const QByteArray& message) +bool Socket::send(const MessageHeader& messageHeader, const QByteArray& message, + const bool waitForBytesWritten) { QMutexLocker locker(&_socketMutex); if (!isConnected()) @@ -123,10 +126,13 @@ bool Socket::send(const MessageHeader& messageHeader, const QByteArray& message) // send message const bool allSent = _write(message); - // Needed in the absence of event loop, otherwise the reception is frozen. - while (_socket->bytesToWrite() > 0 && isConnected()) - _socket->waitForBytesWritten(); - + if (waitForBytesWritten) + { + // Needed in the absence of an event loop, otherwise the reception is + // frozen. + while (_socket->bytesToWrite() > 0 && isConnected()) + _socket->waitForBytesWritten(); + } return allSent; } diff --git a/deflect/Socket.h b/deflect/Socket.h index 46a8c1d..2e9094c 100644 --- a/deflect/Socket.h +++ b/deflect/Socket.h @@ -1,5 +1,7 @@ /*********************************************************************/ -/* Copyright (c) 2011 - 2012, The University of Texas at Austin. */ +/* Copyright (c) 2015-2017, EPFL/Blue Brain Project */ +/* Raphael Dumusc */ +/* Daniel Nachbaur */ /* All rights reserved.
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -33,7 +35,7 @@ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ +/* or implied, of Ecole polytechnique federale de Lausanne. */ /*********************************************************************/ #ifndef DEFLECT_SOCKET_H @@ -103,9 +105,13 @@ class Socket : public QObject * Send a message. * @param messageHeader The message header * @param message The message data + * @param waitForBytesWritten wait until the message is completely sent; in + * case of multiple sends per frame it is advised to do this only + * once per frame * @return true if the message could be sent, false otherwise */ - bool send(const MessageHeader& messageHeader, const QByteArray& message); + bool send(const MessageHeader& messageHeader, const QByteArray& message, + bool waitForBytesWritten); /** * Receive a message. diff --git a/deflect/StreamSendWorker.cpp b/deflect/StreamSendWorker.cpp index 6996895..67b709e 100644 --- a/deflect/StreamSendWorker.cpp +++ b/deflect/StreamSendWorker.cpp @@ -57,6 +57,7 @@ namespace deflect StreamSendWorker::StreamSendWorker(Socket& socket, const std::string& id) : _socket(socket) , _id(id) + , _dequeuedRequests(std::thread::hardware_concurrency() / 2) { _imageSegmenter.setNominalSegmentDimensions(SEGMENT_SIZE, SEGMENT_SIZE); } @@ -71,62 +72,93 @@ void StreamSendWorker::run() _running = true; while (true) { - // Copy request, unlock enqueue methods during processing of tasks - std::unique_lock<std::mutex> lock(_mutex); - while (_requests.empty() && _running) - _condition.wait(lock); - if (!_running) break; - const auto request = std::move(_requests.front()); - _requests.pop_front(); - lock.unlock(); + size_t count = 0; + if (!_pendingFinish) + count = _requests.wait_dequeue_bulk(_dequeuedRequests.begin(), + _dequeuedRequests.size()); + else + { + // in case we encountered a finish request, get all remaining send + // requests without waiting + count = _requests.try_dequeue_bulk(_dequeuedRequests.begin(), + _dequeuedRequests.size()); + + // no more pending sends, now process the finish request and reset + // for the next finish + if (count == 0) + { + count = 1; + _finishRequest.isFinish = false; // reset so this request is // processed now + _dequeuedRequests[0] = _finishRequest; + _pendingFinish = false; + } + } - bool success = true; - for (auto& task : request.tasks) + for (size_t i = 0; i < count; ++i) { - if (!task()) + bool success = true; + auto& request = _dequeuedRequests[i]; + + // postpone a finish request to maintain order (as the lock-free + // queue does not guarantee ordering) + if (request.isFinish) + { + if (_pendingFinish) + throw std::runtime_error("Already have a pending finish"); + + _finishRequest = request; + _pendingFinish = true; + continue; + } + + for (auto& task : request.tasks) { - success = false; - break; + if (!task()) + { + success = false; + break; + } } + if (request.promise) + request.promise->set_value(success); } - request.promise->set_value(success); } } void StreamSendWorker::stop() { { - std::lock_guard<std::mutex> lock(_mutex); - if (!_running) - return; _running = false; - _condition.notify_all(); + _enqueueRequest(std::vector<Task>()); } quit(); wait(); - while (!_requests.empty()) + Request request; while (_requests.try_dequeue(request)) { - _requests.front().promise->set_value(false); - _requests.pop_front();
+ if (request.promise) + request.promise->set_value(false); } } Stream::Future StreamSendWorker::enqueueImage(const ImageWrapper& image, const bool finish) { + if (_pendingFinish) + throw(std::runtime_error{"Pending finish, no send allowed"}); + if (image.compressionPolicy != COMPRESSION_ON && image.pixelFormat != RGBA) { std::cerr << "Currently, RAW images can only be sent in RGBA format. " "Support for other formats remains to be implemented." << std::endl; - std::promise<bool> promise; - promise.set_value(false); - return promise.get_future(); + return make_ready_future(false); } std::vector<Task> tasks; @@ -136,6 +168,12 @@ Stream::Future StreamSendWorker::enqueueImage(const ImageWrapper& image, { auto segment = _imageSegmenter.compressSingleSegment(image); tasks.emplace_back([this, segment] { return _sendSegment(segment); }); + + // as we expect to encounter a lot of these small sends, be optimistic + // and fulfill the promise right away to reduce load in the send thread + // (cf. the cost of lock operations on KNL) + _requests.enqueue({nullptr, tasks, false}); + return make_ready_future(true); } else tasks.emplace_back([this, image] { return _sendImage(image); }); @@ -148,7 +186,7 @@ Stream::Future StreamSendWorker::enqueueImage(const ImageWrapper& image, Stream::Future StreamSendWorker::enqueueFinish() { - return _enqueueRequest({[this] { return _sendFinish(); }}); + return _enqueueRequest({[this] { return _sendFinish(); }}, true); } Stream::Future StreamSendWorker::enqueueOpen() @@ -195,13 +233,11 @@ Stream::Future StreamSendWorker::enqueueData(const QByteArray data) {[this, data] { return _send(MESSAGE_TYPE_DATA, data); }}); } -Stream::Future StreamSendWorker::_enqueueRequest(std::vector<Task>&& tasks) +Stream::Future StreamSendWorker::_enqueueRequest(std::vector<Task>&& tasks, + const bool isFinish) { PromisePtr promise(new Promise); - - std::lock_guard<std::mutex> lock(_mutex); - _requests.push_back({promise, tasks}); - _condition.notify_all(); + _requests.enqueue({promise, tasks, isFinish}); return promise->get_future(); } @@ -230,7 +266,7 @@ bool StreamSendWorker::_sendSegment(const Segment& segment) auto message = QByteArray{(const char*)(&segment.parameters), sizeof(SegmentParameters)}; message.append(segment.imageData); - return _send(MESSAGE_TYPE_PIXELSTREAM, message); + return _send(MESSAGE_TYPE_PIXELSTREAM, message, false); } bool StreamSendWorker::_sendFinish() { return _send(MESSAGE_TYPE_PIXELSTREAM_FINISH_FRAME, {}); } -bool StreamSendWorker::_send(const MessageType type, const QByteArray& message) +bool StreamSendWorker::_send(const MessageType type, const QByteArray& message, + const bool waitForBytesWritten) { - return _socket.send(MessageHeader(type, message.size(), _id), message); + return _socket.send(MessageHeader(type, message.size(), _id), message, + waitForBytesWritten); } } diff --git a/deflect/StreamSendWorker.h b/deflect/StreamSendWorker.h index 1f6730e..a56ac30 100644 --- a/deflect/StreamSendWorker.h +++ b/deflect/StreamSendWorker.h @@ -46,10 +46,16 @@ #include "Socket.h" // member #include "Stream.h" // Stream::Future -#include +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#endif +#include "moodycamel/blockingconcurrentqueue.h" +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif -#include -#include +#include namespace deflect { @@ -99,29 +105,34 @@ class StreamSendWorker : public QThread { PromisePtr promise; std::vector<Task> tasks; + bool isFinish; }; Socket& _socket; const std::string& _id; ImageSegmenter
_imageSegmenter; - std::deque<Request> _requests; - std::mutex _mutex; - std::condition_variable _condition; + moodycamel::BlockingConcurrentQueue<Request> _requests; bool _running = false; View _currentView = View::mono; + std::vector<Request> _dequeuedRequests; + bool _pendingFinish = false; + Request _finishRequest; + /** Main QThread loop doing asynchronous processing of queued tasks. */ void run() final; - Stream::Future _enqueueRequest(std::vector<Task>&& actions); + Stream::Future _enqueueRequest(std::vector<Task>&& actions, + bool isFinish = false); friend class deflect::test::Application; // to send pre-compressed segments bool _sendImage(const ImageWrapper& image); bool _sendImageView(View view); bool _sendSegment(const Segment& segment); bool _sendFinish(); - bool _send(MessageType type, const QByteArray& message); + bool _send(MessageType type, const QByteArray& message, + bool waitForBytesWritten = true); }; } #endif diff --git a/deflect/moodycamel/blockingconcurrentqueue.h b/deflect/moodycamel/blockingconcurrentqueue.h new file mode 100644 index 0000000..48405fa --- /dev/null +++ b/deflect/moodycamel/blockingconcurrentqueue.h @@ -0,0 +1,1097 @@ +// Provides an efficient blocking version of moodycamel::ConcurrentQueue. +// ©2015-2016 Cameron Desrochers. Distributed under the terms of the simplified +// BSD license, available at the top of concurrentqueue.h. +// Uses Jeff Preshing's semaphore implementation (under the terms of its +// separate zlib license, embedded below). + +#pragma once + +#include "concurrentqueue.h" +#include +#include +#include +#include +#include + +#if defined(_WIN32) +// Avoid including windows.h in a header; we only need a handful of +// items, so we'll redeclare them here (this is relatively safe since +// the API generally has to remain stable between Windows versions). +// I know this is an ugly hack but it still beats polluting the global +// namespace with thousands of generic names or adding a .cpp for nothing. +extern "C" { +struct _SECURITY_ATTRIBUTES; +__declspec(dllimport) void* __stdcall CreateSemaphoreW( _SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName); +__declspec(dllimport) int __stdcall CloseHandle(void* hObject); +__declspec(dllimport) unsigned long __stdcall WaitForSingleObject( void* hHandle, unsigned long dwMilliseconds); +__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount); +} +#elif defined(__MACH__) +#include +#elif defined(__unix__) +#include +#endif + +namespace moodycamel +{ +namespace details +{ +// Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's +// portable + lightweight semaphore implementations, originally from +// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h +// LICENSE: +// Copyright (c) 2015 Jeff Preshing +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2.
Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. +namespace mpmc_sema +{ +#if defined(_WIN32) +class Semaphore +{ +private: + void* m_hSema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + const long maxLong = 0x7fffffff; + m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr); + } + + ~Semaphore() { CloseHandle(m_hSema); } + void wait() + { + const unsigned long infinite = 0xffffffff; + WaitForSingleObject(m_hSema, infinite); + } + + bool try_wait() + { + const unsigned long RC_WAIT_TIMEOUT = 0x00000102; + return WaitForSingleObject(m_hSema, 0) != RC_WAIT_TIMEOUT; + } + + bool timed_wait(std::uint64_t usecs) + { + const unsigned long RC_WAIT_TIMEOUT = 0x00000102; + return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) != + RC_WAIT_TIMEOUT; + } + + void signal(int count = 1) { ReleaseSemaphore(m_hSema, count, nullptr); } +}; +#elif defined(__MACH__) +//--------------------------------------------------------- +// Semaphore (Apple iOS and OSX) +// Can't use POSIX semaphores due to +// http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html +//--------------------------------------------------------- +class Semaphore +{ +private: + semaphore_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, + initialCount); + } + + ~Semaphore() { semaphore_destroy(mach_task_self(), m_sema); } + void wait() { semaphore_wait(m_sema); } + bool try_wait() { return timed_wait(0); } + bool timed_wait(std::uint64_t timeout_usecs) + { + mach_timespec_t ts; + ts.tv_sec = timeout_usecs / 1000000; + ts.tv_nsec = (timeout_usecs % 1000000) * 1000; + + // added in OSX 10.10: + // https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html + kern_return_t rc = semaphore_timedwait(m_sema, ts); + + return rc != KERN_OPERATION_TIMED_OUT; + } + + void signal() { semaphore_signal(m_sema); } + void signal(int count) + { + while (count-- > 0) + { + semaphore_signal(m_sema); + } + } +}; +#elif defined(__unix__) +//--------------------------------------------------------- +// Semaphore (POSIX, Linux) +//--------------------------------------------------------- +class Semaphore +{ +private: + sem_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + sem_init(&m_sema, 0, initialCount); + } + + ~Semaphore() { sem_destroy(&m_sema); } + void wait() + { + // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error + int rc; + do + { + rc = sem_wait(&m_sema); + } while (rc == -1 && errno == EINTR); + } + + bool try_wait() + { + int rc; + do + { + rc = sem_trywait(&m_sema); + } while (rc == -1 && errno == EINTR); + return !(rc == -1 && errno == EAGAIN); + } + + bool timed_wait(std::uint64_t usecs) + { + struct timespec ts; + const int usecs_in_1_sec = 1000000; + const 
int nsecs_in_1_sec = 1000000000; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += usecs / usecs_in_1_sec; + ts.tv_nsec += (usecs % usecs_in_1_sec) * 1000; + // sem_timedwait bombs if you have more than 1e9 in tv_nsec + // so we have to clean things up before passing it in + if (ts.tv_nsec > nsecs_in_1_sec) + { + ts.tv_nsec -= nsecs_in_1_sec; + ++ts.tv_sec; + } + + int rc; + do + { + rc = sem_timedwait(&m_sema, &ts); + } while (rc == -1 && errno == EINTR); + return !(rc == -1 && errno == ETIMEDOUT); + } + + void signal() { sem_post(&m_sema); } + void signal(int count) + { + while (count-- > 0) + { + sem_post(&m_sema); + } + } +}; +#else +#error Unsupported platform! (No semaphore wrapper available) +#endif + +//--------------------------------------------------------- +// LightweightSemaphore +//--------------------------------------------------------- +class LightweightSemaphore +{ +public: + typedef std::make_signed::type ssize_t; + +private: + std::atomic m_count; + Semaphore m_sema; + + bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) + { + ssize_t oldCount; + // Is there a better way to set the initial spin count? + // If we lower it to 1000, testBenaphore becomes 15x slower on my Core + // i7-5930K Windows PC, + // as threads start hitting the kernel semaphore. + int spin = 10000; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if ((oldCount > 0) && + m_count.compare_exchange_strong(oldCount, oldCount - 1, + std::memory_order_acquire, + std::memory_order_relaxed)) + return true; + std::atomic_signal_fence(std::memory_order_acquire); // Prevent the + // compiler + // from + // collapsing + // the loop. + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount > 0) + return true; + if (timeout_usecs < 0) + { + m_sema.wait(); + return true; + } + if (m_sema.timed_wait((std::uint64_t)timeout_usecs)) + return true; + // At this point, we've timed out waiting for the semaphore, but the + // count is still decremented indicating we may still be waiting on + // it. So we have to re-adjust the count, but only if the semaphore + // wasn't signaled enough times for us too since then. If it was, we + // need to release the semaphore too. + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + return true; + if (oldCount < 0 && + m_count.compare_exchange_strong(oldCount, oldCount + 1, + std::memory_order_relaxed, + std::memory_order_relaxed)) + return false; + } + } + + ssize_t waitManyWithPartialSpinning(ssize_t max, + std::int64_t timeout_usecs = -1) + { + assert(max > 0); + ssize_t oldCount; + int spin = 10000; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if (oldCount > 0) + { + ssize_t newCount = oldCount > max ? 
oldCount - max : 0; + if (m_count.compare_exchange_strong(oldCount, newCount, + std::memory_order_acquire, + std::memory_order_relaxed)) + return oldCount - newCount; + } + std::atomic_signal_fence(std::memory_order_acquire); + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount <= 0) + { + if (timeout_usecs < 0) + m_sema.wait(); + else if (!m_sema.timed_wait((std::uint64_t)timeout_usecs)) + { + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + break; + if (oldCount < 0 && + m_count.compare_exchange_strong( + oldCount, oldCount + 1, std::memory_order_relaxed, + std::memory_order_relaxed)) + return 0; + } + } + } + if (max > 1) + return 1 + tryWaitMany(max - 1); + return 1; + } + +public: + LightweightSemaphore(ssize_t initialCount = 0) + : m_count(initialCount) + { + assert(initialCount >= 0); + } + + bool tryWait() + { + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + if (m_count.compare_exchange_weak(oldCount, oldCount - 1, + std::memory_order_acquire, + std::memory_order_relaxed)) + return true; + } + return false; + } + + void wait() + { + if (!tryWait()) + waitWithPartialSpinning(); + } + + bool wait(std::int64_t timeout_usecs) + { + return tryWait() || waitWithPartialSpinning(timeout_usecs); + } + + // Acquires between 0 and (greedily) max, inclusive + ssize_t tryWaitMany(ssize_t max) + { + assert(max >= 0); + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + ssize_t newCount = oldCount > max ? oldCount - max : 0; + if (m_count.compare_exchange_weak(oldCount, newCount, + std::memory_order_acquire, + std::memory_order_relaxed)) + return oldCount - newCount; + } + return 0; + } + + // Acquires at least one, and (greedily) at most max + ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs) + { + assert(max >= 0); + ssize_t result = tryWaitMany(max); + if (result == 0 && max > 0) + result = waitManyWithPartialSpinning(max, timeout_usecs); + return result; + } + + ssize_t waitMany(ssize_t max) + { + ssize_t result = waitMany(max, -1); + assert(result > 0); + return result; + } + + void signal(ssize_t count = 1) + { + assert(count >= 0); + ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release); + ssize_t toRelease = -oldCount < count ? -oldCount : count; + if (toRelease > 0) + { + m_sema.signal((int)toRelease); + } + } + + ssize_t availableApprox() const + { + ssize_t count = m_count.load(std::memory_order_relaxed); + return count > 0 ? count : 0; + } +}; +} // end namespace mpmc_sema +} // end namespace details + +// This is a blocking version of the queue. It has an almost identical interface +// to +// the normal non-blocking version, with the addition of various wait_dequeue() +// methods +// and the removal of producer-specific dequeue methods. 
+template +class BlockingConcurrentQueue +{ +private: + typedef ::moodycamel::ConcurrentQueue ConcurrentQueue; + typedef details::mpmc_sema::LightweightSemaphore LightweightSemaphore; + +public: + typedef typename ConcurrentQueue::producer_token_t producer_token_t; + typedef typename ConcurrentQueue::consumer_token_t consumer_token_t; + + typedef typename ConcurrentQueue::index_t index_t; + typedef typename ConcurrentQueue::size_t size_t; + typedef typename std::make_signed::type ssize_t; + + static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE; + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = + ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD; + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = + ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE; + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = + ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE; + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = + ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + static const std::uint32_t + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE; + static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE; + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be + // allocated + // up-front, which means only a single producer will be able to enqueue + // elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads + // (this + // includes making the memory effects of construction visible, possibly with + // a + // memory barrier). + explicit BlockingConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : inner(capacity) + , sema(create(), + &BlockingConcurrentQueue::template destroy) + { + assert( + reinterpret_cast((BlockingConcurrentQueue*)1) == + &((BlockingConcurrentQueue*)1)->inner && + "BlockingConcurrentQueue must have ConcurrentQueue as its first " + "member"); + if (!sema) + { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, + size_t maxImplicitProducers) + : inner(minCapacity, maxExplicitProducers, maxImplicitProducers) + , sema(create(), + &BlockingConcurrentQueue::template destroy) + { + assert( + reinterpret_cast((BlockingConcurrentQueue*)1) == + &((BlockingConcurrentQueue*)1)->inner && + "BlockingConcurrentQueue must have ConcurrentQueue as its first " + "member"); + if (!sema) + { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + // Disable copying and copy assignment + BlockingConcurrentQueue(BlockingConcurrentQueue const&) + MOODYCAMEL_DELETE_FUNCTION; + BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) + MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
+ BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : inner(std::move(other.inner)), + sema(std::move(other.sema)) + { + } + + inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) + MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other) + { + if (this == &other) + { + return *this; + } + + inner.swap(other.inner); + sema.swap(other.sema); + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit + // production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + if (details::likely(inner.enqueue(item))) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit + // production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + if (details::likely(inner.enqueue(std::move(item)))) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + if (details::likely(inner.enqueue(token, item))) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + if (details::likely(inner.enqueue(token, std::move(item)))) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be + // surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead + // of copied. + // Thread-safe. + template + inline bool enqueue_bulk(It itemFirst, size_t count) + { + if (details::likely( + inner.enqueue_bulk(std::forward(itemFirst), count))) + { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. 
Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, + size_t count) + { + if (details::likely( + inner.enqueue_bulk(token, std::forward(itemFirst), count))) + { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or + // implicit + // production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + if (inner.try_enqueue(item)) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + if (inner.try_enqueue(std::move(item))) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + if (inner.try_enqueue(token, item)) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + if (inner.try_enqueue(token, std::move(item))) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(std::forward(itemFirst), count)) + { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, + size_t count) + { + if (inner.try_enqueue_bulk(token, std::forward(itemFirst), count)) + { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
+ template + inline bool try_dequeue(U& item) + { + if (sema->tryWait()) + { + while (!inner.try_dequeue(item)) + { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue(consumer_token_t& token, U& item) + { + if (sema->tryWait()) + { + while (!inner.try_dequeue(token, item)) + { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany( + (LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) + { + count += + inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, + size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany( + (LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) + { + count += inner.template try_dequeue_bulk(token, itemFirst, + max - count); + } + return count; + } + + // Blocks the current thread until there's something to dequeue, then + // dequeues it. + // Never allocates. Thread-safe. + template + inline void wait_dequeue(U& item) + { + sema->wait(); + while (!inner.try_dequeue(item)) + { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) + { + return false; + } + while (!inner.try_dequeue(item)) + { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed( + U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed( + item, std::chrono::duration_cast(timeout) + .count()); + } + + // Blocks the current thread until there's something to dequeue, then + // dequeues it using an explicit consumer token. + // Never allocates. Thread-safe. 
+ template + inline void wait_dequeue(consumer_token_t& token, U& item) + { + sema->wait(); + while (!inner.try_dequeue(token, item)) + { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, + std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) + { + return false; + } + while (!inner.try_dequeue(token, item)) + { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed( + consumer_token_t& token, U& item, + std::chrono::duration const& timeout) + { + return wait_dequeue_timed( + token, item, + std::chrono::duration_cast(timeout) + .count()); + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = + (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) + { + count += + inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, + std::int64_t timeout_usecs) + { + size_t count = 0; + max = + (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, + timeout_usecs); + while (count != max) + { + count += + inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed( + It itemFirst, size_t max, + std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed( + itemFirst, max, + std::chrono::duration_cast(timeout) + .count()); + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. 
+ template + inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, + size_t max) + { + size_t count = 0; + max = + (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) + { + count += inner.template try_dequeue_bulk(token, itemFirst, + max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, + size_t max, + std::int64_t timeout_usecs) + { + size_t count = 0; + max = + (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, + timeout_usecs); + while (count != max) + { + count += inner.template try_dequeue_bulk(token, itemFirst, + max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed( + consumer_token_t& token, It itemFirst, size_t max, + std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed( + token, itemFirst, max, + std::chrono::duration_cast(timeout) + .count()); + } + + // Returns an estimate of the total number of elements currently in the + // queue. This + // estimate is only accurate if the queue has completely stabilized before + // it is called + // (i.e. all enqueue and dequeue operations have completed and their memory + // effects are + // visible on the calling thread, and no further operations start while this + // method is + // being called). + // Thread-safe. + inline size_t size_approx() const + { + return (size_t)sema->availableApprox(); + } + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static bool is_lock_free() { return ConcurrentQueue::is_lock_free(); } +private: + template + static inline U* create() + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U* create(A1&& a1) + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) + { + p->~U(); + } + (Traits::free)(p); + } + +private: + ConcurrentQueue inner; + std::unique_ptr sema; +}; + +template +inline void swap(BlockingConcurrentQueue& a, + BlockingConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} // end namespace moodycamel diff --git a/deflect/moodycamel/concurrentqueue.h b/deflect/moodycamel/concurrentqueue.h new file mode 100644 index 0000000..3dabb2a --- /dev/null +++ b/deflect/moodycamel/concurrentqueue.h @@ -0,0 +1,5089 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free +// queue. 
+// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2016, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this +// list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of +// conditions and the following disclaimer in the documentation and/or other +// materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#if defined(__GNUC__) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing +// warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` +// method declarations. +// We'll override the default trait malloc ourselves without a macro. +#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. 
+#include +#endif +#include +#include +#include // for CHAR_BIT +#include // for max_align_t +#include +#include +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading +#include +#include + +// Platform-specific definitions of a numeric thread ID type and an invalid +// value +namespace moodycamel +{ +namespace details +{ +template +struct thread_id_converter +{ + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } +}; +} +} +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel +{ +namespace details +{ +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; +static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; +static inline thread_id_t thread_id() +{ + return rl::thread_index(); +} +} +} +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the +// function +// we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId( + void); +namespace moodycamel +{ +namespace details +{ +static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), + "Expected size of unsigned long to be 32 bits on Windows"); +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = + 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx +static const thread_id_t invalid_thread_id2 = + 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used + // in practice. Note that all Win32 thread IDs are presently + // multiples of 4. +static inline thread_id_t thread_id() +{ + return static_cast(::GetCurrentThreadId()); +} +} +} +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ + (defined(__APPLE__) && TARGET_OS_IPHONE) +namespace moodycamel +{ +namespace details +{ +static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, + "std::thread::id is expected to be either 4 or 8 bytes"); + +typedef std::thread::id thread_id_t; +static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + +// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have +// one; it's +// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which +// it won't +// be. 
+static inline thread_id_t thread_id()
+{
+    return std::this_thread::get_id();
+}
+
+template <std::size_t>
+struct thread_id_size
+{
+};
+template <>
+struct thread_id_size<4>
+{
+    typedef std::uint32_t numeric_t;
+};
+template <>
+struct thread_id_size<8>
+{
+    typedef std::uint64_t numeric_t;
+};
+
+template <>
+struct thread_id_converter<thread_id_t>
+{
+    typedef thread_id_size<sizeof(thread_id_t)>::numeric_t
+        thread_id_numeric_size_t;
+#ifndef __APPLE__
+    typedef std::size_t thread_id_hash_t;
+#else
+    typedef thread_id_numeric_size_t thread_id_hash_t;
+#endif
+
+    static thread_id_hash_t prehash(thread_id_t const& x)
+    {
+#ifndef __APPLE__
+        return std::hash<std::thread::id>()(x);
+#else
+        return *reinterpret_cast<thread_id_hash_t const*>(&x);
+#endif
+    }
+};
+}
+}
+#else
+// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475
+// In order to get a numeric thread ID in a platform-independent way, we use a
+// thread-local static variable's address as a thread identifier :-)
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+#define MOODYCAMEL_THREADLOCAL __thread
+#elif defined(_MSC_VER)
+#define MOODYCAMEL_THREADLOCAL __declspec(thread)
+#else
+// Assume C++11 compliant compiler
+#define MOODYCAMEL_THREADLOCAL thread_local
+#endif
+namespace moodycamel
+{
+namespace details
+{
+typedef std::uintptr_t thread_id_t;
+static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr
+static const thread_id_t invalid_thread_id2 =
+    1; // Member accesses off a null pointer are also generally invalid. Plus
+       // it's not aligned.
+static inline thread_id_t thread_id()
+{
+    static MOODYCAMEL_THREADLOCAL int x;
+    return reinterpret_cast<thread_id_t>(&x);
+}
+}
+}
+#endif
+
+// Exceptions
+#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) ||   \
+    (defined(__GNUC__) && defined(__EXCEPTIONS)) || \
+    (!defined(_MSC_VER) && !defined(__GNUC__))
+#define MOODYCAMEL_EXCEPTIONS_ENABLED
+#endif
+#endif
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+#define MOODYCAMEL_TRY try
+#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__)
+#define MOODYCAMEL_RETHROW throw
+#define MOODYCAMEL_THROW(expr) throw(expr)
+#else
+#define MOODYCAMEL_TRY if (true)
+#define MOODYCAMEL_CATCH(...) else if (false)
+#define MOODYCAMEL_RETHROW
+#define MOODYCAMEL_THROW(expr)
+#endif
+
+#ifndef MOODYCAMEL_NOEXCEPT
+#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED)
+#define MOODYCAMEL_NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800
+// VS2012's std::is_nothrow_[move_]constructible is broken and returns true
+// when it shouldn't :-(
+// We have to assume *all* non-trivial constructors may throw on VS2012!
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)      \
+    (std::is_rvalue_reference<valueType>::value &&           \
+             std::is_move_constructible<type>::value         \
+         ? std::is_trivially_move_constructible<type>::value \
+         : std::is_trivially_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr)        \
+    ((std::is_rvalue_reference<valueType>::value &&              \
+              std::is_move_assignable<type>::value               \
+          ? std::is_trivially_move_assignable<type>::value ||    \
+                std::is_nothrow_move_assignable<type>::value     \
+          : std::is_trivially_copy_assignable<type>::value ||    \
+                std::is_nothrow_copy_assignable<type>::value) && \
+     MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)         \
+    (std::is_rvalue_reference<valueType>::value &&              \
+             std::is_move_constructible<type>::value            \
+         ? std::is_trivially_move_constructible<type>::value || \
+               std::is_nothrow_move_constructible<type>::value  \
+         : std::is_trivially_copy_constructible<type>::value || \
+               std::is_nothrow_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr)        \
+    ((std::is_rvalue_reference<valueType>::value &&              \
+              std::is_move_assignable<type>::value               \
+          ? std::is_trivially_move_assignable<type>::value ||    \
+                std::is_nothrow_move_assignable<type>::value     \
+          : std::is_trivially_copy_assignable<type>::value ||    \
+                std::is_nothrow_copy_assignable<type>::value) && \
+     MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#else
+#define MOODYCAMEL_NOEXCEPT noexcept
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr)
+#endif
+#endif
+
+#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#else
+// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has
+// a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445
+// g++ <=4.7 doesn't support thread_local either.
+// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it
+// to compile but it's unconfirmed to actually work
+#if (!defined(_MSC_VER) || _MSC_VER >= 1900) &&           \
+    (!defined(__MINGW32__) && !defined(__MINGW64__) ||    \
+     !defined(__WINPTHREADS_VERSION)) &&                  \
+    (!defined(__GNUC__) || __GNUC__ > 4 ||                \
+     (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) &&           \
+    (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \
+    !defined(_M_ARM) && !defined(__aarch64__)
+// Assume `thread_local` is fully supported in all other C++11
+// compilers/platforms
+//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now
+// since several users report having problems with it on
+#endif
+#endif
+#endif
+
+// VS2012 doesn't support deleted functions.
+// In this case, we declare the function normally but don't define it. A link
+// error will be generated if the function is called.
+#ifndef MOODYCAMEL_DELETE_FUNCTION
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define MOODYCAMEL_DELETE_FUNCTION
+#else
+#define MOODYCAMEL_DELETE_FUNCTION = delete
+#endif
+#endif
+
+// Compiler-specific likely/unlikely hints
+namespace moodycamel
+{
+namespace details
+{
+#if defined(__GNUC__)
+inline bool likely(bool x)
+{
+    return __builtin_expect((x), true);
+}
+inline bool unlikely(bool x)
+{
+    return __builtin_expect((x), false);
+}
+#else
+inline bool likely(bool x)
+{
+    return x;
+}
+inline bool unlikely(bool x)
+{
+    return x;
+}
+#endif
+}
+}
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+#include "internal/concurrentqueue_internal_debug.h"
+#endif
+
+namespace moodycamel
+{
+namespace details
+{
+template <typename T>
+struct const_numeric_max
+{
+    static_assert(std::is_integral<T>::value,
+                  "const_numeric_max can only be used with integers");
+    static const T value =
+        std::numeric_limits<T>::is_signed
+            ? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) -
+                  static_cast<T>(1)
+            : static_cast<T>(-1);
+};
+
+#if defined(__GNUC__) && !defined(__clang__)
+typedef ::max_align_t max_align_t; // GCC forgot to add it to std:: for a while
+#else
+typedef std::max_align_t max_align_t; // Others (e.g. MSVC) insist it can
+                                      // *only* be accessed via std::
+#endif
+}
+
+// Default traits for the ConcurrentQueue. To change some of the
+// traits without re-implementing all of them, inherit from this
+// struct and shadow the declarations you wish to be different;
+// since the traits are used as a template type parameter, the
+// shadowed declarations will be used where defined, and the defaults
+// otherwise.
+struct ConcurrentQueueDefaultTraits
+{
+    // General-purpose size type. std::size_t is strongly recommended.
+    typedef std::size_t size_t;
+
+    // The type used for the enqueue and dequeue indices. Must be at least as
+    // large as size_t. Should be significantly larger than the number of
+    // elements you expect to hold at once, especially if you have a high
+    // turnover rate; for example, on 32-bit x86, if you expect to have over a
+    // hundred million elements or pump several million elements through your
+    // queue in a very short space of time, using a 32-bit type *may* trigger
+    // a race condition. A 64-bit int type is recommended in that case, and in
+    // practice will prevent a race condition no matter the usage of the
+    // queue. Note that whether the queue is lock-free with a 64-bit int type
+    // depends on whether std::atomic<std::uint64_t> is lock-free, which is
+    // platform-specific.
+    typedef std::size_t index_t;
+
+    // Internally, all elements are enqueued and dequeued from multi-element
+    // blocks; this is the smallest controllable unit. If you expect few
+    // elements but many producers, a smaller block size should be favoured.
+    // For few producers and/or many elements, a larger block size is
+    // preferred. A sane default is provided. Must be a power of 2.
+    static const size_t BLOCK_SIZE = 32;
+
+    // For explicit producers (i.e. when using a producer token), the block is
+    // checked for being empty by iterating through a list of flags, one per
+    // element. For large block sizes, this is too inefficient, and switching
+    // to an atomic counter-based approach is faster. The switch is made for
+    // block sizes strictly larger than this threshold.
+    static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;
+
+    // How many full blocks can be expected for a single explicit producer?
+    // This should reflect that number's maximum for optimal performance.
+    // Must be a power of 2.
+    static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;
+
+    // How many full blocks can be expected for a single implicit producer?
+    // This should reflect that number's maximum for optimal performance.
+    // Must be a power of 2.
+    static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;
+
+    // The initial size of the hash table mapping thread IDs to implicit
+    // producers. Note that the hash is resized every time it becomes half
+    // full. Must be a power of two, and either 0 or at least 1. If 0,
+    // implicit production (using the enqueue methods without an explicit
+    // producer token) is disabled.
+    static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;
+
+    // Controls the number of items that an explicit consumer (i.e. one with
+    // a token) must consume before it causes all consumers to rotate and
+    // move on to the next internal queue.
+    static const std::uint32_t
+        EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256;
+
+    // The maximum number of elements (inclusive) that can be enqueued to a
+    // sub-queue. Enqueue operations that would cause this limit to be
+    // surpassed will fail. Note that this limit is enforced at the block
+    // level (for performance reasons), i.e. it's rounded up to the nearest
+    // block size.
+    static const size_t MAX_SUBQUEUE_SIZE =
+        details::const_numeric_max<size_t>::value;
+
+#ifndef MCDBGQ_USE_RELACY
+// Memory allocation can be customized if needed.
+// malloc should return nullptr on failure, and handle alignment like
+// std::malloc.
+#if defined(malloc) || defined(free)
+    // Gah, this is 2015, stop defining macros that break standard code
+    // already! Work around malloc/free being special macros:
+    static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); }
+    static inline void WORKAROUND_free(void* ptr) { return free(ptr); }
+    static inline void*(malloc)(size_t size) { return WORKAROUND_malloc(size); }
+    static inline void(free)(void* ptr) { return WORKAROUND_free(ptr); }
+#else
+    static inline void* malloc(size_t size) { return std::malloc(size); }
+    static inline void free(void* ptr) { return std::free(ptr); }
+#endif
+#else
+    // Debug versions when running under the Relacy race detector (ignore
+    // these in user code)
+    static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); }
+    static inline void free(void* ptr) { return rl::rl_free(ptr, $); }
+#endif
+};
+
+// When producing or consuming many elements, the most efficient way is to:
+// 1) Use one of the bulk-operation methods of the queue with a token
+// 2) Failing that, use the bulk-operation methods without a token
+// 3) Failing that, create a token and use that with the single-item methods
+// 4) Failing that, use the single-parameter methods of the queue
+// Having said that, don't create tokens willy-nilly -- ideally there should
+// be a maximum of one token per thread (of each kind).
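+//
+// For illustration, the recommended pattern above might look like this
+// (usage sketch only; it relies solely on the public API declared below):
+//
+//     moodycamel::ConcurrentQueue<int> q;
+//     moodycamel::ProducerToken ptok(q);
+//     int in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+//     q.enqueue_bulk(ptok, in, 8);            // 1) bulk methods + token
+//
+//     moodycamel::ConsumerToken ctok(q);
+//     int out[8];
+//     std::size_t n = q.try_dequeue_bulk(ctok, out, 8);
+//     // out[0..n) now holds the dequeued items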
+struct ProducerToken;
+struct ConsumerToken;
+
+template <typename T, typename Traits>
+class ConcurrentQueue;
+template <typename T, typename Traits>
+class BlockingConcurrentQueue;
+class ConcurrentQueueTests;
+
+namespace details
+{
+struct ConcurrentQueueProducerTypelessBase
+{
+    ConcurrentQueueProducerTypelessBase* next;
+    std::atomic<bool> inactive;
+    ProducerToken* token;
+
+    ConcurrentQueueProducerTypelessBase()
+        : next(nullptr)
+        , inactive(false)
+        , token(nullptr)
+    {
+    }
+};
+
+template <bool use32>
+struct _hash_32_or_64
+{
+    static inline std::uint32_t hash(std::uint32_t h)
+    {
+        // MurmurHash3 finalizer -- see
+        // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
+        // Since the thread ID is already unique, all we really want to do is
+        // propagate that uniqueness evenly across all the bits, so that we
+        // can use a subset of the bits while reducing collisions
+        // significantly
+        h ^= h >> 16;
+        h *= 0x85ebca6b;
+        h ^= h >> 13;
+        h *= 0xc2b2ae35;
+        return h ^ (h >> 16);
+    }
+};
+template <>
+struct _hash_32_or_64<1>
+{
+    static inline std::uint64_t hash(std::uint64_t h)
+    {
+        h ^= h >> 33;
+        h *= 0xff51afd7ed558ccd;
+        h ^= h >> 33;
+        h *= 0xc4ceb9fe1a85ec53;
+        return h ^ (h >> 33);
+    }
+};
+template <std::size_t size>
+struct hash_32_or_64 : public _hash_32_or_64<(size > 4)>
+{
+};
+
+static inline size_t hash_thread_id(thread_id_t id)
+{
+    static_assert(
+        sizeof(thread_id_t) <= 8,
+        "Expected a platform where thread IDs are at most 64-bit values");
+    return static_cast<size_t>(
+        hash_32_or_64<sizeof(
+            thread_id_converter<thread_id_t>::thread_id_hash_t)>::
+            hash(thread_id_converter<thread_id_t>::prehash(id)));
+}
+
+template <typename T>
+static inline bool circular_less_than(T a, T b)
+{
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4554)
+#endif
+    static_assert(std::is_integral<T>::value &&
+                      !std::numeric_limits<T>::is_signed,
+                  "circular_less_than is intended to be used only with "
+                  "unsigned integer types");
+    return static_cast<T>(a - b) >
+           static_cast<T>(static_cast<T>(1)
+                          << static_cast<T>(sizeof(T) * CHAR_BIT - 1));
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+}
+
+template <typename U>
+static inline char* align_for(char* ptr)
+{
+    const std::size_t alignment = std::alignment_of<U>::value;
+    return ptr +
+           (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) %
+               alignment;
+}
+
+template <typename T>
+static inline T ceil_to_pow_2(T x)
+{
+    static_assert(std::is_integral<T>::value &&
+                      !std::numeric_limits<T>::is_signed,
+                  "ceil_to_pow_2 is intended to be used only with unsigned "
+                  "integer types");
+
+    // Adapted from
+    // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+    --x;
+    x |= x >> 1;
+    x |= x >> 2;
+    x |= x >> 4;
+    for (std::size_t i = 1; i < sizeof(T); i <<= 1)
+    {
+        x |= x >> (i << 3);
+    }
+    ++x;
+    return x;
+}
+
+template <typename T>
+static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right)
+{
+    T temp = std::move(left.load(std::memory_order_relaxed));
+    left.store(std::move(right.load(std::memory_order_relaxed)),
+               std::memory_order_relaxed);
+    right.store(std::move(temp), std::memory_order_relaxed);
+}
+
+template <typename T>
+static inline T const& nomove(T const& x)
+{
+    return x;
+}
+
+template <bool Enable>
+struct nomove_if
+{
+    template <typename T>
+    static inline T const& eval(T const& x)
+    {
+        return x;
+    }
+};
+
+template <>
+struct nomove_if<false>
+{
+    template <typename U>
+    static inline auto eval(U&& x) -> decltype(std::forward<U>(x))
+    {
+        return std::forward<U>(x);
+    }
+};
+
+template <typename It>
+static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it)
+{
+    return *it;
+}
+
+#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \
+    (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+template <typename T>
+struct is_trivially_destructible : std::is_trivially_destructible<T>
+{
+};
+#else
+template <typename T>
+struct is_trivially_destructible : std::has_trivial_destructor<T>
+{
+};
+#endif
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+typedef RelacyThreadExitListener ThreadExitListener;
+typedef RelacyThreadExitNotifier ThreadExitNotifier;
+#else
+struct ThreadExitListener
+{
+    typedef void (*callback_t)(void*);
+    callback_t callback;
+    void* userData;
+
+    ThreadExitListener* next; // reserved for use by the ThreadExitNotifier
+};
+
+class ThreadExitNotifier
+{
+public:
+    static void subscribe(ThreadExitListener* listener)
+    {
+        auto& tlsInst = instance();
+        listener->next = tlsInst.tail;
+        tlsInst.tail = listener;
+    }
+
+    static void unsubscribe(ThreadExitListener* listener)
+    {
+        auto& tlsInst = instance();
+        ThreadExitListener** prev = &tlsInst.tail;
+        for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next)
+        {
+            if (ptr == listener)
+            {
+                *prev = ptr->next;
+                break;
+            }
+            prev = &ptr->next;
+        }
+    }
+
+private:
+    ThreadExitNotifier()
+        : tail(nullptr)
+    {
+    }
+    ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
+    ThreadExitNotifier& operator=(ThreadExitNotifier const&)
+        MOODYCAMEL_DELETE_FUNCTION;
+
+    ~ThreadExitNotifier()
+    {
+        // This thread is about to exit, let everyone know!
+        assert(this == &instance() &&
+               "If this assert fails, you likely have a buggy compiler! Change "
+               "the preprocessor conditions such that "
+               "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined.");
+        for (auto ptr = tail; ptr != nullptr; ptr = ptr->next)
+        {
+            ptr->callback(ptr->userData);
+        }
+    }
+
+    // Thread-local
+    static inline ThreadExitNotifier& instance()
+    {
+        static thread_local ThreadExitNotifier notifier;
+        return notifier;
+    }
+
+private:
+    ThreadExitListener* tail;
+};
+#endif
+#endif
+
+template <typename T>
+struct static_is_lock_free_num
+{
+    enum
+    {
+        value = 0
+    };
+};
+template <>
+struct static_is_lock_free_num<signed char>
+{
+    enum
+    {
+        value = ATOMIC_CHAR_LOCK_FREE
+    };
+};
+template <>
+struct static_is_lock_free_num<short>
+{
+    enum
+    {
+        value = ATOMIC_SHORT_LOCK_FREE
+    };
+};
+template <>
+struct static_is_lock_free_num<int>
+{
+    enum
+    {
+        value = ATOMIC_INT_LOCK_FREE
+    };
+};
+template <>
+struct static_is_lock_free_num<long>
+{
+    enum
+    {
+        value = ATOMIC_LONG_LOCK_FREE
+    };
+};
+template <>
+struct static_is_lock_free_num<long long>
+{
+    enum
+    {
+        value = ATOMIC_LLONG_LOCK_FREE
+    };
+};
+template <typename T>
+struct static_is_lock_free
+    : static_is_lock_free_num<typename std::make_signed<T>::type>
+{
+};
+template <>
+struct static_is_lock_free<bool>
+{
+    enum
+    {
+        value = ATOMIC_BOOL_LOCK_FREE
+    };
+};
+template <typename U>
+struct static_is_lock_free<U*>
+{
+    enum
+    {
+        value = ATOMIC_POINTER_LOCK_FREE
+    };
+};
+}
+
+struct ProducerToken
+{
+    template <typename T, typename Traits>
+    explicit ProducerToken(ConcurrentQueue<T, Traits>& queue);
+
+    template <typename T, typename Traits>
+    explicit ProducerToken(BlockingConcurrentQueue<T, Traits>& queue);
+
+    ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+        : producer(other.producer)
+    {
+        other.producer = nullptr;
+        if (producer != nullptr)
+        {
+            producer->token = this;
+        }
+    }
+
+    inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+    {
+        swap(other);
+        return *this;
+    }
+
+    void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT
+    {
+        std::swap(producer, other.producer);
+        if (producer != nullptr)
+        {
+            producer->token = this;
+        }
+        if (other.producer != nullptr)
+        {
+            other.producer->token = &other;
+        }
+    }
+
+    // A token is always valid unless:
+    // 1) Memory allocation failed during construction
+    // 2) It was moved via the move constructor
+    // (Note: assignment does a
swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const { return producer != nullptr; } + ~ProducerToken() + { + if (producer != nullptr) + { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template + friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + template + explicit ConsumerToken(BlockingConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), + lastKnownGlobalOffset(other.lastKnownGlobalOffset), + itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), + currentProducer(other.currentProducer), + desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template + friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. 
+// See
+// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
+template <typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
+                 typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b)
+    MOODYCAMEL_NOEXCEPT;
+
+template <typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class ConcurrentQueue
+{
+public:
+    typedef ::moodycamel::ProducerToken producer_token_t;
+    typedef ::moodycamel::ConsumerToken consumer_token_t;
+
+    typedef typename Traits::index_t index_t;
+    typedef typename Traits::size_t size_t;
+
+    static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);
+    static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD =
+        static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
+    static const size_t EXPLICIT_INITIAL_INDEX_SIZE =
+        static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
+    static const size_t IMPLICIT_INITIAL_INDEX_SIZE =
+        static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
+    static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE =
+        static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
+    static const std::uint32_t
+        EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE =
+            static_cast<std::uint32_t>(
+                Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4307) // + integral constant overflow (that's what
+                                // the ternary expression is for!)
+#pragma warning(disable : 4309) // static_cast: Truncation of constant value
+#endif
+    static const size_t MAX_SUBQUEUE_SIZE =
+        (details::const_numeric_max<size_t>::value -
+             static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) <
+         BLOCK_SIZE)
+            ? details::const_numeric_max<size_t>::value
+            : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) +
+                (BLOCK_SIZE - 1)) /
+               BLOCK_SIZE * BLOCK_SIZE);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+    static_assert(!std::numeric_limits<size_t>::is_signed &&
+                      std::is_integral<size_t>::value,
+                  "Traits::size_t must be an unsigned integral type");
+    static_assert(!std::numeric_limits<index_t>::is_signed &&
+                      std::is_integral<index_t>::value,
+                  "Traits::index_t must be an unsigned integral type");
+    static_assert(sizeof(index_t) >= sizeof(size_t),
+                  "Traits::index_t must be at least as wide as Traits::size_t");
+    static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)),
+                  "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
+    static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) &&
+                      !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD &
+                        (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)),
+                  "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a "
+                  "power of 2 (and greater than 1)");
+    static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) &&
+                      !(EXPLICIT_INITIAL_INDEX_SIZE &
+                        (EXPLICIT_INITIAL_INDEX_SIZE - 1)),
+                  "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 "
+                  "(and greater than 1)");
+    static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) &&
+                      !(IMPLICIT_INITIAL_INDEX_SIZE &
+                        (IMPLICIT_INITIAL_INDEX_SIZE - 1)),
+                  "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 "
+                  "(and greater than 1)");
+    static_assert(
+        (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) ||
+            !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE &
+              (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)),
+        "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
+    static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 ||
+                      INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1,
+                  "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at "
+                  "least 1 (or 0 to disable implicit enqueueing)");
+
+public:
+    // Creates a queue with at least `capacity` element slots; note that the
+    // actual number of elements that can be
inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be + // allocated + // up-front, which means only a single producer will be able to enqueue + // elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads + // (this + // includes making the memory effects of construction visible, possibly with + // a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr) + , producerCount(0) + , initialBlockPoolIndex(0) + , nextExplicitConsumerId(0) + , globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list( + capacity / BLOCK_SIZE + + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, + size_t maxImplicitProducers) + : producerListTail(nullptr) + , producerCount(0) + , initialBlockPoolIndex(0) + , nextExplicitConsumerId(0) + , globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * + (maxExplicitProducers + 1) + + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. 
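+    // For example, a typical shutdown joins all worker threads before the
+    // queue goes out of scope (usage sketch only; startWorkers and Task are
+    // hypothetical):
+    //
+    //     {
+    //         moodycamel::ConcurrentQueue<Task> q;
+    //         std::vector<std::thread> workers = startWorkers(q);
+    //         ...
+    //         for (auto& t : workers)
+    //             t.join(); // makes the workers' memory effects visible
+    //     } // only now may the destructor below run safely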
+ ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) + { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) + { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) + { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) + { + auto prev = hash->prev; + if (prev != nullptr) + { // The last hash is part of this object and was not allocated + // dynamically + for (size_t i = 0; i != hash->capacity; ++i) + { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) + { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) + { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) + MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). + ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail( + other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex( + other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId( + other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load( + std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty + // queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(other.explicitProducers.load( + std::memory_order_relaxed), + std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load( + std::memory_order_relaxed), + std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) + 
        MOODYCAMEL_NOEXCEPT
+    {
+        return swap_internal(other);
+    }
+
+    // Swaps this queue's state with the other's. Not thread-safe.
+    // Swapping two queues does not invalidate their tokens, however
+    // the tokens that were created for one queue must be used with
+    // only the swapped queue (i.e. the tokens are tied to the
+    // queue's movable state, not the object itself).
+    inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
+    {
+        swap_internal(other);
+    }
+
+private:
+    ConcurrentQueue& swap_internal(ConcurrentQueue& other)
+    {
+        if (this == &other)
+        {
+            return *this;
+        }
+
+        details::swap_relaxed(producerListTail, other.producerListTail);
+        details::swap_relaxed(producerCount, other.producerCount);
+        details::swap_relaxed(initialBlockPoolIndex,
+                              other.initialBlockPoolIndex);
+        std::swap(initialBlockPool, other.initialBlockPool);
+        std::swap(initialBlockPoolSize, other.initialBlockPoolSize);
+        freeList.swap(other.freeList);
+        details::swap_relaxed(nextExplicitConsumerId,
+                              other.nextExplicitConsumerId);
+        details::swap_relaxed(globalExplicitConsumerOffset,
+                              other.globalExplicitConsumerOffset);
+
+        swap_implicit_producer_hashes(other);
+
+        reown_producers();
+        other.reown_producers();
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+        details::swap_relaxed(explicitProducers, other.explicitProducers);
+        details::swap_relaxed(implicitProducers, other.implicitProducers);
+#endif
+
+        return *this;
+    }
+
+public:
+    // Enqueues a single item (by copying it).
+    // Allocates memory if required. Only fails if memory allocation fails
+    // (or implicit production is disabled because
+    // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(T const& item)
+    {
+        if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+            return false;
+        return inner_enqueue<CanAlloc>(item);
+    }
+
+    // Enqueues a single item (by moving it, if possible).
+    // Allocates memory if required. Only fails if memory allocation fails
+    // (or implicit production is disabled because
+    // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(T&& item)
+    {
+        if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+            return false;
+        return inner_enqueue<CanAlloc>(std::move(item));
+    }
+
+    // Enqueues a single item (by copying it) using an explicit producer
+    // token.
+    // Allocates memory if required. Only fails if memory allocation fails (or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(producer_token_t const& token, T const& item)
+    {
+        return inner_enqueue<CanAlloc>(token, item);
+    }
+
+    // Enqueues a single item (by moving it, if possible) using an explicit
+    // producer token.
+    // Allocates memory if required. Only fails if memory allocation fails (or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(producer_token_t const& token, T&& item)
+    {
+        return inner_enqueue<CanAlloc>(token, std::move(item));
+    }
+
+    // Enqueues several items.
+    // Allocates memory if required. Only fails if memory allocation fails (or
+    // implicit production is disabled because
+    // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
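+    // For instance (usage sketch only; makeBatch is hypothetical):
+    //
+    //     std::vector<std::string> batch = makeBatch();
+    //     q.enqueue_bulk(std::make_move_iterator(batch.begin()),
+    //                    batch.size());
+    //     // the strings are moved out of `batch` instead of being copied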
+    template <typename It>
+    bool enqueue_bulk(It itemFirst, size_t count)
+    {
+        if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+            return false;
+        return inner_enqueue_bulk<CanAlloc>(itemFirst, count);
+    }
+
+    // Enqueues several items using an explicit producer token.
+    // Allocates memory if required. Only fails if memory allocation fails
+    // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
+    template <typename It>
+    bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+    {
+        return inner_enqueue_bulk<CanAlloc>(token, itemFirst, count);
+    }
+
+    // Enqueues a single item (by copying it).
+    // Does not allocate memory. Fails if not enough room to enqueue (or
+    // implicit production is disabled because
+    // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+    // Thread-safe.
+    inline bool try_enqueue(T const& item)
+    {
+        if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+            return false;
+        return inner_enqueue<CannotAlloc>(item);
+    }
+
+    // Enqueues a single item (by moving it, if possible).
+    // Does not allocate memory (except for one-time implicit producer).
+    // Fails if not enough room to enqueue (or implicit production is
+    // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+    // Thread-safe.
+    inline bool try_enqueue(T&& item)
+    {
+        if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+            return false;
+        return inner_enqueue<CannotAlloc>(std::move(item));
+    }
+
+    // Enqueues a single item (by copying it) using an explicit producer
+    // token.
+    // Does not allocate memory. Fails if not enough room to enqueue.
+    // Thread-safe.
+    inline bool try_enqueue(producer_token_t const& token, T const& item)
+    {
+        return inner_enqueue<CannotAlloc>(token, item);
+    }
+
+    // Enqueues a single item (by moving it, if possible) using an explicit
+    // producer token.
+    // Does not allocate memory. Fails if not enough room to enqueue.
+    // Thread-safe.
+    inline bool try_enqueue(producer_token_t const& token, T&& item)
+    {
+        return inner_enqueue<CannotAlloc>(token, std::move(item));
+    }
+
+    // Enqueues several items.
+    // Does not allocate memory (except for one-time implicit producer).
+    // Fails if not enough room to enqueue (or implicit production is
+    // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
+    template <typename It>
+    bool try_enqueue_bulk(It itemFirst, size_t count)
+    {
+        if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+            return false;
+        return inner_enqueue_bulk<CannotAlloc>(itemFirst, count);
+    }
+
+    // Enqueues several items using an explicit producer token.
+    // Does not allocate memory. Fails if not enough room to enqueue.
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
+    template <typename It>
+    bool try_enqueue_bulk(producer_token_t const& token, It itemFirst,
+                          size_t count)
+    {
+        return inner_enqueue_bulk<CannotAlloc>(token, itemFirst, count);
+    }
+
+    // Attempts to dequeue from the queue.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template <typename U>
+    bool try_dequeue(U& item)
+    {
+        // Instead of simply trying each producer in turn (which could cause
+        // needless contention on the first producer), we score them
+        // heuristically.
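+        // (Concretely: the loop below inspects up to three non-empty
+        // producers, starts with the one whose approximate size is largest,
+        // and only if that fails falls back to trying every producer.)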
+        size_t nonEmptyCount = 0;
+        ProducerBase* best = nullptr;
+        size_t bestSize = 0;
+        for (auto ptr = producerListTail.load(std::memory_order_acquire);
+             nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod())
+        {
+            auto size = ptr->size_approx();
+            if (size > 0)
+            {
+                if (size > bestSize)
+                {
+                    bestSize = size;
+                    best = ptr;
+                }
+                ++nonEmptyCount;
+            }
+        }
+
+        // If there was at least one non-empty queue but it appears empty at
+        // the time we try to dequeue from it, we need to make sure every
+        // queue's been tried
+        if (nonEmptyCount > 0)
+        {
+            if (details::likely(best->dequeue(item)))
+            {
+                return true;
+            }
+            for (auto ptr = producerListTail.load(std::memory_order_acquire);
+                 ptr != nullptr; ptr = ptr->next_prod())
+            {
+                if (ptr != best && ptr->dequeue(item))
+                {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue from the queue.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // This differs from the try_dequeue(item) method in that this one does
+    // not attempt to reduce contention by interleaving the order that
+    // producer streams are dequeued from. So, using this method can reduce
+    // overall throughput under contention, but will give more predictable
+    // results in single-threaded consumer scenarios. This is mostly only
+    // useful for internal unit tests.
+    // Never allocates. Thread-safe.
+    template <typename U>
+    bool try_dequeue_non_interleaved(U& item)
+    {
+        for (auto ptr = producerListTail.load(std::memory_order_acquire);
+             ptr != nullptr; ptr = ptr->next_prod())
+        {
+            if (ptr->dequeue(item))
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue from the queue using an explicit consumer token.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
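+    // A dedicated consumer thread might drive it like this (usage sketch
+    // only; `running` and process() are hypothetical):
+    //
+    //     moodycamel::ConsumerToken ctok(q);
+    //     Item item;
+    //     while (running.load())
+    //         while (q.try_dequeue(ctok, item))
+    //             process(item);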
+    template <typename U>
+    bool try_dequeue(consumer_token_t& token, U& item)
+    {
+        // The idea is roughly as follows:
+        // Every 256 items from one producer, make everyone rotate (increase
+        // the global offset) -> this means the highest efficiency consumer
+        // dictates the rotation speed of everyone else, more or less
+        // If you see that the global offset has changed, you must reset your
+        // consumption counter and move to your designated place
+        // If there's no items where you're supposed to be, keep moving until
+        // you find a producer with some items
+        // If the global offset has not changed but you've run out of items
+        // to consume, move over from your current position until you find a
+        // producer with something in it
+
+        if (token.desiredProducer == nullptr ||
+            token.lastKnownGlobalOffset !=
+                globalExplicitConsumerOffset.load(std::memory_order_relaxed))
+        {
+            if (!update_current_producer_after_rotation(token))
+            {
+                return false;
+            }
+        }
+
+        // If there was at least one non-empty queue but it appears empty at
+        // the time we try to dequeue from it, we need to make sure every
+        // queue's been tried
+        if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item))
+        {
+            if (++token.itemsConsumedFromCurrent ==
+                EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE)
+            {
+                globalExplicitConsumerOffset.fetch_add(
+                    1, std::memory_order_relaxed);
+            }
+            return true;
+        }
+
+        auto tail = producerListTail.load(std::memory_order_acquire);
+        auto ptr =
+            static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+        if (ptr == nullptr)
+        {
+            ptr = tail;
+        }
+        while (ptr != static_cast<ProducerBase*>(token.currentProducer))
+        {
+            if (ptr->dequeue(item))
+            {
+                token.currentProducer = ptr;
+                token.itemsConsumedFromCurrent = 1;
+                return true;
+            }
+            ptr = ptr->next_prod();
+            if (ptr == nullptr)
+            {
+                ptr = tail;
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue several elements from the queue.
+    // Returns the number of items actually dequeued.
+    // Returns 0 if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template <typename It>
+    size_t try_dequeue_bulk(It itemFirst, size_t max)
+    {
+        size_t count = 0;
+        for (auto ptr = producerListTail.load(std::memory_order_acquire);
+             ptr != nullptr; ptr = ptr->next_prod())
+        {
+            count += ptr->dequeue_bulk(itemFirst, max - count);
+            if (count == max)
+            {
+                break;
+            }
+        }
+        return count;
+    }
+
+    // Attempts to dequeue several elements from the queue using an explicit
+    // consumer token.
+    // Returns the number of items actually dequeued.
+    // Returns 0 if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
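+    // E.g. draining in batches (usage sketch only; process() is
+    // hypothetical):
+    //
+    //     Item buf[64];
+    //     std::size_t count;
+    //     while ((count = q.try_dequeue_bulk(ctok, buf, 64)) != 0)
+    //         for (std::size_t i = 0; i != count; ++i)
+    //             process(buf[i]);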
+ template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + if (!update_current_producer_after_rotation(token)) + { + return 0; + } + } + + size_t count = static_cast(token.currentProducer) + ->dequeue_bulk(itemFirst, max); + if (count == max) + { + if ((token.itemsConsumedFromCurrent += static_cast( + max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) + { + globalExplicitConsumerOffset.fetch_add( + 1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = + static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) + { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) + { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) + { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = + static_cast(dequeued); + } + if (dequeued == max) + { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) + { + ptr = tail; + } + } + return count; + } + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, + U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner + // queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer( + producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer) + ->dequeue_bulk(itemFirst, max); + } + + // Returns an estimate of the total number of elements currently in the + // queue. This + // estimate is only accurate if the queue has completely stabilized before + // it is called + // (i.e. all enqueue and dequeue operations have completed and their memory + // effects are + // visible on the calling thread, and no further operations start while this + // method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) + { + size += ptr->size_approx(); + } + return size; + } + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
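+    // E.g. a sanity check at startup (usage sketch only):
+    //
+    //     assert(moodycamel::ConcurrentQueue<int>::is_lock_free());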
+    static bool is_lock_free()
+    {
+        return details::static_is_lock_free<bool>::value == 2 &&
+               details::static_is_lock_free<size_t>::value == 2 &&
+               details::static_is_lock_free<std::uint32_t>::value == 2 &&
+               details::static_is_lock_free<index_t>::value == 2 &&
+               details::static_is_lock_free<void*>::value == 2 &&
+               details::static_is_lock_free<
+                   typename details::thread_id_converter<
+                       details::thread_id_t>::thread_id_numeric_size_t>::
+                       value == 2;
+    }
+
+private:
+    friend struct ProducerToken;
+    friend struct ConsumerToken;
+    friend struct ExplicitProducer;
+    friend class ConcurrentQueueTests;
+
+    enum AllocationMode
+    {
+        CanAlloc,
+        CannotAlloc
+    };
+
+    ///////////////////////////////
+    // Queue methods
+    ///////////////////////////////
+
+    template <AllocationMode canAlloc, typename U>
+    inline bool inner_enqueue(producer_token_t const& token, U&& element)
+    {
+        return static_cast<ExplicitProducer*>(token.producer)
+            ->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(
+                std::forward<U>(element));
+    }
+
+    template <AllocationMode canAlloc, typename U>
+    inline bool inner_enqueue(U&& element)
+    {
+        auto producer = get_or_add_implicit_producer();
+        return producer == nullptr
+                   ? false
+                   : producer
+                         ->ConcurrentQueue::ImplicitProducer::template enqueue<
+                             canAlloc>(std::forward<U>(element));
+    }
+
+    template <AllocationMode canAlloc, typename It>
+    inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst,
+                                   size_t count)
+    {
+        return static_cast<ExplicitProducer*>(token.producer)
+            ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<
+                canAlloc>(itemFirst, count);
+    }
+
+    template <AllocationMode canAlloc, typename It>
+    inline bool inner_enqueue_bulk(It itemFirst, size_t count)
+    {
+        auto producer = get_or_add_implicit_producer();
+        return producer == nullptr
+                   ? false
+                   : producer->ConcurrentQueue::ImplicitProducer::
+                         template enqueue_bulk<canAlloc>(itemFirst, count);
+    }
+
+    inline bool update_current_producer_after_rotation(consumer_token_t& token)
+    {
+        // Ah, there's been a rotation, figure out where we should be!
+        auto tail = producerListTail.load(std::memory_order_acquire);
+        if (token.desiredProducer == nullptr && tail == nullptr)
+        {
+            return false;
+        }
+        auto prodCount = producerCount.load(std::memory_order_relaxed);
+        auto globalOffset =
+            globalExplicitConsumerOffset.load(std::memory_order_relaxed);
+        if (details::unlikely(token.desiredProducer == nullptr))
+        {
+            // Aha, first time we're dequeueing anything.
+            // Figure out our local position
+            // Note: offset is from start, not end, but we're traversing from
+            // end -- subtract from count first
+            std::uint32_t offset =
+                prodCount - 1 - (token.initialOffset % prodCount);
+            token.desiredProducer = tail;
+            for (std::uint32_t i = 0; i != offset; ++i)
+            {
+                token.desiredProducer =
+                    static_cast<ProducerBase*>(token.desiredProducer)
+                        ->next_prod();
+                if (token.desiredProducer == nullptr)
+                {
+                    token.desiredProducer = tail;
+                }
+            }
+        }
+
+        std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;
+        if (delta >= prodCount)
+        {
+            delta = delta % prodCount;
+        }
+        for (std::uint32_t i = 0; i != delta; ++i)
+        {
+            token.desiredProducer =
+                static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+            if (token.desiredProducer == nullptr)
+            {
+                token.desiredProducer = tail;
+            }
+        }
+
+        token.lastKnownGlobalOffset = globalOffset;
+        token.currentProducer = token.desiredProducer;
+        token.itemsConsumedFromCurrent = 0;
+        return true;
+    }
+
+    ///////////////////////////
+    // Free list
+    ///////////////////////////
+
+    template <typename N>
+    struct FreeListNode
+    {
+        FreeListNode()
+            : freeListRefs(0)
+            , freeListNext(nullptr)
+        {
+        }
+
+        std::atomic<std::uint32_t> freeListRefs;
+        std::atomic<N*> freeListNext;
+    };
+
+    // A simple CAS-based lock-free free list.
Not the fastest thing in the + // world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free + // list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields + // (and initialization of them) + struct FreeList + { + FreeList() + : freeListHead(nullptr) + { + } + FreeList(FreeList&& other) + : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) + { + other.freeListHead.store(nullptr, std::memory_order_relaxed); + } + void swap(FreeList& other) + { + details::swap_relaxed(freeListHead, other.freeListHead); + } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { +#if MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so + // it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, + std::memory_order_acq_rel) == 0) + { + // Oh look! We were the last ones referencing this node, and we + // know + // we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { +#if MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) + { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || + !head->freeListRefs.compare_exchange_strong( + refs, refs + 1, std::memory_order_acquire, + std::memory_order_relaxed)) + { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at + // zero), which means we can read the + // next and not worry about it changing between now and the time + // we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong( + head, next, std::memory_order_acquire, + std::memory_order_relaxed)) + { + // Yay, got the node. This means it was on the list, which + // means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been + // taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & + SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for + // the list's ref + head->freeListRefs.fetch_add(-2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to + // decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we + // do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = + prevHead->freeListRefs.fetch_add(-1, + std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) + { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to + // destroy remaining nodes) + N* head_unsafe() const + { + return freeListHead.load(std::memory_order_relaxed); + } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's + // zero (except us, and we run + // only one copy of this method per node at a time, i.e. 
the single + // thread case), then we know + // we can safely change the next pointer of the node; however, once + // the refcount is back above + // zero, then other threads could increase it (happens under heavy + // contention, when the refcount + // goes to zero in between a load and a refcount increment of a node + // in try_get, then back up to + // something non-zero, then the refcount increment is done by the + // other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount + // and leave the add operation to + // the next thread who puts the refcount back at zero (which could + // be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) + { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong( + head, node, std::memory_order_release, + std::memory_order_relaxed)) + { + // Hmm, the add failed, but we can only try again when the + // refcount goes back to zero + if (node->freeListRefs.fetch_add( + SHOULD_BE_ON_FREELIST - 1, + std::memory_order_release) == 1) + { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes + // are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#if MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext + { + implicit_context = 0, + explicit_context = 1 + }; + + struct Block + { + Block() + : next(nullptr) + , elementsCompletelyDequeued(0) + , freeListRefs(0) + , freeListNext(nullptr) + , shouldBeOnFreeList(false) + , dynamicallyAllocated(true) + { +#if MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + if (context == explicit_context && + BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) + { + if (!emptyFlags[i].load(std::memory_order_relaxed)) + { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that + // happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else + { + // Check counter + if (elementsCompletelyDequeued.load( + std::memory_order_relaxed) == BLOCK_SIZE) + { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load( + std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit + // context) + template + inline bool set_empty(index_t i) + { + if (context == explicit_context && + BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - + static_cast(i & static_cast( + BLOCK_SIZE - 1))] + .load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - + static_cast( + i & static_cast(BLOCK_SIZE - 1))] + .store(true, std::memory_order_release); + return false; + } + else + { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add( + 1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no + // wrapping 
and count > 0). + // Returns true if the block is now empty (does not apply in explicit + // context). + template + inline bool set_many_empty(index_t i, size_t count) + { + if (context == explicit_context && + BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - + static_cast(i & + static_cast(BLOCK_SIZE - 1)) - + count + 1; + for (size_t j = 0; j != count; ++j) + { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else + { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add( + count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + if (context == explicit_context && + BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) + { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else + { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, + std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + if (context == explicit_context && + BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) + { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else + { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT + { + return static_cast(static_cast(elements)) + + static_cast(idx & + static_cast(BLOCK_SIZE - 1)); + } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT + { + return static_cast(static_cast(elements)) + + static_cast(idx & + static_cast(BLOCK_SIZE - 1)); + } + + private: + // IMPORTANT: This must be the first member in Block, so that if T + // depends on the alignment of + // addresses returned by malloc, that alignment will be preserved. + // Apparently clang actually + // generates code that uses this assumption for AVX instructions in some + // cases. Ideally, we + // should also align Block to the alignment of T in case it's higher + // than malloc's 16-byte + // alignment, but this is hard to do in a cross-platform way. Assert for + // this case: + static_assert( + std::alignment_of::value <= + std::alignment_of::value, + "The queue does not support super-aligned types at this time"); + // Additionally, we need the alignment of Block itself to be a multiple + // of max_align_t since + // otherwise the appropriate padding will not be added at the end of + // Block in order to make + // arrays of Blocks all be properly aligned (not just the first one). We + // use a union to force + // this. + union { + char elements[sizeof(T) * BLOCK_SIZE]; + details::max_align_t dummy; + }; + + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic + emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD + ? 
BLOCK_SIZE + : 1]; + + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be + // 'isNotPartOfInitialBlockPool' + +#if MCDBGQ_TRACKMEM + void* owner; +#endif + }; + static_assert(std::alignment_of::value >= + std::alignment_of::value, + "Internal error: Blocks must be at least as aligned as the " + "type they are wrapping"); + +#if MCDBGQ_TRACKMEM +public: + struct MemStats; + +private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) + : tailIndex(0) + , headIndex(0) + , dequeueOptimisticCount(0) + , dequeueOvercommit(0) + , tailBlock(nullptr) + , isExplicit(isExplicit_) + , parent(parent_) + { + } + + virtual ~ProducerBase(){}; + + template + inline bool dequeue(U& element) + { + if (isExplicit) + { + return static_cast(this)->dequeue(element); + } + else + { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) + { + return static_cast(this)->dequeue_bulk( + itemFirst, max); + } + else + { + return static_cast(this)->dequeue_bulk( + itemFirst, max); + } + } + + inline ProducerBase* next_prod() const + { + return static_cast(next); + } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) + ? static_cast(tail - head) + : 0; + } + + inline index_t getTail() const + { + return tailIndex.load(std::memory_order_relaxed); + } + + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: +#if MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* parent) + : ProducerBase(parent, true) + , blockIndex(nullptr) + , pr_blockIndexSlotsUsed(0) + , pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1) + , pr_blockIndexFront(0) + , pr_blockIndexEntries(nullptr) + , pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = + details::ceil_to_pow_2(parent->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) + { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number + // of current entries, i.e. + // EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
+ if (this->tailBlock != nullptr) + { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) != 0) + { + // The head's not on a block boundary, meaning a block + // somewhere is partially dequeued + // (or the head block is the tail block and was fully + // dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & + (pr_blockIndexSize - 1); + while (details::circular_less_than( + pr_blockIndexEntries[i].base + BLOCK_SIZE, + this->headIndex.load(std::memory_order_relaxed))) + { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than( + pr_blockIndexEntries[i].base, + this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop + // gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do + { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty< + explicit_context>()) + { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) + { + i = static_cast( + this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the + // tail block, we need to stop when we reach the tail index + auto lastValidIndex = + (this->tailIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) == 0 + ? BLOCK_SIZE + : static_cast( + this->tailIndex.load( + std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && + (block != this->tailBlock || i != lastValidIndex)) + { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) + { + auto block = this->tailBlock; + do + { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) + { + destroy(block); + } + else + { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) + { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) + { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && + this->tailBlock->next->ConcurrentQueue::Block:: + template is_empty()) + { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock + ->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + + // We'll put the block on the block index (guaranteed to be + // room since we're conceptually removing the + // last block from it first -- except instead of removing + // then adding, we can just overwrite). 
+ // Note that there must be a valid block index here, since + // even if allocation failed in the ctor, + // it would have been re-attempted when adding the first + // block to the queue; since there is such + // a block, a block index must have been successfully + // allocated. + } + else + { + // Whatever head value we see here is >= the last value we + // saw here (relatively), + // and <= its current value. Since we have the most recent + // tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert( + !details::circular_less_than(currentTailIndex, + head)); + if (!details::circular_less_than(head, + currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head))) + { + // We can't enqueue in another block because there's not + // enough leeway -- the + // tail could surpass the head by the time the block + // fills up! (Or we'll exceed + // the size limit, if the second part of the condition + // was true.) + return false; + } + // We're going to need a new block; check that the block + // index has room + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize) + { + // Hmm, the circular block index is already full -- + // we'll need + // to allocate a new index. Note pr_blockIndexRaw can + // only be nullptr if + // the initial allocation failed in the constructor. + + if (allocMode == CannotAlloc || + !new_block_index(pr_blockIndexSlotsUsed)) + { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue:: + template requisition_block(); + if (newBlock == nullptr) + { + return false; + } +#if MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (this->tailBlock == nullptr) + { + newBlock->next = newBlock; + } + else + { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, + new (nullptr) + T(std::forward(element)))) + { + // The constructor may throw. We want the element not to + // appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY + { + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) + { + // Revert change to the current block, but leave the new + // block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? 
+                                          this->tailBlock
+                                          : startBlock;
+                        MOODYCAMEL_RETHROW;
+                    }
+                }
+                else
+                {
+                    (void)startBlock;
+                    (void)originalBlockIndexSlotsUsed;
+                }
+
+                // Add block to block index
+                auto& entry = blockIndex.load(std::memory_order_relaxed)
+                                  ->entries[pr_blockIndexFront];
+                entry.base = currentTailIndex;
+                entry.block = this->tailBlock;
+                blockIndex.load(std::memory_order_relaxed)
+                    ->front.store(pr_blockIndexFront,
+                                  std::memory_order_release);
+                pr_blockIndexFront =
+                    (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+
+                if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U,
+                                              new (nullptr)
+                                                  T(std::forward<U>(element))))
+                {
+                    this->tailIndex.store(newTailIndex,
+                                          std::memory_order_release);
+                    return true;
+                }
+            }
+
+            // Enqueue
+            new ((*this->tailBlock)[currentTailIndex])
+                T(std::forward<U>(element));
+
+            this->tailIndex.store(newTailIndex, std::memory_order_release);
+            return true;
+        }
+
+        template <typename U>
+        bool dequeue(U& element)
+        {
+            auto tail = this->tailIndex.load(std::memory_order_relaxed);
+            auto overcommit =
+                this->dequeueOvercommit.load(std::memory_order_relaxed);
+            if (details::circular_less_than<index_t>(
+                    this->dequeueOptimisticCount.load(
+                        std::memory_order_relaxed) -
+                        overcommit,
+                    tail))
+            {
+                // Might be something to dequeue, let's give it a try
+
+                // Note that this if is purely for performance purposes in the
+                // common case when the queue is
+                // empty and the values are eventually consistent -- we may
+                // enter here spuriously.
+
+                // Note that whatever the values of overcommit and tail are,
+                // they are not going to change (unless we
+                // change them) and must be the same value at this point
+                // (inside the if) as when the if condition was
+                // evaluated.
+
+                // We insert an acquire fence here to synchronize-with the
+                // release upon incrementing dequeueOvercommit below.
+                // This ensures that whatever value we loaded into overcommit,
+                // the load of dequeueOptimisticCount in
+                // the fetch_add below will result in a value at least as
+                // recent as that (and therefore at least as large).
+                // Note that I believe a compiler (signal) fence here would be
+                // sufficient due to the nature of fetch_add (all
+                // read-modify-write operations are guaranteed to work on the
+                // latest value in the modification order), but
+                // unfortunately that can't be shown to be correct using only
+                // the C++11 standard.
+                // See
+                // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
+                std::atomic_thread_fence(std::memory_order_acquire);
+
+                // Increment optimistic counter, then check if it went over
+                // the boundary
+                auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(
+                    1, std::memory_order_relaxed);
+
+                // Note that since dequeueOvercommit must be <=
+                // dequeueOptimisticCount (because dequeueOvercommit is only
+                // ever
+                // incremented after dequeueOptimisticCount -- this is enforced
+                // in the `else` block below), and since we now
+                // have a version of dequeueOptimisticCount that is at least as
+                // recent as overcommit (due to the release upon
+                // incrementing dequeueOvercommit and the acquire above that
+                // synchronizes with it), overcommit <= myDequeueCount.
+                assert(overcommit <= myDequeueCount);
+
+                // Note that we reload tail here in case it changed; it will
+                // be the same value as before or greater, since this load is
+                // sequenced after (happens after) the earlier load above.
+                // This is supported by read-read coherency (as defined in
+                // the standard), explained here:
+                // http://en.cppreference.com/w/cpp/atomic/memory_order
+                tail = this->tailIndex.load(std::memory_order_acquire);
+                if (details::likely(details::circular_less_than<index_t>(
+                        myDequeueCount - overcommit, tail)))
+                {
+                    // Guaranteed to be at least one element to dequeue!
+
+                    // Get the index. Note that since there's guaranteed to be
+                    // at least one element, this
+                    // will never exceed tail. We need to do an acquire-release
+                    // fence here since it's possible
+                    // that whatever condition got us to this point was for an
+                    // earlier enqueued element (that
+                    // we already see the memory effects for), but that by the
+                    // time we increment somebody else
+                    // has incremented it, and we need to see the memory
+                    // effects for *that* element, which, in such a case, is
+                    // necessarily visible on the thread that incremented it
+                    // in the first place with the more current condition
+                    // (they must have acquired a tail that is at least as
+                    // recent).
+                    auto index =
+                        this->headIndex.fetch_add(1,
+                                                  std::memory_order_acq_rel);
+
+                    // Determine which block the element is in
+
+                    auto localBlockIndex =
+                        blockIndex.load(std::memory_order_acquire);
+                    auto localBlockIndexHead =
+                        localBlockIndex->front.load(std::memory_order_acquire);
+
+                    // We need to be careful here about subtracting and
+                    // dividing because of index wrap-around.
+                    // When an index wraps, we need to preserve the sign of
+                    // the offset when dividing it by the
+                    // block size (in order to get a correct signed block
+                    // count offset in all cases):
+                    auto headBase =
+                        localBlockIndex->entries[localBlockIndexHead].base;
+                    auto blockBaseIndex =
+                        index & ~static_cast<index_t>(BLOCK_SIZE - 1);
+                    auto offset = static_cast<size_t>(
+                        static_cast<typename std::make_signed<index_t>::type>(
+                            blockBaseIndex - headBase) /
+                        BLOCK_SIZE);
+                    auto block = localBlockIndex
+                                     ->entries[(localBlockIndexHead + offset) &
+                                               (localBlockIndex->size - 1)]
+                                     .block;
+
+                    // Dequeue
+                    auto& el = *((*block)[index]);
+                    if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&,
+                                                    element = std::move(el)))
+                    {
+                        // Make sure the element is still fully dequeued and
+                        // destroyed even if the assignment
+                        // throws
+                        struct Guard
+                        {
+                            Block* block;
+                            index_t index;
+
+                            ~Guard()
+                            {
+                                (*block)[index]->~T();
+                                block->ConcurrentQueue::Block::
+                                    template set_empty<explicit_context>(
+                                        index);
+                            }
+                        } guard = {block, index};
+
+                        element = std::move(el);
+                    }
+                    else
+                    {
+                        element = std::move(el);
+                        el.~T();
+                        block->ConcurrentQueue::Block::template set_empty<
+                            explicit_context>(index);
+                    }
+
+                    return true;
+                }
+                else
+                {
+                    // Wasn't anything to dequeue after all; make the effective
+                    // dequeue count eventually consistent
+                    this->dequeueOvercommit.fetch_add(
+                        1, std::memory_order_release); // Release so that the
+                                                       // fetch_add on
+                                                       // dequeueOptimisticCount
+                                                       // is guaranteed to
+                                                       // happen before this
+                                                       // write
+                }
+            }
+
+            return false;
+        }
+
+        template <AllocationMode allocMode, typename It>
+        bool enqueue_bulk(It itemFirst, size_t count)
+        {
+            // First, we need to make sure we have enough room to enqueue all
+            // of the elements;
+            // this means pre-allocating blocks and putting them in the block
+            // index (but only if
+            // all the allocations succeeded).
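+            // (Illustrative sketch, not part of the upstream header; kBlock,
+            // base and blocks_needed are our own names.) The blockBaseDiff
+            // arithmetic below rounds the old and new tail down to block
+            // boundaries; their difference is exactly the new capacity that
+            // must be provisioned, counted here in whole blocks:
+            //
+            //     #include <cstddef>
+            //     constexpr std::size_t kBlock = 32; // a power of two
+            //     constexpr std::size_t base(std::size_t i)
+            //     {
+            //         return i & ~(kBlock - 1);
+            //     }
+            //     constexpr std::size_t blocks_needed(std::size_t tail,
+            //                                         std::size_t n)
+            //     {
+            //         return (base(tail + n - 1) - base(tail - 1)) / kBlock;
+            //     }
+            //     static_assert(blocks_needed(32, 1) == 1, "crosses boundary");
+            //     static_assert(blocks_needed(33, 31) == 0, "fits in block");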
+ index_t startTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) + { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && + this->tailBlock->next != firstAllocatedBlock && + this->tailBlock->next->ConcurrentQueue::Block:: + template is_empty()) + { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) + { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert( + !details::circular_less_than(currentTailIndex, + head)); + bool full = + !details::circular_less_than(head, + currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize || full) + { + if (allocMode == CannotAlloc || full || + !new_block_index(originalBlockIndexSlotsUsed)) + { + // Failed to allocate, undo changes (but keep + // injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = + originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? firstAllocatedBlock + : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, + // so we need to + // update our fallback value too (since we keep the new + // index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue:: + template requisition_block(); + if (newBlock == nullptr) + { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? firstAllocatedBlock + : startBlock; + return false; + } + +#if MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty< + explicit_context>(); + if (this->tailBlock == nullptr) + { + newBlock->next = newBlock; + } + else + { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? 
this->tailBlock + : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's + // emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) + { + block->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (block == this->tailBlock) + { + break; + } + block = block->next; + } + + if (MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (nullptr) T(details::deref_noexcept(itemFirst)))) + { + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & + (pr_blockIndexSize - 1), + std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != + 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) + { + this->tailBlock = firstAllocatedBlock; + } + while (true) + { + auto stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, + stopIndex)) + { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (nullptr) T(details::deref_noexcept(itemFirst)))) + { + while (currentTailIndex != stopIndex) + { + new ((*this->tailBlock)[currentTailIndex++]) + T(*itemFirst++); + } + } + else + { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) + { + // Must use copy constructor even if move + // constructor is available + // because we may have to revert if there's an + // exception. + // Sorry about the horrible templated next line, but + // it was the only way + // to disable moving *at compile time*, which is + // important because a type + // may only define a (noexcept) move constructor, + // and so calls to the + // cctor will not compile, even if they are in an if + // branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) + T(details::nomove_if< + (bool)!MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (nullptr) T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) + { + // Oh dear, an exception's been thrown -- destroy the + // elements that + // were enqueued so far and revert the entire bulk + // operation (we'll keep + // any allocated blocks in our linked list for later, + // though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? 
firstAllocatedBlock + : startBlock; + + if (!details::is_trivially_destructible::value) + { + auto block = startBlock; + if ((startTailIndex & + static_cast(BLOCK_SIZE - 1)) == 0) + { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) + { + stopIndex = + (currentTailIndex & + ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than( + constructedStopIndex, stopIndex)) + { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) + { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) + { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) + { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (nullptr) T(details::deref_noexcept(itemFirst))) && + firstAllocatedBlock != nullptr) + { + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & + (pr_blockIndexSize - 1), + std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - + (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) + { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + assert(overcommit <= myDequeueCount); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) + { + actualCount = + desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) + { + this->dequeueOvercommit.fetch_add( + desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed + // to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = + this->headIndex.fetch_add(actualCount, + std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = + blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = + localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = + firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + firstBlockBaseIndex - headBase) / + BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & + (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do + { + auto firstIndexInBlock = index; + auto endIndex = + (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? 
firstIndex + static_cast(actualCount) + : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN( + T, T&&, details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) + { + while (index != endIndex) + { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else + { + MOODYCAMEL_TRY + { + while (index != endIndex) + { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) + { + // It's too late to revert the dequeue, but we + // can make sure that all + // the dequeued objects are properly destroyed + // and the block index + // (and empty count) are properly updated before + // we propagate the exception + do + { + block = localBlockIndex->entries[indexIndex] + .block; + while (index != endIndex) + { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block:: + template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast( + endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & + (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & + ~static_cast( + BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast( + actualCount), + endIndex) + ? firstIndex + static_cast( + actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast(endIndex - firstIndexInBlock)); + indexIndex = + (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else + { + // Wasn't anything to dequeue after all; make the effective + // dequeue count eventually consistent + this->dequeueOvercommit.fetch_add( + desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like + // pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast( + (Traits::malloc)(sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) + { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast( + details::align_for(newRawPtr + + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) + { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & + prevBlockSizeMask; + do + { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, + std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old + // one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + 
pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in + // referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer* nextExplicitProducer; + + private: +#endif + +#if MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue* parent) + : ProducerBase(parent, false) + , nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE) + , blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { +// Note that since we're in the destructor we can assume that all +// enqueue/dequeue operations +// completed already; this means that all undequeued elements are placed +// contiguously across +// contiguous blocks, and that only the first and last remaining blocks can be +// only partially +// empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) + { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = index != tail; // If we enter the loop, + // then the last (tail) + // block will not be freed + while (index != tail) + { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || + block == nullptr) + { + if (block != nullptr) + { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load( + std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on + // the free list + // (unless the head index reached the end of it, in which case the + // tail will be poised + // to create a new block). 
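+        // (Illustrative sketch, not part of the upstream header.) The
+        // boundary tests just above and below rely on BLOCK_SIZE being a
+        // power of two, so an index's low bits give its offset within a
+        // block and masking them off gives the block's base:
+        //
+        //     #include <cassert>
+        //     int main()
+        //     {
+        //         constexpr unsigned kBlockSize = 32; // a power of two
+        //         assert((64u & (kBlockSize - 1)) == 0u);   // on a boundary
+        //         assert((65u & (kBlockSize - 1)) == 1u);   // one slot in
+        //         assert((65u & ~(kBlockSize - 1)) == 64u); // block base
+        //     }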
+ if (this->tailBlock != nullptr && + (forceFreeLastBlock || + (tail & static_cast(BLOCK_SIZE - 1)) != 0)) + { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) + { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) + { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do + { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) + { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, + head)); + if (!details::circular_less_than(head, + currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head))) + { + return false; + } +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block + // index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, + currentTailIndex)) + { + return false; + } + + // Get ahold of a new block + auto newBlock = + this->parent->ConcurrentQueue::template requisition_block< + allocMode>(); + if (newBlock == nullptr) + { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#if MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context>(); + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, + new (nullptr) + T(std::forward(element)))) + { + // May throw, try to insert now before we publish the fact + // that we have this new block + MOODYCAMEL_TRY + { + new ((*newBlock)[currentTailIndex]) + T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) 
+ { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, + new (nullptr) + T(std::forward(element)))) + { + this->tailIndex.store(newTailIndex, + std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load( + std::memory_order_relaxed) - + overcommit, + tail)) + { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + assert(overcommit <= myDequeueCount); + tail = this->tailIndex.load(std::memory_order_acquire); + if (details::likely(details::circular_less_than( + myDequeueCount - overcommit, tail))) + { + index_t index = + this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, + element = std::move(el))) + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead + // of only when a block + // is released is very sub-optimal, but it is, after + // all, purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard + { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block:: + template set_empty( + index)) + { + entry->value.store( + nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = {block, index, entry, this->parent}; + + element = std::move(el); + } + else + { + element = std::move(el); + el.~T(); + + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context>(index)) + { + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool + // (and remove from block index) + entry->value.store(nullptr, + std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + } + + return true; + } + else + { + this->dequeueOvercommit.fetch_add( + 1, std::memory_order_release); + } + } + + return false; + } + + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of + // the elements; + // this means pre-allocating blocks and putting them in the block + // index (but only if + // all the allocations succeeded). 
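+            // (Illustrative sketch, not part of the upstream header; slotFor
+            // is our own name.) The implicit producer's block index used
+            // below is a power-of-two ring of (base index, Block*) entries;
+            // a lookup walks from the tail slot by a signed block offset,
+            // using division rather than a shift so that a negative offset
+            // caused by index wrap-around keeps its sign:
+            //
+            //     #include <cstddef>
+            //     #include <cstdint>
+            //     inline std::size_t slotFor(std::size_t tailSlot,
+            //                                std::uint64_t tailBase,
+            //                                std::uint64_t base,
+            //                                std::size_t capacity, // pow. of 2
+            //                                std::uint64_t blockSize)
+            //     {
+            //         auto offset =
+            //             static_cast<std::int64_t>(base - tailBase) /
+            //             static_cast<std::int64_t>(blockSize);
+            //         return (tailSlot + static_cast<std::size_t>(offset)) &
+            //                (capacity - 1);
+            //     }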
+ + // Note that the tailBlock we start off with may not be owned by us + // any more; + // this happens if it was filled up exactly to the top (setting + // tailIndex to + // the first index of the next block which is not yet allocated), + // then dequeued + // completely (putting it on the free list) before we enqueue again. + + index_t startTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do + { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block + // index + BlockIndexEntry* idxEntry = + nullptr; // initialization here unnecessary but compiler + // can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert( + !details::circular_less_than(currentTailIndex, + head)); + bool full = + !details::circular_less_than(head, + currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head)); + if (full || + !(indexInserted = insert_block_index_entry( + idxEntry, currentTailIndex)) || + (newBlock = + this->parent + ->ConcurrentQueue::template requisition_block< + allocMode>()) == nullptr) + { + // Index allocation or block allocation failed; revert + // any other allocations + // and index insertions done so far for this operation + if (indexInserted) + { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + } + currentTailIndex = + (startTailIndex - 1) & + ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) + { + currentTailIndex += + static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index( + currentTailIndex); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list( + firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#if MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context>(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later + // allocations fail, + // and so that we can find the blocks when we do the actual + // enqueueing + if ((startTailIndex & + static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr) + { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? 
newBlock + : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != + 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) + { + this->tailBlock = firstAllocatedBlock; + } + while (true) + { + auto stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, + stopIndex)) + { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (nullptr) T(details::deref_noexcept(itemFirst)))) + { + while (currentTailIndex != stopIndex) + { + new ((*this->tailBlock)[currentTailIndex++]) + T(*itemFirst++); + } + } + else + { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) + { + new ((*this->tailBlock)[currentTailIndex]) + T(details::nomove_if< + (bool)!MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (nullptr) T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) + { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) + { + auto block = startBlock; + if ((startTailIndex & + static_cast(BLOCK_SIZE - 1)) == 0) + { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) + { + stopIndex = + (currentTailIndex & + ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than( + constructedStopIndex, stopIndex)) + { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) + { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) + { + break; + } + block = block->next; + } + } + + currentTailIndex = + (startTailIndex - 1) & + ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) + { + currentTailIndex += + static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index( + currentTailIndex); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list( + firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) + { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - + (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) + { + desiredCount = desiredCount < max ? 
desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + assert(overcommit <= myDequeueCount); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) + { + actualCount = + desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) + { + this->dequeueOvercommit.fetch_add( + desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed + // to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = + this->headIndex.fetch_add(actualCount, + std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = + get_block_index_index_for_index(index, localBlockIndex); + do + { + auto blockStartIndex = index; + auto endIndex = + (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = + entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN( + T, T&&, details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) + { + while (index != endIndex) + { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else + { + MOODYCAMEL_TRY + { + while (index != endIndex) + { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) + { + do + { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load( + std::memory_order_relaxed); + while (index != endIndex) + { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block:: + template set_many_empty< + implicit_context>( + blockStartIndex, + static_cast( + endIndex - + blockStartIndex))) + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store( + nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list( + block); + } + indexIndex = + (indexIndex + 1) & + (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & + ~static_cast( + BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast( + actualCount), + endIndex) + ? firstIndex + static_cast( + actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block:: + template set_many_empty( + blockStartIndex, + static_cast(endIndex - + blockStartIndex))) + { + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a + // release, meaning that anybody who acquires + // the block + // we're about to free can use it safely since + // our writes (and reads!) will have + // happened-before then. 
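+                        // (Illustrative sketch, not part of the upstream
+                        // header.) The release/acquire handoff described
+                        // above is the standard publication pattern: plain
+                        // writes made before a releasing store are visible
+                        // to whoever performs the matching acquiring load:
+                        //
+                        //     #include <atomic>
+                        //     int payload = 0;
+                        //     std::atomic<bool> ready{false};
+                        //     void writer()
+                        //     {
+                        //         payload = 42;
+                        //         ready.store(true, std::memory_order_release);
+                        //     }
+                        //     int reader() // yields 42 whenever it sees true
+                        //     {
+                        //         return ready.load(std::memory_order_acquire)
+                        //                    ? payload
+                        //                    : -1;
+                        //     }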
+ entry->value.store(nullptr, + std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + indexIndex = + (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else + { + this->dequeueOvercommit.fetch_add( + desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an + // invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, + index_t blockStartIndex) + { + auto localBlockIndex = + blockIndex.load(std::memory_order_relaxed); // We're the only + // writer thread, + // relaxed is OK + if (localBlockIndex == nullptr) + { + return false; // this can happen if new_block_index failed in + // the constructor + } + auto newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == + INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) + { + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + if (allocMode == CannotAlloc || !new_block_index()) + { + return false; + } + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == + INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store( + (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & + (localBlockIndex->capacity - 1), + std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index( + index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index( + index_t index, BlockIndexHeader*& localBlockIndex) const + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load( + std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may + // wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast( + static_cast::type>( + index - tailBase) / + BLOCK_SIZE); + size_t idx = (tail + offset) & (localBlockIndex->capacity 
- 1); + assert(localBlockIndex->index[idx]->key.load( + std::memory_order_relaxed) == index && + localBlockIndex->index[idx]->value.load( + std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = + prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); + if (raw == nullptr) + { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast( + details::align_for(raw + + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast( + details::align_for( + reinterpret_cast(entries) + + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) + { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do + { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) + { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, + std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & + (nextBlockIndexCapacity - 1), + std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer* nextImplicitProducer; + + private: +#endif + +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#if MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) + { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) + { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) + { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= + initialBlockPoolSize) + { + return nullptr; + } + + auto index = + initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) + : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { +#if MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) + { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() { return freeList.try_get(); } + // Gets a free block from one of the memory pools, or allocates a new one + // (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) + { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) + { + return block; + } + + if (canAlloc == CanAlloc) + { + return create(); + } + + return nullptr; + } + +#if MCDBGQ_TRACKMEM +public: + struct MemStats + { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = {0}; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) + { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) + { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 
0 : 1; + + if (implicit) + { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = + prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) + { + for (size_t i = 0; i != hash->capacity; ++i) + { + if (hash->index[i]->key.load( + std::memory_order_relaxed) != + ImplicitProducer::INVALID_BLOCK_BASE && + hash->index[i]->value.load( + std::memory_order_relaxed) != nullptr) + { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += + hash->capacity * + sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) + { + stats.implicitBlockIndexBytes += + sizeof(typename ImplicitProducer:: + BlockIndexHeader) + + hash->capacity * + sizeof(typename ImplicitProducer:: + BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); + head += BLOCK_SIZE) + { + // auto block = + // prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else + { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) + { + auto block = tailBlock; + do + { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block:: + template is_empty() || + wasNonEmpty) + { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = + prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) + { + stats.explicitBlockIndexBytes += + sizeof( + typename ExplicitProducer::BlockIndexHeader) + + index->size * + sizeof( + typename ExplicitProducer::BlockIndexEntry); + index = static_cast< + typename ExplicitProducer::BlockIndexHeader*>( + index->prev); + } + } + } + + auto freeOnInitialPool = + q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= + q->initialBlockPoolSize + ? 0 + : q->initialBlockPoolSize - + q->initialBlockPoolIndex.load( + std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() { return MemStats::getFor(this); } +private: + friend struct MemStats; +#endif + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { + bool recycled; + return recycle_or_create_producer(isExplicit, recycled); + } + + ProducerBase* recycle_or_create_producer(bool isExplicit, bool& recycled) + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) + { + if (ptr->inactive.load(std::memory_order_relaxed) && + ptr->isExplicit == isExplicit) + { + bool expected = true; + if (ptr->inactive.compare_exchange_strong( + expected, /* desired */ false, + std::memory_order_acquire, std::memory_order_relaxed)) + { + // We caught one! 
It's been marked as activated, the caller + // can have it + recycled = true; + return ptr; + } + } + } + + recycled = false; + return add_producer(isExplicit ? static_cast( + create(this)) + : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) + { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do + { + producer->next = prevTail; + } while ( + !producerListTail.compare_exchange_weak(prevTail, producer, + std::memory_order_release, + std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) + { + auto prevTailExplicit = + explicitProducers.load(std::memory_order_relaxed); + do + { + static_cast(producer)->nextExplicitProducer = + prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak( + prevTailExplicit, static_cast(producer), + std::memory_order_release, std::memory_order_relaxed)); + } + else + { + auto prevTailImplicit = + implicitProducers.load(std::memory_order_relaxed); + do + { + static_cast(producer)->nextImplicitProducer = + prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak( + prevTailImplicit, static_cast(producer), + std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! + for (auto ptr = producerListTail.load(std::memory_order_relaxed); + ptr != nullptr; ptr = ptr->next_prod()) + { + ptr->parent = this; + } + } + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer* value; // No need for atomicity since it's only read + // by the thread that sets it in the first + // place + + ImplicitProducerKVP() + : value(nullptr) + { + } + + ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), + std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) + MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) + { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap( + typename ConcurrentQueue::ImplicitProducerKVP&, + typename ConcurrentQueue::ImplicitProducerKVP&) + MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP* entries; + ImplicitProducerHash* prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return; + + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) + { + initialImplicitProducerHashEntries[i].key.store( + details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + + void 
swap_implicit_producer_hashes(ConcurrentQueue& other) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return; + + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap( + other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = + &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = + &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, + other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == + &other.initialImplicitProducerHash) + { + implicitProducerHash.store(&initialImplicitProducerHash, + std::memory_order_relaxed); + } + else + { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &other.initialImplicitProducerHash; + hash = hash->prev) + { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == + &initialImplicitProducerHash) + { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, + std::memory_order_relaxed); + } + else + { + ImplicitProducerHash* hash; + for (hash = + other.implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &initialImplicitProducerHash; hash = hash->prev) + { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer* get_or_add_implicit_producer() + { +// Note that since the data is essentially thread-local (key is thread ID), +// there's a reduced need for fences (memory ordering is already consistent +// for any individual thread), except for the current table itself. + +// Start by looking for the thread ID in the current and all previous hash +// tables. +// If it's not found, it must not be in there yet, since this same thread would +// have added it previously to one of the tables that we traversed. + +// Code and algorithm adapted from +// http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) + { + // Look for the id in this hash + auto index = hashedId; + while (true) + { // Not an infinite loop because at least one slot is free in the + // hash table + index &= hash->capacity - 1; + + auto probedKey = + hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) + { + // Found it! If we had to search several hashes deep, + // though, we should lazily add it + // to the current main hash table to avoid the extended + // search next time. + // Note there's guaranteed to be room in the current hash + // table since every subsequent + // table implicitly reserves space for all previous tables + // (there's only one + // implicitProducerHashCount). 
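+            // (Illustrative sketch, not part of the upstream header; Slot
+            // and insert_or_find are our own names.) The probing here follows
+            // the lock-free hash table from the Preshing article cited above:
+            // claim a slot by CASing its key from empty to our thread id,
+            // then publish the value, which only the owning thread reads
+            // back. It assumes the table always keeps at least one free slot:
+            //
+            //     #include <atomic>
+            //     #include <cstddef>
+            //     #include <cstdint>
+            //     struct Slot { std::atomic<std::uint64_t> key; void* value; };
+            //     constexpr std::uint64_t kEmpty = 0;
+            //     void* insert_or_find(Slot* t, std::size_t capacity, // pow. of 2
+            //                          std::uint64_t id, void* value)
+            //     {
+            //         for (std::size_t i = static_cast<std::size_t>(id);; ++i)
+            //         {
+            //             Slot& s = t[i & (capacity - 1)];
+            //             if (s.key.load(std::memory_order_relaxed) == id)
+            //                 return s.value; // already inserted by this thread
+            //             std::uint64_t expected = kEmpty;
+            //             if (s.key.compare_exchange_strong(
+            //                     expected, id, std::memory_order_relaxed))
+            //             {
+            //                 s.value = value; // only this thread reads it
+            //                 return value;
+            //             }
+            //         }
+            //     }
+            //
+            // The real table additionally resizes and recycles tombstoned
+            // ids; this sketch omits both.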
+                    auto value = hash->entries[index].value;
+                    if (hash != mainHash)
+                    {
+                        index = hashedId;
+                        while (true)
+                        {
+                            index &= mainHash->capacity - 1;
+                            probedKey = mainHash->entries[index].key.load(
+                                std::memory_order_relaxed);
+                            auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+                            auto reusable = details::invalid_thread_id2;
+                            if ((probedKey == empty &&
+                                 mainHash->entries[index]
+                                     .key.compare_exchange_strong(
+                                         empty, id, std::memory_order_relaxed,
+                                         std::memory_order_relaxed)) ||
+                                (probedKey == reusable &&
+                                 mainHash->entries[index]
+                                     .key.compare_exchange_strong(
+                                         reusable, id,
+                                         std::memory_order_acquire,
+                                         std::memory_order_acquire)))
+                            {
+#else
+                            if ((probedKey == empty &&
+                                 mainHash->entries[index]
+                                     .key.compare_exchange_strong(
+                                         empty, id, std::memory_order_relaxed,
+                                         std::memory_order_relaxed)))
+                            {
+#endif
+                                mainHash->entries[index].value = value;
+                                break;
+                            }
+                            ++index;
+                        }
+                    }
+
+                    return value;
+                }
+                if (probedKey == details::invalid_thread_id)
+                {
+                    break; // Not in this hash table
+                }
+                ++index;
+            }
+        }
+
+        // Insert!
+        auto newCount =
+            1 +
+            implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed);
+        while (true)
+        {
+            if (newCount >= (mainHash->capacity >> 1) &&
+                !implicitProducerHashResizeInProgress.test_and_set(
+                    std::memory_order_acquire))
+            {
+                // We've acquired the resize lock, try to allocate a bigger
+                // hash table.
+                // Note the acquire fence synchronizes with the release fence
+                // at the end of this block, and hence when we reload
+                // implicitProducerHash it must be the most recent version
+                // (it only gets changed within this locked block).
+                mainHash = implicitProducerHash.load(std::memory_order_acquire);
+                if (newCount >= (mainHash->capacity >> 1))
+                {
+                    auto newCapacity = mainHash->capacity << 1;
+                    while (newCount >= (newCapacity >> 1))
+                    {
+                        newCapacity <<= 1;
+                    }
+                    auto raw = static_cast<char*>((Traits::malloc)(
+                        sizeof(ImplicitProducerHash) +
+                        std::alignment_of<ImplicitProducerKVP>::value - 1 +
+                        sizeof(ImplicitProducerKVP) * newCapacity));
+                    if (raw == nullptr)
+                    {
+                        // Allocation failed
+                        implicitProducerHashCount.fetch_add(
+                            -1, std::memory_order_relaxed);
+                        implicitProducerHashResizeInProgress.clear(
+                            std::memory_order_relaxed);
+                        return nullptr;
+                    }
+
+                    auto newHash = new (raw) ImplicitProducerHash;
+                    newHash->capacity = newCapacity;
+                    newHash->entries = reinterpret_cast<ImplicitProducerKVP*>(
+                        details::align_for<ImplicitProducerKVP>(
+                            raw + sizeof(ImplicitProducerHash)));
+                    for (size_t i = 0; i != newCapacity; ++i)
+                    {
+                        new (newHash->entries + i) ImplicitProducerKVP;
+                        newHash->entries[i].key.store(
+                            details::invalid_thread_id,
+                            std::memory_order_relaxed);
+                    }
+                    newHash->prev = mainHash;
+                    implicitProducerHash.store(newHash,
+                                               std::memory_order_release);
+                    implicitProducerHashResizeInProgress.clear(
+                        std::memory_order_release);
+                    mainHash = newHash;
+                }
+                else
+                {
+                    implicitProducerHashResizeInProgress.clear(
+                        std::memory_order_release);
+                }
+            }
+
+            // If it's < three-quarters full, add to the old one anyway so
+            // that we don't have to wait for the next table to finish being
+            // allocated by another thread (and if we just finished allocating
+            // above, the condition will always be true)
+            if (newCount <
+                (mainHash->capacity >> 1) + (mainHash->capacity >> 2))
+            {
+                bool recycled;
+                auto producer = static_cast<ImplicitProducer*>(
+                    recycle_or_create_producer(false, recycled));
+                if (producer == nullptr)
+                {
+                    implicitProducerHashCount.fetch_add(
+                        -1, std::memory_order_relaxed);
+                    return nullptr;
+                }
+                if (recycled)
+                {
+                    implicitProducerHashCount.fetch_add(
+                        -1, std::memory_order_relaxed);
+                }
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+                producer->threadExitListener.callback =
+                    &ConcurrentQueue::implicit_producer_thread_exited_callback;
+                producer->threadExitListener.userData = producer;
+                details::ThreadExitNotifier::subscribe(
+                    &producer->threadExitListener);
+#endif
+
+                auto index = hashedId;
+                while (true)
+                {
+                    index &= mainHash->capacity - 1;
+                    auto probedKey = mainHash->entries[index].key.load(
+                        std::memory_order_relaxed);
+
+                    auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+                    auto reusable = details::invalid_thread_id2;
+                    if ((probedKey == empty &&
+                         mainHash->entries[index].key.compare_exchange_strong(
+                             empty, id, std::memory_order_relaxed,
+                             std::memory_order_relaxed)) ||
+                        (probedKey == reusable &&
+                         mainHash->entries[index].key.compare_exchange_strong(
+                             reusable, id, std::memory_order_acquire,
+                             std::memory_order_acquire)))
+                    {
+#else
+                    if ((probedKey == empty &&
+                         mainHash->entries[index].key.compare_exchange_strong(
+                             empty, id, std::memory_order_relaxed,
+                             std::memory_order_relaxed)))
+                    {
+#endif
+                        mainHash->entries[index].value = producer;
+                        break;
+                    }
+                    ++index;
+                }
+                return producer;
+            }
+
+            // Hmm, the old hash is quite full and somebody else is busy
+            // allocating a new one.
+            // We need to wait for the allocating thread to finish (if it
+            // succeeds, we add, if not, we try to allocate ourselves).
+            mainHash = implicitProducerHash.load(std::memory_order_acquire);
+        }
+    }
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+    void implicit_producer_thread_exited(ImplicitProducer* producer)
+    {
+        // Remove from thread exit listeners
+        details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener);
+
+        // Remove from hash
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+        debug::DebugLock lock(implicitProdMutex);
+#endif
+        auto hash = implicitProducerHash.load(std::memory_order_acquire);
+        assert(hash != nullptr); // The thread exit listener is only
+                                 // registered if we were added to a hash in
+                                 // the first place
+        auto id = details::thread_id();
+        auto hashedId = details::hash_thread_id(id);
+        details::thread_id_t probedKey;
+
+        // We need to traverse all the hashes just in case other threads
+        // aren't on the current one yet and are trying to add an entry
+        // thinking there's a free slot (because they reused a producer)
+        for (; hash != nullptr; hash = hash->prev)
+        {
+            auto index = hashedId;
+            do
+            {
+                index &= hash->capacity - 1;
+                probedKey =
+                    hash->entries[index].key.load(std::memory_order_relaxed);
+                if (probedKey == id)
+                {
+                    hash->entries[index].key.store(details::invalid_thread_id2,
+                                                   std::memory_order_release);
+                    break;
+                }
+                ++index;
+            } while (probedKey !=
+                     details::invalid_thread_id); // Can happen if the hash has
+                                                  // changed but we weren't put
+                                                  // back in it yet, or if we
+                                                  // weren't added to this hash
+                                                  // in the first place
+        }
+
+        // Mark the queue as being recyclable
+        producer->inactive.store(true, std::memory_order_release);
+    }
+
+    static void implicit_producer_thread_exited_callback(void* userData)
+    {
+        auto producer = static_cast<ImplicitProducer*>(userData);
+        auto queue = producer->parent;
+        queue->implicit_producer_thread_exited(producer);
+    }
+#endif
+
+    //////////////////////////////////
+    // Utility functions
+    //////////////////////////////////
+
+    template <typename U>
+    static inline U* create_array(size_t count)
+    {
+        assert(count > 0);
+        auto p = static_cast<U*>((Traits::malloc)(sizeof(U) * count));
+        if (p == nullptr)
+        {
+            return nullptr;
+        }
+        for (size_t i = 0; i != count; ++i)
+        {
+            new (p + i) U();
+        }
+        return p;
+    }
+
+    template <typename U>
+    static inline void destroy_array(U* p, size_t count)
+    {
+        if (p != nullptr)
+        {
+            assert(count > 0);
+            for (size_t i = count; i != 0;)
+            {
+                (p + --i)->~U();
+            }
+            (Traits::free)(p);
+        }
+    }
+
+    template <typename U>
+    static inline U* create()
+    {
+        auto p = (Traits::malloc)(sizeof(U));
+        return p != nullptr ? new (p) U : nullptr;
+    }
+
+    template <typename U, typename A1>
+    static inline U* create(A1&& a1)
+    {
+        auto p = (Traits::malloc)(sizeof(U));
+        return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
+    }
+
+    template <typename U>
+    static inline void destroy(U* p)
+    {
+        if (p != nullptr)
+        {
+            p->~U();
+        }
+        (Traits::free)(p);
+    }
+
+private:
+    std::atomic<ProducerBase*> producerListTail;
+    std::atomic<std::uint32_t> producerCount;
+
+    std::atomic<size_t> initialBlockPoolIndex;
+    Block* initialBlockPool;
+    size_t initialBlockPoolSize;
+
+#if !MCDBGQ_USEDEBUGFREELIST
+    FreeList<Block> freeList;
+#else
+    debug::DebugFreeList<Block> freeList;
+#endif
+
+    std::atomic<ImplicitProducerHash*> implicitProducerHash;
+    std::atomic<size_t> implicitProducerHashCount; // Number of slots logically
+                                                   // used
+    ImplicitProducerHash initialImplicitProducerHash;
+    std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE>
+        initialImplicitProducerHashEntries;
+    std::atomic_flag implicitProducerHashResizeInProgress;
+
+    std::atomic<std::uint32_t> nextExplicitConsumerId;
+    std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+    debug::DebugMutex implicitProdMutex;
+#endif
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+    std::atomic<ExplicitProducer*> explicitProducers;
+    std::atomic<ImplicitProducer*> implicitProducers;
+#endif
+};
+
+template <typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
+    : producer(queue.recycle_or_create_producer(true))
+{
+    if (producer != nullptr)
+    {
+        producer->token = this;
+    }
+}
+
+template <typename T, typename Traits>
+ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
+    : producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)
+                   ->recycle_or_create_producer(true))
+{
+    if (producer != nullptr)
+    {
+        producer->token = this;
+    }
+}
+
+template <typename T, typename Traits>
+ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
+    : itemsConsumedFromCurrent(0)
+    , currentProducer(nullptr)
+    , desiredProducer(nullptr)
+{
+    initialOffset =
+        queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+    lastKnownGlobalOffset = -1;
+}
+
+template <typename T, typename Traits>
+ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
+    : itemsConsumedFromCurrent(0)
+    , currentProducer(nullptr)
+    , desiredProducer(nullptr)
+{
+    initialOffset =
+        reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)
+            ->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+    lastKnownGlobalOffset = -1;
+}
+
+template <typename T, typename Traits>
+inline void swap(ConcurrentQueue<T, Traits>& a,
+                 ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+
+inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+
+inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+
+template <typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
+                 typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b)
+    MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+}
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
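Editor's note: the header above is consumed by multiple producer threads without any lock; each thread gets an implicit producer keyed by its thread id via the hash table shown earlier. A minimal, self-contained sketch of that usage pattern (names, sizes and the consumer role are illustrative only, not Deflect's actual internals):

    #include "moodycamel/concurrentqueue.h"

    #include <iostream>
    #include <string>
    #include <thread>
    #include <vector>

    int main()
    {
        moodycamel::ConcurrentQueue<std::string> requests;

        // Many producer threads enqueue without taking any lock; an implicit
        // producer is created and hashed per thread id behind the scenes.
        std::vector<std::thread> producers;
        for (int i = 0; i < 4; ++i)
            producers.emplace_back([&requests, i] {
                for (int j = 0; j < 100; ++j)
                    requests.enqueue("segment " + std::to_string(i) + ":" +
                                     std::to_string(j));
            });

        for (auto& t : producers)
            t.join();

        // A single consumer (e.g. the socket send thread) drains the queue.
        std::string request;
        size_t count = 0;
        while (requests.try_dequeue(request))
            ++count;
        std::cout << count << " requests processed" << std::endl;
        return 0;
    }

When the per-thread hash lookup matters, the ProducerToken/ConsumerToken constructors defined above can be used instead to bind a producer or consumer slot explicitly.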
diff --git a/deflect/qt/helpers.h b/deflect/qt/helpers.h
index d3fe7e9..1991242 100644
--- a/deflect/qt/helpers.h
+++ b/deflect/qt/helpers.h
@@ -1,6 +1,6 @@
 /*********************************************************************/
-/* Copyright (c) 2016, EPFL/Blue Brain Project */
-/* Raphael Dumusc */
+/* Copyright (c) 2016-2017, EPFL/Blue Brain Project */
+/* Raphael Dumusc */
 /* All rights reserved. */
 /* */
 /* Redistribution and use in source and binary forms, with or */
@@ -47,14 +47,6 @@ namespace deflect
 {
 namespace qt
 {
-template <typename T>
-std::future<T> make_ready_future(T&& value)
-{
-    std::promise<T> promise;
-    promise.set_value(std::forward<T>(value));
-    return promise.get_future();
-}
-
 // missing make_unique() implementation in C++11 standard
 // source: http://herbsutter.com/gotw/_102/
 template <typename T, typename... Args>
diff --git a/deflect/types.h b/deflect/types.h
index 0b30208..dcb8cbc 100644
--- a/deflect/types.h
+++ b/deflect/types.h
@@ -73,6 +73,14 @@ constexpr typename std::underlying_type<E>::type as_underlying_type(E e)
     return static_cast<typename std::underlying_type<E>::type>(e);
 }
 
+template <typename T>
+std::future<T> make_ready_future(T&& value)
+{
+    std::promise<T> promise;
+    promise.set_value(std::forward<T>(value));
+    return promise.get_future();
+}
+
 class EventReceiver;
 class Frame;
 class FrameDispatcher;
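Editor's note: make_ready_future() moves from deflect/qt/helpers.h to the shared deflect/types.h so that non-Qt code can wrap an already-available value in a std::future and keep an asynchronous signature on a synchronous path. A minimal usage sketch (the include path is an assumption based on the hunk above):

    #include <deflect/types.h>

    #include <future>
    #include <iostream>

    int main()
    {
        // Complete an operation synchronously while still returning the
        // std::future<bool> interface callers expect.
        std::future<bool> result = deflect::make_ready_future(true);

        // get() returns immediately; no promise bookkeeping at the call site.
        std::cout << std::boolalpha << result.get() << std::endl;
        return 0;
    }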
diff --git a/doc/Changelog.md b/doc/Changelog.md
index 8f15cbc..4677130 100644
--- a/doc/Changelog.md
+++ b/doc/Changelog.md
@@ -5,6 +5,8 @@ Changelog {#Changelog}
 
 ### 0.14.0 (git master)
 
+* [176](https://github.com/BlueBrain/Deflect/pull/176):
+  OPT: Lock-free request queueing for multi-threaded stream clients (e.g. KNL)
 * [175](https://github.com/BlueBrain/Deflect/pull/175):
   Add deflect::Observer which can be used to only receive events w/o the need
   to send images
diff --git a/tests/cpp/ServerTests.cpp b/tests/cpp/ServerTests.cpp
index 456cfcd..e0e2066 100644
--- a/tests/cpp/ServerTests.cpp
+++ b/tests/cpp/ServerTests.cpp
@@ -48,6 +48,7 @@ namespace ut = boost::unit_test;
 
 #include
 #include
+#include
 #include
 #include
@@ -376,3 +377,152 @@ BOOST_AUTO_TEST_CASE(testOneObserverAndOneStream)
     BOOST_CHECK_EQUAL(openedStreams, 0);
     BOOST_CHECK_EQUAL(receivedFrames, expectedFrames);
 }
+
+BOOST_AUTO_TEST_CASE(testThreadedSmallSegmentStream)
+{
+    QThread serverThread;
+    deflect::Server* server = new deflect::Server(0 /* OS-chosen port */);
+    server->moveToThread(&serverThread);
+    serverThread.connect(&serverThread, &QThread::finished, server,
+                         &deflect::Server::deleteLater);
+    serverThread.start();
+
+    // to wait in this thread until server thread is done with certain
+    // operations
+    QWaitCondition received;
+    QMutex mutex;
+    bool receivedState = false;
+
+    auto processServerMessages = [&] {
+        for (size_t j = 0; j < 20; ++j)
+        {
+            mutex.lock();
+            received.wait(&mutex, 100 /*ms*/);
+            if (receivedState)
+            {
+                mutex.unlock();
+                break;
+            }
+            mutex.unlock();
+        }
+        BOOST_REQUIRE(receivedState);
+        receivedState = false;
+    };
+
+    const unsigned int segmentSize = 64;
+    const unsigned int width = 1920;
+    const unsigned int height = 1088;
+
+    struct Segment
+    {
+        unsigned int x;
+        unsigned int y;
+    };
+    std::vector<Segment> segments;
+    for (unsigned int i = 0; i < width; i += segmentSize)
+    {
+        for (unsigned int j = 0; j < height; j += segmentSize)
+            segments.emplace_back(Segment{i, j});
+    }
+
+    const unsigned int numSegments = segments.size();
+    BOOST_REQUIRE_EQUAL(numSegments,
+                        std::ceil((float)width / segmentSize) *
+                            std::ceil((float)height / segmentSize));
+
+    const std::vector<uint8_t> pixels(segmentSize * segmentSize * 4, 42);
+
+    size_t openedStreams = 0;
+    // only continue once we have the stream
+    server->connect(server, &deflect::Server::pixelStreamOpened,
+                    [&](const QString) {
+                        ++openedStreams;
+                        if (openedStreams == 1)
+                        {
+                            mutex.lock();
+                            receivedState = true;
+                            received.wakeAll();
+                            mutex.unlock();
+                        }
+                    });
+
+    // make sure that we get the close of the stream
+    server->connect(server, &deflect::Server::pixelStreamClosed,
+                    [&](const QString) {
+                        --openedStreams;
+                        if (openedStreams == 0)
+                        {
+                            mutex.lock();
+                            receivedState = true;
+                            received.wakeAll();
+                            mutex.unlock();
+                        }
+                    });
+
+    // handle received frames to test the stream's purpose
+    const size_t expectedFrames = 10;
+    size_t receivedFrames = 0;
+    server->connect(server, &deflect::Server::receivedFrame,
+                    [&](deflect::FramePtr frame) {
+                        BOOST_CHECK_EQUAL(frame->segments.size(), numSegments);
+                        BOOST_CHECK_EQUAL(frame->uri.toStdString(),
+                                          testStreamId.toStdString());
+                        const auto dim = frame->computeDimensions();
+                        BOOST_CHECK_EQUAL(dim.width(), width);
+                        BOOST_CHECK_EQUAL(dim.height(), height);
+                        ++receivedFrames;
+                        mutex.lock();
+                        receivedState = true;
+                        received.wakeAll();
+                        mutex.unlock();
+                    });
+
+    {
+        deflect::Stream stream(testStreamId.toStdString(), "localhost",
+                               server->serverPort());
+        BOOST_REQUIRE(stream.isConnected());
+
+        // handle connects first before sending and receiving frames
+        processServerMessages();
+
+        std::mutex testMutex; // BOOST_CHECK is not thread-safe
+        for (size_t i = 0; i < expectedFrames; ++i)
+        {
+            receivedState = false;
+
+// to make coverage report work; otherwise fails for unknown reasons
+#ifdef NDEBUG
+#pragma omp parallel for
+#endif
+            for (int j = 0; j < int(segments.size()); ++j)
+            {
+                deflect::ImageWrapper deflectImage((const void*)pixels.data(),
+                                                   segmentSize, segmentSize,
+                                                   deflect::RGBA,
+                                                   segments[j].x,
+                                                   segments[j].y);
+                deflectImage.compressionPolicy = deflect::COMPRESSION_ON;
+
+                const bool success = stream.send(deflectImage).get();
+
+                std::lock_guard<std::mutex> lock(testMutex);
+                BOOST_CHECK(success);
+            }
+
+            BOOST_CHECK(stream.finishFrame().get());
+
+            server->requestFrame(testStreamId);
+
+            // process frame receive
+            processServerMessages();
+        }
+    }
+
+    // handle close of streamer
+    processServerMessages();
+
+    serverThread.quit();
+    serverThread.wait();
+
+    BOOST_CHECK_EQUAL(openedStreams, 0);
+    BOOST_CHECK_EQUAL(receivedFrames, expectedFrames);
+}
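Editor's note: a distilled client-side sketch of what the new test exercises: several OpenMP threads compress and send segments of one frame concurrently, which the lock-free request queue makes safe. Stream name, host, port and pixel content are placeholders; the future-based send()/finishFrame() API is taken from the test above:

    #include <deflect/Stream.h>

    #include <vector>

    int main()
    {
        deflect::Stream stream("mystream", "localhost", 1701);
        if (!stream.isConnected())
            return 1;

        const unsigned int segmentSize = 64;
        const std::vector<unsigned char> pixels(segmentSize * segmentSize * 4, 0);

    // each thread compresses and sends its own segment of the frame
    #pragma omp parallel for
        for (int i = 0; i < 16; ++i)
        {
            deflect::ImageWrapper image(pixels.data(), segmentSize, segmentSize,
                                        deflect::RGBA, i * segmentSize, 0);
            image.compressionPolicy = deflect::COMPRESSION_ON;
            stream.send(image).get(); // std::future<bool>
        }
        return stream.finishFrame().get() ? 0 : 1;
    }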