diff --git a/ACKNOWLEDGEMENTS.txt b/ACKNOWLEDGEMENTS.txt new file mode 100644 index 0000000..48dfada --- /dev/null +++ b/ACKNOWLEDGEMENTS.txt @@ -0,0 +1,30 @@ +=========================================================================== +moodycamel::ConcurrentQueue (https://github.com/cameron314/concurrentqueue) +=========================================================================== + +Simplified BSD License: + +Copyright (c) 2013-2016, Cameron Desrochers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + diff --git a/CMakeLists.txt b/CMakeLists.txt index 87b7181..d9cacba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,7 @@ if(NOT LibJpegTurbo_FOUND) list(APPEND COMMON_FIND_PACKAGE_DEFINES DEFLECT_USE_LEGACY_LIBJPEGTURBO) endif() common_find_package(OpenGL) +common_find_package(OpenMP) common_find_package(Qt5Concurrent REQUIRED SYSTEM) common_find_package(Qt5Core REQUIRED) if(APPLE) @@ -48,6 +49,8 @@ if(NOT Qt5Quick_VERSION VERSION_LESS 5.5) option(DEFLECT_QMLSTREAMER_MULTITHREADED "Use multithreaded-rendering in QMLStreamer" ON) endif() +set(LCOV_EXCLUDE "deflect/moodycamel/*") + add_subdirectory(deflect) add_subdirectory(apps) if(Boost_FOUND) diff --git a/deflect/ImageSegmenter.cpp b/deflect/ImageSegmenter.cpp index ea18ee6..38dbae8 100644 --- a/deflect/ImageSegmenter.cpp +++ b/deflect/ImageSegmenter.cpp @@ -76,7 +76,7 @@ Segment ImageSegmenter::compressSingleSegment(const ImageWrapper& image) if (segments.size() > 1) throw std::runtime_error( "compressSingleSegment only works for small images"); - ImageSegmenter::_computeJpeg(segments[0], false); + _computeJpeg(segments[0], false); return segments[0]; #else throw std::runtime_error( @@ -234,7 +234,7 @@ SegmentParametersList ImageSegmenter::_makeSegmentParameters( p.y = image.y + j * info.height; p.width = (i < info.countX - 1) ? info.width : info.lastWidth; p.height = (j < info.countY - 1) ? 
info.height : info.lastHeight; - parameters.push_back(p); + parameters.emplace_back(p); } } return parameters; diff --git a/deflect/ImageSegmenter.h b/deflect/ImageSegmenter.h index 9f1de90..ed100f7 100644 --- a/deflect/ImageSegmenter.h +++ b/deflect/ImageSegmenter.h @@ -98,6 +98,7 @@ class ImageSegmenter * * @param image The image to be compressed * @return the compressed segment + * @threadsafe */ DEFLECT_API Segment compressSingleSegment(const ImageWrapper& image); diff --git a/deflect/MessageHeader.cpp b/deflect/MessageHeader.cpp index 4db2d64..657ade4 100644 --- a/deflect/MessageHeader.cpp +++ b/deflect/MessageHeader.cpp @@ -69,9 +69,7 @@ MessageHeader::MessageHeader(const MessageType type_, const uint32_t size_, QDataStream& operator<<(QDataStream& out, const deflect::MessageHeader& header) { out << (qint32)header.type << (quint32)header.size; - - for (size_t i = 0; i < MESSAGE_HEADER_URI_LENGTH; ++i) - out << (quint8)header.uri[i]; + out.writeRawData(header.uri, MESSAGE_HEADER_URI_LENGTH); return out; } diff --git a/deflect/Socket.cpp b/deflect/Socket.cpp index 0a66608..b224c01 100644 --- a/deflect/Socket.cpp +++ b/deflect/Socket.cpp @@ -1,5 +1,7 @@ /*********************************************************************/ -/* Copyright (c) 2011 - 2012, The University of Texas at Austin. */ +/* Copyright (c) 2015-2017, EPFL/Blue Brain Project */ +/* Raphael Dumusc */ +/* Daniel Nachbaur */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -33,7 +35,7 @@ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ +/* or implied, of Ecole polytechnique federale de Lausanne. */ /*********************************************************************/ #include "Socket.h" @@ -108,7 +110,8 @@ bool Socket::hasMessage(const size_t messageSize) const (int)(MessageHeader::serializedSize + messageSize); } -bool Socket::send(const MessageHeader& messageHeader, const QByteArray& message) +bool Socket::send(const MessageHeader& messageHeader, const QByteArray& message, + const bool waitForBytesWritten) { QMutexLocker locker(&_socketMutex); if (!isConnected()) @@ -123,10 +126,13 @@ bool Socket::send(const MessageHeader& messageHeader, const QByteArray& message) // send message const bool allSent = _write(message); - // Needed in the absence of event loop, otherwise the reception is frozen. - while (_socket->bytesToWrite() > 0 && isConnected()) - _socket->waitForBytesWritten(); - + if (waitForBytesWritten) + { + // Needed in the absence of an event loop, otherwise the reception is + // frozen. + while (_socket->bytesToWrite() > 0 && isConnected()) + _socket->waitForBytesWritten(); + } return allSent; } diff --git a/deflect/Socket.h b/deflect/Socket.h index 46a8c1d..2e9094c 100644 --- a/deflect/Socket.h +++ b/deflect/Socket.h @@ -1,5 +1,7 @@ /*********************************************************************/ -/* Copyright (c) 2011 - 2012, The University of Texas at Austin. */ +/* Copyright (c) 2015-2017, EPFL/Blue Brain Project */ +/* Raphael Dumusc */ +/* Daniel Nachbaur */ /* All rights reserved.
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -33,7 +35,7 @@ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ +/* or implied, of Ecole polytechnique federale de Lausanne. */ /*********************************************************************/ #ifndef DEFLECT_SOCKET_H @@ -103,9 +105,13 @@ class Socket : public QObject * Send a message. * @param messageHeader The message header * @param message The message data + * @param waitForBytesWritten wait until the message is completely sent; in + * case of multiple sends per frame it is advised to do this only + * once per frame * @return true if the message could be sent, false otherwise */ - bool send(const MessageHeader& messageHeader, const QByteArray& message); + bool send(const MessageHeader& messageHeader, const QByteArray& message, + bool waitForBytesWritten); /** * Receive a message. diff --git a/deflect/StreamSendWorker.cpp b/deflect/StreamSendWorker.cpp index 6996895..67b709e 100644 --- a/deflect/StreamSendWorker.cpp +++ b/deflect/StreamSendWorker.cpp @@ -57,6 +57,7 @@ namespace deflect StreamSendWorker::StreamSendWorker(Socket& socket, const std::string& id) : _socket(socket) , _id(id) + , _dequeuedRequests(std::thread::hardware_concurrency() / 2) { _imageSegmenter.setNominalSegmentDimensions(SEGMENT_SIZE, SEGMENT_SIZE); } @@ -71,62 +72,93 @@ void StreamSendWorker::run() _running = true; while (true) { - // Copy request, unlock enqueue methods during processing of tasks - std::unique_lock<std::mutex> lock(_mutex); - while (_requests.empty() && _running) - _condition.wait(lock); - if (!_running) break; - const auto request = std::move(_requests.front()); - _requests.pop_front(); - lock.unlock(); + size_t count = 0; + if (!_pendingFinish) + count = _requests.wait_dequeue_bulk(_dequeuedRequests.begin(), + _dequeuedRequests.size()); + else + { + // in case we encountered a finish request, get all remaining send + // requests without waiting + count = _requests.try_dequeue_bulk(_dequeuedRequests.begin(), + _dequeuedRequests.size()); + + // no more pending sends, now process the finish request and reset + // for the next finish + if (count == 0) + { + count = 1; + _finishRequest.isFinish = false; // reset so this request is // processed now + _dequeuedRequests[0] = _finishRequest; + _pendingFinish = false; + } + } - bool success = true; - for (auto& task : request.tasks) + for (size_t i = 0; i < count; ++i) { - if (!task()) + bool success = true; + auto& request = _dequeuedRequests[i]; + + // postpone a finish request to maintain order (as the lock-free + // queue does not guarantee ordering) + if (request.isFinish) + { + if (_pendingFinish) + throw std::runtime_error("Already have a pending finish"); + + _finishRequest = request; + _pendingFinish = true; + continue; + } + + for (auto& task : request.tasks) { - success = false; - break; + if (!task()) + { + success = false; + break; + } } + if (request.promise) + request.promise->set_value(success); } - request.promise->set_value(success); } } void StreamSendWorker::stop() { { - std::lock_guard<std::mutex> lock(_mutex); - if (!_running) - return; _running = false; - _condition.notify_all(); + _enqueueRequest(std::vector<Task>()); } quit(); wait(); - while (!_requests.empty()) + Request request; while (_requests.try_dequeue(request)) { - _requests.front().promise->set_value(false); - _requests.pop_front();
+ if (request.promise) + request.promise->set_value(false); } } Stream::Future StreamSendWorker::enqueueImage(const ImageWrapper& image, const bool finish) { + if (_pendingFinish) + throw(std::runtime_error{"Pending finish, no send allowed"}); + if (image.compressionPolicy != COMPRESSION_ON && image.pixelFormat != RGBA) { std::cerr << "Currently, RAW images can only be sent in RGBA format. " "Support for other formats remains to be implemented." << std::endl; - std::promise<bool> promise; - promise.set_value(false); - return promise.get_future(); + return make_ready_future(false); } std::vector<Task> tasks; @@ -136,6 +168,12 @@ Stream::Future StreamSendWorker::enqueueImage(const ImageWrapper& image, { auto segment = _imageSegmenter.compressSingleSegment(image); tasks.emplace_back([this, segment] { return _sendSegment(segment); }); + + // as we expect to encounter a lot of these small sends, be optimistic + // and fulfill the promise right away to reduce load in the send thread + // (cf. the cost of lock operations on KNL) + _requests.enqueue({nullptr, tasks, false}); + return make_ready_future(true); } else tasks.emplace_back([this, image] { return _sendImage(image); }); @@ -148,7 +186,7 @@ Stream::Future StreamSendWorker::enqueueImage(const ImageWrapper& image, Stream::Future StreamSendWorker::enqueueFinish() { - return _enqueueRequest({[this] { return _sendFinish(); }}); + return _enqueueRequest({[this] { return _sendFinish(); }}, true); } Stream::Future StreamSendWorker::enqueueOpen() @@ -195,13 +233,11 @@ Stream::Future StreamSendWorker::enqueueData(const QByteArray data) {[this, data] { return _send(MESSAGE_TYPE_DATA, data); }}); } -Stream::Future StreamSendWorker::_enqueueRequest(std::vector<Task>&& tasks) +Stream::Future StreamSendWorker::_enqueueRequest(std::vector<Task>&& tasks, + const bool isFinish) { PromisePtr promise(new Promise); - - std::lock_guard<std::mutex> lock(_mutex); - _requests.push_back({promise, tasks}); - _condition.notify_all(); + _requests.enqueue({promise, tasks, isFinish}); return promise->get_future(); } @@ -230,7 +266,7 @@ bool StreamSendWorker::_sendSegment(const Segment& segment) auto message = QByteArray{(const char*)(&segment.parameters), sizeof(SegmentParameters)}; message.append(segment.imageData); - return _send(MESSAGE_TYPE_PIXELSTREAM, message); + return _send(MESSAGE_TYPE_PIXELSTREAM, message, false); } bool StreamSendWorker::_sendFinish() { return _send(MESSAGE_TYPE_PIXELSTREAM_FINISH_FRAME, {}); } -bool StreamSendWorker::_send(const MessageType type, const QByteArray& message) +bool StreamSendWorker::_send(const MessageType type, const QByteArray& message, + const bool waitForBytesWritten) { - return _socket.send(MessageHeader(type, message.size(), _id), message); + return _socket.send(MessageHeader(type, message.size(), _id), message, + waitForBytesWritten); } } diff --git a/deflect/StreamSendWorker.h b/deflect/StreamSendWorker.h index 1f6730e..a56ac30 100644 --- a/deflect/StreamSendWorker.h +++ b/deflect/StreamSendWorker.h @@ -46,10 +46,16 @@ #include "Socket.h" // member #include "Stream.h" // Stream::Future -#include +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#endif +#include "moodycamel/blockingconcurrentqueue.h" +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif -#include -#include +#include namespace deflect { @@ -99,29 +105,34 @@ class StreamSendWorker : public QThread { PromisePtr promise; std::vector<Task> tasks; + bool isFinish; }; Socket& _socket; const std::string& _id; ImageSegmenter
_imageSegmenter; - std::deque<Request> _requests; - std::mutex _mutex; - std::condition_variable _condition; + moodycamel::BlockingConcurrentQueue<Request> _requests; bool _running = false; View _currentView = View::mono; + std::vector<Request> _dequeuedRequests; + bool _pendingFinish = false; + Request _finishRequest; + /** Main QThread loop doing asynchronous processing of queued tasks. */ void run() final; - Stream::Future _enqueueRequest(std::vector<Task>&& actions); + Stream::Future _enqueueRequest(std::vector<Task>&& actions, + bool isFinish = false); friend class deflect::test::Application; // to send pre-compressed segments bool _sendImage(const ImageWrapper& image); bool _sendImageView(View view); bool _sendSegment(const Segment& segment); bool _sendFinish(); - bool _send(MessageType type, const QByteArray& message); + bool _send(MessageType type, const QByteArray& message, + bool waitForBytesWritten = true); }; } #endif diff --git a/deflect/moodycamel/blockingconcurrentqueue.h b/deflect/moodycamel/blockingconcurrentqueue.h new file mode 100644 index 0000000..48405fa --- /dev/null +++ b/deflect/moodycamel/blockingconcurrentqueue.h @@ -0,0 +1,1097 @@ +// Provides an efficient blocking version of moodycamel::ConcurrentQueue. +// ©2015-2016 Cameron Desrochers. Distributed under the terms of the simplified +// BSD license, available at the top of concurrentqueue.h. +// Uses Jeff Preshing's semaphore implementation (under the terms of its +// separate zlib license, embedded below). + +#pragma once + +#include "concurrentqueue.h" +#include +#include +#include +#include +#include + +#if defined(_WIN32) +// Avoid including windows.h in a header; we only need a handful of +// items, so we'll redeclare them here (this is relatively safe since +// the API generally has to remain stable between Windows versions). +// I know this is an ugly hack but it still beats polluting the global +// namespace with thousands of generic names or adding a .cpp for nothing. +extern "C" { +struct _SECURITY_ATTRIBUTES; +__declspec(dllimport) void* __stdcall CreateSemaphoreW( _SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName); +__declspec(dllimport) int __stdcall CloseHandle(void* hObject); +__declspec(dllimport) unsigned long __stdcall WaitForSingleObject( void* hHandle, unsigned long dwMilliseconds); +__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount); +} +#elif defined(__MACH__) +#include +#elif defined(__unix__) +#include +#endif + +namespace moodycamel +{ +namespace details +{ +// Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's +// portable + lightweight semaphore implementations, originally from +// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h +// LICENSE: +// Copyright (c) 2015 Jeff Preshing +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2.
Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. +namespace mpmc_sema +{ +#if defined(_WIN32) +class Semaphore +{ +private: + void* m_hSema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + const long maxLong = 0x7fffffff; + m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr); + } + + ~Semaphore() { CloseHandle(m_hSema); } + void wait() + { + const unsigned long infinite = 0xffffffff; + WaitForSingleObject(m_hSema, infinite); + } + + bool try_wait() + { + const unsigned long RC_WAIT_TIMEOUT = 0x00000102; + return WaitForSingleObject(m_hSema, 0) != RC_WAIT_TIMEOUT; + } + + bool timed_wait(std::uint64_t usecs) + { + const unsigned long RC_WAIT_TIMEOUT = 0x00000102; + return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) != + RC_WAIT_TIMEOUT; + } + + void signal(int count = 1) { ReleaseSemaphore(m_hSema, count, nullptr); } +}; +#elif defined(__MACH__) +//--------------------------------------------------------- +// Semaphore (Apple iOS and OSX) +// Can't use POSIX semaphores due to +// http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html +//--------------------------------------------------------- +class Semaphore +{ +private: + semaphore_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, + initialCount); + } + + ~Semaphore() { semaphore_destroy(mach_task_self(), m_sema); } + void wait() { semaphore_wait(m_sema); } + bool try_wait() { return timed_wait(0); } + bool timed_wait(std::uint64_t timeout_usecs) + { + mach_timespec_t ts; + ts.tv_sec = timeout_usecs / 1000000; + ts.tv_nsec = (timeout_usecs % 1000000) * 1000; + + // added in OSX 10.10: + // https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html + kern_return_t rc = semaphore_timedwait(m_sema, ts); + + return rc != KERN_OPERATION_TIMED_OUT; + } + + void signal() { semaphore_signal(m_sema); } + void signal(int count) + { + while (count-- > 0) + { + semaphore_signal(m_sema); + } + } +}; +#elif defined(__unix__) +//--------------------------------------------------------- +// Semaphore (POSIX, Linux) +//--------------------------------------------------------- +class Semaphore +{ +private: + sem_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + sem_init(&m_sema, 0, initialCount); + } + + ~Semaphore() { sem_destroy(&m_sema); } + void wait() + { + // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error + int rc; + do + { + rc = sem_wait(&m_sema); + } while (rc == -1 && errno == EINTR); + } + + bool try_wait() + { + int rc; + do + { + rc = sem_trywait(&m_sema); + } while (rc == -1 && errno == EINTR); + return !(rc == -1 && errno == EAGAIN); + } + + bool timed_wait(std::uint64_t usecs) + { + struct timespec ts; + const int usecs_in_1_sec = 1000000; + const 
int nsecs_in_1_sec = 1000000000; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += usecs / usecs_in_1_sec; + ts.tv_nsec += (usecs % usecs_in_1_sec) * 1000; + // sem_timedwait bombs if you have more than 1e9 in tv_nsec + // so we have to clean things up before passing it in + if (ts.tv_nsec > nsecs_in_1_sec) + { + ts.tv_nsec -= nsecs_in_1_sec; + ++ts.tv_sec; + } + + int rc; + do + { + rc = sem_timedwait(&m_sema, &ts); + } while (rc == -1 && errno == EINTR); + return !(rc == -1 && errno == ETIMEDOUT); + } + + void signal() { sem_post(&m_sema); } + void signal(int count) + { + while (count-- > 0) + { + sem_post(&m_sema); + } + } +}; +#else +#error Unsupported platform! (No semaphore wrapper available) +#endif + +//--------------------------------------------------------- +// LightweightSemaphore +//--------------------------------------------------------- +class LightweightSemaphore +{ +public: + typedef std::make_signed::type ssize_t; + +private: + std::atomic m_count; + Semaphore m_sema; + + bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) + { + ssize_t oldCount; + // Is there a better way to set the initial spin count? + // If we lower it to 1000, testBenaphore becomes 15x slower on my Core + // i7-5930K Windows PC, + // as threads start hitting the kernel semaphore. + int spin = 10000; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if ((oldCount > 0) && + m_count.compare_exchange_strong(oldCount, oldCount - 1, + std::memory_order_acquire, + std::memory_order_relaxed)) + return true; + std::atomic_signal_fence(std::memory_order_acquire); // Prevent the + // compiler + // from + // collapsing + // the loop. + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount > 0) + return true; + if (timeout_usecs < 0) + { + m_sema.wait(); + return true; + } + if (m_sema.timed_wait((std::uint64_t)timeout_usecs)) + return true; + // At this point, we've timed out waiting for the semaphore, but the + // count is still decremented indicating we may still be waiting on + // it. So we have to re-adjust the count, but only if the semaphore + // wasn't signaled enough times for us too since then. If it was, we + // need to release the semaphore too. + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + return true; + if (oldCount < 0 && + m_count.compare_exchange_strong(oldCount, oldCount + 1, + std::memory_order_relaxed, + std::memory_order_relaxed)) + return false; + } + } + + ssize_t waitManyWithPartialSpinning(ssize_t max, + std::int64_t timeout_usecs = -1) + { + assert(max > 0); + ssize_t oldCount; + int spin = 10000; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if (oldCount > 0) + { + ssize_t newCount = oldCount > max ? 
oldCount - max : 0; + if (m_count.compare_exchange_strong(oldCount, newCount, + std::memory_order_acquire, + std::memory_order_relaxed)) + return oldCount - newCount; + } + std::atomic_signal_fence(std::memory_order_acquire); + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount <= 0) + { + if (timeout_usecs < 0) + m_sema.wait(); + else if (!m_sema.timed_wait((std::uint64_t)timeout_usecs)) + { + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + break; + if (oldCount < 0 && + m_count.compare_exchange_strong( + oldCount, oldCount + 1, std::memory_order_relaxed, + std::memory_order_relaxed)) + return 0; + } + } + } + if (max > 1) + return 1 + tryWaitMany(max - 1); + return 1; + } + +public: + LightweightSemaphore(ssize_t initialCount = 0) + : m_count(initialCount) + { + assert(initialCount >= 0); + } + + bool tryWait() + { + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + if (m_count.compare_exchange_weak(oldCount, oldCount - 1, + std::memory_order_acquire, + std::memory_order_relaxed)) + return true; + } + return false; + } + + void wait() + { + if (!tryWait()) + waitWithPartialSpinning(); + } + + bool wait(std::int64_t timeout_usecs) + { + return tryWait() || waitWithPartialSpinning(timeout_usecs); + } + + // Acquires between 0 and (greedily) max, inclusive + ssize_t tryWaitMany(ssize_t max) + { + assert(max >= 0); + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + ssize_t newCount = oldCount > max ? oldCount - max : 0; + if (m_count.compare_exchange_weak(oldCount, newCount, + std::memory_order_acquire, + std::memory_order_relaxed)) + return oldCount - newCount; + } + return 0; + } + + // Acquires at least one, and (greedily) at most max + ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs) + { + assert(max >= 0); + ssize_t result = tryWaitMany(max); + if (result == 0 && max > 0) + result = waitManyWithPartialSpinning(max, timeout_usecs); + return result; + } + + ssize_t waitMany(ssize_t max) + { + ssize_t result = waitMany(max, -1); + assert(result > 0); + return result; + } + + void signal(ssize_t count = 1) + { + assert(count >= 0); + ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release); + ssize_t toRelease = -oldCount < count ? -oldCount : count; + if (toRelease > 0) + { + m_sema.signal((int)toRelease); + } + } + + ssize_t availableApprox() const + { + ssize_t count = m_count.load(std::memory_order_relaxed); + return count > 0 ? count : 0; + } +}; +} // end namespace mpmc_sema +} // end namespace details + +// This is a blocking version of the queue. It has an almost identical interface +// to +// the normal non-blocking version, with the addition of various wait_dequeue() +// methods +// and the removal of producer-specific dequeue methods. 
+template +class BlockingConcurrentQueue +{ +private: + typedef ::moodycamel::ConcurrentQueue ConcurrentQueue; + typedef details::mpmc_sema::LightweightSemaphore LightweightSemaphore; + +public: + typedef typename ConcurrentQueue::producer_token_t producer_token_t; + typedef typename ConcurrentQueue::consumer_token_t consumer_token_t; + + typedef typename ConcurrentQueue::index_t index_t; + typedef typename ConcurrentQueue::size_t size_t; + typedef typename std::make_signed::type ssize_t; + + static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE; + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = + ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD; + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = + ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE; + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = + ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE; + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = + ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + static const std::uint32_t + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE; + static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE; + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be + // allocated + // up-front, which means only a single producer will be able to enqueue + // elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads + // (this + // includes making the memory effects of construction visible, possibly with + // a + // memory barrier). + explicit BlockingConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : inner(capacity) + , sema(create(), + &BlockingConcurrentQueue::template destroy) + { + assert( + reinterpret_cast((BlockingConcurrentQueue*)1) == + &((BlockingConcurrentQueue*)1)->inner && + "BlockingConcurrentQueue must have ConcurrentQueue as its first " + "member"); + if (!sema) + { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, + size_t maxImplicitProducers) + : inner(minCapacity, maxExplicitProducers, maxImplicitProducers) + , sema(create(), + &BlockingConcurrentQueue::template destroy) + { + assert( + reinterpret_cast((BlockingConcurrentQueue*)1) == + &((BlockingConcurrentQueue*)1)->inner && + "BlockingConcurrentQueue must have ConcurrentQueue as its first " + "member"); + if (!sema) + { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + // Disable copying and copy assignment + BlockingConcurrentQueue(BlockingConcurrentQueue const&) + MOODYCAMEL_DELETE_FUNCTION; + BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) + MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
+ BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : inner(std::move(other.inner)), + sema(std::move(other.sema)) + { + } + + inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) + MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other) + { + if (this == &other) + { + return *this; + } + + inner.swap(other.inner); + sema.swap(other.sema); + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit + // production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + if (details::likely(inner.enqueue(item))) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit + // production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + if (details::likely(inner.enqueue(std::move(item)))) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + if (details::likely(inner.enqueue(token, item))) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + if (details::likely(inner.enqueue(token, std::move(item)))) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be + // surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead + // of copied. + // Thread-safe. + template + inline bool enqueue_bulk(It itemFirst, size_t count) + { + if (details::likely( + inner.enqueue_bulk(std::forward(itemFirst), count))) + { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. 
Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, + size_t count) + { + if (details::likely( + inner.enqueue_bulk(token, std::forward(itemFirst), count))) + { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or + // implicit + // production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + if (inner.try_enqueue(item)) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + if (inner.try_enqueue(std::move(item))) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + if (inner.try_enqueue(token, item)) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + if (inner.try_enqueue(token, std::move(item))) + { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(std::forward(itemFirst), count)) + { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, + size_t count) + { + if (inner.try_enqueue_bulk(token, std::forward(itemFirst), count)) + { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
+ template + inline bool try_dequeue(U& item) + { + if (sema->tryWait()) + { + while (!inner.try_dequeue(item)) + { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue(consumer_token_t& token, U& item) + { + if (sema->tryWait()) + { + while (!inner.try_dequeue(token, item)) + { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany( + (LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) + { + count += + inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, + size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany( + (LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) + { + count += inner.template try_dequeue_bulk(token, itemFirst, + max - count); + } + return count; + } + + // Blocks the current thread until there's something to dequeue, then + // dequeues it. + // Never allocates. Thread-safe. + template + inline void wait_dequeue(U& item) + { + sema->wait(); + while (!inner.try_dequeue(item)) + { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) + { + return false; + } + while (!inner.try_dequeue(item)) + { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed( + U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed( + item, std::chrono::duration_cast(timeout) + .count()); + } + + // Blocks the current thread until there's something to dequeue, then + // dequeues it using an explicit consumer token. + // Never allocates. Thread-safe. 
+ template + inline void wait_dequeue(consumer_token_t& token, U& item) + { + sema->wait(); + while (!inner.try_dequeue(token, item)) + { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, + std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) + { + return false; + } + while (!inner.try_dequeue(token, item)) + { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed( + consumer_token_t& token, U& item, + std::chrono::duration const& timeout) + { + return wait_dequeue_timed( + token, item, + std::chrono::duration_cast(timeout) + .count()); + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = + (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) + { + count += + inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, + std::int64_t timeout_usecs) + { + size_t count = 0; + max = + (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, + timeout_usecs); + while (count != max) + { + count += + inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed( + It itemFirst, size_t max, + std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed( + itemFirst, max, + std::chrono::duration_cast(timeout) + .count()); + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. 
+ template + inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, + size_t max) + { + size_t count = 0; + max = + (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) + { + count += inner.template try_dequeue_bulk(token, itemFirst, + max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, + size_t max, + std::int64_t timeout_usecs) + { + size_t count = 0; + max = + (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, + timeout_usecs); + while (count != max) + { + count += inner.template try_dequeue_bulk(token, itemFirst, + max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed( + consumer_token_t& token, It itemFirst, size_t max, + std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed( + token, itemFirst, max, + std::chrono::duration_cast(timeout) + .count()); + } + + // Returns an estimate of the total number of elements currently in the + // queue. This + // estimate is only accurate if the queue has completely stabilized before + // it is called + // (i.e. all enqueue and dequeue operations have completed and their memory + // effects are + // visible on the calling thread, and no further operations start while this + // method is + // being called). + // Thread-safe. + inline size_t size_approx() const + { + return (size_t)sema->availableApprox(); + } + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static bool is_lock_free() { return ConcurrentQueue::is_lock_free(); } +private: + template + static inline U* create() + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U* create(A1&& a1) + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) + { + p->~U(); + } + (Traits::free)(p); + } + +private: + ConcurrentQueue inner; + std::unique_ptr sema; +}; + +template +inline void swap(BlockingConcurrentQueue& a, + BlockingConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} // end namespace moodycamel diff --git a/deflect/moodycamel/concurrentqueue.h b/deflect/moodycamel/concurrentqueue.h new file mode 100644 index 0000000..3dabb2a --- /dev/null +++ b/deflect/moodycamel/concurrentqueue.h @@ -0,0 +1,5089 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free +// queue. 
+// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2016, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this +// list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of +// conditions and the following disclaimer in the documentation and/or other +// materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#if defined(__GNUC__) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing +// warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` +// method declarations. +// We'll override the default trait malloc ourselves without a macro. +#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. 
+#include +#endif +#include +#include +#include // for CHAR_BIT +#include // for max_align_t +#include +#include +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading +#include +#include + +// Platform-specific definitions of a numeric thread ID type and an invalid +// value +namespace moodycamel +{ +namespace details +{ +template +struct thread_id_converter +{ + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } +}; +} +} +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel +{ +namespace details +{ +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; +static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; +static inline thread_id_t thread_id() +{ + return rl::thread_index(); +} +} +} +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the +// function +// we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId( + void); +namespace moodycamel +{ +namespace details +{ +static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), + "Expected size of unsigned long to be 32 bits on Windows"); +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = + 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx +static const thread_id_t invalid_thread_id2 = + 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used + // in practice. Note that all Win32 thread IDs are presently + // multiples of 4. +static inline thread_id_t thread_id() +{ + return static_cast(::GetCurrentThreadId()); +} +} +} +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ + (defined(__APPLE__) && TARGET_OS_IPHONE) +namespace moodycamel +{ +namespace details +{ +static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, + "std::thread::id is expected to be either 4 or 8 bytes"); + +typedef std::thread::id thread_id_t; +static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + +// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have +// one; it's +// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which +// it won't +// be. 
+static inline thread_id_t thread_id()
+{
+    return std::this_thread::get_id();
+}
+
+template <std::size_t>
+struct thread_id_size
+{
+};
+template <>
+struct thread_id_size<4>
+{
+    typedef std::uint32_t numeric_t;
+};
+template <>
+struct thread_id_size<8>
+{
+    typedef std::uint64_t numeric_t;
+};
+
+template <>
+struct thread_id_converter<thread_id_t>
+{
+    typedef thread_id_size<sizeof(thread_id_t)>::numeric_t
+        thread_id_numeric_size_t;
+#ifndef __APPLE__
+    typedef std::size_t thread_id_hash_t;
+#else
+    typedef thread_id_numeric_size_t thread_id_hash_t;
+#endif
+
+    static thread_id_hash_t prehash(thread_id_t const& x)
+    {
+#ifndef __APPLE__
+        return std::hash<std::thread::id>()(x);
+#else
+        return *reinterpret_cast<thread_id_hash_t const*>(&x);
+#endif
+    }
+};
+}
+}
+#else
+// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475
+// In order to get a numeric thread ID in a platform-independent way, we use a
+// thread-local static variable's address as a thread identifier :-)
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+#define MOODYCAMEL_THREADLOCAL __thread
+#elif defined(_MSC_VER)
+#define MOODYCAMEL_THREADLOCAL __declspec(thread)
+#else
+// Assume C++11 compliant compiler
+#define MOODYCAMEL_THREADLOCAL thread_local
+#endif
+namespace moodycamel
+{
+namespace details
+{
+typedef std::uintptr_t thread_id_t;
+static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr
+static const thread_id_t invalid_thread_id2 =
+    1; // Member accesses off a null pointer are also generally invalid. Plus
+       // it's not aligned.
+static inline thread_id_t thread_id()
+{
+    static MOODYCAMEL_THREADLOCAL int x;
+    return reinterpret_cast<thread_id_t>(&x);
+}
+}
+}
+#endif
+
+// Exceptions
+#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) ||   \
+    (defined(__GNUC__) && defined(__EXCEPTIONS)) || \
+    (!defined(_MSC_VER) && !defined(__GNUC__))
+#define MOODYCAMEL_EXCEPTIONS_ENABLED
+#endif
+#endif
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+#define MOODYCAMEL_TRY try
+#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__)
+#define MOODYCAMEL_RETHROW throw
+#define MOODYCAMEL_THROW(expr) throw(expr)
+#else
+#define MOODYCAMEL_TRY if (true)
+#define MOODYCAMEL_CATCH(...) else if (false)
+#define MOODYCAMEL_RETHROW
+#define MOODYCAMEL_THROW(expr)
+#endif
+
+#ifndef MOODYCAMEL_NOEXCEPT
+#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED)
+#define MOODYCAMEL_NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800
+// VS2012's std::is_nothrow_[move_]constructible is broken and returns true
+// when it shouldn't :-(
+// We have to assume *all* non-trivial constructors may throw on VS2012!
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)      \
+    (std::is_rvalue_reference<valueType>::value &&           \
+             std::is_move_constructible<type>::value         \
+         ? std::is_trivially_move_constructible<type>::value \
+         : std::is_trivially_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr)        \
+    ((std::is_rvalue_reference<valueType>::value &&              \
+              std::is_move_assignable<type>::value               \
+          ? std::is_trivially_move_assignable<type>::value ||    \
+                std::is_nothrow_move_assignable<type>::value     \
+          : std::is_trivially_copy_assignable<type>::value ||    \
+                std::is_nothrow_copy_assignable<type>::value) && \
+     MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)         \
+    (std::is_rvalue_reference<valueType>::value &&              \
+             std::is_move_constructible<type>::value            \
+         ? std::is_trivially_move_constructible<type>::value || \
+               std::is_nothrow_move_constructible<type>::value  \
+         : std::is_trivially_copy_constructible<type>::value || \
+               std::is_nothrow_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr)        \
+    ((std::is_rvalue_reference<valueType>::value &&              \
+              std::is_move_assignable<type>::value               \
+          ? std::is_trivially_move_assignable<type>::value ||    \
+                std::is_nothrow_move_assignable<type>::value     \
+          : std::is_trivially_copy_assignable<type>::value ||    \
+                std::is_nothrow_copy_assignable<type>::value) && \
+     MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#else
+#define MOODYCAMEL_NOEXCEPT noexcept
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr)
+#endif
+#endif
+
+#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#else
+// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has
+// a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445
+// g++ <=4.7 doesn't support thread_local either.
+// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it
+// to compile but it's unconfirmed to actually work
+#if (!defined(_MSC_VER) || _MSC_VER >= 1900) &&           \
+    (!defined(__MINGW32__) && !defined(__MINGW64__) ||    \
+     !defined(__WINPTHREADS_VERSION)) &&                  \
+    (!defined(__GNUC__) || __GNUC__ > 4 ||                \
+     (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) &&           \
+    (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \
+    !defined(_M_ARM) && !defined(__aarch64__)
+// Assume `thread_local` is fully supported in all other C++11
+// compilers/platforms
+//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now
+// since several users report having problems with it on
+#endif
+#endif
+#endif
+
+// VS2012 doesn't support deleted functions.
+// In this case, we declare the function normally but don't define it. A link
+// error will be generated if the function is called.
+#ifndef MOODYCAMEL_DELETE_FUNCTION
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define MOODYCAMEL_DELETE_FUNCTION
+#else
+#define MOODYCAMEL_DELETE_FUNCTION = delete
+#endif
+#endif
+
+// Compiler-specific likely/unlikely hints
+namespace moodycamel
+{
+namespace details
+{
+#if defined(__GNUC__)
+inline bool likely(bool x)
+{
+    return __builtin_expect((x), true);
+}
+inline bool unlikely(bool x)
+{
+    return __builtin_expect((x), false);
+}
+#else
+inline bool likely(bool x)
+{
+    return x;
+}
+inline bool unlikely(bool x)
+{
+    return x;
+}
+#endif
+}
+}
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+#include "internal/concurrentqueue_internal_debug.h"
+#endif
+
+namespace moodycamel
+{
+namespace details
+{
+template <typename T>
+struct const_numeric_max
+{
+    static_assert(std::is_integral<T>::value,
+                  "const_numeric_max can only be used with integers");
+    static const T value =
+        std::numeric_limits<T>::is_signed
+            ? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) -
+                  static_cast<T>(1)
+            : static_cast<T>(-1);
+};
+
+#if defined(__GNUC__) && !defined(__clang__)
+typedef ::max_align_t max_align_t; // GCC forgot to add it to std:: for a while
+#else
+typedef std::max_align_t max_align_t; // Others (e.g. MSVC) insist it can
+                                      // *only* be accessed via std::
+#endif
+}
+
+// Default traits for the ConcurrentQueue. To change some of the
+// traits without re-implementing all of them, inherit from this
+// struct and shadow the declarations you wish to be different;
+// since the traits are used as a template type parameter, the
+// shadowed declarations will be used where defined, and the defaults
+// otherwise.
+struct ConcurrentQueueDefaultTraits
+{
+    // General-purpose size type. std::size_t is strongly recommended.
+    typedef std::size_t size_t;
+
+    // The type used for the enqueue and dequeue indices. Must be at least as
+    // large as size_t. Should be significantly larger than the number of
+    // elements you expect to hold at once, especially if you have a high
+    // turnover rate; for example, on 32-bit x86, if you expect to have over a
+    // hundred million elements or pump several million elements through your
+    // queue in a very short space of time, using a 32-bit type *may* trigger
+    // a race condition. A 64-bit int type is recommended in that case, and in
+    // practice will prevent a race condition no matter the usage of the
+    // queue. Note that whether the queue is lock-free with a 64-bit int type
+    // depends on whether std::atomic<std::uint64_t> is lock-free, which is
+    // platform-specific.
+    typedef std::size_t index_t;
+
+    // Internally, all elements are enqueued and dequeued from multi-element
+    // blocks; this is the smallest controllable unit. If you expect few
+    // elements but many producers, a smaller block size should be favoured.
+    // For few producers and/or many elements, a larger block size is
+    // preferred. A sane default is provided. Must be a power of 2.
+    static const size_t BLOCK_SIZE = 32;
+
+    // For explicit producers (i.e. when using a producer token), the block is
+    // checked for being empty by iterating through a list of flags, one per
+    // element. For large block sizes, this is too inefficient, and switching
+    // to an atomic counter-based approach is faster. The switch is made for
+    // block sizes strictly larger than this threshold.
+    static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;
+
+    // How many full blocks can be expected for a single explicit producer?
+    // This should reflect that number's maximum for optimal performance.
+    // Must be a power of 2.
+    static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;
+
+    // How many full blocks can be expected for a single implicit producer?
+    // This should reflect that number's maximum for optimal performance.
+    // Must be a power of 2.
+    static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;
+
+    // The initial size of the hash table mapping thread IDs to implicit
+    // producers. Note that the hash is resized every time it becomes half
+    // full. Must be a power of two, and either 0 or at least 1. If 0,
+    // implicit production (using the enqueue methods without an explicit
+    // producer token) is disabled.
+    static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;
+
+    // Controls the number of items that an explicit consumer (i.e. one with
+    // a token) must consume before it causes all consumers to rotate and
+    // move on to the next internal queue.
+    static const std::uint32_t
+        EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256;
+
+    // The maximum number of elements (inclusive) that can be enqueued to a
+    // sub-queue. Enqueue operations that would cause this limit to be
+    // surpassed will fail. Note that this limit is enforced at the block
+    // level (for performance reasons), i.e. it's rounded up to the nearest
+    // block size.
+    static const size_t MAX_SUBQUEUE_SIZE =
+        details::const_numeric_max<size_t>::value;
+
+#ifndef MCDBGQ_USE_RELACY
+// Memory allocation can be customized if needed.
+// malloc should return nullptr on failure, and handle alignment like
+// std::malloc.
+#if defined(malloc) || defined(free)
+    // Gah, this is 2015, stop defining macros that break standard code
+    // already! Work around malloc/free being special macros:
+    static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); }
+    static inline void WORKAROUND_free(void* ptr) { return free(ptr); }
+    static inline void*(malloc)(size_t size) { return WORKAROUND_malloc(size); }
+    static inline void(free)(void* ptr) { return WORKAROUND_free(ptr); }
+#else
+    static inline void* malloc(size_t size) { return std::malloc(size); }
+    static inline void free(void* ptr) { return std::free(ptr); }
+#endif
+#else
+    // Debug versions when running under the Relacy race detector (ignore
+    // these in user code)
+    static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); }
+    static inline void free(void* ptr) { return rl::rl_free(ptr, $); }
+#endif
+};
+
+// When producing or consuming many elements, the most efficient way is to:
+// 1) Use one of the bulk-operation methods of the queue with a token
+// 2) Failing that, use the bulk-operation methods without a token
+// 3) Failing that, create a token and use that with the single-item methods
+// 4) Failing that, use the single-parameter methods of the queue
+// Having said that, don't create tokens willy-nilly -- ideally there should
+// be a maximum of one token per thread (of each kind).
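+//
+// For illustration, the recommended pattern above might look like this
+// (usage sketch only; it relies solely on the public API declared below):
+//
+//     moodycamel::ConcurrentQueue<int> q;
+//     moodycamel::ProducerToken ptok(q);
+//     int in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+//     q.enqueue_bulk(ptok, in, 8);            // 1) bulk methods + token
+//
+//     moodycamel::ConsumerToken ctok(q);
+//     int out[8];
+//     std::size_t n = q.try_dequeue_bulk(ctok, out, 8);
+//     // out[0..n) now holds the dequeued items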
+struct ProducerToken;
+struct ConsumerToken;
+
+template <typename T, typename Traits>
+class ConcurrentQueue;
+template <typename T, typename Traits>
+class BlockingConcurrentQueue;
+class ConcurrentQueueTests;
+
+namespace details
+{
+struct ConcurrentQueueProducerTypelessBase
+{
+    ConcurrentQueueProducerTypelessBase* next;
+    std::atomic<bool> inactive;
+    ProducerToken* token;
+
+    ConcurrentQueueProducerTypelessBase()
+        : next(nullptr)
+        , inactive(false)
+        , token(nullptr)
+    {
+    }
+};
+
+template <bool use32>
+struct _hash_32_or_64
+{
+    static inline std::uint32_t hash(std::uint32_t h)
+    {
+        // MurmurHash3 finalizer -- see
+        // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
+        // Since the thread ID is already unique, all we really want to do is
+        // propagate that uniqueness evenly across all the bits, so that we
+        // can use a subset of the bits while reducing collisions
+        // significantly
+        h ^= h >> 16;
+        h *= 0x85ebca6b;
+        h ^= h >> 13;
+        h *= 0xc2b2ae35;
+        return h ^ (h >> 16);
+    }
+};
+template <>
+struct _hash_32_or_64<1>
+{
+    static inline std::uint64_t hash(std::uint64_t h)
+    {
+        h ^= h >> 33;
+        h *= 0xff51afd7ed558ccd;
+        h ^= h >> 33;
+        h *= 0xc4ceb9fe1a85ec53;
+        return h ^ (h >> 33);
+    }
+};
+template <std::size_t size>
+struct hash_32_or_64 : public _hash_32_or_64<(size > 4)>
+{
+};
+
+static inline size_t hash_thread_id(thread_id_t id)
+{
+    static_assert(
+        sizeof(thread_id_t) <= 8,
+        "Expected a platform where thread IDs are at most 64-bit values");
+    return static_cast<size_t>(
+        hash_32_or_64<sizeof(
+            thread_id_converter<thread_id_t>::thread_id_hash_t)>::
+            hash(thread_id_converter<thread_id_t>::prehash(id)));
+}
+
+template <typename T>
+static inline bool circular_less_than(T a, T b)
+{
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4554)
+#endif
+    static_assert(std::is_integral<T>::value &&
+                      !std::numeric_limits<T>::is_signed,
+                  "circular_less_than is intended to be used only with "
+                  "unsigned integer types");
+    return static_cast<T>(a - b) >
+           static_cast<T>(static_cast<T>(1)
+                          << static_cast<T>(sizeof(T) * CHAR_BIT - 1));
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+}
+
+template <typename U>
+static inline char* align_for(char* ptr)
+{
+    const std::size_t alignment = std::alignment_of<U>::value;
+    return ptr +
+           (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) %
+               alignment;
+}
+
+template <typename T>
+static inline T ceil_to_pow_2(T x)
+{
+    static_assert(std::is_integral<T>::value &&
+                      !std::numeric_limits<T>::is_signed,
+                  "ceil_to_pow_2 is intended to be used only with unsigned "
+                  "integer types");
+
+    // Adapted from
+    // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+    --x;
+    x |= x >> 1;
+    x |= x >> 2;
+    x |= x >> 4;
+    for (std::size_t i = 1; i < sizeof(T); i <<= 1)
+    {
+        x |= x >> (i << 3);
+    }
+    ++x;
+    return x;
+}
+
+template <typename T>
+static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right)
+{
+    T temp = std::move(left.load(std::memory_order_relaxed));
+    left.store(std::move(right.load(std::memory_order_relaxed)),
+               std::memory_order_relaxed);
+    right.store(std::move(temp), std::memory_order_relaxed);
+}
+
+template <typename T>
+static inline T const& nomove(T const& x)
+{
+    return x;
+}
+
+template <bool Enable>
+struct nomove_if
+{
+    template <typename T>
+    static inline T const& eval(T const& x)
+    {
+        return x;
+    }
+};
+
+template <>
+struct nomove_if<false>
+{
+    template <typename U>
+    static inline auto eval(U&& x) -> decltype(std::forward<U>(x))
+    {
+        return std::forward<U>(x);
+    }
+};
+
+template <typename It>
+static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it)
+{
+    return *it;
+}
+
+#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \
+    (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+template <typename T>
+struct is_trivially_destructible : std::is_trivially_destructible<T>
+{
+};
+#else
+template <typename T>
+struct is_trivially_destructible : std::has_trivial_destructor<T>
+{
+};
+#endif
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+typedef RelacyThreadExitListener ThreadExitListener;
+typedef RelacyThreadExitNotifier ThreadExitNotifier;
+#else
+struct ThreadExitListener
+{
+    typedef void (*callback_t)(void*);
+    callback_t callback;
+    void* userData;
+
+    ThreadExitListener* next; // reserved for use by the ThreadExitNotifier
+};
+
+class ThreadExitNotifier
+{
+public:
+    static void subscribe(ThreadExitListener* listener)
+    {
+        auto& tlsInst = instance();
+        listener->next = tlsInst.tail;
+        tlsInst.tail = listener;
+    }
+
+    static void unsubscribe(ThreadExitListener* listener)
+    {
+        auto& tlsInst = instance();
+        ThreadExitListener** prev = &tlsInst.tail;
+        for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next)
+        {
+            if (ptr == listener)
+            {
+                *prev = ptr->next;
+                break;
+            }
+            prev = &ptr->next;
+        }
+    }
+
+private:
+    ThreadExitNotifier()
+        : tail(nullptr)
+    {
+    }
+    ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
+    ThreadExitNotifier& operator=(ThreadExitNotifier const&)
+        MOODYCAMEL_DELETE_FUNCTION;
+
+    ~ThreadExitNotifier()
+    {
+        // This thread is about to exit, let everyone know!
+        assert(this == &instance() &&
+               "If this assert fails, you likely have a buggy compiler! Change "
+               "the preprocessor conditions such that "
+               "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined.");
+        for (auto ptr = tail; ptr != nullptr; ptr = ptr->next)
+        {
+            ptr->callback(ptr->userData);
+        }
+    }
+
+    // Thread-local
+    static inline ThreadExitNotifier& instance()
+    {
+        static thread_local ThreadExitNotifier notifier;
+        return notifier;
+    }
+
+private:
+    ThreadExitListener* tail;
+};
+#endif
+#endif
+
+template <typename T>
+struct static_is_lock_free_num
+{
+    enum
+    {
+        value = 0
+    };
+};
+template <>
+struct static_is_lock_free_num<signed char>
+{
+    enum
+    {
+        value = ATOMIC_CHAR_LOCK_FREE
+    };
+};
+template <>
+struct static_is_lock_free_num<short>
+{
+    enum
+    {
+        value = ATOMIC_SHORT_LOCK_FREE
+    };
+};
+template <>
+struct static_is_lock_free_num<int>
+{
+    enum
+    {
+        value = ATOMIC_INT_LOCK_FREE
+    };
+};
+template <>
+struct static_is_lock_free_num<long>
+{
+    enum
+    {
+        value = ATOMIC_LONG_LOCK_FREE
+    };
+};
+template <>
+struct static_is_lock_free_num<long long>
+{
+    enum
+    {
+        value = ATOMIC_LLONG_LOCK_FREE
+    };
+};
+template <typename T>
+struct static_is_lock_free
+    : static_is_lock_free_num<typename std::make_signed<T>::type>
+{
+};
+template <>
+struct static_is_lock_free<bool>
+{
+    enum
+    {
+        value = ATOMIC_BOOL_LOCK_FREE
+    };
+};
+template <typename U>
+struct static_is_lock_free<U*>
+{
+    enum
+    {
+        value = ATOMIC_POINTER_LOCK_FREE
+    };
+};
+}
+
+struct ProducerToken
+{
+    template <typename T, typename Traits>
+    explicit ProducerToken(ConcurrentQueue<T, Traits>& queue);
+
+    template <typename T, typename Traits>
+    explicit ProducerToken(BlockingConcurrentQueue<T, Traits>& queue);
+
+    ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+        : producer(other.producer)
+    {
+        other.producer = nullptr;
+        if (producer != nullptr)
+        {
+            producer->token = this;
+        }
+    }
+
+    inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+    {
+        swap(other);
+        return *this;
+    }
+
+    void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT
+    {
+        std::swap(producer, other.producer);
+        if (producer != nullptr)
+        {
+            producer->token = this;
+        }
+        if (other.producer != nullptr)
+        {
+            other.producer->token = &other;
+        }
+    }
+
+    // A token is always valid unless:
+    // 1) Memory allocation failed during construction
+    // 2) It was moved via the move constructor
+    // (Note: assignment does a
swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const { return producer != nullptr; } + ~ProducerToken() + { + if (producer != nullptr) + { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template + friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + template + explicit ConsumerToken(BlockingConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), + lastKnownGlobalOffset(other.lastKnownGlobalOffset), + itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), + currentProducer(other.currentProducer), + desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template + friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. 
+// See
+// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
+template <typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
+                 typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b)
+    MOODYCAMEL_NOEXCEPT;
+
+template <typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class ConcurrentQueue
+{
+public:
+    typedef ::moodycamel::ProducerToken producer_token_t;
+    typedef ::moodycamel::ConsumerToken consumer_token_t;
+
+    typedef typename Traits::index_t index_t;
+    typedef typename Traits::size_t size_t;
+
+    static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);
+    static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD =
+        static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
+    static const size_t EXPLICIT_INITIAL_INDEX_SIZE =
+        static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
+    static const size_t IMPLICIT_INITIAL_INDEX_SIZE =
+        static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
+    static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE =
+        static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
+    static const std::uint32_t
+        EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE =
+            static_cast<std::uint32_t>(
+                Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4307) // + integral constant overflow (that's what
+                                // the ternary expression is for!)
+#pragma warning(disable : 4309) // static_cast: Truncation of constant value
+#endif
+    static const size_t MAX_SUBQUEUE_SIZE =
+        (details::const_numeric_max<size_t>::value -
+             static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) <
+         BLOCK_SIZE)
+            ? details::const_numeric_max<size_t>::value
+            : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) +
+                (BLOCK_SIZE - 1)) /
+               BLOCK_SIZE * BLOCK_SIZE);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+    static_assert(!std::numeric_limits<size_t>::is_signed &&
+                      std::is_integral<size_t>::value,
+                  "Traits::size_t must be an unsigned integral type");
+    static_assert(!std::numeric_limits<index_t>::is_signed &&
+                      std::is_integral<index_t>::value,
+                  "Traits::index_t must be an unsigned integral type");
+    static_assert(sizeof(index_t) >= sizeof(size_t),
+                  "Traits::index_t must be at least as wide as Traits::size_t");
+    static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)),
+                  "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
+    static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) &&
+                      !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD &
+                        (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)),
+                  "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a "
+                  "power of 2 (and greater than 1)");
+    static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) &&
+                      !(EXPLICIT_INITIAL_INDEX_SIZE &
+                        (EXPLICIT_INITIAL_INDEX_SIZE - 1)),
+                  "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 "
+                  "(and greater than 1)");
+    static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) &&
+                      !(IMPLICIT_INITIAL_INDEX_SIZE &
+                        (IMPLICIT_INITIAL_INDEX_SIZE - 1)),
+                  "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 "
+                  "(and greater than 1)");
+    static_assert(
+        (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) ||
+            !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE &
+              (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)),
+        "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
+    static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 ||
+                      INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1,
+                  "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at "
+                  "least 1 (or 0 to disable implicit enqueueing)");
+
+public:
+    // Creates a queue with at least `capacity` element slots; note that the
+    // actual number of elements that can be
inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be + // allocated + // up-front, which means only a single producer will be able to enqueue + // elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads + // (this + // includes making the memory effects of construction visible, possibly with + // a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr) + , producerCount(0) + , initialBlockPoolIndex(0) + , nextExplicitConsumerId(0) + , globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list( + capacity / BLOCK_SIZE + + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, + size_t maxImplicitProducers) + : producerListTail(nullptr) + , producerCount(0) + , initialBlockPoolIndex(0) + , nextExplicitConsumerId(0) + , globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * + (maxExplicitProducers + 1) + + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. 
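+    // For example, a typical shutdown joins all worker threads before the
+    // queue goes out of scope (usage sketch only; startWorkers and Task are
+    // hypothetical):
+    //
+    //     {
+    //         moodycamel::ConcurrentQueue<Task> q;
+    //         std::vector<std::thread> workers = startWorkers(q);
+    //         ...
+    //         for (auto& t : workers)
+    //             t.join(); // makes the workers' memory effects visible
+    //     } // only now may the destructor below run safely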
+ ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) + { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) + { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) + { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) + { + auto prev = hash->prev; + if (prev != nullptr) + { // The last hash is part of this object and was not allocated + // dynamically + for (size_t i = 0; i != hash->capacity; ++i) + { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) + { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) + { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) + MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). + ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail( + other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex( + other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId( + other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load( + std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty + // queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(other.explicitProducers.load( + std::memory_order_relaxed), + std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load( + std::memory_order_relaxed), + std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) + 
        MOODYCAMEL_NOEXCEPT
+    {
+        return swap_internal(other);
+    }
+
+    // Swaps this queue's state with the other's. Not thread-safe.
+    // Swapping two queues does not invalidate their tokens, however
+    // the tokens that were created for one queue must be used with
+    // only the swapped queue (i.e. the tokens are tied to the
+    // queue's movable state, not the object itself).
+    inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
+    {
+        swap_internal(other);
+    }
+
+private:
+    ConcurrentQueue& swap_internal(ConcurrentQueue& other)
+    {
+        if (this == &other)
+        {
+            return *this;
+        }
+
+        details::swap_relaxed(producerListTail, other.producerListTail);
+        details::swap_relaxed(producerCount, other.producerCount);
+        details::swap_relaxed(initialBlockPoolIndex,
+                              other.initialBlockPoolIndex);
+        std::swap(initialBlockPool, other.initialBlockPool);
+        std::swap(initialBlockPoolSize, other.initialBlockPoolSize);
+        freeList.swap(other.freeList);
+        details::swap_relaxed(nextExplicitConsumerId,
+                              other.nextExplicitConsumerId);
+        details::swap_relaxed(globalExplicitConsumerOffset,
+                              other.globalExplicitConsumerOffset);
+
+        swap_implicit_producer_hashes(other);
+
+        reown_producers();
+        other.reown_producers();
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+        details::swap_relaxed(explicitProducers, other.explicitProducers);
+        details::swap_relaxed(implicitProducers, other.implicitProducers);
+#endif
+
+        return *this;
+    }
+
+public:
+    // Enqueues a single item (by copying it).
+    // Allocates memory if required. Only fails if memory allocation fails
+    // (or implicit production is disabled because
+    // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(T const& item)
+    {
+        if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+            return false;
+        return inner_enqueue<CanAlloc>(item);
+    }
+
+    // Enqueues a single item (by moving it, if possible).
+    // Allocates memory if required. Only fails if memory allocation fails
+    // (or implicit production is disabled because
+    // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(T&& item)
+    {
+        if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+            return false;
+        return inner_enqueue<CanAlloc>(std::move(item));
+    }
+
+    // Enqueues a single item (by copying it) using an explicit producer
+    // token.
+    // Allocates memory if required. Only fails if memory allocation fails (or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(producer_token_t const& token, T const& item)
+    {
+        return inner_enqueue<CanAlloc>(token, item);
+    }
+
+    // Enqueues a single item (by moving it, if possible) using an explicit
+    // producer token.
+    // Allocates memory if required. Only fails if memory allocation fails (or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Thread-safe.
+    inline bool enqueue(producer_token_t const& token, T&& item)
+    {
+        return inner_enqueue<CanAlloc>(token, std::move(item));
+    }
+
+    // Enqueues several items.
+    // Allocates memory if required. Only fails if memory allocation fails (or
+    // implicit production is disabled because
+    // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or
+    // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
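+    // For instance (usage sketch only; makeBatch is hypothetical):
+    //
+    //     std::vector<std::string> batch = makeBatch();
+    //     q.enqueue_bulk(std::make_move_iterator(batch.begin()),
+    //                    batch.size());
+    //     // the strings are moved out of `batch` instead of being copied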
+    template <typename It>
+    bool enqueue_bulk(It itemFirst, size_t count)
+    {
+        if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+            return false;
+        return inner_enqueue_bulk<CanAlloc>(itemFirst, count);
+    }
+
+    // Enqueues several items using an explicit producer token.
+    // Allocates memory if required. Only fails if memory allocation fails
+    // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
+    template <typename It>
+    bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+    {
+        return inner_enqueue_bulk<CanAlloc>(token, itemFirst, count);
+    }
+
+    // Enqueues a single item (by copying it).
+    // Does not allocate memory. Fails if not enough room to enqueue (or
+    // implicit production is disabled because
+    // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+    // Thread-safe.
+    inline bool try_enqueue(T const& item)
+    {
+        if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+            return false;
+        return inner_enqueue<CannotAlloc>(item);
+    }
+
+    // Enqueues a single item (by moving it, if possible).
+    // Does not allocate memory (except for one-time implicit producer).
+    // Fails if not enough room to enqueue (or implicit production is
+    // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+    // Thread-safe.
+    inline bool try_enqueue(T&& item)
+    {
+        if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+            return false;
+        return inner_enqueue<CannotAlloc>(std::move(item));
+    }
+
+    // Enqueues a single item (by copying it) using an explicit producer
+    // token.
+    // Does not allocate memory. Fails if not enough room to enqueue.
+    // Thread-safe.
+    inline bool try_enqueue(producer_token_t const& token, T const& item)
+    {
+        return inner_enqueue<CannotAlloc>(token, item);
+    }
+
+    // Enqueues a single item (by moving it, if possible) using an explicit
+    // producer token.
+    // Does not allocate memory. Fails if not enough room to enqueue.
+    // Thread-safe.
+    inline bool try_enqueue(producer_token_t const& token, T&& item)
+    {
+        return inner_enqueue<CannotAlloc>(token, std::move(item));
+    }
+
+    // Enqueues several items.
+    // Does not allocate memory (except for one-time implicit producer).
+    // Fails if not enough room to enqueue (or implicit production is
+    // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
+    template <typename It>
+    bool try_enqueue_bulk(It itemFirst, size_t count)
+    {
+        if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
+            return false;
+        return inner_enqueue_bulk<CannotAlloc>(itemFirst, count);
+    }
+
+    // Enqueues several items using an explicit producer token.
+    // Does not allocate memory. Fails if not enough room to enqueue.
+    // Note: Use std::make_move_iterator if the elements should be moved
+    // instead of copied.
+    // Thread-safe.
+    template <typename It>
+    bool try_enqueue_bulk(producer_token_t const& token, It itemFirst,
+                          size_t count)
+    {
+        return inner_enqueue_bulk<CannotAlloc>(token, itemFirst, count);
+    }
+
+    // Attempts to dequeue from the queue.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template <typename U>
+    bool try_dequeue(U& item)
+    {
+        // Instead of simply trying each producer in turn (which could cause
+        // needless contention on the first producer), we score them
+        // heuristically.
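+        // (Concretely: the loop below inspects up to three non-empty
+        // producers, starts with the one whose approximate size is largest,
+        // and only if that fails falls back to trying every producer.)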
+        size_t nonEmptyCount = 0;
+        ProducerBase* best = nullptr;
+        size_t bestSize = 0;
+        for (auto ptr = producerListTail.load(std::memory_order_acquire);
+             nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod())
+        {
+            auto size = ptr->size_approx();
+            if (size > 0)
+            {
+                if (size > bestSize)
+                {
+                    bestSize = size;
+                    best = ptr;
+                }
+                ++nonEmptyCount;
+            }
+        }
+
+        // If there was at least one non-empty queue but it appears empty at
+        // the time we try to dequeue from it, we need to make sure every
+        // queue's been tried
+        if (nonEmptyCount > 0)
+        {
+            if (details::likely(best->dequeue(item)))
+            {
+                return true;
+            }
+            for (auto ptr = producerListTail.load(std::memory_order_acquire);
+                 ptr != nullptr; ptr = ptr->next_prod())
+            {
+                if (ptr != best && ptr->dequeue(item))
+                {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue from the queue.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // This differs from the try_dequeue(item) method in that this one does
+    // not attempt to reduce contention by interleaving the order that
+    // producer streams are dequeued from. So, using this method can reduce
+    // overall throughput under contention, but will give more predictable
+    // results in single-threaded consumer scenarios. This is mostly only
+    // useful for internal unit tests.
+    // Never allocates. Thread-safe.
+    template <typename U>
+    bool try_dequeue_non_interleaved(U& item)
+    {
+        for (auto ptr = producerListTail.load(std::memory_order_acquire);
+             ptr != nullptr; ptr = ptr->next_prod())
+        {
+            if (ptr->dequeue(item))
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue from the queue using an explicit consumer token.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
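+    // A dedicated consumer thread might drive it like this (usage sketch
+    // only; `running` and process() are hypothetical):
+    //
+    //     moodycamel::ConsumerToken ctok(q);
+    //     Item item;
+    //     while (running.load())
+    //         while (q.try_dequeue(ctok, item))
+    //             process(item);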
+    template <typename U>
+    bool try_dequeue(consumer_token_t& token, U& item)
+    {
+        // The idea is roughly as follows:
+        // Every 256 items from one producer, make everyone rotate (increase
+        // the global offset) -> this means the highest efficiency consumer
+        // dictates the rotation speed of everyone else, more or less
+        // If you see that the global offset has changed, you must reset your
+        // consumption counter and move to your designated place
+        // If there's no items where you're supposed to be, keep moving until
+        // you find a producer with some items
+        // If the global offset has not changed but you've run out of items
+        // to consume, move over from your current position until you find a
+        // producer with something in it
+
+        if (token.desiredProducer == nullptr ||
+            token.lastKnownGlobalOffset !=
+                globalExplicitConsumerOffset.load(std::memory_order_relaxed))
+        {
+            if (!update_current_producer_after_rotation(token))
+            {
+                return false;
+            }
+        }
+
+        // If there was at least one non-empty queue but it appears empty at
+        // the time we try to dequeue from it, we need to make sure every
+        // queue's been tried
+        if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item))
+        {
+            if (++token.itemsConsumedFromCurrent ==
+                EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE)
+            {
+                globalExplicitConsumerOffset.fetch_add(
+                    1, std::memory_order_relaxed);
+            }
+            return true;
+        }
+
+        auto tail = producerListTail.load(std::memory_order_acquire);
+        auto ptr =
+            static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+        if (ptr == nullptr)
+        {
+            ptr = tail;
+        }
+        while (ptr != static_cast<ProducerBase*>(token.currentProducer))
+        {
+            if (ptr->dequeue(item))
+            {
+                token.currentProducer = ptr;
+                token.itemsConsumedFromCurrent = 1;
+                return true;
+            }
+            ptr = ptr->next_prod();
+            if (ptr == nullptr)
+            {
+                ptr = tail;
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue several elements from the queue.
+    // Returns the number of items actually dequeued.
+    // Returns 0 if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template <typename It>
+    size_t try_dequeue_bulk(It itemFirst, size_t max)
+    {
+        size_t count = 0;
+        for (auto ptr = producerListTail.load(std::memory_order_acquire);
+             ptr != nullptr; ptr = ptr->next_prod())
+        {
+            count += ptr->dequeue_bulk(itemFirst, max - count);
+            if (count == max)
+            {
+                break;
+            }
+        }
+        return count;
+    }
+
+    // Attempts to dequeue several elements from the queue using an explicit
+    // consumer token.
+    // Returns the number of items actually dequeued.
+    // Returns 0 if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
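+    // E.g. draining in batches (usage sketch only; process() is
+    // hypothetical):
+    //
+    //     Item buf[64];
+    //     std::size_t count;
+    //     while ((count = q.try_dequeue_bulk(ctok, buf, 64)) != 0)
+    //         for (std::size_t i = 0; i != count; ++i)
+    //             process(buf[i]);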
+ template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + if (!update_current_producer_after_rotation(token)) + { + return 0; + } + } + + size_t count = static_cast(token.currentProducer) + ->dequeue_bulk(itemFirst, max); + if (count == max) + { + if ((token.itemsConsumedFromCurrent += static_cast( + max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) + { + globalExplicitConsumerOffset.fetch_add( + 1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = + static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) + { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) + { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) + { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = + static_cast(dequeued); + } + if (dequeued == max) + { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) + { + ptr = tail; + } + } + return count; + } + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, + U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner + // queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer( + producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer) + ->dequeue_bulk(itemFirst, max); + } + + // Returns an estimate of the total number of elements currently in the + // queue. This + // estimate is only accurate if the queue has completely stabilized before + // it is called + // (i.e. all enqueue and dequeue operations have completed and their memory + // effects are + // visible on the calling thread, and no further operations start while this + // method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) + { + size += ptr->size_approx(); + } + return size; + } + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
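+    // E.g. a sanity check at startup (usage sketch only):
+    //
+    //     assert(moodycamel::ConcurrentQueue<int>::is_lock_free());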
+    static bool is_lock_free()
+    {
+        return details::static_is_lock_free<bool>::value == 2 &&
+               details::static_is_lock_free<size_t>::value == 2 &&
+               details::static_is_lock_free<std::uint32_t>::value == 2 &&
+               details::static_is_lock_free<index_t>::value == 2 &&
+               details::static_is_lock_free<void*>::value == 2 &&
+               details::static_is_lock_free<
+                   typename details::thread_id_converter<
+                       details::thread_id_t>::thread_id_numeric_size_t>::
+                       value == 2;
+    }
+
+private:
+    friend struct ProducerToken;
+    friend struct ConsumerToken;
+    friend struct ExplicitProducer;
+    friend class ConcurrentQueueTests;
+
+    enum AllocationMode
+    {
+        CanAlloc,
+        CannotAlloc
+    };
+
+    ///////////////////////////////
+    // Queue methods
+    ///////////////////////////////
+
+    template <AllocationMode canAlloc, typename U>
+    inline bool inner_enqueue(producer_token_t const& token, U&& element)
+    {
+        return static_cast<ExplicitProducer*>(token.producer)
+            ->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(
+                std::forward<U>(element));
+    }
+
+    template <AllocationMode canAlloc, typename U>
+    inline bool inner_enqueue(U&& element)
+    {
+        auto producer = get_or_add_implicit_producer();
+        return producer == nullptr
+                   ? false
+                   : producer
+                         ->ConcurrentQueue::ImplicitProducer::template enqueue<
+                             canAlloc>(std::forward<U>(element));
+    }
+
+    template <AllocationMode canAlloc, typename It>
+    inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst,
+                                   size_t count)
+    {
+        return static_cast<ExplicitProducer*>(token.producer)
+            ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<
+                canAlloc>(itemFirst, count);
+    }
+
+    template <AllocationMode canAlloc, typename It>
+    inline bool inner_enqueue_bulk(It itemFirst, size_t count)
+    {
+        auto producer = get_or_add_implicit_producer();
+        return producer == nullptr
+                   ? false
+                   : producer->ConcurrentQueue::ImplicitProducer::
+                         template enqueue_bulk<canAlloc>(itemFirst, count);
+    }
+
+    inline bool update_current_producer_after_rotation(consumer_token_t& token)
+    {
+        // Ah, there's been a rotation, figure out where we should be!
+        auto tail = producerListTail.load(std::memory_order_acquire);
+        if (token.desiredProducer == nullptr && tail == nullptr)
+        {
+            return false;
+        }
+        auto prodCount = producerCount.load(std::memory_order_relaxed);
+        auto globalOffset =
+            globalExplicitConsumerOffset.load(std::memory_order_relaxed);
+        if (details::unlikely(token.desiredProducer == nullptr))
+        {
+            // Aha, first time we're dequeueing anything.
+            // Figure out our local position
+            // Note: offset is from start, not end, but we're traversing from
+            // end -- subtract from count first
+            std::uint32_t offset =
+                prodCount - 1 - (token.initialOffset % prodCount);
+            token.desiredProducer = tail;
+            for (std::uint32_t i = 0; i != offset; ++i)
+            {
+                token.desiredProducer =
+                    static_cast<ProducerBase*>(token.desiredProducer)
+                        ->next_prod();
+                if (token.desiredProducer == nullptr)
+                {
+                    token.desiredProducer = tail;
+                }
+            }
+        }
+
+        std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;
+        if (delta >= prodCount)
+        {
+            delta = delta % prodCount;
+        }
+        for (std::uint32_t i = 0; i != delta; ++i)
+        {
+            token.desiredProducer =
+                static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+            if (token.desiredProducer == nullptr)
+            {
+                token.desiredProducer = tail;
+            }
+        }
+
+        token.lastKnownGlobalOffset = globalOffset;
+        token.currentProducer = token.desiredProducer;
+        token.itemsConsumedFromCurrent = 0;
+        return true;
+    }
+
+    ///////////////////////////
+    // Free list
+    ///////////////////////////
+
+    template <typename N>
+    struct FreeListNode
+    {
+        FreeListNode()
+            : freeListRefs(0)
+            , freeListNext(nullptr)
+        {
+        }
+
+        std::atomic<std::uint32_t> freeListRefs;
+        std::atomic<N*> freeListNext;
+    };
+
+    // A simple CAS-based lock-free free list.
Not the fastest thing in the + // world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free + // list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields + // (and initialization of them) + struct FreeList + { + FreeList() + : freeListHead(nullptr) + { + } + FreeList(FreeList&& other) + : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) + { + other.freeListHead.store(nullptr, std::memory_order_relaxed); + } + void swap(FreeList& other) + { + details::swap_relaxed(freeListHead, other.freeListHead); + } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { +#if MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so + // it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, + std::memory_order_acq_rel) == 0) + { + // Oh look! We were the last ones referencing this node, and we + // know + // we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { +#if MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) + { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || + !head->freeListRefs.compare_exchange_strong( + refs, refs + 1, std::memory_order_acquire, + std::memory_order_relaxed)) + { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at + // zero), which means we can read the + // next and not worry about it changing between now and the time + // we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong( + head, next, std::memory_order_acquire, + std::memory_order_relaxed)) + { + // Yay, got the node. This means it was on the list, which + // means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been + // taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & + SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for + // the list's ref + head->freeListRefs.fetch_add(-2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to + // decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we + // do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = + prevHead->freeListRefs.fetch_add(-1, + std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) + { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to + // destroy remaining nodes) + N* head_unsafe() const + { + return freeListHead.load(std::memory_order_relaxed); + } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's + // zero (except us, and we run + // only one copy of this method per node at a time, i.e. 
the single + // thread case), then we know + // we can safely change the next pointer of the node; however, once + // the refcount is back above + // zero, then other threads could increase it (happens under heavy + // contention, when the refcount + // goes to zero in between a load and a refcount increment of a node + // in try_get, then back up to + // something non-zero, then the refcount increment is done by the + // other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount + // and leave the add operation to + // the next thread who puts the refcount back at zero (which could + // be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) + { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong( + head, node, std::memory_order_release, + std::memory_order_relaxed)) + { + // Hmm, the add failed, but we can only try again when the + // refcount goes back to zero + if (node->freeListRefs.fetch_add( + SHOULD_BE_ON_FREELIST - 1, + std::memory_order_release) == 1) + { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes + // are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#if MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext + { + implicit_context = 0, + explicit_context = 1 + }; + + struct Block + { + Block() + : next(nullptr) + , elementsCompletelyDequeued(0) + , freeListRefs(0) + , freeListNext(nullptr) + , shouldBeOnFreeList(false) + , dynamicallyAllocated(true) + { +#if MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + if (context == explicit_context && + BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) + { + if (!emptyFlags[i].load(std::memory_order_relaxed)) + { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that + // happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else + { + // Check counter + if (elementsCompletelyDequeued.load( + std::memory_order_relaxed) == BLOCK_SIZE) + { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load( + std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit + // context) + template + inline bool set_empty(index_t i) + { + if (context == explicit_context && + BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - + static_cast(i & static_cast( + BLOCK_SIZE - 1))] + .load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - + static_cast( + i & static_cast(BLOCK_SIZE - 1))] + .store(true, std::memory_order_release); + return false; + } + else + { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add( + 1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no + // wrapping 
and count > 0). + // Returns true if the block is now empty (does not apply in explicit + // context). + template + inline bool set_many_empty(index_t i, size_t count) + { + if (context == explicit_context && + BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - + static_cast(i & + static_cast(BLOCK_SIZE - 1)) - + count + 1; + for (size_t j = 0; j != count; ++j) + { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else + { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add( + count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + if (context == explicit_context && + BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) + { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else + { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, + std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + if (context == explicit_context && + BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) + { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else + { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT + { + return static_cast(static_cast(elements)) + + static_cast(idx & + static_cast(BLOCK_SIZE - 1)); + } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT + { + return static_cast(static_cast(elements)) + + static_cast(idx & + static_cast(BLOCK_SIZE - 1)); + } + + private: + // IMPORTANT: This must be the first member in Block, so that if T + // depends on the alignment of + // addresses returned by malloc, that alignment will be preserved. + // Apparently clang actually + // generates code that uses this assumption for AVX instructions in some + // cases. Ideally, we + // should also align Block to the alignment of T in case it's higher + // than malloc's 16-byte + // alignment, but this is hard to do in a cross-platform way. Assert for + // this case: + static_assert( + std::alignment_of::value <= + std::alignment_of::value, + "The queue does not support super-aligned types at this time"); + // Additionally, we need the alignment of Block itself to be a multiple + // of max_align_t since + // otherwise the appropriate padding will not be added at the end of + // Block in order to make + // arrays of Blocks all be properly aligned (not just the first one). We + // use a union to force + // this. + union { + char elements[sizeof(T) * BLOCK_SIZE]; + details::max_align_t dummy; + }; + + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic + emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD + ? 
BLOCK_SIZE + : 1]; + + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be + // 'isNotPartOfInitialBlockPool' + +#if MCDBGQ_TRACKMEM + void* owner; +#endif + }; + static_assert(std::alignment_of::value >= + std::alignment_of::value, + "Internal error: Blocks must be at least as aligned as the " + "type they are wrapping"); + +#if MCDBGQ_TRACKMEM +public: + struct MemStats; + +private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) + : tailIndex(0) + , headIndex(0) + , dequeueOptimisticCount(0) + , dequeueOvercommit(0) + , tailBlock(nullptr) + , isExplicit(isExplicit_) + , parent(parent_) + { + } + + virtual ~ProducerBase(){}; + + template + inline bool dequeue(U& element) + { + if (isExplicit) + { + return static_cast(this)->dequeue(element); + } + else + { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) + { + return static_cast(this)->dequeue_bulk( + itemFirst, max); + } + else + { + return static_cast(this)->dequeue_bulk( + itemFirst, max); + } + } + + inline ProducerBase* next_prod() const + { + return static_cast(next); + } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) + ? static_cast(tail - head) + : 0; + } + + inline index_t getTail() const + { + return tailIndex.load(std::memory_order_relaxed); + } + + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: +#if MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* parent) + : ProducerBase(parent, true) + , blockIndex(nullptr) + , pr_blockIndexSlotsUsed(0) + , pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1) + , pr_blockIndexFront(0) + , pr_blockIndexEntries(nullptr) + , pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = + details::ceil_to_pow_2(parent->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) + { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number + // of current entries, i.e. + // EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
+ if (this->tailBlock != nullptr) + { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) != 0) + { + // The head's not on a block boundary, meaning a block + // somewhere is partially dequeued + // (or the head block is the tail block and was fully + // dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & + (pr_blockIndexSize - 1); + while (details::circular_less_than( + pr_blockIndexEntries[i].base + BLOCK_SIZE, + this->headIndex.load(std::memory_order_relaxed))) + { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than( + pr_blockIndexEntries[i].base, + this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop + // gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do + { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty< + explicit_context>()) + { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) + { + i = static_cast( + this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the + // tail block, we need to stop when we reach the tail index + auto lastValidIndex = + (this->tailIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) == 0 + ? BLOCK_SIZE + : static_cast( + this->tailIndex.load( + std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && + (block != this->tailBlock || i != lastValidIndex)) + { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) + { + auto block = this->tailBlock; + do + { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) + { + destroy(block); + } + else + { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) + { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) + { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && + this->tailBlock->next->ConcurrentQueue::Block:: + template is_empty()) + { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock + ->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + + // We'll put the block on the block index (guaranteed to be + // room since we're conceptually removing the + // last block from it first -- except instead of removing + // then adding, we can just overwrite). 
+ // Note that there must be a valid block index here, since + // even if allocation failed in the ctor, + // it would have been re-attempted when adding the first + // block to the queue; since there is such + // a block, a block index must have been successfully + // allocated. + } + else + { + // Whatever head value we see here is >= the last value we + // saw here (relatively), + // and <= its current value. Since we have the most recent + // tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert( + !details::circular_less_than(currentTailIndex, + head)); + if (!details::circular_less_than(head, + currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head))) + { + // We can't enqueue in another block because there's not + // enough leeway -- the + // tail could surpass the head by the time the block + // fills up! (Or we'll exceed + // the size limit, if the second part of the condition + // was true.) + return false; + } + // We're going to need a new block; check that the block + // index has room + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize) + { + // Hmm, the circular block index is already full -- + // we'll need + // to allocate a new index. Note pr_blockIndexRaw can + // only be nullptr if + // the initial allocation failed in the constructor. + + if (allocMode == CannotAlloc || + !new_block_index(pr_blockIndexSlotsUsed)) + { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue:: + template requisition_block(); + if (newBlock == nullptr) + { + return false; + } +#if MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (this->tailBlock == nullptr) + { + newBlock->next = newBlock; + } + else + { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, + new (nullptr) + T(std::forward(element)))) + { + // The constructor may throw. We want the element not to + // appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY + { + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) + { + // Revert change to the current block, but leave the new + // block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? 
+                                          this->tailBlock
+                                          : startBlock;
+                        MOODYCAMEL_RETHROW;
+                    }
+                }
+                else
+                {
+                    (void)startBlock;
+                    (void)originalBlockIndexSlotsUsed;
+                }
+
+                // Add block to block index
+                auto& entry = blockIndex.load(std::memory_order_relaxed)
+                                  ->entries[pr_blockIndexFront];
+                entry.base = currentTailIndex;
+                entry.block = this->tailBlock;
+                blockIndex.load(std::memory_order_relaxed)
+                    ->front.store(pr_blockIndexFront,
+                                  std::memory_order_release);
+                pr_blockIndexFront =
+                    (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+
+                if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U,
+                                              new (nullptr)
+                                                  T(std::forward<U>(element))))
+                {
+                    this->tailIndex.store(newTailIndex,
+                                          std::memory_order_release);
+                    return true;
+                }
+            }
+
+            // Enqueue
+            new ((*this->tailBlock)[currentTailIndex])
+                T(std::forward<U>(element));
+
+            this->tailIndex.store(newTailIndex, std::memory_order_release);
+            return true;
+        }
+
+        template <typename U>
+        bool dequeue(U& element)
+        {
+            auto tail = this->tailIndex.load(std::memory_order_relaxed);
+            auto overcommit =
+                this->dequeueOvercommit.load(std::memory_order_relaxed);
+            if (details::circular_less_than<index_t>(
+                    this->dequeueOptimisticCount.load(
+                        std::memory_order_relaxed) -
+                        overcommit,
+                    tail))
+            {
+                // Might be something to dequeue, let's give it a try
+
+                // Note that this if is purely for performance purposes in the
+                // common case when the queue is
+                // empty and the values are eventually consistent -- we may
+                // enter here spuriously.
+
+                // Note that whatever the values of overcommit and tail are,
+                // they are not going to change (unless we
+                // change them) and must be the same value at this point
+                // (inside the if) as when the if condition was
+                // evaluated.
+
+                // We insert an acquire fence here to synchronize-with the
+                // release upon incrementing dequeueOvercommit below.
+                // This ensures that whatever value we loaded into overcommit,
+                // the load of dequeueOptimisticCount in
+                // the fetch_add below will result in a value at least as
+                // recent as that (and therefore at least as large).
+                // Note that I believe a compiler (signal) fence here would be
+                // sufficient due to the nature of fetch_add (all
+                // read-modify-write operations are guaranteed to work on the
+                // latest value in the modification order), but
+                // unfortunately that can't be shown to be correct using only
+                // the C++11 standard.
+                // See
+                // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
+                std::atomic_thread_fence(std::memory_order_acquire);
+
+                // Increment optimistic counter, then check if it went over
+                // the boundary
+                auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(
+                    1, std::memory_order_relaxed);
+
+                // Note that since dequeueOvercommit must be <=
+                // dequeueOptimisticCount (because dequeueOvercommit is only
+                // ever
+                // incremented after dequeueOptimisticCount -- this is enforced
+                // in the `else` block below), and since we now
+                // have a version of dequeueOptimisticCount that is at least as
+                // recent as overcommit (due to the release upon
+                // incrementing dequeueOvercommit and the acquire above that
+                // synchronizes with it), overcommit <= myDequeueCount.
+                assert(overcommit <= myDequeueCount);
+
+                // Note that we reload tail here in case it changed; it will
+                // be the same value as before or greater, since this load is
+                // sequenced after (happens after) the earlier load above.
+                // This is supported by read-read coherency (as defined in
+                // the standard), explained here:
+                // http://en.cppreference.com/w/cpp/atomic/memory_order
+                tail = this->tailIndex.load(std::memory_order_acquire);
+                if (details::likely(details::circular_less_than<index_t>(
+                        myDequeueCount - overcommit, tail)))
+                {
+                    // Guaranteed to be at least one element to dequeue!
+
+                    // Get the index. Note that since there's guaranteed to be
+                    // at least one element, this
+                    // will never exceed tail. We need to do an acquire-release
+                    // fence here since it's possible
+                    // that whatever condition got us to this point was for an
+                    // earlier enqueued element (that
+                    // we already see the memory effects for), but that by the
+                    // time we increment somebody else
+                    // has incremented it, and we need to see the memory
+                    // effects for *that* element, which, in such a case, is
+                    // necessarily visible on the thread that incremented it
+                    // in the first place with the more current condition
+                    // (they must have acquired a tail that is at least as
+                    // recent).
+                    auto index =
+                        this->headIndex.fetch_add(1,
+                                                  std::memory_order_acq_rel);
+
+                    // Determine which block the element is in
+
+                    auto localBlockIndex =
+                        blockIndex.load(std::memory_order_acquire);
+                    auto localBlockIndexHead =
+                        localBlockIndex->front.load(std::memory_order_acquire);
+
+                    // We need to be careful here about subtracting and
+                    // dividing because of index wrap-around.
+                    // When an index wraps, we need to preserve the sign of
+                    // the offset when dividing it by the
+                    // block size (in order to get a correct signed block
+                    // count offset in all cases):
+                    auto headBase =
+                        localBlockIndex->entries[localBlockIndexHead].base;
+                    auto blockBaseIndex =
+                        index & ~static_cast<index_t>(BLOCK_SIZE - 1);
+                    auto offset = static_cast<size_t>(
+                        static_cast<typename std::make_signed<index_t>::type>(
+                            blockBaseIndex - headBase) /
+                        BLOCK_SIZE);
+                    auto block = localBlockIndex
+                                     ->entries[(localBlockIndexHead + offset) &
+                                               (localBlockIndex->size - 1)]
+                                     .block;
+
+                    // Dequeue
+                    auto& el = *((*block)[index]);
+                    if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&,
+                                                    element = std::move(el)))
+                    {
+                        // Make sure the element is still fully dequeued and
+                        // destroyed even if the assignment
+                        // throws
+                        struct Guard
+                        {
+                            Block* block;
+                            index_t index;
+
+                            ~Guard()
+                            {
+                                (*block)[index]->~T();
+                                block->ConcurrentQueue::Block::
+                                    template set_empty<explicit_context>(
+                                        index);
+                            }
+                        } guard = {block, index};
+
+                        element = std::move(el);
+                    }
+                    else
+                    {
+                        element = std::move(el);
+                        el.~T();
+                        block->ConcurrentQueue::Block::template set_empty<
+                            explicit_context>(index);
+                    }
+
+                    return true;
+                }
+                else
+                {
+                    // Wasn't anything to dequeue after all; make the effective
+                    // dequeue count eventually consistent
+                    this->dequeueOvercommit.fetch_add(
+                        1, std::memory_order_release); // Release so that the
+                                                       // fetch_add on
+                                                       // dequeueOptimisticCount
+                                                       // is guaranteed to
+                                                       // happen before this
+                                                       // write
+                }
+            }
+
+            return false;
+        }
+
+        template <AllocationMode allocMode, typename It>
+        bool enqueue_bulk(It itemFirst, size_t count)
+        {
+            // First, we need to make sure we have enough room to enqueue all
+            // of the elements;
+            // this means pre-allocating blocks and putting them in the block
+            // index (but only if
+            // all the allocations succeeded).
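+            // (Illustrative sketch, not part of the upstream header; kBlock,
+            // base and blocks_needed are our own names.) The blockBaseDiff
+            // arithmetic below rounds the old and new tail down to block
+            // boundaries; their difference is exactly the new capacity that
+            // must be provisioned, counted here in whole blocks:
+            //
+            //     #include <cstddef>
+            //     constexpr std::size_t kBlock = 32; // a power of two
+            //     constexpr std::size_t base(std::size_t i)
+            //     {
+            //         return i & ~(kBlock - 1);
+            //     }
+            //     constexpr std::size_t blocks_needed(std::size_t tail,
+            //                                         std::size_t n)
+            //     {
+            //         return (base(tail + n - 1) - base(tail - 1)) / kBlock;
+            //     }
+            //     static_assert(blocks_needed(32, 1) == 1, "crosses boundary");
+            //     static_assert(blocks_needed(33, 31) == 0, "fits in block");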
+ index_t startTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) + { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && + this->tailBlock->next != firstAllocatedBlock && + this->tailBlock->next->ConcurrentQueue::Block:: + template is_empty()) + { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) + { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert( + !details::circular_less_than(currentTailIndex, + head)); + bool full = + !details::circular_less_than(head, + currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize || full) + { + if (allocMode == CannotAlloc || full || + !new_block_index(originalBlockIndexSlotsUsed)) + { + // Failed to allocate, undo changes (but keep + // injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = + originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? firstAllocatedBlock + : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, + // so we need to + // update our fallback value too (since we keep the new + // index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue:: + template requisition_block(); + if (newBlock == nullptr) + { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? firstAllocatedBlock + : startBlock; + return false; + } + +#if MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty< + explicit_context>(); + if (this->tailBlock == nullptr) + { + newBlock->next = newBlock; + } + else + { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? 
this->tailBlock + : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's + // emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) + { + block->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (block == this->tailBlock) + { + break; + } + block = block->next; + } + + if (MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (nullptr) T(details::deref_noexcept(itemFirst)))) + { + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & + (pr_blockIndexSize - 1), + std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != + 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) + { + this->tailBlock = firstAllocatedBlock; + } + while (true) + { + auto stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, + stopIndex)) + { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (nullptr) T(details::deref_noexcept(itemFirst)))) + { + while (currentTailIndex != stopIndex) + { + new ((*this->tailBlock)[currentTailIndex++]) + T(*itemFirst++); + } + } + else + { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) + { + // Must use copy constructor even if move + // constructor is available + // because we may have to revert if there's an + // exception. + // Sorry about the horrible templated next line, but + // it was the only way + // to disable moving *at compile time*, which is + // important because a type + // may only define a (noexcept) move constructor, + // and so calls to the + // cctor will not compile, even if they are in an if + // branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) + T(details::nomove_if< + (bool)!MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (nullptr) T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) + { + // Oh dear, an exception's been thrown -- destroy the + // elements that + // were enqueued so far and revert the entire bulk + // operation (we'll keep + // any allocated blocks in our linked list for later, + // though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? 
firstAllocatedBlock + : startBlock; + + if (!details::is_trivially_destructible::value) + { + auto block = startBlock; + if ((startTailIndex & + static_cast(BLOCK_SIZE - 1)) == 0) + { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) + { + stopIndex = + (currentTailIndex & + ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than( + constructedStopIndex, stopIndex)) + { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) + { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) + { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) + { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (nullptr) T(details::deref_noexcept(itemFirst))) && + firstAllocatedBlock != nullptr) + { + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & + (pr_blockIndexSize - 1), + std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - + (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) + { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + assert(overcommit <= myDequeueCount); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) + { + actualCount = + desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) + { + this->dequeueOvercommit.fetch_add( + desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed + // to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = + this->headIndex.fetch_add(actualCount, + std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = + blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = + localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = + firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + firstBlockBaseIndex - headBase) / + BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & + (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do + { + auto firstIndexInBlock = index; + auto endIndex = + (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? 
firstIndex + static_cast(actualCount) + : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN( + T, T&&, details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) + { + while (index != endIndex) + { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else + { + MOODYCAMEL_TRY + { + while (index != endIndex) + { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) + { + // It's too late to revert the dequeue, but we + // can make sure that all + // the dequeued objects are properly destroyed + // and the block index + // (and empty count) are properly updated before + // we propagate the exception + do + { + block = localBlockIndex->entries[indexIndex] + .block; + while (index != endIndex) + { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block:: + template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast( + endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & + (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & + ~static_cast( + BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast( + actualCount), + endIndex) + ? firstIndex + static_cast( + actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast(endIndex - firstIndexInBlock)); + indexIndex = + (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else + { + // Wasn't anything to dequeue after all; make the effective + // dequeue count eventually consistent + this->dequeueOvercommit.fetch_add( + desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like + // pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast( + (Traits::malloc)(sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) + { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast( + details::align_for(newRawPtr + + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) + { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & + prevBlockSizeMask; + do + { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, + std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old + // one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + 
pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in + // referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer* nextExplicitProducer; + + private: +#endif + +#if MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue* parent) + : ProducerBase(parent, false) + , nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE) + , blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { +// Note that since we're in the destructor we can assume that all +// enqueue/dequeue operations +// completed already; this means that all undequeued elements are placed +// contiguously across +// contiguous blocks, and that only the first and last remaining blocks can be +// only partially +// empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) + { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = index != tail; // If we enter the loop, + // then the last (tail) + // block will not be freed + while (index != tail) + { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || + block == nullptr) + { + if (block != nullptr) + { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load( + std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on + // the free list + // (unless the head index reached the end of it, in which case the + // tail will be poised + // to create a new block). 
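+        // (Illustrative sketch, not part of the upstream header.) The
+        // boundary tests just above and below rely on BLOCK_SIZE being a
+        // power of two, so an index's low bits give its offset within a
+        // block and masking them off gives the block's base:
+        //
+        //     #include <cassert>
+        //     int main()
+        //     {
+        //         constexpr unsigned kBlockSize = 32; // a power of two
+        //         assert((64u & (kBlockSize - 1)) == 0u);   // on a boundary
+        //         assert((65u & (kBlockSize - 1)) == 1u);   // one slot in
+        //         assert((65u & ~(kBlockSize - 1)) == 64u); // block base
+        //     }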
+ if (this->tailBlock != nullptr && + (forceFreeLastBlock || + (tail & static_cast(BLOCK_SIZE - 1)) != 0)) + { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) + { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) + { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do + { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) + { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, + head)); + if (!details::circular_less_than(head, + currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head))) + { + return false; + } +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block + // index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, + currentTailIndex)) + { + return false; + } + + // Get ahold of a new block + auto newBlock = + this->parent->ConcurrentQueue::template requisition_block< + allocMode>(); + if (newBlock == nullptr) + { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#if MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context>(); + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, + new (nullptr) + T(std::forward(element)))) + { + // May throw, try to insert now before we publish the fact + // that we have this new block + MOODYCAMEL_TRY + { + new ((*newBlock)[currentTailIndex]) + T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) 
+ { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, + new (nullptr) + T(std::forward(element)))) + { + this->tailIndex.store(newTailIndex, + std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load( + std::memory_order_relaxed) - + overcommit, + tail)) + { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + assert(overcommit <= myDequeueCount); + tail = this->tailIndex.load(std::memory_order_acquire); + if (details::likely(details::circular_less_than( + myDequeueCount - overcommit, tail))) + { + index_t index = + this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, + element = std::move(el))) + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead + // of only when a block + // is released is very sub-optimal, but it is, after + // all, purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard + { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block:: + template set_empty( + index)) + { + entry->value.store( + nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = {block, index, entry, this->parent}; + + element = std::move(el); + } + else + { + element = std::move(el); + el.~T(); + + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context>(index)) + { + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool + // (and remove from block index) + entry->value.store(nullptr, + std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + } + + return true; + } + else + { + this->dequeueOvercommit.fetch_add( + 1, std::memory_order_release); + } + } + + return false; + } + + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of + // the elements; + // this means pre-allocating blocks and putting them in the block + // index (but only if + // all the allocations succeeded). 
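+            // (Illustrative sketch, not part of the upstream header; slotFor
+            // is our own name.) The implicit producer's block index used
+            // below is a power-of-two ring of (base index, Block*) entries;
+            // a lookup walks from the tail slot by a signed block offset,
+            // using division rather than a shift so that a negative offset
+            // caused by index wrap-around keeps its sign:
+            //
+            //     #include <cstddef>
+            //     #include <cstdint>
+            //     inline std::size_t slotFor(std::size_t tailSlot,
+            //                                std::uint64_t tailBase,
+            //                                std::uint64_t base,
+            //                                std::size_t capacity, // pow. of 2
+            //                                std::uint64_t blockSize)
+            //     {
+            //         auto offset =
+            //             static_cast<std::int64_t>(base - tailBase) /
+            //             static_cast<std::int64_t>(blockSize);
+            //         return (tailSlot + static_cast<std::size_t>(offset)) &
+            //                (capacity - 1);
+            //     }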
+ + // Note that the tailBlock we start off with may not be owned by us + // any more; + // this happens if it was filled up exactly to the top (setting + // tailIndex to + // the first index of the next block which is not yet allocated), + // then dequeued + // completely (putting it on the free list) before we enqueue again. + + index_t startTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do + { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block + // index + BlockIndexEntry* idxEntry = + nullptr; // initialization here unnecessary but compiler + // can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert( + !details::circular_less_than(currentTailIndex, + head)); + bool full = + !details::circular_less_than(head, + currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head)); + if (full || + !(indexInserted = insert_block_index_entry( + idxEntry, currentTailIndex)) || + (newBlock = + this->parent + ->ConcurrentQueue::template requisition_block< + allocMode>()) == nullptr) + { + // Index allocation or block allocation failed; revert + // any other allocations + // and index insertions done so far for this operation + if (indexInserted) + { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + } + currentTailIndex = + (startTailIndex - 1) & + ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) + { + currentTailIndex += + static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index( + currentTailIndex); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list( + firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#if MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context>(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later + // allocations fail, + // and so that we can find the blocks when we do the actual + // enqueueing + if ((startTailIndex & + static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr) + { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? 
newBlock + : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != + 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) + { + this->tailBlock = firstAllocatedBlock; + } + while (true) + { + auto stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, + stopIndex)) + { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (nullptr) T(details::deref_noexcept(itemFirst)))) + { + while (currentTailIndex != stopIndex) + { + new ((*this->tailBlock)[currentTailIndex++]) + T(*itemFirst++); + } + } + else + { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) + { + new ((*this->tailBlock)[currentTailIndex]) + T(details::nomove_if< + (bool)!MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (nullptr) T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) + { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) + { + auto block = startBlock; + if ((startTailIndex & + static_cast(BLOCK_SIZE - 1)) == 0) + { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) + { + stopIndex = + (currentTailIndex & + ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than( + constructedStopIndex, stopIndex)) + { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) + { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) + { + break; + } + block = block->next; + } + } + + currentTailIndex = + (startTailIndex - 1) & + ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) + { + currentTailIndex += + static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index( + currentTailIndex); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list( + firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) + { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - + (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) + { + desiredCount = desiredCount < max ? 
desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + assert(overcommit <= myDequeueCount); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) + { + actualCount = + desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) + { + this->dequeueOvercommit.fetch_add( + desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed + // to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = + this->headIndex.fetch_add(actualCount, + std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = + get_block_index_index_for_index(index, localBlockIndex); + do + { + auto blockStartIndex = index; + auto endIndex = + (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = + entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN( + T, T&&, details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) + { + while (index != endIndex) + { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else + { + MOODYCAMEL_TRY + { + while (index != endIndex) + { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) + { + do + { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load( + std::memory_order_relaxed); + while (index != endIndex) + { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block:: + template set_many_empty< + implicit_context>( + blockStartIndex, + static_cast( + endIndex - + blockStartIndex))) + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store( + nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list( + block); + } + indexIndex = + (indexIndex + 1) & + (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & + ~static_cast( + BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast( + actualCount), + endIndex) + ? firstIndex + static_cast( + actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block:: + template set_many_empty( + blockStartIndex, + static_cast(endIndex - + blockStartIndex))) + { + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a + // release, meaning that anybody who acquires + // the block + // we're about to free can use it safely since + // our writes (and reads!) will have + // happened-before then. 
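+                        // (Illustrative sketch, not part of the upstream
+                        // header.) The release/acquire handoff described
+                        // above is the standard publication pattern: plain
+                        // writes made before a releasing store are visible
+                        // to whoever performs the matching acquiring load:
+                        //
+                        //     #include <atomic>
+                        //     int payload = 0;
+                        //     std::atomic<bool> ready{false};
+                        //     void writer()
+                        //     {
+                        //         payload = 42;
+                        //         ready.store(true, std::memory_order_release);
+                        //     }
+                        //     int reader() // yields 42 whenever it sees true
+                        //     {
+                        //         return ready.load(std::memory_order_acquire)
+                        //                    ? payload
+                        //                    : -1;
+                        //     }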
+ entry->value.store(nullptr, + std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + indexIndex = + (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else + { + this->dequeueOvercommit.fetch_add( + desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an + // invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, + index_t blockStartIndex) + { + auto localBlockIndex = + blockIndex.load(std::memory_order_relaxed); // We're the only + // writer thread, + // relaxed is OK + if (localBlockIndex == nullptr) + { + return false; // this can happen if new_block_index failed in + // the constructor + } + auto newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == + INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) + { + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + if (allocMode == CannotAlloc || !new_block_index()) + { + return false; + } + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == + INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store( + (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & + (localBlockIndex->capacity - 1), + std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index( + index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index( + index_t index, BlockIndexHeader*& localBlockIndex) const + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load( + std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may + // wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast( + static_cast::type>( + index - tailBase) / + BLOCK_SIZE); + size_t idx = (tail + offset) & (localBlockIndex->capacity 
- 1); + assert(localBlockIndex->index[idx]->key.load( + std::memory_order_relaxed) == index && + localBlockIndex->index[idx]->value.load( + std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = + prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); + if (raw == nullptr) + { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast( + details::align_for(raw + + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast( + details::align_for( + reinterpret_cast(entries) + + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) + { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do + { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) + { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, + std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & + (nextBlockIndexCapacity - 1), + std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer* nextImplicitProducer; + + private: +#endif + +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#if MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) + { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) + { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) + { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= + initialBlockPoolSize) + { + return nullptr; + } + + auto index = + initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) + : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { +#if MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) + { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() { return freeList.try_get(); } + // Gets a free block from one of the memory pools, or allocates a new one + // (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) + { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) + { + return block; + } + + if (canAlloc == CanAlloc) + { + return create(); + } + + return nullptr; + } + +#if MCDBGQ_TRACKMEM +public: + struct MemStats + { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = {0}; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) + { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) + { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 
0 : 1; + + if (implicit) + { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = + prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) + { + for (size_t i = 0; i != hash->capacity; ++i) + { + if (hash->index[i]->key.load( + std::memory_order_relaxed) != + ImplicitProducer::INVALID_BLOCK_BASE && + hash->index[i]->value.load( + std::memory_order_relaxed) != nullptr) + { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += + hash->capacity * + sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) + { + stats.implicitBlockIndexBytes += + sizeof(typename ImplicitProducer:: + BlockIndexHeader) + + hash->capacity * + sizeof(typename ImplicitProducer:: + BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); + head += BLOCK_SIZE) + { + // auto block = + // prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else + { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) + { + auto block = tailBlock; + do + { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block:: + template is_empty() || + wasNonEmpty) + { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = + prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) + { + stats.explicitBlockIndexBytes += + sizeof( + typename ExplicitProducer::BlockIndexHeader) + + index->size * + sizeof( + typename ExplicitProducer::BlockIndexEntry); + index = static_cast< + typename ExplicitProducer::BlockIndexHeader*>( + index->prev); + } + } + } + + auto freeOnInitialPool = + q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= + q->initialBlockPoolSize + ? 0 + : q->initialBlockPoolSize - + q->initialBlockPoolIndex.load( + std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() { return MemStats::getFor(this); } +private: + friend struct MemStats; +#endif + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { + bool recycled; + return recycle_or_create_producer(isExplicit, recycled); + } + + ProducerBase* recycle_or_create_producer(bool isExplicit, bool& recycled) + { +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) + { + if (ptr->inactive.load(std::memory_order_relaxed) && + ptr->isExplicit == isExplicit) + { + bool expected = true; + if (ptr->inactive.compare_exchange_strong( + expected, /* desired */ false, + std::memory_order_acquire, std::memory_order_relaxed)) + { + // We caught one! 
It's been marked as activated, the caller + // can have it + recycled = true; + return ptr; + } + } + } + + recycled = false; + return add_producer(isExplicit ? static_cast( + create(this)) + : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) + { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do + { + producer->next = prevTail; + } while ( + !producerListTail.compare_exchange_weak(prevTail, producer, + std::memory_order_release, + std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) + { + auto prevTailExplicit = + explicitProducers.load(std::memory_order_relaxed); + do + { + static_cast(producer)->nextExplicitProducer = + prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak( + prevTailExplicit, static_cast(producer), + std::memory_order_release, std::memory_order_relaxed)); + } + else + { + auto prevTailImplicit = + implicitProducers.load(std::memory_order_relaxed); + do + { + static_cast(producer)->nextImplicitProducer = + prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak( + prevTailImplicit, static_cast(producer), + std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! + for (auto ptr = producerListTail.load(std::memory_order_relaxed); + ptr != nullptr; ptr = ptr->next_prod()) + { + ptr->parent = this; + } + } + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer* value; // No need for atomicity since it's only read + // by the thread that sets it in the first + // place + + ImplicitProducerKVP() + : value(nullptr) + { + } + + ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), + std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) + MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) + { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap( + typename ConcurrentQueue::ImplicitProducerKVP&, + typename ConcurrentQueue::ImplicitProducerKVP&) + MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP* entries; + ImplicitProducerHash* prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return; + + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) + { + initialImplicitProducerHashEntries[i].key.store( + details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + + void 
swap_implicit_producer_hashes(ConcurrentQueue& other) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return; + + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap( + other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = + &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = + &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, + other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == + &other.initialImplicitProducerHash) + { + implicitProducerHash.store(&initialImplicitProducerHash, + std::memory_order_relaxed); + } + else + { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &other.initialImplicitProducerHash; + hash = hash->prev) + { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == + &initialImplicitProducerHash) + { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, + std::memory_order_relaxed); + } + else + { + ImplicitProducerHash* hash; + for (hash = + other.implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &initialImplicitProducerHash; hash = hash->prev) + { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer* get_or_add_implicit_producer() + { +// Note that since the data is essentially thread-local (key is thread ID), +// there's a reduced need for fences (memory ordering is already consistent +// for any individual thread), except for the current table itself. + +// Start by looking for the thread ID in the current and all previous hash +// tables. +// If it's not found, it must not be in there yet, since this same thread would +// have added it previously to one of the tables that we traversed. + +// Code and algorithm adapted from +// http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) + { + // Look for the id in this hash + auto index = hashedId; + while (true) + { // Not an infinite loop because at least one slot is free in the + // hash table + index &= hash->capacity - 1; + + auto probedKey = + hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) + { + // Found it! If we had to search several hashes deep, + // though, we should lazily add it + // to the current main hash table to avoid the extended + // search next time. + // Note there's guaranteed to be room in the current hash + // table since every subsequent + // table implicitly reserves space for all previous tables + // (there's only one + // implicitProducerHashCount). 
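+            // (Illustrative sketch, not part of the upstream header; Slot
+            // and insert_or_find are our own names.) The probing here follows
+            // the lock-free hash table from the Preshing article cited above:
+            // claim a slot by CASing its key from empty to our thread id,
+            // then publish the value, which only the owning thread reads
+            // back. It assumes the table always keeps at least one free slot:
+            //
+            //     #include <atomic>
+            //     #include <cstddef>
+            //     #include <cstdint>
+            //     struct Slot { std::atomic<std::uint64_t> key; void* value; };
+            //     constexpr std::uint64_t kEmpty = 0;
+            //     void* insert_or_find(Slot* t, std::size_t capacity, // pow. of 2
+            //                          std::uint64_t id, void* value)
+            //     {
+            //         for (std::size_t i = static_cast<std::size_t>(id);; ++i)
+            //         {
+            //             Slot& s = t[i & (capacity - 1)];
+            //             if (s.key.load(std::memory_order_relaxed) == id)
+            //                 return s.value; // already inserted by this thread
+            //             std::uint64_t expected = kEmpty;
+            //             if (s.key.compare_exchange_strong(
+            //                     expected, id, std::memory_order_relaxed))
+            //             {
+            //                 s.value = value; // only this thread reads it
+            //                 return value;
+            //             }
+            //         }
+            //     }
+            //
+            // The real table additionally resizes and recycles tombstoned
+            // ids; this sketch omits both.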
+                    auto value = hash->entries[index].value;
+                    if (hash != mainHash)
+                    {
+                        index = hashedId;
+                        while (true)
+                        {
+                            index &= mainHash->capacity - 1;
+                            probedKey = mainHash->entries[index].key.load(
+                                std::memory_order_relaxed);
+                            auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+                            auto reusable = details::invalid_thread_id2;
+                            if ((probedKey == empty &&
+                                 mainHash->entries[index]
+                                     .key.compare_exchange_strong(
+                                         empty, id, std::memory_order_relaxed,
+                                         std::memory_order_relaxed)) ||
+                                (probedKey == reusable &&
+                                 mainHash->entries[index]
+                                     .key.compare_exchange_strong(
+                                         reusable, id,
+                                         std::memory_order_acquire,
+                                         std::memory_order_acquire)))
+                            {
+#else
+                            if ((probedKey == empty &&
+                                 mainHash->entries[index]
+                                     .key.compare_exchange_strong(
+                                         empty, id, std::memory_order_relaxed,
+                                         std::memory_order_relaxed)))
+                            {
+#endif
+                                mainHash->entries[index].value = value;
+                                break;
+                            }
+                            ++index;
+                        }
+                    }
+
+                    return value;
+                }
+                if (probedKey == details::invalid_thread_id)
+                {
+                    break; // Not in this hash table
+                }
+                ++index;
+            }
+        }
+
+        // Insert!
+        auto newCount =
+            1 +
+            implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed);
+        while (true)
+        {
+            if (newCount >= (mainHash->capacity >> 1) &&
+                !implicitProducerHashResizeInProgress.test_and_set(
+                    std::memory_order_acquire))
+            {
+                // We've acquired the resize lock, try to allocate a bigger
+                // hash table.
+                // Note the acquire fence synchronizes with the release fence
+                // at the end of this block, and hence when we reload
+                // implicitProducerHash it must be the most recent version
+                // (it only gets changed within this locked block).
+                mainHash = implicitProducerHash.load(std::memory_order_acquire);
+                if (newCount >= (mainHash->capacity >> 1))
+                {
+                    auto newCapacity = mainHash->capacity << 1;
+                    while (newCount >= (newCapacity >> 1))
+                    {
+                        newCapacity <<= 1;
+                    }
+                    auto raw = static_cast<char*>((Traits::malloc)(
+                        sizeof(ImplicitProducerHash) +
+                        std::alignment_of<ImplicitProducerKVP>::value - 1 +
+                        sizeof(ImplicitProducerKVP) * newCapacity));
+                    if (raw == nullptr)
+                    {
+                        // Allocation failed
+                        implicitProducerHashCount.fetch_add(
+                            -1, std::memory_order_relaxed);
+                        implicitProducerHashResizeInProgress.clear(
+                            std::memory_order_relaxed);
+                        return nullptr;
+                    }
+
+                    auto newHash = new (raw) ImplicitProducerHash;
+                    newHash->capacity = newCapacity;
+                    newHash->entries = reinterpret_cast<ImplicitProducerKVP*>(
+                        details::align_for<ImplicitProducerKVP>(
+                            raw + sizeof(ImplicitProducerHash)));
+                    for (size_t i = 0; i != newCapacity; ++i)
+                    {
+                        new (newHash->entries + i) ImplicitProducerKVP;
+                        newHash->entries[i].key.store(
+                            details::invalid_thread_id,
+                            std::memory_order_relaxed);
+                    }
+                    newHash->prev = mainHash;
+                    implicitProducerHash.store(newHash,
+                                               std::memory_order_release);
+                    implicitProducerHashResizeInProgress.clear(
+                        std::memory_order_release);
+                    mainHash = newHash;
+                }
+                else
+                {
+                    implicitProducerHashResizeInProgress.clear(
+                        std::memory_order_release);
+                }
+            }
+
+            // If it's < three-quarters full, add to the old one anyway so
+            // that we don't have to wait for the next table to finish being
+            // allocated by another thread (and if we just finished allocating
+            // above, the condition will always be true)
+            if (newCount <
+                (mainHash->capacity >> 1) + (mainHash->capacity >> 2))
+            {
+                bool recycled;
+                auto producer = static_cast<ImplicitProducer*>(
+                    recycle_or_create_producer(false, recycled));
+                if (producer == nullptr)
+                {
+                    implicitProducerHashCount.fetch_add(
+                        -1, std::memory_order_relaxed);
+                    return nullptr;
+                }
+                if (recycled)
+                {
+                    implicitProducerHashCount.fetch_add(
+                        -1, std::memory_order_relaxed);
+                }
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+                producer->threadExitListener.callback =
+                    &ConcurrentQueue::implicit_producer_thread_exited_callback;
+                producer->threadExitListener.userData = producer;
+                details::ThreadExitNotifier::subscribe(
+                    &producer->threadExitListener);
+#endif
+
+                auto index = hashedId;
+                while (true)
+                {
+                    index &= mainHash->capacity - 1;
+                    auto probedKey = mainHash->entries[index].key.load(
+                        std::memory_order_relaxed);
+
+                    auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+                    auto reusable = details::invalid_thread_id2;
+                    if ((probedKey == empty &&
+                         mainHash->entries[index].key.compare_exchange_strong(
+                             empty, id, std::memory_order_relaxed,
+                             std::memory_order_relaxed)) ||
+                        (probedKey == reusable &&
+                         mainHash->entries[index].key.compare_exchange_strong(
+                             reusable, id, std::memory_order_acquire,
+                             std::memory_order_acquire)))
+                    {
+#else
+                    if ((probedKey == empty &&
+                         mainHash->entries[index].key.compare_exchange_strong(
+                             empty, id, std::memory_order_relaxed,
+                             std::memory_order_relaxed)))
+                    {
+#endif
+                        mainHash->entries[index].value = producer;
+                        break;
+                    }
+                    ++index;
+                }
+                return producer;
+            }
+
+            // Hmm, the old hash is quite full and somebody else is busy
+            // allocating a new one.
+            // We need to wait for the allocating thread to finish (if it
+            // succeeds, we add, if not, we try to allocate ourselves).
+            mainHash = implicitProducerHash.load(std::memory_order_acquire);
+        }
+    }
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+    void implicit_producer_thread_exited(ImplicitProducer* producer)
+    {
+        // Remove from thread exit listeners
+        details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener);
+
+        // Remove from hash
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+        debug::DebugLock lock(implicitProdMutex);
+#endif
+        auto hash = implicitProducerHash.load(std::memory_order_acquire);
+        assert(hash != nullptr); // The thread exit listener is only
+                                 // registered if we were added to a hash in
+                                 // the first place
+        auto id = details::thread_id();
+        auto hashedId = details::hash_thread_id(id);
+        details::thread_id_t probedKey;
+
+        // We need to traverse all the hashes just in case other threads
+        // aren't on the current one yet and are trying to add an entry
+        // thinking there's a free slot (because they reused a producer)
+        for (; hash != nullptr; hash = hash->prev)
+        {
+            auto index = hashedId;
+            do
+            {
+                index &= hash->capacity - 1;
+                probedKey =
+                    hash->entries[index].key.load(std::memory_order_relaxed);
+                if (probedKey == id)
+                {
+                    hash->entries[index].key.store(details::invalid_thread_id2,
+                                                   std::memory_order_release);
+                    break;
+                }
+                ++index;
+            } while (probedKey !=
+                     details::invalid_thread_id); // Can happen if the hash has
+                                                  // changed but we weren't put
+                                                  // back in it yet, or if we
+                                                  // weren't added to this hash
+                                                  // in the first place
+        }
+
+        // Mark the queue as being recyclable
+        producer->inactive.store(true, std::memory_order_release);
+    }
+
+    static void implicit_producer_thread_exited_callback(void* userData)
+    {
+        auto producer = static_cast<ImplicitProducer*>(userData);
+        auto queue = producer->parent;
+        queue->implicit_producer_thread_exited(producer);
+    }
+#endif
+
+    //////////////////////////////////
+    // Utility functions
+    //////////////////////////////////
+
+    template <typename U>
+    static inline U* create_array(size_t count)
+    {
+        assert(count > 0);
+        auto p = static_cast<U*>((Traits::malloc)(sizeof(U) * count));
+        if (p == nullptr)
+        {
+            return nullptr;
+        }
+        for (size_t i = 0; i != count; ++i)
+        {
+            new (p + i) U();
+        }
+        return p;
+    }
+
+    template <typename U>
+    static inline void destroy_array(U* p, size_t count)
+    {
+        if (p != nullptr)
+        {
+            assert(count > 0);
+            for (size_t i = count; i != 0;)
+            {
+                (p + --i)->~U();
+            }
+            (Traits::free)(p);
+        }
+    }
+
+    template <typename U>
+    static inline U* create()
+    {
+        auto p = (Traits::malloc)(sizeof(U));
+        return p != nullptr ? new (p) U : nullptr;
+    }
+
+    template <typename U, typename A1>
+    static inline U* create(A1&& a1)
+    {
+        auto p = (Traits::malloc)(sizeof(U));
+        return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
+    }
+
+    template <typename U>
+    static inline void destroy(U* p)
+    {
+        if (p != nullptr)
+        {
+            p->~U();
+        }
+        (Traits::free)(p);
+    }
+
+private:
+    std::atomic<ProducerBase*> producerListTail;
+    std::atomic<std::uint32_t> producerCount;
+
+    std::atomic<size_t> initialBlockPoolIndex;
+    Block* initialBlockPool;
+    size_t initialBlockPoolSize;
+
+#if !MCDBGQ_USEDEBUGFREELIST
+    FreeList<Block> freeList;
+#else
+    debug::DebugFreeList<Block> freeList;
+#endif
+
+    std::atomic<ImplicitProducerHash*> implicitProducerHash;
+    std::atomic<size_t> implicitProducerHashCount; // Number of slots logically
+                                                   // used
+    ImplicitProducerHash initialImplicitProducerHash;
+    std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE>
+        initialImplicitProducerHashEntries;
+    std::atomic_flag implicitProducerHashResizeInProgress;
+
+    std::atomic<std::uint32_t> nextExplicitConsumerId;
+    std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+    debug::DebugMutex implicitProdMutex;
+#endif
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+    std::atomic<ExplicitProducer*> explicitProducers;
+    std::atomic<ImplicitProducer*> implicitProducers;
+#endif
+};
+
+template <typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
+    : producer(queue.recycle_or_create_producer(true))
+{
+    if (producer != nullptr)
+    {
+        producer->token = this;
+    }
+}
+
+template <typename T, typename Traits>
+ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
+    : producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)
+                   ->recycle_or_create_producer(true))
+{
+    if (producer != nullptr)
+    {
+        producer->token = this;
+    }
+}
+
+template <typename T, typename Traits>
+ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
+    : itemsConsumedFromCurrent(0)
+    , currentProducer(nullptr)
+    , desiredProducer(nullptr)
+{
+    initialOffset =
+        queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+    lastKnownGlobalOffset = -1;
+}
+
+template <typename T, typename Traits>
+ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
+    : itemsConsumedFromCurrent(0)
+    , currentProducer(nullptr)
+    , desiredProducer(nullptr)
+{
+    initialOffset =
+        reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)
+            ->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+    lastKnownGlobalOffset = -1;
+}
+
+template <typename T, typename Traits>
+inline void swap(ConcurrentQueue<T, Traits>& a,
+                 ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+
+inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+
+inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+
+template <typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
+                 typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b)
+    MOODYCAMEL_NOEXCEPT
+{
+    a.swap(b);
+}
+}
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
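Editor's note: the header above is consumed by multiple producer threads without any lock; each thread gets an implicit producer keyed by its thread id via the hash table shown earlier. A minimal, self-contained sketch of that usage pattern (names, sizes and the consumer role are illustrative only, not Deflect's actual internals):

    #include "moodycamel/concurrentqueue.h"

    #include <iostream>
    #include <string>
    #include <thread>
    #include <vector>

    int main()
    {
        moodycamel::ConcurrentQueue<std::string> requests;

        // Many producer threads enqueue without taking any lock; an implicit
        // producer is created and hashed per thread id behind the scenes.
        std::vector<std::thread> producers;
        for (int i = 0; i < 4; ++i)
            producers.emplace_back([&requests, i] {
                for (int j = 0; j < 100; ++j)
                    requests.enqueue("segment " + std::to_string(i) + ":" +
                                     std::to_string(j));
            });

        for (auto& t : producers)
            t.join();

        // A single consumer (e.g. the socket send thread) drains the queue.
        std::string request;
        size_t count = 0;
        while (requests.try_dequeue(request))
            ++count;
        std::cout << count << " requests processed" << std::endl;
        return 0;
    }

When the per-thread hash lookup matters, the ProducerToken/ConsumerToken constructors defined above can be used instead to bind a producer or consumer slot explicitly.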
diff --git a/deflect/qt/helpers.h b/deflect/qt/helpers.h
index d3fe7e9..1991242 100644
--- a/deflect/qt/helpers.h
+++ b/deflect/qt/helpers.h
@@ -1,6 +1,6 @@
 /*********************************************************************/
-/* Copyright (c) 2016, EPFL/Blue Brain Project */
-/* Raphael Dumusc */
+/* Copyright (c) 2016-2017, EPFL/Blue Brain Project */
+/* Raphael Dumusc */
 /* All rights reserved. */
 /* */
 /* Redistribution and use in source and binary forms, with or */
@@ -47,14 +47,6 @@ namespace deflect
 {
 namespace qt
 {
-template <typename T>
-std::future<T> make_ready_future(T&& value)
-{
-    std::promise<T> promise;
-    promise.set_value(std::forward<T>(value));
-    return promise.get_future();
-}
-
 // missing make_unique() implementation in C++11 standard
 // source: http://herbsutter.com/gotw/_102/
 template <typename T, typename... Args>
diff --git a/deflect/types.h b/deflect/types.h
index 0b30208..dcb8cbc 100644
--- a/deflect/types.h
+++ b/deflect/types.h
@@ -73,6 +73,14 @@ constexpr typename std::underlying_type<E>::type as_underlying_type(E e)
     return static_cast<typename std::underlying_type<E>::type>(e);
 }
 
+template <typename T>
+std::future<T> make_ready_future(T&& value)
+{
+    std::promise<T> promise;
+    promise.set_value(std::forward<T>(value));
+    return promise.get_future();
+}
+
 class EventReceiver;
 class Frame;
 class FrameDispatcher;
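Editor's note: make_ready_future() moves from deflect/qt/helpers.h to the shared deflect/types.h so that non-Qt code can wrap an already-available value in a std::future and keep an asynchronous signature on a synchronous path. A minimal usage sketch (the include path is an assumption based on the hunk above):

    #include <deflect/types.h>

    #include <future>
    #include <iostream>

    int main()
    {
        // Complete an operation synchronously while still returning the
        // std::future<bool> interface callers expect.
        std::future<bool> result = deflect::make_ready_future(true);

        // get() returns immediately; no promise bookkeeping at the call site.
        std::cout << std::boolalpha << result.get() << std::endl;
        return 0;
    }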
diff --git a/doc/Changelog.md b/doc/Changelog.md
index 8f15cbc..4677130 100644
--- a/doc/Changelog.md
+++ b/doc/Changelog.md
@@ -5,6 +5,8 @@ Changelog {#Changelog}
 
 ### 0.14.0 (git master)
 
+* [176](https://github.com/BlueBrain/Deflect/pull/176):
+  OPT: Lock-free request queueing for multi-threaded stream clients (e.g. KNL)
 * [175](https://github.com/BlueBrain/Deflect/pull/175):
   Add deflect::Observer which can be used to only receive events w/o the need
   to send images
diff --git a/tests/cpp/ServerTests.cpp b/tests/cpp/ServerTests.cpp
index 456cfcd..e0e2066 100644
--- a/tests/cpp/ServerTests.cpp
+++ b/tests/cpp/ServerTests.cpp
@@ -48,6 +48,7 @@ namespace ut = boost::unit_test;
 
 #include
 #include
+#include
 #include
 #include
@@ -376,3 +377,152 @@ BOOST_AUTO_TEST_CASE(testOneObserverAndOneStream)
     BOOST_CHECK_EQUAL(openedStreams, 0);
     BOOST_CHECK_EQUAL(receivedFrames, expectedFrames);
 }
+
+BOOST_AUTO_TEST_CASE(testThreadedSmallSegmentStream)
+{
+    QThread serverThread;
+    deflect::Server* server = new deflect::Server(0 /* OS-chosen port */);
+    server->moveToThread(&serverThread);
+    serverThread.connect(&serverThread, &QThread::finished, server,
+                         &deflect::Server::deleteLater);
+    serverThread.start();
+
+    // to wait in this thread until server thread is done with certain
+    // operations
+    QWaitCondition received;
+    QMutex mutex;
+    bool receivedState = false;
+
+    auto processServerMessages = [&] {
+        for (size_t j = 0; j < 20; ++j)
+        {
+            mutex.lock();
+            received.wait(&mutex, 100 /*ms*/);
+            if (receivedState)
+            {
+                mutex.unlock();
+                break;
+            }
+            mutex.unlock();
+        }
+        BOOST_REQUIRE(receivedState);
+        receivedState = false;
+    };
+
+    const unsigned int segmentSize = 64;
+    const unsigned int width = 1920;
+    const unsigned int height = 1088;
+
+    struct Segment
+    {
+        unsigned int x;
+        unsigned int y;
+    };
+    std::vector<Segment> segments;
+    for (unsigned int i = 0; i < width; i += segmentSize)
+    {
+        for (unsigned int j = 0; j < height; j += segmentSize)
+            segments.emplace_back(Segment{i, j});
+    }
+
+    const unsigned int numSegments = segments.size();
+    BOOST_REQUIRE_EQUAL(numSegments,
+                        std::ceil((float)width / segmentSize) *
+                            std::ceil((float)height / segmentSize));
+
+    const std::vector<uint8_t> pixels(segmentSize * segmentSize * 4, 42);
+
+    size_t openedStreams = 0;
+    // only continue once we have the stream
+    server->connect(server, &deflect::Server::pixelStreamOpened,
+                    [&](const QString) {
+                        ++openedStreams;
+                        if (openedStreams == 1)
+                        {
+                            mutex.lock();
+                            receivedState = true;
+                            received.wakeAll();
+                            mutex.unlock();
+                        }
+                    });
+
+    // make sure that we get the close of the stream
+    server->connect(server, &deflect::Server::pixelStreamClosed,
+                    [&](const QString) {
+                        --openedStreams;
+                        if (openedStreams == 0)
+                        {
+                            mutex.lock();
+                            receivedState = true;
+                            received.wakeAll();
+                            mutex.unlock();
+                        }
+                    });
+
+    // handle received frames to test the stream's purpose
+    const size_t expectedFrames = 10;
+    size_t receivedFrames = 0;
+    server->connect(server, &deflect::Server::receivedFrame,
+                    [&](deflect::FramePtr frame) {
+                        BOOST_CHECK_EQUAL(frame->segments.size(), numSegments);
+                        BOOST_CHECK_EQUAL(frame->uri.toStdString(),
+                                          testStreamId.toStdString());
+                        const auto dim = frame->computeDimensions();
+                        BOOST_CHECK_EQUAL(dim.width(), width);
+                        BOOST_CHECK_EQUAL(dim.height(), height);
+                        ++receivedFrames;
+                        mutex.lock();
+                        receivedState = true;
+                        received.wakeAll();
+                        mutex.unlock();
+                    });
+
+    {
+        deflect::Stream stream(testStreamId.toStdString(), "localhost",
+                               server->serverPort());
+        BOOST_REQUIRE(stream.isConnected());
+
+        // handle connects first before sending and receiving frames
+        processServerMessages();
+
+        std::mutex testMutex; // BOOST_CHECK is not thread-safe
+        for (size_t i = 0; i < expectedFrames; ++i)
+        {
+            receivedState = false;
+
+// to make coverage report work; otherwise fails for unknown reasons
+#ifdef NDEBUG
+#pragma omp parallel for
+#endif
+            for (int j = 0; j < int(segments.size()); ++j)
+            {
+                deflect::ImageWrapper deflectImage((const void*)pixels.data(),
+                                                   segmentSize, segmentSize,
+                                                   deflect::RGBA,
+                                                   segments[j].x,
+                                                   segments[j].y);
+                deflectImage.compressionPolicy = deflect::COMPRESSION_ON;
+
+                const bool success = stream.send(deflectImage).get();
+
+                std::lock_guard<std::mutex> lock(testMutex);
+                BOOST_CHECK(success);
+            }
+
+            BOOST_CHECK(stream.finishFrame().get());
+
+            server->requestFrame(testStreamId);
+
+            // process frame receive
+            processServerMessages();
+        }
+    }
+
+    // handle close of streamer
+    processServerMessages();
+
+    serverThread.quit();
+    serverThread.wait();
+
+    BOOST_CHECK_EQUAL(openedStreams, 0);
+    BOOST_CHECK_EQUAL(receivedFrames, expectedFrames);
+}
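Editor's note: a distilled client-side sketch of what the new test exercises: several OpenMP threads compress and send segments of one frame concurrently, which the lock-free request queue makes safe. Stream name, host, port and pixel content are placeholders; the future-based send()/finishFrame() API is taken from the test above:

    #include <deflect/Stream.h>

    #include <vector>

    int main()
    {
        deflect::Stream stream("mystream", "localhost", 1701);
        if (!stream.isConnected())
            return 1;

        const unsigned int segmentSize = 64;
        const std::vector<unsigned char> pixels(segmentSize * segmentSize * 4, 0);

    // each thread compresses and sends its own segment of the frame
    #pragma omp parallel for
        for (int i = 0; i < 16; ++i)
        {
            deflect::ImageWrapper image(pixels.data(), segmentSize, segmentSize,
                                        deflect::RGBA, i * segmentSize, 0);
            image.compressionPolicy = deflect::COMPRESSION_ON;
            stream.send(image).get(); // std::future<bool>
        }
        return stream.finishFrame().get() ? 0 : 1;
    }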