From d388a08e8fec0a9e569e8f6e75dd25ac95e4a4ac Mon Sep 17 00:00:00 2001
From: Manu Evans <turkeyman@gmail.com>
Date: Sun, 18 Aug 2019 16:53:14 -0700
Subject: [PATCH 1/5] Move some traits to druntime.

---
 src/core/internal/traits.d | 158 +++++++++++++++++++++++++++++++------
 1 file changed, 132 insertions(+), 26 deletions(-)

diff --git a/src/core/internal/traits.d b/src/core/internal/traits.d
index bccf1ad356..30519dc276 100644
--- a/src/core/internal/traits.d
+++ b/src/core/internal/traits.d
@@ -8,23 +8,22 @@
  */
 module core.internal.traits;
 
-/// taken from std.typetuple.TypeTuple
-template TypeTuple(TList...)
-{
-    alias TypeTuple = TList;
-}
-alias AliasSeq = TypeTuple;
 
-template FieldTypeTuple(T)
+// TODO: deprecate these old names...?
+alias TypeTuple = AliasSeq;
+alias FieldTypeTuple = Fields;
+
+
+alias AliasSeq(TList...) = TList;
+
+template Fields(T) // yields the field types of aggregate T; for any other T, a one-element sequence holding T
 {
     static if (is(T == struct) || is(T == union))
-        alias FieldTypeTuple = typeof(T.tupleof[0 .. $ - __traits(isNested, T)]);
+        alias Fields = typeof(T.tupleof[0 .. $ - __traits(isNested, T)]); // slice drops the last tupleof entry when T is nested
     else static if (is(T == class))
-        alias FieldTypeTuple = typeof(T.tupleof);
+        alias Fields = typeof(T.tupleof);
     else
-    {
-        alias FieldTypeTuple = TypeTuple!T;
-    }
+        alias Fields = AliasSeq!T;
 }
 
 T trustedCast(T, U)(auto ref U u) @trusted pure nothrow
@@ -66,6 +65,20 @@ template Unqual(T)
     }
 }
 
+// [For internal use] Applies Modifier to T with its top-level qualifiers stripped, then re-applies those same qualifiers to the result.
+package template ModifyTypePreservingTQ(alias Modifier, T)
+{
+         static if (is(T U ==          immutable U)) alias ModifyTypePreservingTQ =          immutable Modifier!U;
+    else static if (is(T U == shared inout const U)) alias ModifyTypePreservingTQ = shared inout const Modifier!U;
+    else static if (is(T U == shared inout       U)) alias ModifyTypePreservingTQ = shared inout       Modifier!U;
+    else static if (is(T U == shared       const U)) alias ModifyTypePreservingTQ = shared       const Modifier!U;
+    else static if (is(T U == shared             U)) alias ModifyTypePreservingTQ = shared             Modifier!U;
+    else static if (is(T U ==        inout const U)) alias ModifyTypePreservingTQ =        inout const Modifier!U;
+    else static if (is(T U ==        inout       U)) alias ModifyTypePreservingTQ =              inout Modifier!U;
+    else static if (is(T U ==              const U)) alias ModifyTypePreservingTQ =              const Modifier!U;
+    else                                             alias ModifyTypePreservingTQ =                    Modifier!T;
+}
+
 // Substitute all `inout` qualifiers that appears in T to `const`
 template substInout(T)
 {
@@ -187,12 +200,12 @@ template allSatisfy(alias F, T...)
 }
 
 // taken from std.meta.anySatisfy
-template anySatisfy(alias F, T...)
+template anySatisfy(alias F, Ts...)
 {
-    static foreach (Ti; T)
+    static foreach (T; Ts)
     {
         static if (!is(typeof(anySatisfy) == bool) && // not yet defined
-                   F!(Ti))
+                   F!T)
         {
             enum anySatisfy = true;
         }
@@ -220,17 +233,6 @@ template maxAlignment(U...)
     }
 }
 
-// std.traits.Fields
-template Fields(T)
-{
-    static if (is(T == struct) || is(T == union))
-        alias Fields = typeof(T.tupleof[0 .. $ - __traits(isNested, T)]);
-    else static if (is(T == class))
-        alias Fields = typeof(T.tupleof);
-    else
-        alias Fields = TypeTuple!T;
-}
-
 /// See $(REF hasElaborateMove, std,traits)
 template hasElaborateMove(S)
 {
@@ -303,6 +305,110 @@ template hasElaborateAssign(S)
     }
 }
 
+template hasIndirections(T) // true if T is, or contains, a pointer, delegate, dynamic array, AA, class or interface
+{
+    static if (is(T == struct) || is(T == union))
+        enum hasIndirections = anySatisfy!(.hasIndirections, Fields!T); // one qualifying field is enough
+    else static if (__traits(isStaticArray, T) && is(T : E[N], E, size_t N))
+        enum hasIndirections = is(E == void) ? true : hasIndirections!E; // void[N] is reported as true
+    else static if (isFunctionPointer!T)
+        enum hasIndirections = false; // tested before isPointer below, so function pointers yield false
+    else
+        enum hasIndirections = isPointer!T || isDelegate!T || isDynamicArray!T ||
+            __traits(isAssociativeArray, T) || is (T == class) || is(T == interface);
+}
+
+template hasUnsharedIndirections(T) // true if T is, or contains, an indirection whose target is not `shared`
+{
+    static if (is(T == struct) || is(T == union))
+        enum hasUnsharedIndirections = anySatisfy!(.hasUnsharedIndirections, Fields!T);
+    else static if (__traits(isStaticArray, T) && is(T : E[N], E, size_t N)) // same guard as hasIndirections: a class/interface with alias this to a static array must not match here
+        enum hasUnsharedIndirections = is(E == void) ? false : hasUnsharedIndirections!E;
+    else static if (isFunctionPointer!T)
+        enum hasUnsharedIndirections = false;
+    else static if (isPointer!T)
+        enum hasUnsharedIndirections = !is(T : shared(U)*, U);
+    else static if (isDynamicArray!T)
+        enum hasUnsharedIndirections = !is(T : shared(V)[], V);
+    else static if (is(T == class) || is(T == interface))
+        enum hasUnsharedIndirections = !is(T : shared(W), W);
+    else
+        enum hasUnsharedIndirections = isDelegate!T || __traits(isAssociativeArray, T); // TODO: how to handle these?
+}
+
+enum bool isAggregateType(T) = is(T == struct) || is(T == union) ||
+                               is(T == class) || is(T == interface); // true for struct, union, class and interface types
+
+enum bool isPointer(T) = is(T == U*, U) && !isAggregateType!T; // true when T is exactly some U*; aggregate types are rejected explicitly
+
+enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T; // true when DynamicArrayTypeOf!T resolves to a type and T is not an aggregate
+
+template OriginalType(T) // resolves enum types to their base type, keeping T's qualifiers
+{
+    template Impl(T)
+    {
+        static if (is(T U == enum)) alias Impl = OriginalType!U; // recurse: the base type may itself be an enum
+        else                        alias Impl =              T;
+    }
+
+    alias OriginalType = ModifyTypePreservingTQ!(Impl, T);
+}
+
+template DynamicArrayTypeOf(T) // yields the dynamic array type behind T (via alias this or enum base); fails to instantiate otherwise
+{
+    static if (is(AliasThisTypeOf!T AT) && !is(AT[] == AT))
+        alias X = DynamicArrayTypeOf!AT; // follow T's alias this type
+    else
+        alias X = OriginalType!T;
+
+    static if (is(Unqual!X : E[], E) && !is(typeof({ enum n = X.length; }))) // second test rejects types whose .length is a compile-time constant
+        alias DynamicArrayTypeOf = X;
+    else
+        static assert(0, T.stringof ~ " is not a dynamic array");
+}
+
+private template AliasThisTypeOf(T) // yields the type of T's `alias this` member; aggregates only
+    if (isAggregateType!T)
+{
+    alias members = __traits(getAliasThis, T);
+
+    static if (members.length == 1)
+        alias AliasThisTypeOf = typeof(__traits(getMember, T.init, members[0]));
+    else
+        static assert(0, T.stringof~" does not have alias this type"); // reached unless exactly one alias this member is reported
+}
+
+template isFunctionPointer(T...) // takes one type or one symbol/expression; true if its type is a pointer to a function
+    if (T.length == 1)
+{
+    static if (is(T[0] U) || is(typeof(T[0]) U)) // U is T[0] itself when it is a type, otherwise the type of T[0]
+    {
+        static if (is(U F : F*) && is(F == function))
+            enum bool isFunctionPointer = true;
+        else
+            enum bool isFunctionPointer = false;
+    }
+    else
+        enum bool isFunctionPointer = false;
+}
+
+template isDelegate(T...) // takes one type or one symbol/expression; true if it is a delegate or its address is one
+    if (T.length == 1)
+{
+    static if (is(typeof(& T[0]) U : U*) && is(typeof(& T[0]) U == delegate))
+    {
+        // T is a (nested) function symbol.
+        enum bool isDelegate = true;
+    }
+    else static if (is(T[0] W) || is(typeof(T[0]) W))
+    {
+        // T is an expression or a type.  Take the type of it and examine.
+        enum bool isDelegate = is(W == delegate);
+    }
+    else
+        enum bool isDelegate = false;
+}
+
 // std.meta.Filter
 template Filter(alias pred, TList...)
 {

From f8ec8e3526557c719c60d178318483a653434ca9 Mon Sep 17 00:00:00 2001
From: Manu Evans <turkeyman@gmail.com>
Date: Sat, 17 Aug 2019 19:32:51 -0700
Subject: [PATCH 2/5] Extract atomic platform specific implementation into an
 implementation file.

---
 CODEOWNERS                 |    4 +-
 mak/COPY                   |    1 +
 mak/SRCS                   |    1 +
 mak/WINDOWS                |    3 +
 src/core/atomic.d          | 2034 +++++++++---------------------------
 src/core/internal/atomic.d |  533 ++++++++++
 6 files changed, 1060 insertions(+), 1516 deletions(-)
 create mode 100644 src/core/internal/atomic.d

diff --git a/CODEOWNERS b/CODEOWNERS
index 50ee7e8455..bcdb157e21 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -15,7 +15,7 @@
 
 src/checkedint.d @redstar @andralex @JackStouffer
 
-src/core/atomic.d @WalterBright @ibuclaw
+src/core/atomic.d @WalterBright @ibuclaw @TurkeyMan
 src/core/attribute.d @jacob-carlborg
 src/core/bitop.d @schveiguy @tsbockman @Geod24
 src/core/cpuid.d @WalterBright @ibuclaw @JackStouffer
@@ -26,7 +26,7 @@ src/core/math.d @ibuclaw @redstar
 src/core/runtime.d @MartinNowak @Abscissa
 src/core/simd.d @WalterBright @MartinNowak
 src/core/stdc/* @schveiguy @ibuclaw
-src/core/stdcpp/* @WalterBright @Darredevil
+src/core/stdcpp/* @WalterBright @Darredevil @TurkeyMan
 src/core/sync/* @MartinNowak @Geod24 @WalterBright @ZombineDev
 src/core/sys/bionic/* @joakim-noah
 src/core/sys/darwin/* @jacob-carlborg @klickverbot @etcimon @MartinNowak
diff --git a/mak/COPY b/mak/COPY
index 1a8342116b..2962e567e4 100644
--- a/mak/COPY
+++ b/mak/COPY
@@ -23,6 +23,7 @@ COPY=\
 	\
 	$(IMPDIR)\core\internal\abort.d \
 	$(IMPDIR)\core\internal\arrayop.d \
+	$(IMPDIR)\core\internal\atomic.d \
 	$(IMPDIR)\core\internal\attributes.d \
 	$(IMPDIR)\core\internal\convert.d \
 	$(IMPDIR)\core\internal\dassert.d \
diff --git a/mak/SRCS b/mak/SRCS
index cc0e925154..e744d0c992 100644
--- a/mak/SRCS
+++ b/mak/SRCS
@@ -23,6 +23,7 @@ SRCS=\
 	\
 	src\core\internal\abort.d \
 	src\core\internal\arrayop.d \
+	src\core\internal\atomic.d \
 	src\core\internal\convert.d \
 	src\core\internal\dassert.d \
 	src\core\internal\hash.d \
diff --git a/mak/WINDOWS b/mak/WINDOWS
index 4cd551976a..cccb5cc480 100644
--- a/mak/WINDOWS
+++ b/mak/WINDOWS
@@ -123,6 +123,9 @@ $(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d
 $(IMPDIR)\core\internal\arrayop.d : src\core\internal\arrayop.d
 	copy $** $@
 
+$(IMPDIR)\core\internal\atomic.d : src\core\internal\atomic.d
+	copy $** $@
+
 $(IMPDIR)\core\internal\attributes.d : src\core\internal\attributes.d
 	copy $** $@
 
diff --git a/src/core/atomic.d b/src/core/atomic.d
index 6b54587199..04b045d2ab 100644
--- a/src/core/atomic.d
+++ b/src/core/atomic.d
@@ -10,6 +10,7 @@
 
 module core.atomic;
 
+import core.internal.atomic;
 import core.internal.attributes : betterC;
 
 version (D_InlineAsm_X86)
@@ -35,123 +36,6 @@ else
     enum has128BitCAS = false;
 }
 
-private
-{
-    /* Construct a type with a shared tail, and if possible with an unshared
-    head. */
-    template TailShared(U) if (!is(U == shared))
-    {
-        alias TailShared = .TailShared!(shared U);
-    }
-    template TailShared(S) if (is(S == shared))
-    {
-        // Get the unshared variant of S.
-        static if (is(S U == shared U)) {}
-        else static assert(false, "Should never be triggered. The `static " ~
-            "if` declares `U` as the unshared version of the shared type " ~
-            "`S`. `S` is explicitly declared as shared, so getting `U` " ~
-            "should always work.");
-
-        static if (is(S : U))
-            alias TailShared = U;
-        else static if (is(S == struct))
-        {
-            enum implName = () {
-                /* Start with "_impl". If S has a field with that name, append
-                underscores until the clash is resolved. */
-                string name = "_impl";
-                string[] fieldNames;
-                static foreach (alias field; S.tupleof)
-                {
-                    fieldNames ~= __traits(identifier, field);
-                }
-                static bool canFind(string[] haystack, string needle)
-                {
-                    foreach (candidate; haystack)
-                    {
-                        if (candidate == needle) return true;
-                    }
-                    return false;
-                }
-                while (canFind(fieldNames, name)) name ~= "_";
-                return name;
-            } ();
-            struct TailShared
-            {
-                static foreach (i, alias field; S.tupleof)
-                {
-                    /* On @trusted: This is casting the field from shared(Foo)
-                    to TailShared!Foo. The cast is safe because the field has
-                    been loaded and is not shared anymore. */
-                    mixin("
-                        @trusted @property
-                        ref " ~ __traits(identifier, field) ~ "()
-                        {
-                            alias R = TailShared!(typeof(field));
-                            return * cast(R*) &" ~ implName ~ ".tupleof[i];
-                        }
-                    ");
-                }
-                mixin("
-                    S " ~ implName ~ ";
-                    alias " ~ implName ~ " this;
-                ");
-            }
-        }
-        else
-            alias TailShared = S;
-    }
-    @safe unittest
-    {
-        // No tail (no indirections) -> fully unshared.
-
-        static assert(is(TailShared!int == int));
-        static assert(is(TailShared!(shared int) == int));
-
-        static struct NoIndir { int i; }
-        static assert(is(TailShared!NoIndir == NoIndir));
-        static assert(is(TailShared!(shared NoIndir) == NoIndir));
-
-        // Tail can be independently shared or is already -> tail-shared.
-
-        static assert(is(TailShared!(int*) == shared(int)*));
-        static assert(is(TailShared!(shared int*) == shared(int)*));
-        static assert(is(TailShared!(shared(int)*) == shared(int)*));
-
-        static assert(is(TailShared!(int[]) == shared(int)[]));
-        static assert(is(TailShared!(shared int[]) == shared(int)[]));
-        static assert(is(TailShared!(shared(int)[]) == shared(int)[]));
-
-        static struct S1 { shared int* p; }
-        static assert(is(TailShared!S1 == S1));
-        static assert(is(TailShared!(shared S1) == S1));
-
-        static struct S2 { shared(int)* p; }
-        static assert(is(TailShared!S2 == S2));
-        static assert(is(TailShared!(shared S2) == S2));
-
-        // Tail follows shared-ness of head -> fully shared.
-
-        static class C { int i; }
-        static assert(is(TailShared!C == shared C));
-        static assert(is(TailShared!(shared C) == shared C));
-
-        /* However, structs get a wrapper that has getters which cast to
-        TailShared. */
-
-        static struct S3 { int* p; int _impl; int _impl_; int _impl__; }
-        static assert(!is(TailShared!S3 : S3));
-        static assert(is(TailShared!S3 : shared S3));
-        static assert(is(TailShared!(shared S3) == TailShared!S3));
-
-        static struct S4 { shared(int)** p; }
-        static assert(!is(TailShared!S4 : S4));
-        static assert(is(TailShared!S4 : shared S4));
-        static assert(is(TailShared!(shared S4) == TailShared!S4));
-    }
-}
-
-
 version (AsmX86)
 {
     // NOTE: Strictly speaking, the x86 supports atomic operations on
@@ -173,1279 +57,435 @@ version (AsmX86)
     }
 }
 
-
-version (CoreDdoc)
+/**
+ * Specifies the memory ordering semantics of an atomic operation.
+ *
+ * See_Also:
+ *     $(HTTP en.cppreference.com/w/cpp/atomic/memory_order)
+ */
+enum MemoryOrder
 {
     /**
-     * Performs the binary operation 'op' on val using 'mod' as the modifier.
-     *
-     * Params:
-     *  val = The target variable.
-     *  mod = The modifier to apply.
-     *
-     * Returns:
-     *  The result of the operation.
-     */
-    TailShared!T atomicOp(string op, T, V1)( ref shared T val, V1 mod ) pure nothrow @nogc @safe
-        if ( __traits( compiles, mixin( "*cast(T*)&val" ~ op ~ "mod" ) ) )
-    {
-        return TailShared!T.init;
-    }
-
-    /**
-     * Atomically adds `mod` to the value referenced by `val` and returns the value `val` held previously.
-     * This operation is both lock-free and atomic.
-     *
-     * Params:
-     *  val = Reference to the value to modify.
-     *  mod = The value to add.
-     *
-     * Returns:
-     *  The value held previously by `val`.
-     */
-    TailShared!(T) atomicFetchAdd(T)( ref shared T val, size_t mod ) pure nothrow @nogc @safe;
-
-    /**
-     * Atomically subtracts `mod` from the value referenced by `val` and returns the value `val` held previously.
-     * This operation is both lock-free and atomic.
-     *
-     * Params:
-     *  val = Reference to the value to modify.
-     *  mod = The value to subtract.
-     *
-     * Returns:
-     *  The value held previously by `val`.
-     */
-    TailShared!(T) atomicFetchSub(T)( ref shared T val, size_t mod ) pure nothrow @nogc @safe;
-
-    /**
-     * Exchange `exchangeWith` with the memory referenced by `here`.
-     * This operation is both lock-free and atomic.
-     *
-     * Params:
-     *  here         = The address of the destination variable.
-     *  exchangeWith = The value to exchange.
-     *
-     * Returns:
-     *  The value held previously by `here`.
-     */
-    shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here, V exchangeWith ) pure nothrow @nogc @safe
-        if ( !is(T == class) && !is(T U : U*) && __traits( compiles, { *here = exchangeWith; } ) );
-
-    /// Ditto
-    shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here, shared(V) exchangeWith ) pure nothrow @nogc @safe
-        if ( is(T == class) && __traits( compiles, { *here = exchangeWith; } ) );
-
-    /// Ditto
-    shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here, shared(V)* exchangeWith ) pure nothrow @nogc @safe
-        if ( is(T U : U*) && __traits( compiles, { *here = exchangeWith; } ) );
-
-    /**
-     * Stores 'writeThis' to the memory referenced by 'here' if the value
-     * referenced by 'here' is equal to 'ifThis'.  This operation is both
-     * lock-free and atomic.
-     *
-     * Params:
-     *  here      = The address of the destination variable.
-     *  writeThis = The value to store.
-     *  ifThis    = The comparison value.
-     *
-     * Returns:
-     *  true if the store occurred, false if not.
+     * Not sequenced.
+     * Corresponds to $(LINK2 https://llvm.org/docs/Atomics.html#monotonic, LLVM AtomicOrdering.Monotonic)
+     * and C++11/C11 `memory_order_relaxed`.
      */
-    bool cas(T,V1,V2)( shared(T)* here, const V1 ifThis, V2 writeThis ) pure nothrow @nogc @safe
-        if ( !is(T == class) && !is(T U : U*) && __traits( compiles, { *here = writeThis; } ) );
-
-    /// Ditto
-    bool cas(T,V1,V2)( shared(T)* here, const shared(V1) ifThis, shared(V2) writeThis ) pure nothrow @nogc @safe
-        if ( is(T == class) && __traits( compiles, { *here = writeThis; } ) );
-
-    /// Ditto
-    bool cas(T,V1,V2)( shared(T)* here, const shared(V1)* ifThis, shared(V2)* writeThis ) pure nothrow @nogc @safe
-        if ( is(T U : U*) && __traits( compiles, { *here = writeThis; } ) );
-
-    /**
-    * Stores 'writeThis' to the memory referenced by 'here' if the value
-    * referenced by 'here' is equal to the value referenced by 'ifThis'.
-    * The prior value referenced by 'here' is written to `ifThis` and
-    * returned to the user.  This operation is both lock-free and atomic.
-    *
-    * Params:
-    *  here      = The address of the destination variable.
-    *  writeThis = The value to store.
-    *  ifThis    = The address of the value to compare, and receives the prior value of `here` as output.
-    *
-    * Returns:
-    *  true if the store occurred, false if not.
-    */
-    bool cas(T,V1,V2)( shared(T)* here, V1* ifThis, V2 writeThis ) pure nothrow @nogc @safe
-        if ( !is(T == class) && !is(T U : U*) && __traits( compiles, { *here = writeThis; } ) );
-
-    /// Ditto
-    bool cas(T,V1,V2)( shared(T)* here, shared(V1)* ifThis, shared(V2) writeThis ) pure nothrow @nogc @safe
-        if ( is(T == class) && __traits( compiles, { *here = writeThis; } ) );
-
-    /// Ditto
-    bool cas(T,V1,V2)( shared(T)* here, shared(V1)** ifThis, shared(V2)* writeThis ) pure nothrow @nogc @safe
-        if ( is(T U : U*) && __traits( compiles, { *here = writeThis; } ) );
-
+    raw,
     /**
-     * Loads 'val' from memory and returns it.  The memory barrier specified
-     * by 'ms' is applied to the operation, which is fully sequenced by
-     * default.  Valid memory orders are MemoryOrder.raw, MemoryOrder.acq,
-     * and MemoryOrder.seq.
-     *
-     * Params:
-     *  val = The target variable.
-     *
-     * Returns:
-     *  The value of 'val'.
+     * Hoist-load + hoist-store barrier.
+     * Corresponds to $(LINK2 https://llvm.org/docs/Atomics.html#acquire, LLVM AtomicOrdering.Acquire)
+     * and C++11/C11 `memory_order_acquire`.
      */
-    TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq,T)( ref const shared T val ) pure nothrow @nogc @safe
-    {
-        return TailShared!T.init;
-    }
-
-
+    acq,
     /**
-     * Writes 'newval' into 'val'.  The memory barrier specified by 'ms' is
-     * applied to the operation, which is fully sequenced by default.
-     * Valid memory orders are MemoryOrder.raw, MemoryOrder.rel, and
-     * MemoryOrder.seq.
-     *
-     * Params:
-     *  val    = The target variable.
-     *  newval = The value to store.
+     * Sink-load + sink-store barrier.
+     * Corresponds to $(LINK2 https://llvm.org/docs/Atomics.html#release, LLVM AtomicOrdering.Release)
+     * and C++11/C11 `memory_order_release`.
      */
-    void atomicStore(MemoryOrder ms = MemoryOrder.seq,T,V1)( ref shared T val, V1 newval ) pure nothrow @nogc @safe
-        if ( __traits( compiles, { val = newval; } ) )
-    {
-
-    }
-
-
+    rel,
     /**
-     * Specifies the memory ordering semantics of an atomic operation.
-     *
-     * See_Also:
-     *     $(HTTP en.cppreference.com/w/cpp/atomic/memory_order)
+     * Acquire + release barrier.
+     * Corresponds to $(LINK2 https://llvm.org/docs/Atomics.html#acquirerelease, LLVM AtomicOrdering.AcquireRelease)
+     * and C++11/C11 `memory_order_acq_rel`.
      */
-    enum MemoryOrder
-    {
-        /++
-        Not sequenced.
-        Corresponds to $(LINK2 https://llvm.org/docs/Atomics.html#monotonic, LLVM AtomicOrdering.Monotonic)
-        and C++11/C11 `memory_order_relaxed`.
-        +/
-        raw,
-        /++
-        Hoist-load + hoist-store barrier.
-        Corresponds to $(LINK2 https://llvm.org/docs/Atomics.html#acquire, LLVM AtomicOrdering.Acquire)
-        and C++11/C11 `memory_order_acquire`.
-        +/
-        acq,
-        /++
-        Sink-load + sink-store barrier.
-        Corresponds to $(LINK2 https://llvm.org/docs/Atomics.html#release, LLVM AtomicOrdering.Release)
-        and C++11/C11 `memory_order_release`.
-        +/
-        rel,
-        /++
-        Fully sequenced (acquire + release). Corresponds to
-        $(LINK2 https://llvm.org/docs/Atomics.html#sequentiallyconsistent, LLVM AtomicOrdering.SequentiallyConsistent)
-        and C++11/C11 `memory_order_seq_cst`.
-        +/
-        seq,
-    }
-
+    acq_rel,
     /**
-     * Inserts a full load/store memory fence (on platforms that need it). This ensures
-     * that all loads and stores before a call to this function are executed before any
-     * loads and stores after the call.
+     * Fully sequenced (acquire + release). Corresponds to
+     * $(LINK2 https://llvm.org/docs/Atomics.html#sequentiallyconsistent, LLVM AtomicOrdering.SequentiallyConsistent)
+     * and C++11/C11 `memory_order_seq_cst`.
      */
-    void atomicFence() nothrow @nogc;
+    seq,
 }
-else version (AsmX86_32)
+
+/**
+ * Atomically adds `mod` to the value referenced by `val` and returns the value `val` held previously.
+ * This operation is both lock-free and atomic.
+ *
+ * Params:
+ *  val = Reference to the value to modify.
+ *  mod = The value to add.
+ *
+ * Returns:
+ *  The value held previously by `val`.
+ */
+TailShared!(T) atomicFetchAdd(T)( ref shared T val, size_t mod ) pure nothrow @nogc @trusted
+    if ( __traits(isIntegral, T) )
+in ( atomicValueIsProperlyAligned(val) )
 {
-    // Uses specialized asm for fast fetch and add operations
-    TailShared!(T) atomicFetchAdd(T)( ref shared T val, size_t mod ) pure nothrow @nogc @safe
-        if ( T.sizeof <= 4 )
-    {
-        asm pure nothrow @nogc @trusted
-        {
-            mov EAX, mod;
-            mov EDX, val;
-        }
-        static if (T.sizeof == 1) asm pure nothrow @nogc @trusted { lock; xadd[EDX], AL; }
-        else static if (T.sizeof == 2) asm pure nothrow @nogc @trusted { lock; xadd[EDX], AX; }
-        else static if (T.sizeof == 4) asm pure nothrow @nogc @trusted { lock; xadd[EDX], EAX; }
-    }
+    return core.internal.atomic.atomicFetchAdd( &val, cast(T)mod );
+}
 
-    TailShared!(T) atomicFetchSub(T)( ref shared T val, size_t mod ) pure nothrow @nogc @safe
-        if ( T.sizeof <= 4)
-    {
-        return atomicFetchAdd(val, -mod);
-    }
+/**
+ * Atomically subtracts `mod` from the value referenced by `val` and returns the value `val` held previously.
+ * This operation is both lock-free and atomic.
+ *
+ * Params:
+ *  val = Reference to the value to modify.
+ *  mod = The value to subtract.
+ *
+ * Returns:
+ *  The value held previously by `val`.
+ */
+TailShared!(T) atomicFetchSub(T)( ref shared T val, size_t mod ) pure nothrow @nogc @trusted
+    if ( __traits(isIntegral, T) )
+in ( atomicValueIsProperlyAligned(val) )
+{
+    return core.internal.atomic.atomicFetchSub( &val, cast(T)mod );
+}
 
-    TailShared!T atomicOp(string op, T, V1)( ref shared T val, V1 mod ) pure nothrow @nogc
-        if ( __traits( compiles, mixin( "*cast(T*)&val" ~ op ~ "mod" ) ) )
-    in
-    {
-        assert(atomicValueIsProperlyAligned(val));
-    }
-    do
+/**
+ * Exchange `exchangeWith` with the memory referenced by `here`.
+ * This operation is both lock-free and atomic.
+ *
+ * Params:
+ *  here         = The address of the destination variable.
+ *  exchangeWith = The value to exchange.
+ *
+ * Returns:
+ *  The value held previously by `here`.
+ */
+shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here, V exchangeWith ) pure nothrow @nogc @trusted
+    if ( !is(T == class) && !is(T U : U*) &&  __traits( compiles, { *here = exchangeWith; } ) )
+in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+{
+    static if ( __traits(isFloating, V) )
     {
-        // binary operators
-        //
-        // +    -   *   /   %   ^^  &
-        // |    ^   <<  >>  >>> ~   in
-        // ==   !=  <   <=  >   >=
-        static if (op == "+"  || op == "-"  || op == "*"  || op == "/"   ||
-                   op == "%"  || op == "^^" || op == "&"  || op == "|"   ||
-                   op == "^"  || op == "<<" || op == ">>" || op == ">>>" ||
-                   op == "~"  || // skip "in"
-                   op == "==" || op == "!=" || op == "<"  || op == "<="  ||
-                   op == ">"  || op == ">=")
-        {
-            TailShared!T get = atomicLoad!(MemoryOrder.raw)( val );
-            mixin( "return get " ~ op ~ " mod;" );
-        }
-        else
-        // assignment operators
-        //
-        // +=   -=  *=  /=  %=  ^^= &=
-        // |=   ^=  <<= >>= >>>=    ~=
-        static if ( op == "+=" && __traits(isIntegral, T) && T.sizeof <= 4 && V1.sizeof <= 4)
-        {
-            return cast(T)(atomicFetchAdd!(T)(val, mod) + mod);
-        }
-        else static if ( op == "-=" && __traits(isIntegral, T) && T.sizeof <= 4 && V1.sizeof <= 4)
-        {
-            return cast(T)(atomicFetchSub!(T)(val, mod) - mod);
-        }
-        else static if ( op == "+=" || op == "-="  || op == "*="  || op == "/=" ||
-                   op == "%=" || op == "^^=" || op == "&="  || op == "|=" ||
-                   op == "^=" || op == "<<=" || op == ">>=" || op == ">>>=" ) // skip "~="
-        {
-            TailShared!T get, set;
-
-            do
-            {
-                get = set = atomicLoad!(MemoryOrder.raw)( val );
-                mixin( "set " ~ op ~ " mod;" );
-            } while ( !casByRef( val, get, set ) );
-            return set;
-        }
+        static if ( V.sizeof == 4 )
+            alias I = uint;
+        else static if ( V.sizeof == 8 )
+            alias I = ulong;
         else
-        {
-            static assert( false, "Operation not supported." );
-        }
-    }
-
-    shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here, V exchangeWith ) pure nothrow @nogc @safe
-        if ( !is(T == class) && !is(T U : U*) && __traits( compiles, { *here = exchangeWith; } ) )
-    {
-        return atomicExchangeImpl(here, exchangeWith);
-    }
-
-    shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here, shared(V) exchangeWith ) pure nothrow @nogc @safe
-        if ( is(T == class) && __traits( compiles, { *here = exchangeWith; } ) )
-    {
-        return atomicExchangeImpl(here, exchangeWith);
-    }
-
-    shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here, shared(V)* exchangeWith ) pure nothrow @nogc @safe
-        if ( is(T U : U*) && __traits( compiles, { *here = exchangeWith; } ) )
-    {
-        return atomicExchangeImpl(here, exchangeWith);
-    }
-
-    private shared(T) atomicExchangeImpl(T,V)( shared(T)* here, V exchangeWith ) pure nothrow @nogc @safe
-        in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
-    {
-        static if ( T.sizeof == byte.sizeof )
-        {
-            asm pure nothrow @nogc @trusted
-            {
-                mov AL, exchangeWith;
-                mov ECX, here;
-                xchg [ECX], AL;
-            }
-        }
-        else static if ( T.sizeof == short.sizeof )
-        {
-            asm pure nothrow @nogc @trusted
-            {
-                mov AX, exchangeWith;
-                mov ECX, here;
-                xchg [ECX], AX;
-            }
-        }
-        else static if ( T.sizeof == int.sizeof )
-        {
-            asm pure nothrow @nogc @trusted
-            {
-                mov EAX, exchangeWith;
-                mov ECX, here;
-                xchg [ECX], EAX;
-            }
-            static if ( __traits(isFloating, T) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov exchangeWith, EAX;
-                }
-                return exchangeWith;
-            }
-        }
-        else
-        {
-            static assert( false, "Invalid template type specified." );
-        }
-    }
-
-    bool casByRef(T,V1,V2)( ref T value, V1 ifThis, V2 writeThis ) pure nothrow @nogc @trusted
-    {
-        return cas(&value, ifThis, writeThis);
-    }
-
-    bool cas(T,V1,V2)( shared(T)* here, const V1 ifThis, V2 writeThis ) pure nothrow @nogc @safe
-        if ( !is(T == class) && !is(T U : U*) && __traits( compiles, { *here = writeThis; } ) )
-    {
-        return casImplNoResult(here, ifThis, writeThis);
-    }
-
-    bool cas(T,V1,V2)( shared(T)* here, const shared(V1) ifThis, shared(V2) writeThis ) pure nothrow @nogc @safe
-        if ( is(T == class) && __traits( compiles, { *here = writeThis; } ) )
-    {
-        return casImplNoResult(here, ifThis, writeThis);
-    }
-
-    bool cas(T,V1,V2)( shared(T)* here, const shared(V1)* ifThis, shared(V2)* writeThis ) pure nothrow @nogc @safe
-        if ( is(T U : U*) && __traits( compiles, { *here = writeThis; } ) )
-    {
-        return casImplNoResult(here, ifThis, writeThis);
-    }
-
-    private bool casImplNoResult(T,V1,V2)( shared(T)* here, V1 ifThis, V2 writeThis ) pure nothrow @nogc @safe
-    in
-    {
-        assert( atomicPtrIsProperlyAligned( here ) );
-    }
-    do
-    {
-        static if ( T.sizeof == byte.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 1 Byte CAS
-            //////////////////////////////////////////////////////////////////
-
-            asm pure nothrow @nogc @trusted
-            {
-                mov DL, writeThis;
-                mov AL, ifThis;
-                mov ECX, here;
-                lock; // lock always needed to make this op atomic
-                cmpxchg [ECX], DL;
-                setz AL;
-            }
-        }
-        else static if ( T.sizeof == short.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 2 Byte CAS
-            //////////////////////////////////////////////////////////////////
-
-            asm pure nothrow @nogc @trusted
-            {
-                mov DX, writeThis;
-                mov AX, ifThis;
-                mov ECX, here;
-                lock; // lock always needed to make this op atomic
-                cmpxchg [ECX], DX;
-                setz AL;
-            }
-        }
-        else static if ( T.sizeof == int.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 4 Byte CAS
-            //////////////////////////////////////////////////////////////////
-
-            asm pure nothrow @nogc @trusted
-            {
-                mov EDX, writeThis;
-                mov EAX, ifThis;
-                mov ECX, here;
-                lock; // lock always needed to make this op atomic
-                cmpxchg [ECX], EDX;
-                setz AL;
-            }
-        }
-        else static if ( T.sizeof == long.sizeof && has64BitCAS )
-        {
-
-            //////////////////////////////////////////////////////////////////
-            // 8 Byte CAS on a 32-Bit Processor
-            //////////////////////////////////////////////////////////////////
-
-            asm pure nothrow @nogc @trusted
-            {
-                push EDI;
-                push EBX;
-                lea EDI, writeThis;
-                mov EBX, [EDI];
-                mov ECX, 4[EDI];
-                lea EDI, ifThis;
-                mov EAX, [EDI];
-                mov EDX, 4[EDI];
-                mov EDI, here;
-                lock; // lock always needed to make this op atomic
-                cmpxchg8b [EDI];
-                setz AL;
-                pop EBX;
-                pop EDI;
-            }
-        }
-        else
-        {
-            static assert( false, "Invalid template type specified." );
-        }
-    }
-
-    bool cas(T,V1,V2)( shared(T)* here, V1* ifThis, V2 writeThis ) pure nothrow @nogc @safe
-        if ( !is(T == class) && !is(T U : U*) && __traits( compiles, { *here = writeThis; } ) )
-    {
-        return casImplWithResult(here, *ifThis, writeThis);
-    }
-
-    bool cas(T,V1,V2)( shared(T)* here, shared(V1)* ifThis, shared(V2) writeThis ) pure nothrow @nogc @safe
-        if ( is(T == class) && __traits( compiles, { *here = writeThis; } ) )
-    {
-        return casImplWithResult(here, *ifThis, writeThis);
-    }
-
-    bool cas(T,V1,V2)( shared(T)* here, shared(V1*)* ifThis, shared(V2)* writeThis ) pure nothrow @nogc @safe
-        if ( is(T U : U*) && __traits( compiles, { *here = writeThis; } ) )
-    {
-        return casImplWithResult(here, *ifThis, writeThis);
-    }
-
-    private bool casImplWithResult(T,V1,V2)( shared(T)* here, ref V1 ifThis, V2 writeThis ) pure nothrow @nogc @safe
-    in
-    {
-        assert( atomicPtrIsProperlyAligned( here ) );
-    }
-    do
-    {
-        static if ( T.sizeof == byte.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 1 Byte CAS
-            //////////////////////////////////////////////////////////////////
-
-            asm pure nothrow @nogc @trusted
-            {
-                push EDI;
-                mov DL, writeThis;
-                mov EDI, ifThis;
-                mov AL, [EDI];
-                mov ECX, here;
-                lock; // lock always needed to make this op atomic
-                cmpxchg [ECX], DL;
-                mov [EDI], AL;
-                setz AL;
-                pop EDI;
-            }
-        }
-        else static if ( T.sizeof == short.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 2 Byte CAS
-            //////////////////////////////////////////////////////////////////
-
-            asm pure nothrow @nogc @trusted
-            {
-                push EDI;
-                mov DX, writeThis;
-                mov EDI, ifThis;
-                mov AX, [EDI];
-                mov ECX, here;
-                lock; // lock always needed to make this op atomic
-                cmpxchg [ECX], DX;
-                mov [EDI], AX;
-                setz AL;
-                pop EDI;
-            }
-        }
-        else static if ( T.sizeof == int.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 4 Byte CAS
-            //////////////////////////////////////////////////////////////////
-
-            asm pure nothrow @nogc @trusted
-            {
-                push EDI;
-                mov EDX, writeThis;
-                mov EDI, ifThis;
-                mov EAX, [EDI];
-                mov ECX, here;
-                lock; // lock always needed to make this op atomic
-                cmpxchg [ECX], EDX;
-                mov [EDI], EAX;
-                setz AL;
-                pop EDI;
-            }
-        }
-        else static if ( T.sizeof == long.sizeof && has64BitCAS )
-        {
-
-            //////////////////////////////////////////////////////////////////
-            // 8 Byte CAS on a 32-Bit Processor
-            //////////////////////////////////////////////////////////////////
-
-            asm pure nothrow @nogc @trusted
-            {
-                push EDI;
-                push EBX;
-                lea EDI, writeThis;
-                mov EBX, [EDI];
-                mov ECX, 4[EDI];
-                mov EDI, ifThis;
-                mov EAX, [EDI];
-                mov EDX, 4[EDI];
-                mov EDI, here;
-                lock; // lock always needed to make this op atomic
-                cmpxchg8b [EDI];
-                mov EDI, ifThis;
-                mov [EDI], EAX;
-                mov 4[EDI], EDX;
-                setz AL;
-                pop EBX;
-                pop EDI;
-            }
-        }
-        else
-        {
-            static assert( false, "Invalid template type specified." );
-        }
-    }
-
-
-    enum MemoryOrder
-    {
-        raw,
-        acq,
-        rel,
-        seq,
-    }
-
-
-    private
-    {
-        // NOTE: x86 loads implicitly have acquire semantics so a memory
-        //       barrier is only necessary on releases.
-        template needsLoadBarrier( MemoryOrder ms )
-        {
-            enum bool needsLoadBarrier = ms == MemoryOrder.seq;
-        }
-
-
-        // NOTE: x86 stores implicitly have release semantics so a memory
-        //       barrier is only necessary on acquires.
-        template needsStoreBarrier( MemoryOrder ms )
-        {
-            enum bool needsStoreBarrier = ms == MemoryOrder.seq;
-        }
-    }
-
-
-    TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref const shared T val ) pure nothrow @nogc @safe
-    if (!__traits(isFloating, T))
-    {
-        static assert( ms != MemoryOrder.rel, "invalid MemoryOrder for atomicLoad()" );
-        static assert( __traits(isPOD, T), "argument to atomicLoad() must be POD" );
-
-        static if ( T.sizeof == byte.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 1 Byte Load
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsLoadBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov DL, 0;
-                    mov AL, 0;
-                    mov ECX, val;
-                    lock; // lock always needed to make this op atomic
-                    cmpxchg [ECX], DL;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov AL, [EAX];
-                }
-            }
-        }
-        else static if ( T.sizeof == short.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 2 Byte Load
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsLoadBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov DX, 0;
-                    mov AX, 0;
-                    mov ECX, val;
-                    lock; // lock always needed to make this op atomic
-                    cmpxchg [ECX], DX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov AX, [EAX];
-                }
-            }
-        }
-        else static if ( T.sizeof == int.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 4 Byte Load
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsLoadBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EDX, 0;
-                    mov EAX, 0;
-                    mov ECX, val;
-                    lock; // lock always needed to make this op atomic
-                    cmpxchg [ECX], EDX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov EAX, [EAX];
-                }
-            }
-        }
-        else static if ( T.sizeof == long.sizeof && has64BitCAS )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 8 Byte Load on a 32-Bit Processor
-            //////////////////////////////////////////////////////////////////
-
-            asm pure nothrow @nogc @trusted
-            {
-                push EDI;
-                push EBX;
-                mov EBX, 0;
-                mov ECX, 0;
-                mov EAX, 0;
-                mov EDX, 0;
-                mov EDI, val;
-                lock; // lock always needed to make this op atomic
-                cmpxchg8b [EDI];
-                pop EBX;
-                pop EDI;
-            }
-        }
-        else
-        {
-            static assert( false, "Invalid template type specified." );
-        }
-    }
-
-    void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V1)( ref shared T val, V1 newval ) pure nothrow @nogc @safe
-        if ( __traits( compiles, { val = newval; } ) )
-    {
-        static assert( ms != MemoryOrder.acq, "invalid MemoryOrder for atomicStore()" );
-        static assert( __traits(isPOD, T), "argument to atomicStore() must be POD" );
-
-        static if ( T.sizeof == byte.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 1 Byte Store
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsStoreBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov DL, newval;
-                    lock;
-                    xchg [EAX], DL;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov DL, newval;
-                    mov [EAX], DL;
-                }
-            }
-        }
-        else static if ( T.sizeof == short.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 2 Byte Store
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsStoreBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov DX, newval;
-                    lock;
-                    xchg [EAX], DX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov DX, newval;
-                    mov [EAX], DX;
-                }
-            }
-        }
-        else static if ( T.sizeof == int.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 4 Byte Store
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsStoreBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov EDX, newval;
-                    lock;
-                    xchg [EAX], EDX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov EDX, newval;
-                    mov [EAX], EDX;
-                }
-            }
-        }
-        else static if ( T.sizeof == long.sizeof && has64BitCAS )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 8 Byte Store on a 32-Bit Processor
-            //////////////////////////////////////////////////////////////////
-
-            asm pure nothrow @nogc @trusted
-            {
-                push EDI;
-                push EBX;
-                lea EDI, newval;
-                mov EBX, [EDI];
-                mov ECX, 4[EDI];
-                mov EDI, val;
-                mov EAX, [EDI];
-                mov EDX, 4[EDI];
-            L1: lock; // lock always needed to make this op atomic
-                cmpxchg8b [EDI];
-                jne L1;
-                pop EBX;
-                pop EDI;
-            }
-        }
-        else
-        {
-            static assert( false, "Invalid template type specified." );
-        }
+            static assert( false, "Float type " ~ V.stringof ~ " not supported.");
+        I r = core.internal.atomic.atomicExchange(cast(shared(I)*)here, *cast(I*)&exchangeWith);
+        return *cast(shared(T)*)&r;
     }
+    else
+        return core.internal.atomic.atomicExchange(here, exchangeWith);
+}
 
+/// Ditto
+shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here, shared(V) exchangeWith ) pure nothrow @nogc @safe
+    if ( is(T == class) && __traits( compiles, { *here = exchangeWith; } ) )
+in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+{
+    return core.internal.atomic.atomicExchange(here, exchangeWith);
+}
 
-    void atomicFence() nothrow @nogc @safe
-    {
-        import core.cpuid;
+/// Ditto
+shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here, shared(V)* exchangeWith ) pure nothrow @nogc @safe
+    if ( is(T U : U*) && __traits( compiles, { *here = exchangeWith; } ) )
+in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+{
+    return core.internal.atomic.atomicExchange(here, exchangeWith);
+}
 
-        asm pure nothrow @nogc @trusted
-        {
-            naked;
+/**
+ * Stores 'writeThis' to the memory referenced by 'here' if the value
+ * referenced by 'here' is equal to 'ifThis'.  This operation is both
+ * lock-free and atomic.
+ *
+ * Params:
+ *  here      = The address of the destination variable.
+ *  writeThis = The value to store.
+ *  ifThis    = The comparison value.
+ *
+ * Returns:
+ *  true if the store occurred, false if not.
+ */
+bool cas(T,V1,V2)( shared(T)* here, const V1 ifThis, V2 writeThis ) pure nothrow @nogc @trusted
+    if ( !is(T == class) && !is(T U : U*) &&  __traits( compiles, { *here = writeThis; } ) )
+in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+{
+    static if ( __traits(isFloating, T) )
+    {
+        static assert ( __traits(isFloating, V1) && __traits(isFloating, V2), "Mismatching argument types." );
+        static if ( T.sizeof == 4 )
+            alias IntTy = uint;
+        else static if ( T.sizeof == 8 )
+            alias IntTy = ulong;
+        return atomicCompareExchangeStrongNoResult( cast(IntTy*)here, *cast(IntTy*)&ifThis, *cast(IntTy*)&writeThis );
+    }
+    else
+        return atomicCompareExchangeStrongNoResult!( MemoryOrder.seq, MemoryOrder.seq, T )( cast(T*)here, cast()ifThis, cast()writeThis );
+}
 
-            call sse2;
-            test AL, AL;
-            jne Lcpuid;
+/// Ditto
+bool cas(T,V1,V2)( shared(T)* here, const shared(V1) ifThis, shared(V2) writeThis ) pure nothrow @nogc @safe
+    if ( is(T == class) && __traits( compiles, { *here = writeThis; } ) )
+in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+{
+    return atomicCompareExchangeStrongNoResult( here, ifThis, writeThis );
+}
 
-            // Fast path: We have SSE2, so just use mfence.
-            mfence;
-            jmp Lend;
+/// Ditto
+bool cas(T,V1,V2)( shared(T)* here, const shared(V1)* ifThis, shared(V2)* writeThis ) pure nothrow @nogc @safe
+    if ( is(T U : U*) && __traits( compiles, { *here = writeThis; } ) )
+in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+{
+    return atomicCompareExchangeStrongNoResult( here, ifThis, writeThis );
+}
 
-        Lcpuid:
+/**
+ * Stores 'writeThis' to the memory referenced by 'here' if the value
+ * referenced by 'here' is equal to the value referenced by 'ifThis'.
+ * The prior value referenced by 'here' is written to the location
+ * referenced by `ifThis`.  This operation is both lock-free and atomic.
+ *
+ * Params:
+ *  here      = The address of the destination variable.
+ *  writeThis = The value to store.
+ *  ifThis    = The address of the value to compare; receives the prior value referenced by `here` as output.
+ *
+ * Returns:
+ *  true if the store occurred, false if not.
+ */
+bool cas(T,V)( shared(T)* here, shared(T)* ifThis, V writeThis ) pure nothrow @nogc @trusted
+    if ( !is(T == class) && !is(T U : U*) &&  __traits( compiles, { *here = writeThis; *ifThis = *here; } ) )
+in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+{
+    static if ( __traits(isFloating, T) )
+    {
+        static assert ( __traits(isFloating, V), "Mismatching argument types." );
+        static if ( T.sizeof == 4 )
+            alias IntTy = uint;
+        else static if ( T.sizeof == 8 )
+            alias IntTy = ulong;
+        return atomicCompareExchangeStrong( cast(IntTy*)here, cast(IntTy*)ifThis, *cast(IntTy*)&writeThis );
+    }
+    else
+        return atomicCompareExchangeStrong!( MemoryOrder.seq, MemoryOrder.seq, T )( cast(T*)here, cast(T*)ifThis, cast()writeThis );
+}
 
-            // Slow path: We use cpuid to serialize. This is
-            // significantly slower than mfence, but is the
-            // only serialization facility we have available
-            // on older non-SSE2 chips.
-            push EBX;
+/// Ditto
+bool cas(T,V)( shared(T)* here, shared(T)* ifThis, shared(V) writeThis ) pure nothrow @nogc @trusted
+    if ( is(T == class) && __traits( compiles, { *here = writeThis; *ifThis = *here; } ) )
+in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+{
+    return atomicCompareExchangeStrong( cast(T*)here, cast(T*)ifThis, cast()writeThis );
+}
 
-            mov EAX, 0;
-            cpuid;
+/// Ditto
+bool cas(T,V)( shared(T)* here, shared(T)* ifThis, shared(V)* writeThis ) pure nothrow @nogc @trusted
+    if ( is(T U : U*) && __traits( compiles, { *here = writeThis; *ifThis = *here; } ) )
+in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+{
+    return atomicCompareExchangeStrong!( MemoryOrder.seq, MemoryOrder.seq, T )( cast(T*)here, cast(T*)ifThis, writeThis );
+}
 
-            pop EBX;
+/**
+ * Inserts a full load/store memory fence (on platforms that need it). This ensures
+ * that all loads and stores before a call to this function are executed before any
+ * loads and stores after the call.
+ */
+void atomicFence() nothrow @nogc @safe
+{
+    core.internal.atomic.atomicFence();
+}
 
-        Lend:
 
-            ret;
-        }
-    }
-}
-else version (AsmX86_64)
+/**
+ * Performs the binary operation 'op' on val using 'mod' as the modifier.
+ *
+ * Params:
+ *  val = The target variable.
+ *  mod = The modifier to apply.
+ *
+ * Returns:
+ *  The result of the operation.
+ */
+TailShared!T atomicOp(string op, T, V1)( ref shared T val, V1 mod ) pure nothrow @nogc @safe
+    if ( __traits( compiles, mixin( "*cast(T*)&val" ~ op ~ "mod" ) ) )
+in ( atomicValueIsProperlyAligned( val ) )
 {
-    // Uses specialized asm for fast fetch and add operations
-    TailShared!(T) atomicFetchAdd(T)( ref shared T val, size_t mod ) pure nothrow @nogc @trusted
-        if ( __traits(isIntegral, T) )
-    in ( atomicValueIsProperlyAligned(val) )
-    {
-        return atomicFetchAddImpl( val, mod );
-    }
-    TailShared!(T) atomicFetchAddImpl(T)( ref shared T val, size_t mod ) pure nothrow @nogc @trusted
-    {
-        asm pure nothrow @nogc @trusted { naked; }
-        version (Windows)
-        {
-            asm pure nothrow @nogc @trusted { mov RAX, RCX; }
-            static if (T.sizeof == 1) asm pure nothrow @nogc @trusted { lock; xadd[RDX], AL; }
-            else static if (T.sizeof == 2) asm pure nothrow @nogc @trusted { lock; xadd[RDX], AX; }
-            else static if (T.sizeof == 4) asm pure nothrow @nogc @trusted { lock; xadd[RDX], EAX; }
-            else static if (T.sizeof == 8) asm pure nothrow @nogc @trusted { lock; xadd[RDX], RAX; }
-        }
-        else
-        {
-            asm pure nothrow @nogc @trusted { mov RAX, RDI; }
-            static if (T.sizeof == 1) asm pure nothrow @nogc @trusted { lock; xadd[RSI], AL; }
-            else static if (T.sizeof == 2) asm pure nothrow @nogc @trusted { lock; xadd[RSI], AX; }
-            else static if (T.sizeof == 4) asm pure nothrow @nogc @trusted { lock; xadd[RSI], EAX; }
-            else static if (T.sizeof == 8) asm pure nothrow @nogc @trusted { lock; xadd[RSI], RAX; }
-        }
-        asm pure nothrow @nogc @trusted { ret; }
+    // binary operators
+    //
+    // +    -   *   /   %   ^^  &
+    // |    ^   <<  >>  >>> ~   in
+    // ==   !=  <   <=  >   >=
+    static if ( op == "+"  || op == "-"  || op == "*"  || op == "/"   ||
+                op == "%"  || op == "^^" || op == "&"  || op == "|"   ||
+                op == "^"  || op == "<<" || op == ">>" || op == ">>>" ||
+                op == "~"  || // skip "in"
+                op == "==" || op == "!=" || op == "<"  || op == "<="  ||
+                op == ">"  || op == ">=" )
+    {
+        TailShared!T get = atomicLoad!(MemoryOrder.raw)( val );
+        mixin( "return get " ~ op ~ " mod;" );
     }
-
-    TailShared!(T) atomicFetchSub(T)( ref shared T val, size_t mod ) pure nothrow @nogc @safe
-        if ( __traits(isIntegral, T) )
-    in ( atomicValueIsProperlyAligned(val) )
+    else
+    // assignment operators
+    //
+    // +=   -=  *=  /=  %=  ^^= &=
+    // |=   ^=  <<= >>= >>>=    ~=
+    static if ( op == "+=" && __traits(isIntegral, T) && __traits(isIntegral, V1) && T.sizeof <= size_t.sizeof && V1.sizeof <= size_t.sizeof)
     {
-        return atomicFetchAddImpl(val, -mod);
+        return cast(T)( atomicFetchAdd!(T)( val, mod ) + mod );
     }
-
-    TailShared!T atomicOp(string op, T, V1)( ref shared T val, V1 mod ) pure nothrow @nogc
-        if ( __traits( compiles, mixin( "*cast(T*)&val" ~ op ~ "mod" ) ) )
-    in
+    else static if ( op == "-=" && __traits(isIntegral, T) && __traits(isIntegral, V1) && T.sizeof <= size_t.sizeof && V1.sizeof <= size_t.sizeof)
     {
-        assert( atomicValueIsProperlyAligned(val));
+        return cast(T)( atomicFetchSub!(T)( val, mod ) - mod );
     }
-    do
+    else static if ( op == "+=" || op == "-="  || op == "*="  || op == "/=" ||
+                op == "%=" || op == "^^=" || op == "&="  || op == "|=" ||
+                op == "^=" || op == "<<=" || op == ">>=" || op == ">>>=" ) // skip "~="
     {
-        // binary operators
-        //
-        // +    -   *   /   %   ^^  &
-        // |    ^   <<  >>  >>> ~   in
-        // ==   !=  <   <=  >   >=
-        static if ( op == "+"  || op == "-"  || op == "*"  || op == "/"   ||
-                   op == "%"  || op == "^^" || op == "&"  || op == "|"   ||
-                   op == "^"  || op == "<<" || op == ">>" || op == ">>>" ||
-                   op == "~"  || // skip "in"
-                   op == "==" || op == "!=" || op == "<"  || op == "<="  ||
-                   op == ">"  || op == ">=" )
-        {
-            TailShared!T get = atomicLoad!(MemoryOrder.raw)( val );
-            mixin( "return get " ~ op ~ " mod;" );
-        }
-        else
-        // assignment operators
-        //
-        // +=   -=  *=  /=  %=  ^^= &=
-        // |=   ^=  <<= >>= >>>=    ~=
-        static if ( op == "+=" && __traits(isIntegral, T) && __traits(isIntegral, V1))
-        {
-            return cast(T)(atomicFetchAdd!(T)(val, mod) + mod);
-        }
-        else static if ( op == "-=" && __traits(isIntegral, T) && __traits(isIntegral, V1))
-        {
-            return cast(T)(atomicFetchSub!(T)(val, mod) - mod);
-        }
-        else static if ( op == "+=" || op == "-="  || op == "*="  || op == "/=" ||
-                   op == "%=" || op == "^^=" || op == "&="  || op == "|=" ||
-                   op == "^=" || op == "<<=" || op == ">>=" || op == ">>>=" ) // skip "~="
-        {
-            TailShared!T get, set;
-
-            do
-            {
-                get = set = atomicLoad!(MemoryOrder.raw)( val );
-                mixin( "set " ~ op ~ " mod;" );
-            } while ( !casByRef( val, get, set ) );
-            return set;
-        }
-        else
-        {
-            static assert( false, "Operation not supported." );
-        }
-    }
+        TailShared!T get, set;
 
-    shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here, V exchangeWith ) pure nothrow @nogc @trusted
-        if ( !is(T == class) && !is(T U : U*) &&  __traits( compiles, { *here = exchangeWith; } ) )
-    in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
-    {
-        static if ( __traits(isFloating, V) )
+        do
         {
-            static if ( V.sizeof == 4 )
-                alias I = uint;
-            else static if ( V.sizeof == 8 )
-                alias I = ulong;
-            else
-                static assert( false, "Float type " ~ V.stringof ~ " not supported.");
-            I r = atomicExchangeImpl(cast(shared(I)*)here, *cast(I*)&exchangeWith);
-            return *cast(shared(T)*)&r;
-        }
-        else
-            return atomicExchangeImpl(here, exchangeWith);
+            get = set = atomicLoad!(MemoryOrder.raw)( val );
+            mixin( "set " ~ op ~ " mod;" );
+        } while ( !casByRef( val, get, set ) );
+        return set;
     }
-
-    shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here, shared(V) exchangeWith ) pure nothrow @nogc @safe
-        if ( is(T == class) && __traits( compiles, { *here = exchangeWith; } ) )
-    in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+    else
     {
-        return atomicExchangeImpl(here, exchangeWith);
+        static assert( false, "Operation not supported." );
     }
+}
 
-    shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here, shared(V)* exchangeWith ) pure nothrow @nogc @safe
-        if ( is(T U : U*) && __traits( compiles, { *here = exchangeWith; } ) )
-    in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
-    {
-        return atomicExchangeImpl(here, exchangeWith);
-    }
 
-    private shared(T) atomicExchangeImpl(T,V)( shared(T)* here, V exchangeWith ) pure nothrow @nogc @safe
+version (CoreDdoc)
+{
+    /**
+     * Loads 'val' from memory and returns it.  The memory barrier specified
+     * by 'ms' is applied to the operation, which is fully sequenced by
+     * default.  Valid memory orders are MemoryOrder.raw, MemoryOrder.acq,
+     * and MemoryOrder.seq.
+     *
+     * Params:
+     *  val = The target variable.
+     *
+     * Returns:
+     *  The value of 'val'.
+     */
+    TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq,T)( ref const shared T val ) pure nothrow @nogc @safe
     {
-        // Windows: here = RDX, exchangeWith = RCX
-        // Posix:   here = RSI, exchangeWith = RDI
-        static if ( T.sizeof == byte.sizeof )
-        {
-            version (Windows)
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    xchg [RDX], CL;
-                    mov AL, CL;
-                    ret;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    xchg [RSI], DIL;
-                    mov AL, DIL;
-                    ret;
-                }
-            }
-        }
-        else static if ( T.sizeof == short.sizeof )
-        {
-            version (Windows)
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    xchg [RDX], CX;
-                    mov AX, CX;
-                    ret;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    xchg [RSI], DI;
-                    mov AX, DI;
-                    ret;
-                }
-            }
-        }
-        else static if ( T.sizeof == int.sizeof )
-        {
-            version (Windows)
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    xchg [RDX], ECX;
-                    mov EAX, ECX;
-                    ret;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    xchg [RSI], EDI;
-                    mov EAX, EDI;
-                    ret;
-                }
-            }
-        }
-        else static if ( T.sizeof == long.sizeof )
-        {
-            version (Windows)
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    xchg [RDX], RCX;
-                    mov RAX, RCX;
-                    ret;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    xchg [RSI], RDI;
-                    mov RAX, RDI;
-                    ret;
-                }
-            }
-        }
-        else
-        {
-            static assert( false, "Invalid template type specified." );
-        }
+        return TailShared!T.init;
     }
 
-    bool casByRef(T,V1,V2)( ref T value, V1 ifThis, V2 writeThis ) pure nothrow @nogc @trusted
-    {
-        return cas(&value, ifThis, writeThis);
-    }
 
-    bool cas(T,V1,V2)( shared(T)* here, const V1 ifThis, V2 writeThis ) pure nothrow @nogc @trusted
-        if ( !is(T == class) && !is(T U : U*) &&  __traits( compiles, { *here = writeThis; } ) )
-    in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+    /**
+     * Writes 'newval' into 'val'.  The memory barrier specified by 'ms' is
+     * applied to the operation, which is fully sequenced by default.
+     * Valid memory orders are MemoryOrder.raw, MemoryOrder.rel, and
+     * MemoryOrder.seq.
+     *
+     * Params:
+     *  val    = The target variable.
+     *  newval = The value to store.
+     */
+    void atomicStore(MemoryOrder ms = MemoryOrder.seq,T,V1)( ref shared T val, V1 newval ) pure nothrow @nogc @safe
+        if ( __traits( compiles, { val = newval; } ) )
     {
-        static assert (V1.sizeof == V2.sizeof, "Mismatching argument sizes");
-        static if ( V2.sizeof == 4 && __traits(isFloating, V2) )
-        {
-            uint cmp = *cast(uint*)&ifThis;
-            uint arg = *cast(uint*)&writeThis;
-        }
-        else static if ( V2.sizeof == 8 && __traits(isFloating, V2) )
-        {
-            ulong cmp = *cast(ulong*)&ifThis;
-            ulong arg = *cast(ulong*)&writeThis;
-        }
-        else
-        {
-            alias cmp = ifThis;
-            alias arg = writeThis;
-        }
-        return casImplNoResult(here, cmp, arg);
-    }
 
-    bool cas(T,V1,V2)( shared(T)* here, const shared(V1) ifThis, shared(V2) writeThis ) pure nothrow @nogc @safe
-        if ( is(T == class) && __traits( compiles, { *here = writeThis; } ) )
-    in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
-    {
-        return casImplNoResult(here, ifThis, writeThis);
     }
-
-    bool cas(T,V1,V2)( shared(T)* here, const shared(V1)* ifThis, shared(V2)* writeThis ) pure nothrow @nogc @safe
-        if ( is(T U : U*) && __traits( compiles, { *here = writeThis; } ) )
-    in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+}
+else version (AsmX86_32)
+{
+    TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref const shared T val ) pure nothrow @nogc @safe
+    if (!__traits(isFloating, T))
     {
-        return casImplNoResult(here, ifThis, writeThis);
-    }
+        static assert( ms != MemoryOrder.rel, "invalid MemoryOrder for atomicLoad()" );
+        static assert( __traits(isPOD, T), "argument to atomicLoad() must be POD" );
 
-    private bool casImplNoResult(T,V1,V2)( shared(T)* here, V1 ifThis, V2 writeThis ) pure nothrow @nogc @safe
-    {
-        // Windows: here = *R8, ifThis = RDX, writeThis = RCX
-        // Posix:   here = *RDX, ifThis = RSI, writeThis = RDI
         static if ( T.sizeof == byte.sizeof )
         {
             //////////////////////////////////////////////////////////////////
-            // 1 Byte CAS
+            // 1 Byte Load
             //////////////////////////////////////////////////////////////////
-            version (Windows)
+
+            static if ( needsLoadBarrier!(ms) )
             {
                 asm pure nothrow @nogc @trusted
                 {
-                    naked;
-                    mov AL, DL;
-                    lock; cmpxchg [R8], CL;
-                    setz AL;
-                    ret;
+                    mov DL, 0;
+                    mov AL, 0;
+                    mov ECX, val;
+                    lock; // lock always needed to make this op atomic
+                    cmpxchg [ECX], DL;
                 }
             }
             else
             {
                 asm pure nothrow @nogc @trusted
                 {
-                    naked;
-                    mov AL, SIL;
-                    lock; cmpxchg [RDX], DIL;
-                    setz AL;
-                    ret;
+                    mov EAX, val;
+                    mov AL, [EAX];
                 }
             }
         }
         else static if ( T.sizeof == short.sizeof )
         {
             //////////////////////////////////////////////////////////////////
-            // 2 Byte CAS
+            // 2 Byte Load
             //////////////////////////////////////////////////////////////////
-            version (Windows)
+
+            static if ( needsLoadBarrier!(ms) )
             {
                 asm pure nothrow @nogc @trusted
                 {
-                    naked;
-                    mov AX, DX;
-                    lock; cmpxchg [R8], CX;
-                    setz AL;
-                    ret;
+                    mov DX, 0;
+                    mov AX, 0;
+                    mov ECX, val;
+                    lock; // lock always needed to make this op atomic
+                    cmpxchg [ECX], DX;
                 }
             }
             else
             {
                 asm pure nothrow @nogc @trusted
                 {
-                    naked;
-                    mov AX, SI;
-                    lock; cmpxchg [RDX], DI;
-                    setz AL;
-                    ret;
+                    mov EAX, val;
+                    mov AX, [EAX];
                 }
             }
         }
         else static if ( T.sizeof == int.sizeof )
         {
             //////////////////////////////////////////////////////////////////
-            // 4 Byte CAS
-            //////////////////////////////////////////////////////////////////
-            version (Windows)
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    mov EAX, EDX;
-                    lock; cmpxchg [R8], ECX;
-                    setz AL;
-                    ret;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    mov EAX, ESI;
-                    lock; cmpxchg [RDX], EDI;
-                    setz AL;
-                    ret;
-                }
-            }
-        }
-        else static if ( T.sizeof == long.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 8 Byte CAS on a 64-Bit Processor
+            // 4 Byte Load
             //////////////////////////////////////////////////////////////////
-            version (Windows)
+
+            static if ( needsLoadBarrier!(ms) )
             {
                 asm pure nothrow @nogc @trusted
                 {
-                    naked;
-                    mov RAX, RDX;
-                    lock; cmpxchg [R8], RCX;
-                    setz AL;
-                    ret;
+                    mov EDX, 0;
+                    mov EAX, 0;
+                    mov ECX, val;
+                    lock; // lock always needed to make this op atomic
+                    cmpxchg [ECX], EDX;
                 }
             }
             else
             {
                 asm pure nothrow @nogc @trusted
                 {
-                    naked;
-                    mov RAX, RSI;
-                    lock; cmpxchg [RDX], RDI;
-                    setz AL;
-                    ret;
+                    mov EAX, val;
+                    mov EAX, [EAX];
                 }
             }
         }
-        else static if ( T.sizeof == long.sizeof*2 && has128BitCAS)
+        else static if ( T.sizeof == long.sizeof && has64BitCAS )
         {
             //////////////////////////////////////////////////////////////////
-            // 16 Byte CAS on a 64-Bit Processor
+            // 8 Byte Load on a 32-Bit Processor
             //////////////////////////////////////////////////////////////////
-
-            // Windows: here = *R8, ifThis = *RDX, writeThis = *RCX
-            // Posix:   here = *R8, ifThis = RCX:RDX, writeThis = RSI:RDI
-            version (Windows)
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    push RBX;
-                    mov RAX, [RDX];
-                    mov RDX, 8[RDX];
-                    mov RBX, [RCX];
-                    mov RCX, 8[RCX];
-                    lock; cmpxchg16b [R8];
-                    setz AL;
-                    pop RBX;
-                    ret;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    push RBX;
-                    mov RAX, RDX;
-                    mov RDX, RCX;
-                    mov RBX, RDI;
-                    mov RCX, RSI;
-                    lock; cmpxchg16b [R8];
-                    setz AL;
-                    pop RBX;
-                    ret;
-                }
+
+            asm pure nothrow @nogc @trusted
+            {
+                push EDI;
+                push EBX;
+                mov EBX, 0;
+                mov ECX, 0;
+                mov EAX, 0;
+                mov EDX, 0;
+                mov EDI, val;
+                lock; // lock always needed to make this op atomic
+                cmpxchg8b [EDI];
+                pop EBX;
+                pop EDI;
             }
         }
         else
@@ -1454,243 +494,111 @@ else version (AsmX86_64)
         }
     }
 
-    bool cas(T,V1,V2)( shared(T)* here, V1* ifThis, V2 writeThis ) pure nothrow @nogc @trusted
-        if ( !is(T == class) && !is(T U : U*) &&  __traits( compiles, { *here = writeThis; } ) )
-    in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
-    {
-        static if ( V2.sizeof == 4 && __traits(isFloating, V2) )
-            uint arg = *cast(uint*)&writeThis;
-        else static if ( V2.sizeof == 8 && __traits(isFloating, V2) )
-            ulong arg = *cast(ulong*)&writeThis;
-        else
-            alias arg = writeThis;
-        return casImplWithResult(here, *ifThis, arg);
-    }
-
-    bool cas(T,V1,V2)( shared(T)* here, shared(V1)* ifThis, shared(V2) writeThis ) pure nothrow @nogc @safe
-        if ( is(T == class) && __traits( compiles, { *here = writeThis; } ) )
-    in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
-    {
-        return casImplWithResult(here, *ifThis, writeThis);
-    }
-
-    bool cas(T,V1,V2)( shared(T)* here, shared(V1*)* ifThis, shared(V2)* writeThis ) pure nothrow @nogc @safe
-        if ( is(T U : U*) && __traits( compiles, { *here = writeThis; } ) )
-    in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
+    void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V1)( ref shared T val, V1 newval ) pure nothrow @nogc @safe
+        if ( __traits( compiles, { val = newval; } ) )
     {
-        return casImplWithResult(here, *ifThis, writeThis);
-    }
+        static assert( ms != MemoryOrder.acq, "invalid MemoryOrder for atomicStore()" );
+        static assert( __traits(isPOD, T), "argument to atomicStore() must be POD" );
 
-    private bool casImplWithResult(T,V1,V2)( shared(T)* here, ref V1 ifThis, V2 writeThis ) pure nothrow @nogc @safe
-    {
-        // Windows: here = *R8, ifThis = *RDX, writeThis = RCX
-        // Posix:   here = *RDX, ifThis = *RSI, writeThis = RDI
         static if ( T.sizeof == byte.sizeof )
         {
             //////////////////////////////////////////////////////////////////
-            // 1 Byte CAS
+            // 1 Byte Store
             //////////////////////////////////////////////////////////////////
-            version (Windows)
+
+            static if ( needsStoreBarrier!(ms) )
             {
                 asm pure nothrow @nogc @trusted
                 {
-                    naked;
-                    mov AL, [RDX];
-                    lock; cmpxchg [R8], CL;
-                    jne compare_fail;
-                    mov AL, 1;
-                    ret;
-                compare_fail:
-                    mov [RDX], AL;
-                    xor AL, AL;
-                    ret;
+                    mov EAX, val;
+                    mov DL, newval;
+                    lock;
+                    xchg [EAX], DL;
                 }
             }
             else
             {
                 asm pure nothrow @nogc @trusted
                 {
-                    naked;
-                    mov AL, [RSI];
-                    lock; cmpxchg [RDX], DIL;
-                    jne compare_fail;
-                    mov AL, 1;
-                    ret;
-                compare_fail:
-                    mov [RSI], AL;
-                    xor AL, AL;
-                    ret;
+                    mov EAX, val;
+                    mov DL, newval;
+                    mov [EAX], DL;
                 }
             }
         }
         else static if ( T.sizeof == short.sizeof )
         {
             //////////////////////////////////////////////////////////////////
-            // 2 Byte CAS
+            // 2 Byte Store
             //////////////////////////////////////////////////////////////////
-            version (Windows)
+
+            static if ( needsStoreBarrier!(ms) )
             {
                 asm pure nothrow @nogc @trusted
                 {
-                    naked;
-                    mov AX, [RDX];
-                    lock; cmpxchg [R8], CX;
-                    jne compare_fail;
-                    mov AL, 1;
-                    ret;
-                compare_fail:
-                    mov [RDX], AX;
-                    xor AL, AL;
-                    ret;
+                    mov EAX, val;
+                    mov DX, newval;
+                    lock;
+                    xchg [EAX], DX;
                 }
             }
             else
             {
                 asm pure nothrow @nogc @trusted
                 {
-                    naked;
-                    mov AX, [RSI];
-                    lock; cmpxchg [RDX], DI;
-                    jne compare_fail;
-                    mov AL, 1;
-                    ret;
-                compare_fail:
-                    mov [RSI], AX;
-                    xor AL, AL;
-                    ret;
+                    mov EAX, val;
+                    mov DX, newval;
+                    mov [EAX], DX;
                 }
             }
         }
         else static if ( T.sizeof == int.sizeof )
         {
             //////////////////////////////////////////////////////////////////
-            // 4 Byte CAS
-            //////////////////////////////////////////////////////////////////
-            version (Windows)
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    mov EAX, [RDX];
-                    lock; cmpxchg [R8], ECX;
-                    jne compare_fail;
-                    mov AL, 1;
-                    ret;
-                compare_fail:
-                    mov [RDX], EAX;
-                    xor AL, AL;
-                    ret;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    mov EAX, [RSI];
-                    lock; cmpxchg [RDX], EDI;
-                    jne compare_fail;
-                    mov AL, 1;
-                    ret;
-                compare_fail:
-                    mov [RSI], EAX;
-                    xor AL, AL;
-                    ret;
-                }
-            }
-        }
-        else static if ( T.sizeof == long.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 8 Byte CAS on a 64-Bit Processor
+            // 4 Byte Store
             //////////////////////////////////////////////////////////////////
-            version (Windows)
+
+            static if ( needsStoreBarrier!(ms) )
             {
                 asm pure nothrow @nogc @trusted
                 {
-                    naked;
-                    mov RAX, [RDX];
-                    lock; cmpxchg [R8], RCX;
-                    jne compare_fail;
-                    mov AL, 1;
-                    ret;
-                compare_fail:
-                    mov [RDX], RAX;
-                    xor AL, AL;
-                    ret;
+                    mov EAX, val;
+                    mov EDX, newval;
+                    lock;
+                    xchg [EAX], EDX;
                 }
             }
             else
             {
                 asm pure nothrow @nogc @trusted
                 {
-                    naked;
-                    mov RAX, [RSI];
-                    lock; cmpxchg [RDX], RDI;
-                    jne compare_fail;
-                    mov AL, 1;
-                    ret;
-                compare_fail:
-                    mov [RSI], RAX;
-                    xor AL, AL;
-                    ret;
+                    mov EAX, val;
+                    mov EDX, newval;
+                    mov [EAX], EDX;
                 }
             }
         }
-        else static if ( T.sizeof == long.sizeof*2 && has128BitCAS)
+        else static if ( T.sizeof == long.sizeof && has64BitCAS )
         {
             //////////////////////////////////////////////////////////////////
-            // 16 Byte CAS on a 64-Bit Processor
+            // 8 Byte Store on a 32-Bit Processor
             //////////////////////////////////////////////////////////////////
 
-            // Windows: here = *R8, ifThis = *RDX, writeThis = *RCX
-            // Posix:   here = *RCX, ifThis = *RDX, writeThis = RSI:RDI
-            version (Windows)
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    push RBX;
-                    mov R9, RDX;
-                    mov RAX, [RDX];
-                    mov RDX, 8[RDX];
-                    mov RBX, [RCX];
-                    mov RCX, 8[RCX];
-                    lock; cmpxchg16b [R8];
-                    pop RBX;
-                    jne compare_fail;
-                    mov AL, 1;
-                    ret;
-                compare_fail:
-                    mov [R9], RAX;
-                    mov 8[R9], RDX;
-                    xor AL, AL;
-                    ret;
-                }
-            }
-            else
+            asm pure nothrow @nogc @trusted
             {
-                asm pure nothrow @nogc @trusted
-                {
-                    naked;
-                    push RBX;
-                    mov R8, RCX;
-                    mov R9, RDX;
-                    mov RAX, [RDX];
-                    mov RDX, 8[RDX];
-                    mov RBX, RDI;
-                    mov RCX, RSI;
-                    lock; cmpxchg16b [R8];
-                    pop RBX;
-                    jne compare_fail;
-                    mov AL, 1;
-                    ret;
-                compare_fail:
-                    mov [R9], RAX;
-                    mov 8[R9], RDX;
-                    xor AL, AL;
-                    ret;
-                }
+                push EDI;
+                push EBX;
+                lea EDI, newval;
+                mov EBX, [EDI];
+                mov ECX, 4[EDI];
+                mov EDI, val;
+                mov EAX, [EDI];
+                mov EDX, 4[EDI];
+            L1: lock; // lock always needed to make this op atomic
+                cmpxchg8b [EDI];
+                jne L1;
+                pop EBX;
+                pop EDI;
             }
         }
         else
@@ -1698,36 +606,9 @@ else version (AsmX86_64)
             static assert( false, "Invalid template type specified." );
         }
     }
-
-
-    enum MemoryOrder
-    {
-        raw,
-        acq,
-        rel,
-        seq,
-    }
-
-
-    private
-    {
-        // NOTE: x86 loads implicitly have acquire semantics so a memory
-        //       barrier is only necessary on releases.
-        template needsLoadBarrier( MemoryOrder ms )
-        {
-            enum bool needsLoadBarrier = ms == MemoryOrder.seq;
-        }
-
-
-        // NOTE: x86 stores implicitly have release semantics so a memory
-        //       barrier is only necessary on acquires.
-        template needsStoreBarrier( MemoryOrder ms )
-        {
-            enum bool needsStoreBarrier = ms == MemoryOrder.seq;
-        }
-    }
-
-
+}
+else version (AsmX86_64)
+{
     TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref const shared T val ) pure nothrow @nogc @safe
     if (!__traits(isFloating, T))
     {
@@ -2062,19 +943,6 @@ else version (AsmX86_64)
             static assert( false, "Invalid template type specified." );
         }
     }
-
-
-    void atomicFence() nothrow @nogc @safe
-    {
-        // SSE2 is always present in 64-bit x86 chips.
-        asm nothrow @nogc @trusted
-        {
-            naked;
-
-            mfence;
-            ret;
-        }
-    }
 }
 
 // This is an ABI adapter that works on all architectures.  It type puns
@@ -2082,7 +950,7 @@ else version (AsmX86_64)
 // them back.  This is necessary so that they get returned in floating
 // point instead of integer registers.
 TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref const shared T val ) pure nothrow @nogc @trusted
-if (__traits(isFloating, T))
+    if (__traits(isFloating, T))
 {
     static if (T.sizeof == int.sizeof)
     {
@@ -2104,6 +972,144 @@ if (__traits(isFloating, T))
     }
 }
 
+private
+{
+    // NOTE: x86 loads implicitly have acquire semantics, so an explicit
+    //       barrier is only requested for MemoryOrder.seq.
+    template needsLoadBarrier( MemoryOrder ms )
+    {
+        enum bool needsLoadBarrier = ms == MemoryOrder.seq; // true only for MemoryOrder.seq
+    }
+
+
+    // NOTE: x86 stores implicitly have release semantics, so an explicit
+    //       barrier is only requested for MemoryOrder.seq.
+    template needsStoreBarrier( MemoryOrder ms )
+    {
+        enum bool needsStoreBarrier = ms == MemoryOrder.seq; // true only for MemoryOrder.seq
+    }
+
+    // @trusted by-reference wrapper around cas(). TODO: it'd be nice if we had @trusted scopes; we could remove this...
+    bool casByRef(T,V1,V2)( ref T value, V1 ifThis, V2 writeThis ) pure nothrow @nogc @trusted
+    {
+        return cas( &value, ifThis, writeThis ); // forwards to the pointer-based cas with the address of `value`
+    }
+
+    /* Construct a type with a shared tail, and if possible with an unshared
+    head. */
+    template TailShared(U) if (!is(U == shared))
+    {
+        alias TailShared = .TailShared!(shared U); // unshared input: add `shared` and defer to the overload for shared types
+    }
+    template TailShared(S) if (is(S == shared)) // S is shared here; U (below) is S with the head `shared` removed
+    {
+        // Get the unshared variant of S.
+        static if (is(S U == shared U)) {}
+        else static assert(false, "Should never be triggered. The `static " ~
+            "if` declares `U` as the unshared version of the shared type " ~
+            "`S`. `S` is explicitly declared as shared, so getting `U` " ~
+            "should always work.");
+
+        static if (is(S : U)) // shared S converts implicitly to U: just drop the head `shared`
+            alias TailShared = U;
+        else static if (is(S == struct)) // otherwise a struct gets a wrapper type
+        {
+            enum implName = () { // evaluated at compile time: name of the wrapper's S member
+                /* Start with "_impl". If S has a field with that name, append
+                underscores until the clash is resolved. */
+                string name = "_impl";
+                string[] fieldNames;
+                static foreach (alias field; S.tupleof)
+                {
+                    fieldNames ~= __traits(identifier, field);
+                }
+                static bool canFind(string[] haystack, string needle)
+                {
+                    foreach (candidate; haystack)
+                    {
+                        if (candidate == needle) return true;
+                    }
+                    return false;
+                }
+                while (canFind(fieldNames, name)) name ~= "_";
+                return name;
+            } ();
+            struct TailShared // wrapper holding an S, with one generated getter per field of S
+            {
+                static foreach (i, alias field; S.tupleof)
+                {
+                    /* On @trusted: This is casting the field from shared(Foo)
+                    to TailShared!Foo. The cast is safe because the field has
+                    been loaded and is not shared anymore. */
+                    mixin("
+                        @trusted @property
+                        ref " ~ __traits(identifier, field) ~ "()
+                        {
+                            alias R = TailShared!(typeof(field));
+                            return * cast(R*) &" ~ implName ~ ".tupleof[i];
+                        }
+                    ");
+                }
+                mixin("
+                    S " ~ implName ~ ";
+                    alias " ~ implName ~ " this;
+                "); // declares the S member and makes it the wrapper's `alias this`
+            }
+        }
+        else // anything else stays as the shared type S
+            alias TailShared = S;
+    }
+    @safe unittest // compile-time checks of TailShared on scalars, pointers, slices, structs and classes
+    {
+        // No tail (no indirections) -> fully unshared.
+
+        static assert(is(TailShared!int == int));
+        static assert(is(TailShared!(shared int) == int));
+
+        static struct NoIndir { int i; }
+        static assert(is(TailShared!NoIndir == NoIndir));
+        static assert(is(TailShared!(shared NoIndir) == NoIndir));
+
+        // Tail can be independently shared or is already -> tail-shared.
+
+        static assert(is(TailShared!(int*) == shared(int)*));
+        static assert(is(TailShared!(shared int*) == shared(int)*));
+        static assert(is(TailShared!(shared(int)*) == shared(int)*));
+
+        static assert(is(TailShared!(int[]) == shared(int)[]));
+        static assert(is(TailShared!(shared int[]) == shared(int)[]));
+        static assert(is(TailShared!(shared(int)[]) == shared(int)[]));
+
+        static struct S1 { shared int* p; }
+        static assert(is(TailShared!S1 == S1));
+        static assert(is(TailShared!(shared S1) == S1));
+
+        static struct S2 { shared(int)* p; }
+        static assert(is(TailShared!S2 == S2));
+        static assert(is(TailShared!(shared S2) == S2));
+
+        // Tail follows shared-ness of head -> fully shared.
+
+        static class C { int i; }
+        static assert(is(TailShared!C == shared C));
+        static assert(is(TailShared!(shared C) == shared C));
+
+        /* However, structs get a wrapper that has getters which cast to
+        TailShared. */
+
+        static struct S3 { int* p; int _impl; int _impl_; int _impl__; } // _impl* fields clash with the wrapper's default member name
+        static assert(!is(TailShared!S3 : S3));
+        static assert(is(TailShared!S3 : shared S3));
+        static assert(is(TailShared!(shared S3) == TailShared!S3));
+
+        static struct S4 { shared(int)** p; }
+        static assert(!is(TailShared!S4 : S4));
+        static assert(is(TailShared!S4 : shared S4));
+        static assert(is(TailShared!(shared S4) == TailShared!S4));
+    }
+}
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // Unit Tests
 ////////////////////////////////////////////////////////////////////////////////
@@ -2148,7 +1154,7 @@ version (unittest)
 
         atom = cast(shared(T))null;
 
-        T arg = base;
+        shared(T) arg = base;
         assert( cas( &atom, &arg, val ), T.stringof );
         assert( arg is base, T.stringof );
         assert( atom is val, T.stringof );
diff --git a/src/core/internal/atomic.d b/src/core/internal/atomic.d
new file mode 100644
index 0000000000..86ab4bcd50
--- /dev/null
+++ b/src/core/internal/atomic.d
@@ -0,0 +1,533 @@
+/**
+* The core.internal.atomic module contains the low-level atomic features available in hardware.
+* This module may be a routing layer for compiler intrinsics.
+*
+* Copyright: Copyright Manu Evans 2019.
+* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
+* Authors:   Sean Kelly, Alex Rønne Petersen, Manu Evans
+* Source:    $(DRUNTIMESRC core/internal/_atomic.d)
+*/
+
+module core.internal.atomic;
+
+import core.atomic : MemoryOrder;
+
+private
+{
+    enum : int // symbolic row indices into `registerNames`
+    {
+        AX, BX, CX, DX, DI, SI, R8, R9
+    }
+
+    immutable string[4][8] registerNames = [ // one row per register above; columns hold the 1-, 2-, 4- and 8-byte names
+        [ "AL", "AX", "EAX", "RAX" ],
+        [ "BL", "BX", "EBX", "RBX" ],
+        [ "CL", "CX", "ECX", "RCX" ],
+        [ "DL", "DX", "EDX", "RDX" ],
+        [ "DIL", "DI", "EDI", "RDI" ],
+        [ "SIL", "SI", "ESI", "RSI" ],
+        [ "R8B", "R8W", "R8D", "R8" ],
+        [ "R9B", "R9W", "R9D", "R9" ],
+    ];
+
+    template RegIndex(T) // maps T.sizeof (1, 2, 4 or 8) to the matching column of `registerNames`
+    {
+        static if (T.sizeof == 1)
+            enum RegIndex = 0;
+        else static if (T.sizeof == 2)
+            enum RegIndex = 1;
+        else static if (T.sizeof == 4)
+            enum RegIndex = 2;
+        else static if (T.sizeof == 8)
+            enum RegIndex = 3;
+        else
+            static assert(false, "Invalid type"); // no register name exists for any other size
+    }
+
+    enum SizedReg(int reg, T = size_t) = registerNames[reg][RegIndex!T]; // name of register `reg` at the width of T (size_t width by default)
+}
+
+T atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(T* src) pure nothrow @nogc @safe
+{
+
+}
+
+void atomicStore(MemoryOrder order = MemoryOrder.seq, T)(T* src, T value) pure nothrow @nogc @safe
+{
+
+}
+
+T atomicFetchAdd(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @safe
+    if (is(T : ulong)) // accepts only types implicitly convertible to ulong
+{ // Adds `value` to `*dest` with `lock xadd`; `order` is not consulted anywhere in this body.
+    version (D_InlineAsm_X86)
+    {
+        static assert(T.sizeof <= 4, "64bit atomicFetchAdd not supported on 32bit target." );
+
+        enum DestReg = SizedReg!DX; // pointer-width DX, loaded with `dest` in the asm below
+        enum ValReg = SizedReg!(AX, T); // AX at the width of T, loaded with `value`; xadd writes the previous *dest back into it
+
+        mixin (simpleFormat(q{
+            asm pure nothrow @nogc @trusted
+            {
+                mov %1, value;
+                mov %0, dest;
+                lock; xadd[%0], %1;
+            }
+        }, DestReg, ValReg)); // %0 = DestReg, %1 = ValReg; no return statement, the value is left in the AX register
+    }
+    else version (D_InlineAsm_X86_64) // naked asm: arguments are read straight from registers, never by name
+    {
+        version (Windows)
+        {
+            enum DestReg = SizedReg!DX; // NOTE(review): assumes `dest` arrives in RDX and `value` in RCX - confirm against the ABI
+            enum ValReg = SizedReg!(CX, T);
+        }
+        else
+        {
+            enum DestReg = SizedReg!SI; // NOTE(review): assumes `dest` arrives in RSI and `value` in RDI - confirm against the ABI
+            enum ValReg = SizedReg!(DI, T);
+        }
+        enum ResReg = result ? SizedReg!(AX, T) : null; // null makes simpleFormat drop the line marked `?2`, so no result is copied to AX
+
+        mixin (simpleFormat(q{
+            asm pure nothrow @nogc @trusted
+            {
+                naked;
+                lock; xadd[%0], %1;
+?2                mov %2, %1;
+                ret;
+            }
+        }, DestReg, ValReg, ResReg)); // %0 = DestReg, %1 = ValReg, %2 = ResReg
+    }
+    else
+        static assert (false, "Unsupported architecture.");
+}
+
+T atomicFetchSub(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @safe
+    if (is(T : ulong)) // accepts only types implicitly convertible to ulong
+{ // Subtracts by atomically adding the negated value (negated at int/long width, then narrowed back to T).
+    return atomicFetchAdd!(order, result)(dest, cast(T)-cast(IntOrLong!T)value); // forward the caller's `order` and `result` instead of silently using the defaults
+}
+
+T atomicExchange(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @safe
+    if (is(T : ulong) || is(T == class) || is(T U : U*)) // integer-convertible types, class references and pointers
+{ // Swaps `value` with `*dest` using `xchg`; `order` is not consulted anywhere in this body.
+    version (D_InlineAsm_X86)
+    {
+        static assert(T.sizeof <= 4, "64bit atomicExchange not supported on 32bit target." );
+
+        enum DestReg = SizedReg!CX; // pointer-width CX, loaded with `dest` in the asm below
+        enum ValReg = SizedReg!(AX, T); // AX at the width of T, loaded with `value`; holds the previous *dest after xchg
+
+        mixin (simpleFormat(q{
+            asm pure nothrow @nogc @trusted
+            {
+                mov %1, value;
+                mov %0, dest;
+                xchg [%0], %1;
+            }
+        }, DestReg, ValReg)); // %0 = DestReg, %1 = ValReg; no return statement, the value is left in the AX register
+    }
+    else version (D_InlineAsm_X86_64) // naked asm: arguments are read straight from registers, never by name
+    {
+        version (Windows)
+        {
+            enum DestReg = SizedReg!DX; // NOTE(review): assumes `dest` arrives in RDX and `value` in RCX - confirm against the ABI
+            enum ValReg = SizedReg!(CX, T);
+        }
+        else
+        {
+            enum DestReg = SizedReg!SI; // NOTE(review): assumes `dest` arrives in RSI and `value` in RDI - confirm against the ABI
+            enum ValReg = SizedReg!(DI, T);
+        }
+        enum ResReg = result ? SizedReg!(AX, T) : null; // null makes simpleFormat drop the line marked `?2`, so no result is copied to AX
+
+        mixin (simpleFormat(q{
+            asm pure nothrow @nogc @trusted
+            {
+                naked;
+                xchg [%0], %1;
+?2                mov %2, %1;
+                ret;
+            }
+        }, DestReg, ValReg, ResReg)); // %0 = DestReg, %1 = ValReg, %2 = ResReg
+    }
+    else
+        static assert (false, "Unsupported architecture.");
+}
+
+alias atomicCompareExchangeWeak = atomicCompareExchangeStrong; // the weak form is simply the strong implementation here
+
+bool atomicCompareExchangeStrong(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @safe
+    if (CanCAS!T)
+{ // Compare-and-swap on `*dest`; the value observed in `*dest` is written back through `compare`. `succ`/`fail` are not consulted in this body.
+    version (D_InlineAsm_X86)
+    {
+        static if (T.sizeof <= 4)
+        {
+            enum DestAddr = SizedReg!CX; // loaded with `dest`
+            enum CmpAddr = SizedReg!DI; // loaded with `compare`; saved and restored around the sequence with push/pop
+            enum Val = SizedReg!(DX, T); // loaded with `value`
+            enum Cmp = SizedReg!(AX, T); // loaded with `*compare`; cmpxchg compares against and updates the accumulator
+
+            mixin (simpleFormat(q{
+                asm pure nothrow @nogc @trusted
+                {
+                    push %1;
+                    mov %2, value;
+                    mov %1, compare;
+                    mov %3, [%1];
+                    mov %0, dest;
+                    lock; cmpxchg [%0], %2;
+                    mov [%1], %3;
+                    setz AL;
+                    pop %1;
+                }
+            }, DestAddr, CmpAddr, Val, Cmp)); // %0 = DestAddr, %1 = CmpAddr, %2 = Val, %3 = Cmp
+        }
+        else static if (T.sizeof == 8)
+        {
+            asm pure nothrow @nogc @trusted
+            {
+                push EDI;
+                push EBX;
+                lea EDI, value; // ECX:EBX = value (high:low halves)
+                mov EBX, [EDI];
+                mov ECX, 4[EDI];
+                mov EDI, compare; // EDX:EAX = *compare
+                mov EAX, [EDI];
+                mov EDX, 4[EDI];
+                mov EDI, dest;
+                lock; cmpxchg8b [EDI];
+                mov EDI, compare; // store EDX:EAX back through `compare`; mov leaves the flags from cmpxchg8b intact
+                mov [EDI], EAX;
+                mov 4[EDI], EDX;
+                setz AL; // result: ZF from cmpxchg8b
+                pop EBX;
+                pop EDI;
+            }
+        }
+        else
+            static assert(T.sizeof <= 8, "128bit atomicCompareExchangeStrong not supported on 32bit target." );
+    }
+    else version (D_InlineAsm_X86_64) // naked asm: arguments are read straight from registers, never by name
+    {
+        static if (T.sizeof <= 8)
+        {
+            version (Windows)
+            {
+                enum DestAddr = SizedReg!R8; // NOTE(review): assumes dest/compare/value arrive in R8/RDX/RCX - confirm against the ABI
+                enum CmpAddr = SizedReg!DX;
+                enum Val = SizedReg!(CX, T);
+            }
+            else
+            {
+                enum DestAddr = SizedReg!DX; // NOTE(review): assumes dest/compare/value arrive in RDX/RSI/RDI - confirm against the ABI
+                enum CmpAddr = SizedReg!SI;
+                enum Val = SizedReg!(DI, T);
+            }
+            enum Res = SizedReg!(AX, T); // accumulator at the width of T: holds `*compare` going in and the observed value coming out
+
+            mixin (simpleFormat(q{
+                asm pure nothrow @nogc @trusted
+                {
+                    naked;
+                    mov %3, [%1];
+                    lock; cmpxchg [%0], %2;
+                    jne compare_fail;
+                    mov AL, 1;
+                    ret;
+                compare_fail:
+                    mov [%1], %3;
+                    xor AL, AL;
+                    ret;
+                }
+            }, DestAddr, CmpAddr, Val, Res)); // %0 = DestAddr, %1 = CmpAddr, %2 = Val, %3 = Res
+        }
+        else
+        {
+            version (Windows)
+            {
+                asm pure nothrow @nogc @trusted
+                {
+                    naked;
+                    push RBX;
+                    mov R9, RDX; // keep the pointer read from RDX; RDX itself is overwritten below
+                    mov RAX, [RDX]; // RDX:RAX = 16 bytes at [RDX]
+                    mov RDX, 8[RDX];
+                    mov RBX, [RCX]; // RCX:RBX = 16 bytes at [RCX]
+                    mov RCX, 8[RCX];
+                    lock; cmpxchg16b [R8];
+                    pop RBX; // pop does not modify the flags tested by jne
+                    jne compare_fail;
+                    mov AL, 1;
+                    ret;
+                compare_fail:
+                    mov [R9], RAX; // failure: store RDX:RAX back through the saved pointer
+                    mov 8[R9], RDX;
+                    xor AL, AL;
+                    ret;
+                }
+            }
+            else
+            {
+                asm pure nothrow @nogc @trusted
+                {
+                    naked;
+                    push RBX;
+                    mov R8, RCX; // RCX is reused for the new value below, so the address used by cmpxchg16b moves to R8
+                    mov R9, RDX; // keep the pointer read from RDX; RDX itself is overwritten below
+                    mov RAX, [RDX]; // RDX:RAX = 16 bytes at [RDX]
+                    mov RDX, 8[RDX];
+                    mov RBX, RDI; // RCX:RBX = RSI:RDI
+                    mov RCX, RSI;
+                    lock; cmpxchg16b [R8];
+                    pop RBX; // pop does not modify the flags tested by jne
+                    jne compare_fail;
+                    mov AL, 1;
+                    ret;
+                compare_fail:
+                    mov [R9], RAX; // failure: store RDX:RAX back through the saved pointer
+                    mov 8[R9], RDX;
+                    xor AL, AL;
+                    ret;
+                }
+            }
+        }
+    }
+    else
+        static assert (false, "Unsupported architecture.");
+}
+
+bool atomicCompareExchangeStrongNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T compare, T value) pure nothrow @nogc @safe
+    if (CanCAS!T)
+{ // Compare-and-swap on `*dest` taking `compare` by value: only success/failure is reported. `succ`/`fail` are not consulted in this body.
+    version (D_InlineAsm_X86)
+    {
+        static if (T.sizeof <= 4)
+        {
+            enum DestAddr = SizedReg!CX; // loaded with `dest`
+            enum Cmp = SizedReg!(AX, T); // loaded with `compare`; cmpxchg compares against the accumulator
+            enum Val = SizedReg!(DX, T); // loaded with `value`
+
+            mixin (simpleFormat(q{
+                asm pure nothrow @nogc @trusted
+                {
+                    mov %2, value;
+                    mov %1, compare;
+                    mov %0, dest;
+                    lock; cmpxchg [%0], %2;
+                    setz AL;
+                }
+            }, DestAddr, Cmp, Val)); // %0 = DestAddr, %1 = Cmp, %2 = Val
+        }
+        else static if (T.sizeof == 8)
+        {
+            asm pure nothrow @nogc @trusted
+            {
+                push EDI;
+                push EBX;
+                lea EDI, value; // ECX:EBX = value (high:low halves)
+                mov EBX, [EDI];
+                mov ECX, 4[EDI];
+                lea EDI, compare; // EDX:EAX = compare
+                mov EAX, [EDI];
+                mov EDX, 4[EDI];
+                mov EDI, dest;
+                lock; cmpxchg8b [EDI];
+                setz AL; // result: ZF from cmpxchg8b
+                pop EBX;
+                pop EDI;
+            }
+        }
+        else
+            static assert(T.sizeof <= 8, "128bit atomicCompareExchangeStrongNoResult not supported on 32bit target." ); // message now names this function
+    }
+    else version (D_InlineAsm_X86_64) // naked asm: arguments are read straight from registers, never by name
+    {
+        static if (T.sizeof <= 8)
+        {
+            version (Windows)
+            {
+                enum DestAddr = SizedReg!R8; // NOTE(review): assumes dest/compare/value arrive in R8/RDX/RCX - confirm against the ABI
+                enum Cmp = SizedReg!(DX, T);
+                enum Val = SizedReg!(CX, T);
+            }
+            else
+            {
+                enum DestAddr = SizedReg!DX; // NOTE(review): assumes dest/compare/value arrive in RDX/RSI/RDI - confirm against the ABI
+                enum Cmp = SizedReg!(SI, T);
+                enum Val = SizedReg!(DI, T);
+            }
+            enum AXReg = SizedReg!(AX, T); // accumulator at the width of T; receives the comparand before cmpxchg
+
+            mixin (simpleFormat(q{
+                asm pure nothrow @nogc @trusted
+                {
+                    naked;
+                    mov %3, %1;
+                    lock; cmpxchg [%0], %2;
+                    setz AL;
+                    ret;
+                }
+            }, DestAddr, Cmp, Val, AXReg)); // %0 = DestAddr, %1 = Cmp, %2 = Val, %3 = AXReg
+        }
+        else
+        {
+            version (Windows)
+            {
+                asm pure nothrow @nogc @trusted
+                {
+                    naked;
+                    push RBX;
+                    mov RAX, [RDX]; // RDX:RAX = 16 bytes at [RDX]
+                    mov RDX, 8[RDX];
+                    mov RBX, [RCX]; // RCX:RBX = 16 bytes at [RCX]
+                    mov RCX, 8[RCX];
+                    lock; cmpxchg16b [R8];
+                    setz AL; // result: ZF from cmpxchg16b
+                    pop RBX;
+                    ret;
+                }
+            }
+            else
+            {
+                asm pure nothrow @nogc @trusted
+                {
+                    naked;
+                    push RBX;
+                    mov RAX, RDX; // RDX:RAX = RCX:RDX (RAX is written first, before RDX is overwritten)
+                    mov RDX, RCX;
+                    mov RBX, RDI; // RCX:RBX = RSI:RDI
+                    mov RCX, RSI;
+                    lock; cmpxchg16b [R8];
+                    setz AL; // result: ZF from cmpxchg16b
+                    pop RBX;
+                    ret;
+                }
+            }
+        }
+    }
+    else
+        static assert (false, "Unsupported architecture.");
+}
+
+void atomicFence(MemoryOrder order = MemoryOrder.seq)() nothrow @nogc @safe
+{ // Emits a full hardware fence for every order except MemoryOrder.raw, for which the body is empty.
+    // TODO: `mfence` should only be required for seq_cst operations, but this depends on
+    //       the compiler's backend knowledge to not reorder code inappropriately,
+    //       so we'll apply it conservatively.
+    static if (order != MemoryOrder.raw)
+    {
+        version (D_InlineAsm_X86)
+        {
+            import core.cpuid;
+
+            // TODO: review this implementation; it seems way overly complicated
+            asm pure nothrow @nogc @trusted
+            {
+                naked;
+
+                call sse2;
+                test AL, AL;
+                je Lcpuid; // AL == 0 means no SSE2, so `mfence` is unavailable (was `jne`, which inverted the two paths)
+
+                // Fast path: We have SSE2, so just use mfence.
+                mfence;
+                jmp Lend;
+
+            Lcpuid:
+
+                // Slow path: We use cpuid to serialize. This is
+                // significantly slower than mfence, but is the
+                // only serialization facility we have available
+                // on older non-SSE2 chips.
+                push EBX;
+
+                mov EAX, 0;
+                cpuid;
+
+                pop EBX;
+
+            Lend:
+
+                ret;
+            }
+        }
+        else version (D_InlineAsm_X86_64)
+        {
+            asm nothrow @nogc @trusted
+            {
+                naked;
+                mfence;
+                ret;
+            }
+        }
+        else // belongs to the version chain, so MemoryOrder.raw compiles and unknown targets are rejected
+            static assert (false, "Unsupported architecture.");
+    }
+}
+
+
+private:
+
+enum CanCAS(T) = is(T : ulong) || // true for types implicitly convertible to ulong,
+                 is(T == class) || // class references,
+                 is(T : U*, U) || // pointers,
+                 (is(T == struct) && T.sizeof <= 16 && (T.sizeof & (T.sizeof - 1)) == 0); // and structs of at most 16 bytes whose size has at most one bit set
+
+template IntOrLong(T) // selects the signed type used for intermediate arithmetic on T (see the negation in atomicFetchSub)
+{
+    static if (T.sizeof > 4)
+        alias IntOrLong = long; // anything wider than 4 bytes
+    else
+        alias IntOrLong = int; // 4 bytes or narrower
+}
+
+// this is a helper to build asm blocks
+string simpleFormat(string format, string[] args...) // expands `%N` and `?N` markers in `format` using `args`
+{
+    string result;
+    outer: while (format.length)
+    {
+        foreach (i; 0 .. format.length)
+        {
+            if (format[i] == '%' || format[i] == '?') // found the next marker character
+            {
+                bool isQ = format[i] == '?';
+                result ~= format[0 .. i++]; // emit the text before the marker, then step past the marker character
+                assert (i < format.length, "Invalid format string");
+                if (format[i] == '%' || format[i] == '?') // `%%` or `%?`: emit the second character literally
+                {
+                    assert(!isQ, "Invalid format string"); // a leading `?` may not be followed by another marker character
+                    result ~= format[i++];
+                }
+                else
+                {
+                    int index = 0;
+                    assert (format[i] >= '0' && format[i] <= '9', "Invalid format string");
+                    while (i < format.length && format[i] >= '0' && format[i] <= '9') // parse the decimal argument index
+                        index = index * 10 + (ubyte(format[i++]) - ubyte('0'));
+                    if (!isQ)
+                        result ~= args[index]; // `%N`: substitute argument N
+                    else if (!args[index]) // `?N` with a null argument drops the rest of the line; otherwise only the marker is removed
+                    {
+                        size_t j = i;
+                        for (; j < format.length;)
+                        {
+                            if (format[j++] == '\n') // skip up to and including the newline
+                                break;
+                        }
+                        i = j;
+                    }
+                }
+                format = format[i .. $]; // resume scanning after everything consumed so far
+                continue outer;
+            }
+        }
+        result ~= format; // no marker left: copy the remainder verbatim
+        break;
+    }
+    return result;
+}

From 2763339c1256954d78f52c6e142ae4f708718a99 Mon Sep 17 00:00:00 2001
From: Manu Evans <turkeyman@gmail.com>
Date: Sat, 17 Aug 2019 22:26:24 -0700
Subject: [PATCH 3/5] Move atomicStore

---
 src/core/atomic.d          | 356 +++++--------------------------------
 src/core/internal/atomic.d |  89 +++++++++-
 2 files changed, 130 insertions(+), 315 deletions(-)

diff --git a/src/core/atomic.d b/src/core/atomic.d
index 04b045d2ab..8c608520e8 100644
--- a/src/core/atomic.d
+++ b/src/core/atomic.d
@@ -97,6 +97,29 @@ enum MemoryOrder
     seq,
 }
 
+/**
+ * Writes 'newval' into 'val'.  The memory barrier specified by 'ms' is
+ * applied to the operation, which is fully sequenced by default.
+ * Valid memory orders are MemoryOrder.raw, MemoryOrder.rel, and
+ * MemoryOrder.seq.
+ *
+ * Params:
+ *  val    = The target variable.
+ *  newval = The value to store.
+ */
+void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V)( ref shared T val, V newval ) pure nothrow @nogc @trusted
+    if ( __traits( compiles, { val = newval; } ) )
+{
+    static if ( __traits(isFloating, T) ) // floating point values are stored through an integer of the same size
+    {
+        static assert ( __traits(isFloating, V) && V.sizeof == T.sizeof, "Mismatching argument types." );
+        alias IntTy = IntForFloat!T;
+        core.internal.atomic.atomicStore!ms(cast(IntTy*)&val, *cast(IntTy*)&newval); // forward `ms`; it was previously dropped
+    }
+    else
+        core.internal.atomic.atomicStore!ms(cast(T*)&val, newval); // forward `ms` so the requested order reaches the implementation
+}
+
 /**
  * Atomically adds `mod` to the value referenced by `val` and returns the value `val` held previously.
  * This operation is both lock-free and atomic.
@@ -148,15 +171,11 @@ shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here,
     if ( !is(T == class) && !is(T U : U*) &&  __traits( compiles, { *here = exchangeWith; } ) )
 in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
 {
-    static if ( __traits(isFloating, V) )
+    static if ( __traits(isFloating, T) )
     {
-        static if ( V.sizeof == 4 )
-            alias I = uint;
-        else static if ( V.sizeof == 8 )
-            alias I = ulong;
-        else
-            static assert( false, "Float type " ~ V.stringof ~ " not supported.");
-        I r = core.internal.atomic.atomicExchange(cast(shared(I)*)here, *cast(I*)&exchangeWith);
+        static assert ( __traits(isFloating, V) && V.sizeof == T.sizeof, "Mismatching argument types." );
+        alias IntTy = IntForFloat!T;
+        IntTy r = core.internal.atomic.atomicExchange(cast(IntTy*)here, *cast(IntTy*)&exchangeWith);
         return *cast(shared(T)*)&r;
     }
     else
@@ -198,11 +217,9 @@ in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligne
 {
     static if ( __traits(isFloating, T) )
     {
-        static assert ( __traits(isFloating, V1) && __traits(isFloating, V2), "Mismatching argument types." );
-        static if ( T.sizeof == 4 )
-            alias IntTy = uint;
-        else static if ( T.sizeof == 8 )
-            alias IntTy = ulong;
+        static assert ( __traits(isFloating, V1) && V1.sizeof == T.sizeof, "Mismatching argument types." );
+        static assert ( __traits(isFloating, V2) && V2.sizeof == T.sizeof, "Mismatching argument types." );
+        alias IntTy = IntForFloat!T;
         return atomicCompareExchangeStrongNoResult( cast(IntTy*)here, *cast(IntTy*)&ifThis, *cast(IntTy*)&writeThis );
     }
     else
@@ -245,11 +262,8 @@ in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligne
 {
     static if ( __traits(isFloating, T) )
     {
-        static assert ( __traits(isFloating, V), "Mismatching argument types." );
-        static if ( T.sizeof == 4 )
-            alias IntTy = uint;
-        else static if ( T.sizeof == 8 )
-            alias IntTy = ulong;
+        static assert ( __traits(isFloating, V) && V.sizeof == T.sizeof, "Mismatching argument types." );
+        alias IntTy = IntForFloat!T;
         return atomicCompareExchangeStrong( cast(IntTy*)here, cast(IntTy*)ifThis, *cast(IntTy*)&writeThis );
     }
     else
@@ -363,23 +377,6 @@ version (CoreDdoc)
     {
         return TailShared!T.init;
     }
-
-
-    /**
-     * Writes 'newval' into 'val'.  The memory barrier specified by 'ms' is
-     * applied to the operation, which is fully sequenced by default.
-     * Valid memory orders are MemoryOrder.raw, MemoryOrder.rel, and
-     * MemoryOrder.seq.
-     *
-     * Params:
-     *  val    = The target variable.
-     *  newval = The value to store.
-     */
-    void atomicStore(MemoryOrder ms = MemoryOrder.seq,T,V1)( ref shared T val, V1 newval ) pure nothrow @nogc @safe
-        if ( __traits( compiles, { val = newval; } ) )
-    {
-
-    }
 }
 else version (AsmX86_32)
 {
@@ -493,119 +490,6 @@ else version (AsmX86_32)
             static assert( false, "Invalid template type specified." );
         }
     }
-
-    void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V1)( ref shared T val, V1 newval ) pure nothrow @nogc @safe
-        if ( __traits( compiles, { val = newval; } ) )
-    {
-        static assert( ms != MemoryOrder.acq, "invalid MemoryOrder for atomicStore()" );
-        static assert( __traits(isPOD, T), "argument to atomicStore() must be POD" );
-
-        static if ( T.sizeof == byte.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 1 Byte Store
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsStoreBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov DL, newval;
-                    lock;
-                    xchg [EAX], DL;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov DL, newval;
-                    mov [EAX], DL;
-                }
-            }
-        }
-        else static if ( T.sizeof == short.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 2 Byte Store
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsStoreBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov DX, newval;
-                    lock;
-                    xchg [EAX], DX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov DX, newval;
-                    mov [EAX], DX;
-                }
-            }
-        }
-        else static if ( T.sizeof == int.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 4 Byte Store
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsStoreBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov EDX, newval;
-                    lock;
-                    xchg [EAX], EDX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov EDX, newval;
-                    mov [EAX], EDX;
-                }
-            }
-        }
-        else static if ( T.sizeof == long.sizeof && has64BitCAS )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 8 Byte Store on a 32-Bit Processor
-            //////////////////////////////////////////////////////////////////
-
-            asm pure nothrow @nogc @trusted
-            {
-                push EDI;
-                push EBX;
-                lea EDI, newval;
-                mov EBX, [EDI];
-                mov ECX, 4[EDI];
-                mov EDI, val;
-                mov EAX, [EDI];
-                mov EDX, 4[EDI];
-            L1: lock; // lock always needed to make this op atomic
-                cmpxchg8b [EDI];
-                jne L1;
-                pop EBX;
-                pop EDI;
-            }
-        }
-        else
-        {
-            static assert( false, "Invalid template type specified." );
-        }
-    }
 }
 else version (AsmX86_64)
 {
@@ -780,169 +664,6 @@ else version (AsmX86_64)
             static assert( false, "Invalid template type specified." );
         }
     }
-
-
-    void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V1)( ref shared T val, V1 newval ) pure nothrow @nogc @safe
-        if ( __traits( compiles, { val = newval; } ) )
-    {
-        static assert( ms != MemoryOrder.acq, "invalid MemoryOrder for atomicStore()" );
-        static assert( __traits(isPOD, T), "argument to atomicStore() must be POD" );
-
-        static if ( T.sizeof == byte.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 1 Byte Store
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsStoreBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RAX, val;
-                    mov DL, newval;
-                    lock;
-                    xchg [RAX], DL;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RAX, val;
-                    mov DL, newval;
-                    mov [RAX], DL;
-                }
-            }
-        }
-        else static if ( T.sizeof == short.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 2 Byte Store
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsStoreBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RAX, val;
-                    mov DX, newval;
-                    lock;
-                    xchg [RAX], DX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RAX, val;
-                    mov DX, newval;
-                    mov [RAX], DX;
-                }
-            }
-        }
-        else static if ( T.sizeof == int.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 4 Byte Store
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsStoreBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RAX, val;
-                    mov EDX, newval;
-                    lock;
-                    xchg [RAX], EDX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RAX, val;
-                    mov EDX, newval;
-                    mov [RAX], EDX;
-                }
-            }
-        }
-        else static if ( T.sizeof == long.sizeof && has64BitCAS )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 8 Byte Store on a 64-Bit Processor
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsStoreBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RAX, val;
-                    mov RDX, newval;
-                    lock;
-                    xchg [RAX], RDX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RAX, val;
-                    mov RDX, newval;
-                    mov [RAX], RDX;
-                }
-            }
-        }
-        else static if ( T.sizeof == long.sizeof*2 && has128BitCAS )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 16 Byte Store on a 64-Bit Processor
-            //////////////////////////////////////////////////////////////////
-            version (Win64){
-                asm pure nothrow @nogc @trusted
-                {
-                    push RDI;
-                    push RBX;
-                    mov R9, val;
-                    mov R10, newval;
-
-                    mov RDI, R10;
-                    mov RBX, [RDI];
-                    mov RCX, 8[RDI];
-
-                    mov RDI, R9;
-                    mov RAX, [RDI];
-                    mov RDX, 8[RDI];
-
-                    L1: lock; // lock always needed to make this op atomic
-                    cmpxchg16b [RDI];
-                    jne L1;
-                    pop RBX;
-                    pop RDI;
-                }
-            }else{
-                asm pure nothrow @nogc @trusted
-                {
-                    push RDI;
-                    push RBX;
-                    lea RDI, newval;
-                    mov RBX, [RDI];
-                    mov RCX, 8[RDI];
-                    mov RDI, val;
-                    mov RAX, [RDI];
-                    mov RDX, 8[RDI];
-                    L1: lock; // lock always needed to make this op atomic
-                    cmpxchg16b [RDI];
-                    jne L1;
-                    pop RBX;
-                    pop RDI;
-                }
-            }
-        }
-        else
-        {
-            static assert( false, "Invalid template type specified." );
-        }
-    }
 }
 
 // This is an ABI adapter that works on all architectures.  It type puns
@@ -974,6 +695,17 @@ TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref const shared T
 
 private
 {
+    template IntForFloat(F) // selects the unsigned integer type with the same size as floating point type F
+    {
+        static assert ( __traits(isFloating, F), "Not a floating point type: " ~ F.stringof );
+        static if ( F.sizeof == 4 )
+            alias IntForFloat = uint; // 4-byte floating point types
+        else static if ( F.sizeof == 8 )
+            alias IntForFloat = ulong; // 8-byte floating point types
+        else
+            static assert ( false, "Invalid floating point type: " ~ F.stringof ~ ", only support `float` and `double`." ); // any other size is rejected
+    }
+
     // NOTE: x86 loads implicitly have acquire semantics so a memory
     //       barrier is only necessary on releases.
     template needsLoadBarrier( MemoryOrder ms )
@@ -1224,7 +956,7 @@ version (unittest)
         {
             () @trusted
             {
-                struct Big { long a, b; }
+                align(16) struct Big { long a, b; }
 
                 shared(Big) atom;
                 shared(Big) base;
diff --git a/src/core/internal/atomic.d b/src/core/internal/atomic.d
index 86ab4bcd50..caba54168f 100644
--- a/src/core/internal/atomic.d
+++ b/src/core/internal/atomic.d
@@ -52,9 +52,84 @@ T atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(T* src) pure nothrow @nogc
 
 }
 
-void atomicStore(MemoryOrder order = MemoryOrder.seq, T)(T* src, T value) pure nothrow @nogc @safe
+void atomicStore(MemoryOrder order = MemoryOrder.seq, T)(T* dest, T value) pure nothrow @nogc @safe
+    if (CanCAS!T)
 {
+    static assert(order != MemoryOrder.acq, "Invalid MemoryOrder for atomicStore()");
+    static assert(__traits(isPOD, T), "Argument to atomicStore() must be POD");
 
+    static if (T.sizeof == size_t.sizeof * 2)
+    {
+        // Two-word values are stored with a compare-exchange loop: the current
+        // contents of `dest` are loaded as the expected value and the exchange
+        // is retried (`jne L1`) until it succeeds.
+        version (D_InlineAsm_X86)
+        {
+            asm pure nothrow @nogc @trusted
+            {
+                push EDI;
+                push EBX;
+                lea EDI, value;
+                mov EBX, [EDI];
+                mov ECX, 4[EDI];
+                mov EDI, dest;
+                mov EAX, [EDI];
+                mov EDX, 4[EDI];
+            L1: lock; cmpxchg8b [EDI];
+                jne L1;
+                pop EBX;
+                pop EDI;
+            }
+        }
+        else version(D_InlineAsm_X86_64)
+        {
+            version (Windows)
+            {
+                // This naked body treats RDX as `dest` and reads `value` through
+                // the pointer in RCX.
+                asm pure nothrow @nogc @trusted
+                {
+                    naked;
+                    push RBX;
+                    mov R8, RDX;
+                    mov RAX, [RDX];
+                    mov RDX, 8[RDX];
+                    mov RBX, [RCX];
+                    mov RCX, 8[RCX];
+                L1: lock; cmpxchg16b [R8];
+                    jne L1;
+                    pop RBX;
+                    ret;
+                }
+            }
+            else
+            {
+                // This naked body treats RDI:RSI as `value` and RDX as `dest`.
+                asm pure nothrow @nogc @trusted
+                {
+                    naked;
+                    push RBX;
+                    mov RBX, RDI;
+                    mov RCX, RSI;
+                    mov RDI, RDX;
+                    mov RAX, [RDX];
+                    mov RDX, 8[RDX];
+                L1: lock; cmpxchg16b [RDI];
+                    jne L1;
+                    pop RBX;
+                    ret;
+                }
+            }
+        }
+        else
+            // Fail the build instead of compiling to a silent no-op store.
+            static assert (false, "Unsupported architecture.");
+    }
+    else static if (needsStoreBarrier!order)
+        // A store that needs a barrier is issued as an atomic exchange.
+        atomicExchange!(order, false)(dest, value);
+    else
+        *dest = value;
 }
 
 T atomicFetchAdd(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @safe
@@ -300,7 +365,7 @@ bool atomicCompareExchangeStrong(MemoryOrder succ = MemoryOrder.seq, MemoryOrder
         static assert (false, "Unsupported architecture.");
 }
 
-bool atomicCompareExchangeStrongNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T compare, T value) pure nothrow @nogc @safe
+bool atomicCompareExchangeStrongNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @safe
     if (CanCAS!T)
 {
     version (D_InlineAsm_X86)
@@ -475,7 +540,10 @@ private:
 enum CanCAS(T) = is(T : ulong) ||
                  is(T == class) ||
                  is(T : U*, U) ||
-                 (is(T == struct) && T.sizeof <= 16 && (T.sizeof & (T.sizeof - 1)) == 0);
+                 (is(T == struct) && __traits(isPOD, T) &&
+                  T.sizeof <= size_t.sizeof*2 && // no more than 2 words
+                  (T.sizeof & (T.sizeof - 1)) == 0 // is power of 2
+                 );
 
 template IntOrLong(T)
 {
@@ -485,6 +553,21 @@ template IntOrLong(T)
         alias IntOrLong = int;
 }
 
+// NOTE: x86 loads implicitly have acquire semantics, so an explicit memory
+//       barrier is only necessary for sequentially consistent loads.
+template needsLoadBarrier( MemoryOrder ms )
+{
+    enum bool needsLoadBarrier = ms == MemoryOrder.seq;
+}
+
+
+// NOTE: x86 stores implicitly have release semantics, so an explicit memory
+//       barrier is only necessary for sequentially consistent stores.
+template needsStoreBarrier( MemoryOrder ms )
+{
+    enum bool needsStoreBarrier = ms == MemoryOrder.seq;
+}
+
 // this is a helper to build asm blocks
 string simpleFormat(string format, string[] args...)
 {

From 4120ca76b364c0c238af039f2dab2b0af67b58fb Mon Sep 17 00:00:00 2001
From: Manu Evans <turkeyman@gmail.com>
Date: Sat, 17 Aug 2019 23:08:12 -0700
Subject: [PATCH 4/5] Move atomicLoad

---
 src/core/atomic.d          | 401 ++++---------------------------------
 src/core/internal/atomic.d | 124 +++++++++++-
 2 files changed, 160 insertions(+), 365 deletions(-)

diff --git a/src/core/atomic.d b/src/core/atomic.d
index 8c608520e8..a68f677ba7 100644
--- a/src/core/atomic.d
+++ b/src/core/atomic.d
@@ -97,6 +97,33 @@ enum MemoryOrder
     seq,
 }
 
+/**
+ * Loads 'val' from memory and returns it.  The memory barrier specified
+ * by 'ms' is applied to the operation, which is fully sequenced by
+ * default.  Valid memory orders are MemoryOrder.raw, MemoryOrder.acq,
+ * and MemoryOrder.seq.
+ *
+ * Params:
+ *  val = The target variable.
+ *
+ * Returns:
+ *  The value of 'val'.
+ */
+TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref const shared T val ) pure nothrow @nogc @trusted
+{
+    static if ( __traits(isFloating, T) )
+    {
+        alias IntTy = IntForFloat!T;
+        IntTy r = core.internal.atomic.atomicLoad!ms(cast(IntTy*)&val);
+        return *cast(T*)&r;
+    }
+    else
+    {
+        T r = core.internal.atomic.atomicLoad!ms(cast(T*)&val);
+        return *cast(TailShared!T*)&r;
+    }
+}
+
 /**
  * Writes 'newval' into 'val'.  The memory barrier specified by 'ms' is
  * applied to the operation, which is fully sequenced by default.
@@ -114,10 +141,10 @@ void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V)( ref shared T val, V ne
     {
         static assert ( __traits(isFloating, V) && V.sizeof == T.sizeof, "Mismatching argument types." );
         alias IntTy = IntForFloat!T;
-        core.internal.atomic.atomicStore(cast(IntTy*)&val, *cast(IntTy*)&newval);
+        core.internal.atomic.atomicStore!ms(cast(IntTy*)&val, *cast(IntTy*)&newval);
     }
     else
-        core.internal.atomic.atomicStore(cast(T*)&val, newval);
+        core.internal.atomic.atomicStore!ms(cast(T*)&val, newval);
 }
 
 /**
@@ -131,11 +158,11 @@ void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V)( ref shared T val, V ne
  * Returns:
  *  The value held previously by `val`.
  */
-TailShared!(T) atomicFetchAdd(T)( ref shared T val, size_t mod ) pure nothrow @nogc @trusted
+TailShared!(T) atomicFetchAdd(MemoryOrder ms = MemoryOrder.seq, T)( ref shared T val, size_t mod ) pure nothrow @nogc @trusted
     if ( __traits(isIntegral, T) )
 in ( atomicValueIsProperlyAligned(val) )
 {
-    return core.internal.atomic.atomicFetchAdd( &val, cast(T)mod );
+    return core.internal.atomic.atomicFetchAdd!ms( &val, cast(T)mod );
 }
 
 /**
@@ -149,11 +176,11 @@ in ( atomicValueIsProperlyAligned(val) )
  * Returns:
  *  The value held previously by `val`.
  */
-TailShared!(T) atomicFetchSub(T)( ref shared T val, size_t mod ) pure nothrow @nogc @trusted
+TailShared!(T) atomicFetchSub(MemoryOrder ms = MemoryOrder.seq, T)( ref shared T val, size_t mod ) pure nothrow @nogc @trusted
     if ( __traits(isIntegral, T) )
 in ( atomicValueIsProperlyAligned(val) )
 {
-    return core.internal.atomic.atomicFetchSub( &val, cast(T)mod );
+    return core.internal.atomic.atomicFetchSub!ms( &val, cast(T)mod );
 }
 
 /**
@@ -175,11 +202,11 @@ in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligne
     {
         static assert ( __traits(isFloating, V) && V.sizeof == T.sizeof, "Mismatching argument types." );
         alias IntTy = IntForFloat!T;
-        IntTy r = core.internal.atomic.atomicExchange(cast(IntTy*)here, *cast(IntTy*)&exchangeWith);
+        IntTy r = core.internal.atomic.atomicExchange!ms(cast(IntTy*)here, *cast(IntTy*)&exchangeWith);
         return *cast(shared(T)*)&r;
     }
     else
-        return core.internal.atomic.atomicExchange(here, exchangeWith);
+        return core.internal.atomic.atomicExchange!ms(here, exchangeWith);
 }
 
 /// Ditto
@@ -187,7 +214,7 @@ shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here,
     if ( is(T == class) && __traits( compiles, { *here = exchangeWith; } ) )
 in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
 {
-    return core.internal.atomic.atomicExchange(here, exchangeWith);
+    return core.internal.atomic.atomicExchange!ms(here, exchangeWith);
 }
 
 /// Ditto
@@ -195,7 +222,7 @@ shared(T) atomicExchange(MemoryOrder ms = MemoryOrder.seq,T,V)( shared(T)* here,
     if ( is(T U : U*) && __traits( compiles, { *here = exchangeWith; } ) )
 in ( atomicPtrIsProperlyAligned( here ), "Argument `here` is not properly aligned" )
 {
-    return core.internal.atomic.atomicExchange(here, exchangeWith);
+    return core.internal.atomic.atomicExchange!ms(here, exchangeWith);
 }
 
 /**
@@ -333,11 +360,11 @@ in ( atomicValueIsProperlyAligned( val ) )
     // |=   ^=  <<= >>= >>>=    ~=
     static if ( op == "+=" && __traits(isIntegral, T) && __traits(isIntegral, V1) && T.sizeof <= size_t.sizeof && V1.sizeof <= size_t.sizeof)
     {
-        return cast(T)( atomicFetchAdd!(T)( val, mod ) + mod );
+        return cast(T)( atomicFetchAdd!(MemoryOrder.seq, T)( val, mod ) + mod );
     }
     else static if ( op == "-=" && __traits(isIntegral, T) && __traits(isIntegral, V1) && T.sizeof <= size_t.sizeof && V1.sizeof <= size_t.sizeof)
     {
-        return cast(T)( atomicFetchSub!(T)( val, mod ) - mod );
+        return cast(T)( atomicFetchSub!(MemoryOrder.seq, T)( val, mod ) - mod );
     }
     else static if ( op == "+=" || op == "-="  || op == "*="  || op == "/=" ||
                 op == "%=" || op == "^^=" || op == "&="  || op == "|=" ||
@@ -358,341 +385,6 @@ in ( atomicValueIsProperlyAligned( val ) )
     }
 }
 
-
-version (CoreDdoc)
-{
-    /**
-     * Loads 'val' from memory and returns it.  The memory barrier specified
-     * by 'ms' is applied to the operation, which is fully sequenced by
-     * default.  Valid memory orders are MemoryOrder.raw, MemoryOrder.acq,
-     * and MemoryOrder.seq.
-     *
-     * Params:
-     *  val = The target variable.
-     *
-     * Returns:
-     *  The value of 'val'.
-     */
-    TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq,T)( ref const shared T val ) pure nothrow @nogc @safe
-    {
-        return TailShared!T.init;
-    }
-}
-else version (AsmX86_32)
-{
-    TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref const shared T val ) pure nothrow @nogc @safe
-    if (!__traits(isFloating, T))
-    {
-        static assert( ms != MemoryOrder.rel, "invalid MemoryOrder for atomicLoad()" );
-        static assert( __traits(isPOD, T), "argument to atomicLoad() must be POD" );
-
-        static if ( T.sizeof == byte.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 1 Byte Load
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsLoadBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov DL, 0;
-                    mov AL, 0;
-                    mov ECX, val;
-                    lock; // lock always needed to make this op atomic
-                    cmpxchg [ECX], DL;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov AL, [EAX];
-                }
-            }
-        }
-        else static if ( T.sizeof == short.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 2 Byte Load
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsLoadBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov DX, 0;
-                    mov AX, 0;
-                    mov ECX, val;
-                    lock; // lock always needed to make this op atomic
-                    cmpxchg [ECX], DX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov AX, [EAX];
-                }
-            }
-        }
-        else static if ( T.sizeof == int.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 4 Byte Load
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsLoadBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EDX, 0;
-                    mov EAX, 0;
-                    mov ECX, val;
-                    lock; // lock always needed to make this op atomic
-                    cmpxchg [ECX], EDX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EAX, val;
-                    mov EAX, [EAX];
-                }
-            }
-        }
-        else static if ( T.sizeof == long.sizeof && has64BitCAS )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 8 Byte Load on a 32-Bit Processor
-            //////////////////////////////////////////////////////////////////
-
-            asm pure nothrow @nogc @trusted
-            {
-                push EDI;
-                push EBX;
-                mov EBX, 0;
-                mov ECX, 0;
-                mov EAX, 0;
-                mov EDX, 0;
-                mov EDI, val;
-                lock; // lock always needed to make this op atomic
-                cmpxchg8b [EDI];
-                pop EBX;
-                pop EDI;
-            }
-        }
-        else
-        {
-            static assert( false, "Invalid template type specified." );
-        }
-    }
-}
-else version (AsmX86_64)
-{
-    TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref const shared T val ) pure nothrow @nogc @safe
-    if (!__traits(isFloating, T))
-    {
-        static assert( ms != MemoryOrder.rel, "invalid MemoryOrder for atomicLoad()" );
-        static assert( __traits(isPOD, T), "argument to atomicLoad() must be POD" );
-
-        static if ( T.sizeof == byte.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 1 Byte Load
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsLoadBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov DL, 0;
-                    mov AL, 0;
-                    mov RCX, val;
-                    lock; // lock always needed to make this op atomic
-                    cmpxchg [RCX], DL;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RAX, val;
-                    mov AL, [RAX];
-                }
-            }
-        }
-        else static if ( T.sizeof == short.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 2 Byte Load
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsLoadBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov DX, 0;
-                    mov AX, 0;
-                    mov RCX, val;
-                    lock; // lock always needed to make this op atomic
-                    cmpxchg [RCX], DX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RAX, val;
-                    mov AX, [RAX];
-                }
-            }
-        }
-        else static if ( T.sizeof == int.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 4 Byte Load
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsLoadBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov EDX, 0;
-                    mov EAX, 0;
-                    mov RCX, val;
-                    lock; // lock always needed to make this op atomic
-                    cmpxchg [RCX], EDX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RAX, val;
-                    mov EAX, [RAX];
-                }
-            }
-        }
-        else static if ( T.sizeof == long.sizeof )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 8 Byte Load
-            //////////////////////////////////////////////////////////////////
-
-            static if ( needsLoadBarrier!(ms) )
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RDX, 0;
-                    mov RAX, 0;
-                    mov RCX, val;
-                    lock; // lock always needed to make this op atomic
-                    cmpxchg [RCX], RDX;
-                }
-            }
-            else
-            {
-                asm pure nothrow @nogc @trusted
-                {
-                    mov RAX, val;
-                    mov RAX, [RAX];
-                }
-            }
-        }
-        else static if ( T.sizeof == long.sizeof*2 && has128BitCAS )
-        {
-            //////////////////////////////////////////////////////////////////
-            // 16 Byte Load on a 64-Bit Processor
-            //////////////////////////////////////////////////////////////////
-            version (Win64){
-                size_t[2] retVal;
-                asm pure nothrow @nogc @trusted
-                {
-                    push RDI;
-                    push RBX;
-                    mov RDI, val;
-                    mov RBX, 0;
-                    mov RCX, 0;
-                    mov RAX, 0;
-                    mov RDX, 0;
-                    lock; // lock always needed to make this op atomic
-                    cmpxchg16b [RDI];
-                    lea RDI, retVal;
-                    mov [RDI], RAX;
-                    mov 8[RDI], RDX;
-                    pop RBX;
-                    pop RDI;
-                }
-
-                static if (is(T:U[], U))
-                {
-                    pragma(inline, true)
-                    static typeof(return) toTrusted(size_t[2] retVal) @trusted
-                    {
-                        return *(cast(typeof(return)*) retVal.ptr);
-                    }
-
-                    return toTrusted(retVal);
-                }
-                else
-                {
-                    return cast(typeof(return)) retVal;
-                }
-            }else{
-                asm pure nothrow @nogc @trusted
-                {
-                    push RDI;
-                    push RBX;
-                    mov RBX, 0;
-                    mov RCX, 0;
-                    mov RAX, 0;
-                    mov RDX, 0;
-                    mov RDI, val;
-                    lock; // lock always needed to make this op atomic
-                    cmpxchg16b [RDI];
-                    pop RBX;
-                    pop RDI;
-                }
-            }
-        }
-        else
-        {
-            static assert( false, "Invalid template type specified." );
-        }
-    }
-}
-
-// This is an ABI adapter that works on all architectures.  It type puns
-// floats and doubles to ints and longs, atomically loads them, then puns
-// them back.  This is necessary so that they get returned in floating
-// point instead of integer registers.
-TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref const shared T val ) pure nothrow @nogc @trusted
-    if (__traits(isFloating, T))
-{
-    static if (T.sizeof == int.sizeof)
-    {
-        static assert(is(T : float));
-        auto ptr = cast(const shared int*) &val;
-        auto asInt = atomicLoad!(ms)(*ptr);
-        return *(cast(typeof(return)*) &asInt);
-    }
-    else static if (T.sizeof == long.sizeof)
-    {
-        static assert(is(T : double));
-        auto ptr = cast(const shared long*) &val;
-        auto asLong = atomicLoad!(ms)(*ptr);
-        return *(cast(typeof(return)*) &asLong);
-    }
-    else
-    {
-        static assert(0, "Cannot atomically load 80-bit reals.");
-    }
-}
-
 private
 {
     template IntForFloat(F)
@@ -706,21 +398,6 @@ private
             static assert ( false, "Invalid floating point type: " ~ F.stringof ~ ", only support `float` and `double`." );
     }
 
-    // NOTE: x86 loads implicitly have acquire semantics so a memory
-    //       barrier is only necessary on releases.
-    template needsLoadBarrier( MemoryOrder ms )
-    {
-        enum bool needsLoadBarrier = ms == MemoryOrder.seq;
-    }
-
-
-    // NOTE: x86 stores implicitly have release semantics so a memory
-    //       barrier is only necessary on acquires.
-    template needsStoreBarrier( MemoryOrder ms )
-    {
-        enum bool needsStoreBarrier = ms == MemoryOrder.seq;
-    }
-
     // TODO: it'd be nice if we had @trusted scopes; we could remove this...
     bool casByRef(T,V1,V2)( ref T value, V1 ifThis, V2 writeThis ) pure nothrow @nogc @trusted
     {
diff --git a/src/core/internal/atomic.d b/src/core/internal/atomic.d
index caba54168f..3f47a20c3c 100644
--- a/src/core/internal/atomic.d
+++ b/src/core/internal/atomic.d
@@ -47,16 +47,135 @@ private
     enum SizedReg(int reg, T = size_t) = registerNames[reg][RegIndex!T];
 }
 
-T atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(T* src) pure nothrow @nogc @safe
+T atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(T* src) pure nothrow @nogc @trusted
+    if (CanCAS!T)
 {
+    static assert(order != MemoryOrder.rel, "invalid MemoryOrder for atomicLoad()");
+
+    static if (T.sizeof == size_t.sizeof * 2)
+    {
+        // Two-word loads run a locked compare-exchange with zero as both the
+        // comparand and the replacement value, so memory keeps its contents and
+        // the current value ends up in the result registers (EDX:EAX / RDX:RAX).
+        version (D_InlineAsm_X86)
+        {
+            asm pure nothrow @nogc @trusted
+            {
+                push EDI;
+                push EBX;
+                mov EBX, 0;
+                mov ECX, 0;
+                mov EAX, 0;
+                mov EDX, 0;
+                mov EDI, src;
+                lock; cmpxchg8b [EDI];
+                pop EBX;
+                pop EDI;
+            }
+        }
+        else version (D_InlineAsm_X86_64)
+        {
+            version (Windows)
+            {
+                static if (RegisterReturn!T)
+                {
+                    enum SrcPtr = SizedReg!CX;
+                    enum RetPtr = null;
+                }
+                else
+                {
+                    enum SrcPtr = SizedReg!DX;
+                    enum RetPtr = SizedReg!CX;
+                }
 
+                mixin (simpleFormat(q{
+                    asm pure nothrow @nogc @trusted
+                    {
+                        naked;
+                        push RBX;
+                        mov R8, %0;
+?1                        mov R9, %1;
+                        mov RBX, 0;
+                        mov RCX, 0;
+                        mov RAX, 0;
+                        mov RDX, 0;
+                        lock; cmpxchg16b [R8];
+?1                        mov [R9], RAX;
+?1                        mov 8[R9], RDX;
+                        pop RBX;
+                        ret;
+                    }
+                }, SrcPtr, RetPtr));
+            }
+            else
+            {
+                asm pure nothrow @nogc @trusted
+                {
+                    naked;
+                    push RBX;
+                    mov RBX, 0;
+                    mov RCX, 0;
+                    mov RAX, 0;
+                    mov RDX, 0;
+                    lock; cmpxchg16b [RDI];
+                    pop RBX;
+                    ret;
+                }
+            }
+        }
+        else
+            static assert (false, "Unsupported architecture.");
+    }
+    else static if (needsLoadBarrier!order)
+    {
+        // Same zero/zero compare-exchange approach for the remaining sizes.
+        version (D_InlineAsm_X86)
+        {
+            enum SrcReg = SizedReg!CX;
+            enum ZeroReg = SizedReg!(DX, T);
+            enum ResReg = SizedReg!(AX, T);
+
+            mixin (simpleFormat(q{
+                asm pure nothrow @nogc @trusted
+                {
+                    mov %1, 0;
+                    mov %2, 0;
+                    mov %0, src;
+                    lock; cmpxchg [%0], %1;
+                }
+            }, SrcReg, ZeroReg, ResReg));
+        }
+        else version (D_InlineAsm_X86_64)
+        {
+            version (Windows)
+                enum SrcReg = SizedReg!CX;
+            else
+                enum SrcReg = SizedReg!DI;
+            enum ZeroReg = SizedReg!(DX, T);
+            enum ResReg = SizedReg!(AX, T);
+
+            mixin (simpleFormat(q{
+                asm pure nothrow @nogc @trusted
+                {
+                    naked;
+                    mov %1, 0;
+                    mov %2, 0;
+                    lock; cmpxchg [%0], %1;
+                    ret;
+                }
+            }, SrcReg, ZeroReg, ResReg));
+        }
+        else
+            static assert (false, "Unsupported architecture.");
+    }
+    else
+        return *src;
 }
 
 void atomicStore(MemoryOrder order = MemoryOrder.seq, T)(T* dest, T value) pure nothrow @nogc @safe
     if (CanCAS!T)
 {
     static assert(order != MemoryOrder.acq, "Invalid MemoryOrder for atomicStore()");
-    static assert(__traits(isPOD, T), "Argument to atomicStore() must be POD");
 
     static if (T.sizeof == size_t.sizeof * 2)
     {
@@ -78,7 +189,7 @@ void atomicStore(MemoryOrder order = MemoryOrder.seq, T)(T* dest, T value) pure
                 pop EDI;
             }
         }
-        else version(D_InlineAsm_X86_64)
+        else version (D_InlineAsm_X86_64)
         {
             version (Windows)
             {
@@ -537,9 +648,18 @@ void atomicFence(MemoryOrder order = MemoryOrder.seq)() nothrow @nogc @safe
 
 private:
 
+// True for slice and delegate types. NOTE(review): the Windows x86_64 `atomicLoad`
+// appears to use this to decide whether a separate return pointer is passed - confirm.
+version (Windows)
+{
+    enum RegisterReturn(T) = is(T : U[], U) || is(T : R delegate(A), R, A...);
+}
+
 enum CanCAS(T) = is(T : ulong) ||
                  is(T == class) ||
                  is(T : U*, U) ||
+                 is(T : U[], U) ||
+                 is(T : R delegate(A), R, A...) ||
                  (is(T == struct) && __traits(isPOD, T) &&
                   T.sizeof <= size_t.sizeof*2 && // no more than 2 words
                   (T.sizeof & (T.sizeof - 1)) == 0 // is power of 2

From 9183a7dbd487c92296d51fad376888ef40c3dde8 Mon Sep 17 00:00:00 2001
From: Manu Evans <turkeyman@gmail.com>
Date: Sun, 18 Aug 2019 00:56:19 -0700
Subject: [PATCH 5/5] Better handling of various permutations of shared-ness.

---
 src/core/atomic.d          | 188 +++++++++++++++++++++++++------------
 src/core/internal/atomic.d |   2 +-
 2 files changed, 131 insertions(+), 59 deletions(-)

diff --git a/src/core/atomic.d b/src/core/atomic.d
index a68f677ba7..fcbf899868 100644
--- a/src/core/atomic.d
+++ b/src/core/atomic.d
@@ -12,50 +12,7 @@ module core.atomic;
 
 import core.internal.atomic;
 import core.internal.attributes : betterC;
-
-version (D_InlineAsm_X86)
-{
-    version = AsmX86;
-    version = AsmX86_32;
-    enum has64BitXCHG = false;
-    enum has64BitCAS = true;
-    enum has128BitCAS = false;
-}
-else version (D_InlineAsm_X86_64)
-{
-    version = AsmX86;
-    version = AsmX86_64;
-    enum has64BitXCHG = true;
-    enum has64BitCAS = true;
-    enum has128BitCAS = true;
-}
-else
-{
-    enum has64BitXCHG = false;
-    enum has64BitCAS = false;
-    enum has128BitCAS = false;
-}
-
-version (AsmX86)
-{
-    // NOTE: Strictly speaking, the x86 supports atomic operations on
-    //       unaligned values.  However, this is far slower than the
-    //       common case, so such behavior should be prohibited.
-    private bool atomicValueIsProperlyAligned(T)( ref T val ) pure nothrow @nogc @trusted
-    {
-        return atomicPtrIsProperlyAligned(&val);
-    }
-
-    private bool atomicPtrIsProperlyAligned(T)( T* ptr ) pure nothrow @nogc @safe
-    {
-        // NOTE: 32 bit x86 systems support 8 byte CAS, which only requires
-        //       4 byte alignment, so use size_t as the align type here.
-        static if ( T.sizeof > size_t.sizeof )
-            return cast(size_t)ptr % size_t.sizeof == 0;
-        else
-            return cast(size_t)ptr % T.sizeof == 0;
-    }
-}
+import core.internal.traits : hasUnsharedIndirections;
 
 /**
  * Specifies the memory ordering semantics of an atomic operation.
@@ -109,7 +66,8 @@ enum MemoryOrder
  * Returns:
  *  The value of 'val'.
  */
-TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref const shared T val ) pure nothrow @nogc @trusted
+T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref T val ) pure nothrow @nogc @trusted
+    if ( !is( T == shared U, U ) && !is( T == shared inout U, U ) && !is( T == shared const U, U ) )
 {
     static if ( __traits(isFloating, T) )
     {
@@ -118,10 +76,30 @@ TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref const shared T
         return *cast(T*)&r;
     }
     else
-    {
-        T r = core.internal.atomic.atomicLoad!ms(cast(T*)&val);
-        return *cast(TailShared!T*)&r;
-    }
+        return core.internal.atomic.atomicLoad!ms(&val);
+}
+
+/// Ditto
+T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref shared T val ) pure nothrow @nogc @trusted
+    if ( !hasUnsharedIndirections!T )
+{
+    // The constraint above already guarantees `T` has no unshared indirections,
+    // so the value can be copied out of `shared` without a further check.
+
+    return atomicLoad!ms(*cast(T*)&val);
+}
+
+/// Ditto
+TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref shared T val ) pure nothrow @nogc @trusted
+    if ( hasUnsharedIndirections!T )
+{
+    // HACK: DEPRECATE THIS FUNCTION, IT IS INVALID TO DO ATOMIC LOAD OF SHARED CLASS
+    // this is here because code exists in the wild that does this...
+
+    import core.lifetime : move;
+
+    T r = core.internal.atomic.atomicLoad!ms(cast(T*)&val);
+    return move(*cast(TailShared!T*)&r);
 }
 
 /**
@@ -134,8 +112,8 @@ TailShared!T atomicLoad(MemoryOrder ms = MemoryOrder.seq, T)( ref const shared T
  *  val    = The target variable.
  *  newval = The value to store.
  */
-void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V)( ref shared T val, V newval ) pure nothrow @nogc @trusted
-    if ( __traits( compiles, { val = newval; } ) )
+void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V)( ref T val, V newval ) pure nothrow @nogc @trusted
+    if ( __traits( compiles, { val = newval; } ) && !is(T == shared S, S) && !is(V == shared U, U) )
 {
     static if ( __traits(isFloating, T) )
     {
@@ -144,7 +122,31 @@ void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V)( ref shared T val, V ne
         core.internal.atomic.atomicStore!ms(cast(IntTy*)&val, *cast(IntTy*)&newval);
     }
     else
-        core.internal.atomic.atomicStore!ms(cast(T*)&val, newval);
+        core.internal.atomic.atomicStore!ms(&val, newval);
+}
+
+/// Ditto
+void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V)( ref shared T val, V newval ) pure nothrow @nogc @trusted
+    if ( __traits( compiles, { val = newval; } ) && !is( T == class ) )
+{
+    static if ( is ( V == shared U, U ) )
+        alias Thunk = U;
+    else
+    {
+        // An unshared `newval` may only be stored if it carries no unshared indirections.
+        static assert(!hasUnsharedIndirections!V, "Copying unshared argument `newval` to shared `val` would violate shared.");
+        alias Thunk = V;
+    }
+    atomicStore!ms(*cast(T*)&val, *cast(Thunk*)&newval);
+}
+
+/// Ditto
+void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V)( ref shared T val, shared V newval ) pure nothrow @nogc @trusted
+    if ( is( T == class ) )
+{
+    static assert ( is ( V : T ), "Can't assign `newval` of type `shared " ~ V.stringof ~ "` to `shared " ~ T.stringof ~ "`.");
+
+    core.internal.atomic.atomicStore!ms(cast(T*)&val, cast(V)newval);
 }
 
 /**
@@ -158,7 +160,7 @@ void atomicStore(MemoryOrder ms = MemoryOrder.seq, T, V)( ref shared T val, V ne
  * Returns:
  *  The value held previously by `val`.
  */
-TailShared!(T) atomicFetchAdd(MemoryOrder ms = MemoryOrder.seq, T)( ref shared T val, size_t mod ) pure nothrow @nogc @trusted
+TailShared!T atomicFetchAdd(MemoryOrder ms = MemoryOrder.seq, T)( ref shared T val, size_t mod ) pure nothrow @nogc @trusted
     if ( __traits(isIntegral, T) )
 in ( atomicValueIsProperlyAligned(val) )
 {
@@ -176,7 +178,7 @@ in ( atomicValueIsProperlyAligned(val) )
  * Returns:
  *  The value held previously by `val`.
  */
-TailShared!(T) atomicFetchSub(MemoryOrder ms = MemoryOrder.seq, T)( ref shared T val, size_t mod ) pure nothrow @nogc @trusted
+TailShared!T atomicFetchSub(MemoryOrder ms = MemoryOrder.seq, T)( ref shared T val, size_t mod ) pure nothrow @nogc @trusted
     if ( __traits(isIntegral, T) )
 in ( atomicValueIsProperlyAligned(val) )
 {
@@ -323,7 +325,6 @@ void atomicFence() nothrow @nogc @safe
     core.internal.atomic.atomicFence();
 }
 
-
 /**
  * Performs the binary operation 'op' on val using 'mod' as the modifier.
  *
@@ -385,11 +386,54 @@ in ( atomicValueIsProperlyAligned( val ) )
     }
 }
 
+
+version (X86) // 32-bit x86 target
+{
+    version = IsX86; // marker set by both x86 branches; tested further down via `version (IsX86)`
+    enum has64BitXCHG = false;
+    enum has64BitCAS = true;
+    enum has128BitCAS = false;
+}
+else version (X86_64) // 64-bit x86 target
+{
+    version = IsX86;
+    enum has64BitXCHG = true;
+    enum has64BitCAS = true;
+    enum has128BitCAS = true;
+}
+else // any other target: none of the wide operations are advertised
+{
+    enum has64BitXCHG = false;
+    enum has64BitCAS = false;
+    enum has128BitCAS = false;
+}
+
 private
 {
+    version (IsX86)
+    {
+        // NOTE: Strictly speaking, the x86 supports atomic operations on
+        //       unaligned values.  However, this is far slower than the
+        //       common case, so such behavior should be prohibited.
+        bool atomicValueIsProperlyAligned(T)( ref T val ) pure nothrow @nogc @trusted
+        {
+            return atomicPtrIsProperlyAligned(&val); // by-reference convenience: checks the address of `val`
+        }
+
+        bool atomicPtrIsProperlyAligned(T)( T* ptr ) pure nothrow @nogc @safe
+        {
+            // NOTE: 32 bit x86 cmpxchg8b only requires 4 byte alignment, whereas
+            //       x86_64 cmpxchg16b faults unless its operand is 16 byte aligned.
+            version (X86) enum oversized = T.sizeof > size_t.sizeof;
+            else          enum oversized = false;
+            // Oversized values only need size_t alignment; everything else must be naturally aligned.
+            return cast(size_t)ptr % (oversized ? size_t.sizeof : T.sizeof) == 0;
+        }
+    }
+
     template IntForFloat(F)
+        if (__traits(isFloating, F))
     {
-        static assert ( __traits(isFloating, F), "Not a floating point type: " ~ F.stringof );
         static if ( F.sizeof == 4 )
             alias IntForFloat = uint;
         else static if ( F.sizeof == 8 )
@@ -398,6 +442,34 @@ private
             static assert ( false, "Invalid floating point type: " ~ F.stringof ~ ", only support `float` and `double`." );
     }
 
+    template IntForStruct(S) // maps struct type S to an integer (or integer array) type of the same size
+        if (is(S == struct))
+    {
+        static if ( S.sizeof == 1 )
+            alias IntForStruct = ubyte;
+        else static if ( S.sizeof == 2 )
+            alias IntForStruct = ushort;
+        else static if ( S.sizeof == 4 )
+            alias IntForStruct = uint;
+        else static if ( S.sizeof == 8 )
+            alias IntForStruct = ulong;
+        else static if ( S.sizeof == 16 )
+            alias IntForStruct = ulong[2]; // TODO: what's the best type here? slice/delegates pass in registers...
+        else
+            static assert (ValidateStruct!S); // unsupported size: defer to ValidateStruct for the diagnostic
+    }
+
+    template ValidateStruct(S) // compile-time check that struct S may be used with atomic operations
+        if (is(S == struct))
+    {
+        import core.internal.traits : hasElaborateAssign;
+
+        static assert (S.sizeof <= size_t.sizeof*2 && (S.sizeof & (S.sizeof - 1)) == 0, S.stringof ~ " has invalid size for atomic operations."); // at most two machine words; `x & (x - 1)` is zero for powers of two
+        static assert (!hasElaborateAssign!S, S.stringof ~ " may not have an elaborate assignment when used with atomic operations.");
+
+        enum ValidateStruct = true;
+    }
+
     // TODO: it'd be nice if we had @trusted scopes; we could remove this...
     bool casByRef(T,V1,V2)( ref T value, V1 ifThis, V2 writeThis ) pure nothrow @nogc @trusted
     {
@@ -795,7 +867,7 @@ version (unittest)
         assert(atomicOp!"+="(i8, 8) == 13);
         assert(atomicOp!"+="(i16, 8) == 14);
         assert(atomicOp!"+="(i32, 8) == 15);
-        version (AsmX86_64)
+        version (D_LP64)
         {
             shared ulong u64 = 4;
             shared long i64 = 8;
@@ -819,7 +891,7 @@ version (unittest)
         assert(atomicOp!"-="(i8, 1) == 4);
         assert(atomicOp!"-="(i16, 1) == 5);
         assert(atomicOp!"-="(i32, 1) == 6);
-        version (AsmX86_64)
+        version (D_LP64)
         {
             shared ulong u64 = 4;
             shared long i64 = 8;
diff --git a/src/core/internal/atomic.d b/src/core/internal/atomic.d
index 3f47a20c3c..b8e10c8271 100644
--- a/src/core/internal/atomic.d
+++ b/src/core/internal/atomic.d
@@ -47,7 +47,7 @@ private
     enum SizedReg(int reg, T = size_t) = registerNames[reg][RegIndex!T];
 }
 
-T atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(T* src) pure nothrow @nogc @trusted
+inout(T) atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(inout(T)* src) pure nothrow @nogc @trusted
     if (CanCAS!T)
 {
     static assert(order != MemoryOrder.rel, "invalid MemoryOrder for atomicLoad()");