From 184435f243b830cf464047deb1b636a8d5b4ed4a Mon Sep 17 00:00:00 2001
From: Martin Nowak <code@dawg.eu>
Date: Fri, 27 May 2016 20:12:05 +0200
Subject: [PATCH 01/12] implement templated array ops

- use RPN to encode operand precedence
- fixes Issues 15619 and 16680
---
 mak/COPY                    |   1 +
 mak/SRCS                    |   1 +
 src/core/internal/arrayop.d | 428 ++++++++++++++++++++++++++++++++++++
 src/core/internal/traits.d  |  23 ++
 src/object.d                |   7 +
 win32.mak                   |   3 +
 win64.mak                   |   3 +
 7 files changed, 466 insertions(+)
 create mode 100644 src/core/internal/arrayop.d

diff --git a/mak/COPY b/mak/COPY
index 0a7f4317bd..d7b6a4c4e9 100644
--- a/mak/COPY
+++ b/mak/COPY
@@ -17,6 +17,7 @@ COPY=\
 	$(IMPDIR)\core\vararg.d \
 	\
 	$(IMPDIR)\core\internal\abort.d \
+	$(IMPDIR)\core\internal\arrayop.d \
 	$(IMPDIR)\core\internal\convert.d \
 	$(IMPDIR)\core\internal\hash.d \
 	$(IMPDIR)\core\internal\spinlock.d \
diff --git a/mak/SRCS b/mak/SRCS
index 2258203aa3..52330cf772 100644
--- a/mak/SRCS
+++ b/mak/SRCS
@@ -17,6 +17,7 @@ SRCS=\
 	src\core\vararg.d \
 	\
 	src\core\internal\abort.d \
+	src\core\internal\arrayop.d \
 	src\core\internal\convert.d \
 	src\core\internal\hash.d \
 	src\core\internal\spinlock.d \
diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d
new file mode 100644
index 0000000000..b11ffe6c4f
--- /dev/null
+++ b/src/core/internal/arrayop.d
@@ -0,0 +1,428 @@
+module core.internal.arrayop;
+import core.internal.traits : Filter, Unqual;
+
+version (GNU) version = GNU_OR_LDC;
+version (LDC) version = GNU_OR_LDC;
+
+/**
+ * Perform array (vector) operations and store the result in `res`.
+ * Operand types and operations are passed as template arguments in Reverse
+ * Polish Notation (RPN).
+ * All slice operands must have the same length as the result slice.
+ *
+ * Params: res = the slice in which to store the results
+ *        args = all other operands
+ *        Args = operand types and operations in RPN
+ *         T[] = type of result slice
+ * Returns: the slice containing the result
+ */
+T[] arrayOp(T : T[], Args...)(T[] res, Filter!(isType, Args) args) @trusted @nogc pure nothrow
+{
+    size_t pos;
+    static if (vectorizeable!(T[], Args))
+    {
+        alias vec = .vec!T;
+        alias load = .load!(T, vec.length);
+        alias store = .store!(T, vec.length);
+        alias scalarToVec = .scalarToVec!(T, vec.length);
+
+        auto n = res.length / vec.length;
+        enum nScalarInits = scalarIndices!Args.length;
+        if (n > 2 * (1 + nScalarInits)) // empirically found cost estimate
+        {
+            mixin(initScalarVecs!Args);
+
+            do
+            {
+                mixin(vectorExp!Args ~ ";");
+                pos += vec.length;
+            }
+            while (--n);
+        }
+    }
+    for (; pos < res.length; ++pos)
+        mixin(scalarExp!Args ~ ";");
+
+    return res;
+}
+
+private:
+
+// SIMD helpers
+
+version (GNU)
+    import gcc.builtins;
+else version (LDC)
+{
+    import ldc.simd;
+    import ldc.gccbuiltins_x86;
+}
+else version (DigitalMars)
+    import core.simd;
+else
+    static assert(0, "unimplemented");
+
+template vec(T)
+{
+    enum regsz = 16; // SSE2
+    enum N = regsz / T.sizeof;
+    alias vec = __vector(T[N]);
+}
+
+void store(T, size_t N)(T* p, in __vector(T[N]) val)
+{
+    pragma(inline, true);
+    alias vec = __vector(T[N]);
+
+    version (LDC)
+    {
+        storeUnaligned!vec(val, p);
+    }
+    else version (GNU)
+    {
+        static if (is(T == float))
+            __builtin_ia32_storeups(p, val);
+        else static if (is(T == double))
+            __builtin_ia32_storeupd(p, val);
+        else
+            __builtin_ia32_storedqu(cast(char*) p, val);
+    }
+    else version (DigitalMars)
+    {
+        static if (is(T == float))
+            cast(void) __simd_sto(XMM.STOUPS, *cast(vec*) p, val);
+        else static if (is(T == double))
+            cast(void) __simd_sto(XMM.STOUPD, *cast(vec*) p, val);
+        else
+            cast(void) __simd_sto(XMM.STODQU, *cast(vec*) p, val);
+    }
+}
+
+const(__vector(T[N])) load(T, size_t N)(in T* p)
+{
+    pragma(inline, true);
+    alias vec = __vector(T[N]);
+
+    version (LDC)
+    {
+        return loadUnaligned!vec(cast(T*) p);
+    }
+    else version (GNU)
+    {
+        static if (is(T == float))
+            return __builtin_ia32_loadups(p);
+        else static if (is(T == double))
+            return __builtin_ia32_loadupd(p);
+        else
+            return __builtin_ia32_loaddqu(cast(const char*) p);
+    }
+    else version (DigitalMars)
+    {
+        static if (is(T == float))
+            return __simd(XMM.LODUPS, *cast(const vec*) p);
+        else static if (is(T == double))
+            return __simd(XMM.LODUPD, *cast(const vec*) p);
+        else
+            return __simd(XMM.LODDQU, *cast(const vec*) p);
+    }
+}
+
+const(__vector(T[N])) scalarToVec(T, size_t N)(in T a)
+{
+    pragma(inline, true);
+    alias vec = __vector(T[N]);
+
+    vec res = void;
+    version (DigitalMars) // Bugzilla 7509
+        res.array = [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a][0 .. N];
+    else
+        res = a;
+    return res;
+}
+
+__vector(T[N]) binop(string op, T, size_t N)(in __vector(T[N]) a, in __vector(T[N]) b)
+{
+    pragma(inline, true);
+    return mixin("a " ~ op ~ " b");
+}
+
+__vector(T[N]) unaop(string op, T, size_t N)(in __vector(T[N]) a) if (op[0] == 'u')
+{
+    pragma(inline, true);
+    return mixin(op[1 .. $] ~ "a");
+}
+
+// mixin gen
+
+// filter out ops without matching SSE/SIMD instructions (could be composed of several instructions though)
+bool vectorizeableOps(E)(string[] ops)
+{
+    // dfmt off
+    return !(
+        ops.contains("/", "/=") && __traits(isIntegral, E) ||
+        ops.contains("*", "*=") && __traits(isIntegral, E) && E.sizeof != 2 ||
+        ops.contains("%", "%=")
+    );
+    // dfmt on
+}
+
+// filter out things like float[] = float[] / size_t[]
+enum compatibleVecTypes(E, T : T[]) = is(Unqual!T == Unqual!E); // array elem types must be same (maybe add cvtpi2ps)
+enum compatibleVecTypes(E, T) = is(T : E); // scalar must be convertible to target elem type
+enum compatibleVecTypes(E, Types...) = compatibleVecTypes!(E, Types[0 .. $ / 2])
+        && compatibleVecTypes!(E, Types[$ / 2 .. $]);
+
+template vectorizeable(E : E[], Args...)
+{
+    static if (is(vec!E))
+        enum vectorizeable = vectorizeableOps!E([Filter!(not!isType, Args)])
+                && compatibleVecTypes!(E, Filter!(isType, Args));
+    else
+        enum vectorizeable = false;
+}
+
+version (X86_64) unittest
+{
+    static assert(vectorizeable!(double[], const(double)[], double[], "+", "="));
+    static assert(!vectorizeable!(double[], const(ulong)[], double[], "+", "="));
+}
+
+bool isUnaryOp(string op)
+{
+    return op[0] == 'u';
+}
+
+bool isBinaryOp(string op)
+{
+    if (op.length != 1)
+        return false;
+    switch (op[0])
+    {
+    case '+', '-', '*', '/', '%', '|', '&', '^':
+        return true;
+    default:
+        return false;
+    }
+}
+
+bool isBinaryAssignOp(string op)
+{
+    return op.length == 2 && op[1] == '=' && isBinaryOp(op[0 .. 1]);
+}
+
+string scalarExp(Args...)()
+{
+    string[] stack;
+    size_t argsIdx;
+    foreach (i, arg; Args)
+    {
+        static if (is(arg == T[], T))
+            stack ~= "args[" ~ argsIdx++.toString ~ "][pos]";
+        else static if (is(arg))
+            stack ~= "args[" ~ argsIdx++.toString ~ "]";
+        else static if (isUnaryOp(arg))
+        {
+            auto op = arg[0] == 'u' ? arg[1 .. $] : arg;
+            stack[$ - 1] = op ~ stack[$ - 1];
+        }
+        else static if (arg == "=")
+        {
+            stack[$ - 1] = "res[pos] = cast(T)(" ~ stack[$ - 1] ~ ")";
+        }
+        else static if (isBinaryAssignOp(arg))
+        {
+            stack[$ - 1] = "res[pos] " ~ arg ~ " cast(T)(" ~ stack[$ - 1] ~ ")";
+        }
+        else static if (isBinaryOp(arg))
+        {
+            stack[$ - 2] = "(cast(T)(" ~ stack[$ - 2] ~ " " ~ arg ~ " " ~ stack[$ - 1] ~ "))";
+            stack.length -= 1;
+        }
+        else
+            assert(0, "Unexpected op " ~ arg);
+    }
+    assert(stack.length == 1);
+    return stack[0];
+}
+
+size_t[] scalarIndices(Args...)()
+{
+    size_t[] scalars;
+    foreach (i, arg; Args)
+    {
+        if (is(arg == T[], T))
+        {
+        }
+        else if (is(arg))
+            scalars ~= i;
+    }
+    return scalars;
+}
+
+string initScalarVecs(Args...)()
+{
+    auto scalars = scalarIndices!Args;
+    string res;
+    foreach (i, aidx; scalars)
+        res ~= "immutable vec scalar" ~ i.toString ~ " = scalarToVec(args[" ~ aidx
+            .toString ~ "]);\n";
+    return res;
+}
+
+string vectorExp(Args...)()
+{
+    size_t scalarsIdx, argsIdx;
+    string[] stack;
+    foreach (i, arg; Args)
+    {
+        static if (is(arg == T[], T))
+            stack ~= "load(&args[" ~ argsIdx++.toString ~ "][pos])";
+        else static if (is(arg))
+        {
+            ++argsIdx;
+            stack ~= "scalar" ~ scalarsIdx++.toString;
+        }
+        else static if (isUnaryOp(arg))
+        {
+            auto op = arg[0] == 'u' ? arg[1 .. $] : arg;
+            stack[$ - 1] = "unaop!\"" ~ arg ~ "\"(" ~ stack[$ - 1] ~ ")";
+        }
+        else static if (arg == "=")
+        {
+            stack[$ - 1] = "store(&res[pos], " ~ stack[$ - 1] ~ ")";
+        }
+        else static if (isBinaryAssignOp(arg))
+        {
+            stack[$ - 1] = "store(&res[pos], binop!\"" ~ arg[0 .. $ - 1]
+                ~ "\"(load(&res[pos]), " ~ stack[$ - 1] ~ "))";
+        }
+        else static if (isBinaryOp(arg))
+        {
+            stack[$ - 2] = "binop!\"" ~ arg ~ "\"(" ~ stack[$ - 2] ~ ", " ~ stack[$ - 1] ~ ")";
+            stack.length -= 1;
+        }
+        else
+            assert(0, "Unexpected op " ~ arg);
+    }
+    assert(stack.length == 1);
+    return stack[0];
+}
+
+// other helpers
+
+enum isType(T) = true;
+enum isType(alias a) = false;
+template not(alias tmlp)
+{
+    enum not(Args...) = !tmlp!Args;
+}
+
+string toString(size_t num)
+{
+    import core.internal.string : unsignedToTempString;
+
+    char[20] buf = void;
+    return unsignedToTempString(num, buf).idup;
+}
+
+bool contains(T)(in T[] ary, in T[] vals...)
+{
+    foreach (v1; ary)
+        foreach (v2; vals)
+            if (v1 == v2)
+                return true;
+    return false;
+}
+
+// tests
+
+version (unittest) template TT(T...)
+{
+    alias TT = T;
+}
+
+version (unittest) template _arrayOp(Args...)
+{
+    alias _arrayOp = arrayOp!Args;
+}
+
+unittest
+{
+    static void check(string op, TA, TB, T, size_t N)(TA a, TB b, in ref T[N] exp)
+    {
+        T[N] res;
+        _arrayOp!(T[], TA, TB, op, "=")(res[], a, b);
+        foreach (i; 0 .. N)
+            assert(res[i] == exp[i]);
+    }
+
+    static void check2(string unaOp, string binOp, TA, TB, T, size_t N)(TA a, TB b, in ref T[N] exp)
+    {
+        T[N] res;
+        _arrayOp!(T[], TA, TB, unaOp, binOp, "=")(res[], a, b);
+        foreach (i; 0 .. N)
+            assert(res[i] == exp[i]);
+    }
+
+    static void test(T, string op, size_t N = 16)(T a, T b, T exp)
+    {
+        T[N] va = a, vb = b, vexp = exp;
+
+        check!op(va[], vb[], vexp);
+        check!op(va[], b, vexp);
+        check!op(a, vb[], vexp);
+    }
+
+    static void test2(T, string unaOp, string binOp, size_t N = 16)(T a, T b, T exp)
+    {
+        T[N] va = a, vb = b, vexp = exp;
+
+        check2!(unaOp, binOp)(va[], vb[], vexp);
+        check2!(unaOp, binOp)(va[], b, vexp);
+        check2!(unaOp, binOp)(a, vb[], vexp);
+    }
+
+    alias UINTS = TT!(ubyte, ushort, uint, ulong);
+    alias INTS = TT!(byte, short, int, long);
+    alias FLOATS = TT!(float, double);
+
+    foreach (T; TT!(UINTS, INTS, FLOATS))
+    {
+        test!(T, "+")(1, 2, 3);
+        test!(T, "-")(3, 2, 1);
+
+        test2!(T, "u-", "+")(3, 2, 1);
+    }
+
+    foreach (T; TT!(UINTS, INTS))
+    {
+        test!(T, "|")(1, 2, 3);
+        test!(T, "&")(3, 1, 1);
+        test!(T, "^")(3, 1, 2);
+
+        test2!(T, "u~", "+")(3, cast(T)~2, 5);
+    }
+
+    foreach (T; TT!(INTS, FLOATS))
+    {
+        test!(T, "-")(1, 2, -1);
+        test2!(T, "u-", "+")(-3, -2, -1);
+        test2!(T, "u-", "*")(-3, -2, -6);
+    }
+
+    foreach (T; TT!(UINTS, INTS, FLOATS))
+    {
+        test!(T, "*")(2, 3, 6);
+        test!(T, "/")(8, 4, 2);
+        test!(T, "%")(8, 6, 2);
+    }
+}
+
+// test rewrite of v op= exp to v = v op exp
+unittest
+{
+    byte[32] c;
+    arrayOp!(byte[], byte, "+=")(c[], cast(byte) 6);
+    foreach (v; c)
+        assert(v == 6);
+}
diff --git a/src/core/internal/traits.d b/src/core/internal/traits.d
index 8142f90c51..e15dff5204 100644
--- a/src/core/internal/traits.d
+++ b/src/core/internal/traits.d
@@ -187,3 +187,26 @@ template hasElaborateCopyConstructor(T...)
     else
         enum bool hasElaborateCopyConstructor = false;
 }
+
+// std.meta.Filter
+template Filter(alias pred, TList...)
+{
+    static if (TList.length == 0)
+    {
+        alias Filter = TypeTuple!();
+    }
+    else static if (TList.length == 1)
+    {
+        static if (pred!(TList[0]))
+            alias Filter = TypeTuple!(TList[0]);
+        else
+            alias Filter = TypeTuple!();
+    }
+    else
+    {
+        alias Filter =
+            TypeTuple!(
+                Filter!(pred, TList[ 0  .. $/2]),
+                Filter!(pred, TList[$/2 ..  $ ]));
+    }
+}
diff --git a/src/object.d b/src/object.d
index 5506ade80e..97dd6779b6 100644
--- a/src/object.d
+++ b/src/object.d
@@ -3634,6 +3634,13 @@ if (!__traits(isScalar, T1))
     assert(__cmp([c2, c2], [c1, c1]) > 0);
 }
 
+// Compiler hook into the runtime implementation of array (vector) operations.
+template _arrayOp(Args...)
+{
+    import core.internal.arrayop;
+    alias _arrayOp = arrayOp!Args;
+}
+
 // Helper functions
 
 private inout(TypeInfo) getElement(inout TypeInfo value) @trusted pure nothrow
diff --git a/win32.mak b/win32.mak
index 91262264c0..3494159935 100644
--- a/win32.mak
+++ b/win32.mak
@@ -266,6 +266,9 @@ $(IMPDIR)\core\vararg.d : src\core\vararg.d
 $(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d
 	copy $** $@
 
+$(IMPDIR)\core\internal\arrayop.d : src\core\internal\arrayop.d
+	copy $** $@
+
 $(IMPDIR)\core\internal\convert.d : src\core\internal\convert.d
 	copy $** $@
 
diff --git a/win64.mak b/win64.mak
index d9e10522b5..97b1aa7181 100644
--- a/win64.mak
+++ b/win64.mak
@@ -277,6 +277,9 @@ $(IMPDIR)\core\vararg.d : src\core\vararg.d
 $(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d
 	copy $** $@
 
+$(IMPDIR)\core\internal\arrayop.d : src\core\internal\arrayop.d
+	copy $** $@
+
 $(IMPDIR)\core\internal\convert.d : src\core\internal\convert.d
 	copy $** $@
 

From 84d49d18b25f0ec52d03a9d5d9ec4beaace3ab08 Mon Sep 17 00:00:00 2001
From: Martin Nowak <code@dawg.eu>
Date: Mon, 13 Mar 2017 16:01:29 +0100
Subject: [PATCH 02/12] fix plotting of arrayops benchmark

- properly sort/order values on abscissa
---
 benchmark/arrayops/arrayops.d |  2 +-
 benchmark/arrayops/plot.R     | 15 +++++++--------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/benchmark/arrayops/arrayops.d b/benchmark/arrayops/arrayops.d
index 60490b02d7..bce3ea14b1 100644
--- a/benchmark/arrayops/arrayops.d
+++ b/benchmark/arrayops/arrayops.d
@@ -180,7 +180,7 @@ void main()
     unmaskFPUExceptions;
 
     writefln("type, op, %(latency%s, %), %-(throughput%s, %)", iota(6)
-        .map!(i => 1 << i), ["8KB", "32KB", "512KB", "32MB"]);
+        .map!(i => 1 << i), ["8KB", "32KB", "512KB", "32768KB"]);
     foreach (op; mixin("AliasSeq!(%(%s, %))".format(genOps)))
         runOp!op;
     maskFPUExceptions;
diff --git a/benchmark/arrayops/plot.R b/benchmark/arrayops/plot.R
index 6574f03ea8..a1ba783810 100644
--- a/benchmark/arrayops/plot.R
+++ b/benchmark/arrayops/plot.R
@@ -16,18 +16,17 @@ for (file in files)
      dat = bind_rows(dat, datFile)
 }
 
-latencies <- gather(dat %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency'))
-throughputs <- gather(dat %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput'))
-
-levels(latencies$num_elems) <- sub("latency(\\d+)", "\\1", levels(latencies$num_elems))
-levels(throughputs$array_size) <- sub("throughput(.+)", "\\1", levels(throughputs$array_size))
+latencies <- gather(dat %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency')) %>%
+    mutate(num_elems = factor(as.integer(sub("latency(\\d+)", "\\1", num_elems))))
+throughputs <- gather(dat %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput')) %>%
+    mutate(array_size = factor(as.integer(sub("throughput(\\d+)KB", "\\1", array_size))))
 
 img <- qplot(num_elems, latency, group=type, data=latencies, geom="line", color=type) +
   facet_grid(op ~ file, scales="free_y") +
   labs(x="num elements", y="latency / ns")
-ggsave('array_ops_latency.svg', plot = img, width = 2 + 3 * length(files), height = 40)
+ggsave('array_ops_latency.png', plot = img, width = 2 + 3 * length(files), height = 40)
 
 img <- qplot(array_size, throughput, group=type, data=throughputs, geom="line", color=type) +
   facet_grid(op ~ file, scales="free_y") +
-  labs(x="array size", y="throughput / (ops / ns)")
-ggsave('array_ops_throughput.svg', plot = img, width = 2 + 3 * length(files), height = 40)
+  labs(x="array size / KB", y="throughput / (ops / ns)")
+ggsave('array_ops_throughput.png', plot = img, width = 2 + 3 * length(files), height = 40)

From f8dd223c8eafb8dc30015f3ab3c2e476694d830d Mon Sep 17 00:00:00 2001
From: Martin Nowak <code@dawg.eu>
Date: Mon, 13 Mar 2017 14:47:41 +0100
Subject: [PATCH 03/12] change plot to relative numbers

---
 benchmark/arrayops/plot.R | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/benchmark/arrayops/plot.R b/benchmark/arrayops/plot.R
index a1ba783810..5b770dcb06 100644
--- a/benchmark/arrayops/plot.R
+++ b/benchmark/arrayops/plot.R
@@ -1,32 +1,30 @@
-# Use `R --vanilla < plot.R` to run this script.
-# It will read all *.csv files from the current folder and create a comparison plot for them.
+# Use `Rscript --vanilla plot.R old.csv new.csv` to run this script.
+# It will read old.csv and new.csv files and create a comparison plot for them.
 library(ggplot2)
 library(dplyr)
 library(tidyr)
 
 dat <- NULL
-files <- list.files(pattern='*.csv')
-for (file in files)
-{
-  datFile <- read.csv(file) %>% tbl_df() %>%
-    mutate(file=file)
-  if (is.null(dat))
-     dat = datFile
-  else
-     dat = bind_rows(dat, datFile)
-}
+args <- commandArgs(trailingOnly=T)
+old <- read.csv(args[1]) %>% tbl_df()
+new <- read.csv(args[2]) %>% tbl_df()
 
-latencies <- gather(dat %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency')) %>%
+col.indices <- which(!colnames(new) %in% c("type", "op"))
+
+# relative values
+new[,col.indices] <- 100 * new[,col.indices] / old[,col.indices]
+
+latencies <- gather(new %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency')) %>%
     mutate(num_elems = factor(as.integer(sub("latency(\\d+)", "\\1", num_elems))))
-throughputs <- gather(dat %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput')) %>%
+throughputs <- gather(new %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput')) %>%
     mutate(array_size = factor(as.integer(sub("throughput(\\d+)KB", "\\1", array_size))))
 
 img <- qplot(num_elems, latency, group=type, data=latencies, geom="line", color=type) +
-  facet_grid(op ~ file, scales="free_y") +
-  labs(x="num elements", y="latency / ns")
-ggsave('array_ops_latency.png', plot = img, width = 2 + 3 * length(files), height = 40)
+  facet_grid(op ~ ., scales="free_y") +
+  labs(x="num elements", y="relative latency / %")
+ggsave('array_ops_latency.png', plot = img, width = 8, height = 40)
 
 img <- qplot(array_size, throughput, group=type, data=throughputs, geom="line", color=type) +
-  facet_grid(op ~ file, scales="free_y") +
-  labs(x="array size / KB", y="throughput / (ops / ns)")
-ggsave('array_ops_throughput.png', plot = img, width = 2 + 3 * length(files), height = 40)
+  facet_grid(op ~ ., scales="free_y") +
+  labs(x="array size / KB", y="relative throughput / %")
+ggsave('array_ops_throughput.png', plot = img, width = 8, height = 40)

From 973f3a29a66e15e27e196604b2594bf3c0d4bf44 Mon Sep 17 00:00:00 2001
From: Martin Nowak <code@dawg.eu>
Date: Tue, 25 Jul 2017 00:15:49 +0200
Subject: [PATCH 04/12] switch to easier to read bar plot

---
 benchmark/arrayops/plot.R | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/benchmark/arrayops/plot.R b/benchmark/arrayops/plot.R
index 5b770dcb06..5bb4a99e2f 100644
--- a/benchmark/arrayops/plot.R
+++ b/benchmark/arrayops/plot.R
@@ -14,17 +14,22 @@ col.indices <- which(!colnames(new) %in% c("type", "op"))
 # relative values
 new[,col.indices] <- 100 * new[,col.indices] / old[,col.indices]
 
+# arrange type factor levels
+new$type <- factor(new$type, levels = c('byte', 'ubyte', 'short', 'ushort', 'int', 'uint', 'long', 'ulong', 'float', 'double'))
+
 latencies <- gather(new %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency')) %>%
     mutate(num_elems = factor(as.integer(sub("latency(\\d+)", "\\1", num_elems))))
 throughputs <- gather(new %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput')) %>%
     mutate(array_size = factor(as.integer(sub("throughput(\\d+)KB", "\\1", array_size))))
 
-img <- qplot(num_elems, latency, group=type, data=latencies, geom="line", color=type) +
+img <- ggplot(latencies, aes(x=num_elems, y=latency, fill=type)) +
+  geom_bar(position="dodge", stat="identity") +
   facet_grid(op ~ ., scales="free_y") +
   labs(x="num elements", y="relative latency / %")
 ggsave('array_ops_latency.png', plot = img, width = 8, height = 40)
 
-img <- qplot(array_size, throughput, group=type, data=throughputs, geom="line", color=type) +
+img <- ggplot(throughputs, aes(x=array_size, y=throughput, fill=type)) +
+  geom_bar(position="dodge", stat="identity") +
   facet_grid(op ~ ., scales="free_y") +
   labs(x="array size / KB", y="relative throughput / %")
 ggsave('array_ops_throughput.png', plot = img, width = 8, height = 40)

From 549bc8be02343646260b761fa8e644ea9f1cc3ca Mon Sep 17 00:00:00 2001
From: Martin Nowak <code@dawg.eu>
Date: Sun, 9 Apr 2017 10:40:15 +0200
Subject: [PATCH 05/12] vectorizable ops by introspection

- support for target-specific vector ops (e.g. AVX vs. SSE2)
---
 src/core/internal/arrayop.d | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d
index b11ffe6c4f..fe8a1d1a25 100644
--- a/src/core/internal/arrayop.d
+++ b/src/core/internal/arrayop.d
@@ -155,15 +155,18 @@ __vector(T[N]) unaop(string op, T, size_t N)(in __vector(T[N]) a) if (op[0] == '
 // mixin gen
 
 // filter out ops without matching SSE/SIMD instructions (could be composed of several instructions though)
-bool vectorizeableOps(E)(string[] ops)
+template vectorizeableOps(E, ops...) if (ops.length > 1)
 {
-    // dfmt off
-    return !(
-        ops.contains("/", "/=") && __traits(isIntegral, E) ||
-        ops.contains("*", "*=") && __traits(isIntegral, E) && E.sizeof != 2 ||
-        ops.contains("%", "%=")
-    );
-    // dfmt on
+    enum vectorizeableOps = vectorizeableOps!(E, ops[0 .. $ / 2])
+            && vectorizeableOps!(E, ops[$ / 2 .. $]);
+}
+
+template vectorizeableOps(E, string op)
+{
+    static if (isUnaryOp(op))
+        enum vectorizeableOps = is(typeof((vec!E a) => mixin(op[1 .. $] ~ " a")));
+    else
+        enum vectorizeableOps = is(typeof((vec!E a, vec!E b) => mixin("a " ~ op ~ " b")));
 }
 
 // filter out things like float[] = float[] / size_t[]

From 60d0eefbcddb497b9c5513f31062316da0c2dee2 Mon Sep 17 00:00:00 2001
From: Martin Nowak <code@dawg.eu>
Date: Mon, 10 Apr 2017 00:48:21 +0200
Subject: [PATCH 06/12] proper error message for unsupported scalar ops

- with UDTs
---
 src/core/internal/arrayop.d | 53 ++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 7 deletions(-)

diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d
index fe8a1d1a25..42cc212c81 100644
--- a/src/core/internal/arrayop.d
+++ b/src/core/internal/arrayop.d
@@ -18,6 +18,8 @@ version (LDC) version = GNU_OR_LDC;
  */
 T[] arrayOp(T : T[], Args...)(T[] res, Filter!(isType, Args) args) @trusted @nogc pure nothrow
 {
+    enum check = opsSupported!(true, T, Filter!(not!isType, Args)); // must support all scalar ops
+
     size_t pos;
     static if (vectorizeable!(T[], Args))
     {
@@ -155,18 +157,26 @@ __vector(T[N]) unaop(string op, T, size_t N)(in __vector(T[N]) a) if (op[0] == '
 // mixin gen
 
 // filter out ops without matching SSE/SIMD instructions (could be composed of several instructions though)
-template vectorizeableOps(E, ops...) if (ops.length > 1)
+template opsSupported(bool fail, T, ops...) if (ops.length > 1)
 {
-    enum vectorizeableOps = vectorizeableOps!(E, ops[0 .. $ / 2])
-            && vectorizeableOps!(E, ops[$ / 2 .. $]);
+    enum opsSupported = opsSupported!(fail, T, ops[0 .. $ / 2])
+            && opsSupported!(fail, T, ops[$ / 2 .. $]);
 }
 
-template vectorizeableOps(E, string op)
+template opsSupported(bool fail, T, string op)
 {
     static if (isUnaryOp(op))
-        enum vectorizeableOps = is(typeof((vec!E a) => mixin(op[1 .. $] ~ " a")));
+    {
+        enum opsSupported = is(typeof((T a) => mixin(op[1 .. $] ~ " a")));
+        static assert(!fail || opsSupported,
+                "Unary op `" ~ op[1 .. $] ~ "` not supported for element type " ~ T.stringof ~ ".");
+    }
     else
-        enum vectorizeableOps = is(typeof((vec!E a, vec!E b) => mixin("a " ~ op ~ " b")));
+    {
+        enum opsSupported = is(typeof((T a, T b) => mixin("a " ~ op ~ " b")));
+        static assert(!fail || opsSupported,
+                "Binary op `" ~ op ~ "` not supported for element type " ~ T.stringof ~ ".");
+    }
 }
 
 // filter out things like float[] = float[] / size_t[]
@@ -178,7 +188,7 @@ enum compatibleVecTypes(E, Types...) = compatibleVecTypes!(E, Types[0 .. $ / 2])
 template vectorizeable(E : E[], Args...)
 {
     static if (is(vec!E))
-        enum vectorizeable = vectorizeableOps!E([Filter!(not!isType, Args)])
+        enum vectorizeable = opsSupported!(false, vec!E, Filter!(not!isType, Args))
                 && compatibleVecTypes!(E, Filter!(isType, Args));
     else
         enum vectorizeable = false;
@@ -429,3 +439,32 @@ unittest
     foreach (v; c)
         assert(v == 6);
 }
+
+// proper error message for UDT lacking certain ops
+unittest
+{
+    static assert(!is(typeof(&arrayOp!(int[4][], int[4], "+="))));
+    static assert(!is(typeof(&arrayOp!(int[4][], int[4], "u-", "="))));
+
+    static struct S
+    {
+    }
+
+    static assert(!is(typeof(&arrayOp!(S[], S, "+="))));
+    static assert(!is(typeof(&arrayOp!(S[], S[], "*", S, "+="))));
+    static struct S2
+    {
+        S2 opBinary(string op)(in S2) @nogc pure nothrow
+        {
+            return this;
+        }
+
+        ref S2 opOpAssign(string op)(in S2) @nogc pure nothrow
+        {
+            return this;
+        }
+    }
+
+    static assert(is(typeof(&arrayOp!(S2[], S2[], S2[], S2, "*", "+", "="))));
+    static assert(is(typeof(&arrayOp!(S2[], S2[], S2, "*", "+="))));
+}

From 4eaf5004d2eddbc8df860d43d521fed25c4c0ae0 Mon Sep 17 00:00:00 2001
From: Martin Nowak <code@dawg.eu>
Date: Tue, 11 Apr 2017 12:36:03 +0200
Subject: [PATCH 07/12] remove Issue 7509/16488 workaround

- dmd got broadcast init with #6248
---
 src/core/internal/arrayop.d | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d
index 42cc212c81..5eed3c61fa 100644
--- a/src/core/internal/arrayop.d
+++ b/src/core/internal/arrayop.d
@@ -26,7 +26,6 @@ T[] arrayOp(T : T[], Args...)(T[] res, Filter!(isType, Args) args) @trusted @nog
         alias vec = .vec!T;
         alias load = .load!(T, vec.length);
         alias store = .store!(T, vec.length);
-        alias scalarToVec = .scalarToVec!(T, vec.length);
 
         auto n = res.length / vec.length;
         enum nScalarInits = scalarIndices!Args.length;
@@ -129,19 +128,6 @@ const(__vector(T[N])) load(T, size_t N)(in T* p)
     }
 }
 
-const(__vector(T[N])) scalarToVec(T, size_t N)(in T a)
-{
-    pragma(inline, true);
-    alias vec = __vector(T[N]);
-
-    vec res = void;
-    version (DigitalMars) // Bugzilla 7509
-        res.array = [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a][0 .. N];
-    else
-        res = a;
-    return res;
-}
-
 __vector(T[N]) binop(string op, T, size_t N)(in __vector(T[N]) a, in __vector(T[N]) b)
 {
     pragma(inline, true);
@@ -277,8 +263,7 @@ string initScalarVecs(Args...)()
     auto scalars = scalarIndices!Args;
     string res;
     foreach (i, aidx; scalars)
-        res ~= "immutable vec scalar" ~ i.toString ~ " = scalarToVec(args[" ~ aidx
-            .toString ~ "]);\n";
+        res ~= "immutable vec scalar" ~ i.toString ~ " = args[" ~ aidx.toString ~ "];\n";
     return res;
 }
 

From 9c5d83b3a7b5d2f2164d78715350dcb00d4a0b8e Mon Sep 17 00:00:00 2001
From: Martin Nowak <code@dawg.eu>
Date: Fri, 9 Jun 2017 18:21:05 +0200
Subject: [PATCH 08/12] always use vec ops

---
 src/core/internal/arrayop.d | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d
index 5eed3c61fa..cd4795ac56 100644
--- a/src/core/internal/arrayop.d
+++ b/src/core/internal/arrayop.d
@@ -27,12 +27,14 @@ T[] arrayOp(T : T[], Args...)(T[] res, Filter!(isType, Args) args) @trusted @nog
         alias load = .load!(T, vec.length);
         alias store = .store!(T, vec.length);
 
-        auto n = res.length / vec.length;
-        enum nScalarInits = scalarIndices!Args.length;
-        if (n > 2 * (1 + nScalarInits)) // empirically found cost estimate
+        // Given that there are at most as many scalars broadcast as there are
+        // operations in any `ary[] = ary[] op const op const`, it should always be
+        // worthwhile to choose vector operations.
+        if (res.length >= vec.length)
         {
             mixin(initScalarVecs!Args);
 
+            auto n = res.length / vec.length;
             do
             {
                 mixin(vectorExp!Args ~ ";");
@@ -244,26 +246,19 @@ string scalarExp(Args...)()
     return stack[0];
 }
 
-size_t[] scalarIndices(Args...)()
+string initScalarVecs(Args...)()
 {
-    size_t[] scalars;
-    foreach (i, arg; Args)
+    size_t scalarsIdx;
+    string res;
+    foreach (aidx, arg; Args)
     {
-        if (is(arg == T[], T))
+        static if (is(arg == T[], T))
         {
         }
-        else if (is(arg))
-            scalars ~= i;
+        else static if (is(arg))
+            res ~= "immutable vec scalar" ~ scalarsIdx++.toString ~ " = args["
+                ~ aidx.toString ~ "];\n";
     }
-    return scalars;
-}
-
-string initScalarVecs(Args...)()
-{
-    auto scalars = scalarIndices!Args;
-    string res;
-    foreach (i, aidx; scalars)
-        res ~= "immutable vec scalar" ~ i.toString ~ " = args[" ~ aidx.toString ~ "];\n";
     return res;
 }
 

From 9d7faf97cf4830b2d111aaff493814e30b70019b Mon Sep 17 00:00:00 2001
From: Martin Nowak <code@dawg.eu>
Date: Tue, 25 Jul 2017 00:50:26 +0200
Subject: [PATCH 09/12] rely on auto-vectorizer for gdc/ldc

- the auto-vectorizers seem to have improved considerably since this module was written
- generated code for scalar loops and for vector loops ends up being almost identical,
  so it seems more reasonable to leave decisions completely to the auto-vectorizers.
---
 src/core/internal/arrayop.d | 118 ++++++++++++++----------------------
 1 file changed, 45 insertions(+), 73 deletions(-)

diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d
index cd4795ac56..724c5b8e11 100644
--- a/src/core/internal/arrayop.d
+++ b/src/core/internal/arrayop.d
@@ -53,45 +53,22 @@ private:
 
 // SIMD helpers
 
-version (GNU)
-    import gcc.builtins;
-else version (LDC)
+version (DigitalMars)
 {
-    import ldc.simd;
-    import ldc.gccbuiltins_x86;
-}
-else version (DigitalMars)
     import core.simd;
-else
-    static assert(0, "unimplemented");
-
-template vec(T)
-{
-    enum regsz = 16; // SSE2
-    enum N = regsz / T.sizeof;
-    alias vec = __vector(T[N]);
-}
 
-void store(T, size_t N)(T* p, in __vector(T[N]) val)
-{
-    pragma(inline, true);
-    alias vec = __vector(T[N]);
-
-    version (LDC)
-    {
-        storeUnaligned!vec(val, p);
-    }
-    else version (GNU)
+    template vec(T)
     {
-        static if (is(T == float))
-            __builtin_ia32_storeups(p, val);
-        else static if (is(T == double))
-            __builtin_ia32_storeupd(p, val);
-        else
-            __builtin_ia32_storedqu(cast(char*) p, val);
+        enum regsz = 16; // SSE2
+        enum N = regsz / T.sizeof;
+        alias vec = __vector(T[N]);
     }
-    else version (DigitalMars)
+
+    void store(T, size_t N)(T* p, in __vector(T[N]) val)
     {
+        pragma(inline, true);
+        alias vec = __vector(T[N]);
+
         static if (is(T == float))
             cast(void) __simd_sto(XMM.STOUPS, *cast(vec*) p, val);
         else static if (is(T == double))
@@ -99,28 +76,14 @@ void store(T, size_t N)(T* p, in __vector(T[N]) val)
         else
             cast(void) __simd_sto(XMM.STODQU, *cast(vec*) p, val);
     }
-}
-
-const(__vector(T[N])) load(T, size_t N)(in T* p)
-{
-    pragma(inline, true);
-    alias vec = __vector(T[N]);
 
-    version (LDC)
-    {
-        return loadUnaligned!vec(cast(T*) p);
-    }
-    else version (GNU)
-    {
-        static if (is(T == float))
-            return __builtin_ia32_loadups(p);
-        else static if (is(T == double))
-            return __builtin_ia32_loadupd(p);
-        else
-            return __builtin_ia32_loaddqu(cast(const char*) p);
-    }
-    else version (DigitalMars)
+    const(__vector(T[N])) load(T, size_t N)(in T* p)
     {
+        import core.simd;
+
+        pragma(inline, true);
+        alias vec = __vector(T[N]);
+
         static if (is(T == float))
             return __simd(XMM.LODUPS, *cast(const vec*) p);
         else static if (is(T == double))
@@ -128,18 +91,19 @@ const(__vector(T[N])) load(T, size_t N)(in T* p)
         else
             return __simd(XMM.LODDQU, *cast(const vec*) p);
     }
-}
 
-__vector(T[N]) binop(string op, T, size_t N)(in __vector(T[N]) a, in __vector(T[N]) b)
-{
-    pragma(inline, true);
-    return mixin("a " ~ op ~ " b");
-}
+    __vector(T[N]) binop(string op, T, size_t N)(in __vector(T[N]) a, in __vector(T[N]) b)
+    {
+        pragma(inline, true);
+        return mixin("a " ~ op ~ " b");
+    }
 
-__vector(T[N]) unaop(string op, T, size_t N)(in __vector(T[N]) a) if (op[0] == 'u')
-{
-    pragma(inline, true);
-    return mixin(op[1 .. $] ~ "a");
+    __vector(T[N]) unaop(string op, T, size_t N)(in __vector(T[N]) a)
+            if (op[0] == 'u')
+    {
+        pragma(inline, true);
+        return mixin(op[1 .. $] ~ "a");
+    }
 }
 
 // mixin gen
@@ -173,19 +137,27 @@ enum compatibleVecTypes(E, T) = is(T : E); // scalar must be convertible to targ
 enum compatibleVecTypes(E, Types...) = compatibleVecTypes!(E, Types[0 .. $ / 2])
         && compatibleVecTypes!(E, Types[$ / 2 .. $]);
 
-template vectorizeable(E : E[], Args...)
+version (GNU_OR_LDC)
 {
-    static if (is(vec!E))
-        enum vectorizeable = opsSupported!(false, vec!E, Filter!(not!isType, Args))
-                && compatibleVecTypes!(E, Filter!(isType, Args));
-    else
-        enum vectorizeable = false;
+    // leave it to the auto-vectorizer
+    enum vectorizeable(E : E[], Args...) = false;
 }
-
-version (X86_64) unittest
+else
 {
-    static assert(vectorizeable!(double[], const(double)[], double[], "+", "="));
-    static assert(!vectorizeable!(double[], const(ulong)[], double[], "+", "="));
+    template vectorizeable(E : E[], Args...)
+    {
+        static if (is(vec!E))
+            enum vectorizeable = opsSupported!(false, vec!E, Filter!(not!isType, Args))
+                    && compatibleVecTypes!(E, Filter!(isType, Args));
+        else
+            enum vectorizeable = false;
+    }
+
+    version (X86_64) unittest
+    {
+        static assert(vectorizeable!(double[], const(double)[], double[], "+", "="));
+        static assert(!vectorizeable!(double[], const(ulong)[], double[], "+", "="));
+    }
 }
 
 bool isUnaryOp(string op)

From 69ff7244c8aa95ce6a9a0514c5438885ec7945fa Mon Sep 17 00:00:00 2001
From: Martin Nowak <code@dawg.eu>
Date: Tue, 25 Jul 2017 00:14:51 +0200
Subject: [PATCH 10/12] use __gshared scalar to avoid const-folding

- e.g. replacement of ary[] / scalar with weaker ary[] >> 1
---
 benchmark/arrayops/arrayops.d | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/benchmark/arrayops/arrayops.d b/benchmark/arrayops/arrayops.d
index bce3ea14b1..dae0a1548a 100644
--- a/benchmark/arrayops/arrayops.d
+++ b/benchmark/arrayops/arrayops.d
@@ -21,13 +21,16 @@ float[6] getLatencies(T, string op)()
             a[] = 24;
             b[] = 4;
             c[] = 2;
+            __gshared T s = 2; // scalar, use __gshared to avoid const-folding
             auto sw = StopWatch(AutoStart.yes);
             foreach (off; size_t(0) .. size_t(64))
             {
                 off = off * len + off;
-                enum op = op.replace("const", "2").replace("a",
-                        "a[off .. off + len]").replace("b",
-                        "b[off .. off + len]").replace("c", "c[off .. off + len]");
+                enum op = op
+                    .replace("scalar", "s")
+                    .replace("a", "a[off .. off + len]")
+                    .replace("b", "b[off .. off + len]")
+                    .replace("c", "c[off .. off + len]");
                 mixin(op ~ ";");
             }
             latency = min(latency, sw.peek.nsecs);
@@ -54,13 +57,16 @@ float[4] getThroughput(T, string op)()
             a[] = 24;
             b[] = 4;
             c[] = 2;
+            __gshared T s = 2; // scalar, use __gshared to avoid const-folding
             auto sw = StopWatch(AutoStart.yes);
             foreach (off; size_t(0) .. size_t(64))
             {
                 off = off * len + off;
-                enum op = op.replace("const", "2").replace("a",
-                        "a[off .. off + len]").replace("b",
-                        "b[off .. off + len]").replace("c", "c[off .. off + len]");
+                enum op = op
+                    .replace("scalar", "s")
+                    .replace("a", "a[off .. off + len]")
+                    .replace("b", "b[off .. off + len]")
+                    .replace("c", "c[off .. off + len]");
                 mixin(op ~ ";");
             }
             immutable nsecs = sw.peek.nsecs;
@@ -78,11 +84,11 @@ string[] genOps()
     foreach (op1; ["+", "-", "*", "/"])
     {
         ops ~= "a " ~ op1 ~ "= b";
-        ops ~= "a " ~ op1 ~ "= const";
+        ops ~= "a " ~ op1 ~ "= scalar";
         foreach (op2; ["+", "-", "*", "/"])
         {
             ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " c";
-            ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " const";
+            ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " scalar";
         }
     }
     return ops;

From 6bdc5a4c358861408015443cf0e41dca68e340e5 Mon Sep 17 00:00:00 2001
From: Martin Nowak <code@dawg.eu>
Date: Wed, 26 Jul 2017 12:34:02 +0200
Subject: [PATCH 11/12] add changelog for templated array ops

---
 changelog/vectorized_array_ops.dd | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 changelog/vectorized_array_ops.dd

diff --git a/changelog/vectorized_array_ops.dd b/changelog/vectorized_array_ops.dd
new file mode 100644
index 0000000000..0b20c4e4f4
--- /dev/null
+++ b/changelog/vectorized_array_ops.dd
@@ -0,0 +1,10 @@
+Vectorized array operations are now templated
+
+Array operations have been converted from dedicated assembly routines for $(B some) array operations to a generic template implementation for $(B all) array operations. This provides huge performance increases (2-4x higher throughput) for array operations that were not previously vectorized.
+Furthermore, the implementation makes better use of vectorization even for short arrays, heavily reducing latency for some operations (up to 4x).
+
+For GDC/LDC the implementation relies on auto-vectorization; for DMD the implementation performs the vectorization itself. Support for vector operations with DMD is determined statically (`-mcpu=native`, `-mcpu=avx2`) to avoid binary bloat and the small test overhead. DMD enables SSE2 for 64-bit targets by default.
+
+Also see $(DRUNTIMEPR 1891)
+
+$(RED Note:) The implementation no longer weakens floating point divisions (e.g. `ary[] / scalar`) to multiplication (`ary[] * (1.0 / scalar)`) as that may reduce precision. To preserve the higher performance of float multiplication when loss of precision is acceptable, use either `-ffast-math` with GDC/LDC or manually rewrite your code to multiply by `(1.0 / scalar)` for DMD.

From aee45fbb8043def66f81620c4752072ac22db3d1 Mon Sep 17 00:00:00 2001
From: Martin Nowak <code@dawg.eu>
Date: Wed, 9 Aug 2017 08:28:04 +0200
Subject: [PATCH 12/12] more docs

---
 src/core/internal/arrayop.d | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d
index 724c5b8e11..d24c849e76 100644
--- a/src/core/internal/arrayop.d
+++ b/src/core/internal/arrayop.d
@@ -5,15 +5,24 @@ version (GNU) version = GNU_OR_LDC;
 version (LDC) version = GNU_OR_LDC;
 
 /**
- * Perform array (vector) operations and store the result in `res`.
- * Operand types and operations are passed as template arguments in Reverse
- * Polish Notation (RPN).
+ * Perform array (vector) operations and store the result in `res`.  Operand
+ * types and operations are passed as template arguments in Reverse Polish
+ * Notation (RPN).
+ *
+ * Operands can be slices or scalar types. The unqualified element types of all
+ * slices must be `T`; scalar types must be implicitly convertible to `T`.
+ *
+ * Operations are encoded as strings, e.g. `"+"`, `"%"`, `"*="`. Unary
+ * operations are prefixed with "u", e.g. `"u-"`, `"u~"`. The last operation,
+ * and only the last, must be an assignment (`"="`) or op-assignment (`"op="`).
+ *
  * All slice operands must have the same length as the result slice.
  *
- * Params: res = the slice in which to store the results
- *        args = all other operands
+ * Params: T[] = type of result slice
  *        Args = operand types and operations in RPN
- *         T[] = type of result slice
+ *         res = the slice in which to store the results
+ *        args = operand values
+ *
  * Returns: the slice containing the result
  */
 T[] arrayOp(T : T[], Args...)(T[] res, Filter!(isType, Args) args) @trusted @nogc pure nothrow
@@ -108,7 +117,7 @@ version (DigitalMars)
 
 // mixin gen
 
-// filter out ops without matching SSE/SIMD instructions (could be composed of several instructions though)
+// Check whether operations `ops` are supported for type `T`. Fails with a human-friendly static assert message if `fail` is true.
 template opsSupported(bool fail, T, ops...) if (ops.length > 1)
 {
     enum opsSupported = opsSupported!(fail, T, ops[0 .. $ / 2])
@@ -131,7 +140,8 @@ template opsSupported(bool fail, T, string op)
     }
 }
 
-// filter out things like float[] = float[] / size_t[]
+// check whether slices have the unqualified element type `E` and scalars are implicitly convertible to `E`
+// i.e. filter out things like float[] = float[] / size_t[]
 enum compatibleVecTypes(E, T : T[]) = is(Unqual!T == Unqual!E); // array elem types must be same (maybe add cvtpi2ps)
 enum compatibleVecTypes(E, T) = is(T : E); // scalar must be convertible to target elem type
 enum compatibleVecTypes(E, Types...) = compatibleVecTypes!(E, Types[0 .. $ / 2])
@@ -144,6 +154,7 @@ version (GNU_OR_LDC)
 }
 else
 {
+    // check whether arrayOp is vectorizable
     template vectorizeable(E : E[], Args...)
     {
         static if (is(vec!E))
@@ -183,6 +194,9 @@ bool isBinaryAssignOp(string op)
     return op.length == 2 && op[1] == '=' && isBinaryOp(op[0 .. 1]);
 }
 
+// Generate mixin expression to perform scalar arrayOp loop expression. Assumes
+// `pos` to be the current slice index, `args` to contain operand values, and
+// `res` to be the target slice.
 string scalarExp(Args...)()
 {
     string[] stack;
@@ -218,6 +232,8 @@ string scalarExp(Args...)()
     return stack[0];
 }
 
+// Generate mixin statement to perform vector loop initialization. Assumes
+// `args` to contain operand values.
 string initScalarVecs(Args...)()
 {
     size_t scalarsIdx;
@@ -234,6 +250,9 @@ string initScalarVecs(Args...)()
     return res;
 }
 
+// Generate mixin expression to perform vector arrayOp loop expression. Assumes
+// `pos` to be the current slice index, `args` to contain operand values, and
+// `res` to be the target slice.
 string vectorExp(Args...)()
 {
     size_t scalarsIdx, argsIdx;