dlang · dlang-bot · Aug 9, 2017 · May 27, 2016 · Mar 13, 2017 · Mar 13, 2017
diff --git a/benchmark/arrayops/arrayops.d b/benchmark/arrayops/arrayops.d
@@ -21,13 +21,16 @@ float[6] getLatencies(T, string op)()
             a[] = 24;
             b[] = 4;
             c[] = 2;
+            __gshared T s = 2; // scalar, use __gshared to avoid const-folding
             auto sw = StopWatch(AutoStart.yes);
             foreach (off; size_t(0) .. size_t(64))
             {
                 off = off * len + off;
-                enum op = op.replace("const", "2").replace("a",
-                        "a[off .. off + len]").replace("b",
-                        "b[off .. off + len]").replace("c", "c[off .. off + len]");
+                enum op = op
+                    .replace("scalar", "s")
+                    .replace("a", "a[off .. off + len]")
+                    .replace("b", "b[off .. off + len]")
+                    .replace("c", "c[off .. off + len]");
                 mixin(op ~ ";");
             }
             latency = min(latency, sw.peek.nsecs);
@@ -54,13 +57,16 @@ float[4] getThroughput(T, string op)()
             a[] = 24;
             b[] = 4;
             c[] = 2;
+            __gshared T s = 2; // scalar, use __gshared to avoid const-folding
             auto sw = StopWatch(AutoStart.yes);
             foreach (off; size_t(0) .. size_t(64))
             {
                 off = off * len + off;
-                enum op = op.replace("const", "2").replace("a",
-                        "a[off .. off + len]").replace("b",
-                        "b[off .. off + len]").replace("c", "c[off .. off + len]");
+                enum op = op
+                    .replace("scalar", "s")
+                    .replace("a", "a[off .. off + len]")
+                    .replace("b", "b[off .. off + len]")
+                    .replace("c", "c[off .. off + len]");
                 mixin(op ~ ";");
             }
             immutable nsecs = sw.peek.nsecs;
@@ -78,11 +84,11 @@ string[] genOps()
     foreach (op1; ["+", "-", "*", "/"])
     {
         ops ~= "a " ~ op1 ~ "= b";
-        ops ~= "a " ~ op1 ~ "= const";
+        ops ~= "a " ~ op1 ~ "= scalar";
         foreach (op2; ["+", "-", "*", "/"])
         {
             ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " c";
-            ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " const";
+            ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " scalar";
         }
     }
     return ops;
@@ -180,7 +186,7 @@ void main()
     unmaskFPUExceptions;
 
     writefln("type, op, %(latency%s, %), %-(throughput%s, %)", iota(6)
-        .map!(i => 1 << i), ["8KB", "32KB", "512KB", "32MB"]);
+        .map!(i => 1 << i), ["8KB", "32KB", "512KB", "32768KB"]);
     foreach (op; mixin("AliasSeq!(%(%s, %))".format(genOps)))
         runOp!op;
     maskFPUExceptions;

diff --git a/benchmark/arrayops/plot.R b/benchmark/arrayops/plot.R
@@ -1,33 +1,35 @@
-# Use `R --vanilla < plot.R` to run this script.
-# It will read all *.csv files from the current folder and create a comparison plot for them.
+# Use `Rscript --vanilla plot.R old.csv new.csv` to run this script.
+# It will read old.csv and new.csv files and create a comparison plot for them.
 library(ggplot2)
 library(dplyr)
 library(tidyr)
 
 dat <- NULL
-files <- list.files(pattern='*.csv')
-for (file in files)
-{
-  datFile <- read.csv(file) %>% tbl_df() %>%
-    mutate(file=file)
-  if (is.null(dat))
-     dat = datFile
-  else
-     dat = bind_rows(dat, datFile)
-}
+args <- commandArgs(trailingOnly=T)
+old <- read.csv(args[1]) %>% tbl_df()
+new <- read.csv(args[2]) %>% tbl_df()
 
-latencies <- gather(dat %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency'))
-throughputs <- gather(dat %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput'))
+col.indices <- which(!colnames(new) %in% c("type", "op"))
 
-levels(latencies$num_elems) <- sub("latency(\\d+)", "\\1", levels(latencies$num_elems))
-levels(throughputs$array_size) <- sub("throughput(.+)", "\\1", levels(throughputs$array_size))
+# relative values
+new[,col.indices] <- 100 * new[,col.indices] / old[,col.indices]
 
-img <- qplot(num_elems, latency, group=type, data=latencies, geom="line", color=type) +
-  facet_grid(op ~ file, scales="free_y") +
-  labs(x="num elements", y="latency / ns")
-ggsave('array_ops_latency.svg', plot = img, width = 2 + 3 * length(files), height = 40)
+# arrange type factor levels
+new$type <- factor(new$type, levels = c('byte', 'ubyte', 'short', 'ushort', 'int', 'uint', 'long', 'ulong', 'float', 'double'))
 
-img <- qplot(array_size, throughput, group=type, data=throughputs, geom="line", color=type) +
-  facet_grid(op ~ file, scales="free_y") +
-  labs(x="array size", y="throughput / (ops / ns)")
-ggsave('array_ops_throughput.svg', plot = img, width = 2 + 3 * length(files), height = 40)
+latencies <- gather(new %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency')) %>%
+    mutate(num_elems = factor(as.integer(sub("latency(\\d+)", "\\1", num_elems))))
+throughputs <- gather(new %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput')) %>%
+    mutate(array_size = factor(as.integer(sub("throughput(\\d+)KB", "\\1", array_size))))
+
+img <- ggplot(latencies, aes(x=num_elems, y=latency, fill=type)) +
+  geom_bar(position="dodge", stat="identity") +
+  facet_grid(op ~ ., scales="free_y") +
+  labs(x="num elements", y="relative latency / %")
+ggsave('array_ops_latency.png', plot = img, width = 8, height = 40)
+
+img <- ggplot(throughputs, aes(x=array_size, y=throughput, fill=type)) +
+  geom_bar(position="dodge", stat="identity") +
+  facet_grid(op ~ ., scales="free_y") +
+  labs(x="array size / KB", y="relative throughput / %")
+ggsave('array_ops_throughput.png', plot = img, width = 8, height = 40)
diff --git a/changelog/vectorized_array_ops.dd b/changelog/vectorized_array_ops.dd
@@ -0,0 +1,10 @@
+Vectorized array operations are now templated
+
+Array operations have been converted from dedicated assembly routines for $(B some) array operations to a generic template implementation for $(B all) array operations. This provides huge performance increases (2-4x higher throughput) for array operations that were not previously vectorized.
+Furthermore, the implementation makes better use of vectorization even for short arrays to heavily reduce latency for some operations (up to 4x).
+
+For GDC/LDC the implementation relies on auto-vectorization; for DMD the implementation performs the vectorization itself. Support for vector operations with DMD is determined statically (`-mcpu=native`, `-mcpu=avx2`) to avoid binary bloat and the small test overhead. DMD enables SSE2 for 64-bit targets by default.
+
+Also see $(DRUNTIMEPR 1891)
+
+$(RED Note:) The implementation no longer weakens floating-point divisions (e.g. `ary[] / scalar`) to multiplication (`ary[] * (1.0 / scalar)`) as that may reduce precision. To preserve the higher performance of float multiplication when loss of precision is acceptable, use either `-ffast-math` with GDC/LDC or manually rewrite your code to multiply by `(1.0 / scalar)` for DMD.
diff --git a/mak/COPY b/mak/COPY
@@ -17,6 +17,7 @@ COPY=\
 	$(IMPDIR)\core\vararg.d \
 	\
 	$(IMPDIR)\core\internal\abort.d \
+	$(IMPDIR)\core\internal\arrayop.d \
 	$(IMPDIR)\core\internal\convert.d \
 	$(IMPDIR)\core\internal\hash.d \
 	$(IMPDIR)\core\internal\spinlock.d \

diff --git a/mak/SRCS b/mak/SRCS
@@ -17,6 +17,7 @@ SRCS=\
 	src\core\vararg.d \
 	\
 	src\core\internal\abort.d \
+	src\core\internal\arrayop.d \
 	src\core\internal\convert.d \
 	src\core\internal\hash.d \
 	src\core\internal\spinlock.d \