From 184435f243b830cf464047deb1b636a8d5b4ed4a Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Fri, 27 May 2016 20:12:05 +0200 Subject: [PATCH 01/12] implement templated array ops - use RPN to encode operand precedence - fixes Issue 15619, and 16680 --- mak/COPY | 1 + mak/SRCS | 1 + src/core/internal/arrayop.d | 428 ++++++++++++++++++++++++++++++++++++ src/core/internal/traits.d | 23 ++ src/object.d | 7 + win32.mak | 3 + win64.mak | 3 + 7 files changed, 466 insertions(+) create mode 100644 src/core/internal/arrayop.d diff --git a/mak/COPY b/mak/COPY index 0a7f4317bd..d7b6a4c4e9 100644 --- a/mak/COPY +++ b/mak/COPY @@ -17,6 +17,7 @@ COPY=\ $(IMPDIR)\core\vararg.d \ \ $(IMPDIR)\core\internal\abort.d \ + $(IMPDIR)\core\internal\arrayop.d \ $(IMPDIR)\core\internal\convert.d \ $(IMPDIR)\core\internal\hash.d \ $(IMPDIR)\core\internal\spinlock.d \ diff --git a/mak/SRCS b/mak/SRCS index 2258203aa3..52330cf772 100644 --- a/mak/SRCS +++ b/mak/SRCS @@ -17,6 +17,7 @@ SRCS=\ src\core\vararg.d \ \ src\core\internal\abort.d \ + src\core\internal\arrayop.d \ src\core\internal\convert.d \ src\core\internal\hash.d \ src\core\internal\spinlock.d \ diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d new file mode 100644 index 0000000000..b11ffe6c4f --- /dev/null +++ b/src/core/internal/arrayop.d @@ -0,0 +1,428 @@ +module core.internal.arrayop; +import core.internal.traits : Filter, Unqual; + +version (GNU) version = GNU_OR_LDC; +version (LDC) version = GNU_OR_LDC; + +/** + * Perform array (vector) operations and store the result in `res`. + * Operand types and operations are passed as template arguments in Reverse + * Polish Notation (RPN). + * All slice operands must have the same length as the result slice. 
+ * + * Params: res = the slice in which to store the results + * args = all other operands + * Args = operand types and operations in RPN + * T[] = type of result slice + * Returns: the slice containing the result + */ +T[] arrayOp(T : T[], Args...)(T[] res, Filter!(isType, Args) args) @trusted @nogc pure nothrow +{ + size_t pos; + static if (vectorizeable!(T[], Args)) + { + alias vec = .vec!T; + alias load = .load!(T, vec.length); + alias store = .store!(T, vec.length); + alias scalarToVec = .scalarToVec!(T, vec.length); + + auto n = res.length / vec.length; + enum nScalarInits = scalarIndices!Args.length; + if (n > 2 * (1 + nScalarInits)) // empirically found cost estimate + { + mixin(initScalarVecs!Args); + + do + { + mixin(vectorExp!Args ~ ";"); + pos += vec.length; + } + while (--n); + } + } + for (; pos < res.length; ++pos) + mixin(scalarExp!Args ~ ";"); + + return res; +} + +private: + +// SIMD helpers + +version (GNU) + import gcc.builtins; +else version (LDC) +{ + import ldc.simd; + import ldc.gccbuiltins_x86; +} +else version (DigitalMars) + import core.simd; +else + static assert(0, "unimplemented"); + +template vec(T) +{ + enum regsz = 16; // SSE2 + enum N = regsz / T.sizeof; + alias vec = __vector(T[N]); +} + +void store(T, size_t N)(T* p, in __vector(T[N]) val) +{ + pragma(inline, true); + alias vec = __vector(T[N]); + + version (LDC) + { + storeUnaligned!vec(val, p); + } + else version (GNU) + { + static if (is(T == float)) + __builtin_ia32_storeups(p, val); + else static if (is(T == double)) + __builtin_ia32_storeupd(p, val); + else + __builtin_ia32_storedqu(cast(char*) p, val); + } + else version (DigitalMars) + { + static if (is(T == float)) + cast(void) __simd_sto(XMM.STOUPS, *cast(vec*) p, val); + else static if (is(T == double)) + cast(void) __simd_sto(XMM.STOUPD, *cast(vec*) p, val); + else + cast(void) __simd_sto(XMM.STODQU, *cast(vec*) p, val); + } +} + +const(__vector(T[N])) load(T, size_t N)(in T* p) +{ + pragma(inline, true); + alias vec 
= __vector(T[N]); + + version (LDC) + { + return loadUnaligned!vec(cast(T*) p); + } + else version (GNU) + { + static if (is(T == float)) + return __builtin_ia32_loadups(p); + else static if (is(T == double)) + return __builtin_ia32_loadupd(p); + else + return __builtin_ia32_loaddqu(cast(const char*) p); + } + else version (DigitalMars) + { + static if (is(T == float)) + return __simd(XMM.LODUPS, *cast(const vec*) p); + else static if (is(T == double)) + return __simd(XMM.LODUPD, *cast(const vec*) p); + else + return __simd(XMM.LODDQU, *cast(const vec*) p); + } +} + +const(__vector(T[N])) scalarToVec(T, size_t N)(in T a) +{ + pragma(inline, true); + alias vec = __vector(T[N]); + + vec res = void; + version (DigitalMars) // Bugzilla 7509 + res.array = [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a][0 .. N]; + else + res = a; + return res; +} + +__vector(T[N]) binop(string op, T, size_t N)(in __vector(T[N]) a, in __vector(T[N]) b) +{ + pragma(inline, true); + return mixin("a " ~ op ~ " b"); +} + +__vector(T[N]) unaop(string op, T, size_t N)(in __vector(T[N]) a) if (op[0] == 'u') +{ + pragma(inline, true); + return mixin(op[1 .. $] ~ "a"); +} + +// mixin gen + +// filter out ops without matching SSE/SIMD instructions (could be composed of several instructions though) +bool vectorizeableOps(E)(string[] ops) +{ + // dfmt off + return !( + ops.contains("/", "/=") && __traits(isIntegral, E) || + ops.contains("*", "*=") && __traits(isIntegral, E) && E.sizeof != 2 || + ops.contains("%", "%=") + ); + // dfmt on +} + +// filter out things like float[] = float[] / size_t[] +enum compatibleVecTypes(E, T : T[]) = is(Unqual!T == Unqual!E); // array elem types must be same (maybe add cvtpi2ps) +enum compatibleVecTypes(E, T) = is(T : E); // scalar must be convertible to target elem type +enum compatibleVecTypes(E, Types...) = compatibleVecTypes!(E, Types[0 .. $ / 2]) + && compatibleVecTypes!(E, Types[$ / 2 .. $]); + +template vectorizeable(E : E[], Args...) 
+{ + static if (is(vec!E)) + enum vectorizeable = vectorizeableOps!E([Filter!(not!isType, Args)]) + && compatibleVecTypes!(E, Filter!(isType, Args)); + else + enum vectorizeable = false; +} + +version (X86_64) unittest +{ + static assert(vectorizeable!(double[], const(double)[], double[], "+", "=")); + static assert(!vectorizeable!(double[], const(ulong)[], double[], "+", "=")); +} + +bool isUnaryOp(string op) +{ + return op[0] == 'u'; +} + +bool isBinaryOp(string op) +{ + if (op.length != 1) + return false; + switch (op[0]) + { + case '+', '-', '*', '/', '%', '|', '&', '^': + return true; + default: + return false; + } +} + +bool isBinaryAssignOp(string op) +{ + return op.length == 2 && op[1] == '=' && isBinaryOp(op[0 .. 1]); +} + +string scalarExp(Args...)() +{ + string[] stack; + size_t argsIdx; + foreach (i, arg; Args) + { + static if (is(arg == T[], T)) + stack ~= "args[" ~ argsIdx++.toString ~ "][pos]"; + else static if (is(arg)) + stack ~= "args[" ~ argsIdx++.toString ~ "]"; + else static if (isUnaryOp(arg)) + { + auto op = arg[0] == 'u' ? arg[1 .. 
$] : arg; + stack[$ - 1] = op ~ stack[$ - 1]; + } + else static if (arg == "=") + { + stack[$ - 1] = "res[pos] = cast(T)(" ~ stack[$ - 1] ~ ")"; + } + else static if (isBinaryAssignOp(arg)) + { + stack[$ - 1] = "res[pos] " ~ arg ~ " cast(T)(" ~ stack[$ - 1] ~ ")"; + } + else static if (isBinaryOp(arg)) + { + stack[$ - 2] = "(cast(T)(" ~ stack[$ - 2] ~ " " ~ arg ~ " " ~ stack[$ - 1] ~ "))"; + stack.length -= 1; + } + else + assert(0, "Unexpected op " ~ arg); + } + assert(stack.length == 1); + return stack[0]; +} + +size_t[] scalarIndices(Args...)() +{ + size_t[] scalars; + foreach (i, arg; Args) + { + if (is(arg == T[], T)) + { + } + else if (is(arg)) + scalars ~= i; + } + return scalars; +} + +string initScalarVecs(Args...)() +{ + auto scalars = scalarIndices!Args; + string res; + foreach (i, aidx; scalars) + res ~= "immutable vec scalar" ~ i.toString ~ " = scalarToVec(args[" ~ aidx + .toString ~ "]);\n"; + return res; +} + +string vectorExp(Args...)() +{ + size_t scalarsIdx, argsIdx; + string[] stack; + foreach (i, arg; Args) + { + static if (is(arg == T[], T)) + stack ~= "load(&args[" ~ argsIdx++.toString ~ "][pos])"; + else static if (is(arg)) + { + ++argsIdx; + stack ~= "scalar" ~ scalarsIdx++.toString; + } + else static if (isUnaryOp(arg)) + { + auto op = arg[0] == 'u' ? arg[1 .. $] : arg; + stack[$ - 1] = "unaop!\"" ~ arg ~ "\"(" ~ stack[$ - 1] ~ ")"; + } + else static if (arg == "=") + { + stack[$ - 1] = "store(&res[pos], " ~ stack[$ - 1] ~ ")"; + } + else static if (isBinaryAssignOp(arg)) + { + stack[$ - 1] = "store(&res[pos], binop!\"" ~ arg[0 .. 
$ - 1] + ~ "\"(load(&res[pos]), " ~ stack[$ - 1] ~ "))"; + } + else static if (isBinaryOp(arg)) + { + stack[$ - 2] = "binop!\"" ~ arg ~ "\"(" ~ stack[$ - 2] ~ ", " ~ stack[$ - 1] ~ ")"; + stack.length -= 1; + } + else + assert(0, "Unexpected op " ~ arg); + } + assert(stack.length == 1); + return stack[0]; +} + +// other helpers + +enum isType(T) = true; +enum isType(alias a) = false; +template not(alias tmlp) +{ + enum not(Args...) = !tmlp!Args; +} + +string toString(size_t num) +{ + import core.internal.string : unsignedToTempString; + + char[20] buf = void; + return unsignedToTempString(num, buf).idup; +} + +bool contains(T)(in T[] ary, in T[] vals...) +{ + foreach (v1; ary) + foreach (v2; vals) + if (v1 == v2) + return true; + return false; +} + +// tests + +version (unittest) template TT(T...) +{ + alias TT = T; +} + +version (unittest) template _arrayOp(Args...) +{ + alias _arrayOp = arrayOp!Args; +} + +unittest +{ + static void check(string op, TA, TB, T, size_t N)(TA a, TB b, in ref T[N] exp) + { + T[N] res; + _arrayOp!(T[], TA, TB, op, "=")(res[], a, b); + foreach (i; 0 .. N) + assert(res[i] == exp[i]); + } + + static void check2(string unaOp, string binOp, TA, TB, T, size_t N)(TA a, TB b, in ref T[N] exp) + { + T[N] res; + _arrayOp!(T[], TA, TB, unaOp, binOp, "=")(res[], a, b); + foreach (i; 0 .. 
N) + assert(res[i] == exp[i]); + } + + static void test(T, string op, size_t N = 16)(T a, T b, T exp) + { + T[N] va = a, vb = b, vexp = exp; + + check!op(va[], vb[], vexp); + check!op(va[], b, vexp); + check!op(a, vb[], vexp); + } + + static void test2(T, string unaOp, string binOp, size_t N = 16)(T a, T b, T exp) + { + T[N] va = a, vb = b, vexp = exp; + + check2!(unaOp, binOp)(va[], vb[], vexp); + check2!(unaOp, binOp)(va[], b, vexp); + check2!(unaOp, binOp)(a, vb[], vexp); + } + + alias UINTS = TT!(ubyte, ushort, uint, ulong); + alias INTS = TT!(byte, short, int, long); + alias FLOATS = TT!(float, double); + + foreach (T; TT!(UINTS, INTS, FLOATS)) + { + test!(T, "+")(1, 2, 3); + test!(T, "-")(3, 2, 1); + + test2!(T, "u-", "+")(3, 2, 1); + } + + foreach (T; TT!(UINTS, INTS)) + { + test!(T, "|")(1, 2, 3); + test!(T, "&")(3, 1, 1); + test!(T, "^")(3, 1, 2); + + test2!(T, "u~", "+")(3, cast(T)~2, 5); + } + + foreach (T; TT!(INTS, FLOATS)) + { + test!(T, "-")(1, 2, -1); + test2!(T, "u-", "+")(-3, -2, -1); + test2!(T, "u-", "*")(-3, -2, -6); + } + + foreach (T; TT!(UINTS, INTS, FLOATS)) + { + test!(T, "*")(2, 3, 6); + test!(T, "/")(8, 4, 2); + test!(T, "%")(8, 6, 2); + } +} + +// test rewrite of v op= exp to v = v op exp +unittest +{ + byte[32] c; + arrayOp!(byte[], byte, "+=")(c[], cast(byte) 6); + foreach (v; c) + assert(v == 6); +} diff --git a/src/core/internal/traits.d b/src/core/internal/traits.d index 8142f90c51..e15dff5204 100644 --- a/src/core/internal/traits.d +++ b/src/core/internal/traits.d @@ -187,3 +187,26 @@ template hasElaborateCopyConstructor(T...) else enum bool hasElaborateCopyConstructor = false; } + +// std.meta.Filter +template Filter(alias pred, TList...) +{ + static if (TList.length == 0) + { + alias Filter = TypeTuple!(); + } + else static if (TList.length == 1) + { + static if (pred!(TList[0])) + alias Filter = TypeTuple!(TList[0]); + else + alias Filter = TypeTuple!(); + } + else + { + alias Filter = + TypeTuple!( + Filter!(pred, TList[ 0 .. 
$/2]), + Filter!(pred, TList[$/2 .. $ ])); + } +} diff --git a/src/object.d b/src/object.d index 5506ade80e..97dd6779b6 100644 --- a/src/object.d +++ b/src/object.d @@ -3634,6 +3634,13 @@ if (!__traits(isScalar, T1)) assert(__cmp([c2, c2], [c1, c1]) > 0); } +// Compiler hook into the runtime implementation of array (vector) operations. +template _arrayOp(Args...) +{ + import core.internal.arrayop; + alias _arrayOp = arrayOp!Args; +} + // Helper functions private inout(TypeInfo) getElement(inout TypeInfo value) @trusted pure nothrow diff --git a/win32.mak b/win32.mak index 91262264c0..3494159935 100644 --- a/win32.mak +++ b/win32.mak @@ -266,6 +266,9 @@ $(IMPDIR)\core\vararg.d : src\core\vararg.d $(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d copy $** $@ +$(IMPDIR)\core\internal\arrayop.d : src\core\internal\arrayop.d + copy $** $@ + $(IMPDIR)\core\internal\convert.d : src\core\internal\convert.d copy $** $@ diff --git a/win64.mak b/win64.mak index d9e10522b5..97b1aa7181 100644 --- a/win64.mak +++ b/win64.mak @@ -277,6 +277,9 @@ $(IMPDIR)\core\vararg.d : src\core\vararg.d $(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d copy $** $@ +$(IMPDIR)\core\internal\arrayop.d : src\core\internal\arrayop.d + copy $** $@ + $(IMPDIR)\core\internal\convert.d : src\core\internal\convert.d copy $** $@ From 84d49d18b25f0ec52d03a9d5d9ec4beaace3ab08 Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Mon, 13 Mar 2017 16:01:29 +0100 Subject: [PATCH 02/12] fix plotting of arrayops benchmark - properly sort/order values on abscissa --- benchmark/arrayops/arrayops.d | 2 +- benchmark/arrayops/plot.R | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/benchmark/arrayops/arrayops.d b/benchmark/arrayops/arrayops.d index 60490b02d7..bce3ea14b1 100644 --- a/benchmark/arrayops/arrayops.d +++ b/benchmark/arrayops/arrayops.d @@ -180,7 +180,7 @@ void main() unmaskFPUExceptions; writefln("type, op, %(latency%s, %), %-(throughput%s, %)", iota(6) - 
.map!(i => 1 << i), ["8KB", "32KB", "512KB", "32MB"]); + .map!(i => 1 << i), ["8KB", "32KB", "512KB", "32768KB"]); foreach (op; mixin("AliasSeq!(%(%s, %))".format(genOps))) runOp!op; maskFPUExceptions; diff --git a/benchmark/arrayops/plot.R b/benchmark/arrayops/plot.R index 6574f03ea8..a1ba783810 100644 --- a/benchmark/arrayops/plot.R +++ b/benchmark/arrayops/plot.R @@ -16,18 +16,17 @@ for (file in files) dat = bind_rows(dat, datFile) } -latencies <- gather(dat %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency')) -throughputs <- gather(dat %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput')) - -levels(latencies$num_elems) <- sub("latency(\\d+)", "\\1", levels(latencies$num_elems)) -levels(throughputs$array_size) <- sub("throughput(.+)", "\\1", levels(throughputs$array_size)) +latencies <- gather(dat %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency')) %>% + mutate(num_elems = factor(as.integer(sub("latency(\\d+)", "\\1", num_elems)))) +throughputs <- gather(dat %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput')) %>% + mutate(array_size = factor(as.integer(sub("throughput(\\d+)KB", "\\1", array_size)))) img <- qplot(num_elems, latency, group=type, data=latencies, geom="line", color=type) + facet_grid(op ~ file, scales="free_y") + labs(x="num elements", y="latency / ns") -ggsave('array_ops_latency.svg', plot = img, width = 2 + 3 * length(files), height = 40) +ggsave('array_ops_latency.png', plot = img, width = 2 + 3 * length(files), height = 40) img <- qplot(array_size, throughput, group=type, data=throughputs, geom="line", color=type) + facet_grid(op ~ file, scales="free_y") + - labs(x="array size", y="throughput / (ops / ns)") -ggsave('array_ops_throughput.svg', plot = img, width = 2 + 3 * length(files), height = 40) + labs(x="array size / KB", y="throughput / (ops / ns)") +ggsave('array_ops_throughput.png', plot = img, width = 2 + 3 
* length(files), height = 40) From f8dd223c8eafb8dc30015f3ab3c2e476694d830d Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Mon, 13 Mar 2017 14:47:41 +0100 Subject: [PATCH 03/12] change plot to relative numbers --- benchmark/arrayops/plot.R | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/benchmark/arrayops/plot.R b/benchmark/arrayops/plot.R index a1ba783810..5b770dcb06 100644 --- a/benchmark/arrayops/plot.R +++ b/benchmark/arrayops/plot.R @@ -1,32 +1,30 @@ -# Use `R --vanilla < plot.R` to run this script. -# It will read all *.csv files from the current folder and create a comparison plot for them. +# Use `Rscript --vanilla plot.R old.csv new.csv` to run this script. +# It will read old.csv and new.csv files and create a comparison plot for them. library(ggplot2) library(dplyr) library(tidyr) dat <- NULL -files <- list.files(pattern='*.csv') -for (file in files) -{ - datFile <- read.csv(file) %>% tbl_df() %>% - mutate(file=file) - if (is.null(dat)) - dat = datFile - else - dat = bind_rows(dat, datFile) -} +args <- commandArgs(trailingOnly=T) +old <- read.csv(args[1]) %>% tbl_df() +new <- read.csv(args[2]) %>% tbl_df() -latencies <- gather(dat %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency')) %>% +col.indices <- which(!colnames(new) %in% c("type", "op")) + +# relative values +new[,col.indices] <- 100 * new[,col.indices] / old[,col.indices] + +latencies <- gather(new %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency')) %>% mutate(num_elems = factor(as.integer(sub("latency(\\d+)", "\\1", num_elems)))) -throughputs <- gather(dat %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput')) %>% +throughputs <- gather(new %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput')) %>% mutate(array_size = factor(as.integer(sub("throughput(\\d+)KB", "\\1", array_size)))) img <- qplot(num_elems, 
latency, group=type, data=latencies, geom="line", color=type) + - facet_grid(op ~ file, scales="free_y") + - labs(x="num elements", y="latency / ns") -ggsave('array_ops_latency.png', plot = img, width = 2 + 3 * length(files), height = 40) + facet_grid(op ~ ., scales="free_y") + + labs(x="num elements", y="relative latency / %") +ggsave('array_ops_latency.png', plot = img, width = 8, height = 40) img <- qplot(array_size, throughput, group=type, data=throughputs, geom="line", color=type) + - facet_grid(op ~ file, scales="free_y") + - labs(x="array size / KB", y="throughput / (ops / ns)") -ggsave('array_ops_throughput.png', plot = img, width = 2 + 3 * length(files), height = 40) + facet_grid(op ~ ., scales="free_y") + + labs(x="array size / KB", y="relative throughput / %") +ggsave('array_ops_throughput.png', plot = img, width = 8, height = 40) From 973f3a29a66e15e27e196604b2594bf3c0d4bf44 Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Tue, 25 Jul 2017 00:15:49 +0200 Subject: [PATCH 04/12] switch to easier to read bar plot --- benchmark/arrayops/plot.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmark/arrayops/plot.R b/benchmark/arrayops/plot.R index 5b770dcb06..5bb4a99e2f 100644 --- a/benchmark/arrayops/plot.R +++ b/benchmark/arrayops/plot.R @@ -14,17 +14,22 @@ col.indices <- which(!colnames(new) %in% c("type", "op")) # relative values new[,col.indices] <- 100 * new[,col.indices] / old[,col.indices] +# arrange type factor levels +new$type <- factor(new$type, levels = c('byte', 'ubyte', 'short', 'ushort', 'int', 'uint', 'long', 'ulong', 'float', 'double')) + latencies <- gather(new %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency')) %>% mutate(num_elems = factor(as.integer(sub("latency(\\d+)", "\\1", num_elems)))) throughputs <- gather(new %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput')) %>% mutate(array_size = factor(as.integer(sub("throughput(\\d+)KB", "\\1", 
array_size)))) -img <- qplot(num_elems, latency, group=type, data=latencies, geom="line", color=type) + +img <- ggplot(latencies, aes(x=num_elems, y=latency, fill=type)) + + geom_bar(position="dodge", stat="identity") + facet_grid(op ~ ., scales="free_y") + labs(x="num elements", y="relative latency / %") ggsave('array_ops_latency.png', plot = img, width = 8, height = 40) -img <- qplot(array_size, throughput, group=type, data=throughputs, geom="line", color=type) + +img <- ggplot(throughputs, aes(x=array_size, y=throughput, fill=type)) + + geom_bar(position="dodge", stat="identity") + facet_grid(op ~ ., scales="free_y") + labs(x="array size / KB", y="relative throughput / %") ggsave('array_ops_throughput.png', plot = img, width = 8, height = 40) From 549bc8be02343646260b761fa8e644ea9f1cc3ca Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Sun, 9 Apr 2017 10:40:15 +0200 Subject: [PATCH 05/12] vectorizable ops by introspection - support for targets specific vector ops (e.g. AVX vs. SSE2) --- src/core/internal/arrayop.d | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d index b11ffe6c4f..fe8a1d1a25 100644 --- a/src/core/internal/arrayop.d +++ b/src/core/internal/arrayop.d @@ -155,15 +155,18 @@ __vector(T[N]) unaop(string op, T, size_t N)(in __vector(T[N]) a) if (op[0] == ' // mixin gen // filter out ops without matching SSE/SIMD instructions (could be composed of several instructions though) -bool vectorizeableOps(E)(string[] ops) +template vectorizeableOps(E, ops...) if (ops.length > 1) { - // dfmt off - return !( - ops.contains("/", "/=") && __traits(isIntegral, E) || - ops.contains("*", "*=") && __traits(isIntegral, E) && E.sizeof != 2 || - ops.contains("%", "%=") - ); - // dfmt on + enum vectorizeableOps = vectorizeableOps!(E, ops[0 .. $ / 2]) + && vectorizeableOps!(E, ops[$ / 2 .. 
$]); +} + +template vectorizeableOps(E, string op) +{ + static if (isUnaryOp(op)) + enum vectorizeableOps = is(typeof((vec!E a) => mixin(op[1 .. $] ~ " a"))); + else + enum vectorizeableOps = is(typeof((vec!E a, vec!E b) => mixin("a " ~ op ~ " b"))); } // filter out things like float[] = float[] / size_t[] From 60d0eefbcddb497b9c5513f31062316da0c2dee2 Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Mon, 10 Apr 2017 00:48:21 +0200 Subject: [PATCH 06/12] proper error message for unsupported scalar ops - with UDTs --- src/core/internal/arrayop.d | 53 ++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d index fe8a1d1a25..42cc212c81 100644 --- a/src/core/internal/arrayop.d +++ b/src/core/internal/arrayop.d @@ -18,6 +18,8 @@ version (LDC) version = GNU_OR_LDC; */ T[] arrayOp(T : T[], Args...)(T[] res, Filter!(isType, Args) args) @trusted @nogc pure nothrow { + enum check = opsSupported!(true, T, Filter!(not!isType, Args)); // must support all scalar ops + size_t pos; static if (vectorizeable!(T[], Args)) { @@ -155,18 +157,26 @@ __vector(T[N]) unaop(string op, T, size_t N)(in __vector(T[N]) a) if (op[0] == ' // mixin gen // filter out ops without matching SSE/SIMD instructions (could be composed of several instructions though) -template vectorizeableOps(E, ops...) if (ops.length > 1) +template opsSupported(bool fail, T, ops...) if (ops.length > 1) { - enum vectorizeableOps = vectorizeableOps!(E, ops[0 .. $ / 2]) - && vectorizeableOps!(E, ops[$ / 2 .. $]); + enum opsSupported = opsSupported!(fail, T, ops[0 .. $ / 2]) + && opsSupported!(fail, T, ops[$ / 2 .. $]); } -template vectorizeableOps(E, string op) +template opsSupported(bool fail, T, string op) { static if (isUnaryOp(op)) - enum vectorizeableOps = is(typeof((vec!E a) => mixin(op[1 .. $] ~ " a"))); + { + enum opsSupported = is(typeof((T a) => mixin(op[1 .. 
$] ~ " a"))); + static assert(!fail || opsSupported, + "Unary op `" ~ op[1 .. $] ~ "` not supported for element type " ~ T.stringof ~ "."); + } else - enum vectorizeableOps = is(typeof((vec!E a, vec!E b) => mixin("a " ~ op ~ " b"))); + { + enum opsSupported = is(typeof((T a, T b) => mixin("a " ~ op ~ " b"))); + static assert(!fail || opsSupported, + "Binary op `" ~ op ~ "` not supported for element type " ~ T.stringof ~ "."); + } } // filter out things like float[] = float[] / size_t[] @@ -178,7 +188,7 @@ enum compatibleVecTypes(E, Types...) = compatibleVecTypes!(E, Types[0 .. $ / 2]) template vectorizeable(E : E[], Args...) { static if (is(vec!E)) - enum vectorizeable = vectorizeableOps!E([Filter!(not!isType, Args)]) + enum vectorizeable = opsSupported!(false, vec!E, Filter!(not!isType, Args)) && compatibleVecTypes!(E, Filter!(isType, Args)); else enum vectorizeable = false; @@ -429,3 +439,32 @@ unittest foreach (v; c) assert(v == 6); } + +// proper error message for UDT lacking certain ops +unittest +{ + static assert(!is(typeof(&arrayOp!(int[4][], int[4], "+=")))); + static assert(!is(typeof(&arrayOp!(int[4][], int[4], "u-", "=")))); + + static struct S + { + } + + static assert(!is(typeof(&arrayOp!(S[], S, "+=")))); + static assert(!is(typeof(&arrayOp!(S[], S[], "*", S, "+=")))); + static struct S2 + { + S2 opBinary(string op)(in S2) @nogc pure nothrow + { + return this; + } + + ref S2 opOpAssign(string op)(in S2) @nogc pure nothrow + { + return this; + } + } + + static assert(is(typeof(&arrayOp!(S2[], S2[], S2[], S2, "*", "+", "=")))); + static assert(is(typeof(&arrayOp!(S2[], S2[], S2, "*", "+=")))); +} From 4eaf5004d2eddbc8df860d43d521fed25c4c0ae0 Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Tue, 11 Apr 2017 12:36:03 +0200 Subject: [PATCH 07/12] remove Issue 7509/16488 workaround - dmd got broadcast init with #6248 --- src/core/internal/arrayop.d | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git 
a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d index 42cc212c81..5eed3c61fa 100644 --- a/src/core/internal/arrayop.d +++ b/src/core/internal/arrayop.d @@ -26,7 +26,6 @@ T[] arrayOp(T : T[], Args...)(T[] res, Filter!(isType, Args) args) @trusted @nog alias vec = .vec!T; alias load = .load!(T, vec.length); alias store = .store!(T, vec.length); - alias scalarToVec = .scalarToVec!(T, vec.length); auto n = res.length / vec.length; enum nScalarInits = scalarIndices!Args.length; @@ -129,19 +128,6 @@ const(__vector(T[N])) load(T, size_t N)(in T* p) } } -const(__vector(T[N])) scalarToVec(T, size_t N)(in T a) -{ - pragma(inline, true); - alias vec = __vector(T[N]); - - vec res = void; - version (DigitalMars) // Bugzilla 7509 - res.array = [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a][0 .. N]; - else - res = a; - return res; -} - __vector(T[N]) binop(string op, T, size_t N)(in __vector(T[N]) a, in __vector(T[N]) b) { pragma(inline, true); @@ -277,8 +263,7 @@ string initScalarVecs(Args...)() auto scalars = scalarIndices!Args; string res; foreach (i, aidx; scalars) - res ~= "immutable vec scalar" ~ i.toString ~ " = scalarToVec(args[" ~ aidx - .toString ~ "]);\n"; + res ~= "immutable vec scalar" ~ i.toString ~ " = args[" ~ aidx.toString ~ "];\n"; return res; } From 9c5d83b3a7b5d2f2164d78715350dcb00d4a0b8e Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Fri, 9 Jun 2017 18:21:05 +0200 Subject: [PATCH 08/12] always use vec ops --- src/core/internal/arrayop.d | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d index 5eed3c61fa..cd4795ac56 100644 --- a/src/core/internal/arrayop.d +++ b/src/core/internal/arrayop.d @@ -27,12 +27,14 @@ T[] arrayOp(T : T[], Args...)(T[] res, Filter!(isType, Args) args) @trusted @nog alias load = .load!(T, vec.length); alias store = .store!(T, vec.length); - auto n = res.length / vec.length; - enum nScalarInits = 
scalarIndices!Args.length; - if (n > 2 * (1 + nScalarInits)) // empirically found cost estimate + // Given that there are at most as many scalars broadcast as there are + // operations in any `ary[] = ary[] op const op const`, it should always be + // worthwhile to choose vector operations. + if (res.length >= vec.length) { mixin(initScalarVecs!Args); + auto n = res.length / vec.length; do { mixin(vectorExp!Args ~ ";"); @@ -244,26 +246,19 @@ string scalarExp(Args...)() return stack[0]; } -size_t[] scalarIndices(Args...)() +string initScalarVecs(Args...)() { - size_t[] scalars; - foreach (i, arg; Args) + size_t scalarsIdx; + string res; + foreach (aidx, arg; Args) { - if (is(arg == T[], T)) + static if (is(arg == T[], T)) { } - else if (is(arg)) - scalars ~= i; + else static if (is(arg)) + res ~= "immutable vec scalar" ~ scalarsIdx++.toString ~ " = args[" + ~ aidx.toString ~ "];\n"; } - return scalars; -} - -string initScalarVecs(Args...)() -{ - auto scalars = scalarIndices!Args; - string res; - foreach (i, aidx; scalars) - res ~= "immutable vec scalar" ~ i.toString ~ " = args[" ~ aidx.toString ~ "];\n"; return res; } From 9d7faf97cf4830b2d111aaff493814e30b70019b Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Tue, 25 Jul 2017 00:50:26 +0200 Subject: [PATCH 09/12] rely on auto-vectorizer for gdc/ldc - seems to have made quite some improvements while that module was written - generated code for scalar loops and for vector loops ends up being almost identical, so it seems more reasonable to leave decisions completely to the auto-vectorizers. 
--- src/core/internal/arrayop.d | 118 ++++++++++++++---------------------- 1 file changed, 45 insertions(+), 73 deletions(-) diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d index cd4795ac56..724c5b8e11 100644 --- a/src/core/internal/arrayop.d +++ b/src/core/internal/arrayop.d @@ -53,45 +53,22 @@ private: // SIMD helpers -version (GNU) - import gcc.builtins; -else version (LDC) +version (DigitalMars) { - import ldc.simd; - import ldc.gccbuiltins_x86; -} -else version (DigitalMars) import core.simd; -else - static assert(0, "unimplemented"); - -template vec(T) -{ - enum regsz = 16; // SSE2 - enum N = regsz / T.sizeof; - alias vec = __vector(T[N]); -} -void store(T, size_t N)(T* p, in __vector(T[N]) val) -{ - pragma(inline, true); - alias vec = __vector(T[N]); - - version (LDC) - { - storeUnaligned!vec(val, p); - } - else version (GNU) + template vec(T) { - static if (is(T == float)) - __builtin_ia32_storeups(p, val); - else static if (is(T == double)) - __builtin_ia32_storeupd(p, val); - else - __builtin_ia32_storedqu(cast(char*) p, val); + enum regsz = 16; // SSE2 + enum N = regsz / T.sizeof; + alias vec = __vector(T[N]); } - else version (DigitalMars) + + void store(T, size_t N)(T* p, in __vector(T[N]) val) { + pragma(inline, true); + alias vec = __vector(T[N]); + static if (is(T == float)) cast(void) __simd_sto(XMM.STOUPS, *cast(vec*) p, val); else static if (is(T == double)) @@ -99,28 +76,14 @@ void store(T, size_t N)(T* p, in __vector(T[N]) val) else cast(void) __simd_sto(XMM.STODQU, *cast(vec*) p, val); } -} - -const(__vector(T[N])) load(T, size_t N)(in T* p) -{ - pragma(inline, true); - alias vec = __vector(T[N]); - version (LDC) - { - return loadUnaligned!vec(cast(T*) p); - } - else version (GNU) - { - static if (is(T == float)) - return __builtin_ia32_loadups(p); - else static if (is(T == double)) - return __builtin_ia32_loadupd(p); - else - return __builtin_ia32_loaddqu(cast(const char*) p); - } - else version (DigitalMars) + 
const(__vector(T[N])) load(T, size_t N)(in T* p) { + import core.simd; + + pragma(inline, true); + alias vec = __vector(T[N]); + static if (is(T == float)) return __simd(XMM.LODUPS, *cast(const vec*) p); else static if (is(T == double)) @@ -128,18 +91,19 @@ const(__vector(T[N])) load(T, size_t N)(in T* p) else return __simd(XMM.LODDQU, *cast(const vec*) p); } -} -__vector(T[N]) binop(string op, T, size_t N)(in __vector(T[N]) a, in __vector(T[N]) b) -{ - pragma(inline, true); - return mixin("a " ~ op ~ " b"); -} + __vector(T[N]) binop(string op, T, size_t N)(in __vector(T[N]) a, in __vector(T[N]) b) + { + pragma(inline, true); + return mixin("a " ~ op ~ " b"); + } -__vector(T[N]) unaop(string op, T, size_t N)(in __vector(T[N]) a) if (op[0] == 'u') -{ - pragma(inline, true); - return mixin(op[1 .. $] ~ "a"); + __vector(T[N]) unaop(string op, T, size_t N)(in __vector(T[N]) a) + if (op[0] == 'u') + { + pragma(inline, true); + return mixin(op[1 .. $] ~ "a"); + } } // mixin gen @@ -173,19 +137,27 @@ enum compatibleVecTypes(E, T) = is(T : E); // scalar must be convertible to targ enum compatibleVecTypes(E, Types...) = compatibleVecTypes!(E, Types[0 .. $ / 2]) && compatibleVecTypes!(E, Types[$ / 2 .. $]); -template vectorizeable(E : E[], Args...) +version (GNU_OR_LDC) { - static if (is(vec!E)) - enum vectorizeable = opsSupported!(false, vec!E, Filter!(not!isType, Args)) - && compatibleVecTypes!(E, Filter!(isType, Args)); - else - enum vectorizeable = false; + // leave it to the auto-vectorizer + enum vectorizeable(E : E[], Args...) = false; } - -version (X86_64) unittest +else { - static assert(vectorizeable!(double[], const(double)[], double[], "+", "=")); - static assert(!vectorizeable!(double[], const(ulong)[], double[], "+", "=")); + template vectorizeable(E : E[], Args...) 
+ { + static if (is(vec!E)) + enum vectorizeable = opsSupported!(false, vec!E, Filter!(not!isType, Args)) + && compatibleVecTypes!(E, Filter!(isType, Args)); + else + enum vectorizeable = false; + } + + version (X86_64) unittest + { + static assert(vectorizeable!(double[], const(double)[], double[], "+", "=")); + static assert(!vectorizeable!(double[], const(ulong)[], double[], "+", "=")); + } } bool isUnaryOp(string op) From 69ff7244c8aa95ce6a9a0514c5438885ec7945fa Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Tue, 25 Jul 2017 00:14:51 +0200 Subject: [PATCH 10/12] use __gshared scalar to avoid const-folding - e.g. replacement of ary[] / scalar with weaker ary[] >> 1 --- benchmark/arrayops/arrayops.d | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/benchmark/arrayops/arrayops.d b/benchmark/arrayops/arrayops.d index bce3ea14b1..dae0a1548a 100644 --- a/benchmark/arrayops/arrayops.d +++ b/benchmark/arrayops/arrayops.d @@ -21,13 +21,16 @@ float[6] getLatencies(T, string op)() a[] = 24; b[] = 4; c[] = 2; + __gshared T s = 2; // scalar, use __gshared to avoid const-folding auto sw = StopWatch(AutoStart.yes); foreach (off; size_t(0) .. size_t(64)) { off = off * len + off; - enum op = op.replace("const", "2").replace("a", - "a[off .. off + len]").replace("b", - "b[off .. off + len]").replace("c", "c[off .. off + len]"); + enum op = op + .replace("scalar", "s") + .replace("a", "a[off .. off + len]") + .replace("b", "b[off .. off + len]") + .replace("c", "c[off .. off + len]"); mixin(op ~ ";"); } latency = min(latency, sw.peek.nsecs); @@ -54,13 +57,16 @@ float[4] getThroughput(T, string op)() a[] = 24; b[] = 4; c[] = 2; + __gshared T s = 2; // scalar, use __gshared to avoid const-folding auto sw = StopWatch(AutoStart.yes); foreach (off; size_t(0) .. size_t(64)) { off = off * len + off; - enum op = op.replace("const", "2").replace("a", - "a[off .. off + len]").replace("b", - "b[off .. off + len]").replace("c", "c[off .. 
off + len]"); + enum op = op + .replace("scalar", "s") + .replace("a", "a[off .. off + len]") + .replace("b", "b[off .. off + len]") + .replace("c", "c[off .. off + len]"); mixin(op ~ ";"); } immutable nsecs = sw.peek.nsecs; @@ -78,11 +84,11 @@ string[] genOps() foreach (op1; ["+", "-", "*", "/"]) { ops ~= "a " ~ op1 ~ "= b"; - ops ~= "a " ~ op1 ~ "= const"; + ops ~= "a " ~ op1 ~ "= scalar"; foreach (op2; ["+", "-", "*", "/"]) { ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " c"; - ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " const"; + ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " scalar"; } } return ops; From 6bdc5a4c358861408015443cf0e41dca68e340e5 Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Wed, 26 Jul 2017 12:34:02 +0200 Subject: [PATCH 11/12] add changelog for templated array ops --- changelog/vectorized_array_ops.dd | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 changelog/vectorized_array_ops.dd diff --git a/changelog/vectorized_array_ops.dd b/changelog/vectorized_array_ops.dd new file mode 100644 index 0000000000..0b20c4e4f4 --- /dev/null +++ b/changelog/vectorized_array_ops.dd @@ -0,0 +1,10 @@ +Vectorized array operations are now templated + +Array operations have been converted from dedicated assembly routines for $(B some) array operations to a generic template implementation for $(B all) array operations. This provides huge performance increases (2-4x higher throughput) for array operations that were not previously vectorized. +Furthermore the implementation makes better use of vectorization even for short arrays to heavily reduce latency for some operations (up to 4x). + +For GDC/LDC the implementation relies on auto-vectorization, for DMD the implementation performs the vectorization itself. Support for vector operations with DMD is determined statically (`-mcpu=native`, `-mcpu=avx2`) to avoid binary bloat and the small test overhead. DMD enables SSE2 for 64-bit targets by default.
+ +Also see $(DRUNTIMEPR 1891) + +$(RED Note:) The implementation no longer weakens floating point divisions (e.g. `ary[] / scalar`) to multiplication (`ary[] * (1.0 / scalar)`) as that may reduce precision. To preserve the higher performance of float multiplication when loss of precision is acceptable, use either `-ffast-math` with GDC/LDC or manually rewrite your code to multiply by `(1.0 / scalar)` for DMD. From aee45fbb8043def66f81620c4752072ac22db3d1 Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Wed, 9 Aug 2017 08:28:04 +0200 Subject: [PATCH 12/12] more docs --- src/core/internal/arrayop.d | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d index 724c5b8e11..d24c849e76 100644 --- a/src/core/internal/arrayop.d +++ b/src/core/internal/arrayop.d @@ -5,15 +5,24 @@ version (GNU) version = GNU_OR_LDC; version (LDC) version = GNU_OR_LDC; /** - * Perform array (vector) operations and store the result in `res`. - * Operand types and operations are passed as template arguments in Reverse - * Polish Notation (RPN). + * Perform array (vector) operations and store the result in `res`. Operand + * types and operations are passed as template arguments in Reverse Polish + * Notation (RPN). + * + * Operands can be slices or scalar types. The unqualified element types of all + * slices must be `T`, scalar types must be implicitly convertible to `T`. + * + * Operations are encoded as strings, e.g. `"+"`, `"%"`, `"*="`. Unary + * operations are prefixed with "u", e.g. `"u-"`, `"u~"`. Only the last + * operation can and must be an assignment (`"="`) or op-assignment (`"op="`). + * * All slice operands must have the same length as the result slice.
* - * Params: res = the slice in which to store the results - * args = all other operands + * Params: T[] = type of result slice * Args = operand types and operations in RPN - * T[] = type of result slice + * res = the slice in which to store the results + * args = operand values + * * Returns: the slice containing the result */ T[] arrayOp(T : T[], Args...)(T[] res, Filter!(isType, Args) args) @trusted @nogc pure nothrow @@ -108,7 +117,7 @@ version (DigitalMars) // mixin gen -// filter out ops without matching SSE/SIMD instructions (could be composed of several instructions though) +// Check whether operations `ops` are supported for type `T`. Fails with a human-friendly static assert message, if `fail` is true. template opsSupported(bool fail, T, ops...) if (ops.length > 1) { enum opsSupported = opsSupported!(fail, T, ops[0 .. $ / 2]) @@ -131,7 +140,8 @@ template opsSupported(bool fail, T, string op) } } -// filter out things like float[] = float[] / size_t[] +// check whether slices have the unqualified element type `E` and scalars are implicitly convertible to `E` +// i.e. filter out things like float[] = float[] / size_t[] enum compatibleVecTypes(E, T : T[]) = is(Unqual!T == Unqual!E); // array elem types must be same (maybe add cvtpi2ps) enum compatibleVecTypes(E, T) = is(T : E); // scalar must be convertible to target elem type enum compatibleVecTypes(E, Types...) = compatibleVecTypes!(E, Types[0 .. $ / 2]) @@ -144,6 +154,7 @@ version (GNU_OR_LDC) } else { + // check whether arrayOp is vectorizable template vectorizeable(E : E[], Args...) { static if (is(vec!E)) @@ -183,6 +194,9 @@ bool isBinaryAssignOp(string op) return op.length == 2 && op[1] == '=' && isBinaryOp(op[0 .. 1]); } +// Generate mixin expression to perform scalar arrayOp loop expression, assumes +// `pos` to be the current slice index, `args` to contain operand values, and +// `res` the target slice. 
string scalarExp(Args...)() { string[] stack; @@ -218,6 +232,8 @@ string scalarExp(Args...)() return stack[0]; } +// Generate mixin statement to perform vector loop initialization, assumes +// `args` to contain operand values. string initScalarVecs(Args...)() { size_t scalarsIdx; @@ -234,6 +250,9 @@ string initScalarVecs(Args...)() return res; } +// Generate mixin expression to perform vector arrayOp loop expression, assumes +// `pos` to be the current slice index, `args` to contain operand values, and +// `res` the target slice. string vectorExp(Args...)() { size_t scalarsIdx, argsIdx;