From 0a2ded36170727ad91abbab0b822d6c4a8c66056 Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Thu, 10 Aug 2017 18:43:56 +0200 Subject: [PATCH 1/3] support mixed type arrayops --- src/core/internal/arrayop.d | 174 +++++++++++++++++++++++++++++------- src/core/internal/traits.d | 20 +++++ 2 files changed, 164 insertions(+), 30 deletions(-) diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d index 34531d8a53..5181c1251b 100644 --- a/src/core/internal/arrayop.d +++ b/src/core/internal/arrayop.d @@ -1,5 +1,5 @@ module core.internal.arrayop; -import core.internal.traits : Filter, Unqual; +import core.internal.traits : Filter, staticMap, TypeTuple, Unqual; version (GNU) version = GNU_OR_LDC; version (LDC) version = GNU_OR_LDC; @@ -9,8 +9,8 @@ version (LDC) version = GNU_OR_LDC; * types and operations are passed as template arguments in Reverse Polish * Notation (RPN). - * Operands can be slices or scalar types. The unqualified element types of all - * slices must be `T`, scalar types must be implicitly convertible to `T`. + * Operands can be slices or scalar types. The element types of all + * slices and all scalar types must be implicitly convertible to `T`. * * Operations are encoded as strings, e.g. `"+"`, `"%"`, `"*="`. Unary * operations are prefixed with "u", e.g. `"u-"`, `"u~"`. Only the last @@ -27,7 +27,8 @@ version (LDC) version = GNU_OR_LDC; */ T[] arrayOp(T : T[], Args...)(T[] res, Filter!(isType, Args) args) @trusted @nogc pure nothrow { - enum check = opsSupported!(true, T, Filter!(not!isType, Args)); // must support all scalar ops + alias scalarizedExp = staticMap!(toElementType, Args); + alias check = typeCheck!(true, T, scalarizedExp); // must support all scalar ops size_t pos; static if (vectorizeable!(T[], Args)) @@ -117,35 +118,86 @@ version (DigitalMars) // mixin gen -// Check whether operations `ops` are supported for type `T`. Fails with a human-friendly static assert message, if `fail` is true. -template opsSupported(bool fail, T, ops...) if (ops.length > 1) -{ - enum opsSupported = opsSupported!(fail, T, ops[0 .. $ / 2]) - && opsSupported!(fail, T, ops[$ / 2 .. $]); -} - -template opsSupported(bool fail, T, string op) +/** +Check whether operations on operand types are supported. This +template recursively reduces the expression tree and determines +intermediate types. +Type checking is done here rather than in the compiler to provide more +detailed error messages. + +Params: + fail = whether to fail (static assert) with a human-friendly error message + T = type of result + Args = operand types and operations in RPN +Returns: + The resulting type of the expression +See_Also: + $(LREF arrayOp) +*/ +template typeCheck(bool fail, T, Args...) { - static if (isUnaryOp(op)) + enum idx = staticIndexOf!(not!isType, Args); + static if (isUnaryOp(Args[idx])) { - enum opsSupported = is(typeof((T a) => mixin(op[1 .. $] ~ " a"))); - static assert(!fail || opsSupported, - "Unary op `" ~ op[1 .. $] ~ "` not supported for element type " ~ T.stringof ~ "."); + alias UT = Args[idx - 1]; + enum op = Args[idx][1 .. $]; + static if (is(typeof((UT a) => mixin(op ~ " a")) RT == return)) + alias typeCheck = typeCheck!(fail, T, Args[0 .. idx - 1], RT, Args[idx + 1 .. $]); + else static if (fail) + static assert(0, "Unary `" ~ op ~ "` not supported for type `" ~ UT.stringof ~ "`."); } - else + else static if (isBinaryOp(Args[idx])) { - enum opsSupported = is(typeof((T a, T b) => mixin("a " ~ op ~ " b"))); - static assert(!fail || opsSupported, - "Binary op `" ~ op ~ "` not supported for element type " ~ T.stringof ~ "."); + alias LHT = Args[idx - 2]; + alias RHT = Args[idx - 1]; + enum op = Args[idx]; + static if (is(typeof((LHT a, RHT b) => mixin("a " ~ op ~ " b")) RT == return)) + alias typeCheck = typeCheck!(fail, T, Args[0 .. idx - 2], RT, Args[idx + 1 .. $]); + else static if (fail) + static assert(0, + "Binary `" ~ op ~ "` not supported for types `" + ~ LHT.stringof ~ "` and `" ~ RHT.stringof ~ "`."); } + else static if (Args[idx] == "=" || isBinaryAssignOp(Args[idx])) + { + alias RHT = Args[idx - 1]; + enum op = Args[idx]; + static if (is(T == __vector(ET[N]), ET, size_t N)) + { + // no `cast(T)` before assignment for vectors + static if (is(typeof((T res, RHT b) => mixin("res " ~ op ~ " b")) RT == return) + && // workaround https://issues.dlang.org/show_bug.cgi?id=17758 + (op != "=" || is(Unqual!T == Unqual!RHT))) + alias typeCheck = typeCheck!(fail, T, Args[0 .. idx - 1], RT, Args[idx + 1 .. $]); + else static if (fail) + static assert(0, + "Binary op `" ~ op ~ "` not supported for types `" + ~ T.stringof ~ "` and `" ~ RHT.stringof ~ "`."); + } + else + { + static if (is(typeof((RHT b) => mixin("cast(T) b")))) + { + static if (is(typeof((T res, T b) => mixin("res " ~ op ~ " b")) RT == return)) + alias typeCheck = typeCheck!(fail, T, Args[0 .. idx - 1], RT, Args[idx + 1 .. $]); + else static if (fail) + static assert(0, + "Binary op `" ~ op ~ "` not supported for types `" + ~ T.stringof ~ "` and `" ~ T.stringof ~ "`."); + } + else static if (fail) + static assert(0, + "`cast(" ~ T.stringof ~ ")` not supported for type `" ~ RHT.stringof ~ "`."); + } + } + else + static assert(0); +} +/// ditto +template typeCheck(bool fail, T, ResultType) +{ + alias typeCheck = ResultType; } - -// check whether slices have the unqualified element type `E` and scalars are implicitly convertible to `E` -// i.e. filter out things like float[] = float[] / size_t[] -enum compatibleVecTypes(E, T : T[]) = is(Unqual!T == Unqual!E); // array elem types must be same (maybe add cvtpi2ps) -enum compatibleVecTypes(E, T) = is(T : E); // scalar must be convertible to target elem type -enum compatibleVecTypes(E, Types...) = compatibleVecTypes!(E, Types[0 .. $ / 2]) - && compatibleVecTypes!(E, Types[$ / 2 .. $]); version (GNU_OR_LDC) { @@ -158,16 +210,23 @@ else template vectorizeable(E : E[], Args...) { static if (is(vec!E)) - enum vectorizeable = opsSupported!(false, vec!E, Filter!(not!isType, Args)) - && compatibleVecTypes!(E, Filter!(isType, Args)); + { + // type check with vector types + enum vectorizeable = is(typeCheck!(false, vec!E, staticMap!(toVecType, Args))); + } else enum vectorizeable = false; } version (X86_64) unittest { + pragma(msg, vectorizeable!(double[], const(double)[], double[], "+", "=")); static assert(vectorizeable!(double[], const(double)[], double[], "+", "=")); static assert(!vectorizeable!(double[], const(ulong)[], double[], "+", "=")); + // Vector type are (atm.) not implicitly convertible and would require + // lots of SIMD intrinsics. Therefor leave mixed type array ops to + // GDC/LDC's auto-vectorizers. + static assert(!vectorizeable!(double[], const(uint)[], uint, "+", "=")); } } @@ -224,7 +283,7 @@ string scalarExp(Args...)() } else static if (isBinaryOp(arg)) { - stack[$ - 2] = "(cast(T)(" ~ stack[$ - 2] ~ " " ~ arg ~ " " ~ stack[$ - 1] ~ "))"; + stack[$ - 2] = "(" ~ stack[$ - 2] ~ " " ~ arg ~ " " ~ stack[$ - 1] ~ ")"; stack.length -= 1; } else @@ -302,6 +361,33 @@ template not(alias tmlp) { enum not(Args...) = !tmlp!Args; } +/** +Find element in `haystack` for which `pred` is true. + +Params: + pred = the template predicate + haystack = elements to search +Returns: + The first index for which `pred!haystack[index]` is true or -1. + */ +template staticIndexOf(alias pred, haystack...) +{ + static if (pred!(haystack[0])) + enum staticIndexOf = 0; + else + { + enum next = staticIndexOf!(pred, haystack[1 .. $]); + enum staticIndexOf = next == -1 ? -1 : next + 1; + } +} +/// converts slice types to their element type, preserves anything else +alias toElementType(E : E[]) = E; +alias toElementType(S) = S; +alias toElementType(alias op) = op; +/// converts slice types to their element type, preserves anything else +alias toVecType(E : E[]) = vec!E; +alias toVecType(S) = vec!S; +alias toVecType(alias op) = op; string toString(size_t num) { @@ -449,3 +535,31 @@ unittest static assert(is(typeof(&arrayOp!(S2[], S2[], S2[], S2, "*", "+", "=")))); static assert(is(typeof(&arrayOp!(S2[], S2[], S2, "*", "+=")))); } + +// test mixed type array op +unittest +{ + uint[32] a = 0xF; + float[32] res = 2.0f; + arrayOp!(float[], const(uint)[], uint, "&", "*=")(res[], a[], 12); + foreach (v; res[]) + assert(v == 24.0f); +} + +// test mixed type array op +unittest +{ + static struct S + { + float opBinary(string op)(in S) @nogc const pure nothrow + { + return 2.0f; + } + } + + float[32] res = 24.0f; + S[32] s; + arrayOp!(float[], const(S)[], const(S)[], "+", "/=")(res[], s[], s[]); + foreach (v; res[]) + assert(v == 12.0f); +} diff --git a/src/core/internal/traits.d b/src/core/internal/traits.d index e15dff5204..7f790367c9 100644 --- a/src/core/internal/traits.d +++ b/src/core/internal/traits.d @@ -210,3 +210,23 @@ template Filter(alias pred, TList...) Filter!(pred, TList[$/2 .. $ ])); } } + +// std.meta.staticMap +template staticMap(alias F, T...) +{ + static if (T.length == 0) + { + alias staticMap = TypeTuple!(); + } + else static if (T.length == 1) + { + alias staticMap = TypeTuple!(F!(T[0])); + } + else + { + alias staticMap = + TypeTuple!( + staticMap!(F, T[ 0 .. $/2]), + staticMap!(F, T[$/2 .. $ ])); + } +} From 7ac6acd04d4754b93c32df33eb2bf01465da2805 Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Thu, 17 Aug 2017 13:46:24 +0200 Subject: [PATCH 2/3] fix wrong argument index for scalar expression --- src/core/internal/arrayop.d | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/core/internal/arrayop.d b/src/core/internal/arrayop.d index 5181c1251b..6922295214 100644 --- a/src/core/internal/arrayop.d +++ b/src/core/internal/arrayop.d @@ -297,16 +297,17 @@ string scalarExp(Args...)() // `args` to contain operand values. string initScalarVecs(Args...)() { - size_t scalarsIdx; + size_t scalarsIdx, argsIdx; string res; - foreach (aidx, arg; Args) + foreach (arg; Args) { static if (is(arg == T[], T)) { + ++argsIdx; } else static if (is(arg)) res ~= "immutable vec scalar" ~ scalarsIdx++.toString ~ " = args[" - ~ aidx.toString ~ "];\n"; + ~ argsIdx++.toString ~ "];\n"; } return res; } @@ -318,7 +319,7 @@ string vectorExp(Args...)() { size_t scalarsIdx, argsIdx; string[] stack; - foreach (i, arg; Args) + foreach (arg; Args) { static if (is(arg == T[], T)) stack ~= "load(&args[" ~ argsIdx++.toString ~ "][pos])"; @@ -563,3 +564,13 @@ unittest foreach (v; res[]) assert(v == 12.0f); } + +// test scalar after operation argument +unittest +{ + float[32] res, a = 2, b = 3; + float c = 4; + arrayOp!(float[], const(float)[], const(float)[], "*", float, "+", "=")(res[], a[], b[], c); + foreach (v; res[]) + assert(v == 2 * 3 + 4); +} From 823f87f24c97df7eded3386369178ee2ec877bd0 Mon Sep 17 00:00:00 2001 From: Martin Nowak Date: Thu, 17 Aug 2017 13:52:23 +0200 Subject: [PATCH 3/3] Revert "Revert "add changelog for templated array ops"" This reverts commit e05113134fa21c9e7595ef6dd0df807a764f69a0. --- changelog/vectorized_array_ops.dd | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 changelog/vectorized_array_ops.dd diff --git a/changelog/vectorized_array_ops.dd b/changelog/vectorized_array_ops.dd new file mode 100644 index 0000000000..0b20c4e4f4 --- /dev/null +++ b/changelog/vectorized_array_ops.dd @@ -0,0 +1,10 @@ +Vectorized array operations are now templated + +Array operations have been converted from dedicated assembly routines for $(B some) array operations to a generic template implementation for $(B all) array operations. This provides huge performance increases (2-4x higher throughput) for array operations that were not previously vectorized. +Furthermore the implementation makes better use of vectorization even for short arrays to heavily reduce latency for some operations (up to 4x). + +For GDC/LDC the implementation relies on auto-vectorization, for DMD the implementation performs the vectorization itself. Support for vector operations with DMD is determined statically (`-march=native`, `-march=avx2`) to avoid binary bloat and the small test overhead. DMD enables SSE2 for 64-bit targets by default. + +Also see $(DRUNTIMEPR 1891) + +$(RED Note:) The implementation no longer weakens floating point divisions (e.g. `ary[] / scalar`) to multiplication (`ary[] * (1.0 / scalar)`) as that may reduce precision. To preserve the higher performance of float multiplication when loss of precision is acceptable, use either `-ffast-math` with GDC/LDC or manually rewrite your code to multiply by `(1.0 / scalar)` for DMD.