Skip to content
This repository was archived by the owner on Oct 12, 2022. It is now read-only.
/ druntime Public archive
24 changes: 15 additions & 9 deletions benchmark/arrayops/arrayops.d
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,16 @@ float[6] getLatencies(T, string op)()
a[] = 24;
b[] = 4;
c[] = 2;
__gshared T s = 2; // scalar, use __gshared to avoid const-folding
auto sw = StopWatch(AutoStart.yes);
foreach (off; size_t(0) .. size_t(64))
{
off = off * len + off;
enum op = op.replace("const", "2").replace("a",
"a[off .. off + len]").replace("b",
"b[off .. off + len]").replace("c", "c[off .. off + len]");
enum op = op
.replace("scalar", "s")
.replace("a", "a[off .. off + len]")
.replace("b", "b[off .. off + len]")
.replace("c", "c[off .. off + len]");
mixin(op ~ ";");
}
latency = min(latency, sw.peek.nsecs);
Expand All @@ -54,13 +57,16 @@ float[4] getThroughput(T, string op)()
a[] = 24;
b[] = 4;
c[] = 2;
__gshared T s = 2; // scalar, use __gshared to avoid const-folding
auto sw = StopWatch(AutoStart.yes);
foreach (off; size_t(0) .. size_t(64))
{
off = off * len + off;
enum op = op.replace("const", "2").replace("a",
"a[off .. off + len]").replace("b",
"b[off .. off + len]").replace("c", "c[off .. off + len]");
enum op = op
.replace("scalar", "s")
.replace("a", "a[off .. off + len]")
.replace("b", "b[off .. off + len]")
.replace("c", "c[off .. off + len]");
mixin(op ~ ";");
}
immutable nsecs = sw.peek.nsecs;
Expand All @@ -78,11 +84,11 @@ string[] genOps()
foreach (op1; ["+", "-", "*", "/"])
{
ops ~= "a " ~ op1 ~ "= b";
ops ~= "a " ~ op1 ~ "= const";
ops ~= "a " ~ op1 ~ "= scalar";
foreach (op2; ["+", "-", "*", "/"])
{
ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " c";
ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " const";
ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " scalar";
}
}
return ops;
Expand Down Expand Up @@ -180,7 +186,7 @@ void main()
unmaskFPUExceptions;

writefln("type, op, %(latency%s, %), %-(throughput%s, %)", iota(6)
.map!(i => 1 << i), ["8KB", "32KB", "512KB", "32MB"]);
.map!(i => 1 << i), ["8KB", "32KB", "512KB", "32768KB"]);
foreach (op; mixin("AliasSeq!(%(%s, %))".format(genOps)))
runOp!op;
maskFPUExceptions;
Expand Down
50 changes: 26 additions & 24 deletions benchmark/arrayops/plot.R
Original file line number Diff line number Diff line change
@@ -1,33 +1,35 @@
# Use `R --vanilla < plot.R` to run this script.
# It will read all *.csv files from the current folder and create a comparison plot for them.
# Use `Rscript --vanilla plot.R old.csv new.csv` to run this script.
# It reads old.csv and new.csv and plots the new results as a percentage of the old ones.
library(ggplot2)
library(dplyr)
library(tidyr)

dat <- NULL
files <- list.files(pattern='*.csv')
for (file in files)
{
datFile <- read.csv(file) %>% tbl_df() %>%
mutate(file=file)
if (is.null(dat))
dat = datFile
else
dat = bind_rows(dat, datFile)
}
args <- commandArgs(trailingOnly=T)
old <- read.csv(args[1]) %>% tbl_df()
new <- read.csv(args[2]) %>% tbl_df()

latencies <- gather(dat %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency'))
throughputs <- gather(dat %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput'))
col.indices <- which(!colnames(new) %in% c("type", "op"))

levels(latencies$num_elems) <- sub("latency(\\d+)", "\\1", levels(latencies$num_elems))
levels(throughputs$array_size) <- sub("throughput(.+)", "\\1", levels(throughputs$array_size))
# relative values
new[,col.indices] <- 100 * new[,col.indices] / old[,col.indices]

img <- qplot(num_elems, latency, group=type, data=latencies, geom="line", color=type) +
facet_grid(op ~ file, scales="free_y") +
labs(x="num elements", y="latency / ns")
ggsave('array_ops_latency.svg', plot = img, width = 2 + 3 * length(files), height = 40)
# arrange type factor levels
new$type <- factor(new$type, levels = c('byte', 'ubyte', 'short', 'ushort', 'int', 'uint', 'long', 'ulong', 'float', 'double'))

img <- qplot(array_size, throughput, group=type, data=throughputs, geom="line", color=type) +
facet_grid(op ~ file, scales="free_y") +
labs(x="array size", y="throughput / (ops / ns)")
ggsave('array_ops_throughput.svg', plot = img, width = 2 + 3 * length(files), height = 40)
latencies <- gather(new %>% select(-starts_with('throughput')), num_elems, latency, starts_with('latency')) %>%
mutate(num_elems = factor(as.integer(sub("latency(\\d+)", "\\1", num_elems))))
throughputs <- gather(new %>% select(-starts_with('latency')), array_size, throughput, starts_with('throughput')) %>%
mutate(array_size = factor(as.integer(sub("throughput(\\d+)KB", "\\1", array_size))))

img <- ggplot(latencies, aes(x=num_elems, y=latency, fill=type)) +
geom_bar(position="dodge", stat="identity") +
facet_grid(op ~ ., scales="free_y") +
labs(x="num elements", y="relative latency / %")
ggsave('array_ops_latency.png', plot = img, width = 8, height = 40)

img <- ggplot(throughputs, aes(x=array_size, y=throughput, fill=type)) +
geom_bar(position="dodge", stat="identity") +
facet_grid(op ~ ., scales="free_y") +
labs(x="array size / KB", y="relative throughput / %")
ggsave('array_ops_throughput.png', plot = img, width = 8, height = 40)
10 changes: 10 additions & 0 deletions changelog/vectorized_array_ops.dd
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Vectorized array operations are now templated

Array operations have been converted from dedicated assembly routines for $(B some) array operations to a generic template implementation for $(B all) array operations. This provides huge performance increases (2-4x higher throughput) for array operations that were not previously vectorized.
Furthermore, the implementation makes better use of vectorization even for short arrays, heavily reducing latency for some operations (up to 4x).

For GDC/LDC the implementation relies on auto-vectorization; for DMD the implementation performs the vectorization itself. Support for vector operations with DMD is determined statically (`-mcpu=native`, `-mcpu=avx2`) to avoid binary bloat and the small overhead of testing for it at run time. DMD enables SSE2 for 64-bit targets by default.

Also see $(DRUNTIMEPR 1891)

$(RED Note:) The implementation no longer weakens floating-point divisions (e.g. `ary[] / scalar`) to multiplications (`ary[] * (1.0 / scalar)`), because that may reduce precision. To preserve the higher performance of float multiplication when loss of precision is acceptable, either use `-ffast-math` with GDC/LDC or manually rewrite your code to multiply by `(1.0 / scalar)` for DMD.
1 change: 1 addition & 0 deletions mak/COPY
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ COPY=\
$(IMPDIR)\core\vararg.d \
\
$(IMPDIR)\core\internal\abort.d \
$(IMPDIR)\core\internal\arrayop.d \
$(IMPDIR)\core\internal\convert.d \
$(IMPDIR)\core\internal\hash.d \
$(IMPDIR)\core\internal\spinlock.d \
Expand Down
1 change: 1 addition & 0 deletions mak/SRCS
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ SRCS=\
src\core\vararg.d \
\
src\core\internal\abort.d \
src\core\internal\arrayop.d \
src\core\internal\convert.d \
src\core\internal\hash.d \
src\core\internal\spinlock.d \
Expand Down
Loading