From aaead7ed8425dbfd44b48fbe81e0b8ea4a9dd241 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Thu, 30 Jun 2022 09:34:01 -0500 Subject: [PATCH 1/9] f --- src/pixie/internal.nim | 6 +++--- src/pixie/runtimechecked/avx.nim | 4 +++- src/pixie/simd.nim | 8 +++++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index 25248476..c7720d96 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -79,16 +79,16 @@ proc fillUnsafe*( ) {.raises: [].} = ## Fills the image data with the color starting at index start and ## continuing for len indices. - let rgbx = color.asRgbx() - when allowSimd and compiles(fillUnsafeSimd): fillUnsafeSimd( cast[ptr UncheckedArray[ColorRGBX]](data[start].addr), len, - rgbx + color ) return + let rgbx = color.asRgbx() + # Use memset when every byte has the same value if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a: nimSetMem(data[start].addr, rgbx.r.cint, len * 4) diff --git a/src/pixie/runtimechecked/avx.nim b/src/pixie/runtimechecked/avx.nim index cb6d8e09..1b7f2715 100644 --- a/src/pixie/runtimechecked/avx.nim +++ b/src/pixie/runtimechecked/avx.nim @@ -9,8 +9,10 @@ when defined(release): proc fillUnsafeAvx*( data: ptr UncheckedArray[ColorRGBX], len: int, - rgbx: ColorRGBX + color: SomeColor ) = + let rgbx = color.asRgbx() + var i: int while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes data[i] = rgbx diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index d789bdfb..359da3f6 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -1,4 +1,4 @@ -import chroma, vmath +import chroma when defined(release): {.push checks: off.} @@ -33,11 +33,13 @@ when defined(amd64): proc fillUnsafeSimd*( data: ptr UncheckedArray[ColorRGBX], len: int, - rgbx: ColorRGBX + color: SomeColor ) = if cpuHasAvx and len >= 64: - fillUnsafeAvx(data, len, rgbx) + fillUnsafeAvx(data, len, color) else: + let rgbx = color.asRgbx() + var i: int while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes data[i] = rgbx From 96448949032f2ec5e5196fbc6bef2532d0da92cd Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Thu, 30 Jun 2022 09:49:42 -0500 Subject: [PATCH 2/9] fillUnsafeSimd --- src/pixie/internal.nim | 6 +---- src/pixie/runtimechecked/avx.nim | 23 +++++++++------- src/pixie/simd.nim | 46 ++++++++++++++++++-------------- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index c7720d96..343c6308 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -80,11 +80,7 @@ proc fillUnsafe*( ## Fills the image data with the color starting at index start and ## continuing for len indices. when allowSimd and compiles(fillUnsafeSimd): - fillUnsafeSimd( - cast[ptr UncheckedArray[ColorRGBX]](data[start].addr), - len, - color - ) + fillUnsafeSimd(data, start, len, color) return let rgbx = color.asRgbx() diff --git a/src/pixie/runtimechecked/avx.nim b/src/pixie/runtimechecked/avx.nim index 1b7f2715..faad7ec9 100644 --- a/src/pixie/runtimechecked/avx.nim +++ b/src/pixie/runtimechecked/avx.nim @@ -7,25 +7,30 @@ when defined(release): {.push checks: off.} proc fillUnsafeAvx*( - data: ptr UncheckedArray[ColorRGBX], - len: int, + data: var seq[ColorRGBX], + start, len: int, color: SomeColor ) = let rgbx = color.asRgbx() - var i: int - while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes + var + i = start + p = cast[uint](data[i].addr) + # Align to 32 bytes + while i < (start + len) and (p and 31) != 0: data[i] = rgbx inc i + p += 4 let - iterations = (len - i) div 8 + iterations = (start + len - i) div 8 colorVec = mm256_set1_epi32(cast[int32](rgbx)) for _ in 0 ..< iterations: - mm256_store_si256(data[i].addr, colorVec) - i += 8 - # Fill whatever is left the slow way - for i in i ..< len: + mm256_store_si256(cast[pointer](p), colorVec) + p += 32 + i += 8 * iterations + + for i in i ..< start + len: data[i] = rgbx when defined(release): diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index 359da3f6..2beee0eb 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -31,30 +31,36 @@ when defined(amd64): result = mm_unpacklo_epi8(mm_setzero_si128(), result) proc fillUnsafeSimd*( - data: ptr UncheckedArray[ColorRGBX], - len: int, + data: var seq[ColorRGBX], + start, len: int, color: SomeColor ) = - if cpuHasAvx and len >= 64: - fillUnsafeAvx(data, len, color) - else: - let rgbx = color.asRgbx() + if cpuHasAvx: + fillUnsafeAvx(data, start, len, color) + return - var i: int - while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes - data[i] = rgbx - inc i + let rgbx = color.asRgbx() - let - colorVec = mm_set1_epi32(cast[int32](rgbx)) - iterations = (len - i) div 8 - for _ in 0 ..< iterations: - mm_store_si128(data[i].addr, colorVec) - mm_store_si128(data[i + 4].addr, colorVec) - i += 8 - - for i in i ..< len: - data[i] = rgbx + var + i = start + p = cast[uint](data[i].addr) + # Align to 16 bytes + while i < (start + len) and (p and 15) != 0: + data[i] = rgbx + inc i + p += 4 + + let + colorVec = mm_set1_epi32(cast[int32](rgbx)) + iterations = (start + len - i) div 8 + for _ in 0 ..< iterations: + mm_store_si128(cast[pointer](p), colorVec) + mm_store_si128(cast[pointer](p + 16), colorVec) + p += 32 + i += iterations * 8 + + for i in i ..< start + len: + data[i] = rgbx proc isOneColorSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = if cpuHasAvx2: From af5045ccb889a888bd25f83004c83c353561b571 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Thu, 30 Jun 2022 10:12:16 -0500 Subject: [PATCH 3/9] invertImageSimd invertMaskSimd --- src/pixie/images.nim | 5 +- src/pixie/masks.nim | 14 ++--- src/pixie/runtimechecked/avx.nim | 2 +- src/pixie/simd.nim | 91 ++++++++++++++++++++++---------- 4 files changed, 71 insertions(+), 41 deletions(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index cd838b8e..21b4a685 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -386,10 +386,7 @@ proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} = proc invert*(image: Image) {.raises: [].} = ## Inverts all of the colors and alpha. when allowSimd and compiles(invertImageSimd): - invertImageSimd( - cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr), - image.data.len - ) + invertImageSimd(image.data) return for i in 0 ..< image.data.len: diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim index 40b2ea48..e8d380a5 100644 --- a/src/pixie/masks.nim +++ b/src/pixie/masks.nim @@ -1,7 +1,10 @@ import common, internal, vmath -when defined(amd64) and allowSimd: - import nimsimd/sse2 +when allowSimd: + import simd + + when defined(amd64): + import nimsimd/sse2 type Mask* = ref object @@ -234,11 +237,8 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} = proc invert*(mask: Mask) {.raises: [].} = ## Inverts all of the values - creates a negative of the mask. - when allowSimd and compiles(invertImageSimd): - invertMaskSimd( - cast[ptr UncheckedArray[uint8]](mask.data[0].addr), - mask.data.len - ) + when allowSimd and compiles(invertMaskSimd): + invertMaskSimd(mask.data) return for i in 0 ..< mask.data.len: diff --git a/src/pixie/runtimechecked/avx.nim b/src/pixie/runtimechecked/avx.nim index faad7ec9..c18e9c64 100644 --- a/src/pixie/runtimechecked/avx.nim +++ b/src/pixie/runtimechecked/avx.nim @@ -23,8 +23,8 @@ proc fillUnsafeAvx*( p += 4 let - iterations = (start + len - i) div 8 colorVec = mm256_set1_epi32(cast[int32](rgbx)) + iterations = (start + len - i) div 8 for _ in 0 ..< iterations: mm256_store_si256(cast[pointer](p), colorVec) p += 32 diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index 2beee0eb..c0f55338 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -243,22 +243,38 @@ when defined(amd64): for i in i ..< len: dst[i] = src[i].a - proc invertImageSimd*(data: ptr UncheckedArray[ColorRGBX], len: int) = - var i: int - let vec255 = mm_set1_epi8(cast[int8](255)) - for _ in 0 ..< len div 16: - let - a = mm_loadu_si128(data[i + 0].addr) - b = mm_loadu_si128(data[i + 4].addr) - c = mm_loadu_si128(data[i + 8].addr) - d = mm_loadu_si128(data[i + 12].addr) - mm_storeu_si128(data[i + 0].addr, mm_sub_epi8(vec255, a)) - mm_storeu_si128(data[i + 4].addr, mm_sub_epi8(vec255, b)) - mm_storeu_si128(data[i + 8].addr, mm_sub_epi8(vec255, c)) - mm_storeu_si128(data[i + 12].addr, mm_sub_epi8(vec255, d)) - i += 16 + proc invertImageSimd*(data: var seq[ColorRGBX]) = + var + i: int + p = cast[uint](data[0].addr) + # Align to 16 bytes + while i < data.len and (p and 15) != 0: + var rgbx = data[i] + rgbx.r = 255 - rgbx.r + rgbx.g = 255 - rgbx.g + rgbx.b = 255 - rgbx.b + rgbx.a = 255 - rgbx.a + data[i] = rgbx + inc i + p += 4 - for i in i ..< len: + let + vec255 = mm_set1_epi8(255) + iterations = data.len div 16 + for _ in 0 ..< iterations: + let + a = mm_load_si128(cast[pointer](p)) + b = mm_load_si128(cast[pointer](p + 16)) + c = mm_load_si128(cast[pointer](p + 32)) + d = mm_load_si128(cast[pointer](p + 48)) + mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a)) + mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b)) + mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c)) + mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d)) + p += 64 + i += 16 * iterations + + for i in i ..< data.len: var rgbx = data[i] rgbx.r = 255 - rgbx.r rgbx.g = 255 - rgbx.g @@ -266,19 +282,36 @@ when defined(amd64): rgbx.a = 255 - rgbx.a data[i] = rgbx - toPremultipliedAlphaSimd(cast[ptr UncheckedArray[uint32]](data), len) + toPremultipliedAlphaSimd(cast[ptr UncheckedArray[uint32]](data[0].addr), data.len) - proc invertMaskSimd*(data: ptr UncheckedArray[uint8], len: int) = - var i: int - let vec255 = mm_set1_epi8(255) - for _ in 0 ..< len div 16: - var values = mm_loadu_si128(data[i].addr) - values = mm_sub_epi8(vec255, values) - mm_storeu_si128(data[i].addr, values) - i += 16 + proc invertMaskSimd*(data: var seq[uint8]) = + var + i: int + p = cast[uint](data[0].addr) + # Align to 16 bytes + while i < data.len and (p and 15) != 0: + data[i] = 255 - data[i] + inc i + inc p - for j in i ..< len: - data[j] = 255 - data[j] + let + vec255 = mm_set1_epi8(255) + iterations = data.len div 64 + for _ in 0 ..< iterations: + let + a = mm_load_si128(cast[pointer](p)) + b = mm_load_si128(cast[pointer](p + 16)) + c = mm_load_si128(cast[pointer](p + 32)) + d = mm_load_si128(cast[pointer](p + 48)) + mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a)) + mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b)) + mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c)) + mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d)) + p += 64 + i += 64 * iterations + + for i in i ..< data.len: + data[i] = 255 - data[i] proc ceilMaskSimd*(data: ptr UncheckedArray[uint8], len: int) = var i: int @@ -303,10 +336,10 @@ when defined(amd64): ) = var i: int let - oddMask = mm_set1_epi16(cast[int16](0xff00)) - div255 = mm_set1_epi16(cast[int16](0x8081)) + oddMask = mm_set1_epi16(0xff00) + div255 = mm_set1_epi16(0x8081) zeroVec = mm_setzero_si128() - opacityVec = mm_slli_epi16(mm_set1_epi16(cast[int16](opacity)), 8) + opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8) for _ in 0 ..< len div 16: let values = mm_loadu_si128(data[i].addr) if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff: From d76550052e5ea0903eb9f85ac24ecafc554887f8 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Thu, 30 Jun 2022 10:16:58 -0500 Subject: [PATCH 4/9] toPremultipliedAlphaSimd --- src/pixie/internal.nim | 5 +-- src/pixie/runtimechecked/avx2.nim | 28 +++++++----- src/pixie/simd.nim | 73 ++++++++++++++++--------------- 3 files changed, 57 insertions(+), 49 deletions(-) diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index 343c6308..d571b5c5 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -113,10 +113,7 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} = ## Converts an image to premultiplied alpha from straight alpha. when allowSimd and compiles(toPremultipliedAlphaSimd): - toPremultipliedAlphaSimd( - cast[ptr UncheckedArray[uint32]](data[0].addr), - data.len - ) + toPremultipliedAlphaSimd(data) return for i in 0 ..< data.len: diff --git a/src/pixie/runtimechecked/avx2.nim b/src/pixie/runtimechecked/avx2.nim index da34dd4e..cedd2270 100644 --- a/src/pixie/runtimechecked/avx2.nim +++ b/src/pixie/runtimechecked/avx2.nim @@ -87,17 +87,17 @@ proc isOpaqueAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = if data[i].a != 255: return false -proc toPremultipliedAlphaAvx2*( - data: ptr UncheckedArray[uint32], - len: int -): int = +proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) = + var i: int + let alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) - oddMask = mm256_set1_epi16(cast[int16](0xff00)) - div255 = mm256_set1_epi16(cast[int16](0x8081)) - for _ in 0 ..< len div 8: + oddMask = mm256_set1_epi16(0xff00) + div255 = mm256_set1_epi16(0x8081) + iterations = data.len div 8 + for _ in 0 ..< iterations: let - values = mm256_loadu_si256(data[result].addr) + values = mm256_loadu_si256(data[i].addr) alpha = mm256_and_si256(values, alphaMask) eq = mm256_cmpeq_epi8(values, alphaMask) if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888: @@ -112,10 +112,18 @@ proc toPremultipliedAlphaAvx2*( colorsEven = mm256_srli_epi16(mm256_mulhi_epu16(colorsEven, div255), 7) colorsOdd = mm256_srli_epi16(mm256_mulhi_epu16(colorsOdd, div255), 7) mm256_storeu_si256( - data[result].addr, + data[i].addr, mm256_or_si256(colorsEven, mm256_slli_epi16(colorsOdd, 8)) ) - result += 8 + i += 8 + + for i in i ..< data.len: + var c = data[i] + if c.a != 255: + c.r = ((c.r.uint32 * c.a) div 255).uint8 + c.g = ((c.g.uint32 * c.a) div 255).uint8 + c.b = ((c.b.uint32 * c.a) div 255).uint8 + data[i] = c when defined(release): {.pop.} diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index c0f55338..890fc69a 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -163,44 +163,47 @@ when defined(amd64): if data[i].a != 255: return false - proc toPremultipliedAlphaSimd*(data: ptr UncheckedArray[uint32], len: int) = - var i: int + proc toPremultipliedAlphaSimd*(data: var seq[ColorRGBA | ColorRGBX]) = if cpuHasAvx2: - i = toPremultipliedAlphaAvx2(data, len) - else: + toPremultipliedAlphaAvx2(data) + return + + var i: int + + let + alphaMask = mm_set1_epi32(cast[int32](0xff000000)) + oddMask = mm_set1_epi16(0xff00) + div255 = mm_set1_epi16(0x8081) + iterations = data.len div 4 + for _ in 0 ..< iterations: let - alphaMask = mm_set1_epi32(cast[int32](0xff000000)) - oddMask = mm_set1_epi16(cast[int16](0xff00)) - div255 = mm_set1_epi16(cast[int16](0x8081)) - for _ in 0 ..< len div 4: + values = mm_loadu_si128(data[i].addr) + alpha = mm_and_si128(values, alphaMask) + eq = mm_cmpeq_epi8(values, alphaMask) + if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: let - values = mm_loadu_si128(data[i].addr) - alpha = mm_and_si128(values, alphaMask) - eq = mm_cmpeq_epi8(values, alphaMask) - if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: - let - evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16)) - oddMultiplier = mm_or_si128(evenMultiplier, alphaMask) - var - colorsEven = mm_slli_epi16(values, 8) - colorsOdd = mm_and_si128(values, oddMask) - colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier) - colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier) - colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7) - colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7) - mm_storeu_si128( - data[i].addr, - mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8)) - ) - i += 4 + evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16)) + oddMultiplier = mm_or_si128(evenMultiplier, alphaMask) + var + colorsEven = mm_slli_epi16(values, 8) + colorsOdd = mm_and_si128(values, oddMask) + colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier) + colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier) + colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7) + colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7) + mm_storeu_si128( + data[i].addr, + mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8)) + ) + i += 4 - for i in i ..< len: - var c: ColorRGBX - copyMem(c.addr, data[i].addr, 4) - c.r = ((c.r.uint32 * c.a) div 255).uint8 - c.g = ((c.g.uint32 * c.a) div 255).uint8 - c.b = ((c.b.uint32 * c.a) div 255).uint8 - copyMem(data[i].addr, c.addr, 4) + for i in i ..< data.len: + var c = data[i] + if c.a != 255: + c.r = ((c.r.uint32 * c.a) div 255).uint8 + c.g = ((c.g.uint32 * c.a) div 255).uint8 + c.b = ((c.b.uint32 * c.a) div 255).uint8 + data[i] = c proc newImageFromMaskSimd*( dst: ptr UncheckedArray[ColorRGBX], @@ -282,7 +285,7 @@ when defined(amd64): rgbx.a = 255 - rgbx.a data[i] = rgbx - toPremultipliedAlphaSimd(cast[ptr UncheckedArray[uint32]](data[0].addr), data.len) + toPremultipliedAlphaSimd(data) proc invertMaskSimd*(data: var seq[uint8]) = var From 1c6fa86ac0e4625b5043a360ae6d8a20ce5f9f16 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Thu, 30 Jun 2022 10:21:53 -0500 Subject: [PATCH 5/9] isOpaqueSimd --- src/pixie/internal.nim | 5 +---- src/pixie/runtimechecked/avx2.nim | 11 ++++++----- src/pixie/simd.nim | 13 +++++++------ 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim index d571b5c5..18ee742a 100644 --- a/src/pixie/internal.nim +++ b/src/pixie/internal.nim @@ -126,10 +126,7 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool = when allowSimd and compiles(isOpaqueSimd): - return isOpaqueSimd( - cast[ptr UncheckedArray[ColorRGBX]](data[start].addr), - len - ) + return isOpaqueSimd(data, start, len) result = true diff --git a/src/pixie/runtimechecked/avx2.nim b/src/pixie/runtimechecked/avx2.nim index cedd2270..96eb1c4a 100644 --- a/src/pixie/runtimechecked/avx2.nim +++ b/src/pixie/runtimechecked/avx2.nim @@ -61,18 +61,19 @@ proc isTransparentAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = if data[i].a != 0: return false -proc isOpaqueAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = +proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool = result = true - var i: int - while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes + var i = start + # Align to 32 bytes + while i < (start + len) and (cast[uint](data[i].addr) and 31) != 0: if data[i].a != 255: return false inc i let vec255 = mm256_set1_epi8(255) - iterations = (len - i) div 16 + iterations = (start + len - i) div 16 for _ in 0 ..< iterations: let values0 = mm256_load_si256(data[i].addr) @@ -83,7 +84,7 @@ proc isOpaqueAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = return false i += 16 - for i in i ..< len: + for i in i ..< start + len: if data[i].a != 255: return false diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index 890fc69a..454c925e 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -130,21 +130,22 @@ when defined(amd64): if data[i].a != 0: return false - proc isOpaqueSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = + proc isOpaqueSimd*(data: var seq[ColorRGBX], start, len: int): bool = if cpuHasAvx2: - return isOpaqueAvx2(data, len) + return isOpaqueAvx2(data, start, len) result = true - var i: int - while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes + var i = start + # Align to 16 bytes + while i < (start + len) and (cast[uint](data[i].addr) and 15) != 0: if data[i].a != 255: return false inc i let vec255 = mm_set1_epi8(255) - iterations = (len - i) div 16 + iterations = (start + len - i) div 16 for _ in 0 ..< iterations: let values0 = mm_load_si128(data[i].addr) @@ -159,7 +160,7 @@ when defined(amd64): return false i += 16 - for i in i ..< len: + for i in i ..< start + len: if data[i].a != 255: return false From f5825daf10c3956b50066e61c4435bbcf7cdbdec Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Thu, 30 Jun 2022 10:29:16 -0500 Subject: [PATCH 6/9] ceilMaskSimd --- src/pixie/masks.nim | 5 +---- src/pixie/simd.nim | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim index e8d380a5..576c7ed0 100644 --- a/src/pixie/masks.nim +++ b/src/pixie/masks.nim @@ -308,10 +308,7 @@ proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} = proc ceil*(mask: Mask) {.raises: [].} = ## A value of 0 stays 0. Anything else turns into 255. when allowSimd and compiles(invertImageSimd): - ceilMaskSimd( - cast[ptr UncheckedArray[uint8]](mask.data[0].addr), - mask.data.len - ) + ceilMaskSimd(mask.data) return for i in 0 ..< mask.data.len: diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index 454c925e..11b1da40 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -317,19 +317,24 @@ when defined(amd64): for i in i ..< data.len: data[i] = 255 - data[i] - proc ceilMaskSimd*(data: ptr UncheckedArray[uint8], len: int) = - var i: int + proc ceilMaskSimd*(data: var seq[uint8]) = + var + i: int + p = cast[uint](data[0].addr) + let zeroVec = mm_setzero_si128() vec255 = mm_set1_epi8(255) - for _ in 0 ..< len div 16: - var values = mm_loadu_si128(data[i].addr) + iterations = data.len div 16 + for _ in 0 ..< iterations: + var values = mm_loadu_si128(cast[pointer](p)) values = mm_cmpeq_epi8(values, zeroVec) values = mm_andnot_si128(values, vec255) - mm_storeu_si128(data[i].addr, values) - i += 16 + mm_storeu_si128(cast[pointer](p), values) + p += 16 + i += 16 * iterations - for i in i ..< len: + for i in i ..< data.len: if data[i] != 0: data[i] = 255 From 3a41ff8e64baa2bfe1866e2b999299e124367aaf Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Thu, 30 Jun 2022 10:35:54 -0500 Subject: [PATCH 7/9] applyOpacitySimd --- src/pixie/images.nim | 6 +----- src/pixie/masks.nim | 6 +----- src/pixie/simd.nim | 37 ++++++++++++++++++++++++++----------- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 21b4a685..f0f8055c 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -368,11 +368,7 @@ proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} = return when allowSimd and compiles(applyOpacitySimd): - applyOpacitySimd( - cast[ptr UncheckedArray[uint8]](image.data[0].addr), - image.data.len * 4, - opacity - ) + applyOpacitySimd(image.data, opacity) return for i in 0 ..< image.data.len: diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim index 576c7ed0..9797ff0d 100644 --- a/src/pixie/masks.nim +++ b/src/pixie/masks.nim @@ -197,11 +197,7 @@ proc applyOpacity*(mask: Mask, opacity: float32) {.raises: [].} = return when allowSimd and compiles(applyOpacitySimd): - applyOpacitySimd( - cast[ptr UncheckedArray[uint8]](mask.data[0].addr), - mask.data.len, - opacity - ) + applyOpacitySimd(mask.data, opacity) return for i in 0 ..< mask.data.len: diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index 11b1da40..363d9003 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -338,19 +338,24 @@ when defined(amd64): if data[i] != 0: data[i] = 255 - proc applyOpacitySimd*( - data: ptr UncheckedArray[uint8], - len: int, - opacity: uint16 - ) = - var i: int + proc applyOpacitySimd*(data: var seq[uint8 | ColorRGBX], opacity: uint16) = + var + i: int + p = cast[uint](data[0].addr) + len = + when data is seq[ColorRGBX]: + data.len * 4 + else: + data.len + let oddMask = mm_set1_epi16(0xff00) div255 = mm_set1_epi16(0x8081) zeroVec = mm_setzero_si128() opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8) + iterations = len div 16 for _ in 0 ..< len div 16: - let values = mm_loadu_si128(data[i].addr) + let values = mm_loadu_si128(cast[pointer](p)) if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff: var valuesEven = mm_slli_epi16(values, 8) @@ -360,13 +365,23 @@ when defined(amd64): valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7) valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7) mm_storeu_si128( - data[i].addr, + cast[pointer](p), mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8)) ) - i += 16 + p += 16 + i += 16 * iterations - for i in i ..< len: - data[i] = ((data[i] * opacity) div 255).uint8 + when data is seq[ColorRGBX]: + for i in i div 4 ..< data.len: + var rgbx = data[i] + rgbx.r = ((rgbx.r * opacity) div 255).uint8 + rgbx.g = ((rgbx.g * opacity) div 255).uint8 + rgbx.b = ((rgbx.b * opacity) div 255).uint8 + rgbx.a = ((rgbx.a * opacity) div 255).uint8 + data[i] = rgbx + else: + for i in i ..< data.len: + data[i] = ((data[i] * opacity) div 255).uint8 when defined(release): {.pop.} From c4eadf31ed546f859e97e495aabfadee01d26b21 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Thu, 30 Jun 2022 10:48:30 -0500 Subject: [PATCH 8/9] isOneColorSimd isTransparentSimd --- src/pixie/images.nim | 10 +---- src/pixie/runtimechecked/avx2.nim | 18 ++++---- src/pixie/simd.nim | 72 ++++++++++++++++++------------- 3 files changed, 55 insertions(+), 45 deletions(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index f0f8055c..2469d237 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -102,10 +102,7 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} = proc isOneColor*(image: Image): bool {.raises: [].} = ## Checks if the entire image is the same color. when allowSimd and compiles(isOneColorSimd): - return isOneColorSimd( - cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr), - image.data.len - ) + return isOneColorSimd(image.data) result = true @@ -117,10 +114,7 @@ proc isOneColor*(image: Image): bool {.raises: [].} = proc isTransparent*(image: Image): bool {.raises: [].} = ## Checks if this image is fully transparent or not. when allowSimd and compiles(isTransparentSimd): - return isTransparentSimd( - cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr), - image.data.len - ) + return isTransparentSimd(image.data) result = true diff --git a/src/pixie/runtimechecked/avx2.nim b/src/pixie/runtimechecked/avx2.nim index 96eb1c4a..35399317 100644 --- a/src/pixie/runtimechecked/avx2.nim +++ b/src/pixie/runtimechecked/avx2.nim @@ -6,20 +6,21 @@ when defined(gcc) or defined(clang): when defined(release): {.push checks: off.} -proc isOneColorAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = +proc isOneColorAvx2*(data: var seq[ColorRGBX]): bool = result = true let color = data[0] var i: int - while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes + # Align to 32 bytes + while i < data.len and (cast[uint](data[i].addr) and 31) != 0: if data[i] != color: return false inc i let colorVec = mm256_set1_epi32(cast[int32](color)) - iterations = (len - i) div 16 + iterations = (data.len - i) div 16 for _ in 0 ..< iterations: let values0 = mm256_load_si256(data[i].addr) @@ -31,22 +32,23 @@ proc isOneColorAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = return false i += 16 - for i in i ..< len: + for i in i ..< data.len: if data[i] != color: return false -proc isTransparentAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = +proc isTransparentAvx2*(data: var seq[ColorRGBX]): bool = result = true var i: int - while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes + # Align to 32 bytes + while i < data.len and (cast[uint](data[i].addr) and 31) != 0: if data[i].a != 0: return false inc i let vecZero = mm256_setzero_si256() - iterations = (len - i) div 16 + iterations = (data.len - i) div 16 for _ in 0 ..< iterations: let values0 = mm256_load_si256(data[i].addr) @@ -57,7 +59,7 @@ proc isTransparentAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = return false i += 16 - for i in i ..< len: + for i in i ..< data.len: if data[i].a != 0: return false diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index 363d9003..6b852c47 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -62,29 +62,33 @@ when defined(amd64): for i in i ..< start + len: data[i] = rgbx - proc isOneColorSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = + proc isOneColorSimd*(data: var seq[ColorRGBX]): bool = if cpuHasAvx2: - return isOneColorAvx2(data, len) + return isOneColorAvx2(data) result = true let color = data[0] - var i: int - while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes + var + i: int + p = cast[uint](data[0].addr) + # Align to 16 bytes + while i < data.len and (p and 15) != 0: if data[i] != color: return false inc i + p += 4 let colorVec = mm_set1_epi32(cast[int32](color)) - iterations = (len - i) div 16 + iterations = (data.len - i) div 16 for _ in 0 ..< iterations: let - values0 = mm_load_si128(data[i].addr) - values1 = mm_load_si128(data[i + 4].addr) - values2 = mm_load_si128(data[i + 8].addr) - values3 = mm_load_si128(data[i + 12].addr) + values0 = mm_load_si128(cast[pointer](p)) + values1 = mm_load_si128(cast[pointer](p + 16)) + values2 = mm_load_si128(cast[pointer](p + 32)) + values3 = mm_load_si128(cast[pointer](p + 48)) eq0 = mm_cmpeq_epi8(values0, colorVec) eq1 = mm_cmpeq_epi8(values1, colorVec) eq2 = mm_cmpeq_epi8(values2, colorVec) @@ -92,41 +96,47 @@ when defined(amd64): eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3)) if mm_movemask_epi8(eq0123) != 0xffff: return false - i += 16 + p += 64 + i += 16 * iterations - for i in i ..< len: + for i in i ..< data.len: if data[i] != color: return false - proc isTransparentSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool = + proc isTransparentSimd*(data: var seq[ColorRGBX]): bool = if cpuHasAvx2: - return isTransparentAvx2(data, len) + return isTransparentAvx2(data) - var i: int - while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes + var + i: int + p = cast[uint](data[0].addr) + # Align to 16 bytes + while i < data.len and (p and 15) != 0: if data[i].a != 0: return false inc i + p += 4 result = true let vecZero = mm_setzero_si128() - iterations = (len - i) div 16 + iterations = (data.len - i) div 16 for _ in 0 ..< iterations: let - values0 = mm_load_si128(data[i].addr) - values1 = mm_load_si128(data[i + 4].addr) - values2 = mm_load_si128(data[i + 8].addr) - values3 = mm_load_si128(data[i + 12].addr) + values0 = mm_load_si128(cast[pointer](p)) + values1 = mm_load_si128(cast[pointer](p + 16)) + values2 = mm_load_si128(cast[pointer](p + 32)) + values3 = mm_load_si128(cast[pointer](p + 48)) values01 = mm_or_si128(values0, values1) values23 = mm_or_si128(values2, values3) values0123 = mm_or_si128(values01, values23) if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff: return false - i += 16 + p += 64 + i += 16 * iterations - for i in i ..< len: + for i in i ..< data.len: if data[i].a != 0: return false @@ -136,29 +146,33 @@ when defined(amd64): result = true - var i = start + var + i = start + p = cast[uint](data[0].addr) # Align to 16 bytes - while i < (start + len) and (cast[uint](data[i].addr) and 15) != 0: + while i < (start + len) and (p and 15) != 0: if data[i].a != 255: return false inc i + p += 4 let vec255 = mm_set1_epi8(255) iterations = (start + len - i) div 16 for _ in 0 ..< iterations: let - values0 = mm_load_si128(data[i].addr) - values1 = mm_load_si128(data[i + 4].addr) - values2 = mm_load_si128(data[i + 8].addr) - values3 = mm_load_si128(data[i + 12].addr) + values0 = mm_load_si128(cast[pointer](p)) + values1 = mm_load_si128(cast[pointer](p + 16)) + values2 = mm_load_si128(cast[pointer](p + 32)) + values3 = mm_load_si128(cast[pointer](p + 48)) values01 = mm_and_si128(values0, values1) values23 = mm_and_si128(values2, values3) values0123 = mm_and_si128(values01, values23) eq = mm_cmpeq_epi8(values0123, vec255) if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888: return false - i += 16 + p += 64 + i += 16 * iterations for i in i ..< start + len: if data[i].a != 255: From 5d27c93f6828975b667ca2b962034fd205ceb8f8 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Thu, 30 Jun 2022 10:58:59 -0500 Subject: [PATCH 9/9] newImageFromMaskSimd newMaskFromImageSimd --- src/pixie/images.nim | 12 ++---------- src/pixie/simd.nim | 20 ++++++-------------- 2 files changed, 8 insertions(+), 24 deletions(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index 2469d237..a0945453 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -33,11 +33,7 @@ proc newImage*(mask: Mask): Image {.raises: [PixieError].} = result = newImage(mask.width, mask.height) when allowSimd and compiles(newImageFromMaskSimd): - newImageFromMaskSimd( - cast[ptr UncheckedArray[ColorRGBX]](result.data[0].addr), - cast[ptr UncheckedArray[uint8]](mask.data[0].addr), - mask.data.len - ) + newImageFromMaskSimd(result.data, mask.data) return for i in 0 ..< mask.data.len: @@ -458,11 +454,7 @@ proc newMask*(image: Image): Mask {.raises: [PixieError].} = result = newMask(image.width, image.height) when allowSimd and compiles(newMaskFromImageSimd): - newMaskFromImageSimd( - cast[ptr UncheckedArray[uint8]](result.data[0].addr), - cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr), - image.data.len - ) + newMaskFromImageSimd(result.data, image.data) return for i in 0 ..< image.data.len: diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim index 6b852c47..1ed5baf1 100644 --- a/src/pixie/simd.nim +++ b/src/pixie/simd.nim @@ -220,13 +220,9 @@ when defined(amd64): c.b = ((c.b.uint32 * c.a) div 255).uint8 data[i] = c - proc newImageFromMaskSimd*( - dst: ptr UncheckedArray[ColorRGBX], - src: ptr UncheckedArray[uint8], - len: int - ) = + proc newImageFromMaskSimd*(dst: var seq[ColorRGBX], src: var seq[uint8]) = var i: int - for _ in 0 ..< len div 16: + for _ in 0 ..< src.len div 16: var alphas = mm_loadu_si128(src[i].addr) for j in 0 ..< 4: var unpacked = unpackAlphaValues(alphas) @@ -236,17 +232,13 @@ when defined(amd64): alphas = mm_srli_si128(alphas, 4) i += 16 - for i in i ..< len: + for i in i ..< src.len: let v = src[i] dst[i] = rgbx(v, v, v, v) - proc newMaskFromImageSimd*( - dst: ptr UncheckedArray[uint8], - src: ptr UncheckedArray[ColorRGBX], - len: int - ) = + proc newMaskFromImageSimd*(dst: var seq[uint8], src: var seq[ColorRGBX]) = var i: int - for _ in 0 ..< len div 16: + for _ in 0 ..< src.len div 16: let a = mm_loadu_si128(src[i + 0].addr) b = mm_loadu_si128(src[i + 4].addr) @@ -258,7 +250,7 @@ when defined(amd64): ) i += 16 - for i in i ..< len: + for i in i ..< src.len: dst[i] = src[i].a proc invertImageSimd*(data: var seq[ColorRGBX]) =