Pass seq buffers (instead of raw pointer + length pairs) to the SIMD kernels.

Review notes (this preamble is ignored by `git apply` / `patch`):
  * invertImageSimd / invertMaskSimd: the vector iteration count now excludes
    the elements already handled by the alignment loop, so the aligned
    loads/stores can no longer run past the end of the seq.
  * isOpaqueSimd: the raw pointer now starts at data[start] (it previously
    started at data[0] while the index started at `start`).
  * masks.ceil: the compile-time guard now tests ceilMaskSimd, the proc that
    is actually called.
  * The `index` blob hashes for masks.nim and simd.nim predate these fixes.

diff --git a/src/pixie/images.nim b/src/pixie/images.nim
index cd838b8e..a0945453 100644
--- a/src/pixie/images.nim
+++ b/src/pixie/images.nim
@@ -33,11 +33,7 @@ proc newImage*(mask: Mask): Image {.raises: [PixieError].} =
   result = newImage(mask.width, mask.height)
 
   when allowSimd and compiles(newImageFromMaskSimd):
-    newImageFromMaskSimd(
-      cast[ptr UncheckedArray[ColorRGBX]](result.data[0].addr),
-      cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
-      mask.data.len
-    )
+    newImageFromMaskSimd(result.data, mask.data)
     return
 
   for i in 0 ..< mask.data.len:
@@ -102,10 +98,7 @@ proc fill*(image: Image, color: SomeColor) {.inline, raises: [].} =
 proc isOneColor*(image: Image): bool {.raises: [].} =
   ## Checks if the entire image is the same color.
   when allowSimd and compiles(isOneColorSimd):
-    return isOneColorSimd(
-      cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
-      image.data.len
-    )
+    return isOneColorSimd(image.data)
 
   result = true
 
@@ -117,10 +110,7 @@ proc isOneColor*(image: Image): bool {.raises: [].} =
 proc isTransparent*(image: Image): bool {.raises: [].} =
   ## Checks if this image is fully transparent or not.
   when allowSimd and compiles(isTransparentSimd):
-    return isTransparentSimd(
-      cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
-      image.data.len
-    )
+    return isTransparentSimd(image.data)
 
   result = true
 
@@ -368,11 +358,7 @@ proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} =
     return
 
   when allowSimd and compiles(applyOpacitySimd):
-    applyOpacitySimd(
-      cast[ptr UncheckedArray[uint8]](image.data[0].addr),
-      image.data.len * 4,
-      opacity
-    )
+    applyOpacitySimd(image.data, opacity)
     return
 
   for i in 0 ..< image.data.len:
@@ -386,10 +372,7 @@ proc applyOpacity*(image: Image, opacity: float32) {.raises: [].} =
 proc invert*(image: Image) {.raises: [].} =
   ## Inverts all of the colors and alpha.
   when allowSimd and compiles(invertImageSimd):
-    invertImageSimd(
-      cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
-      image.data.len
-    )
+    invertImageSimd(image.data)
     return
 
   for i in 0 ..< image.data.len:
@@ -471,11 +454,7 @@ proc newMask*(image: Image): Mask {.raises: [PixieError].} =
   result = newMask(image.width, image.height)
 
   when allowSimd and compiles(newMaskFromImageSimd):
-    newMaskFromImageSimd(
-      cast[ptr UncheckedArray[uint8]](result.data[0].addr),
-      cast[ptr UncheckedArray[ColorRGBX]](image.data[0].addr),
-      image.data.len
-    )
+    newMaskFromImageSimd(result.data, image.data)
     return
 
   for i in 0 ..< image.data.len:
diff --git a/src/pixie/internal.nim b/src/pixie/internal.nim
index 25248476..18ee742a 100644
--- a/src/pixie/internal.nim
+++ b/src/pixie/internal.nim
@@ -79,16 +79,12 @@ proc fillUnsafe*(
 ) {.raises: [].} =
   ## Fills the image data with the color starting at index start and
   ## continuing for len indices.
-  let rgbx = color.asRgbx()
-
   when allowSimd and compiles(fillUnsafeSimd):
-    fillUnsafeSimd(
-      cast[ptr UncheckedArray[ColorRGBX]](data[start].addr),
-      len,
-      rgbx
-    )
+    fillUnsafeSimd(data, start, len, color)
     return
 
+  let rgbx = color.asRgbx()
+
   # Use memset when every byte has the same value
   if rgbx.r == rgbx.g and rgbx.r == rgbx.b and rgbx.r == rgbx.a:
     nimSetMem(data[start].addr, rgbx.r.cint, len * 4)
@@ -117,10 +113,7 @@ proc toStraightAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
 proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].} =
   ## Converts an image to premultiplied alpha from straight alpha.
   when allowSimd and compiles(toPremultipliedAlphaSimd):
-    toPremultipliedAlphaSimd(
-      cast[ptr UncheckedArray[uint32]](data[0].addr),
-      data.len
-    )
+    toPremultipliedAlphaSimd(data)
     return
 
   for i in 0 ..< data.len:
@@ -133,10 +126,7 @@ proc toPremultipliedAlpha*(data: var seq[ColorRGBA | ColorRGBX]) {.raises: [].}
 
 proc isOpaque*(data: var seq[ColorRGBX], start, len: int): bool =
   when allowSimd and compiles(isOpaqueSimd):
-    return isOpaqueSimd(
-      cast[ptr UncheckedArray[ColorRGBX]](data[start].addr),
-      len
-    )
+    return isOpaqueSimd(data, start, len)
 
   result = true
 
diff --git a/src/pixie/masks.nim b/src/pixie/masks.nim
index 40b2ea48..9797ff0d 100644
--- a/src/pixie/masks.nim
+++ b/src/pixie/masks.nim
@@ -1,7 +1,10 @@
 import common, internal, vmath
 
-when defined(amd64) and allowSimd:
-  import nimsimd/sse2
+when allowSimd:
+  import simd
+
+  when defined(amd64):
+    import nimsimd/sse2
 
 type
   Mask* = ref object
@@ -194,11 +197,7 @@ proc applyOpacity*(mask: Mask, opacity: float32) {.raises: [].} =
     return
 
   when allowSimd and compiles(applyOpacitySimd):
-    applyOpacitySimd(
-      cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
-      mask.data.len,
-      opacity
-    )
+    applyOpacitySimd(mask.data, opacity)
     return
 
   for i in 0 ..< mask.data.len:
@@ -234,11 +233,8 @@ proc getValueSmooth*(mask: Mask, x, y: float32): uint8 {.raises: [].} =
 
 proc invert*(mask: Mask) {.raises: [].} =
   ## Inverts all of the values - creates a negative of the mask.
-  when allowSimd and compiles(invertImageSimd):
-    invertMaskSimd(
-      cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
-      mask.data.len
-    )
+  when allowSimd and compiles(invertMaskSimd):
+    invertMaskSimd(mask.data)
     return
 
   for i in 0 ..< mask.data.len:
@@ -308,10 +304,7 @@ proc spread*(mask: Mask, spread: float32) {.raises: [PixieError].} =
 proc ceil*(mask: Mask) {.raises: [].} =
   ## A value of 0 stays 0. Anything else turns into 255.
-  when allowSimd and compiles(invertImageSimd):
-    ceilMaskSimd(
-      cast[ptr UncheckedArray[uint8]](mask.data[0].addr),
-      mask.data.len
-    )
+  when allowSimd and compiles(ceilMaskSimd):
+    ceilMaskSimd(mask.data)
     return
 
   for i in 0 ..< mask.data.len:
diff --git a/src/pixie/runtimechecked/avx.nim b/src/pixie/runtimechecked/avx.nim
index cb6d8e09..c18e9c64 100644
--- a/src/pixie/runtimechecked/avx.nim
+++ b/src/pixie/runtimechecked/avx.nim
@@ -7,23 +7,30 @@ when defined(release):
   {.push checks: off.}
 
 proc fillUnsafeAvx*(
-  data: ptr UncheckedArray[ColorRGBX],
-  len: int,
-  rgbx: ColorRGBX
+  data: var seq[ColorRGBX],
+  start, len: int,
+  color: SomeColor
 ) =
-  var i: int
-  while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
+  let rgbx = color.asRgbx()
+
+  var
+    i = start
+    p = cast[uint](data[i].addr)
+  # Align to 32 bytes
+  while i < (start + len) and (p and 31) != 0:
     data[i] = rgbx
     inc i
+    p += 4
 
   let
-    iterations = (len - i) div 8
     colorVec = mm256_set1_epi32(cast[int32](rgbx))
+    iterations = (start + len - i) div 8
   for _ in 0 ..< iterations:
-    mm256_store_si256(data[i].addr, colorVec)
-    i += 8
-  # Fill whatever is left the slow way
-  for i in i ..< len:
+    mm256_store_si256(cast[pointer](p), colorVec)
+    p += 32
+  i += 8 * iterations
+
+  for i in i ..< start + len:
     data[i] = rgbx
 
 when defined(release):
diff --git a/src/pixie/runtimechecked/avx2.nim b/src/pixie/runtimechecked/avx2.nim
index da34dd4e..35399317 100644
--- a/src/pixie/runtimechecked/avx2.nim
+++ b/src/pixie/runtimechecked/avx2.nim
@@ -6,20 +6,21 @@ when defined(gcc) or defined(clang):
 when defined(release):
   {.push checks: off.}
 
-proc isOneColorAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
+proc isOneColorAvx2*(data: var seq[ColorRGBX]): bool =
   result = true
 
   let color = data[0]
 
   var i: int
-  while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
+  # Align to 32 bytes
+  while i < data.len and (cast[uint](data[i].addr) and 31) != 0:
     if data[i] != color:
       return false
     inc i
 
   let
     colorVec = mm256_set1_epi32(cast[int32](color))
-    iterations = (len - i) div 16
+    iterations = (data.len - i) div 16
   for _ in 0 ..< iterations:
     let
       values0 = mm256_load_si256(data[i].addr)
@@ -31,22 +32,23 @@ proc isOneColorAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
       return false
     i += 16
 
-  for i in i ..< len:
+  for i in i ..< data.len:
     if data[i] != color:
       return false
 
-proc isTransparentAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
+proc isTransparentAvx2*(data: var seq[ColorRGBX]): bool =
   result = true
 
   var i: int
-  while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
+  # Align to 32 bytes
+  while i < data.len and (cast[uint](data[i].addr) and 31) != 0:
     if data[i].a != 0:
       return false
     inc i
 
   let
     vecZero = mm256_setzero_si256()
-    iterations = (len - i) div 16
+    iterations = (data.len - i) div 16
   for _ in 0 ..< iterations:
     let
       values0 = mm256_load_si256(data[i].addr)
@@ -57,22 +59,23 @@ proc isTransparentAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
       return false
     i += 16
 
-  for i in i ..< len:
+  for i in i ..< data.len:
     if data[i].a != 0:
       return false
 
-proc isOpaqueAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
+proc isOpaqueAvx2*(data: var seq[ColorRGBX], start, len: int): bool =
   result = true
 
-  var i: int
-  while i < len and (cast[uint](data[i].addr) and 31) != 0: # Align to 32 bytes
+  var i = start
+  # Align to 32 bytes
+  while i < (start + len) and (cast[uint](data[i].addr) and 31) != 0:
     if data[i].a != 255:
       return false
     inc i
 
   let
     vec255 = mm256_set1_epi8(255)
-    iterations = (len - i) div 16
+    iterations = (start + len - i) div 16
   for _ in 0 ..< iterations:
     let
       values0 = mm256_load_si256(data[i].addr)
@@ -83,21 +86,21 @@ proc isOpaqueAvx2*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
       return false
     i += 16
 
-  for i in i ..< len:
+  for i in i ..< start + len:
     if data[i].a != 255:
       return false
 
-proc toPremultipliedAlphaAvx2*(
-  data: ptr UncheckedArray[uint32],
-  len: int
-): int =
+proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) =
+  var i: int
+
   let
     alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
-    oddMask = mm256_set1_epi16(cast[int16](0xff00))
-    div255 = mm256_set1_epi16(cast[int16](0x8081))
-  for _ in 0 ..< len div 8:
+    oddMask = mm256_set1_epi16(0xff00)
+    div255 = mm256_set1_epi16(0x8081)
+    iterations = data.len div 8
+  for _ in 0 ..< iterations:
     let
-      values = mm256_loadu_si256(data[result].addr)
+      values = mm256_loadu_si256(data[i].addr)
       alpha = mm256_and_si256(values, alphaMask)
       eq = mm256_cmpeq_epi8(values, alphaMask)
     if (mm256_movemask_epi8(eq) and 0x88888888) != 0x88888888:
@@ -112,10 +115,18 @@ proc toPremultipliedAlphaAvx2*(
       colorsEven = mm256_srli_epi16(mm256_mulhi_epu16(colorsEven, div255), 7)
       colorsOdd = mm256_srli_epi16(mm256_mulhi_epu16(colorsOdd, div255), 7)
       mm256_storeu_si256(
-        data[result].addr,
+        data[i].addr,
         mm256_or_si256(colorsEven, mm256_slli_epi16(colorsOdd, 8))
       )
-    result += 8
+    i += 8
+
+  for i in i ..< data.len:
+    var c = data[i]
+    if c.a != 255:
+      c.r = ((c.r.uint32 * c.a) div 255).uint8
+      c.g = ((c.g.uint32 * c.a) div 255).uint8
+      c.b = ((c.b.uint32 * c.a) div 255).uint8
+      data[i] = c
 
 when defined(release):
   {.pop.}
diff --git a/src/pixie/simd.nim b/src/pixie/simd.nim
index d789bdfb..1ed5baf1 100644
--- a/src/pixie/simd.nim
+++ b/src/pixie/simd.nim
@@ -1,4 +1,4 @@
-import chroma, vmath
+import chroma
 
 when defined(release):
   {.push checks: off.}
@@ -31,52 +31,64 @@ when defined(amd64):
     result = mm_unpacklo_epi8(mm_setzero_si128(), result)
 
   proc fillUnsafeSimd*(
-    data: ptr UncheckedArray[ColorRGBX],
-    len: int,
-    rgbx: ColorRGBX
+    data: var seq[ColorRGBX],
+    start, len: int,
+    color: SomeColor
   ) =
-    if cpuHasAvx and len >= 64:
-      fillUnsafeAvx(data, len, rgbx)
-    else:
-      var i: int
-      while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes
-        data[i] = rgbx
-        inc i
+    if cpuHasAvx:
+      fillUnsafeAvx(data, start, len, color)
+      return
 
-      let
-        colorVec = mm_set1_epi32(cast[int32](rgbx))
-        iterations = (len - i) div 8
-      for _ in 0 ..< iterations:
-        mm_store_si128(data[i].addr, colorVec)
-        mm_store_si128(data[i + 4].addr, colorVec)
-        i += 8
-
-      for i in i ..< len:
-        data[i] = rgbx
+    let rgbx = color.asRgbx()
+
+    var
+      i = start
+      p = cast[uint](data[i].addr)
+    # Align to 16 bytes
+    while i < (start + len) and (p and 15) != 0:
+      data[i] = rgbx
+      inc i
+      p += 4
+
+    let
+      colorVec = mm_set1_epi32(cast[int32](rgbx))
+      iterations = (start + len - i) div 8
+    for _ in 0 ..< iterations:
+      mm_store_si128(cast[pointer](p), colorVec)
+      mm_store_si128(cast[pointer](p + 16), colorVec)
+      p += 32
+    i += iterations * 8
+
+    for i in i ..< start + len:
+      data[i] = rgbx
 
-  proc isOneColorSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
+  proc isOneColorSimd*(data: var seq[ColorRGBX]): bool =
     if cpuHasAvx2:
-      return isOneColorAvx2(data, len)
+      return isOneColorAvx2(data)
 
     result = true
 
     let color = data[0]
 
-    var i: int
-    while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes
+    var
+      i: int
+      p = cast[uint](data[0].addr)
+    # Align to 16 bytes
+    while i < data.len and (p and 15) != 0:
       if data[i] != color:
         return false
       inc i
+      p += 4
 
     let
       colorVec = mm_set1_epi32(cast[int32](color))
-      iterations = (len - i) div 16
+      iterations = (data.len - i) div 16
     for _ in 0 ..< iterations:
       let
-        values0 = mm_load_si128(data[i].addr)
-        values1 = mm_load_si128(data[i + 4].addr)
-        values2 = mm_load_si128(data[i + 8].addr)
-        values3 = mm_load_si128(data[i + 12].addr)
+        values0 = mm_load_si128(cast[pointer](p))
+        values1 = mm_load_si128(cast[pointer](p + 16))
+        values2 = mm_load_si128(cast[pointer](p + 32))
+        values3 = mm_load_si128(cast[pointer](p + 48))
         eq0 = mm_cmpeq_epi8(values0, colorVec)
         eq1 = mm_cmpeq_epi8(values1, colorVec)
         eq2 = mm_cmpeq_epi8(values2, colorVec)
@@ -84,123 +96,133 @@ when defined(amd64):
         eq0123 = mm_and_si128(mm_and_si128(eq0, eq1), mm_and_si128(eq2, eq3))
       if mm_movemask_epi8(eq0123) != 0xffff:
         return false
-      i += 16
+      p += 64
+    i += 16 * iterations
 
-    for i in i ..< len:
+    for i in i ..< data.len:
       if data[i] != color:
         return false
 
-  proc isTransparentSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
+  proc isTransparentSimd*(data: var seq[ColorRGBX]): bool =
     if cpuHasAvx2:
-      return isTransparentAvx2(data, len)
+      return isTransparentAvx2(data)
 
-    var i: int
-    while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes
+    var
+      i: int
+      p = cast[uint](data[0].addr)
+    # Align to 16 bytes
+    while i < data.len and (p and 15) != 0:
       if data[i].a != 0:
         return false
       inc i
+      p += 4
 
     result = true
 
     let
       vecZero = mm_setzero_si128()
-      iterations = (len - i) div 16
+      iterations = (data.len - i) div 16
     for _ in 0 ..< iterations:
       let
-        values0 = mm_load_si128(data[i].addr)
-        values1 = mm_load_si128(data[i + 4].addr)
-        values2 = mm_load_si128(data[i + 8].addr)
-        values3 = mm_load_si128(data[i + 12].addr)
+        values0 = mm_load_si128(cast[pointer](p))
+        values1 = mm_load_si128(cast[pointer](p + 16))
+        values2 = mm_load_si128(cast[pointer](p + 32))
+        values3 = mm_load_si128(cast[pointer](p + 48))
         values01 = mm_or_si128(values0, values1)
         values23 = mm_or_si128(values2, values3)
         values0123 = mm_or_si128(values01, values23)
       if mm_movemask_epi8(mm_cmpeq_epi8(values0123, vecZero)) != 0xffff:
         return false
-      i += 16
+      p += 64
+    i += 16 * iterations
 
-    for i in i ..< len:
+    for i in i ..< data.len:
       if data[i].a != 0:
         return false
 
-  proc isOpaqueSimd*(data: ptr UncheckedArray[ColorRGBX], len: int): bool =
+  proc isOpaqueSimd*(data: var seq[ColorRGBX], start, len: int): bool =
     if cpuHasAvx2:
-      return isOpaqueAvx2(data, len)
+      return isOpaqueAvx2(data, start, len)
 
     result = true
 
-    var i: int
-    while i < len and (cast[uint](data[i].addr) and 15) != 0: # Align to 16 bytes
+    var
+      i = start
+      p = cast[uint](data[i].addr) # must track data[i], not data[0]
+    # Align to 16 bytes
+    while i < (start + len) and (p and 15) != 0:
       if data[i].a != 255:
         return false
       inc i
+      p += 4
 
     let
       vec255 = mm_set1_epi8(255)
-      iterations = (len - i) div 16
+      iterations = (start + len - i) div 16
     for _ in 0 ..< iterations:
       let
-        values0 = mm_load_si128(data[i].addr)
-        values1 = mm_load_si128(data[i + 4].addr)
-        values2 = mm_load_si128(data[i + 8].addr)
-        values3 = mm_load_si128(data[i + 12].addr)
+        values0 = mm_load_si128(cast[pointer](p))
+        values1 = mm_load_si128(cast[pointer](p + 16))
+        values2 = mm_load_si128(cast[pointer](p + 32))
+        values3 = mm_load_si128(cast[pointer](p + 48))
         values01 = mm_and_si128(values0, values1)
         values23 = mm_and_si128(values2, values3)
         values0123 = mm_and_si128(values01, values23)
         eq = mm_cmpeq_epi8(values0123, vec255)
       if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
         return false
-      i += 16
+      p += 64
+    i += 16 * iterations
 
-    for i in i ..< len:
+    for i in i ..< start + len:
       if data[i].a != 255:
         return false
 
-  proc toPremultipliedAlphaSimd*(data: ptr UncheckedArray[uint32], len: int) =
-    var i: int
+  proc toPremultipliedAlphaSimd*(data: var seq[ColorRGBA | ColorRGBX]) =
     if cpuHasAvx2:
-      i = toPremultipliedAlphaAvx2(data, len)
-    else:
+      toPremultipliedAlphaAvx2(data)
+      return
+
+    var i: int
+
+    let
+      alphaMask = mm_set1_epi32(cast[int32](0xff000000))
+      oddMask = mm_set1_epi16(0xff00)
+      div255 = mm_set1_epi16(0x8081)
+      iterations = data.len div 4
+    for _ in 0 ..< iterations:
       let
-        alphaMask = mm_set1_epi32(cast[int32](0xff000000))
-        oddMask = mm_set1_epi16(cast[int16](0xff00))
-        div255 = mm_set1_epi16(cast[int16](0x8081))
-      for _ in 0 ..< len div 4:
+        values = mm_loadu_si128(data[i].addr)
+        alpha = mm_and_si128(values, alphaMask)
+        eq = mm_cmpeq_epi8(values, alphaMask)
+      if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
         let
-          values = mm_loadu_si128(data[i].addr)
-          alpha = mm_and_si128(values, alphaMask)
-          eq = mm_cmpeq_epi8(values, alphaMask)
-        if (mm_movemask_epi8(eq) and 0x00008888) != 0x00008888:
-          let
-            evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
-            oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
-          var
-            colorsEven = mm_slli_epi16(values, 8)
-            colorsOdd = mm_and_si128(values, oddMask)
-          colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
-          colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
-          colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
-          colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
-          mm_storeu_si128(
-            data[i].addr,
-            mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
-          )
-        i += 4
-
-    for i in i ..< len:
-      var c: ColorRGBX
-      copyMem(c.addr, data[i].addr, 4)
-      c.r = ((c.r.uint32 * c.a) div 255).uint8
-      c.g = ((c.g.uint32 * c.a) div 255).uint8
-      c.b = ((c.b.uint32 * c.a) div 255).uint8
-      copyMem(data[i].addr, c.addr, 4)
-
-  proc newImageFromMaskSimd*(
-    dst: ptr UncheckedArray[ColorRGBX],
-    src: ptr UncheckedArray[uint8],
-    len: int
-  ) =
+          evenMultiplier = mm_or_si128(alpha, mm_srli_epi32(alpha, 16))
+          oddMultiplier = mm_or_si128(evenMultiplier, alphaMask)
+        var
+          colorsEven = mm_slli_epi16(values, 8)
+          colorsOdd = mm_and_si128(values, oddMask)
+        colorsEven = mm_mulhi_epu16(colorsEven, evenMultiplier)
+        colorsOdd = mm_mulhi_epu16(colorsOdd, oddMultiplier)
+        colorsEven = mm_srli_epi16(mm_mulhi_epu16(colorsEven, div255), 7)
+        colorsOdd = mm_srli_epi16(mm_mulhi_epu16(colorsOdd, div255), 7)
+        mm_storeu_si128(
+          data[i].addr,
+          mm_or_si128(colorsEven, mm_slli_epi16(colorsOdd, 8))
+        )
+      i += 4
+
+    for i in i ..< data.len:
+      var c = data[i]
+      if c.a != 255:
+        c.r = ((c.r.uint32 * c.a) div 255).uint8
+        c.g = ((c.g.uint32 * c.a) div 255).uint8
+        c.b = ((c.b.uint32 * c.a) div 255).uint8
+        data[i] = c
+
+  proc newImageFromMaskSimd*(dst: var seq[ColorRGBX], src: var seq[uint8]) =
     var i: int
-    for _ in 0 ..< len div 16:
+    for _ in 0 ..< src.len div 16:
       var alphas = mm_loadu_si128(src[i].addr)
       for j in 0 ..< 4:
         var unpacked = unpackAlphaValues(alphas)
@@ -210,17 +232,13 @@ when defined(amd64):
         alphas = mm_srli_si128(alphas, 4)
       i += 16
 
-    for i in i ..< len:
+    for i in i ..< src.len:
       let v = src[i]
       dst[i] = rgbx(v, v, v, v)
 
-  proc newMaskFromImageSimd*(
-    dst: ptr UncheckedArray[uint8],
-    src: ptr UncheckedArray[ColorRGBX],
-    len: int
-  ) =
+  proc newMaskFromImageSimd*(dst: var seq[uint8], src: var seq[ColorRGBX]) =
     var i: int
-    for _ in 0 ..< len div 16:
+    for _ in 0 ..< src.len div 16:
       let
         a = mm_loadu_si128(src[i + 0].addr)
         b = mm_loadu_si128(src[i + 4].addr)
@@ -232,25 +250,41 @@ when defined(amd64):
       )
       i += 16
 
-    for i in i ..< len:
+    for i in i ..< src.len:
       dst[i] = src[i].a
 
-  proc invertImageSimd*(data: ptr UncheckedArray[ColorRGBX], len: int) =
-    var i: int
-    let vec255 = mm_set1_epi8(cast[int8](255))
-    for _ in 0 ..< len div 16:
-      let
-        a = mm_loadu_si128(data[i + 0].addr)
-        b = mm_loadu_si128(data[i + 4].addr)
-        c = mm_loadu_si128(data[i + 8].addr)
-        d = mm_loadu_si128(data[i + 12].addr)
-      mm_storeu_si128(data[i + 0].addr, mm_sub_epi8(vec255, a))
-      mm_storeu_si128(data[i + 4].addr, mm_sub_epi8(vec255, b))
-      mm_storeu_si128(data[i + 8].addr, mm_sub_epi8(vec255, c))
-      mm_storeu_si128(data[i + 12].addr, mm_sub_epi8(vec255, d))
-      i += 16
+  proc invertImageSimd*(data: var seq[ColorRGBX]) =
+    var
+      i: int
+      p = cast[uint](data[0].addr)
+    # Align to 16 bytes
+    while i < data.len and (p and 15) != 0:
+      var rgbx = data[i]
+      rgbx.r = 255 - rgbx.r
+      rgbx.g = 255 - rgbx.g
+      rgbx.b = 255 - rgbx.b
+      rgbx.a = 255 - rgbx.a
+      data[i] = rgbx
+      inc i
+      p += 4
 
-    for i in i ..< len:
+    let
+      vec255 = mm_set1_epi8(255)
+      iterations = (data.len - i) div 16 # exclude the aligned prefix
+    for _ in 0 ..< iterations:
+      let
+        a = mm_load_si128(cast[pointer](p))
+        b = mm_load_si128(cast[pointer](p + 16))
+        c = mm_load_si128(cast[pointer](p + 32))
+        d = mm_load_si128(cast[pointer](p + 48))
+      mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
+      mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
+      mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
+      mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
+      p += 64
+    i += 16 * iterations
+
+    for i in i ..< data.len:
       var rgbx = data[i]
       rgbx.r = 255 - rgbx.r
       rgbx.g = 255 - rgbx.g
@@ -258,49 +292,76 @@ when defined(amd64):
       rgbx.a = 255 - rgbx.a
       data[i] = rgbx
 
-    toPremultipliedAlphaSimd(cast[ptr UncheckedArray[uint32]](data), len)
+    toPremultipliedAlphaSimd(data)
 
-  proc invertMaskSimd*(data: ptr UncheckedArray[uint8], len: int) =
-    var i: int
-    let vec255 = mm_set1_epi8(255)
-    for _ in 0 ..< len div 16:
-      var values = mm_loadu_si128(data[i].addr)
-      values = mm_sub_epi8(vec255, values)
-      mm_storeu_si128(data[i].addr, values)
-      i += 16
+  proc invertMaskSimd*(data: var seq[uint8]) =
+    var
+      i: int
+      p = cast[uint](data[0].addr)
+    # Align to 16 bytes
+    while i < data.len and (p and 15) != 0:
+      data[i] = 255 - data[i]
+      inc i
+      inc p
 
-    for j in i ..< len:
-      data[j] = 255 - data[j]
+    let
+      vec255 = mm_set1_epi8(255)
+      iterations = (data.len - i) div 64 # exclude the aligned prefix
+    for _ in 0 ..< iterations:
+      let
+        a = mm_load_si128(cast[pointer](p))
+        b = mm_load_si128(cast[pointer](p + 16))
+        c = mm_load_si128(cast[pointer](p + 32))
+        d = mm_load_si128(cast[pointer](p + 48))
+      mm_store_si128(cast[pointer](p), mm_sub_epi8(vec255, a))
+      mm_store_si128(cast[pointer](p + 16), mm_sub_epi8(vec255, b))
+      mm_store_si128(cast[pointer](p + 32), mm_sub_epi8(vec255, c))
+      mm_store_si128(cast[pointer](p + 48), mm_sub_epi8(vec255, d))
+      p += 64
+    i += 64 * iterations
+
+    for i in i ..< data.len:
+      data[i] = 255 - data[i]
+
+  proc ceilMaskSimd*(data: var seq[uint8]) =
+    var
+      i: int
+      p = cast[uint](data[0].addr)
 
-  proc ceilMaskSimd*(data: ptr UncheckedArray[uint8], len: int) =
-    var i: int
     let
       zeroVec = mm_setzero_si128()
       vec255 = mm_set1_epi8(255)
-    for _ in 0 ..< len div 16:
-      var values = mm_loadu_si128(data[i].addr)
+      iterations = data.len div 16
+    for _ in 0 ..< iterations:
+      var values = mm_loadu_si128(cast[pointer](p))
       values = mm_cmpeq_epi8(values, zeroVec)
       values = mm_andnot_si128(values, vec255)
-      mm_storeu_si128(data[i].addr, values)
-      i += 16
+      mm_storeu_si128(cast[pointer](p), values)
+      p += 16
+    i += 16 * iterations
 
-    for i in i ..< len:
+    for i in i ..< data.len:
       if data[i] != 0:
         data[i] = 255
 
-  proc applyOpacitySimd*(
-    data: ptr UncheckedArray[uint8],
-    len: int,
-    opacity: uint16
-  ) =
-    var i: int
+  proc applyOpacitySimd*(data: var seq[uint8 | ColorRGBX], opacity: uint16) =
+    var
+      i: int
+      p = cast[uint](data[0].addr)
+      len =
+        when data is seq[ColorRGBX]:
+          data.len * 4
+        else:
+          data.len
+
     let
-      oddMask = mm_set1_epi16(cast[int16](0xff00))
-      div255 = mm_set1_epi16(cast[int16](0x8081))
+      oddMask = mm_set1_epi16(0xff00)
+      div255 = mm_set1_epi16(0x8081)
       zeroVec = mm_setzero_si128()
-      opacityVec = mm_slli_epi16(mm_set1_epi16(cast[int16](opacity)), 8)
+      opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8)
+      iterations = len div 16
     for _ in 0 ..< len div 16:
-      let values = mm_loadu_si128(data[i].addr)
+      let values = mm_loadu_si128(cast[pointer](p))
       if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff:
         var
           valuesEven = mm_slli_epi16(values, 8)
@@ -310,13 +371,23 @@ when defined(amd64):
         valuesEven = mm_srli_epi16(mm_mulhi_epu16(valuesEven, div255), 7)
         valuesOdd = mm_srli_epi16(mm_mulhi_epu16(valuesOdd, div255), 7)
         mm_storeu_si128(
-          data[i].addr,
+          cast[pointer](p),
           mm_or_si128(valuesEven, mm_slli_epi16(valuesOdd, 8))
         )
-      i += 16
-
-    for i in i ..< len:
-      data[i] = ((data[i] * opacity) div 255).uint8
+      p += 16
+    i += 16 * iterations
+
+    when data is seq[ColorRGBX]:
+      for i in i div 4 ..< data.len:
+        var rgbx = data[i]
+        rgbx.r = ((rgbx.r * opacity) div 255).uint8
+        rgbx.g = ((rgbx.g * opacity) div 255).uint8
+        rgbx.b = ((rgbx.b * opacity) div 255).uint8
+        rgbx.a = ((rgbx.a * opacity) div 255).uint8
+        data[i] = rgbx
+    else:
+      for i in i ..< data.len:
+        data[i] = ((data[i] * opacity) div 255).uint8
 
 when defined(release):
   {.pop.}