From c4cd380676a73b0b5ac3663c51d502e541cad88e Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sat, 30 Jul 2022 23:14:02 -0500 Subject: [PATCH 1/6] align blit simd blends --- src/pixie/simd/avx2.nim | 22 ++++++++++++++-------- src/pixie/simd/sse2.nim | 22 ++++++++++++++-------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index 7bd9c9c6..9375075e 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -383,6 +383,11 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} = proc blitLineNormalAvx2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = + var i: int + while (cast[uint](a[i].addr) and 31) != 0: + a[i] = blendNormal(a[i], b[i]) + inc i + let alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) oddMask = mm256_set1_epi16(cast[int16](0xff00)) @@ -393,8 +398,6 @@ proc blitLineNormalAvx2*( 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1, 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1 ) - - var i: int while i < len - 8: let source = mm256_loadu_si256(b[i].addr) @@ -402,7 +405,7 @@ proc blitLineNormalAvx2*( if (mm256_movemask_epi8(eq255) and 0x88888888) == 0x88888888: # Opaque source mm256_storeu_si256(a[i].addr, source) else: - let backdrop = mm256_loadu_si256(a[i].addr) + let backdrop = mm256_load_si256(a[i].addr) var sourceAlpha = mm256_and_si256(source, alphaMask) @@ -423,7 +426,7 @@ proc blitLineNormalAvx2*( mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) ) - mm256_storeu_si256(a[i].addr, added) + mm256_store_si256(a[i].addr, added) i += 8 @@ -433,6 +436,11 @@ proc blitLineNormalAvx2*( proc blitLineMaskAvx2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = + var i: int + while (cast[uint](a[i].addr) and 31) != 0: + a[i] = blendMask(a[i], b[i]) + inc i + let alphaMask = mm256_set1_epi32(cast[int32](0xff000000)) oddMask = mm256_set1_epi16(cast[int16](0xff00)) @@ -442,8 +450,6 @@ proc blitLineMaskAvx2*( 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1, 15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1 ) - - var i: int while i < len - 8: let source = mm256_loadu_si256(b[i].addr) @@ -451,7 +457,7 @@ proc blitLineMaskAvx2*( if (mm256_movemask_epi8(eq255) and 0x88888888) == 0x88888888: # Opaque source discard else: - let backdrop = mm256_loadu_si256(a[i].addr) + let backdrop = mm256_load_si256(a[i].addr) var sourceAlpha = mm256_and_si256(source, alphaMask) @@ -465,7 +471,7 @@ proc blitLineMaskAvx2*( backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7) backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7) - mm256_storeu_si256( + mm256_store_si256( a[i].addr, mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8)) ) diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index c8e0dc8a..cc77910d 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -530,14 +530,17 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} = proc blitLineNormalSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = + var i: int + while (cast[uint](a[i].addr) and 15) != 0: + a[i] = blendNormal(a[i], b[i]) + inc i + let alphaMask = mm_set1_epi32(cast[int32](0xff000000)) oddMask = mm_set1_epi16(cast[int16](0xff00)) div255 = mm_set1_epi16(cast[int16](0x8081)) vec255 = mm_set1_epi8(255) vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255])) - - var i: int while i < len - 4: let source = mm_loadu_si128(b[i].addr) @@ -545,7 +548,7 @@ proc blitLineNormalSse2*( if (mm_movemask_epi8(eq255) and 0x00008888) == 0x00008888: # Opaque source mm_storeu_si128(a[i].addr, source) else: - let backdrop = mm_loadu_si128(a[i].addr) + let backdrop = mm_load_si128(a[i].addr) var sourceAlpha = mm_and_si128(source, alphaMask) @@ -566,7 +569,7 @@ proc blitLineNormalSse2*( mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) ) - mm_storeu_si128(a[i].addr, added) + mm_store_si128(a[i].addr, added) i += 4 @@ -576,13 +579,16 @@ proc blitLineNormalSse2*( proc blitLineMaskSse2*( a, b: ptr UncheckedArray[ColorRGBX], len: int ) {.simd.} = + var i: int + while (cast[uint](a[i].addr) and 15) != 0: + a[i] = blendMask(a[i], b[i]) + inc i + let alphaMask = mm_set1_epi32(cast[int32](0xff000000)) oddMask = mm_set1_epi16(cast[int16](0xff00)) div255 = mm_set1_epi16(cast[int16](0x8081)) vec255 = mm_set1_epi8(255) - - var i: int while i < len - 4: let source = mm_loadu_si128(b[i].addr) @@ -590,7 +596,7 @@ proc blitLineMaskSse2*( if (mm_movemask_epi8(eq255) and 0x00008888) == 0x00008888: # Opaque source discard else: - let backdrop = mm_loadu_si128(a[i].addr) + let backdrop = mm_load_si128(a[i].addr) var sourceAlpha = mm_and_si128(source, alphaMask) @@ -604,7 +610,7 @@ proc blitLineMaskSse2*( backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7) backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7) - mm_storeu_si128( + mm_store_si128( a[i].addr, mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8)) ) From 0245eeebea6bf6f050ecf45551b28215eb6ae292 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sat, 30 Jul 2022 23:49:05 -0500 Subject: [PATCH 2/6] better --- src/pixie/simd/neon.nim | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim index 82a274ee..b646a5ab 100644 --- a/src/pixie/simd/neon.nim +++ b/src/pixie/simd/neon.nim @@ -58,9 +58,9 @@ proc isOneColorNeon*(image: Image): bool {.simd.} = rgEq = vandq_u8(rEq, gEq) baEq = vandq_u8(bEq, aEq) rgbaEq = vandq_u8(rgEq, baEq) - mask = - cast[uint64](vget_low_u64(cast[uint64x2](rgbaEq))) and - cast[uint64](vget_high_u64(cast[uint64x2](rgbaEq))) + mask = vget_lane_u64(cast[uint64x1]( + vand_u8(vget_low_u8(rgbaEq), vget_high_u8(rgbaEq) + )), 0) if mask != uint64.high: return false i += 16 @@ -82,12 +82,16 @@ proc isTransparentNeon*(image: Image): bool {.simd.} = result = true - let iterations = (image.data.len - i) div 16 + let + vecZero = vmovq_n_u8(0) + iterations = (image.data.len - i) div 16 for _ in 0 ..< iterations: let alphas = vld4q_u8(image.data[i].addr).val[3] - eq = vceqq_u64(cast[uint64x2](alphas), vmovq_n_u64(0)) - mask = cast[uint64](vget_low_u64(eq)) and cast[uint64](vget_high_u64(eq)) + eq = vceqq_u8(alphas, vecZero) + mask = vget_lane_u64(cast[uint64x1]( + vand_u8(vget_low_u8(eq), vget_high_u8(eq) + )), 0) if mask != uint64.high: return false i += 16 @@ -109,12 +113,16 @@ proc isOpaqueNeon*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} = inc i p += 4 - let iterations = (start + len - i) div 16 + let + vec255 = vmovq_n_u8(255) + iterations = (start + len - i) div 16 for _ in 0 ..< iterations: let alphas = vld4q_u8(data[i].addr).val[3] - eq = vceqq_u64(cast[uint64x2](alphas), vmovq_n_u64(uint64.high)) - mask = cast[uint64](vget_low_u64(eq)) and cast[uint64](vget_high_u64(eq)) + eq = vceqq_u8(alphas, vec255) + mask = vget_lane_u64(cast[uint64x1]( + vand_u8(vget_low_u8(eq), vget_high_u8(eq) + )), 0) if mask != uint64.high: return false i += 16 From 6fb4b02eb6a38f1cda50ae7ff930b3c14d135064 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sun, 31 Jul 2022 00:12:20 -0500 Subject: [PATCH 3/6] neon line blends --- pixie.nimble | 2 +- src/pixie/simd/neon.nim | 77 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/pixie.nimble b/pixie.nimble index 5d1c5241..2e01f52d 100644 --- a/pixie.nimble +++ b/pixie.nimble @@ -10,7 +10,7 @@ requires "vmath >= 1.1.4" requires "chroma >= 0.2.6" requires "zippy >= 0.10.3" requires "flatty >= 0.3.4" -requires "nimsimd >= 1.1.9" +requires "nimsimd >= 1.1.10" requires "bumpy >= 1.1.1" task bindings, "Generate bindings": diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim index b646a5ab..0ec9b018 100644 --- a/src/pixie/simd/neon.nim +++ b/src/pixie/simd/neon.nim @@ -1,4 +1,4 @@ -import chroma, internal, nimsimd/neon, pixie/common, vmath +import chroma, internal, nimsimd/neon, pixie/blends, pixie/common, vmath when defined(release): {.push checks: off.} @@ -146,7 +146,7 @@ proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = inc i p += 4 - proc premultiply(c, a: uint8x8): uint8x8 {.inline.} = + template premultiply(c, a: uint8x8): uint8x8 = let ca = vmull_u8(c, a) vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) @@ -408,5 +408,78 @@ proc magnifyBy2Neon*(image: Image, power = 1): Image {.simd.} = result.width * 4 ) +proc blitLineNormalNeon*( + a, b: ptr UncheckedArray[ColorRGBX], len: int +) {.simd.} = + var i: int + while (cast[uint](a[i].addr) and 15) != 0: + a[i] = blendNormal(a[i], b[i]) + inc i + + let vec255 = vmov_n_u8(255) + while i < len - 8: + let + source = vld4_u8(b[i].addr) + eq255 = vceq_u8(source.val[3], vec255) + if vget_lane_u64(cast[uint64x1](eq255), 0) == uint64.high: + vst4_u8(a[i].addr, source) + else: + template multiply(c, a: uint8x8): uint8x8 = + let ca = vmull_u8(c, a) + vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) + + let + backdrop = vld4_u8(a[i].addr) + multiplier = vsub_u8(vec255, source.val[3]) + + var blended: uint8x8x4 + blended.val[0] = multiply(backdrop.val[0], multiplier) + blended.val[1] = multiply(backdrop.val[1], multiplier) + blended.val[2] = multiply(backdrop.val[2], multiplier) + blended.val[3] = multiply(backdrop.val[3], multiplier) + blended.val[0] = vadd_u8(blended.val[0], source.val[0]) + blended.val[1] = vadd_u8(blended.val[1], source.val[1]) + blended.val[2] = vadd_u8(blended.val[2], source.val[2]) + blended.val[3] = vadd_u8(blended.val[3], source.val[3]) + vst4_u8(a[i].addr, blended) + + i += 8 + + for i in i ..< len: + a[i] = blendNormal(a[i], b[i]) + +proc blitLineMaskNeon*( + a, b: ptr UncheckedArray[ColorRGBX], len: int +) {.simd.} = + var i: int + while (cast[uint](a[i].addr) and 15) != 0: + a[i] = blendMask(a[i], b[i]) + inc i + + let vec255 = vmov_n_u8(255) + while i < len - 8: + let + source = vld4_u8(b[i].addr) + eq255 = vceq_u8(source.val[3], vec255) + if vget_lane_u64(cast[uint64x1](eq255), 0) == uint64.high: + discard + else: + template multiply(c, a: uint8x8): uint8x8 = + let ca = vmull_u8(c, a) + vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) + + let backdrop = vld4_u8(a[i].addr) + var blended: uint8x8x4 + blended.val[0] = multiply(backdrop.val[0], source.val[3]) + blended.val[1] = multiply(backdrop.val[1], source.val[3]) + blended.val[2] = multiply(backdrop.val[2], source.val[3]) + blended.val[3] = multiply(backdrop.val[3], source.val[3]) + vst4_u8(a[i].addr, blended) + + i += 8 + + for i in i ..< len: + a[i] = blendMask(a[i], b[i]) + when defined(release): {.pop.} From db4c1bea731f9024db113864d523182e47bc33c5 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sun, 31 Jul 2022 00:22:17 -0500 Subject: [PATCH 4/6] wider neon --- src/pixie/simd/neon.nim | 66 ++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim index 0ec9b018..93ca7a22 100644 --- a/src/pixie/simd/neon.nim +++ b/src/pixie/simd/neon.nim @@ -416,34 +416,43 @@ proc blitLineNormalNeon*( a[i] = blendNormal(a[i], b[i]) inc i - let vec255 = vmov_n_u8(255) - while i < len - 8: + let vec255 = vmovq_n_u8(255) + while i < len - 16: let - source = vld4_u8(b[i].addr) - eq255 = vceq_u8(source.val[3], vec255) - if vget_lane_u64(cast[uint64x1](eq255), 0) == uint64.high: - vst4_u8(a[i].addr, source) + source = vld4q_u8(b[i].addr) + eq255 = vceqq_u8(source.val[3], vec255) + mask = vget_lane_u64(cast[uint64x1]( + vand_u8(vget_low_u8(eq255), vget_high_u8(eq255) + )), 0) + if mask == uint64.high: + vst4q_u8(a[i].addr, source) else: template multiply(c, a: uint8x8): uint8x8 = let ca = vmull_u8(c, a) vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) + template multiply(c, a: uint8x16): uint8x16 = + vcombine_u8( + multiply(vget_low_u8(c), vget_low_u8(a)), + multiply(vget_high_u8(c), vget_high_u8(a)) + ) + let - backdrop = vld4_u8(a[i].addr) - multiplier = vsub_u8(vec255, source.val[3]) + backdrop = vld4q_u8(a[i].addr) + multiplier = vsubq_u8(vec255, source.val[3]) - var blended: uint8x8x4 + var blended: uint8x16x4 blended.val[0] = multiply(backdrop.val[0], multiplier) blended.val[1] = multiply(backdrop.val[1], multiplier) blended.val[2] = multiply(backdrop.val[2], multiplier) blended.val[3] = multiply(backdrop.val[3], multiplier) - blended.val[0] = vadd_u8(blended.val[0], source.val[0]) - blended.val[1] = vadd_u8(blended.val[1], source.val[1]) - blended.val[2] = vadd_u8(blended.val[2], source.val[2]) - blended.val[3] = vadd_u8(blended.val[3], source.val[3]) - vst4_u8(a[i].addr, blended) + blended.val[0] = vaddq_u8(blended.val[0], source.val[0]) + blended.val[1] = vaddq_u8(blended.val[1], source.val[1]) + blended.val[2] = vaddq_u8(blended.val[2], source.val[2]) + blended.val[3] = vaddq_u8(blended.val[3], source.val[3]) + vst4q_u8(a[i].addr, blended) - i += 8 + i += 16 for i in i ..< len: a[i] = blendNormal(a[i], b[i]) @@ -456,27 +465,36 @@ proc blitLineMaskNeon*( a[i] = blendMask(a[i], b[i]) inc i - let vec255 = vmov_n_u8(255) - while i < len - 8: + let vec255 = vmovq_n_u8(255) + while i < len - 16: let - source = vld4_u8(b[i].addr) - eq255 = vceq_u8(source.val[3], vec255) - if vget_lane_u64(cast[uint64x1](eq255), 0) == uint64.high: + source = vld4q_u8(b[i].addr) + eq255 = vceqq_u8(source.val[3], vec255) + mask = vget_lane_u64(cast[uint64x1]( + vand_u8(vget_low_u8(eq255), vget_high_u8(eq255) + )), 0) + if mask == uint64.high: discard else: template multiply(c, a: uint8x8): uint8x8 = let ca = vmull_u8(c, a) vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) - let backdrop = vld4_u8(a[i].addr) - var blended: uint8x8x4 + template multiply(c, a: uint8x16): uint8x16 = + vcombine_u8( + multiply(vget_low_u8(c), vget_low_u8(a)), + multiply(vget_high_u8(c), vget_high_u8(a)) + ) + + let backdrop = vld4q_u8(a[i].addr) + var blended: uint8x16x4 blended.val[0] = multiply(backdrop.val[0], source.val[3]) blended.val[1] = multiply(backdrop.val[1], source.val[3]) blended.val[2] = multiply(backdrop.val[2], source.val[3]) blended.val[3] = multiply(backdrop.val[3], source.val[3]) - vst4_u8(a[i].addr, blended) + vst4q_u8(a[i].addr, blended) - i += 8 + i += 16 for i in i ..< len: a[i] = blendMask(a[i], b[i]) From be142b55ad704892e7d4d3cb6c115b46d5262bc8 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sun, 31 Jul 2022 01:05:17 -0500 Subject: [PATCH 5/6] faster --- src/pixie/simd/neon.nim | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim index 93ca7a22..26c18acd 100644 --- a/src/pixie/simd/neon.nim +++ b/src/pixie/simd/neon.nim @@ -146,19 +146,25 @@ proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = inc i p += 4 - template premultiply(c, a: uint8x8): uint8x8 = + template multiply(c, a: uint8x8): uint8x8 = let ca = vmull_u8(c, a) vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) - let iterations = (data.len - i) div 8 + template multiply(c, a: uint8x16): uint8x16 = + vcombine_u8( + multiply(vget_low_u8(c), vget_low_u8(a)), + multiply(vget_high_u8(c), vget_high_u8(a)) + ) + + let iterations = (data.len - i) div 16 for _ in 0 ..< iterations: - var channels = vld4_u8(cast[pointer](p)) - channels.val[0] = premultiply(channels.val[0], channels.val[3]) - channels.val[1] = premultiply(channels.val[1], channels.val[3]) - channels.val[2] = premultiply(channels.val[2], channels.val[3]) - vst4_u8(cast[pointer](p), channels) - p += 32 - i += 8 * iterations + var channels = vld4q_u8(cast[pointer](p)) + channels.val[0] = multiply(channels.val[0], channels.val[3]) + channels.val[1] = multiply(channels.val[1], channels.val[3]) + channels.val[2] = multiply(channels.val[2], channels.val[3]) + vst4q_u8(cast[pointer](p), channels) + p += 64 + i += 16 * iterations for i in i ..< data.len: var c = data[i] From 85c411e6cdd24b2e0e53cebb9356deedeba4f637 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Sun, 31 Jul 2022 01:10:12 -0500 Subject: [PATCH 6/6] rename --- src/pixie/simd/neon.nim | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim index 26c18acd..bb43213e 100644 --- a/src/pixie/simd/neon.nim +++ b/src/pixie/simd/neon.nim @@ -225,19 +225,19 @@ proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} = i: int p = cast[uint](image.data[0].addr) - proc apply(c, o: uint8x8): uint8x8 {.inline.} = - let co = vmull_u8(c, o) - vraddhn_u16(co, vrshrq_n_u16(co, 8)) + template multiply(c, a: uint8x8): uint8x8 = + let ca = vmull_u8(c, a) + vraddhn_u16(ca, vrshrq_n_u16(ca, 8)) let opacityVec = vmov_n_u8(opacity) iterations = image.data.len div 8 for _ in 0 ..< iterations: var channels = vld4_u8(cast[pointer](p)) - channels.val[0] = apply(channels.val[0], opacityVec) - channels.val[1] = apply(channels.val[1], opacityVec) - channels.val[2] = apply(channels.val[2], opacityVec) - channels.val[3] = apply(channels.val[3], opacityVec) + channels.val[0] = multiply(channels.val[0], opacityVec) + channels.val[1] = multiply(channels.val[1], opacityVec) + channels.val[2] = multiply(channels.val[2], opacityVec) + channels.val[3] = multiply(channels.val[3], opacityVec) vst4_u8(cast[pointer](p), channels) p += 32 i += 8 * iterations