Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pixie.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
requires "chroma >= 0.2.6"
requires "zippy >= 0.10.3"
requires "flatty >= 0.3.4"
requires "nimsimd >= 1.1.9"
requires "nimsimd >= 1.1.10"
requires "bumpy >= 1.1.1"

task bindings, "Generate bindings":
Expand Down
22 changes: 14 additions & 8 deletions src/pixie/simd/avx2.nim
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,11 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} =
proc blitLineNormalAvx2*(
a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.simd.} =
var i: int
while (cast[uint](a[i].addr) and 31) != 0:
a[i] = blendNormal(a[i], b[i])
inc i

let
alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
oddMask = mm256_set1_epi16(cast[int16](0xff00))
Expand All @@ -393,16 +398,14 @@ proc blitLineNormalAvx2*(
15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1,
15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1
)

var i: int
while i < len - 8:
let
source = mm256_loadu_si256(b[i].addr)
eq255 = mm256_cmpeq_epi8(source, vec255)
if (mm256_movemask_epi8(eq255) and 0x88888888) == 0x88888888: # Opaque source
mm256_storeu_si256(a[i].addr, source)
else:
let backdrop = mm256_loadu_si256(a[i].addr)
let backdrop = mm256_load_si256(a[i].addr)

var
sourceAlpha = mm256_and_si256(source, alphaMask)
Expand All @@ -423,7 +426,7 @@ proc blitLineNormalAvx2*(
mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
)

mm256_storeu_si256(a[i].addr, added)
mm256_store_si256(a[i].addr, added)

i += 8

Expand All @@ -433,6 +436,11 @@ proc blitLineNormalAvx2*(
proc blitLineMaskAvx2*(
a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.simd.} =
var i: int
while (cast[uint](a[i].addr) and 31) != 0:
a[i] = blendMask(a[i], b[i])
inc i

let
alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
oddMask = mm256_set1_epi16(cast[int16](0xff00))
Expand All @@ -442,16 +450,14 @@ proc blitLineMaskAvx2*(
15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1,
15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1
)

var i: int
while i < len - 8:
let
source = mm256_loadu_si256(b[i].addr)
eq255 = mm256_cmpeq_epi8(source, vec255)
if (mm256_movemask_epi8(eq255) and 0x88888888) == 0x88888888: # Opaque source
discard
else:
let backdrop = mm256_loadu_si256(a[i].addr)
let backdrop = mm256_load_si256(a[i].addr)

var
sourceAlpha = mm256_and_si256(source, alphaMask)
Expand All @@ -465,7 +471,7 @@ proc blitLineMaskAvx2*(
backdropEven = mm256_srli_epi16(mm256_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm256_srli_epi16(mm256_mulhi_epu16(backdropOdd, div255), 7)

mm256_storeu_si256(
mm256_store_si256(
a[i].addr,
mm256_or_si256(backdropEven, mm256_slli_epi16(backdropOdd, 8))
)
Expand Down
157 changes: 131 additions & 26 deletions src/pixie/simd/neon.nim
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import chroma, internal, nimsimd/neon, pixie/common, vmath
import chroma, internal, nimsimd/neon, pixie/blends, pixie/common, vmath

when defined(release):
{.push checks: off.}
Expand Down Expand Up @@ -58,9 +58,9 @@ proc isOneColorNeon*(image: Image): bool {.simd.} =
rgEq = vandq_u8(rEq, gEq)
baEq = vandq_u8(bEq, aEq)
rgbaEq = vandq_u8(rgEq, baEq)
mask =
cast[uint64](vget_low_u64(cast[uint64x2](rgbaEq))) and
cast[uint64](vget_high_u64(cast[uint64x2](rgbaEq)))
mask = vget_lane_u64(cast[uint64x1](
vand_u8(vget_low_u8(rgbaEq), vget_high_u8(rgbaEq)
)), 0)
if mask != uint64.high:
return false
i += 16
Expand All @@ -82,12 +82,16 @@ proc isTransparentNeon*(image: Image): bool {.simd.} =

result = true

let iterations = (image.data.len - i) div 16
let
vecZero = vmovq_n_u8(0)
iterations = (image.data.len - i) div 16
for _ in 0 ..< iterations:
let
alphas = vld4q_u8(image.data[i].addr).val[3]
eq = vceqq_u64(cast[uint64x2](alphas), vmovq_n_u64(0))
mask = cast[uint64](vget_low_u64(eq)) and cast[uint64](vget_high_u64(eq))
eq = vceqq_u8(alphas, vecZero)
mask = vget_lane_u64(cast[uint64x1](
vand_u8(vget_low_u8(eq), vget_high_u8(eq)
)), 0)
if mask != uint64.high:
return false
i += 16
Expand All @@ -109,12 +113,16 @@ proc isOpaqueNeon*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} =
inc i
p += 4

let iterations = (start + len - i) div 16
let
vec255 = vmovq_n_u8(255)
iterations = (start + len - i) div 16
for _ in 0 ..< iterations:
let
alphas = vld4q_u8(data[i].addr).val[3]
eq = vceqq_u64(cast[uint64x2](alphas), vmovq_n_u64(uint64.high))
mask = cast[uint64](vget_low_u64(eq)) and cast[uint64](vget_high_u64(eq))
eq = vceqq_u8(alphas, vec255)
mask = vget_lane_u64(cast[uint64x1](
vand_u8(vget_low_u8(eq), vget_high_u8(eq)
)), 0)
if mask != uint64.high:
return false
i += 16
Expand All @@ -138,19 +146,25 @@ proc toPremultipliedAlphaNeon*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} =
inc i
p += 4

proc premultiply(c, a: uint8x8): uint8x8 {.inline.} =
template multiply(c, a: uint8x8): uint8x8 =
let ca = vmull_u8(c, a)
vraddhn_u16(ca, vrshrq_n_u16(ca, 8))

let iterations = (data.len - i) div 8
template multiply(c, a: uint8x16): uint8x16 =
vcombine_u8(
multiply(vget_low_u8(c), vget_low_u8(a)),
multiply(vget_high_u8(c), vget_high_u8(a))
)

let iterations = (data.len - i) div 16
for _ in 0 ..< iterations:
var channels = vld4_u8(cast[pointer](p))
channels.val[0] = premultiply(channels.val[0], channels.val[3])
channels.val[1] = premultiply(channels.val[1], channels.val[3])
channels.val[2] = premultiply(channels.val[2], channels.val[3])
vst4_u8(cast[pointer](p), channels)
p += 32
i += 8 * iterations
var channels = vld4q_u8(cast[pointer](p))
channels.val[0] = multiply(channels.val[0], channels.val[3])
channels.val[1] = multiply(channels.val[1], channels.val[3])
channels.val[2] = multiply(channels.val[2], channels.val[3])
vst4q_u8(cast[pointer](p), channels)
p += 64
i += 16 * iterations

for i in i ..< data.len:
var c = data[i]
Expand Down Expand Up @@ -211,19 +225,19 @@ proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} =
i: int
p = cast[uint](image.data[0].addr)

proc apply(c, o: uint8x8): uint8x8 {.inline.} =
let co = vmull_u8(c, o)
vraddhn_u16(co, vrshrq_n_u16(co, 8))
template multiply(c, a: uint8x8): uint8x8 =
let ca = vmull_u8(c, a)
vraddhn_u16(ca, vrshrq_n_u16(ca, 8))

let
opacityVec = vmov_n_u8(opacity)
iterations = image.data.len div 8
for _ in 0 ..< iterations:
var channels = vld4_u8(cast[pointer](p))
channels.val[0] = apply(channels.val[0], opacityVec)
channels.val[1] = apply(channels.val[1], opacityVec)
channels.val[2] = apply(channels.val[2], opacityVec)
channels.val[3] = apply(channels.val[3], opacityVec)
channels.val[0] = multiply(channels.val[0], opacityVec)
channels.val[1] = multiply(channels.val[1], opacityVec)
channels.val[2] = multiply(channels.val[2], opacityVec)
channels.val[3] = multiply(channels.val[3], opacityVec)
vst4_u8(cast[pointer](p), channels)
p += 32
i += 8 * iterations
Expand Down Expand Up @@ -400,5 +414,96 @@ proc magnifyBy2Neon*(image: Image, power = 1): Image {.simd.} =
result.width * 4
)

proc blitLineNormalNeon*(
  a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.simd.} =
  ## Blends `len` pixels of the source line `b` onto the backdrop line `a`
  ## in place, using normal blending.
  ##
  ## Leading pixels are blended one at a time with `blendNormal` until
  ## `a[i]` is 16-byte aligned, then 16 pixels are processed per iteration
  ## with NEON, and any remaining pixels go through `blendNormal` again.
  var i: int
  # Scalar prologue. The `i < len` guard is required: without it a short
  # line starting at a misaligned address would be blended past its end
  # while looking for the next 16-byte boundary.
  while i < len and (cast[uint](a[i].addr) and 15) != 0:
    a[i] = blendNormal(a[i], b[i])
    inc i

  let vec255 = vmovq_n_u8(255)
  while i < len - 16:
    let
      # De-interleaving load: val[0..3] each hold one channel of 16 pixels.
      source = vld4q_u8(b[i].addr)
      eq255 = vceqq_u8(source.val[3], vec255)
      # AND the two 8-byte halves of the comparison and read the result as
      # one uint64; it is all ones only if all 16 source alphas equal 255.
      mask = vget_lane_u64(cast[uint64x1](
        vand_u8(vget_low_u8(eq255), vget_high_u8(eq255))
      ), 0)
    if mask == uint64.high:
      # Every source pixel is opaque, so the source replaces the backdrop.
      vst4q_u8(a[i].addr, source)
    else:
      template multiply(c, a: uint8x8): uint8x8 =
        # Widening multiply followed by a rounded (ca + (ca >> 8)) >> 8,
        # i.e. c * a / 255 with rounding, narrowed back to 8 bits.
        let ca = vmull_u8(c, a)
        vraddhn_u16(ca, vrshrq_n_u16(ca, 8))

      template multiply(c, a: uint8x16): uint8x16 =
        # 16-lane version built from the two 8-lane halves.
        vcombine_u8(
          multiply(vget_low_u8(c), vget_low_u8(a)),
          multiply(vget_high_u8(c), vget_high_u8(a))
        )

      let
        backdrop = vld4q_u8(a[i].addr)
        # Per-pixel (255 - source alpha), the weight kept from the backdrop.
        multiplier = vsubq_u8(vec255, source.val[3])

      # blended = backdrop * (255 - source alpha) / 255 + source,
      # applied to all four channels.
      var blended: uint8x16x4
      blended.val[0] = multiply(backdrop.val[0], multiplier)
      blended.val[1] = multiply(backdrop.val[1], multiplier)
      blended.val[2] = multiply(backdrop.val[2], multiplier)
      blended.val[3] = multiply(backdrop.val[3], multiplier)
      blended.val[0] = vaddq_u8(blended.val[0], source.val[0])
      blended.val[1] = vaddq_u8(blended.val[1], source.val[1])
      blended.val[2] = vaddq_u8(blended.val[2], source.val[2])
      blended.val[3] = vaddq_u8(blended.val[3], source.val[3])
      vst4q_u8(a[i].addr, blended)

    i += 16

  # Scalar epilogue for whatever the vector loop did not cover.
  for i in i ..< len:
    a[i] = blendNormal(a[i], b[i])

proc blitLineMaskNeon*(
  a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.simd.} =
  ## Masks `len` pixels of the backdrop line `a` in place by the alpha of
  ## the source line `b`.
  ##
  ## Leading pixels are handled one at a time with `blendMask` until `a[i]`
  ## is 16-byte aligned, then 16 pixels are processed per iteration with
  ## NEON, and any remaining pixels go through `blendMask` again.
  var i: int
  # Scalar prologue. The `i < len` guard is required: without it a short
  # line starting at a misaligned address would be processed past its end
  # while looking for the next 16-byte boundary.
  while i < len and (cast[uint](a[i].addr) and 15) != 0:
    a[i] = blendMask(a[i], b[i])
    inc i

  let vec255 = vmovq_n_u8(255)
  while i < len - 16:
    let
      # De-interleaving load: val[0..3] each hold one channel of 16 pixels.
      source = vld4q_u8(b[i].addr)
      eq255 = vceqq_u8(source.val[3], vec255)
      # AND the two 8-byte halves of the comparison and read the result as
      # one uint64; it is all ones only if all 16 source alphas equal 255.
      mask = vget_lane_u64(cast[uint64x1](
        vand_u8(vget_low_u8(eq255), vget_high_u8(eq255))
      ), 0)
    if mask == uint64.high:
      # Every source pixel is opaque, so the backdrop is left untouched.
      discard
    else:
      template multiply(c, a: uint8x8): uint8x8 =
        # Widening multiply followed by a rounded (ca + (ca >> 8)) >> 8,
        # i.e. c * a / 255 with rounding, narrowed back to 8 bits.
        let ca = vmull_u8(c, a)
        vraddhn_u16(ca, vrshrq_n_u16(ca, 8))

      template multiply(c, a: uint8x16): uint8x16 =
        # 16-lane version built from the two 8-lane halves.
        vcombine_u8(
          multiply(vget_low_u8(c), vget_low_u8(a)),
          multiply(vget_high_u8(c), vget_high_u8(a))
        )

      # blended = backdrop * source alpha / 255, applied to all four
      # channels.
      let backdrop = vld4q_u8(a[i].addr)
      var blended: uint8x16x4
      blended.val[0] = multiply(backdrop.val[0], source.val[3])
      blended.val[1] = multiply(backdrop.val[1], source.val[3])
      blended.val[2] = multiply(backdrop.val[2], source.val[3])
      blended.val[3] = multiply(backdrop.val[3], source.val[3])
      vst4q_u8(a[i].addr, blended)

    i += 16

  # Scalar epilogue for whatever the vector loop did not cover.
  for i in i ..< len:
    a[i] = blendMask(a[i], b[i])

when defined(release):
{.pop.}
22 changes: 14 additions & 8 deletions src/pixie/simd/sse2.nim
Original file line number Diff line number Diff line change
Expand Up @@ -530,22 +530,25 @@ proc magnifyBy2Sse2*(image: Image, power = 1): Image {.simd.} =
proc blitLineNormalSse2*(
a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.simd.} =
var i: int
while (cast[uint](a[i].addr) and 15) != 0:
a[i] = blendNormal(a[i], b[i])
inc i

let
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
oddMask = mm_set1_epi16(cast[int16](0xff00))
div255 = mm_set1_epi16(cast[int16](0x8081))
vec255 = mm_set1_epi8(255)
vecAlpha255 = mm_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))

var i: int
while i < len - 4:
let
source = mm_loadu_si128(b[i].addr)
eq255 = mm_cmpeq_epi8(source, vec255)
if (mm_movemask_epi8(eq255) and 0x00008888) == 0x00008888: # Opaque source
mm_storeu_si128(a[i].addr, source)
else:
let backdrop = mm_loadu_si128(a[i].addr)
let backdrop = mm_load_si128(a[i].addr)

var
sourceAlpha = mm_and_si128(source, alphaMask)
Expand All @@ -566,7 +569,7 @@ proc blitLineNormalSse2*(
mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
)

mm_storeu_si128(a[i].addr, added)
mm_store_si128(a[i].addr, added)

i += 4

Expand All @@ -576,21 +579,24 @@ proc blitLineNormalSse2*(
proc blitLineMaskSse2*(
a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.simd.} =
var i: int
while (cast[uint](a[i].addr) and 15) != 0:
a[i] = blendMask(a[i], b[i])
inc i

let
alphaMask = mm_set1_epi32(cast[int32](0xff000000))
oddMask = mm_set1_epi16(cast[int16](0xff00))
div255 = mm_set1_epi16(cast[int16](0x8081))
vec255 = mm_set1_epi8(255)

var i: int
while i < len - 4:
let
source = mm_loadu_si128(b[i].addr)
eq255 = mm_cmpeq_epi8(source, vec255)
if (mm_movemask_epi8(eq255) and 0x00008888) == 0x00008888: # Opaque source
discard
else:
let backdrop = mm_loadu_si128(a[i].addr)
let backdrop = mm_load_si128(a[i].addr)

var
sourceAlpha = mm_and_si128(source, alphaMask)
Expand All @@ -604,7 +610,7 @@ proc blitLineMaskSse2*(
backdropEven = mm_srli_epi16(mm_mulhi_epu16(backdropEven, div255), 7)
backdropOdd = mm_srli_epi16(mm_mulhi_epu16(backdropOdd, div255), 7)

mm_storeu_si128(
mm_store_si128(
a[i].addr,
mm_or_si128(backdropEven, mm_slli_epi16(backdropOdd, 8))
)
Expand Down