Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pixie.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ requires "vmath >= 1.1.4"
requires "chroma >= 0.2.5"
requires "zippy >= 0.10.2"
requires "flatty >= 0.3.4"
requires "nimsimd >= 1.1.5"
requires "nimsimd >= 1.1.6"
requires "bumpy >= 1.1.1"

task bindings, "Generate bindings":
Expand Down
27 changes: 17 additions & 10 deletions src/pixie/simd.nim
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,22 @@ export internal

const allowSimd* = not defined(pixieNoSimd) and not defined(tcc)

when allowSimd and defined(amd64):
import simd/sse2, simd/avx, simd/avx2
export sse2, avx, avx2
when allowSimd:
when defined(amd64):
import simd/sse2, simd/avx, simd/avx2
export sse2, avx, avx2

when not defined(pixieNoAvx):
import nimsimd/runtimecheck
let
cpuHasAvx* = checkInstructionSets({AVX})
cpuHasAvx2* = checkInstructionSets({AVX, AVX2})
when not defined(pixieNoAvx):
import nimsimd/runtimecheck
let
cpuHasAvx* = checkInstructionSets({AVX})
cpuHasAvx2* = checkInstructionSets({AVX, AVX2})

import nimsimd/sse2 as nimsimdsse2
export nimsimdsse2
import nimsimd/sse2 as nimsimdsse2
export nimsimdsse2

elif defined(arm64):
import simd/neon

import nimsimd/neon as nimsimdneon
export nimsimdneon
7 changes: 6 additions & 1 deletion src/pixie/simd/internal.nim
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ macro hasSimd*(procedure: untyped) =
let
name = procedure.procName()
originalBody = procedure[6]
nameNeon = name & "Neon"
nameSse2 = name & "Sse2"
nameAvx = name & "Avx"
nameAvx2 = name & "Avx2"
Expand All @@ -54,7 +55,7 @@ macro hasSimd*(procedure: untyped) =

var body = newStmtList()

when not defined(pixieNoAvx):
when defined(amd64) and not defined(pixieNoAvx):
if nameAvx2 in simdProcs:
body.add quote do:
if cpuHasAvx2:
Expand All @@ -69,6 +70,10 @@ macro hasSimd*(procedure: untyped) =
let bodySse2 = simdProcs[nameSse2][6]
body.add quote do:
`bodySse2`
elif nameNeon in simdProcs:
let bodyNeon = simdProcs[nameNeon][6]
body.add quote do:
`bodyNeon`
else:
body.add quote do:
`originalBody`
Expand Down
161 changes: 161 additions & 0 deletions src/pixie/simd/neon.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import chroma, internal, nimsimd/neon, pixie/common

when defined(release):
{.push checks: off.}

proc fillUnsafeNeon*(
data: var seq[ColorRGBX],
color: SomeColor,
start, len: int
) {.simd.} =
let rgbx = color.asRgbx()

var
i = start
p = cast[uint](data[i].addr)
# Align to 16 bytes
while i < (start + len) and (p and 15) != 0:
data[i] = rgbx
inc i
p += 4

let
colors = vmovq_n_u32(cast[uint32](rgbx))
x4 = vld4q_dup_u32(colors.unsafeAddr)
iterations = (start + len - i) div 16
for _ in 0 ..< iterations:
vst1q_u32_x4(data[i].addr, x4)
i += 16

for i in i ..< start + len:
data[i] = rgbx

proc isOneColorNeon*(image: Image): bool {.simd.} =
result = true

let color = image.data[0]

var
i: int
p = cast[uint](image.data[0].addr)
# Align to 16 bytes
while i < image.data.len and (p and 15) != 0:
if image.data[i] != color:
return false
inc i
p += 4

let
colorVecs = vld4q_dup_u8(color.unsafeAddr)
iterations = (image.data.len - i) div 16
for _ in 0 ..< iterations:
let
deinterleved = vld4q_u8(image.data[i].addr)
rEq = vceqq_u8(deinterleved.val[0], colorVecs.val[0])
gEq = vceqq_u8(deinterleved.val[1], colorVecs.val[1])
bEq = vceqq_u8(deinterleved.val[2], colorVecs.val[2])
aEq = vceqq_u8(deinterleved.val[3], colorVecs.val[3])
rgEq = vandq_u8(rEq, gEq)
baEq = vandq_u8(bEq, aEq)
rgbaEq = vandq_u8(rgEq, baEq)
mask =
cast[uint64](vget_low_u64(cast[uint64x2](rgbaEq))) and
cast[uint64](vget_high_u64(cast[uint64x2](rgbaEq)))
if mask != uint64.high:
return false
i += 16

for i in i ..< image.data.len:
if image.data[i] != color:
return false

proc isTransparentNeon*(image: Image): bool {.simd.} =
var
i: int
p = cast[uint](image.data[0].addr)
# Align to 16 bytes
while i < image.data.len and (p and 15) != 0:
if image.data[i].a != 0:
return false
inc i
p += 4

result = true

let iterations = (image.data.len - i) div 16
for _ in 0 ..< iterations:
let
alphas = vld4q_u8(image.data[i].addr).val[3]
eq = vceqq_u64(cast[uint64x2](alphas), vmovq_n_u64(0))
mask = cast[uint64](vget_low_u64(eq)) and cast[uint64](vget_high_u64(eq))
if mask != uint64.high:
return false
i += 16

for i in i ..< image.data.len:
if image.data[i].a != 0:
return false

proc isOpaqueNeon*(data: var seq[ColorRGBX], start, len: int): bool {.simd.} =
result = true

var
i = start
p = cast[uint](data[0].addr)
# Align to 16 bytes
while i < (start + len) and (p and 15) != 0:
if data[i].a != 255:
return false
inc i
p += 4

let iterations = (start + len - i) div 16
for _ in 0 ..< iterations:
let
alphas = vld4q_u8(data[i].addr).val[3]
eq = vceqq_u64(cast[uint64x2](alphas), vmovq_n_u64(uint64.high))
mask = cast[uint64](vget_low_u64(eq)) and cast[uint64](vget_high_u64(eq))
if mask != uint64.high:
return false
i += 16

for i in i ..< start + len:
if data[i].a != 255:
return false

proc newImageNeon*(mask: Mask): Image {.simd.} =
result = newImage(mask.width, mask.height)

var i: int
for _ in 0 ..< mask.data.len div 16:
let alphas = vld1q_u8(mask.data[i].addr)
template doLane(lane: int) =
let packed = vgetq_lane_u32(cast[uint32x4](alphas), lane)
var unpacked = cast[uint8x16](vmovq_n_u32(packed))
unpacked = vzip1q_u8(unpacked, unpacked)
unpacked = vzip1q_u8(unpacked, unpacked)
vst1q_u8(result.data[i + lane * 4].addr, unpacked)
doLane(0)
doLane(1)
doLane(2)
doLane(3)
i += 16

for i in i ..< mask.data.len:
let v = mask.data[i]
result.data[i] = rgbx(v, v, v, v)

proc newMaskNeon*(image: Image): Mask {.simd.} =
result = newMask(image.width, image.height)

var i: int
for _ in 0 ..< image.data.len div 16:
let alphas = vld4q_u8(image.data[i].addr).val[3]
vst1q_u8(result.data[i].addr, alphas)
i += 16

for i in i ..< image.data.len:
result.data[i] = image.data[i].a

when defined(release):
{.pop.}