Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions src/pixie/fileformats/png.nim
Original file line number Diff line number Diff line change
Expand Up @@ -129,16 +129,25 @@ proc unfilter(
uncompressedStartIdx = uncompressedIdx(1, y)
unfilteredStartIx = unfiteredIdx(0, y)
var x: int
when allowSimd and defined(amd64):
when allowSimd and (defined(amd64) or defined(arm64)):
if y - 1 >= 0:
for _ in 0 ..< rowBytes div 16:
let
bytes = mm_loadu_si128(uncompressed[uncompressedStartIdx + x].addr)
up = mm_loadu_si128(result[unfilteredStartIx + x - rowBytes].addr)
mm_storeu_si128(
result[unfilteredStartIx + x].addr,
mm_add_epi8(bytes, up)
)
when defined(amd64):
let
bytes = mm_loadu_si128(uncompressed[uncompressedStartIdx + x].addr)
up = mm_loadu_si128(result[unfilteredStartIx + x - rowBytes].addr)
mm_storeu_si128(
result[unfilteredStartIx + x].addr,
mm_add_epi8(bytes, up)
)
else: # arm64
let
bytes = vld1q_u8(uncompressed[uncompressedStartIdx + x].addr)
up = vld1q_u8(result[unfilteredStartIx + x - rowBytes].addr)
vst1q_u8(
result[unfilteredStartIx + x].addr,
vaddq_u8(bytes, up)
)
x += 16
for x in x ..< rowBytes:
var value = uncompressed[uncompressedStartIdx + x]
Expand Down
10 changes: 7 additions & 3 deletions src/pixie/images.nim
Original file line number Diff line number Diff line change
Expand Up @@ -338,8 +338,9 @@ proc blur*(
var values: array[4, uint32]
for xx in x - radius ..< min(x + radius, 0):
values += outOfBounds * kernel[xx - x + radius]
var idx = image.dataIndex(0, y)
for xx in max(x - radius, 0) .. min(x + radius, image.width - 1):
values += image.unsafe[xx, y] * kernel[xx - x + radius]
values += image.data[idx + xx] * kernel[xx - x + radius]
for xx in max(x - radius, image.width) .. x + radius:
values += outOfBounds * kernel[xx - x + radius]
blurX.unsafe[y, x] = rgbx(values)
Expand All @@ -350,8 +351,9 @@ proc blur*(
var values: array[4, uint32]
for yy in y - radius ..< min(y + radius, 0):
values += outOfBounds * kernel[yy - y + radius]
var idx = blurX.dataIndex(0, x)
for yy in max(y - radius, 0) .. min(y + radius, image.height - 1):
values += blurX.unsafe[yy, x] * kernel[yy - y + radius]
values += blurX.data[idx + yy] * kernel[yy - y + radius]
for yy in max(y - radius, image.height) .. y + radius:
values += outOfBounds * kernel[yy - y + radius]
image.unsafe[x, y] = rgbx(values)
Expand Down Expand Up @@ -447,7 +449,9 @@ proc blendLineOverwrite(
) {.inline.} =
copyMem(a[0].addr, b[0].addr, len * 4)

proc blendLineNormal(a, b: ptr UncheckedArray[ColorRGBX], len: int) {.hasSimd.} =
proc blendLineNormal(
a, b: ptr UncheckedArray[ColorRGBX], len: int
) {.hasSimd.} =
for i in 0 ..< len:
a[i] = blendNormal(a[i], b[i])

Expand Down
31 changes: 20 additions & 11 deletions src/pixie/paints.nim
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ proc fillGradientLinear(image: Image, paint: Paint) =
if at.y == to.y: # Horizontal gradient
var x: int
while x < image.width:
when defined(amd64) and allowSimd:
when allowSimd and (defined(amd64) or defined(arm64)):
if x + 4 <= image.width:
var colors: array[4, ColorRGBX]
for i in 0 ..< 4:
Expand All @@ -128,10 +128,14 @@ proc fillGradientLinear(image: Image, paint: Paint) =
t = toLineSpace(at, to, xy)
rgbx = paint.gradientColor(t)
colors[i] = rgbx

let colorVec = cast[M128i](colors)
for y in 0 ..< image.height:
mm_storeu_si128(image.data[image.dataIndex(x, y)].addr, colorVec)
when defined(amd64):
let colorVec = mm_loadu_si128(colors[0].addr)
for y in 0 ..< image.height:
mm_storeu_si128(image.data[image.dataIndex(x, y)].addr, colorVec)
else: # arm64
let colorVec = vld1q_u32(colors[0].addr)
for y in 0 ..< image.height:
vst1q_u32(image.data[image.dataIndex(x, y)].addr, colorVec)
x += 4
continue

Expand All @@ -150,11 +154,17 @@ proc fillGradientLinear(image: Image, paint: Paint) =
t = toLineSpace(at, to, xy)
rgbx = paint.gradientColor(t)
var x: int
when defined(amd64) and allowSimd:
let colorVec = mm_set1_epi32(cast[int32](rgbx))
for _ in 0 ..< image.width div 4:
mm_storeu_si128(image.data[image.dataIndex(x, y)].addr, colorVec)
x += 4
when allowSimd:
when defined(amd64):
let colorVec = mm_set1_epi32(cast[int32](rgbx))
for _ in 0 ..< image.width div 4:
mm_storeu_si128(image.data[image.dataIndex(x, y)].addr, colorVec)
x += 4
elif defined(arm64):
let colorVec = vmovq_n_u32(cast[uint32](rgbx))
for _ in 0 ..< image.width div 4:
vst1q_u32(image.data[image.dataIndex(x, y)].addr, colorVec)
x += 4
for x in x ..< image.width:
image.unsafe[x, y] = rgbx

Expand Down Expand Up @@ -227,7 +237,6 @@ proc fillGradientAngular(image: Image, paint: Paint) =

proc fillGradient*(image: Image, paint: Paint) {.raises: [PixieError].} =
## Fills with the Paint gradient.

case paint.kind:
of LinearGradientPaint:
image.fillGradientLinear(paint)
Expand Down
22 changes: 15 additions & 7 deletions src/pixie/paths.nim
Original file line number Diff line number Diff line change
Expand Up @@ -1410,13 +1410,21 @@ proc computeCoverage(
let fillLen = at.integer - fillStart
if fillLen > 0:
var i = fillStart
when defined(amd64) and allowSimd:
let sampleCoverageVec = mm_set1_epi8(sampleCoverage)
for _ in 0 ..< fillLen div 16:
var coverageVec = mm_loadu_si128(coverages[i - startX].addr)
coverageVec = mm_add_epi8(coverageVec, sampleCoverageVec)
mm_storeu_si128(coverages[i - startX].addr, coverageVec)
i += 16
when allowSimd:
when defined(amd64):
let sampleCoverageVec = mm_set1_epi8(sampleCoverage)
for _ in 0 ..< fillLen div 16:
var coverageVec = mm_loadu_si128(coverages[i - startX].addr)
coverageVec = mm_add_epi8(coverageVec, sampleCoverageVec)
mm_storeu_si128(coverages[i - startX].addr, coverageVec)
i += 16
elif defined(arm64):
let sampleCoverageVec = vmovq_n_u8(sampleCoverage)
for _ in 0 ..< fillLen div 16:
var coverageVec = vld1q_u8(coverages[i - startX].addr)
coverageVec = vaddq_u8(coverageVec, sampleCoverageVec)
vst1q_u8(coverages[i - startX].addr, coverageVec)
i += 16
for j in i ..< fillStart + fillLen:
coverages[j - startX] += sampleCoverage

Expand Down
203 changes: 203 additions & 0 deletions src/pixie/simd/avx2.nim
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,76 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} =
# Set src as this result for if we do another power
src = result

template applyCoverage*(rgbxVec: M256i, coverage: M128i): M256i =
  ## Scales the 8 pixels held in `rgbxVec` by the first 8 bytes of
  ## `coverage` (one coverage byte per pixel) and yields the scaled pixels.
  ##
  ## This template reads `coverageShuffle`, `oddMask` and `div255` from the
  ## scope it is expanded in; they are not parameters. The callers in this
  ## file define `coverageShuffle` so that each coverage byte lands in the
  ## high byte of both 16-bit lanes of its pixel, and `div255` as 0x8081.
  let
    # Coverage bytes 0..3 and 4..7, each spread across one 128-bit half.
    spreadLo = mm_shuffle_epi8(coverage, coverageShuffle)
    spreadHi = mm_shuffle_epi8(mm_srli_si128(coverage, 4), coverageShuffle)
    weights =
      mm256_insertf128_si256(mm256_castsi128_si256(spreadLo), spreadHi, 1)
    # Even and odd channel bytes are each moved to the high byte of their
    # 16-bit lane, so the high-half multiply yields `channel * coverage`.
    evenProduct = mm256_mulhi_epu16(mm256_slli_epi16(rgbxVec, 8), weights)
    oddProduct = mm256_mulhi_epu16(mm256_and_si256(rgbxVec, oddMask), weights)
    # High-half multiply by `div255` then shift right by 7: with the callers'
    # 0x8081 constant this is a fixed-point division by 255.
    evenScaled = mm256_srli_epi16(mm256_mulhi_epu16(evenProduct, div255), 7)
    oddScaled = mm256_srli_epi16(mm256_mulhi_epu16(oddProduct, div255), 7)

  # Re-interleave: even results stay in the low bytes, odd go back on top.
  mm256_or_si256(evenScaled, mm256_slli_epi16(oddScaled, 8))

proc blendLineCoverageOverwriteAvx2*(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.simd.} =
  ## Overwrites pixels of `line` with `rgbx` scaled by per-pixel coverage.
  ##
  ## For every index in `0 ..< len`: a coverage of 0 leaves the pixel as it
  ## is, any other coverage replaces the pixel with `rgbx * coverage`.
  var i: int

  # Scalar prologue: step pixel by pixel until `line[i]` sits on a 32-byte
  # boundary, which the aligned 256-bit stores below require.
  while i < len and (cast[uint](line[i].addr) and 31) != 0:
    let c = coverages[i]
    if c != 0:
      line[i] = rgbx * c
    inc i

  let
    rgbxVec = mm256_set1_epi32(cast[uint32](rgbx))
    vecZero = mm256_setzero_si256()
    vec255 = mm256_set1_epi8(255)
    # `oddMask`, `div255` and `coverageShuffle` are picked up by name inside
    # the `applyCoverage` template, so their names must not change.
    oddMask = mm256_set1_epi16(0xff00)
    div255 = mm256_set1_epi16(0x8081)
    coverageShuffle = mm_set_epi8(
      3, -1, 3, -1, 2, -1, 2, -1, 1, -1, 1, -1, 0, -1, 0, -1
    )

  # Vector body: 32 coverage bytes / 32 pixels per iteration.
  while i < len - 32:
    let
      coverage = mm256_loadu_si256(coverages[i].addr)
      isZero = mm256_cmpeq_epi8(coverage, vecZero)
      isFull = mm256_cmpeq_epi8(coverage, vec255)
    if mm256_movemask_epi8(isZero) == cast[int32](0xffffffff):
      # Every coverage is 0: nothing to write for these 32 pixels.
      i += 32
    elif mm256_movemask_epi8(isFull) == cast[int32](0xffffffff):
      # Every coverage is 255: store the unscaled color, 8 pixels at a time.
      for _ in 0 ..< 4:
        mm256_store_si256(line[i].addr, rgbxVec)
        i += 8
    else:
      # Mixed coverage: split the 32 bytes into four groups whose first
      # 8 bytes are the coverages for the next 8 pixels.
      let
        lowHalf = mm256_castsi256_si128(coverage)
        highHalf = mm256_extractf128_si256(coverage, 1)
        coverageGroups = [
          lowHalf,
          mm_srli_si128(lowHalf, 8),
          highHalf,
          mm_srli_si128(highHalf, 8),
        ]
      for eightCoverages in coverageGroups:
        mm256_store_si256(
          line[i].addr,
          rgbxVec.applyCoverage(eightCoverages)
        )
        i += 8

  # Scalar epilogue for whatever the vector loop left over.
  while i < len:
    let c = coverages[i]
    if c != 0:
      line[i] = rgbx * c
    inc i

proc blendLineNormalAvx2*(
line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
) {.simd.} =
Expand Down Expand Up @@ -473,6 +543,71 @@ proc blendLineNormalAvx2*(
for i in i ..< len:
a[i] = blendNormal(a[i], b[i])

proc blendLineCoverageNormalAvx2*(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.simd.} =
  ## Normal-blends `rgbx`, scaled by per-pixel coverage, onto `line`.
  ##
  ## For every index in `0 ..< len`: a coverage of 0 leaves the pixel as it
  ## is, any other coverage stores `blendNormal(line[i], rgbx * coverage)`.
  var i: int

  # Scalar prologue: step pixel by pixel until `line[i]` sits on a 32-byte
  # boundary, which the aligned 256-bit stores below require.
  while i < len and (cast[uint](line[i].addr) and 31) != 0:
    let coverage = coverages[i]
    if coverage != 0:
      line[i] = blendNormal(line[i], rgbx * coverage)
    inc i

  # NOTE(review): several of these constants are not referenced directly in
  # this proc; presumably the `blendNormalSimd` / `applyCoverage` templates
  # resolve them by name at the expansion site, so the names are kept as-is.
  let
    rgbxVec = mm256_set1_epi32(cast[uint32](rgbx))
    vecZero = mm256_setzero_si256()
    vec255 = mm256_set1_epi8(255)
    alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
    oddMask = mm256_set1_epi16(cast[int16](0xff00))
    div255 = mm256_set1_epi16(cast[int16](0x8081))
    vecAlpha255 = mm256_set1_epi32(cast[int32]([0.uint8, 255, 0, 255]))
    coverageShuffle = mm_set_epi8(
      3, -1, 3, -1, 2, -1, 2, -1, 1, -1, 1, -1, 0, -1, 0, -1
    )
    shuffleControl = mm256_set_epi8(
      15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1,
      15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1
    )

  # Vector body: 32 coverage bytes / 32 pixels per iteration.
  while i < len - 32:
    let
      coverage = mm256_loadu_si256(coverages[i].addr)
      eqZero = mm256_cmpeq_epi8(coverage, vecZero)
      eq255 = mm256_cmpeq_epi8(coverage, vec255)
    if mm256_movemask_epi8(eqZero) == cast[int32](0xffffffff):
      # Every coverage is 0: these 32 pixels stay untouched.
      i += 32
    elif mm256_movemask_epi8(eq255) == cast[int32](0xffffffff) and
        rgbx.a == 255:
      # Full coverage with an opaque color: store the color directly,
      # 8 pixels at a time, without reading the backdrop.
      for _ in 0 ..< 4:
        mm256_store_si256(line[i].addr, rgbxVec)
        i += 8
    else:
      # General case: split the 32 bytes into four groups whose first
      # 8 bytes are the coverages for the next 8 pixels, then blend.
      let
        coverageLo = mm256_castsi256_si128(coverage)
        coverageHi = mm256_extractf128_si256(coverage, 1)
        coverageGroups = [
          coverageLo,
          mm_srli_si128(coverageLo, 8),
          coverageHi,
          mm_srli_si128(coverageHi, 8),
        ]
      for eightCoverages in coverageGroups:
        let
          backdrop = mm256_loadu_si256(line[i].addr)
          source = rgbxVec.applyCoverage(eightCoverages)
        mm256_store_si256(line[i].addr, blendNormalSimd(backdrop, source))
        i += 8

  # Scalar epilogue for whatever the vector loop left over.
  while i < len:
    let coverage = coverages[i]
    if coverage != 0:
      line[i] = blendNormal(line[i], rgbx * coverage)
    inc i

proc blendLineMaskAvx2*(
line: ptr UncheckedArray[ColorRGBX], rgbx: ColorRGBX, len: int
) {.simd.} =
Expand Down Expand Up @@ -529,5 +664,73 @@ proc blendLineMaskAvx2*(
for i in i ..< len:
a[i] = blendMask(a[i], b[i])

proc blendLineCoverageMaskAvx2*(
  line: ptr UncheckedArray[ColorRGBX],
  coverages: ptr UncheckedArray[uint8],
  rgbx: ColorRGBX,
  len: int
) {.simd.} =
  ## Mask-blends `rgbx`, scaled by per-pixel coverage, into `line`.
  ##
  ## For every index in `0 ..< len` the scalar paths do:
  ## * coverage 0   -> the pixel is set to `rgbx(0, 0, 0, 0)`
  ## * coverage 255 -> the pixel is left untouched
  ## * otherwise    -> `blendMask(line[i], rgbx * coverage)`
  ##
  ## Only elements in `0 ..< len` of `line` and `coverages` are accessed.
  var i: int

  # Scalar prologue: step pixel by pixel until `line[i]` sits on a 32-byte
  # boundary, which the aligned 256-bit stores below require.
  while i < len and (cast[uint](line[i].addr) and 31) != 0:
    let coverage = coverages[i]
    if coverage == 0:
      line[i] = rgbx(0, 0, 0, 0)
    elif coverage == 255:
      discard
    else:
      line[i] = blendMask(line[i], rgbx * coverage)
    inc i

  # NOTE(review): several of these constants are not referenced directly in
  # this proc; presumably the `blendMaskSimd` / `applyCoverage` templates
  # resolve them by name at the expansion site, so the names are kept as-is.
  let
    rgbxVec = mm256_set1_epi32(cast[uint32](rgbx))
    vecZero = mm256_setzero_si256()
    vec255 = mm256_set1_epi8(255)
    alphaMask = mm256_set1_epi32(cast[int32](0xff000000))
    oddMask = mm256_set1_epi16(cast[int16](0xff00))
    div255 = mm256_set1_epi16(cast[int16](0x8081))
    coverageShuffle = mm_set_epi8(
      3, -1, 3, -1, 2, -1, 2, -1, 1, -1, 1, -1, 0, -1, 0, -1
    )
    shuffleControl = mm256_set_epi8(
      15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1,
      15, -1, 15, -1, 11, -1, 11, -1, 7, -1, 7, -1, 3, -1, 3, -1
    )

  # Vector body. Each iteration loads 32 coverage bytes and touches 32
  # pixels, so more than 32 elements must remain. The bound used to be
  # `len - 16`, which let the final iteration read and write up to 15
  # elements past `len` in both `coverages` and `line`.
  while i < len - 32:
    let
      coverage = mm256_loadu_si256(coverages[i].addr)
      eqZero = mm256_cmpeq_epi8(coverage, vecZero)
      eq255 = mm256_cmpeq_epi8(coverage, vec255)
    if mm256_movemask_epi8(eqZero) == cast[int32](0xffffffff):
      # Every coverage is 0: clear all 32 pixels, 8 at a time.
      for _ in 0 ..< 4:
        mm256_store_si256(line[i].addr, vecZero)
        i += 8
    elif mm256_movemask_epi8(eq255) == cast[int32](0xffffffff) and
        rgbx.a == 255:
      # Full coverage with an opaque color: these 32 pixels stay untouched.
      # NOTE(review): the scalar paths skip coverage 255 regardless of
      # `rgbx.a`, while this path falls through to the blend below when
      # `rgbx.a != 255` -- confirm which behavior is intended.
      i += 32
    else:
      # General case: split the 32 bytes into four groups whose first
      # 8 bytes are the coverages for the next 8 pixels, then blend.
      let
        coverageLo = mm256_castsi256_si128(coverage)
        coverageHi = mm256_extractf128_si256(coverage, 1)
        coverages = [
          coverageLo,
          mm_srli_si128(coverageLo, 8),
          coverageHi,
          mm_srli_si128(coverageHi, 8),
        ]
      for j in 0 ..< 4:
        let
          backdrop = mm256_loadu_si256(line[i].addr)
          source = rgbxVec.applyCoverage(coverages[j])
        mm256_store_si256(line[i].addr, blendMaskSimd(backdrop, source))
        i += 8

  # Scalar epilogue for whatever the vector loop left over.
  for i in i ..< len:
    let coverage = coverages[i]
    if coverage == 0:
      line[i] = rgbx(0, 0, 0, 0)
    elif coverage == 255:
      discard
    else:
      line[i] = blendMask(line[i], rgbx * coverage)

when defined(release):
{.pop.}
Loading