From 518eb1fb814c228fca3e4875aba8967528ad4c5c Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Tue, 26 Jul 2022 23:35:19 -0500 Subject: [PATCH 1/5] faster --- src/pixie/images.nim | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/pixie/images.nim b/src/pixie/images.nim index d2871e4e..82120401 100644 --- a/src/pixie/images.nim +++ b/src/pixie/images.nim @@ -77,23 +77,25 @@ proc isOpaque*(image: Image): bool {.raises: [].} = proc flipHorizontal*(image: Image) {.raises: [].} = ## Flips the image around the Y axis. - let w = image.width div 2 + let halfWidth = image.width div 2 for y in 0 ..< image.height: - for x in 0 ..< w: - swap( - image.data[image.dataIndex(x, y)], - image.data[image.dataIndex(image.width - x - 1, y)] - ) + var + left = image.dataIndex(0, y) + right = left + image.width - 1 + for x in 0 ..< halfWidth: + swap(image.data[left], image.data[right]) + inc left + dec right proc flipVertical*(image: Image) {.raises: [].} = ## Flips the image around the X axis. - let h = image.height div 2 - for y in 0 ..< h: + let halfHeight = image.height div 2 + for y in 0 ..< halfHeight: + let + topStart = image.dataIndex(0, y) + bottomStart = image.dataIndex(0, image.height - y - 1) for x in 0 ..< image.width: - swap( - image.data[image.dataIndex(x, y)], - image.data[image.dataIndex(x, image.height - y - 1)] - ) + swap(image.data[topStart + x], image.data[bottomStart + x]) proc rotate90*(image: Image) {.raises: [PixieError].} = ## Rotates the image 90 degrees clockwise. From 06cc48267c85abc9dce9c69c92fdbc4d2031e3bf Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Wed, 27 Jul 2022 00:05:30 -0500 Subject: [PATCH 2/5] ceil neon --- src/pixie/simd/neon.nim | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/pixie/simd/neon.nim b/src/pixie/simd/neon.nim index e28572ef..6b83ee5f 100644 --- a/src/pixie/simd/neon.nim +++ b/src/pixie/simd/neon.nim @@ -236,5 +236,30 @@ proc applyOpacityNeon*(image: Image, opacity: float32) {.simd.} = rgbx.a = ((rgbx.a * opacity) div 255).uint8 image.data[i] = rgbx +proc ceilNeon*(image: Image) {.simd.} = + var + i: int + p = cast[uint](image.data[0].addr) + + let + zeroVec = vmovq_n_u8(0) + vec255 = vmovq_n_u8(255) + iterations = image.data.len div 4 + for _ in 0 ..< iterations: + var values = vld1q_u8(cast[pointer](p)) + values = vceqq_u8(values, zeroVec) + values = vbicq_u8(vec255, values) + vst1q_u8(cast[pointer](p), values) + p += 16 + i += 4 * iterations + + for i in i ..< image.data.len: + var rgbx = image.data[i] + rgbx.r = if rgbx.r == 0: 0 else: 255 + rgbx.g = if rgbx.g == 0: 0 else: 255 + rgbx.b = if rgbx.b == 0: 0 else: 255 + rgbx.a = if rgbx.a == 0: 0 else: 255 + image.data[i] = rgbx + when defined(release): {.pop.} From bf15e44b4fad188a7790805b171b6d8fb34af5a3 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Wed, 27 Jul 2022 00:18:49 -0500 Subject: [PATCH 3/5] faster --- src/pixie/simd/sse2.nim | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index 815b8801..2d131532 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -386,10 +386,9 @@ proc minifyBy2Sse2*(image: Image, power = 1): Image {.simd.} = # index 0 and 2 so mask the others out and shift 0 and 2 into # position and store masked = mm_and_si128(merged, mergedMask) - mm_storeu_si128( - result.data[result.dataIndex(x, y)].addr, - mm_shuffle_epi32(masked, MM_SHUFFLE(3, 3, 2, 0)) - ) + shuffled = mm_shuffle_epi32(masked, MM_SHUFFLE(3, 3, 2, 0)) + lower = mm_cvtsi128_si64(shuffled) + copyMem(result.data[result.dataIndex(x, y)].addr, lower.unsafeAddr, 8) x += 2 for x in x ..< resultEvenWidth: From 2d39091c4402a3eec6bdf082a5b83b334a6b3413 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Wed, 27 Jul 2022 00:25:23 -0500 Subject: [PATCH 4/5] faster --- src/pixie/simd/avx2.nim | 5 ++--- src/pixie/simd/sse2.nim | 6 ++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index 3e36f8f0..ff11e833 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -330,9 +330,8 @@ proc minifyBy2Avx2*(image: Image, power = 1): Image {.simd.} = addedOddDiv4 = mm256_srli_epi16(addedOdd, 2) merged = mm256_or_si256(addedEvenDiv4, mm256_slli_epi16(addedOddDiv4, 8)) # Merged has the correct values for the next two pixels at - # index 0, 2, 4, 6 so mask the others out and permute into position - masked = mm256_and_si256(merged, mergedMask) - permuted = mm_256_permutevar8x32_epi32(masked, permuteControl) + # index 0, 2, 4, 6 so permute into position and store + permuted = mm_256_permutevar8x32_epi32(merged, permuteControl) mm_storeu_si128( result.data[result.dataIndex(x, y)].addr, mm256_castsi256_si128(permuted) diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index 2d131532..890bd352 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -383,10 +383,8 @@ proc minifyBy2Sse2*(image: Image, power = 1): Image {.simd.} = addedOddDiv4 = mm_srli_epi16(addedOdd, 2) merged = mm_or_si128(addedEvenDiv4, mm_slli_epi16(addedOddDiv4, 8)) # Merged has the correct values for the next two pixels at - # index 0 and 2 so mask the others out and shift 0 and 2 into - # position and store - masked = mm_and_si128(merged, mergedMask) - shuffled = mm_shuffle_epi32(masked, MM_SHUFFLE(3, 3, 2, 0)) + # index 0 and 2 so shift 0 and 2 into position and store + shuffled = mm_shuffle_epi32(merged, MM_SHUFFLE(3, 3, 2, 0)) lower = mm_cvtsi128_si64(shuffled) copyMem(result.data[result.dataIndex(x, y)].addr, lower.unsafeAddr, 8) x += 2 From 0e1df4b0c8e08815f214489b372000f35c7fb7f7 Mon Sep 17 00:00:00 2001 From: Ryan Oldenburg Date: Wed, 27 Jul 2022 00:30:29 -0500 Subject: [PATCH 5/5] bugfix --- src/pixie/simd/avx2.nim | 8 ++++---- src/pixie/simd/sse2.nim | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/pixie/simd/avx2.nim b/src/pixie/simd/avx2.nim index ff11e833..b79f695c 100644 --- a/src/pixie/simd/avx2.nim +++ b/src/pixie/simd/avx2.nim @@ -110,7 +110,7 @@ proc toPremultipliedAlphaAvx2*(data: var seq[ColorRGBA | ColorRGBX]) {.simd.} = oddMask = mm256_set1_epi16(0xff00) vec128 = mm256_set1_epi16(128) hiMask = mm256_set1_epi16(255 shl 8) - iterations = data.len div 8 + iterations = (data.len - i) div 8 for _ in 0 ..< iterations: let values = mm256_load_si256(cast[pointer](p)) @@ -163,7 +163,7 @@ proc invertAvx2*(image: Image) {.simd.} = let vec255 = mm256_set1_epi8(255) - iterations = image.data.len div 16 + iterations = (image.data.len - i) div 16 for _ in 0 ..< iterations: let a = mm256_load_si256(cast[pointer](p)) @@ -211,7 +211,7 @@ proc applyOpacityAvx2*(image: Image, opacity: float32) {.simd.} = div255 = mm256_set1_epi16(0x8081) zeroVec = mm256_setzero_si256() opacityVec = mm256_slli_epi16(mm256_set1_epi16(opacity), 8) - iterations = image.data.len div 8 + iterations = (image.data.len - i) div 8 for _ in 0 ..< iterations: let values = mm256_load_si256(cast[pointer](p)) @@ -257,7 +257,7 @@ proc ceilAvx2*(image: Image) {.simd.} = let vecZero = mm256_setzero_si256() vec255 = mm256_set1_epi8(255) - iterations = image.data.len div 8 + iterations = (image.data.len - i) div 8 for _ in 0 ..< iterations: var values = mm256_load_si256(cast[pointer](p)) values = mm256_cmpeq_epi8(values, vecZero) diff --git a/src/pixie/simd/sse2.nim b/src/pixie/simd/sse2.nim index 890bd352..84140ded 100644 --- a/src/pixie/simd/sse2.nim +++ b/src/pixie/simd/sse2.nim @@ -212,7 +212,7 @@ proc invertSse2*(image: Image) {.simd.} = let vec255 = mm_set1_epi8(255) - iterations = image.data.len div 16 + iterations = (image.data.len - i) div 16 for _ in 0 ..< iterations: let a = mm_load_si128(cast[pointer](p)) @@ -264,7 +264,7 @@ proc applyOpacitySse2*(image: Image, opacity: float32) {.simd.} = div255 = mm_set1_epi16(0x8081) zeroVec = mm_setzero_si128() opacityVec = mm_slli_epi16(mm_set1_epi16(opacity), 8) - iterations = image.data.len div 4 + iterations = (image.data.len - i) div 4 for _ in 0 ..< iterations: let values = mm_loadu_si128(cast[pointer](p)) if mm_movemask_epi8(mm_cmpeq_epi16(values, zeroVec)) != 0xffff: @@ -308,7 +308,7 @@ proc ceilSse2*(image: Image) {.simd.} = let vecZero = mm_setzero_si128() vec255 = mm_set1_epi8(255) - iterations = image.data.len div 8 + iterations = (image.data.len - i) div 8 for _ in 0 ..< iterations: var values0 = mm_loadu_si128(cast[pointer](p))