From e9e8461f2457ab0fa0f24c640653a55f2d35d5c9 Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Sat, 22 May 2021 15:15:34 -0400 Subject: [PATCH 01/17] remove force-vector-width=32 --- go/parquet/internal/utils/Makefile | 4 +- .../internal/utils/_lib/bit_packing_avx2.s | 2 +- go/parquet/internal/utils/_lib/min_max_avx2.s | 1519 +--- go/parquet/internal/utils/_lib/min_max_sse4.s | 2 +- .../internal/utils/_lib/unpack_bool_avx2.s | 6361 +-------------- .../internal/utils/_lib/unpack_bool_sse4.s | 2 +- .../internal/utils/min_max_avx2_amd64.s | 1567 +--- .../internal/utils/unpack_bool_avx2_amd64.s | 7023 +---------------- 8 files changed, 808 insertions(+), 15672 deletions(-) diff --git a/go/parquet/internal/utils/Makefile b/go/parquet/internal/utils/Makefile index 1de4308dc55..41cc68df5bc 100644 --- a/go/parquet/internal/utils/Makefile +++ b/go/parquet/internal/utils/Makefile @@ -18,10 +18,10 @@ PERL_FIXUP_ROTATE=perl -i -pe 's/(ro[rl]\s+\w{2,3})$$/\1, 1/' C2GOASM=c2goasm -CC=clang +CC=clang-11 C_FLAGS=-target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 \ -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -ASM_FLAGS_AVX2=-mavx2 -mfma -mllvm -force-vector-width=32 +ASM_FLAGS_AVX2=-mavx2 -mfma ASM_FLAGS_SSE4=-msse4 ASM_FLAGS_BMI2=-mbmi2 ASM_FLAGS_POPCNT=-mpopcnt diff --git a/go/parquet/internal/utils/_lib/bit_packing_avx2.s b/go/parquet/internal/utils/_lib/bit_packing_avx2.s index 222bc3ce413..84a5cca2ea3 100644 --- a/go/parquet/internal/utils/_lib/bit_packing_avx2.s +++ b/go/parquet/internal/utils/_lib/bit_packing_avx2.s @@ -4007,6 +4007,6 @@ unpack32_avx2: # @unpack32_avx2 .Lfunc_end0: .size unpack32_avx2, .Lfunc_end0-unpack32_avx2 # -- End function - .ident "Ubuntu clang version 11.1.0-++20210204121720+1fdec59bffc1-1~exp1~20210203232336.162" + .ident "Debian clang version 11.1.0-++20210428103820+1fdec59bffc1-1~exp1~20210428204437.162" .section 
".note.GNU-stack","",@progbits .addrsig diff --git a/go/parquet/internal/utils/_lib/min_max_avx2.s b/go/parquet/internal/utils/_lib/min_max_avx2.s index dbf9a895ae3..ec24a731d69 100644 --- a/go/parquet/internal/utils/_lib/min_max_avx2.s +++ b/go/parquet/internal/utils/_lib/min_max_avx2.s @@ -15,173 +15,89 @@ int32_max_min_avx2: # @int32_max_min_avx2 # %bb.0: push rbp mov rbp, rsp - and rsp, -32 - sub rsp, 64 + and rsp, -8 test esi, esi jle .LBB0_1 # %bb.2: mov r8d, esi cmp esi, 31 - ja .LBB0_6 + ja .LBB0_4 # %bb.3: - mov eax, -2147483648 - mov r9d, 2147483647 - xor r11d, r11d - jmp .LBB0_4 + mov r10d, -2147483648 + mov eax, 2147483647 + xor r9d, r9d + jmp .LBB0_7 .LBB0_1: - mov r9d, 2147483647 - mov eax, -2147483648 - jmp .LBB0_14 -.LBB0_6: - mov r11d, r8d - and r11d, -32 - lea rax, [r11 - 32] - mov r10, rax - shr r10, 5 - add r10, 1 - mov r9d, r10d - and r9d, 3 - cmp rax, 96 - jae .LBB0_8 -# %bb.7: - vpbroadcastd ymm0, dword ptr [rip + .LCPI0_0] # ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] - vpbroadcastd ymm1, dword ptr [rip + .LCPI0_1] # ymm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] - xor eax, eax - vmovdqa ymm2, ymm1 - vmovdqa ymm4, ymm1 - vmovdqa ymm6, ymm1 - vmovdqa ymm3, ymm0 - vmovdqa ymm5, ymm0 - vmovdqa ymm7, ymm0 - jmp .LBB0_10 -.LBB0_8: - and r10, -4 - vpbroadcastd ymm0, dword ptr [rip + .LCPI0_0] # ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] - neg r10 - vpbroadcastd ymm1, dword ptr [rip + .LCPI0_1] # ymm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] + mov eax, 2147483647 + mov esi, -2147483648 + jmp .LBB0_8 +.LBB0_4: + mov r9d, r8d + vpbroadcastd ymm4, dword ptr [rip + .LCPI0_0] # ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] + and r9d, -32 + vpbroadcastd ymm0, dword ptr [rip + .LCPI0_1] # ymm0 = 
[2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] xor eax, eax - vmovdqa ymm2, ymm1 - vmovdqa ymm4, ymm1 - vmovdqa ymm6, ymm1 + vmovdqa ymm1, ymm0 + vmovdqa ymm2, ymm0 vmovdqa ymm3, ymm0 - vmovdqa ymm5, ymm0 - vmovdqa ymm7, ymm0 + vmovdqa ymm5, ymm4 + vmovdqa ymm6, ymm4 + vmovdqa ymm7, ymm4 .p2align 4, 0x90 -.LBB0_9: # =>This Inner Loop Header: Depth=1 +.LBB0_5: # =>This Inner Loop Header: Depth=1 vmovdqu ymm8, ymmword ptr [rdi + 4*rax] vmovdqu ymm9, ymmword ptr [rdi + 4*rax + 32] vmovdqu ymm10, ymmword ptr [rdi + 4*rax + 64] vmovdqu ymm11, ymmword ptr [rdi + 4*rax + 96] - vpminsd ymm6, ymm6, ymm11 - vpminsd ymm4, ymm4, ymm10 - vpminsd ymm1, ymm1, ymm8 - vpminsd ymm2, ymm2, ymm9 + vpminsd ymm0, ymm0, ymm8 + vpminsd ymm1, ymm1, ymm9 + vpminsd ymm2, ymm2, ymm10 + vpminsd ymm3, ymm3, ymm11 + vpmaxsd ymm4, ymm4, ymm8 + vpmaxsd ymm5, ymm5, ymm9 + vpmaxsd ymm6, ymm6, ymm10 vpmaxsd ymm7, ymm7, ymm11 - vpmaxsd ymm5, ymm5, ymm10 - vpmaxsd ymm0, ymm0, ymm8 - vpmaxsd ymm3, ymm3, ymm9 - vmovdqu ymm8, ymmword ptr [rdi + 4*rax + 224] - vmovdqu ymm9, ymmword ptr [rdi + 4*rax + 192] - vmovdqu ymm10, ymmword ptr [rdi + 4*rax + 128] - vmovdqu ymm11, ymmword ptr [rdi + 4*rax + 160] - vmovdqu ymm12, ymmword ptr [rdi + 4*rax + 256] - vmovdqu ymm13, ymmword ptr [rdi + 4*rax + 320] - vmovdqu ymm14, ymmword ptr [rdi + 4*rax + 352] - vpminsd ymm15, ymm8, ymm14 - vpminsd ymm6, ymm6, ymm15 - vmovdqa ymmword ptr [rsp], ymm6 # 32-byte Spill - vpminsd ymm15, ymm9, ymm13 - vpminsd ymm4, ymm4, ymm15 - vpminsd ymm15, ymm10, ymm12 - vpminsd ymm1, ymm1, ymm15 - vmovdqu ymm15, ymmword ptr [rdi + 4*rax + 288] - vpminsd ymm6, ymm11, ymm15 - vpminsd ymm2, ymm2, ymm6 - vpmaxsd ymm6, ymm8, ymm14 - vpmaxsd ymm7, ymm7, ymm6 - vpmaxsd ymm6, ymm9, ymm13 - vpmaxsd ymm5, ymm5, ymm6 - vpmaxsd ymm6, ymm10, ymm12 - vpmaxsd ymm0, ymm0, ymm6 - vpmaxsd ymm6, ymm11, ymm15 - vpmaxsd ymm3, ymm3, ymm6 - vmovdqu ymm6, ymmword ptr [rdi + 4*rax + 416] - vpminsd ymm2, ymm2, ymm6 - 
vpmaxsd ymm3, ymm3, ymm6 - vmovdqu ymm6, ymmword ptr [rdi + 4*rax + 384] - vpminsd ymm1, ymm1, ymm6 - vpmaxsd ymm0, ymm0, ymm6 - vmovdqu ymm6, ymmword ptr [rdi + 4*rax + 448] - vpminsd ymm4, ymm4, ymm6 - vpmaxsd ymm5, ymm5, ymm6 - vmovdqu ymm8, ymmword ptr [rdi + 4*rax + 480] - vpminsd ymm6, ymm8, ymmword ptr [rsp] # 32-byte Folded Reload - vpmaxsd ymm7, ymm7, ymm8 - sub rax, -128 - add r10, 4 - jne .LBB0_9 -.LBB0_10: - test r9, r9 - je .LBB0_13 -# %bb.11: - lea rax, [rdi + 4*rax] - neg r9 - .p2align 4, 0x90 -.LBB0_12: # =>This Inner Loop Header: Depth=1 - vmovdqu ymm8, ymmword ptr [rax] - vmovdqu ymm9, ymmword ptr [rax + 32] - vmovdqu ymm10, ymmword ptr [rax + 64] - vmovdqu ymm11, ymmword ptr [rax + 96] - vpminsd ymm2, ymm2, ymm9 - vpminsd ymm1, ymm1, ymm8 - vpminsd ymm4, ymm4, ymm10 - vpminsd ymm6, ymm6, ymm11 - vpmaxsd ymm3, ymm3, ymm9 - vpmaxsd ymm0, ymm0, ymm8 - vpmaxsd ymm5, ymm5, ymm10 - vpmaxsd ymm7, ymm7, ymm11 - sub rax, -128 - inc r9 - jne .LBB0_12 -.LBB0_13: - vpminsd ymm2, ymm2, ymm6 - vpminsd ymm1, ymm1, ymm4 - vpminsd ymm1, ymm1, ymm2 - vpmaxsd ymm2, ymm3, ymm7 - vpmaxsd ymm0, ymm0, ymm5 - vpmaxsd ymm0, ymm0, ymm2 - vextracti128 xmm2, ymm0, 1 - vpmaxsd xmm0, xmm0, xmm2 - vpshufd xmm2, xmm0, 78 # xmm2 = xmm0[2,3,0,1] - vpmaxsd xmm0, xmm0, xmm2 - vpshufd xmm2, xmm0, 229 # xmm2 = xmm0[1,1,2,3] - vpmaxsd xmm0, xmm0, xmm2 - vmovd eax, xmm0 - vextracti128 xmm0, ymm1, 1 - vpminsd xmm0, xmm1, xmm0 + add rax, 32 + cmp r9, rax + jne .LBB0_5 +# %bb.6: + vpmaxsd ymm4, ymm4, ymm5 + vpmaxsd ymm4, ymm4, ymm6 + vpmaxsd ymm4, ymm4, ymm7 + vextracti128 xmm5, ymm4, 1 + vpmaxsd xmm4, xmm4, xmm5 + vpshufd xmm5, xmm4, 78 # xmm5 = xmm4[2,3,0,1] + vpmaxsd xmm4, xmm4, xmm5 + vpshufd xmm5, xmm4, 229 # xmm5 = xmm4[1,1,2,3] + vpmaxsd xmm4, xmm4, xmm5 + vmovd r10d, xmm4 + vpminsd ymm0, ymm0, ymm1 + vpminsd ymm0, ymm0, ymm2 + vpminsd ymm0, ymm0, ymm3 + vextracti128 xmm1, ymm0, 1 + vpminsd xmm0, xmm0, xmm1 vpshufd xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1] vpminsd xmm0, xmm0, xmm1 
vpshufd xmm1, xmm0, 229 # xmm1 = xmm0[1,1,2,3] vpminsd xmm0, xmm0, xmm1 - vmovd r9d, xmm0 - cmp r11, r8 - je .LBB0_14 -.LBB0_4: - mov esi, eax + vmovd eax, xmm0 + mov esi, r10d + cmp r9, r8 + je .LBB0_8 .p2align 4, 0x90 -.LBB0_5: # =>This Inner Loop Header: Depth=1 - mov eax, dword ptr [rdi + 4*r11] - cmp r9d, eax - cmovg r9d, eax - cmp esi, eax - cmovge eax, esi - add r11, 1 - mov esi, eax - cmp r8, r11 - jne .LBB0_5 -.LBB0_14: - mov dword ptr [rcx], eax - mov dword ptr [rdx], r9d +.LBB0_7: # =>This Inner Loop Header: Depth=1 + mov esi, dword ptr [rdi + 4*r9] + cmp eax, esi + cmovg eax, esi + cmp r10d, esi + cmovge esi, r10d + add r9, 1 + mov r10d, esi + cmp r8, r9 + jne .LBB0_7 +.LBB0_8: + mov dword ptr [rcx], esi + mov dword ptr [rdx], eax mov rsp, rbp pop rbp vzeroupper @@ -196,173 +112,89 @@ uint32_max_min_avx2: # @uint32_max_min_avx2 # %bb.0: push rbp mov rbp, rsp - and rsp, -32 - sub rsp, 64 + and rsp, -8 test esi, esi jle .LBB1_1 # %bb.2: mov r8d, esi cmp esi, 31 - ja .LBB1_6 + ja .LBB1_4 # %bb.3: - xor r11d, r11d - mov r9d, -1 - xor esi, esi - jmp .LBB1_4 + xor r9d, r9d + mov eax, -1 + xor r10d, r10d + jmp .LBB1_7 .LBB1_1: - mov r9d, -1 + mov eax, -1 xor esi, esi - jmp .LBB1_14 -.LBB1_6: - mov r11d, r8d - and r11d, -32 - lea rax, [r11 - 32] - mov r10, rax - shr r10, 5 - add r10, 1 - mov r9d, r10d - and r9d, 3 - cmp rax, 96 - jae .LBB1_8 -# %bb.7: - vpxor xmm0, xmm0, xmm0 - vpcmpeqd ymm1, ymm1, ymm1 + jmp .LBB1_8 +.LBB1_4: + mov r9d, r8d + and r9d, -32 + vpxor xmm4, xmm4, xmm4 + vpcmpeqd ymm0, ymm0, ymm0 xor eax, eax - vpcmpeqd ymm2, ymm2, ymm2 - vpcmpeqd ymm4, ymm4, ymm4 - vpcmpeqd ymm6, ymm6, ymm6 - vpxor xmm3, xmm3, xmm3 - vpxor xmm5, xmm5, xmm5 - vpxor xmm7, xmm7, xmm7 - jmp .LBB1_10 -.LBB1_8: - and r10, -4 - neg r10 - vpxor xmm0, xmm0, xmm0 vpcmpeqd ymm1, ymm1, ymm1 - xor eax, eax vpcmpeqd ymm2, ymm2, ymm2 - vpcmpeqd ymm4, ymm4, ymm4 - vpcmpeqd ymm6, ymm6, ymm6 - vpxor xmm3, xmm3, xmm3 + vpcmpeqd ymm3, ymm3, ymm3 vpxor xmm5, xmm5, xmm5 + vpxor xmm6, 
xmm6, xmm6 vpxor xmm7, xmm7, xmm7 .p2align 4, 0x90 -.LBB1_9: # =>This Inner Loop Header: Depth=1 +.LBB1_5: # =>This Inner Loop Header: Depth=1 vmovdqu ymm8, ymmword ptr [rdi + 4*rax] vmovdqu ymm9, ymmword ptr [rdi + 4*rax + 32] vmovdqu ymm10, ymmword ptr [rdi + 4*rax + 64] vmovdqu ymm11, ymmword ptr [rdi + 4*rax + 96] - vpminud ymm6, ymm6, ymm11 - vpminud ymm4, ymm4, ymm10 - vpminud ymm1, ymm1, ymm8 - vpminud ymm2, ymm2, ymm9 + vpminud ymm0, ymm0, ymm8 + vpminud ymm1, ymm1, ymm9 + vpminud ymm2, ymm2, ymm10 + vpminud ymm3, ymm3, ymm11 + vpmaxud ymm4, ymm4, ymm8 + vpmaxud ymm5, ymm5, ymm9 + vpmaxud ymm6, ymm6, ymm10 vpmaxud ymm7, ymm7, ymm11 - vpmaxud ymm5, ymm5, ymm10 - vpmaxud ymm0, ymm0, ymm8 - vpmaxud ymm3, ymm3, ymm9 - vmovdqu ymm8, ymmword ptr [rdi + 4*rax + 224] - vmovdqu ymm9, ymmword ptr [rdi + 4*rax + 192] - vmovdqu ymm10, ymmword ptr [rdi + 4*rax + 128] - vmovdqu ymm11, ymmword ptr [rdi + 4*rax + 160] - vmovdqu ymm12, ymmword ptr [rdi + 4*rax + 256] - vmovdqu ymm13, ymmword ptr [rdi + 4*rax + 320] - vmovdqu ymm14, ymmword ptr [rdi + 4*rax + 352] - vpminud ymm15, ymm8, ymm14 - vpminud ymm6, ymm6, ymm15 - vmovdqa ymmword ptr [rsp], ymm6 # 32-byte Spill - vpminud ymm15, ymm9, ymm13 - vpminud ymm4, ymm4, ymm15 - vpminud ymm15, ymm10, ymm12 - vpminud ymm1, ymm1, ymm15 - vmovdqu ymm15, ymmword ptr [rdi + 4*rax + 288] - vpminud ymm6, ymm11, ymm15 - vpminud ymm2, ymm2, ymm6 - vpmaxud ymm6, ymm8, ymm14 - vpmaxud ymm7, ymm7, ymm6 - vpmaxud ymm6, ymm9, ymm13 - vpmaxud ymm5, ymm5, ymm6 - vpmaxud ymm6, ymm10, ymm12 - vpmaxud ymm0, ymm0, ymm6 - vpmaxud ymm6, ymm11, ymm15 - vpmaxud ymm3, ymm3, ymm6 - vmovdqu ymm6, ymmword ptr [rdi + 4*rax + 416] - vpminud ymm2, ymm2, ymm6 - vpmaxud ymm3, ymm3, ymm6 - vmovdqu ymm6, ymmword ptr [rdi + 4*rax + 384] - vpminud ymm1, ymm1, ymm6 - vpmaxud ymm0, ymm0, ymm6 - vmovdqu ymm6, ymmword ptr [rdi + 4*rax + 448] - vpminud ymm4, ymm4, ymm6 - vpmaxud ymm5, ymm5, ymm6 - vmovdqu ymm8, ymmword ptr [rdi + 4*rax + 480] - vpminud ymm6, ymm8, 
ymmword ptr [rsp] # 32-byte Folded Reload - vpmaxud ymm7, ymm7, ymm8 - sub rax, -128 - add r10, 4 - jne .LBB1_9 -.LBB1_10: - test r9, r9 - je .LBB1_13 -# %bb.11: - lea rax, [rdi + 4*rax] - neg r9 - .p2align 4, 0x90 -.LBB1_12: # =>This Inner Loop Header: Depth=1 - vmovdqu ymm8, ymmword ptr [rax] - vmovdqu ymm9, ymmword ptr [rax + 32] - vmovdqu ymm10, ymmword ptr [rax + 64] - vmovdqu ymm11, ymmword ptr [rax + 96] - vpminud ymm2, ymm2, ymm9 - vpminud ymm1, ymm1, ymm8 - vpminud ymm4, ymm4, ymm10 - vpminud ymm6, ymm6, ymm11 - vpmaxud ymm3, ymm3, ymm9 - vpmaxud ymm0, ymm0, ymm8 - vpmaxud ymm5, ymm5, ymm10 - vpmaxud ymm7, ymm7, ymm11 - sub rax, -128 - inc r9 - jne .LBB1_12 -.LBB1_13: - vpminud ymm2, ymm2, ymm6 - vpminud ymm1, ymm1, ymm4 - vpminud ymm1, ymm1, ymm2 - vpmaxud ymm2, ymm3, ymm7 - vpmaxud ymm0, ymm0, ymm5 - vpmaxud ymm0, ymm0, ymm2 - vextracti128 xmm2, ymm0, 1 - vpmaxud xmm0, xmm0, xmm2 - vpshufd xmm2, xmm0, 78 # xmm2 = xmm0[2,3,0,1] - vpmaxud xmm0, xmm0, xmm2 - vpshufd xmm2, xmm0, 229 # xmm2 = xmm0[1,1,2,3] - vpmaxud xmm0, xmm0, xmm2 - vmovd esi, xmm0 - vextracti128 xmm0, ymm1, 1 - vpminud xmm0, xmm1, xmm0 + add rax, 32 + cmp r9, rax + jne .LBB1_5 +# %bb.6: + vpmaxud ymm4, ymm4, ymm5 + vpmaxud ymm4, ymm4, ymm6 + vpmaxud ymm4, ymm4, ymm7 + vextracti128 xmm5, ymm4, 1 + vpmaxud xmm4, xmm4, xmm5 + vpshufd xmm5, xmm4, 78 # xmm5 = xmm4[2,3,0,1] + vpmaxud xmm4, xmm4, xmm5 + vpshufd xmm5, xmm4, 229 # xmm5 = xmm4[1,1,2,3] + vpmaxud xmm4, xmm4, xmm5 + vmovd r10d, xmm4 + vpminud ymm0, ymm0, ymm1 + vpminud ymm0, ymm0, ymm2 + vpminud ymm0, ymm0, ymm3 + vextracti128 xmm1, ymm0, 1 + vpminud xmm0, xmm0, xmm1 vpshufd xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1] vpminud xmm0, xmm0, xmm1 vpshufd xmm1, xmm0, 229 # xmm1 = xmm0[1,1,2,3] vpminud xmm0, xmm0, xmm1 - vmovd r9d, xmm0 - cmp r11, r8 - je .LBB1_14 -.LBB1_4: - mov eax, esi + vmovd eax, xmm0 + mov esi, r10d + cmp r9, r8 + je .LBB1_8 .p2align 4, 0x90 -.LBB1_5: # =>This Inner Loop Header: Depth=1 - mov esi, dword ptr [rdi + 4*r11] - 
cmp r9d, esi - cmovae r9d, esi +.LBB1_7: # =>This Inner Loop Header: Depth=1 + mov esi, dword ptr [rdi + 4*r9] cmp eax, esi - cmova esi, eax - add r11, 1 - mov eax, esi - cmp r8, r11 - jne .LBB1_5 -.LBB1_14: + cmovae eax, esi + cmp r10d, esi + cmova esi, r10d + add r9, 1 + mov r10d, esi + cmp r8, r9 + jne .LBB1_7 +.LBB1_8: mov dword ptr [rcx], esi - mov dword ptr [rdx], r9d + mov dword ptr [rdx], eax mov rsp, rbp pop rbp vzeroupper @@ -384,387 +216,102 @@ int64_max_min_avx2: # @int64_max_min_avx2 # %bb.0: push rbp mov rbp, rsp - and rsp, -32 - sub rsp, 224 - movabs r9, 9223372036854775807 + and rsp, -8 + movabs rax, 9223372036854775807 test esi, esi jle .LBB2_1 # %bb.2: mov r8d, esi - cmp esi, 31 - ja .LBB2_6 + cmp esi, 15 + ja .LBB2_4 # %bb.3: - lea rsi, [r9 + 1] - xor r11d, r11d - jmp .LBB2_4 + lea r10, [rax + 1] + xor r9d, r9d + jmp .LBB2_7 .LBB2_1: - lea rsi, [r9 + 1] - jmp .LBB2_14 -.LBB2_6: - mov r11d, r8d - and r11d, -32 - lea rax, [r11 - 32] - mov r10, rax - shr r10, 5 - add r10, 1 - mov r9d, r10d - and r9d, 3 - cmp rax, 96 - jae .LBB2_8 -# %bb.7: - vpbroadcastq ymm15, qword ptr [rip + .LCPI2_0] # ymm15 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] - vpbroadcastq ymm11, qword ptr [rip + .LCPI2_1] # ymm11 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] - xor eax, eax - vmovdqa ymmword ptr [rsp + 32], ymm11 # 32-byte Spill - vmovdqa ymm3, ymm11 - vmovdqa ymm9, ymm11 - vmovdqa ymm5, ymm11 - vmovdqa ymm4, ymm11 - vmovdqa ymm6, ymm11 - vmovdqa ymmword ptr [rsp + 96], ymm11 # 32-byte Spill - vmovdqa ymmword ptr [rsp + 64], ymm15 # 32-byte Spill - vmovdqa ymm2, ymm15 - vmovdqa ymm8, ymm15 - vmovdqa ymm12, ymm15 - vmovdqa ymm13, ymm15 - vmovdqa ymm14, ymm15 - vmovdqa ymmword ptr [rsp], ymm15 # 32-byte Spill - jmp .LBB2_10 -.LBB2_8: - and r10, -4 - vpbroadcastq ymm15, qword ptr [rip + .LCPI2_0] # ymm15 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] - neg 
r10 - vpbroadcastq ymm11, qword ptr [rip + .LCPI2_1] # ymm11 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] + lea rsi, [rax + 1] + jmp .LBB2_8 +.LBB2_4: + mov r9d, r8d + vpbroadcastq ymm4, qword ptr [rip + .LCPI2_0] # ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] + and r9d, -16 + vpbroadcastq ymm0, qword ptr [rip + .LCPI2_1] # ymm0 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] xor eax, eax - vmovdqa ymmword ptr [rsp + 32], ymm11 # 32-byte Spill - vmovdqa ymm3, ymm11 - vmovdqa ymm9, ymm11 - vmovdqa ymm5, ymm11 - vmovdqa ymm4, ymm11 - vmovdqa ymm6, ymm11 - vmovdqa ymmword ptr [rsp + 96], ymm11 # 32-byte Spill - vmovdqa ymmword ptr [rsp + 64], ymm15 # 32-byte Spill - vmovdqa ymm2, ymm15 - vmovdqa ymm8, ymm15 - vmovdqa ymm12, ymm15 - vmovdqa ymm13, ymm15 - vmovdqa ymm14, ymm15 - vmovdqa ymmword ptr [rsp], ymm15 # 32-byte Spill - .p2align 4, 0x90 -.LBB2_9: # =>This Inner Loop Header: Depth=1 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 224] - vmovdqa ymm10, ymm8 - vmovdqa ymm8, ymm2 - vmovdqa ymm2, ymm3 - vmovdqa ymm3, ymm9 - vpcmpgtq ymm9, ymm0, ymm11 - vblendvpd ymm1, ymm0, ymm11, ymm9 - vmovapd ymmword ptr [rsp + 160], ymm1 # 32-byte Spill - vpcmpgtq ymm9, ymm15, ymm0 - vblendvpd ymm0, ymm0, ymm15, ymm9 - vmovapd ymmword ptr [rsp + 128], ymm0 # 32-byte Spill - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 192] - vpcmpgtq ymm9, ymm0, ymm6 - vblendvpd ymm7, ymm0, ymm6, ymm9 - vpcmpgtq ymm9, ymm14, ymm0 - vblendvpd ymm14, ymm0, ymm14, ymm9 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 160] - vpcmpgtq ymm9, ymm0, ymm4 - vblendvpd ymm6, ymm0, ymm4, ymm9 - vpcmpgtq ymm9, ymm13, ymm0 - vblendvpd ymm13, ymm0, ymm13, ymm9 - vmovdqu ymm9, ymmword ptr [rdi + 8*rax + 128] - vpcmpgtq ymm0, ymm9, ymm5 - vblendvpd ymm1, ymm9, ymm5, ymm0 - vpcmpgtq ymm5, ymm12, ymm9 - vblendvpd ymm12, ymm9, ymm12, ymm5 - vmovdqu ymm5, ymmword ptr [rdi + 8*rax + 96] - vpcmpgtq ymm9, ymm5, ymm3 
- vblendvpd ymm9, ymm5, ymm3, ymm9 - vpcmpgtq ymm4, ymm10, ymm5 - vblendvpd ymm10, ymm5, ymm10, ymm4 - vmovdqu ymm4, ymmword ptr [rdi + 8*rax + 64] - vpcmpgtq ymm5, ymm4, ymm2 - vblendvpd ymm5, ymm4, ymm2, ymm5 - vpcmpgtq ymm3, ymm8, ymm4 - vblendvpd ymm0, ymm4, ymm8, ymm3 - vmovdqu ymm2, ymmword ptr [rdi + 8*rax] - vmovdqa ymm4, ymmword ptr [rsp + 96] # 32-byte Reload - vpcmpgtq ymm3, ymm2, ymm4 - vblendvpd ymm3, ymm2, ymm4, ymm3 - vmovdqa ymm11, ymmword ptr [rsp] # 32-byte Reload - vpcmpgtq ymm4, ymm11, ymm2 - vblendvpd ymm4, ymm2, ymm11, ymm4 - vmovdqu ymm2, ymmword ptr [rdi + 8*rax + 32] - vmovdqa ymm15, ymmword ptr [rsp + 32] # 32-byte Reload - vpcmpgtq ymm11, ymm2, ymm15 - vblendvpd ymm11, ymm2, ymm15, ymm11 - vmovdqa ymm8, ymmword ptr [rsp + 64] # 32-byte Reload - vpcmpgtq ymm15, ymm8, ymm2 - vblendvpd ymm2, ymm2, ymm8, ymm15 - vmovdqu ymm8, ymmword ptr [rdi + 8*rax + 288] - vpcmpgtq ymm15, ymm8, ymm11 - vblendvpd ymm11, ymm8, ymm11, ymm15 - vmovapd ymmword ptr [rsp + 32], ymm11 # 32-byte Spill - vpcmpgtq ymm11, ymm2, ymm8 - vblendvpd ymm2, ymm8, ymm2, ymm11 - vmovapd ymmword ptr [rsp], ymm2 # 32-byte Spill - vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 256] - vpcmpgtq ymm2, ymm11, ymm3 - vblendvpd ymm8, ymm11, ymm3, ymm2 - vpcmpgtq ymm3, ymm4, ymm11 - vblendvpd ymm3, ymm11, ymm4, ymm3 - vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 320] - vpcmpgtq ymm4, ymm11, ymm5 - vblendvpd ymm4, ymm11, ymm5, ymm4 - vpcmpgtq ymm5, ymm0, ymm11 - vblendvpd ymm5, ymm11, ymm0, ymm5 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 352] - vpcmpgtq ymm11, ymm0, ymm9 - vblendvpd ymm9, ymm0, ymm9, ymm11 - vpcmpgtq ymm11, ymm10, ymm0 - vblendvpd ymm10, ymm0, ymm10, ymm11 - vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 384] - vpcmpgtq ymm0, ymm11, ymm1 - vblendvpd ymm2, ymm11, ymm1, ymm0 - vpcmpgtq ymm1, ymm12, ymm11 - vblendvpd ymm12, ymm11, ymm12, ymm1 - vmovdqu ymm1, ymmword ptr [rdi + 8*rax + 416] - vpcmpgtq ymm11, ymm1, ymm6 - vblendvpd ymm6, ymm1, ymm6, ymm11 - vpcmpgtq ymm11, ymm13, ymm1 - 
vblendvpd ymm1, ymm1, ymm13, ymm11 - vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 448] - vpcmpgtq ymm13, ymm11, ymm7 - vblendvpd ymm7, ymm11, ymm7, ymm13 - vpcmpgtq ymm13, ymm14, ymm11 - vblendvpd ymm13, ymm11, ymm14, ymm13 - vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 480] - vmovdqa ymm0, ymmword ptr [rsp + 160] # 32-byte Reload - vpcmpgtq ymm14, ymm11, ymm0 - vblendvpd ymm14, ymm11, ymm0, ymm14 - vmovdqa ymm0, ymmword ptr [rsp + 128] # 32-byte Reload - vpcmpgtq ymm15, ymm0, ymm11 - vblendvpd ymm15, ymm11, ymm0, ymm15 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 736] - vpcmpgtq ymm11, ymm0, ymm14 - vblendvpd ymm11, ymm0, ymm14, ymm11 - vmovapd ymmword ptr [rsp + 160], ymm11 # 32-byte Spill - vpcmpgtq ymm14, ymm15, ymm0 - vblendvpd ymm0, ymm0, ymm15, ymm14 - vmovapd ymmword ptr [rsp + 128], ymm0 # 32-byte Spill - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 704] - vpcmpgtq ymm14, ymm0, ymm7 - vblendvpd ymm7, ymm0, ymm7, ymm14 - vpcmpgtq ymm14, ymm13, ymm0 - vblendvpd ymm14, ymm0, ymm13, ymm14 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 672] - vpcmpgtq ymm13, ymm0, ymm6 - vblendvpd ymm6, ymm0, ymm6, ymm13 - vpcmpgtq ymm13, ymm1, ymm0 - vblendvpd ymm13, ymm0, ymm1, ymm13 - vmovdqu ymm1, ymmword ptr [rdi + 8*rax + 640] - vpcmpgtq ymm0, ymm1, ymm2 - vblendvpd ymm0, ymm1, ymm2, ymm0 - vpcmpgtq ymm2, ymm12, ymm1 - vblendvpd ymm12, ymm1, ymm12, ymm2 - vmovdqu ymm1, ymmword ptr [rdi + 8*rax + 608] - vpcmpgtq ymm2, ymm1, ymm9 - vblendvpd ymm9, ymm1, ymm9, ymm2 - vpcmpgtq ymm2, ymm10, ymm1 - vblendvpd ymm10, ymm1, ymm10, ymm2 - vmovdqu ymm1, ymmword ptr [rdi + 8*rax + 576] - vpcmpgtq ymm2, ymm1, ymm4 - vblendvpd ymm2, ymm1, ymm4, ymm2 - vpcmpgtq ymm4, ymm5, ymm1 - vblendvpd ymm1, ymm1, ymm5, ymm4 - vmovdqu ymm4, ymmword ptr [rdi + 8*rax + 512] - vpcmpgtq ymm5, ymm4, ymm8 - vblendvpd ymm5, ymm4, ymm8, ymm5 - vpcmpgtq ymm8, ymm3, ymm4 - vblendvpd ymm3, ymm4, ymm3, ymm8 - vmovdqu ymm4, ymmword ptr [rdi + 8*rax + 544] - vmovdqa ymm11, ymmword ptr [rsp + 32] # 32-byte Reload - vpcmpgtq ymm8, 
ymm4, ymm11 - vblendvpd ymm8, ymm4, ymm11, ymm8 - vmovdqa ymm15, ymmword ptr [rsp] # 32-byte Reload - vpcmpgtq ymm11, ymm15, ymm4 - vblendvpd ymm4, ymm4, ymm15, ymm11 - vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 800] - vpcmpgtq ymm15, ymm11, ymm8 - vblendvpd ymm8, ymm11, ymm8, ymm15 - vmovapd ymmword ptr [rsp + 32], ymm8 # 32-byte Spill - vpcmpgtq ymm8, ymm4, ymm11 - vblendvpd ymm4, ymm11, ymm4, ymm8 - vmovapd ymmword ptr [rsp + 64], ymm4 # 32-byte Spill - vmovdqu ymm4, ymmword ptr [rdi + 8*rax + 768] - vpcmpgtq ymm11, ymm4, ymm5 - vblendvpd ymm5, ymm4, ymm5, ymm11 - vmovapd ymmword ptr [rsp + 96], ymm5 # 32-byte Spill - vpcmpgtq ymm5, ymm3, ymm4 - vblendvpd ymm3, ymm4, ymm3, ymm5 - vmovapd ymmword ptr [rsp], ymm3 # 32-byte Spill - vmovdqu ymm4, ymmword ptr [rdi + 8*rax + 832] - vpcmpgtq ymm3, ymm4, ymm2 - vblendvpd ymm3, ymm4, ymm2, ymm3 - vpcmpgtq ymm2, ymm1, ymm4 - vblendvpd ymm2, ymm4, ymm1, ymm2 - vmovdqu ymm1, ymmword ptr [rdi + 8*rax + 864] - vpcmpgtq ymm4, ymm1, ymm9 - vblendvpd ymm9, ymm1, ymm9, ymm4 - vpcmpgtq ymm5, ymm10, ymm1 - vblendvpd ymm8, ymm1, ymm10, ymm5 - vmovdqu ymm1, ymmword ptr [rdi + 8*rax + 896] - vpcmpgtq ymm5, ymm1, ymm0 - vblendvpd ymm5, ymm1, ymm0, ymm5 - vpcmpgtq ymm0, ymm12, ymm1 - vblendvpd ymm12, ymm1, ymm12, ymm0 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 928] - vpcmpgtq ymm1, ymm0, ymm6 - vblendvpd ymm4, ymm0, ymm6, ymm1 - vpcmpgtq ymm1, ymm13, ymm0 - vblendvpd ymm13, ymm0, ymm13, ymm1 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 960] - vpcmpgtq ymm1, ymm0, ymm7 - vblendvpd ymm6, ymm0, ymm7, ymm1 - vpcmpgtq ymm1, ymm14, ymm0 - vblendvpd ymm14, ymm0, ymm14, ymm1 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 992] - vmovdqa ymm7, ymmword ptr [rsp + 160] # 32-byte Reload - vpcmpgtq ymm1, ymm0, ymm7 - vblendvpd ymm11, ymm0, ymm7, ymm1 - vmovdqa ymm7, ymmword ptr [rsp + 128] # 32-byte Reload - vpcmpgtq ymm1, ymm7, ymm0 - vblendvpd ymm15, ymm0, ymm7, ymm1 - sub rax, -128 - add r10, 4 - jne .LBB2_9 -.LBB2_10: - test r9, r9 - vmovdqa ymm7, ymm5 
- vmovdqa ymm5, ymm9 - vmovdqa ymm9, ymmword ptr [rsp + 96] # 32-byte Reload - vmovdqa ymm10, ymm3 - je .LBB2_13 -# %bb.11: - lea rax, [rdi + 8*rax] - neg r9 + vmovdqa ymm3, ymm0 + vmovdqa ymm2, ymm0 + vmovdqa ymm1, ymm0 + vmovdqa ymm7, ymm4 + vmovdqa ymm6, ymm4 + vmovdqa ymm5, ymm4 .p2align 4, 0x90 -.LBB2_12: # =>This Inner Loop Header: Depth=1 - vmovdqu ymm0, ymmword ptr [rax + 32] - vmovdqa ymm3, ymmword ptr [rsp + 32] # 32-byte Reload - vpcmpgtq ymm1, ymm0, ymm3 - vblendvpd ymm3, ymm0, ymm3, ymm1 - vmovapd ymmword ptr [rsp + 32], ymm3 # 32-byte Spill - vmovdqa ymm3, ymmword ptr [rsp + 64] # 32-byte Reload - vpcmpgtq ymm1, ymm3, ymm0 - vblendvpd ymm3, ymm0, ymm3, ymm1 - vmovapd ymmword ptr [rsp + 64], ymm3 # 32-byte Spill - vmovdqu ymm0, ymmword ptr [rax] - vpcmpgtq ymm1, ymm0, ymm9 - vblendvpd ymm9, ymm0, ymm9, ymm1 - vmovdqa ymm3, ymmword ptr [rsp] # 32-byte Reload - vpcmpgtq ymm1, ymm3, ymm0 - vblendvpd ymm3, ymm0, ymm3, ymm1 - vmovapd ymmword ptr [rsp], ymm3 # 32-byte Spill - vmovdqu ymm0, ymmword ptr [rax + 64] - vpcmpgtq ymm1, ymm0, ymm10 - vblendvpd ymm10, ymm0, ymm10, ymm1 - vpcmpgtq ymm1, ymm2, ymm0 - vblendvpd ymm2, ymm0, ymm2, ymm1 - vmovdqu ymm0, ymmword ptr [rax + 96] - vpcmpgtq ymm1, ymm0, ymm5 - vblendvpd ymm5, ymm0, ymm5, ymm1 - vpcmpgtq ymm1, ymm8, ymm0 - vblendvpd ymm8, ymm0, ymm8, ymm1 - vmovdqu ymm0, ymmword ptr [rax + 128] - vpcmpgtq ymm1, ymm0, ymm7 - vblendvpd ymm7, ymm0, ymm7, ymm1 - vpcmpgtq ymm1, ymm12, ymm0 - vblendvpd ymm12, ymm0, ymm12, ymm1 - vmovdqu ymm0, ymmword ptr [rax + 160] - vpcmpgtq ymm1, ymm0, ymm4 - vblendvpd ymm4, ymm0, ymm4, ymm1 - vpcmpgtq ymm1, ymm13, ymm0 - vblendvpd ymm13, ymm0, ymm13, ymm1 - vmovdqu ymm0, ymmword ptr [rax + 192] - vpcmpgtq ymm1, ymm0, ymm6 - vblendvpd ymm6, ymm0, ymm6, ymm1 - vpcmpgtq ymm1, ymm14, ymm0 - vblendvpd ymm14, ymm0, ymm14, ymm1 - vmovdqu ymm0, ymmword ptr [rax + 224] - vpcmpgtq ymm1, ymm0, ymm11 - vblendvpd ymm11, ymm0, ymm11, ymm1 - vpcmpgtq ymm1, ymm15, ymm0 - vblendvpd ymm15, ymm0, 
ymm15, ymm1 - add rax, 256 - inc r9 - jne .LBB2_12 -.LBB2_13: - vmovdqa ymm1, ymmword ptr [rsp + 64] # 32-byte Reload - vpcmpgtq ymm0, ymm1, ymm13 - vblendvpd ymm0, ymm13, ymm1, ymm0 - vpcmpgtq ymm1, ymm8, ymm15 - vblendvpd ymm1, ymm15, ymm8, ymm1 - vmovdqa ymm3, ymmword ptr [rsp] # 32-byte Reload - vpcmpgtq ymm8, ymm3, ymm12 - vblendvpd ymm8, ymm12, ymm3, ymm8 - vmovdqa ymm3, ymm9 - vpcmpgtq ymm9, ymm2, ymm14 - vblendvpd ymm2, ymm14, ymm2, ymm9 - vpcmpgtq ymm9, ymm8, ymm2 - vblendvpd ymm2, ymm2, ymm8, ymm9 - vpcmpgtq ymm8, ymm0, ymm1 - vblendvpd ymm0, ymm1, ymm0, ymm8 - vpcmpgtq ymm1, ymm2, ymm0 - vblendvpd ymm0, ymm0, ymm2, ymm1 +.LBB2_5: # =>This Inner Loop Header: Depth=1 + vmovdqu ymm8, ymmword ptr [rdi + 8*rax] + vpcmpgtq ymm9, ymm8, ymm0 + vblendvpd ymm0, ymm8, ymm0, ymm9 + vmovdqu ymm9, ymmword ptr [rdi + 8*rax + 32] + vpcmpgtq ymm10, ymm9, ymm3 + vblendvpd ymm3, ymm9, ymm3, ymm10 + vmovdqu ymm10, ymmword ptr [rdi + 8*rax + 64] + vpcmpgtq ymm11, ymm10, ymm2 + vblendvpd ymm2, ymm10, ymm2, ymm11 + vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 96] + vpcmpgtq ymm12, ymm11, ymm1 + vblendvpd ymm1, ymm11, ymm1, ymm12 + vpcmpgtq ymm12, ymm4, ymm8 + vblendvpd ymm4, ymm8, ymm4, ymm12 + vpcmpgtq ymm8, ymm7, ymm9 + vblendvpd ymm7, ymm9, ymm7, ymm8 + vpcmpgtq ymm8, ymm6, ymm10 + vblendvpd ymm6, ymm10, ymm6, ymm8 + vpcmpgtq ymm8, ymm5, ymm11 + vblendvpd ymm5, ymm11, ymm5, ymm8 + add rax, 16 + cmp r9, rax + jne .LBB2_5 +# %bb.6: + vpcmpgtq ymm8, ymm4, ymm7 + vblendvpd ymm4, ymm7, ymm4, ymm8 + vpcmpgtq ymm7, ymm4, ymm6 + vblendvpd ymm4, ymm6, ymm4, ymm7 + vpcmpgtq ymm6, ymm4, ymm5 + vblendvpd ymm4, ymm5, ymm4, ymm6 + vextractf128 xmm5, ymm4, 1 + vpcmpgtq xmm6, xmm4, xmm5 + vblendvpd xmm4, xmm5, xmm4, xmm6 + vpermilps xmm5, xmm4, 78 # xmm5 = xmm4[2,3,0,1] + vpcmpgtq xmm6, xmm4, xmm5 + vblendvpd xmm4, xmm5, xmm4, xmm6 + vmovq r10, xmm4 + vpcmpgtq ymm4, ymm3, ymm0 + vblendvpd ymm0, ymm3, ymm0, ymm4 + vpcmpgtq ymm3, ymm2, ymm0 + vblendvpd ymm0, ymm2, ymm0, ymm3 + vpcmpgtq ymm2, 
ymm1, ymm0 + vblendvpd ymm0, ymm1, ymm0, ymm2 vextractf128 xmm1, ymm0, 1 - vpcmpgtq xmm2, xmm0, xmm1 + vpcmpgtq xmm2, xmm1, xmm0 vblendvpd xmm0, xmm1, xmm0, xmm2 vpermilps xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1] - vpcmpgtq xmm2, xmm0, xmm1 + vpcmpgtq xmm2, xmm1, xmm0 vblendvpd xmm0, xmm1, xmm0, xmm2 - vmovdqa ymm2, ymmword ptr [rsp + 32] # 32-byte Reload - vpcmpgtq ymm1, ymm4, ymm2 - vblendvpd ymm1, ymm4, ymm2, ymm1 - vpcmpgtq ymm2, ymm11, ymm5 - vblendvpd ymm2, ymm11, ymm5, ymm2 - vpcmpgtq ymm4, ymm7, ymm3 - vblendvpd ymm4, ymm7, ymm3, ymm4 - vpcmpgtq ymm5, ymm6, ymm10 - vblendvpd ymm3, ymm6, ymm10, ymm5 - vpcmpgtq ymm5, ymm3, ymm4 - vblendvpd ymm3, ymm3, ymm4, ymm5 - vpcmpgtq ymm4, ymm2, ymm1 - vblendvpd ymm1, ymm2, ymm1, ymm4 - vpcmpgtq ymm2, ymm1, ymm3 - vblendvpd ymm1, ymm1, ymm3, ymm2 - vextractf128 xmm2, ymm1, 1 - vpcmpgtq xmm3, xmm2, xmm1 - vblendvpd xmm1, xmm2, xmm1, xmm3 - vpermilps xmm2, xmm1, 78 # xmm2 = xmm1[2,3,0,1] - vpcmpgtq xmm3, xmm2, xmm1 - vblendvpd xmm1, xmm2, xmm1, xmm3 - vmovq rsi, xmm0 - vmovq r9, xmm1 - cmp r11, r8 - je .LBB2_14 -.LBB2_4: - mov rax, rsi + vmovq rax, xmm0 + mov rsi, r10 + cmp r9, r8 + je .LBB2_8 .p2align 4, 0x90 -.LBB2_5: # =>This Inner Loop Header: Depth=1 - mov rsi, qword ptr [rdi + 8*r11] - cmp r9, rsi - cmovg r9, rsi +.LBB2_7: # =>This Inner Loop Header: Depth=1 + mov rsi, qword ptr [rdi + 8*r9] cmp rax, rsi - cmovge rsi, rax - add r11, 1 - mov rax, rsi - cmp r8, r11 - jne .LBB2_5 -.LBB2_14: + cmovg rax, rsi + cmp r10, rsi + cmovge rsi, r10 + add r9, 1 + mov r10, rsi + cmp r8, r9 + jne .LBB2_7 +.LBB2_8: mov qword ptr [rcx], rsi - mov qword ptr [rdx], r9 + mov qword ptr [rdx], rax mov rsp, rbp pop rbp vzeroupper @@ -784,576 +331,136 @@ uint64_max_min_avx2: # @uint64_max_min_avx2 # %bb.0: push rbp mov rbp, rsp - and rsp, -32 - sub rsp, 288 + and rsp, -8 test esi, esi jle .LBB3_1 # %bb.2: mov r8d, esi - cmp esi, 31 - ja .LBB3_6 + cmp esi, 15 + ja .LBB3_4 # %bb.3: - mov r9, -1 - xor r11d, r11d - xor esi, esi - jmp .LBB3_4 + 
mov rax, -1 + xor r9d, r9d + xor r10d, r10d + jmp .LBB3_7 .LBB3_1: - mov r9, -1 + mov rax, -1 xor esi, esi - jmp .LBB3_14 -.LBB3_6: - mov r11d, r8d - and r11d, -32 - lea rax, [r11 - 32] - mov r10, rax - shr r10, 5 - add r10, 1 - mov r9d, r10d - and r9d, 3 - cmp rax, 96 - jae .LBB3_8 -# %bb.7: - vpxor xmm4, xmm4, xmm4 - vpcmpeqd ymm0, ymm0, ymm0 - vmovdqa ymmword ptr [rsp + 64], ymm0 # 32-byte Spill - xor eax, eax - vpcmpeqd ymm0, ymm0, ymm0 - vmovdqa ymmword ptr [rsp + 96], ymm0 # 32-byte Spill - vpcmpeqd ymm5, ymm5, ymm5 - vpcmpeqd ymm7, ymm7, ymm7 - vpcmpeqd ymm12, ymm12, ymm12 - vpcmpeqd ymm10, ymm10, ymm10 - vpcmpeqd ymm11, ymm11, ymm11 - vpcmpeqd ymm13, ymm13, ymm13 - vpxor xmm0, xmm0, xmm0 - vmovdqa ymmword ptr [rsp + 32], ymm0 # 32-byte Spill - vpxor xmm0, xmm0, xmm0 - vmovdqa ymmword ptr [rsp], ymm0 # 32-byte Spill - vpxor xmm3, xmm3, xmm3 - vpxor xmm9, xmm9, xmm9 - vpxor xmm8, xmm8, xmm8 - vpxor xmm15, xmm15, xmm15 - vpxor xmm0, xmm0, xmm0 - jmp .LBB3_10 -.LBB3_8: - and r10, -4 - neg r10 - vpxor xmm4, xmm4, xmm4 - vpcmpeqd ymm0, ymm0, ymm0 - vmovdqa ymmword ptr [rsp + 64], ymm0 # 32-byte Spill + jmp .LBB3_8 +.LBB3_4: + mov r9d, r8d + and r9d, -16 + vpxor xmm5, xmm5, xmm5 + vpcmpeqd ymm1, ymm1, ymm1 xor eax, eax - vpbroadcastq ymm14, qword ptr [rip + .LCPI3_0] # ymm14 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] - vpcmpeqd ymm0, ymm0, ymm0 - vmovdqa ymmword ptr [rsp + 96], ymm0 # 32-byte Spill - vpcmpeqd ymm5, ymm5, ymm5 - vpcmpeqd ymm7, ymm7, ymm7 - vpcmpeqd ymm12, ymm12, ymm12 - vpcmpeqd ymm10, ymm10, ymm10 - vpcmpeqd ymm11, ymm11, ymm11 - vpcmpeqd ymm13, ymm13, ymm13 - vpxor xmm0, xmm0, xmm0 - vmovdqa ymmword ptr [rsp + 32], ymm0 # 32-byte Spill - vpxor xmm0, xmm0, xmm0 - vmovdqa ymmword ptr [rsp], ymm0 # 32-byte Spill - vpxor xmm3, xmm3, xmm3 - vpxor xmm9, xmm9, xmm9 + vpbroadcastq ymm0, qword ptr [rip + .LCPI3_0] # ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] + vpcmpeqd 
ymm4, ymm4, ymm4 + vpcmpeqd ymm3, ymm3, ymm3 + vpcmpeqd ymm2, ymm2, ymm2 vpxor xmm8, xmm8, xmm8 - vpxor xmm15, xmm15, xmm15 - vpxor xmm0, xmm0, xmm0 + vpxor xmm7, xmm7, xmm7 + vpxor xmm6, xmm6, xmm6 .p2align 4, 0x90 -.LBB3_9: # =>This Inner Loop Header: Depth=1 - vmovdqu ymm1, ymmword ptr [rdi + 8*rax + 224] - vpxor ymm2, ymm14, ymm1 - vmovdqa ymm6, ymm3 - vpxor ymm3, ymm13, ymm14 - vpcmpgtq ymm3, ymm2, ymm3 - vblendvpd ymm3, ymm1, ymm13, ymm3 - vmovapd ymmword ptr [rsp + 128], ymm3 # 32-byte Spill - vpxor ymm3, ymm14, ymm0 - vpcmpgtq ymm2, ymm3, ymm2 - vblendvpd ymm0, ymm1, ymm0, ymm2 - vmovapd ymmword ptr [rsp + 224], ymm0 # 32-byte Spill - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 192] - vpxor ymm1, ymm14, ymm0 - vpxor ymm2, ymm11, ymm14 - vpcmpgtq ymm2, ymm1, ymm2 - vblendvpd ymm2, ymm0, ymm11, ymm2 - vmovapd ymmword ptr [rsp + 160], ymm2 # 32-byte Spill - vpxor ymm2, ymm15, ymm14 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm0, ymm0, ymm15, ymm1 - vmovapd ymmword ptr [rsp + 192], ymm0 # 32-byte Spill - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 160] - vpxor ymm1, ymm14, ymm0 - vpxor ymm2, ymm10, ymm14 - vpcmpgtq ymm2, ymm1, ymm2 - vmovdqa ymm3, ymm8 - vblendvpd ymm8, ymm0, ymm10, ymm2 - vpxor ymm2, ymm14, ymm3 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm13, ymm0, ymm3, ymm1 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 128] - vpxor ymm2, ymm14, ymm0 - vpxor ymm1, ymm12, ymm14 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm1, ymm0, ymm12, ymm1 - vpxor ymm3, ymm9, ymm14 - vpcmpgtq ymm2, ymm3, ymm2 - vblendvpd ymm12, ymm0, ymm9, ymm2 - vmovdqu ymm2, ymmword ptr [rdi + 8*rax + 96] - vpxor ymm0, ymm14, ymm7 - vpxor ymm3, ymm14, ymm2 - vpcmpgtq ymm0, ymm3, ymm0 - vblendvpd ymm0, ymm2, ymm7, ymm0 - vmovdqa ymm15, ymm4 - vpxor ymm4, ymm14, ymm6 - vpcmpgtq ymm3, ymm4, ymm3 - vblendvpd ymm10, ymm2, ymm6, ymm3 - vmovdqu ymm2, ymmword ptr [rdi + 8*rax + 64] - vpxor ymm3, ymm14, ymm5 - vpxor ymm4, ymm14, ymm2 - vpcmpgtq ymm3, ymm4, ymm3 - vblendvpd ymm5, ymm2, ymm5, ymm3 - vmovdqa ymm6, 
ymmword ptr [rsp] # 32-byte Reload - vpxor ymm3, ymm14, ymm6 - vpcmpgtq ymm3, ymm3, ymm4 - vblendvpd ymm9, ymm2, ymm6, ymm3 - vmovdqu ymm2, ymmword ptr [rdi + 8*rax] - vmovdqa ymm7, ymmword ptr [rsp + 64] # 32-byte Reload - vpxor ymm3, ymm14, ymm7 - vpxor ymm4, ymm14, ymm2 - vpcmpgtq ymm3, ymm4, ymm3 - vblendvpd ymm3, ymm2, ymm7, ymm3 - vpxor ymm11, ymm15, ymm14 - vpcmpgtq ymm4, ymm11, ymm4 - vblendvpd ymm4, ymm2, ymm15, ymm4 - vmovdqu ymm2, ymmword ptr [rdi + 8*rax + 32] - vmovdqa ymm15, ymmword ptr [rsp + 96] # 32-byte Reload - vpxor ymm11, ymm15, ymm14 - vpxor ymm7, ymm14, ymm2 - vpcmpgtq ymm11, ymm7, ymm11 - vblendvpd ymm11, ymm2, ymm15, ymm11 - vmovdqa ymm6, ymmword ptr [rsp + 32] # 32-byte Reload - vpxor ymm15, ymm14, ymm6 - vpcmpgtq ymm7, ymm15, ymm7 - vblendvpd ymm2, ymm2, ymm6, ymm7 - vmovdqu ymm6, ymmword ptr [rdi + 8*rax + 288] - vxorpd ymm7, ymm11, ymm14 - vpxor ymm15, ymm14, ymm6 - vpcmpgtq ymm7, ymm15, ymm7 - vblendvpd ymm7, ymm6, ymm11, ymm7 - vmovapd ymmword ptr [rsp + 96], ymm7 # 32-byte Spill - vxorpd ymm7, ymm14, ymm2 - vpcmpgtq ymm7, ymm7, ymm15 - vblendvpd ymm2, ymm6, ymm2, ymm7 - vmovapd ymmword ptr [rsp + 64], ymm2 # 32-byte Spill - vmovdqu ymm6, ymmword ptr [rdi + 8*rax + 256] - vxorpd ymm7, ymm14, ymm3 - vpxor ymm11, ymm14, ymm6 - vpcmpgtq ymm7, ymm11, ymm7 - vblendvpd ymm2, ymm6, ymm3, ymm7 - vmovapd ymmword ptr [rsp], ymm2 # 32-byte Spill - vxorpd ymm7, ymm14, ymm4 - vpcmpgtq ymm7, ymm7, ymm11 - vblendvpd ymm2, ymm6, ymm4, ymm7 - vmovapd ymmword ptr [rsp + 32], ymm2 # 32-byte Spill - vmovdqu ymm6, ymmword ptr [rdi + 8*rax + 320] - vxorpd ymm7, ymm14, ymm5 - vpxor ymm11, ymm14, ymm6 - vpcmpgtq ymm7, ymm11, ymm7 +.LBB3_5: # =>This Inner Loop Header: Depth=1 + vmovdqu ymm9, ymmword ptr [rdi + 8*rax] + vpxor ymm10, ymm1, ymm0 + vpxor ymm11, ymm9, ymm0 + vpcmpgtq ymm10, ymm11, ymm10 + vblendvpd ymm1, ymm9, ymm1, ymm10 + vpxor ymm10, ymm5, ymm0 + vpcmpgtq ymm10, ymm10, ymm11 + vblendvpd ymm5, ymm9, ymm5, ymm10 + vmovdqu ymm9, ymmword ptr [rdi + 
8*rax + 32] + vpxor ymm10, ymm4, ymm0 + vpxor ymm11, ymm9, ymm0 + vpcmpgtq ymm10, ymm11, ymm10 + vblendvpd ymm4, ymm9, ymm4, ymm10 + vpxor ymm10, ymm8, ymm0 + vpcmpgtq ymm10, ymm10, ymm11 + vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 64] + vblendvpd ymm8, ymm9, ymm8, ymm10 + vpxor ymm9, ymm3, ymm0 + vpxor ymm10, ymm11, ymm0 + vpcmpgtq ymm9, ymm10, ymm9 + vblendvpd ymm3, ymm11, ymm3, ymm9 + vpxor ymm9, ymm7, ymm0 + vpcmpgtq ymm9, ymm9, ymm10 + vblendvpd ymm7, ymm11, ymm7, ymm9 + vmovdqu ymm9, ymmword ptr [rdi + 8*rax + 96] + vpxor ymm10, ymm2, ymm0 + vpxor ymm11, ymm9, ymm0 + vpcmpgtq ymm10, ymm11, ymm10 + vblendvpd ymm2, ymm9, ymm2, ymm10 + vpxor ymm10, ymm6, ymm0 + vpcmpgtq ymm10, ymm10, ymm11 + vblendvpd ymm6, ymm9, ymm6, ymm10 + add rax, 16 + cmp r9, rax + jne .LBB3_5 +# %bb.6: + vpxor ymm9, ymm8, ymm0 + vpxor ymm10, ymm5, ymm0 + vpcmpgtq ymm9, ymm10, ymm9 + vblendvpd ymm5, ymm8, ymm5, ymm9 + vxorpd ymm8, ymm5, ymm0 + vpxor ymm9, ymm7, ymm0 + vpcmpgtq ymm8, ymm8, ymm9 + vblendvpd ymm5, ymm7, ymm5, ymm8 + vxorpd ymm7, ymm5, ymm0 + vpxor ymm8, ymm6, ymm0 + vpcmpgtq ymm7, ymm7, ymm8 vblendvpd ymm5, ymm6, ymm5, ymm7 - vxorpd ymm7, ymm9, ymm14 - vpcmpgtq ymm7, ymm7, ymm11 - vblendvpd ymm7, ymm6, ymm9, ymm7 - vmovdqu ymm6, ymmword ptr [rdi + 8*rax + 352] - vxorpd ymm9, ymm14, ymm0 - vpxor ymm11, ymm14, ymm6 - vpcmpgtq ymm9, ymm11, ymm9 - vblendvpd ymm9, ymm6, ymm0, ymm9 - vxorpd ymm0, ymm10, ymm14 - vpcmpgtq ymm0, ymm0, ymm11 - vblendvpd ymm10, ymm6, ymm10, ymm0 - vmovdqu ymm6, ymmword ptr [rdi + 8*rax + 384] - vxorpd ymm0, ymm14, ymm1 - vpxor ymm11, ymm14, ymm6 - vpcmpgtq ymm0, ymm11, ymm0 - vblendvpd ymm4, ymm6, ymm1, ymm0 - vxorpd ymm1, ymm12, ymm14 - vpcmpgtq ymm1, ymm1, ymm11 - vblendvpd ymm3, ymm6, ymm12, ymm1 - vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 416] - vxorpd ymm6, ymm8, ymm14 - vpxor ymm12, ymm11, ymm14 - vpcmpgtq ymm6, ymm12, ymm6 - vblendvpd ymm6, ymm11, ymm8, ymm6 - vxorpd ymm8, ymm13, ymm14 - vpcmpgtq ymm8, ymm8, ymm12 - vblendvpd ymm12, ymm11, ymm13, 
ymm8 - vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 448] - vmovdqa ymm0, ymmword ptr [rsp + 160] # 32-byte Reload - vpxor ymm8, ymm14, ymm0 - vpxor ymm13, ymm11, ymm14 - vpcmpgtq ymm8, ymm13, ymm8 - vblendvpd ymm8, ymm11, ymm0, ymm8 - vmovdqa ymm0, ymmword ptr [rsp + 192] # 32-byte Reload - vpxor ymm15, ymm14, ymm0 - vpcmpgtq ymm13, ymm15, ymm13 - vblendvpd ymm13, ymm11, ymm0, ymm13 - vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 480] - vmovdqa ymm1, ymmword ptr [rsp + 128] # 32-byte Reload - vpxor ymm15, ymm14, ymm1 - vpxor ymm0, ymm11, ymm14 - vpcmpgtq ymm15, ymm0, ymm15 - vblendvpd ymm1, ymm11, ymm1, ymm15 - vmovdqa ymm2, ymmword ptr [rsp + 224] # 32-byte Reload - vpxor ymm15, ymm14, ymm2 - vpcmpgtq ymm0, ymm15, ymm0 - vblendvpd ymm15, ymm11, ymm2, ymm0 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 736] - vxorpd ymm11, ymm14, ymm1 - vpxor ymm2, ymm14, ymm0 - vpcmpgtq ymm11, ymm2, ymm11 - vblendvpd ymm1, ymm0, ymm1, ymm11 - vmovapd ymmword ptr [rsp + 128], ymm1 # 32-byte Spill - vxorpd ymm1, ymm15, ymm14 - vpcmpgtq ymm1, ymm1, ymm2 - vblendvpd ymm0, ymm0, ymm15, ymm1 - vmovapd ymmword ptr [rsp + 224], ymm0 # 32-byte Spill - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 704] - vxorpd ymm1, ymm8, ymm14 - vpxor ymm2, ymm14, ymm0 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm1, ymm0, ymm8, ymm1 - vmovapd ymmword ptr [rsp + 160], ymm1 # 32-byte Spill - vxorpd ymm1, ymm13, ymm14 - vpcmpgtq ymm1, ymm1, ymm2 - vblendvpd ymm0, ymm0, ymm13, ymm1 - vmovapd ymmword ptr [rsp + 192], ymm0 # 32-byte Spill - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 672] - vxorpd ymm1, ymm14, ymm6 - vpxor ymm2, ymm14, ymm0 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm15, ymm0, ymm6, ymm1 - vxorpd ymm1, ymm12, ymm14 - vpcmpgtq ymm1, ymm1, ymm2 - vblendvpd ymm13, ymm0, ymm12, ymm1 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 640] - vxorpd ymm1, ymm14, ymm4 - vpxor ymm2, ymm14, ymm0 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm12, ymm0, ymm4, ymm1 - vxorpd ymm1, ymm14, ymm3 - vpcmpgtq ymm1, ymm1, ymm2 - vblendvpd ymm8, 
ymm0, ymm3, ymm1 - vmovdqu ymm2, ymmword ptr [rdi + 8*rax + 608] - vxorpd ymm1, ymm9, ymm14 - vpxor ymm3, ymm14, ymm2 - vpcmpgtq ymm1, ymm3, ymm1 - vblendvpd ymm1, ymm2, ymm9, ymm1 - vxorpd ymm4, ymm10, ymm14 - vpcmpgtq ymm3, ymm4, ymm3 - vblendvpd ymm10, ymm2, ymm10, ymm3 - vmovdqu ymm2, ymmword ptr [rdi + 8*rax + 576] - vxorpd ymm3, ymm14, ymm5 - vpxor ymm4, ymm14, ymm2 - vpcmpgtq ymm3, ymm4, ymm3 - vblendvpd ymm5, ymm2, ymm5, ymm3 - vxorpd ymm3, ymm14, ymm7 - vpcmpgtq ymm3, ymm3, ymm4 - vblendvpd ymm9, ymm2, ymm7, ymm3 - vmovdqu ymm2, ymmword ptr [rdi + 8*rax + 512] - vmovdqa ymm0, ymmword ptr [rsp] # 32-byte Reload - vpxor ymm3, ymm14, ymm0 - vpxor ymm4, ymm14, ymm2 - vpcmpgtq ymm3, ymm4, ymm3 - vblendvpd ymm3, ymm2, ymm0, ymm3 - vmovdqa ymm0, ymmword ptr [rsp + 32] # 32-byte Reload - vpxor ymm6, ymm14, ymm0 - vpcmpgtq ymm4, ymm6, ymm4 - vblendvpd ymm4, ymm2, ymm0, ymm4 - vmovdqu ymm2, ymmword ptr [rdi + 8*rax + 544] - vmovdqa ymm0, ymmword ptr [rsp + 96] # 32-byte Reload - vpxor ymm6, ymm14, ymm0 - vpxor ymm7, ymm14, ymm2 + vextractf128 xmm6, ymm5, 1 + vxorpd xmm8, xmm6, xmm0 + vxorpd xmm7, xmm5, xmm0 + vpcmpgtq xmm7, xmm7, xmm8 + vblendvpd xmm5, xmm6, xmm5, xmm7 + vpermilps xmm6, xmm5, 78 # xmm6 = xmm5[2,3,0,1] + vxorpd xmm8, xmm5, xmm0 + vxorpd xmm7, xmm6, xmm0 + vpcmpgtq xmm7, xmm8, xmm7 + vblendvpd xmm5, xmm6, xmm5, xmm7 + vpxor ymm6, ymm1, ymm0 + vpxor ymm7, ymm4, ymm0 vpcmpgtq ymm6, ymm7, ymm6 - vblendvpd ymm6, ymm2, ymm0, ymm6 - vmovdqa ymm0, ymmword ptr [rsp + 64] # 32-byte Reload - vpxor ymm11, ymm14, ymm0 - vpcmpgtq ymm7, ymm11, ymm7 - vblendvpd ymm2, ymm2, ymm0, ymm7 - vmovdqu ymm7, ymmword ptr [rdi + 8*rax + 800] - vxorpd ymm11, ymm14, ymm6 - vpxor ymm0, ymm14, ymm7 - vpcmpgtq ymm11, ymm0, ymm11 - vblendvpd ymm6, ymm7, ymm6, ymm11 - vmovapd ymmword ptr [rsp + 96], ymm6 # 32-byte Spill - vxorpd ymm6, ymm14, ymm2 - vpcmpgtq ymm0, ymm6, ymm0 - vblendvpd ymm0, ymm7, ymm2, ymm0 - vmovapd ymmword ptr [rsp + 32], ymm0 # 32-byte Spill - vmovdqu ymm0, 
ymmword ptr [rdi + 8*rax + 768] - vxorpd ymm2, ymm14, ymm3 - vpxor ymm7, ymm14, ymm0 - vpcmpgtq ymm2, ymm7, ymm2 - vblendvpd ymm2, ymm0, ymm3, ymm2 - vmovapd ymmword ptr [rsp + 64], ymm2 # 32-byte Spill - vxorpd ymm2, ymm14, ymm4 - vpcmpgtq ymm2, ymm2, ymm7 - vblendvpd ymm4, ymm0, ymm4, ymm2 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 832] - vxorpd ymm2, ymm14, ymm5 - vpxor ymm3, ymm14, ymm0 - vpcmpgtq ymm2, ymm3, ymm2 - vblendvpd ymm5, ymm0, ymm5, ymm2 - vxorpd ymm2, ymm9, ymm14 - vpcmpgtq ymm2, ymm2, ymm3 - vblendvpd ymm0, ymm0, ymm9, ymm2 - vmovapd ymmword ptr [rsp], ymm0 # 32-byte Spill - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 864] - vxorpd ymm2, ymm14, ymm1 - vpxor ymm3, ymm14, ymm0 - vpcmpgtq ymm2, ymm3, ymm2 - vblendvpd ymm7, ymm0, ymm1, ymm2 - vxorpd ymm1, ymm10, ymm14 - vpcmpgtq ymm1, ymm1, ymm3 - vblendvpd ymm3, ymm0, ymm10, ymm1 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 896] - vxorpd ymm1, ymm12, ymm14 - vpxor ymm2, ymm14, ymm0 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm12, ymm0, ymm12, ymm1 - vxorpd ymm1, ymm8, ymm14 - vpcmpgtq ymm1, ymm1, ymm2 - vblendvpd ymm9, ymm0, ymm8, ymm1 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 928] - vxorpd ymm1, ymm15, ymm14 - vpxor ymm2, ymm14, ymm0 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm10, ymm0, ymm15, ymm1 - vxorpd ymm1, ymm13, ymm14 - vpcmpgtq ymm1, ymm1, ymm2 - vblendvpd ymm8, ymm0, ymm13, ymm1 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 960] - vmovdqa ymm6, ymmword ptr [rsp + 160] # 32-byte Reload - vpxor ymm1, ymm14, ymm6 - vpxor ymm2, ymm14, ymm0 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm11, ymm0, ymm6, ymm1 - vmovdqa ymm6, ymmword ptr [rsp + 192] # 32-byte Reload - vpxor ymm1, ymm14, ymm6 - vpcmpgtq ymm1, ymm1, ymm2 - vblendvpd ymm15, ymm0, ymm6, ymm1 - vmovdqu ymm0, ymmword ptr [rdi + 8*rax + 992] - vmovdqa ymm6, ymmword ptr [rsp + 128] # 32-byte Reload - vpxor ymm1, ymm14, ymm6 - vpxor ymm2, ymm14, ymm0 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm13, ymm0, ymm6, ymm1 - vmovdqa ymm6, ymmword ptr [rsp + 224] # 
32-byte Reload - vpxor ymm1, ymm14, ymm6 - vpcmpgtq ymm1, ymm1, ymm2 - vblendvpd ymm0, ymm0, ymm6, ymm1 - sub rax, -128 - add r10, 4 - jne .LBB3_9 -.LBB3_10: - vmovaps ymmword ptr [rsp + 128], ymm10 # 32-byte Spill - test r9, r9 - vmovdqa ymm10, ymm12 - vmovdqa ymm12, ymm3 - je .LBB3_13 -# %bb.11: - lea rax, [rdi + 8*rax] - neg r9 - vpbroadcastq ymm14, qword ptr [rip + .LCPI3_0] # ymm14 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] - .p2align 4, 0x90 -.LBB3_12: # =>This Inner Loop Header: Depth=1 - vmovdqu ymm1, ymmword ptr [rax + 32] - vmovdqa ymm6, ymm7 - vmovdqa ymm7, ymm5 - vmovdqa ymm5, ymm4 - vmovdqa ymm4, ymmword ptr [rsp + 96] # 32-byte Reload - vpxor ymm2, ymm14, ymm4 - vpxor ymm3, ymm14, ymm1 - vpcmpgtq ymm2, ymm3, ymm2 - vblendvpd ymm4, ymm1, ymm4, ymm2 - vmovapd ymmword ptr [rsp + 96], ymm4 # 32-byte Spill - vmovdqa ymm4, ymmword ptr [rsp + 32] # 32-byte Reload - vpxor ymm2, ymm14, ymm4 - vpcmpgtq ymm2, ymm2, ymm3 - vblendvpd ymm4, ymm1, ymm4, ymm2 - vmovapd ymmword ptr [rsp + 32], ymm4 # 32-byte Spill - vmovdqu ymm1, ymmword ptr [rax] - vmovdqa ymm4, ymmword ptr [rsp + 64] # 32-byte Reload - vpxor ymm2, ymm14, ymm4 - vpxor ymm3, ymm14, ymm1 - vpcmpgtq ymm2, ymm3, ymm2 - vblendvpd ymm4, ymm1, ymm4, ymm2 - vmovapd ymmword ptr [rsp + 64], ymm4 # 32-byte Spill - vmovdqa ymm4, ymm5 - vmovdqa ymm5, ymm7 - vmovdqa ymm7, ymm6 - vpxor ymm2, ymm14, ymm4 - vpcmpgtq ymm2, ymm2, ymm3 - vmovdqu ymm3, ymmword ptr [rax + 64] - vblendvpd ymm4, ymm1, ymm4, ymm2 - vpxor ymm1, ymm14, ymm3 - vpxor ymm2, ymm14, ymm5 - vpcmpgtq ymm2, ymm1, ymm2 - vblendvpd ymm5, ymm3, ymm5, ymm2 - vmovdqa ymm6, ymmword ptr [rsp] # 32-byte Reload - vpxor ymm2, ymm14, ymm6 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm6, ymm3, ymm6, ymm1 - vmovapd ymmword ptr [rsp], ymm6 # 32-byte Spill - vmovdqu ymm1, ymmword ptr [rax + 96] - vpxor ymm2, ymm14, ymm1 - vpxor ymm3, ymm14, ymm7 - vpcmpgtq ymm3, ymm2, ymm3 - vblendvpd ymm7, ymm1, ymm7, ymm3 - vpxor ymm3, ymm12, 
ymm14 - vpcmpgtq ymm2, ymm3, ymm2 - vmovdqu ymm3, ymmword ptr [rax + 128] - vblendvpd ymm12, ymm1, ymm12, ymm2 - vpxor ymm1, ymm14, ymm3 - vpxor ymm2, ymm10, ymm14 - vpcmpgtq ymm2, ymm1, ymm2 - vblendvpd ymm10, ymm3, ymm10, ymm2 - vpxor ymm2, ymm9, ymm14 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm9, ymm3, ymm9, ymm1 - vmovdqu ymm1, ymmword ptr [rax + 160] - vpxor ymm2, ymm14, ymm1 - vmovdqa ymm6, ymmword ptr [rsp + 128] # 32-byte Reload - vpxor ymm3, ymm14, ymm6 - vpcmpgtq ymm3, ymm2, ymm3 - vblendvpd ymm6, ymm1, ymm6, ymm3 - vmovapd ymmword ptr [rsp + 128], ymm6 # 32-byte Spill - vpxor ymm3, ymm8, ymm14 - vpcmpgtq ymm2, ymm3, ymm2 - vmovdqu ymm3, ymmword ptr [rax + 192] - vblendvpd ymm8, ymm1, ymm8, ymm2 - vpxor ymm1, ymm14, ymm3 - vpxor ymm2, ymm11, ymm14 - vpcmpgtq ymm2, ymm1, ymm2 - vblendvpd ymm11, ymm3, ymm11, ymm2 - vpxor ymm2, ymm15, ymm14 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm15, ymm3, ymm15, ymm1 - vmovdqu ymm1, ymmword ptr [rax + 224] - vpxor ymm2, ymm14, ymm1 - vpxor ymm3, ymm13, ymm14 - vpcmpgtq ymm3, ymm2, ymm3 - vblendvpd ymm13, ymm1, ymm13, ymm3 - vpxor ymm3, ymm14, ymm0 - vpcmpgtq ymm2, ymm3, ymm2 - vblendvpd ymm0, ymm1, ymm0, ymm2 - add rax, 256 - inc r9 - jne .LBB3_12 -.LBB3_13: - vpbroadcastq ymm14, qword ptr [rip + .LCPI3_0] # ymm14 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] - vmovdqa ymm3, ymmword ptr [rsp] # 32-byte Reload - vpxor ymm1, ymm14, ymm3 - vpxor ymm2, ymm15, ymm14 - vpcmpgtq ymm1, ymm1, ymm2 - vblendvpd ymm1, ymm15, ymm3, ymm1 - vpxor ymm2, ymm14, ymm4 - vpxor ymm3, ymm9, ymm14 - vpcmpgtq ymm2, ymm2, ymm3 - vblendvpd ymm2, ymm9, ymm4, ymm2 - vpxor ymm3, ymm12, ymm14 - vpxor ymm9, ymm14, ymm0 - vpcmpgtq ymm3, ymm3, ymm9 - vblendvpd ymm0, ymm0, ymm12, ymm3 - vmovdqa ymm4, ymmword ptr [rsp + 32] # 32-byte Reload - vpxor ymm3, ymm14, ymm4 - vpxor ymm9, ymm8, ymm14 - vpcmpgtq ymm3, ymm3, ymm9 - vblendvpd ymm3, ymm8, ymm4, ymm3 - vxorpd ymm6, ymm14, ymm3 - vxorpd ymm9, ymm14, ymm0 - vpcmpgtq 
ymm6, ymm6, ymm9 - vblendvpd ymm0, ymm0, ymm3, ymm6 - vxorpd ymm3, ymm14, ymm2 - vxorpd ymm6, ymm14, ymm1 - vpcmpgtq ymm3, ymm3, ymm6 - vblendvpd ymm1, ymm1, ymm2, ymm3 - vxorpd ymm2, ymm14, ymm1 - vxorpd ymm3, ymm14, ymm0 - vpcmpgtq ymm2, ymm2, ymm3 - vblendvpd ymm0, ymm0, ymm1, ymm2 - vextractf128 xmm1, ymm0, 1 - vxorpd xmm2, xmm14, xmm1 - vxorpd xmm3, xmm14, xmm0 - vpcmpgtq xmm2, xmm3, xmm2 - vblendvpd xmm0, xmm1, xmm0, xmm2 - vpermilps xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1] - vxorpd xmm2, xmm14, xmm0 - vxorpd xmm3, xmm14, xmm1 - vpcmpgtq xmm2, xmm2, xmm3 - vblendvpd xmm0, xmm1, xmm0, xmm2 - vpxor ymm1, ymm14, ymm5 - vpxor ymm2, ymm11, ymm14 - vpcmpgtq ymm1, ymm2, ymm1 - vblendvpd ymm1, ymm11, ymm5, ymm1 - vmovdqa ymm4, ymmword ptr [rsp + 64] # 32-byte Reload - vpxor ymm2, ymm14, ymm4 - vpxor ymm3, ymm10, ymm14 - vpcmpgtq ymm2, ymm3, ymm2 - vblendvpd ymm2, ymm10, ymm4, ymm2 - vpxor ymm3, ymm14, ymm7 - vpxor ymm5, ymm13, ymm14 - vpcmpgtq ymm3, ymm5, ymm3 - vblendvpd ymm3, ymm13, ymm7, ymm3 - vmovdqa ymm6, ymmword ptr [rsp + 96] # 32-byte Reload - vpxor ymm4, ymm14, ymm6 - vmovdqa ymm7, ymmword ptr [rsp + 128] # 32-byte Reload - vpxor ymm5, ymm14, ymm7 - vpcmpgtq ymm4, ymm5, ymm4 - vblendvpd ymm4, ymm7, ymm6, ymm4 - vxorpd ymm5, ymm14, ymm4 - vxorpd ymm6, ymm14, ymm3 - vpcmpgtq ymm5, ymm6, ymm5 - vblendvpd ymm3, ymm3, ymm4, ymm5 - vxorpd ymm4, ymm14, ymm2 - vxorpd ymm5, ymm14, ymm1 - vpcmpgtq ymm4, ymm5, ymm4 - vblendvpd ymm1, ymm1, ymm2, ymm4 - vxorpd ymm2, ymm14, ymm1 - vxorpd ymm4, ymm14, ymm3 - vpcmpgtq ymm2, ymm4, ymm2 - vblendvpd ymm1, ymm3, ymm1, ymm2 + vblendvpd ymm1, ymm4, ymm1, ymm6 + vxorpd ymm4, ymm1, ymm0 + vpxor ymm6, ymm3, ymm0 + vpcmpgtq ymm4, ymm6, ymm4 + vblendvpd ymm1, ymm3, ymm1, ymm4 + vmovq r10, xmm5 + vxorpd ymm3, ymm1, ymm0 + vpxor ymm4, ymm2, ymm0 + vpcmpgtq ymm3, ymm4, ymm3 + vblendvpd ymm1, ymm2, ymm1, ymm3 vextractf128 xmm2, ymm1, 1 - vxorpd xmm3, xmm14, xmm1 - vxorpd xmm4, xmm14, xmm2 + vxorpd xmm3, xmm1, xmm0 + vxorpd xmm4, xmm2, xmm0 
vpcmpgtq xmm3, xmm4, xmm3 vblendvpd xmm1, xmm2, xmm1, xmm3 vpermilps xmm2, xmm1, 78 # xmm2 = xmm1[2,3,0,1] - vxorpd xmm3, xmm14, xmm1 - vxorpd xmm4, xmm14, xmm2 - vpcmpgtq xmm3, xmm4, xmm3 - vblendvpd xmm1, xmm2, xmm1, xmm3 - vmovq rsi, xmm0 - vmovq r9, xmm1 - cmp r11, r8 - je .LBB3_14 -.LBB3_4: - mov rax, rsi + vxorpd xmm3, xmm1, xmm0 + vxorpd xmm0, xmm2, xmm0 + vpcmpgtq xmm0, xmm0, xmm3 + vblendvpd xmm0, xmm2, xmm1, xmm0 + vmovq rax, xmm0 + mov rsi, r10 + cmp r9, r8 + je .LBB3_8 .p2align 4, 0x90 -.LBB3_5: # =>This Inner Loop Header: Depth=1 - mov rsi, qword ptr [rdi + 8*r11] - cmp r9, rsi - cmovae r9, rsi +.LBB3_7: # =>This Inner Loop Header: Depth=1 + mov rsi, qword ptr [rdi + 8*r9] cmp rax, rsi - cmova rsi, rax - add r11, 1 - mov rax, rsi - cmp r8, r11 - jne .LBB3_5 -.LBB3_14: + cmovae rax, rsi + cmp r10, rsi + cmova rsi, r10 + add r9, 1 + mov r10, rsi + cmp r8, r9 + jne .LBB3_7 +.LBB3_8: mov qword ptr [rcx], rsi - mov qword ptr [rdx], r9 + mov qword ptr [rdx], rax mov rsp, rbp pop rbp vzeroupper @@ -1361,6 +468,6 @@ uint64_max_min_avx2: # @uint64_max_min_avx2 .Lfunc_end3: .size uint64_max_min_avx2, .Lfunc_end3-uint64_max_min_avx2 # -- End function - .ident "Ubuntu clang version 11.1.0-++20210204121720+1fdec59bffc1-1~exp1~20210203232336.162" + .ident "Debian clang version 11.1.0-++20210428103820+1fdec59bffc1-1~exp1~20210428204437.162" .section ".note.GNU-stack","",@progbits .addrsig diff --git a/go/parquet/internal/utils/_lib/min_max_sse4.s b/go/parquet/internal/utils/_lib/min_max_sse4.s index 98f30e3ed1d..893a0a73f02 100644 --- a/go/parquet/internal/utils/_lib/min_max_sse4.s +++ b/go/parquet/internal/utils/_lib/min_max_sse4.s @@ -608,6 +608,6 @@ uint64_max_min_sse4: # @uint64_max_min_sse4 .Lfunc_end3: .size uint64_max_min_sse4, .Lfunc_end3-uint64_max_min_sse4 # -- End function - .ident "Ubuntu clang version 11.1.0-++20210204121720+1fdec59bffc1-1~exp1~20210203232336.162" + .ident "Debian clang version 
11.1.0-++20210428103820+1fdec59bffc1-1~exp1~20210428204437.162" .section ".note.GNU-stack","",@progbits .addrsig diff --git a/go/parquet/internal/utils/_lib/unpack_bool_avx2.s b/go/parquet/internal/utils/_lib/unpack_bool_avx2.s index 1bc1be53d4d..6ac34887c00 100644 --- a/go/parquet/internal/utils/_lib/unpack_bool_avx2.s +++ b/go/parquet/internal/utils/_lib/unpack_bool_avx2.s @@ -1,6293 +1,104 @@ .text .intel_syntax noprefix .file "unpack_bool.c" - .section .rodata.cst32,"aM",@progbits,32 - .p2align 5 # -- Begin function bytes_to_bools_avx2 -.LCPI0_0: - .long 24 # 0x18 - .long 25 # 0x19 - .long 26 # 0x1a - .long 27 # 0x1b - .long 28 # 0x1c - .long 29 # 0x1d - .long 30 # 0x1e - .long 31 # 0x1f -.LCPI0_1: - .long 16 # 0x10 - .long 17 # 0x11 - .long 18 # 0x12 - .long 19 # 0x13 - .long 20 # 0x14 - .long 21 # 0x15 - .long 22 # 0x16 - .long 23 # 0x17 -.LCPI0_2: - .long 8 # 0x8 - .long 9 # 0x9 - .long 10 # 0xa - .long 11 # 0xb - .long 12 # 0xc - .long 13 # 0xd - .long 14 # 0xe - .long 15 # 0xf -.LCPI0_3: - .long 0 # 0x0 - .long 1 # 0x1 - .long 2 # 0x2 - .long 3 # 0x3 - .long 4 # 0x4 - .long 5 # 0x5 - .long 6 # 0x6 - .long 7 # 0x7 -.LCPI0_4: - .zero 32,1 - .section .rodata.cst8,"aM",@progbits,8 - .p2align 3 -.LCPI0_5: - .quad 1 # 0x1 -.LCPI0_6: - .quad 2 # 0x2 -.LCPI0_7: - .quad 3 # 0x3 -.LCPI0_8: - .quad 4 # 0x4 -.LCPI0_9: - .quad 5 # 0x5 -.LCPI0_10: - .quad 6 # 0x6 -.LCPI0_11: - .quad 7 # 0x7 - .section .rodata.cst4,"aM",@progbits,4 - .p2align 2 -.LCPI0_12: - .long 32 # 0x20 - .text - .globl bytes_to_bools_avx2 + .globl bytes_to_bools_avx2 # -- Begin function bytes_to_bools_avx2 .p2align 4, 0x90 .type bytes_to_bools_avx2,@function bytes_to_bools_avx2: # @bytes_to_bools_avx2 # %bb.0: push rbp mov rbp, rsp - push r15 - push r14 - push r13 - push r12 - push rbx - and rsp, -32 - sub rsp, 960 + and rsp, -8 test esi, esi - jle .LBB0_1051 + jle .LBB0_5 # %bb.1: - mov r9d, ecx - mov r8, rdx - mov r10d, esi - cmp esi, 32 - jae .LBB0_3 -.LBB0_2: - xor r12d, r12d -.LBB0_1055: - lea 
ecx, [8*r12] - jmp .LBB0_1057 - .p2align 4, 0x90 -.LBB0_1056: # in Loop: Header=BB0_1057 Depth=1 - add r12, 1 - add ecx, 8 - cmp r10, r12 - je .LBB0_1051 -.LBB0_1057: # =>This Inner Loop Header: Depth=1 - mov edx, ecx - mov ecx, ecx - cmp edx, r9d - jge .LBB0_1056 -# %bb.1058: # in Loop: Header=BB0_1057 Depth=1 - movzx edx, byte ptr [rdi + r12] - and dl, 1 - mov byte ptr [r8 + rcx], dl - mov rdx, rcx - or rdx, 1 - cmp edx, r9d - jge .LBB0_1056 -# %bb.1059: # in Loop: Header=BB0_1057 Depth=1 - movzx ebx, byte ptr [rdi + r12] - shr bl - and bl, 1 - mov byte ptr [r8 + rdx], bl - mov rdx, rcx - or rdx, 2 - cmp edx, r9d - jge .LBB0_1056 -# %bb.1060: # in Loop: Header=BB0_1057 Depth=1 - movzx ebx, byte ptr [rdi + r12] - shr bl, 2 - and bl, 1 - mov byte ptr [r8 + rdx], bl - mov rdx, rcx - or rdx, 3 - cmp edx, r9d - jge .LBB0_1056 -# %bb.1061: # in Loop: Header=BB0_1057 Depth=1 - movzx ebx, byte ptr [rdi + r12] - shr bl, 3 - and bl, 1 - mov byte ptr [r8 + rdx], bl - mov rdx, rcx - or rdx, 4 - cmp edx, r9d - jge .LBB0_1056 -# %bb.1062: # in Loop: Header=BB0_1057 Depth=1 - movzx ebx, byte ptr [rdi + r12] - shr bl, 4 - and bl, 1 - mov byte ptr [r8 + rdx], bl - mov rdx, rcx - or rdx, 5 - cmp edx, r9d - jge .LBB0_1056 -# %bb.1063: # in Loop: Header=BB0_1057 Depth=1 - movzx ebx, byte ptr [rdi + r12] - shr bl, 5 - and bl, 1 - mov byte ptr [r8 + rdx], bl - mov rdx, rcx - or rdx, 6 - cmp edx, r9d - jge .LBB0_1056 -# %bb.1064: # in Loop: Header=BB0_1057 Depth=1 - movzx ebx, byte ptr [rdi + r12] - shr bl, 6 - and bl, 1 - mov byte ptr [r8 + rdx], bl - mov rdx, rcx - or rdx, 7 - cmp edx, r9d - jge .LBB0_1056 -# %bb.1065: # in Loop: Header=BB0_1057 Depth=1 - movzx ebx, byte ptr [rdi + r12] - shr bl, 7 - mov byte ptr [r8 + rdx], bl - jmp .LBB0_1056 -.LBB0_3: - mov dword ptr [rsp + 16], r9d # 4-byte Spill - mov qword ptr [rsp + 48], r10 # 8-byte Spill - lea rsi, [r10 - 1] - mov ecx, 8 - mov eax, esi - mul ecx - seto r14b - mov rbx, rsi - shr rbx, 32 - lea rcx, [r8 + 6] - mov edx, 8 - mov 
rax, rsi - mul rdx - seto sil - add rcx, rax - setb dl - lea rcx, [r8 + 7] - add rcx, rax - setb r13b - lea rcx, [r8 + 5] - add rcx, rax - setb r9b - lea rcx, [r8 + 4] - add rcx, rax - setb r15b - lea rcx, [r8 + 3] - add rcx, rax - setb r11b - lea rcx, [r8 + 2] - add rcx, rax - setb r10b - lea rcx, [r8 + 1] - add rcx, rax - setb cl - add rax, r8 - setb al - xor r12d, r12d - test rbx, rbx - jne .LBB0_1052 -# %bb.4: - test r14b, r14b - jne .LBB0_1052 -# %bb.5: - test dl, dl - jne .LBB0_1052 -# %bb.6: - test sil, sil - jne .LBB0_1052 -# %bb.7: - test r13b, r13b - jne .LBB0_1052 -# %bb.8: - test sil, sil - jne .LBB0_1052 -# %bb.9: - test r9b, r9b - jne .LBB0_1052 -# %bb.10: - test sil, sil - jne .LBB0_1052 -# %bb.11: - test r15b, r15b - jne .LBB0_1052 -# %bb.12: - test sil, sil - jne .LBB0_1052 -# %bb.13: - test r11b, r11b - jne .LBB0_1052 -# %bb.14: - test sil, sil - jne .LBB0_1052 -# %bb.15: - test r10b, r10b - jne .LBB0_1052 -# %bb.16: - test sil, sil - mov r10, qword ptr [rsp + 48] # 8-byte Reload - jne .LBB0_1054 -# %bb.17: - test cl, cl - jne .LBB0_1054 -# %bb.18: - test sil, sil - mov r9d, dword ptr [rsp + 16] # 4-byte Reload - jne .LBB0_1055 -# %bb.19: - test al, al - jne .LBB0_1055 -# %bb.20: - test sil, sil - jne .LBB0_1055 -# %bb.21: - lea rax, [r8 + 8*r10] - cmp rax, rdi - jbe .LBB0_24 -# %bb.22: - lea rax, [rdi + r10] - cmp rax, r8 - ja .LBB0_2 -.LBB0_24: - mov r12d, r10d - and r12d, -32 - vmovd xmm0, r9d - vpbroadcastd ymm0, xmm0 - vmovdqa ymm9, ymmword ptr [rip + .LCPI0_0] # ymm9 = [24,25,26,27,28,29,30,31] - vmovdqa ymm8, ymmword ptr [rip + .LCPI0_1] # ymm8 = [16,17,18,19,20,21,22,23] - vmovdqa ymm3, ymmword ptr [rip + .LCPI0_2] # ymm3 = [8,9,10,11,12,13,14,15] - vmovdqa ymm2, ymmword ptr [rip + .LCPI0_3] # ymm2 = [0,1,2,3,4,5,6,7] - xor r11d, r11d - vbroadcastsd ymm1, qword ptr [rip + .LCPI0_5] # ymm1 = [1,1,1,1] - vmovaps ymmword ptr [rsp + 768], ymm1 # 32-byte Spill - vbroadcastsd ymm1, qword ptr [rip + .LCPI0_6] # ymm1 = [2,2,2,2] - vmovaps ymmword 
ptr [rsp + 736], ymm1 # 32-byte Spill - vbroadcastsd ymm1, qword ptr [rip + .LCPI0_7] # ymm1 = [3,3,3,3] - vmovaps ymmword ptr [rsp + 704], ymm1 # 32-byte Spill - vbroadcastsd ymm1, qword ptr [rip + .LCPI0_8] # ymm1 = [4,4,4,4] - vmovaps ymmword ptr [rsp + 672], ymm1 # 32-byte Spill - vbroadcastsd ymm1, qword ptr [rip + .LCPI0_9] # ymm1 = [5,5,5,5] - vmovaps ymmword ptr [rsp + 640], ymm1 # 32-byte Spill - vbroadcastsd ymm1, qword ptr [rip + .LCPI0_10] # ymm1 = [6,6,6,6] - vmovaps ymmword ptr [rsp + 608], ymm1 # 32-byte Spill - vbroadcastsd ymm1, qword ptr [rip + .LCPI0_11] # ymm1 = [7,7,7,7] - vmovaps ymmword ptr [rsp + 576], ymm1 # 32-byte Spill - vpbroadcastd ymm1, dword ptr [rip + .LCPI0_12] # ymm1 = [32,32,32,32,32,32,32,32] - vmovdqa ymmword ptr [rsp + 544], ymm1 # 32-byte Spill - jmp .LBB0_26 - .p2align 4, 0x90 -.LBB0_25: # in Loop: Header=BB0_26 Depth=1 - add r11, 32 - vmovdqa ymm1, ymmword ptr [rsp + 544] # 32-byte Reload - vpaddd ymm2, ymm2, ymm1 - vpaddd ymm3, ymm3, ymm1 - vpaddd ymm8, ymm8, ymm1 - vpaddd ymm9, ymm9, ymm1 - cmp r11, r12 - je .LBB0_1050 -.LBB0_26: # =>This Inner Loop Header: Depth=1 - vmovdqa ymmword ptr [rsp + 800], ymm2 # 32-byte Spill - vpslld ymm1, ymm2, 3 - vpcmpgtd xmm2, xmm0, xmm1 - vmovd ecx, xmm2 - # implicit-def: $ymm4 - test cl, 1 - je .LBB0_28 -# %bb.27: # in Loop: Header=BB0_26 Depth=1 - vpbroadcastb ymm4, byte ptr [rdi + r11] -.LBB0_28: # in Loop: Header=BB0_26 Depth=1 - mov r10, r11 - or r10, 1 - vpcmpgtd xmm2, xmm0, xmm1 - vpackssdw xmm2, xmm2, xmm2 - vpacksswb xmm2, xmm2, xmm2 - vpextrb ecx, xmm2, 1 - test cl, 1 - je .LBB0_30 -# %bb.29: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm2, xmm4, byte ptr [rdi + r10], 1 - vpblendd ymm4, ymm4, ymm2, 15 # ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -.LBB0_30: # in Loop: Header=BB0_26 Depth=1 - mov r14, r11 - or r14, 2 - vpcmpgtd xmm2, xmm0, xmm1 - vpackssdw xmm2, xmm2, xmm2 - vpacksswb xmm2, xmm2, xmm2 - vpextrb ecx, xmm2, 2 - test cl, 1 - je .LBB0_32 -# %bb.31: # in Loop: Header=BB0_26 
Depth=1 - vpinsrb xmm2, xmm4, byte ptr [rdi + r14], 2 - vpblendd ymm4, ymm4, ymm2, 15 # ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -.LBB0_32: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm13, ymm1, 1 - mov rdx, r11 - or rdx, 3 - vpcmpgtd xmm2, xmm0, xmm1 - vpackssdw xmm2, xmm2, xmm2 - vpacksswb xmm2, xmm2, xmm2 - vpextrb ecx, xmm2, 3 - test cl, 1 - je .LBB0_34 -# %bb.33: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm2, xmm4, byte ptr [rdi + rdx], 3 - vpblendd ymm4, ymm4, ymm2, 15 # ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -.LBB0_34: # in Loop: Header=BB0_26 Depth=1 - mov rcx, r11 - or rcx, 4 - vextracti128 xmm7, ymm0, 1 - vpcmpgtd xmm2, xmm7, xmm13 - vpextrb r9d, xmm2, 0 - test r9b, 1 - mov qword ptr [rsp + 272], rdx # 8-byte Spill - mov qword ptr [rsp + 264], rcx # 8-byte Spill - je .LBB0_36 -# %bb.35: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm2, xmm4, byte ptr [rdi + rcx], 4 - vpblendd ymm4, ymm4, ymm2, 15 # ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -.LBB0_36: # in Loop: Header=BB0_26 Depth=1 - mov r15, r11 - or r15, 5 - vpcmpgtd ymm6, ymm0, ymm1 - vpackssdw ymm2, ymm6, ymm0 - vextracti128 xmm2, ymm2, 1 - vpbroadcastd xmm2, xmm2 - vpacksswb xmm2, xmm2, xmm2 - vpextrb ecx, xmm2, 5 - test cl, 1 - je .LBB0_38 -# %bb.37: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm2, xmm4, byte ptr [rdi + r15], 5 - vpblendd ymm4, ymm4, ymm2, 15 # ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -.LBB0_38: # in Loop: Header=BB0_26 Depth=1 - mov rbx, r11 - or rbx, 6 - vpackssdw ymm2, ymm6, ymm0 - vpermq ymm2, ymm2, 232 # ymm2 = ymm2[0,2,2,3] - vpacksswb xmm2, xmm2, xmm2 - vpextrb ecx, xmm2, 6 - test cl, 1 - je .LBB0_40 -# %bb.39: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm2, xmm4, byte ptr [rdi + rbx], 6 - vpblendd ymm4, ymm4, ymm2, 15 # ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -.LBB0_40: # in Loop: Header=BB0_26 Depth=1 - vpslld ymm2, ymm3, 3 - mov rax, r11 - or rax, 7 - vpackssdw ymm5, ymm6, ymm0 - vpermq ymm5, ymm5, 232 # ymm5 = ymm5[0,2,2,3] - vpacksswb xmm5, xmm5, xmm5 - vpextrb ecx, xmm5, 7 - test cl, 1 - 
je .LBB0_42 -# %bb.41: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm5, xmm4, byte ptr [rdi + rax], 7 - vpblendd ymm4, ymm4, ymm5, 15 # ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -.LBB0_42: # in Loop: Header=BB0_26 Depth=1 - mov rsi, r11 - or rsi, 8 - vpcmpgtd xmm5, xmm0, xmm2 - vpextrb ecx, xmm5, 0 - test cl, 1 - je .LBB0_44 -# %bb.43: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm5, xmm4, byte ptr [rdi + rsi], 8 - vpblendd ymm4, ymm4, ymm5, 15 # ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -.LBB0_44: # in Loop: Header=BB0_26 Depth=1 - mov rdx, r11 - or rdx, 9 - vpcmpgtd xmm5, xmm0, xmm2 - vpackssdw xmm5, xmm5, xmm5 - vpacksswb xmm5, xmm5, xmm5 - vpextrb ecx, xmm5, 9 - test cl, 1 - mov qword ptr [rsp + 224], rdx # 8-byte Spill - je .LBB0_46 -# %bb.45: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm5, xmm4, byte ptr [rdi + rdx], 9 - vpblendd ymm4, ymm4, ymm5, 15 # ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -.LBB0_46: # in Loop: Header=BB0_26 Depth=1 - mov rdx, r11 - or rdx, 10 - vpcmpgtd xmm5, xmm0, xmm2 - vpackssdw xmm5, xmm5, xmm5 - vpacksswb xmm5, xmm5, xmm5 - vpextrb ecx, xmm5, 10 - test cl, 1 - vmovdqa ymmword ptr [rsp + 832], ymm3 # 32-byte Spill - mov qword ptr [rsp + 96], rsi # 8-byte Spill - je .LBB0_48 -# %bb.47: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm5, xmm4, byte ptr [rdi + rdx], 10 - vpblendd ymm4, ymm4, ymm5, 15 # ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -.LBB0_48: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm5, ymm2, 1 - mov rsi, r11 - or rsi, 11 - vpcmpgtd xmm3, xmm0, xmm2 - vpackssdw xmm3, xmm3, xmm3 - vpacksswb xmm3, xmm3, xmm3 - vpextrb ecx, xmm3, 11 - test cl, 1 - mov qword ptr [rsp + 152], r10 # 8-byte Spill - mov qword ptr [rsp + 296], r14 # 8-byte Spill - mov qword ptr [rsp + 104], r15 # 8-byte Spill - mov qword ptr [rsp + 288], rbx # 8-byte Spill - mov qword ptr [rsp + 232], rax # 8-byte Spill - je .LBB0_50 -# %bb.49: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm3, xmm4, byte ptr [rdi + rsi], 11 - vpblendd ymm4, ymm4, ymm3, 15 # ymm4 = 
ymm3[0,1,2,3],ymm4[4,5,6,7] -.LBB0_50: # in Loop: Header=BB0_26 Depth=1 - mov rcx, r11 - or rcx, 12 - vpcmpgtd xmm3, xmm7, xmm5 - vpextrb r14d, xmm3, 0 - test r14b, 1 - mov qword ptr [rsp + 256], rsi # 8-byte Spill - mov qword ptr [rsp + 248], rcx # 8-byte Spill - je .LBB0_52 -# %bb.51: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm3, xmm4, byte ptr [rdi + rcx], 12 - vpblendd ymm4, ymm4, ymm3, 15 # ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] -.LBB0_52: # in Loop: Header=BB0_26 Depth=1 - mov rax, r11 - or rax, 13 - vpcmpgtd ymm7, ymm0, ymm2 - vpackssdw ymm3, ymm7, ymm0 - vextracti128 xmm3, ymm3, 1 - vpbroadcastd xmm3, xmm3 - vpacksswb xmm3, xmm3, xmm3 - vpextrb ecx, xmm3, 13 - test cl, 1 - je .LBB0_54 -# %bb.53: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm3, xmm4, byte ptr [rdi + rax], 13 - vpblendd ymm4, ymm4, ymm3, 15 # ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] -.LBB0_54: # in Loop: Header=BB0_26 Depth=1 - mov rbx, r11 - or rbx, 14 - vpackssdw ymm3, ymm7, ymm0 - vpermq ymm3, ymm3, 232 # ymm3 = ymm3[0,2,2,3] - vpacksswb xmm3, xmm3, xmm3 - vpextrb ecx, xmm3, 14 - test cl, 1 - mov qword ptr [rsp + 80], rbx # 8-byte Spill - je .LBB0_56 -# %bb.55: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm3, xmm4, byte ptr [rdi + rbx], 14 - vpblendd ymm4, ymm4, ymm3, 15 # ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] -.LBB0_56: # in Loop: Header=BB0_26 Depth=1 - vpslld ymm10, ymm8, 3 - mov rsi, r11 - or rsi, 15 - vpackssdw ymm3, ymm7, ymm0 - vpermq ymm3, ymm3, 232 # ymm3 = ymm3[0,2,2,3] - vpacksswb xmm3, xmm3, xmm3 - vpextrb ecx, xmm3, 15 - test cl, 1 - je .LBB0_58 -# %bb.57: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm3, xmm4, byte ptr [rdi + rsi], 15 - vpblendd ymm4, ymm4, ymm3, 15 # ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] -.LBB0_58: # in Loop: Header=BB0_26 Depth=1 - mov r15, r11 - or r15, 16 - vpcmpgtd xmm3, xmm0, xmm10 - vmovd ecx, xmm3 - test cl, 1 - mov qword ptr [rsp + 64], r15 # 8-byte Spill - mov qword ptr [rsp + 72], rsi # 8-byte Spill - je .LBB0_60 -# %bb.59: # in Loop: Header=BB0_26 Depth=1 - 
vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + r15], 0 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_60: # in Loop: Header=BB0_26 Depth=1 - mov rsi, r11 - or rsi, 17 - vpcmpgtd xmm3, xmm0, xmm10 - vpackssdw xmm3, xmm3, xmm3 - vpermq ymm3, ymm3, 212 # ymm3 = ymm3[0,1,1,3] - vpacksswb ymm3, ymm3, ymm0 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 1 - test cl, 1 - je .LBB0_62 -# %bb.61: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + rsi], 1 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_62: # in Loop: Header=BB0_26 Depth=1 - mov rbx, r11 - or rbx, 18 - vpcmpgtd xmm3, xmm0, xmm10 - vpackssdw xmm3, xmm3, xmm3 - vpermq ymm3, ymm3, 212 # ymm3 = ymm3[0,1,1,3] - vpacksswb ymm3, ymm3, ymm0 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 2 - test cl, 1 - je .LBB0_64 -# %bb.63: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + rbx], 2 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_64: # in Loop: Header=BB0_26 Depth=1 - mov r15, r11 - or r15, 19 - vpcmpgtd xmm3, xmm0, xmm10 - vpackssdw xmm3, xmm3, xmm3 - vpermq ymm3, ymm3, 212 # ymm3 = ymm3[0,1,1,3] - vpacksswb ymm3, ymm3, ymm0 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 3 - test cl, 1 - vmovdqa ymmword ptr [rsp + 864], ymm8 # 32-byte Spill - je .LBB0_66 -# %bb.65: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + r15], 3 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_66: # in Loop: Header=BB0_26 Depth=1 - mov r13, r11 - or r13, 20 - vpcmpgtd ymm8, ymm0, ymm10 - vpackssdw ymm3, ymm0, ymm8 - vpacksswb ymm3, ymm3, ymm0 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 4 - test cl, 1 - mov qword ptr [rsp + 56], r13 # 8-byte Spill - je .LBB0_68 -# %bb.67: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + r13], 4 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_68: # in Loop: Header=BB0_26 Depth=1 - mov r13, r11 - or r13, 21 - 
vpackssdw ymm3, ymm0, ymm8 - vpacksswb ymm3, ymm3, ymm0 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 5 - test cl, 1 - mov qword ptr [rsp + 128], rbx # 8-byte Spill - je .LBB0_70 -# %bb.69: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + r13], 5 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_70: # in Loop: Header=BB0_26 Depth=1 - mov r10, r11 - or r10, 22 - vpackssdw ymm3, ymm0, ymm8 - vpacksswb ymm3, ymm3, ymm0 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 6 - test cl, 1 - je .LBB0_72 -# %bb.71: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + r10], 6 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_72: # in Loop: Header=BB0_26 Depth=1 - vpslld ymm11, ymm9, 3 - mov rbx, r11 - or rbx, 23 - vpackssdw ymm3, ymm0, ymm8 - vpacksswb ymm3, ymm3, ymm0 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 7 - test cl, 1 - mov qword ptr [rsp + 240], rbx # 8-byte Spill - vmovdqa ymmword ptr [rsp + 896], ymm9 # 32-byte Spill - je .LBB0_74 -# %bb.73: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + rbx], 7 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_74: # in Loop: Header=BB0_26 Depth=1 - mov rbx, r11 - or rbx, 24 - vpcmpgtd ymm9, ymm0, ymm11 - vpermq ymm12, ymm9, 68 # ymm12 = ymm9[0,1,0,1] - vpacksswb ymm3, ymm0, ymm12 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 8 - test cl, 1 - mov qword ptr [rsp + 216], rbx # 8-byte Spill - je .LBB0_76 -# %bb.75: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + rbx], 8 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_76: # in Loop: Header=BB0_26 Depth=1 - mov rbx, r11 - or rbx, 25 - vpcmpgtd xmm3, xmm0, xmm11 - vpackssdw xmm3, xmm3, xmm3 - vpermq ymm3, ymm3, 212 # ymm3 = ymm3[0,1,1,3] - vpacksswb ymm3, ymm0, ymm3 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 9 - test cl, 1 - mov qword ptr [rsp + 208], rbx # 8-byte Spill - je .LBB0_78 -# %bb.77: # in 
Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + rbx], 9 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_78: # in Loop: Header=BB0_26 Depth=1 - mov rbx, r11 - or rbx, 26 - vpcmpgtd xmm3, xmm0, xmm11 - vpackssdw xmm3, xmm3, xmm3 - vpermq ymm3, ymm3, 212 # ymm3 = ymm3[0,1,1,3] - vpacksswb ymm3, ymm0, ymm3 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 10 - test cl, 1 - mov qword ptr [rsp + 200], rbx # 8-byte Spill - je .LBB0_80 -# %bb.79: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + rbx], 10 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_80: # in Loop: Header=BB0_26 Depth=1 - mov rbx, r11 - or rbx, 27 - vpcmpgtd xmm3, xmm0, xmm11 - vpackssdw xmm3, xmm3, xmm3 - vpermq ymm3, ymm3, 212 # ymm3 = ymm3[0,1,1,3] - vpacksswb ymm3, ymm0, ymm3 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 11 - test cl, 1 - mov qword ptr [rsp + 192], rbx # 8-byte Spill - mov qword ptr [rsp + 144], rdx # 8-byte Spill - mov qword ptr [rsp + 88], rax # 8-byte Spill - je .LBB0_82 -# %bb.81: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + rbx], 11 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_82: # in Loop: Header=BB0_26 Depth=1 - mov rdx, r11 - or rdx, 28 - vpackssdw ymm3, ymm0, ymm9 - vpacksswb ymm3, ymm0, ymm3 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 12 - test cl, 1 - je .LBB0_84 -# %bb.83: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + rdx], 12 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_84: # in Loop: Header=BB0_26 Depth=1 - mov rbx, r11 - or rbx, 29 - vpackssdw ymm3, ymm0, ymm9 - vpacksswb ymm3, ymm0, ymm3 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 13 - test cl, 1 - mov qword ptr [rsp + 176], rbx # 8-byte Spill - je .LBB0_86 -# %bb.85: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + rbx], 13 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_86: # 
in Loop: Header=BB0_26 Depth=1 - mov rbx, r11 - or rbx, 30 - vpackssdw ymm3, ymm0, ymm9 - vpacksswb ymm3, ymm0, ymm3 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 14 - test cl, 1 - mov qword ptr [rsp + 168], rbx # 8-byte Spill - je .LBB0_88 -# %bb.87: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + rbx], 14 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_88: # in Loop: Header=BB0_26 Depth=1 - mov rbx, r11 - or rbx, 31 - vpackssdw ymm3, ymm0, ymm9 - vpacksswb ymm3, ymm0, ymm3 - vextracti128 xmm3, ymm3, 1 - vpextrb ecx, xmm3, 15 - test cl, 1 - mov qword ptr [rsp + 160], rbx # 8-byte Spill - je .LBB0_90 -# %bb.89: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm3, ymm4, 1 - vpinsrb xmm3, xmm3, byte ptr [rdi + rbx], 15 - vinserti128 ymm4, ymm4, xmm3, 1 -.LBB0_90: # in Loop: Header=BB0_26 Depth=1 - vpmovzxdq ymm3, xmm1 # ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero - vmovdqa ymmword ptr [rsp + 512], ymm3 # 32-byte Spill - vpand ymm15, ymm4, ymmword ptr [rip + .LCPI0_4] - vpcmpgtd xmm3, xmm0, xmm1 - vmovd ecx, xmm3 - test cl, 1 - je .LBB0_92 -# %bb.91: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm3, ymmword ptr [rsp + 512] # 32-byte Reload - vmovq rcx, xmm3 - vpextrb byte ptr [r8 + rcx], xmm15, 0 -.LBB0_92: # in Loop: Header=BB0_26 Depth=1 - vpcmpgtd xmm3, xmm0, xmm1 - vpackssdw xmm3, xmm3, xmm3 - vpacksswb xmm3, xmm3, xmm3 - vpextrb ecx, xmm3, 1 - test cl, 1 - je .LBB0_94 -# %bb.93: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm3, ymmword ptr [rsp + 512] # 32-byte Reload - vpextrq rcx, xmm3, 1 - vpextrb byte ptr [r8 + rcx], xmm15, 1 -.LBB0_94: # in Loop: Header=BB0_26 Depth=1 - vpcmpgtd xmm3, xmm0, xmm1 - vpackssdw xmm3, xmm3, xmm3 - vpacksswb xmm3, xmm3, xmm3 - vpextrb ecx, xmm3, 2 - test cl, 1 - je .LBB0_96 -# %bb.95: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm3, ymmword ptr [rsp + 512] # 32-byte Reload - vextracti128 xmm3, ymm3, 1 - vmovq rcx, xmm3 - vpextrb byte ptr [r8 + rcx], xmm15, 2 -.LBB0_96: 
# in Loop: Header=BB0_26 Depth=1 - vpcmpgtd xmm1, xmm0, xmm1 - vpackssdw xmm1, xmm1, xmm1 - vpacksswb xmm1, xmm1, xmm1 - vpextrb ecx, xmm1, 3 - test cl, 1 - je .LBB0_98 -# %bb.97: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 512] # 32-byte Reload - vextracti128 xmm1, ymm1, 1 - vpextrq rcx, xmm1, 1 - vpextrb byte ptr [r8 + rcx], xmm15, 3 -.LBB0_98: # in Loop: Header=BB0_26 Depth=1 - vpmovzxdq ymm1, xmm13 # ymm1 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero - vmovdqa ymmword ptr [rsp + 480], ymm1 # 32-byte Spill - test r9b, 1 - je .LBB0_100 -# %bb.99: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 480] # 32-byte Reload - vmovq rcx, xmm1 - vpextrb byte ptr [r8 + rcx], xmm15, 4 -.LBB0_100: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm6, ymm0 - vextracti128 xmm1, ymm1, 1 - vpbroadcastd xmm1, xmm1 - vpacksswb xmm1, xmm1, xmm1 - vpextrb ecx, xmm1, 5 - test cl, 1 - je .LBB0_102 -# %bb.101: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 480] # 32-byte Reload - vpextrq rcx, xmm1, 1 - vpextrb byte ptr [r8 + rcx], xmm15, 5 -.LBB0_102: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm6, ymm0 - vpermq ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3] - vpacksswb xmm1, xmm1, xmm1 - vpextrb ecx, xmm1, 6 - test cl, 1 - je .LBB0_104 -# %bb.103: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 480] # 32-byte Reload - vextracti128 xmm1, ymm1, 1 - vmovq rcx, xmm1 - vpextrb byte ptr [r8 + rcx], xmm15, 6 -.LBB0_104: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm6, ymm0 - vpermq ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3] - vpacksswb xmm1, xmm1, xmm1 - vpextrb ecx, xmm1, 7 - test cl, 1 - je .LBB0_106 -# %bb.105: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 480] # 32-byte Reload - vextracti128 xmm1, ymm1, 1 - vpextrq rcx, xmm1, 1 - vpextrb byte ptr [r8 + rcx], xmm15, 7 -.LBB0_106: # in Loop: Header=BB0_26 Depth=1 - vpmovzxdq ymm1, xmm2 # ymm1 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero - vmovdqa ymmword ptr [rsp + 448], ymm1 # 32-byte Spill - vpcmpgtd xmm1, xmm0, xmm2 - vpextrb ecx, xmm1, 0 - test cl, 1 - je .LBB0_108 -# %bb.107: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 448] # 32-byte Reload - vmovq rcx, xmm1 - vpextrb byte ptr [r8 + rcx], xmm15, 8 -.LBB0_108: # in Loop: Header=BB0_26 Depth=1 - vpcmpgtd xmm1, xmm0, xmm2 - vpackssdw xmm1, xmm1, xmm1 - vpacksswb xmm1, xmm1, xmm1 - vpextrb ecx, xmm1, 9 - test cl, 1 - je .LBB0_110 -# %bb.109: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 448] # 32-byte Reload - vpextrq rcx, xmm1, 1 - vpextrb byte ptr [r8 + rcx], xmm15, 9 -.LBB0_110: # in Loop: Header=BB0_26 Depth=1 - vpcmpgtd xmm1, xmm0, xmm2 - vpackssdw xmm1, xmm1, xmm1 - vpacksswb xmm1, xmm1, xmm1 - vpextrb ecx, xmm1, 10 - test cl, 1 - je .LBB0_112 -# %bb.111: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 448] # 32-byte Reload - vextracti128 xmm1, ymm1, 1 - vmovq rcx, xmm1 - vpextrb byte ptr [r8 + rcx], xmm15, 10 -.LBB0_112: # in Loop: Header=BB0_26 Depth=1 - vpcmpgtd xmm1, xmm0, xmm2 - vpackssdw xmm1, xmm1, xmm1 - vpacksswb xmm1, xmm1, xmm1 - vpextrb ecx, xmm1, 11 - test cl, 1 - je .LBB0_114 -# %bb.113: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 448] # 32-byte Reload - vextracti128 xmm1, ymm1, 1 - vpextrq rcx, xmm1, 1 - vpextrb byte ptr [r8 + rcx], xmm15, 11 -.LBB0_114: # in Loop: Header=BB0_26 Depth=1 - mov qword ptr [rsp + 136], rsi # 8-byte Spill - vpmovzxdq ymm1, xmm5 # ymm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero - vmovdqa ymmword ptr [rsp + 416], ymm1 # 32-byte Spill - test r14b, 1 - je .LBB0_116 -# %bb.115: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 416] # 32-byte Reload - vmovq rcx, xmm1 - vpextrb byte ptr [r8 + rcx], xmm15, 12 -.LBB0_116: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm7, ymm0 - vextracti128 xmm1, ymm1, 1 - vpbroadcastd xmm1, xmm1 - 
vpacksswb xmm1, xmm1, xmm1 - vpextrb ecx, xmm1, 13 - test cl, 1 - mov r9, qword ptr [rsp + 152] # 8-byte Reload - mov rsi, qword ptr [rsp + 296] # 8-byte Reload - mov r14, qword ptr [rsp + 104] # 8-byte Reload - mov rax, qword ptr [rsp + 288] # 8-byte Reload - je .LBB0_118 -# %bb.117: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 416] # 32-byte Reload - vpextrq rcx, xmm1, 1 - vpextrb byte ptr [r8 + rcx], xmm15, 13 -.LBB0_118: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm7, ymm0 - vpermq ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3] - vpacksswb xmm1, xmm1, xmm1 - vpextrb ecx, xmm1, 14 - test cl, 1 - je .LBB0_120 -# %bb.119: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 416] # 32-byte Reload - vextracti128 xmm1, ymm1, 1 - vmovq rcx, xmm1 - vpextrb byte ptr [r8 + rcx], xmm15, 14 -.LBB0_120: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm7, ymm0 - vpermq ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3] - vpacksswb xmm1, xmm1, xmm1 - vpextrb ecx, xmm1, 15 - test cl, 1 - je .LBB0_122 -# %bb.121: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 416] # 32-byte Reload - vextracti128 xmm1, ymm1, 1 - vpextrq rcx, xmm1, 1 - vpextrb byte ptr [r8 + rcx], xmm15, 15 -.LBB0_122: # in Loop: Header=BB0_26 Depth=1 - vpmovzxdq ymm1, xmm10 # ymm1 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero - vmovdqa ymmword ptr [rsp + 384], ymm1 # 32-byte Spill - vpcmpgtd xmm1, xmm0, xmm10 - vmovd ecx, xmm1 - test cl, 1 - je .LBB0_124 -# %bb.123: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 384] # 32-byte Reload - vmovq rcx, xmm1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 0 -.LBB0_124: # in Loop: Header=BB0_26 Depth=1 - vpcmpgtd xmm1, xmm0, xmm10 - vpackssdw xmm1, xmm1, xmm1 - vpermq ymm1, ymm1, 212 # ymm1 = ymm1[0,1,1,3] - vpacksswb ymm1, ymm1, ymm0 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 1 - test cl, 1 - je .LBB0_126 -# %bb.125: # in Loop: Header=BB0_26 Depth=1 - vmovdqa 
ymm1, ymmword ptr [rsp + 384] # 32-byte Reload - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 1 -.LBB0_126: # in Loop: Header=BB0_26 Depth=1 - vpcmpgtd xmm1, xmm0, xmm10 - vpackssdw xmm1, xmm1, xmm1 - vpermq ymm1, ymm1, 212 # ymm1 = ymm1[0,1,1,3] - vpacksswb ymm1, ymm1, ymm0 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 2 - test cl, 1 - je .LBB0_128 -# %bb.127: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 384] # 32-byte Reload - vextracti128 xmm1, ymm1, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 2 -.LBB0_128: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vpcmpgtd xmm2, xmm0, xmm10 - vpackssdw xmm2, xmm2, xmm2 - vpermq ymm2, ymm2, 212 # ymm2 = ymm2[0,1,1,3] - vpacksswb ymm2, ymm2, ymm0 - vextracti128 xmm2, ymm2, 1 - vpextrb ecx, xmm2, 3 - test cl, 1 - je .LBB0_130 -# %bb.129: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm2, ymmword ptr [rsp + 384] # 32-byte Reload - vextracti128 xmm2, ymm2, 1 - vpextrq rcx, xmm2, 1 - vextracti128 xmm2, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm2, 3 -.LBB0_130: # in Loop: Header=BB0_26 Depth=1 - vpmovzxdq ymm1, xmm1 # ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero - vmovdqa ymmword ptr [rsp + 352], ymm1 # 32-byte Spill - vpackssdw ymm1, ymm0, ymm8 - vpacksswb ymm1, ymm1, ymm0 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 4 - test cl, 1 - je .LBB0_132 -# %bb.131: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 352] # 32-byte Reload - vmovq rcx, xmm1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 4 -.LBB0_132: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm0, ymm8 - vpacksswb ymm1, ymm1, ymm0 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 5 - test cl, 1 - je .LBB0_134 -# %bb.133: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 352] # 32-byte Reload - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr 
[r8 + rcx], xmm1, 5 -.LBB0_134: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm0, ymm8 - vpacksswb ymm1, ymm1, ymm0 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 6 - test cl, 1 - je .LBB0_136 -# %bb.135: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 352] # 32-byte Reload - vextracti128 xmm1, ymm1, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 6 -.LBB0_136: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm0, ymm8 - vpacksswb ymm1, ymm1, ymm0 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 7 - test cl, 1 - je .LBB0_138 -# %bb.137: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 352] # 32-byte Reload - vextracti128 xmm1, ymm1, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 7 -.LBB0_138: # in Loop: Header=BB0_26 Depth=1 - vpmovzxdq ymm1, xmm11 # ymm1 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero - vmovdqa ymmword ptr [rsp + 320], ymm1 # 32-byte Spill - vpacksswb ymm1, ymm0, ymm12 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 8 - test cl, 1 - je .LBB0_140 -# %bb.139: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 320] # 32-byte Reload - vmovq rcx, xmm1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 8 -.LBB0_140: # in Loop: Header=BB0_26 Depth=1 - vpcmpgtd xmm1, xmm0, xmm11 - vpackssdw xmm1, xmm1, xmm1 - vpermq ymm1, ymm1, 212 # ymm1 = ymm1[0,1,1,3] - vpacksswb ymm1, ymm0, ymm1 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 9 - test cl, 1 - je .LBB0_142 -# %bb.141: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 320] # 32-byte Reload - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 9 -.LBB0_142: # in Loop: Header=BB0_26 Depth=1 - vpcmpgtd xmm1, xmm0, xmm11 - vpackssdw xmm1, xmm1, xmm1 - vpermq ymm1, ymm1, 212 # ymm1 = ymm1[0,1,1,3] - vpacksswb ymm1, ymm0, ymm1 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 
10 - test cl, 1 - je .LBB0_144 -# %bb.143: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 320] # 32-byte Reload - vextracti128 xmm1, ymm1, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 10 -.LBB0_144: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vpcmpgtd xmm4, xmm0, xmm11 - vpackssdw xmm4, xmm4, xmm4 - vpermq ymm4, ymm4, 212 # ymm4 = ymm4[0,1,1,3] - vpacksswb ymm4, ymm0, ymm4 - vextracti128 xmm4, ymm4, 1 - vpextrb ecx, xmm4, 11 - test cl, 1 - je .LBB0_146 -# %bb.145: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm2, ymmword ptr [rsp + 320] # 32-byte Reload - vextracti128 xmm4, ymm2, 1 - vpextrq rcx, xmm4, 1 - vextracti128 xmm4, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm4, 11 -.LBB0_146: # in Loop: Header=BB0_26 Depth=1 - vpmovzxdq ymm4, xmm1 # ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero - vpackssdw ymm1, ymm0, ymm9 - vpacksswb ymm1, ymm0, ymm1 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 12 - test cl, 1 - je .LBB0_148 -# %bb.147: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm4 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 12 -.LBB0_148: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm0, ymm9 - vpacksswb ymm1, ymm0, ymm1 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 13 - test cl, 1 - je .LBB0_150 -# %bb.149: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm4, 1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 13 -.LBB0_150: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm0, ymm9 - vpacksswb ymm1, ymm0, ymm1 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 14 - test cl, 1 - je .LBB0_152 -# %bb.151: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 14 -.LBB0_152: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm0, ymm9 - vpacksswb ymm1, ymm0, ymm1 - vextracti128 xmm1, ymm1, 1 - vpextrb ecx, xmm1, 15 - test 
cl, 1 - je .LBB0_154 -# %bb.153: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm15, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 15 -.LBB0_154: # in Loop: Header=BB0_26 Depth=1 - vpackssdw ymm1, ymm6, ymm8 - vpermq ymm1, ymm1, 216 # ymm1 = ymm1[0,2,1,3] - vpackssdw ymm5, ymm7, ymm9 - vpermq ymm5, ymm5, 216 # ymm5 = ymm5[0,2,1,3] - vpacksswb ymm1, ymm1, ymm5 - vmovdqa ymm2, ymmword ptr [rsp + 768] # 32-byte Reload - vpor ymm15, ymm2, ymmword ptr [rsp + 512] # 32-byte Folded Reload - vpor ymm5, ymm2, ymmword ptr [rsp + 480] # 32-byte Folded Reload - vpor ymm10, ymm2, ymmword ptr [rsp + 384] # 32-byte Folded Reload - vpor ymm9, ymm2, ymmword ptr [rsp + 352] # 32-byte Folded Reload - vpor ymm12, ymm2, ymmword ptr [rsp + 448] # 32-byte Folded Reload - vpor ymm11, ymm2, ymmword ptr [rsp + 416] # 32-byte Folded Reload - vpor ymm8, ymm2, ymmword ptr [rsp + 320] # 32-byte Folded Reload - vpor ymm7, ymm4, ymm2 - vperm2i128 ymm6, ymm8, ymm7, 49 # ymm6 = ymm8[2,3],ymm7[2,3] - vinserti128 ymm13, ymm8, xmm7, 1 - vshufps ymm6, ymm13, ymm6, 136 # ymm6 = ymm13[0,2],ymm6[0,2],ymm13[4,6],ymm6[4,6] - vperm2i128 ymm13, ymm12, ymm11, 49 # ymm13 = ymm12[2,3],ymm11[2,3] - vinserti128 ymm14, ymm12, xmm11, 1 - vshufps ymm13, ymm14, ymm13, 136 # ymm13 = ymm14[0,2],ymm13[0,2],ymm14[4,6],ymm13[4,6] - vperm2i128 ymm14, ymm10, ymm9, 49 # ymm14 = ymm10[2,3],ymm9[2,3] - vinserti128 ymm2, ymm10, xmm9, 1 - vshufps ymm2, ymm2, ymm14, 136 # ymm2 = ymm2[0,2],ymm14[0,2],ymm2[4,6],ymm14[4,6] - vperm2i128 ymm14, ymm15, ymm5, 49 # ymm14 = ymm15[2,3],ymm5[2,3] - vinserti128 ymm3, ymm15, xmm5, 1 - vshufps ymm3, ymm3, ymm14, 136 # ymm3 = ymm3[0,2],ymm14[0,2],ymm3[4,6],ymm14[4,6] - vpcmpgtd ymm3, ymm0, ymm3 - vpcmpgtd ymm2, ymm0, ymm2 - vpackssdw ymm2, ymm3, ymm2 - vpcmpgtd ymm3, ymm0, ymm13 - vpcmpgtd ymm6, ymm0, ymm6 - vpackssdw ymm3, ymm3, ymm6 - vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3] - vpermq ymm3, ymm3, 216 # ymm3 = ymm3[0,2,1,3] - vpacksswb 
ymm2, ymm2, ymm3 - vpand ymm6, ymm2, ymm1 - vmovd ecx, xmm6 - # implicit-def: $ymm14 - test cl, 1 - je .LBB0_155 -# %bb.660: # in Loop: Header=BB0_26 Depth=1 - vpbroadcastb ymm14, byte ptr [rdi + r11] - vpextrb ecx, xmm6, 1 - test cl, 1 - jne .LBB0_661 -.LBB0_156: # in Loop: Header=BB0_26 Depth=1 - mov rbx, qword ptr [rsp + 224] # 8-byte Reload - vpextrb ecx, xmm6, 2 - test cl, 1 - je .LBB0_157 -.LBB0_662: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rsi], 2 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 3 - test cl, 1 - jne .LBB0_663 -.LBB0_158: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 4 - test cl, 1 - je .LBB0_159 -.LBB0_664: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 264] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rcx], 4 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 5 - test cl, 1 - jne .LBB0_665 -.LBB0_160: # in Loop: Header=BB0_26 Depth=1 - mov rsi, qword ptr [rsp + 232] # 8-byte Reload - vpextrb ecx, xmm6, 6 - test cl, 1 - je .LBB0_161 -.LBB0_666: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 6 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 7 - test cl, 1 - jne .LBB0_667 -.LBB0_162: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 8 - test cl, 1 - je .LBB0_163 -.LBB0_668: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 96] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 8 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 9 - test cl, 1 - jne .LBB0_669 -.LBB0_164: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 10 - test cl, 1 - je .LBB0_165 -.LBB0_670: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 144] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 10 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = 
ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 11 - test cl, 1 - jne .LBB0_671 -.LBB0_166: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 12 - test cl, 1 - je .LBB0_167 -.LBB0_672: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 248] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 12 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 13 - test cl, 1 - jne .LBB0_673 -.LBB0_168: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 14 - test cl, 1 - je .LBB0_169 -.LBB0_674: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 80] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 14 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 15 - test cl, 1 - jne .LBB0_170 - jmp .LBB0_171 - .p2align 4, 0x90 -.LBB0_155: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 1 - test cl, 1 - je .LBB0_156 -.LBB0_661: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r9], 1 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - mov rbx, qword ptr [rsp + 224] # 8-byte Reload - vpextrb ecx, xmm6, 2 - test cl, 1 - jne .LBB0_662 -.LBB0_157: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 3 - test cl, 1 - je .LBB0_158 -.LBB0_663: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 272] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rcx], 3 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 4 - test cl, 1 - jne .LBB0_664 -.LBB0_159: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 5 - test cl, 1 - je .LBB0_160 -.LBB0_665: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r14], 5 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - mov rsi, qword ptr [rsp + 232] # 8-byte Reload - vpextrb ecx, xmm6, 6 - test cl, 1 - jne .LBB0_666 -.LBB0_161: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 7 
- test cl, 1 - je .LBB0_162 -.LBB0_667: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rsi], 7 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 8 - test cl, 1 - jne .LBB0_668 -.LBB0_163: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 9 - test cl, 1 - je .LBB0_164 -.LBB0_669: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rbx], 9 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 10 - test cl, 1 - jne .LBB0_670 -.LBB0_165: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 11 - test cl, 1 - je .LBB0_166 -.LBB0_671: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 256] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 11 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 12 - test cl, 1 - jne .LBB0_672 -.LBB0_167: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 13 - test cl, 1 - je .LBB0_168 -.LBB0_673: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 88] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 13 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 14 - test cl, 1 - jne .LBB0_674 -.LBB0_169: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 15 - test cl, 1 - je .LBB0_171 -.LBB0_170: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 72] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 15 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_171: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 208] # 8-byte Reload - vextracti128 xmm13, ymm6, 1 - vmovd eax, xmm13 - mov dword ptr [rsp + 44], eax # 4-byte Spill - test al, 1 - je .LBB0_172 -# %bb.675: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rax, qword ptr [rsp + 64] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 0 - vinserti128 
ymm14, ymm14, xmm1, 1 - vpextrb eax, xmm13, 1 - mov dword ptr [rsp + 40], eax # 4-byte Spill - test al, 1 - jne .LBB0_676 -.LBB0_173: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm13, 2 - mov dword ptr [rsp + 36], eax # 4-byte Spill - test al, 1 - je .LBB0_174 -.LBB0_677: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rax, qword ptr [rsp + 128] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 2 - vinserti128 ymm14, ymm14, xmm1, 1 - vpextrb eax, xmm13, 3 - mov dword ptr [rsp + 32], eax # 4-byte Spill - test al, 1 - jne .LBB0_678 -.LBB0_175: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm13, 4 - mov dword ptr [rsp + 28], eax # 4-byte Spill - test al, 1 - je .LBB0_176 -.LBB0_679: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rax, qword ptr [rsp + 56] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 4 - vinserti128 ymm14, ymm14, xmm1, 1 - vpextrb eax, xmm13, 5 - mov dword ptr [rsp + 24], eax # 4-byte Spill - test al, 1 - jne .LBB0_680 -.LBB0_177: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm13, 6 - mov dword ptr [rsp + 20], eax # 4-byte Spill - test al, 1 - je .LBB0_178 -.LBB0_681: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + r10], 6 - vinserti128 ymm14, ymm14, xmm1, 1 - vpextrb eax, xmm13, 7 - mov dword ptr [rsp + 316], eax # 4-byte Spill - test al, 1 - jne .LBB0_682 -.LBB0_179: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 216] # 8-byte Reload - vpextrb ebx, xmm13, 8 - test bl, 1 - je .LBB0_181 -.LBB0_180: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 8 - vinserti128 ymm14, ymm14, xmm1, 1 -.LBB0_181: # in Loop: Header=BB0_26 Depth=1 - vpextrb r9d, xmm13, 9 - test r9b, 1 - mov qword ptr [rsp + 280], r13 # 8-byte Spill - mov qword ptr [rsp + 112], r10 # 8-byte Spill - mov qword ptr [rsp + 184], rdx # 8-byte Spill - je .LBB0_183 -# %bb.182: # in Loop: 
Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rcx], 9 - vinserti128 ymm14, ymm14, xmm1, 1 -.LBB0_183: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 200] # 8-byte Reload - mov rcx, qword ptr [rsp + 192] # 8-byte Reload - vpextrb r13d, xmm13, 10 - test r13b, 1 - je .LBB0_184 -# %bb.683: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 10 - vinserti128 ymm14, ymm14, xmm1, 1 - vpextrb eax, xmm13, 11 - test al, 1 - mov qword ptr [rsp + 120], r15 # 8-byte Spill - jne .LBB0_684 -.LBB0_185: # in Loop: Header=BB0_26 Depth=1 - vpextrb r15d, xmm13, 12 - test r15b, 1 - mov qword ptr [rsp + 304], r11 # 8-byte Spill - je .LBB0_186 -.LBB0_685: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rcx, qword ptr [rsp + 184] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rcx], 12 - vinserti128 ymm14, ymm14, xmm1, 1 - vpextrb edx, xmm13, 13 - test dl, 1 - jne .LBB0_686 -.LBB0_187: # in Loop: Header=BB0_26 Depth=1 - vpextrb esi, xmm13, 14 - test sil, 1 - je .LBB0_188 -.LBB0_687: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rcx, qword ptr [rsp + 168] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rcx], 14 - vinserti128 ymm14, ymm14, xmm1, 1 - vpextrb r14d, xmm13, 15 - test r14b, 1 - jne .LBB0_189 - jmp .LBB0_190 - .p2align 4, 0x90 -.LBB0_172: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm13, 1 - mov dword ptr [rsp + 40], eax # 4-byte Spill - test al, 1 - je .LBB0_173 -.LBB0_676: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rax, qword ptr [rsp + 136] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 1 - vinserti128 ymm14, ymm14, xmm1, 1 - vpextrb eax, xmm13, 2 - mov dword ptr [rsp + 36], eax # 4-byte Spill - test al, 1 - jne .LBB0_677 -.LBB0_174: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm13, 3 - mov dword ptr [rsp + 32], eax # 4-byte Spill - test al, 1 - je .LBB0_175 
-.LBB0_678: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + r15], 3 - vinserti128 ymm14, ymm14, xmm1, 1 - vpextrb eax, xmm13, 4 - mov dword ptr [rsp + 28], eax # 4-byte Spill - test al, 1 - jne .LBB0_679 -.LBB0_176: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm13, 5 - mov dword ptr [rsp + 24], eax # 4-byte Spill - test al, 1 - je .LBB0_177 -.LBB0_680: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + r13], 5 - vinserti128 ymm14, ymm14, xmm1, 1 - vpextrb eax, xmm13, 6 - mov dword ptr [rsp + 20], eax # 4-byte Spill - test al, 1 - jne .LBB0_681 -.LBB0_178: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm13, 7 - mov dword ptr [rsp + 316], eax # 4-byte Spill - test al, 1 - je .LBB0_179 -.LBB0_682: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rax, qword ptr [rsp + 240] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 7 - vinserti128 ymm14, ymm14, xmm1, 1 - mov rax, qword ptr [rsp + 216] # 8-byte Reload - vpextrb ebx, xmm13, 8 - test bl, 1 - jne .LBB0_180 - jmp .LBB0_181 - .p2align 4, 0x90 -.LBB0_184: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm13, 11 - test al, 1 - mov qword ptr [rsp + 120], r15 # 8-byte Spill - je .LBB0_185 -.LBB0_684: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rcx], 11 - vinserti128 ymm14, ymm14, xmm1, 1 - vpextrb r15d, xmm13, 12 - test r15b, 1 - mov qword ptr [rsp + 304], r11 # 8-byte Spill - jne .LBB0_685 -.LBB0_186: # in Loop: Header=BB0_26 Depth=1 - vpextrb edx, xmm13, 13 - test dl, 1 - je .LBB0_187 -.LBB0_686: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rcx, qword ptr [rsp + 176] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rcx], 13 - vinserti128 ymm14, ymm14, xmm1, 1 - vpextrb esi, xmm13, 14 - test sil, 1 - jne .LBB0_687 -.LBB0_188: # in Loop: Header=BB0_26 Depth=1 - vpextrb r14d, xmm13, 15 - test r14b, 
1 - je .LBB0_190 -.LBB0_189: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rcx, qword ptr [rsp + 160] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rcx], 15 - vinserti128 ymm14, ymm14, xmm1, 1 -.LBB0_190: # in Loop: Header=BB0_26 Depth=1 - vpsrlw ymm1, ymm14, 1 - vpand ymm14, ymm1, ymmword ptr [rip + .LCPI0_4] - vmovd r10d, xmm6 - test r10b, 1 - je .LBB0_191 -# %bb.688: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm15 - vpextrb byte ptr [r8 + rcx], xmm14, 0 - vpextrb ecx, xmm6, 1 - test cl, 1 - jne .LBB0_689 -.LBB0_192: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 2 - test cl, 1 - je .LBB0_193 -.LBB0_690: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vmovq rcx, xmm1 - vpextrb byte ptr [r8 + rcx], xmm14, 2 - vpextrb ecx, xmm6, 3 - test cl, 1 - jne .LBB0_691 -.LBB0_194: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 4 - test cl, 1 - je .LBB0_195 -.LBB0_692: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm5 - vpextrb byte ptr [r8 + rcx], xmm14, 4 - vpextrb ecx, xmm6, 5 - test cl, 1 - jne .LBB0_693 -.LBB0_196: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 6 - test cl, 1 - je .LBB0_197 -.LBB0_694: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vmovq rcx, xmm1 - vpextrb byte ptr [r8 + rcx], xmm14, 6 - vpextrb ecx, xmm6, 7 - test cl, 1 - jne .LBB0_695 -.LBB0_198: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 8 - test cl, 1 - je .LBB0_199 -.LBB0_696: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm12 - vpextrb byte ptr [r8 + rcx], xmm14, 8 - vpextrb ecx, xmm6, 9 - test cl, 1 - jne .LBB0_697 -.LBB0_200: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 10 - test cl, 1 - je .LBB0_201 -.LBB0_698: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm12, 1 - vmovq rcx, xmm1 - vpextrb byte ptr [r8 + rcx], xmm14, 10 - vpextrb ecx, xmm6, 11 - test cl, 1 - jne .LBB0_699 -.LBB0_202: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 12 - test cl, 1 - je .LBB0_203 
-.LBB0_700: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm11 - vpextrb byte ptr [r8 + rcx], xmm14, 12 - vpextrb ecx, xmm6, 13 - test cl, 1 - jne .LBB0_701 -.LBB0_204: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 14 - test cl, 1 - je .LBB0_205 -.LBB0_702: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vmovq rcx, xmm1 - vpextrb byte ptr [r8 + rcx], xmm14, 14 - vpextrb ecx, xmm6, 15 - test cl, 1 - jne .LBB0_703 -.LBB0_206: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 44], 1 # 1-byte Folded Reload - je .LBB0_207 -.LBB0_704: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm10 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 0 - test byte ptr [rsp + 40], 1 # 1-byte Folded Reload - jne .LBB0_705 -.LBB0_208: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 36], 1 # 1-byte Folded Reload - je .LBB0_209 -.LBB0_706: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 2 - test byte ptr [rsp + 32], 1 # 1-byte Folded Reload - jne .LBB0_707 -.LBB0_210: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 28], 1 # 1-byte Folded Reload - je .LBB0_211 -.LBB0_708: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm9 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 4 - test byte ptr [rsp + 24], 1 # 1-byte Folded Reload - jne .LBB0_709 -.LBB0_212: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 20], 1 # 1-byte Folded Reload - je .LBB0_213 -.LBB0_710: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 6 - test byte ptr [rsp + 316], 1 # 1-byte Folded Reload - jne .LBB0_711 -.LBB0_214: # in Loop: Header=BB0_26 Depth=1 - test bl, 1 - je .LBB0_215 -.LBB0_712: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm8 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 8 - test r9b, 1 - mov r10, qword ptr 
[rsp + 224] # 8-byte Reload - mov r11, qword ptr [rsp + 144] # 8-byte Reload - jne .LBB0_713 -.LBB0_216: # in Loop: Header=BB0_26 Depth=1 - test r13b, 1 - mov rbx, qword ptr [rsp + 296] # 8-byte Reload - je .LBB0_217 -.LBB0_714: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 10 - test al, 1 - mov r9, qword ptr [rsp + 288] # 8-byte Reload - mov rax, qword ptr [rsp + 232] # 8-byte Reload - jne .LBB0_715 -.LBB0_218: # in Loop: Header=BB0_26 Depth=1 - test r15b, 1 - je .LBB0_219 -.LBB0_716: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm7 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 12 - test dl, 1 - mov r13, qword ptr [rsp + 136] # 8-byte Reload - mov r15, qword ptr [rsp + 128] # 8-byte Reload - jne .LBB0_717 -.LBB0_220: # in Loop: Header=BB0_26 Depth=1 - test sil, 1 - mov rdx, qword ptr [rsp + 304] # 8-byte Reload - je .LBB0_221 -.LBB0_718: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 14 - test r14b, 1 - mov rsi, qword ptr [rsp + 152] # 8-byte Reload - jne .LBB0_222 - jmp .LBB0_223 - .p2align 4, 0x90 -.LBB0_191: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 1 - test cl, 1 - je .LBB0_192 -.LBB0_689: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm15, 1 - vpextrb byte ptr [r8 + rcx], xmm14, 1 - vpextrb ecx, xmm6, 2 - test cl, 1 - jne .LBB0_690 -.LBB0_193: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 3 - test cl, 1 - je .LBB0_194 -.LBB0_691: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vpextrq rcx, xmm1, 1 - vpextrb byte ptr [r8 + rcx], xmm14, 3 - vpextrb ecx, xmm6, 4 - test cl, 1 - jne .LBB0_692 -.LBB0_195: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 5 - test cl, 1 - je .LBB0_196 -.LBB0_693: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm5, 1 - vpextrb byte ptr [r8 + rcx], xmm14, 5 - vpextrb 
ecx, xmm6, 6 - test cl, 1 - jne .LBB0_694 -.LBB0_197: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 7 - test cl, 1 - je .LBB0_198 -.LBB0_695: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vpextrq rcx, xmm1, 1 - vpextrb byte ptr [r8 + rcx], xmm14, 7 - vpextrb ecx, xmm6, 8 - test cl, 1 - jne .LBB0_696 -.LBB0_199: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 9 - test cl, 1 - je .LBB0_200 -.LBB0_697: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm12, 1 - vpextrb byte ptr [r8 + rcx], xmm14, 9 - vpextrb ecx, xmm6, 10 - test cl, 1 - jne .LBB0_698 -.LBB0_201: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 11 - test cl, 1 - je .LBB0_202 -.LBB0_699: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm12, 1 - vpextrq rcx, xmm1, 1 - vpextrb byte ptr [r8 + rcx], xmm14, 11 - vpextrb ecx, xmm6, 12 - test cl, 1 - jne .LBB0_700 -.LBB0_203: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 13 - test cl, 1 - je .LBB0_204 -.LBB0_701: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm11, 1 - vpextrb byte ptr [r8 + rcx], xmm14, 13 - vpextrb ecx, xmm6, 14 - test cl, 1 - jne .LBB0_702 -.LBB0_205: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 15 - test cl, 1 - je .LBB0_206 -.LBB0_703: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vpextrq rcx, xmm1, 1 - vpextrb byte ptr [r8 + rcx], xmm14, 15 - test byte ptr [rsp + 44], 1 # 1-byte Folded Reload - jne .LBB0_704 -.LBB0_207: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 40], 1 # 1-byte Folded Reload - je .LBB0_208 -.LBB0_705: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm10, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 1 - test byte ptr [rsp + 36], 1 # 1-byte Folded Reload - jne .LBB0_706 -.LBB0_209: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 32], 1 # 1-byte Folded Reload - je .LBB0_210 -.LBB0_707: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 
- vpextrb byte ptr [r8 + rcx], xmm1, 3 - test byte ptr [rsp + 28], 1 # 1-byte Folded Reload - jne .LBB0_708 -.LBB0_211: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 24], 1 # 1-byte Folded Reload - je .LBB0_212 -.LBB0_709: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm9, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 5 - test byte ptr [rsp + 20], 1 # 1-byte Folded Reload - jne .LBB0_710 -.LBB0_213: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 316], 1 # 1-byte Folded Reload - je .LBB0_214 -.LBB0_711: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 7 - test bl, 1 - jne .LBB0_712 -.LBB0_215: # in Loop: Header=BB0_26 Depth=1 - test r9b, 1 - mov r10, qword ptr [rsp + 224] # 8-byte Reload - mov r11, qword ptr [rsp + 144] # 8-byte Reload - je .LBB0_216 -.LBB0_713: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm8, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 9 - test r13b, 1 - mov rbx, qword ptr [rsp + 296] # 8-byte Reload - jne .LBB0_714 -.LBB0_217: # in Loop: Header=BB0_26 Depth=1 - test al, 1 - mov r9, qword ptr [rsp + 288] # 8-byte Reload - mov rax, qword ptr [rsp + 232] # 8-byte Reload - je .LBB0_218 -.LBB0_715: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 11 - test r15b, 1 - jne .LBB0_716 -.LBB0_219: # in Loop: Header=BB0_26 Depth=1 - test dl, 1 - mov r13, qword ptr [rsp + 136] # 8-byte Reload - mov r15, qword ptr [rsp + 128] # 8-byte Reload - je .LBB0_220 -.LBB0_717: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm7, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 13 - test sil, 1 - mov rdx, qword ptr [rsp + 304] # 8-byte Reload - jne .LBB0_718 -.LBB0_221: # in Loop: Header=BB0_26 Depth=1 - test r14b, 1 - mov rsi, qword ptr [rsp + 152] # 8-byte Reload - je 
.LBB0_223 -.LBB0_222: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 15 -.LBB0_223: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 736] # 32-byte Reload - vpor ymm15, ymm1, ymmword ptr [rsp + 512] # 32-byte Folded Reload - vpor ymm5, ymm1, ymmword ptr [rsp + 480] # 32-byte Folded Reload - vpor ymm10, ymm1, ymmword ptr [rsp + 384] # 32-byte Folded Reload - vpor ymm9, ymm1, ymmword ptr [rsp + 352] # 32-byte Folded Reload - vpor ymm12, ymm1, ymmword ptr [rsp + 448] # 32-byte Folded Reload - vpor ymm11, ymm1, ymmword ptr [rsp + 416] # 32-byte Folded Reload - vpor ymm8, ymm1, ymmword ptr [rsp + 320] # 32-byte Folded Reload - vpor ymm7, ymm4, ymm1 - vperm2i128 ymm1, ymm8, ymm7, 49 # ymm1 = ymm8[2,3],ymm7[2,3] - vinserti128 ymm2, ymm8, xmm7, 1 - vshufps ymm1, ymm2, ymm1, 136 # ymm1 = ymm2[0,2],ymm1[0,2],ymm2[4,6],ymm1[4,6] - vperm2i128 ymm2, ymm12, ymm11, 49 # ymm2 = ymm12[2,3],ymm11[2,3] - vinserti128 ymm3, ymm12, xmm11, 1 - vshufps ymm2, ymm3, ymm2, 136 # ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6] - vperm2i128 ymm3, ymm10, ymm9, 49 # ymm3 = ymm10[2,3],ymm9[2,3] - vinserti128 ymm13, ymm10, xmm9, 1 - vshufps ymm3, ymm13, ymm3, 136 # ymm3 = ymm13[0,2],ymm3[0,2],ymm13[4,6],ymm3[4,6] - vperm2i128 ymm13, ymm15, ymm5, 49 # ymm13 = ymm15[2,3],ymm5[2,3] - vinserti128 ymm14, ymm15, xmm5, 1 - vshufps ymm13, ymm14, ymm13, 136 # ymm13 = ymm14[0,2],ymm13[0,2],ymm14[4,6],ymm13[4,6] - vpcmpgtd ymm13, ymm0, ymm13 - vpcmpgtd ymm3, ymm0, ymm3 - vpackssdw ymm3, ymm13, ymm3 - vpcmpgtd ymm2, ymm0, ymm2 - vpcmpgtd ymm1, ymm0, ymm1 - vpackssdw ymm1, ymm2, ymm1 - vpermq ymm2, ymm3, 216 # ymm2 = ymm3[0,2,1,3] - vpermq ymm1, ymm1, 216 # ymm1 = ymm1[0,2,1,3] - vpacksswb ymm1, ymm2, ymm1 - vpand ymm6, ymm1, ymm6 - vmovd ecx, xmm6 - # implicit-def: $ymm14 - test cl, 1 - je .LBB0_224 -# %bb.719: # in Loop: Header=BB0_26 Depth=1 - vpbroadcastb ymm14, byte ptr [rdi + rdx] - 
vpextrb ecx, xmm6, 1 - test cl, 1 - jne .LBB0_720 -.LBB0_225: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 104] # 8-byte Reload - vpextrb ecx, xmm6, 2 - test cl, 1 - je .LBB0_227 -.LBB0_226: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rbx], 2 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_227: # in Loop: Header=BB0_26 Depth=1 - mov rsi, qword ptr [rsp + 96] # 8-byte Reload - mov rbx, qword ptr [rsp + 72] # 8-byte Reload - vpextrb ecx, xmm6, 3 - test cl, 1 - je .LBB0_228 -# %bb.721: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 272] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rcx], 3 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 4 - test cl, 1 - jne .LBB0_722 -.LBB0_229: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 5 - test cl, 1 - je .LBB0_230 -.LBB0_723: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rdx], 5 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 6 - test cl, 1 - jne .LBB0_724 -.LBB0_231: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 7 - test cl, 1 - je .LBB0_232 -.LBB0_725: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 7 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 8 - test cl, 1 - jne .LBB0_726 -.LBB0_233: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 88] # 8-byte Reload - vpextrb ecx, xmm6, 9 - test cl, 1 - je .LBB0_234 -.LBB0_727: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r10], 9 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 10 - test cl, 1 - jne .LBB0_728 -.LBB0_235: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 11 - test cl, 1 - je .LBB0_236 -.LBB0_729: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 256] # 8-byte Reload - 
vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 11 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 12 - test cl, 1 - jne .LBB0_730 -.LBB0_237: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 13 - test cl, 1 - je .LBB0_239 -.LBB0_238: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rdx], 13 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_239: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 80] # 8-byte Reload - mov rdx, qword ptr [rsp + 64] # 8-byte Reload - vpextrb ecx, xmm6, 14 - test cl, 1 - je .LBB0_241 -# %bb.240: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 14 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_241: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 15 - test cl, 1 - je .LBB0_243 -# %bb.242: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rbx], 15 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_243: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm6, 1 - vmovd eax, xmm1 - mov dword ptr [rsp + 44], eax # 4-byte Spill - test al, 1 - je .LBB0_245 -# %bb.244: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rdx], 0 - vinserti128 ymm14, ymm14, xmm2, 1 -.LBB0_245: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 56] # 8-byte Reload - vpextrb eax, xmm1, 1 - mov dword ptr [rsp + 40], eax # 4-byte Spill - test al, 1 - je .LBB0_247 -# %bb.246: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + r13], 1 - vinserti128 ymm14, ymm14, xmm2, 1 -.LBB0_247: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 280] # 8-byte Reload - mov rsi, qword ptr [rsp + 112] # 8-byte Reload - vpextrb eax, xmm1, 2 - mov dword ptr [rsp + 36], eax # 4-byte Spill - test al, 1 - je .LBB0_249 -# %bb.248: # in Loop: 
Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + r15], 2 - vinserti128 ymm14, ymm14, xmm2, 1 -.LBB0_249: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 120] # 8-byte Reload - vpextrb ebx, xmm1, 3 - mov dword ptr [rsp + 32], ebx # 4-byte Spill - test bl, 1 - je .LBB0_250 -# %bb.731: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 3 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb eax, xmm1, 4 - mov dword ptr [rsp + 28], eax # 4-byte Spill - test al, 1 - jne .LBB0_732 -.LBB0_251: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm1, 5 - mov dword ptr [rsp + 24], eax # 4-byte Spill - test al, 1 - je .LBB0_252 -.LBB0_733: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rdx], 5 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb eax, xmm1, 6 - mov dword ptr [rsp + 20], eax # 4-byte Spill - test al, 1 - jne .LBB0_734 -.LBB0_253: # in Loop: Header=BB0_26 Depth=1 - vpextrb r9d, xmm1, 7 - test r9b, 1 - je .LBB0_254 -.LBB0_735: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 240] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 7 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb edx, xmm1, 8 - test dl, 1 - jne .LBB0_736 -.LBB0_255: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 9 - test cl, 1 - je .LBB0_256 -.LBB0_737: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 208] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 9 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb esi, xmm1, 10 - test sil, 1 - jne .LBB0_738 -.LBB0_257: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm1, 11 - test al, 1 - je .LBB0_258 -.LBB0_739: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 192] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 11 - vinserti128 ymm14, ymm14, xmm2, 1 - 
vpextrb r13d, xmm1, 12 - test r13b, 1 - jne .LBB0_740 -.LBB0_259: # in Loop: Header=BB0_26 Depth=1 - vpextrb r10d, xmm1, 13 - test r10b, 1 - je .LBB0_260 -.LBB0_741: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 176] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 13 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r11d, xmm1, 14 - test r11b, 1 - jne .LBB0_742 -.LBB0_261: # in Loop: Header=BB0_26 Depth=1 - vpextrb r14d, xmm1, 15 - test r14b, 1 - je .LBB0_263 -.LBB0_262: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rbx, qword ptr [rsp + 160] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rbx], 15 - vinserti128 ymm14, ymm14, xmm1, 1 -.LBB0_263: # in Loop: Header=BB0_26 Depth=1 - vpsrlw ymm1, ymm14, 2 - vpand ymm14, ymm1, ymmword ptr [rip + .LCPI0_4] - vmovd r15d, xmm6 - test r15b, 1 - je .LBB0_264 -# %bb.743: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm15 - vpextrb byte ptr [r8 + rbx], xmm14, 0 - vpextrb ebx, xmm6, 1 - test bl, 1 - jne .LBB0_744 -.LBB0_265: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 2 - test bl, 1 - mov r15, qword ptr [rsp + 224] # 8-byte Reload - je .LBB0_266 -.LBB0_745: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 2 - vpextrb ebx, xmm6, 3 - test bl, 1 - jne .LBB0_746 -.LBB0_267: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 4 - test bl, 1 - je .LBB0_268 -.LBB0_747: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm5 - vpextrb byte ptr [r8 + rbx], xmm14, 4 - vpextrb ebx, xmm6, 5 - test bl, 1 - jne .LBB0_748 -.LBB0_269: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 6 - test bl, 1 - je .LBB0_270 -.LBB0_749: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 6 - vpextrb ebx, xmm6, 7 - test bl, 1 - jne .LBB0_750 -.LBB0_271: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 8 - test bl, 1 - je 
.LBB0_272 -.LBB0_751: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm12 - vpextrb byte ptr [r8 + rbx], xmm14, 8 - vpextrb ebx, xmm6, 9 - test bl, 1 - jne .LBB0_752 -.LBB0_273: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 10 - test bl, 1 - je .LBB0_274 -.LBB0_753: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm12, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 10 - vpextrb ebx, xmm6, 11 - test bl, 1 - jne .LBB0_754 -.LBB0_275: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 12 - test bl, 1 - je .LBB0_276 -.LBB0_755: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm11 - vpextrb byte ptr [r8 + rbx], xmm14, 12 - vpextrb ebx, xmm6, 13 - test bl, 1 - jne .LBB0_756 -.LBB0_277: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 14 - test bl, 1 - je .LBB0_278 -.LBB0_757: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 14 - vpextrb ebx, xmm6, 15 - test bl, 1 - jne .LBB0_758 -.LBB0_279: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 44], 1 # 1-byte Folded Reload - je .LBB0_280 -.LBB0_759: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm10 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 0 - test byte ptr [rsp + 40], 1 # 1-byte Folded Reload - jne .LBB0_760 -.LBB0_281: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 36], 1 # 1-byte Folded Reload - je .LBB0_282 -.LBB0_761: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vmovq rbx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 2 - test byte ptr [rsp + 32], 1 # 1-byte Folded Reload - jne .LBB0_762 -.LBB0_283: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 28], 1 # 1-byte Folded Reload - je .LBB0_284 -.LBB0_763: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm9 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 4 - test byte ptr [rsp + 24], 1 # 1-byte Folded Reload - jne .LBB0_764 -.LBB0_285: # in Loop: Header=BB0_26 
Depth=1 - test byte ptr [rsp + 20], 1 # 1-byte Folded Reload - je .LBB0_286 -.LBB0_765: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vmovq rbx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 6 - test r9b, 1 - jne .LBB0_766 -.LBB0_287: # in Loop: Header=BB0_26 Depth=1 - test dl, 1 - mov rbx, qword ptr [rsp + 296] # 8-byte Reload - je .LBB0_288 -.LBB0_767: # in Loop: Header=BB0_26 Depth=1 - vmovq rdx, xmm8 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rdx], xmm1, 8 - test cl, 1 - jne .LBB0_768 -.LBB0_289: # in Loop: Header=BB0_26 Depth=1 - test sil, 1 - mov rdx, qword ptr [rsp + 304] # 8-byte Reload - je .LBB0_290 -.LBB0_769: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 10 - test al, 1 - mov rsi, qword ptr [rsp + 152] # 8-byte Reload - jne .LBB0_770 -.LBB0_291: # in Loop: Header=BB0_26 Depth=1 - test r13b, 1 - je .LBB0_292 -.LBB0_771: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm7 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 12 - test r10b, 1 - mov r13, qword ptr [rsp + 280] # 8-byte Reload - jne .LBB0_772 -.LBB0_293: # in Loop: Header=BB0_26 Depth=1 - test r11b, 1 - je .LBB0_294 -.LBB0_773: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 14 - test r14b, 1 - mov rax, qword ptr [rsp + 288] # 8-byte Reload - mov r9, qword ptr [rsp + 232] # 8-byte Reload - jne .LBB0_295 - jmp .LBB0_296 - .p2align 4, 0x90 -.LBB0_224: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 1 - test cl, 1 - je .LBB0_225 -.LBB0_720: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rsi], 1 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - mov rdx, qword ptr [rsp + 104] # 8-byte Reload - vpextrb ecx, xmm6, 2 - test cl, 1 - jne .LBB0_226 - jmp .LBB0_227 - .p2align 4, 0x90 
-.LBB0_228: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 4 - test cl, 1 - je .LBB0_229 -.LBB0_722: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 264] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rcx], 4 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 5 - test cl, 1 - jne .LBB0_723 -.LBB0_230: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 6 - test cl, 1 - je .LBB0_231 -.LBB0_724: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r9], 6 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 7 - test cl, 1 - jne .LBB0_725 -.LBB0_232: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 8 - test cl, 1 - je .LBB0_233 -.LBB0_726: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rsi], 8 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - mov rdx, qword ptr [rsp + 88] # 8-byte Reload - vpextrb ecx, xmm6, 9 - test cl, 1 - jne .LBB0_727 -.LBB0_234: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 10 - test cl, 1 - je .LBB0_235 -.LBB0_728: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r11], 10 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 11 - test cl, 1 - jne .LBB0_729 -.LBB0_236: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 12 - test cl, 1 - je .LBB0_237 -.LBB0_730: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 248] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 12 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 13 - test cl, 1 - jne .LBB0_238 - jmp .LBB0_239 - .p2align 4, 0x90 -.LBB0_250: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm1, 4 - mov dword ptr [rsp + 28], eax # 4-byte Spill - test al, 1 - je .LBB0_251 -.LBB0_732: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr 
[rdi + rcx], 4 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb eax, xmm1, 5 - mov dword ptr [rsp + 24], eax # 4-byte Spill - test al, 1 - jne .LBB0_733 -.LBB0_252: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm1, 6 - mov dword ptr [rsp + 20], eax # 4-byte Spill - test al, 1 - je .LBB0_253 -.LBB0_734: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rsi], 6 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r9d, xmm1, 7 - test r9b, 1 - jne .LBB0_735 -.LBB0_254: # in Loop: Header=BB0_26 Depth=1 - vpextrb edx, xmm1, 8 - test dl, 1 - je .LBB0_255 -.LBB0_736: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 216] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 8 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 9 - test cl, 1 - jne .LBB0_737 -.LBB0_256: # in Loop: Header=BB0_26 Depth=1 - vpextrb esi, xmm1, 10 - test sil, 1 - je .LBB0_257 -.LBB0_738: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 200] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 10 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb eax, xmm1, 11 - test al, 1 - jne .LBB0_739 -.LBB0_258: # in Loop: Header=BB0_26 Depth=1 - vpextrb r13d, xmm1, 12 - test r13b, 1 - je .LBB0_259 -.LBB0_740: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 184] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 12 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r10d, xmm1, 13 - test r10b, 1 - jne .LBB0_741 -.LBB0_260: # in Loop: Header=BB0_26 Depth=1 - vpextrb r11d, xmm1, 14 - test r11b, 1 - je .LBB0_261 -.LBB0_742: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 168] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 14 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r14d, xmm1, 15 - test r14b, 1 - jne .LBB0_262 - jmp .LBB0_263 - .p2align 4, 0x90 -.LBB0_264: # in Loop: 
Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 1 - test bl, 1 - je .LBB0_265 -.LBB0_744: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm15, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 1 - vpextrb ebx, xmm6, 2 - test bl, 1 - mov r15, qword ptr [rsp + 224] # 8-byte Reload - jne .LBB0_745 -.LBB0_266: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 3 - test bl, 1 - je .LBB0_267 -.LBB0_746: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 3 - vpextrb ebx, xmm6, 4 - test bl, 1 - jne .LBB0_747 -.LBB0_268: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 5 - test bl, 1 - je .LBB0_269 -.LBB0_748: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm5, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 5 - vpextrb ebx, xmm6, 6 - test bl, 1 - jne .LBB0_749 -.LBB0_270: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 7 - test bl, 1 - je .LBB0_271 -.LBB0_750: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 7 - vpextrb ebx, xmm6, 8 - test bl, 1 - jne .LBB0_751 -.LBB0_272: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 9 - test bl, 1 - je .LBB0_273 -.LBB0_752: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm12, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 9 - vpextrb ebx, xmm6, 10 - test bl, 1 - jne .LBB0_753 -.LBB0_274: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 11 - test bl, 1 - je .LBB0_275 -.LBB0_754: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm12, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 11 - vpextrb ebx, xmm6, 12 - test bl, 1 - jne .LBB0_755 -.LBB0_276: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 13 - test bl, 1 - je .LBB0_277 -.LBB0_756: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm11, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 13 - vpextrb ebx, xmm6, 14 - test bl, 1 - jne .LBB0_757 -.LBB0_278: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 15 - test bl, 1 
- je .LBB0_279 -.LBB0_758: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 15 - test byte ptr [rsp + 44], 1 # 1-byte Folded Reload - jne .LBB0_759 -.LBB0_280: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 40], 1 # 1-byte Folded Reload - je .LBB0_281 -.LBB0_760: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm10, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 1 - test byte ptr [rsp + 36], 1 # 1-byte Folded Reload - jne .LBB0_761 -.LBB0_282: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 32], 1 # 1-byte Folded Reload - je .LBB0_283 -.LBB0_762: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vpextrq rbx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 3 - test byte ptr [rsp + 28], 1 # 1-byte Folded Reload - jne .LBB0_763 -.LBB0_284: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 24], 1 # 1-byte Folded Reload - je .LBB0_285 -.LBB0_764: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm9, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 5 - test byte ptr [rsp + 20], 1 # 1-byte Folded Reload - jne .LBB0_765 -.LBB0_286: # in Loop: Header=BB0_26 Depth=1 - test r9b, 1 - je .LBB0_287 -.LBB0_766: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vpextrq rbx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 7 - test dl, 1 - mov rbx, qword ptr [rsp + 296] # 8-byte Reload - jne .LBB0_767 -.LBB0_288: # in Loop: Header=BB0_26 Depth=1 - test cl, 1 - je .LBB0_289 -.LBB0_768: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm8, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 9 - test sil, 1 - mov rdx, qword ptr [rsp + 304] # 8-byte Reload - jne .LBB0_769 -.LBB0_290: # in Loop: Header=BB0_26 Depth=1 - test al, 1 - mov rsi, qword ptr [rsp + 152] # 8-byte Reload - je .LBB0_291 -.LBB0_770: # in Loop: Header=BB0_26 Depth=1 - vextracti128 
xmm1, ymm8, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 11 - test r13b, 1 - jne .LBB0_771 -.LBB0_292: # in Loop: Header=BB0_26 Depth=1 - test r10b, 1 - mov r13, qword ptr [rsp + 280] # 8-byte Reload - je .LBB0_293 -.LBB0_772: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm7, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 13 - test r11b, 1 - jne .LBB0_773 -.LBB0_294: # in Loop: Header=BB0_26 Depth=1 - test r14b, 1 - mov rax, qword ptr [rsp + 288] # 8-byte Reload - mov r9, qword ptr [rsp + 232] # 8-byte Reload - je .LBB0_296 -.LBB0_295: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 15 -.LBB0_296: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 704] # 32-byte Reload - vpor ymm15, ymm1, ymmword ptr [rsp + 512] # 32-byte Folded Reload - vpor ymm5, ymm1, ymmword ptr [rsp + 480] # 32-byte Folded Reload - vpor ymm10, ymm1, ymmword ptr [rsp + 384] # 32-byte Folded Reload - vpor ymm9, ymm1, ymmword ptr [rsp + 352] # 32-byte Folded Reload - vpor ymm12, ymm1, ymmword ptr [rsp + 448] # 32-byte Folded Reload - vpor ymm11, ymm1, ymmword ptr [rsp + 416] # 32-byte Folded Reload - vpor ymm8, ymm1, ymmword ptr [rsp + 320] # 32-byte Folded Reload - vpor ymm7, ymm4, ymm1 - vperm2i128 ymm1, ymm8, ymm7, 49 # ymm1 = ymm8[2,3],ymm7[2,3] - vinserti128 ymm2, ymm8, xmm7, 1 - vshufps ymm1, ymm2, ymm1, 136 # ymm1 = ymm2[0,2],ymm1[0,2],ymm2[4,6],ymm1[4,6] - vperm2i128 ymm2, ymm12, ymm11, 49 # ymm2 = ymm12[2,3],ymm11[2,3] - vinserti128 ymm3, ymm12, xmm11, 1 - vshufps ymm2, ymm3, ymm2, 136 # ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6] - vperm2i128 ymm3, ymm10, ymm9, 49 # ymm3 = ymm10[2,3],ymm9[2,3] - vinserti128 ymm13, ymm10, xmm9, 1 - vshufps ymm3, ymm13, ymm3, 136 # ymm3 = ymm13[0,2],ymm3[0,2],ymm13[4,6],ymm3[4,6] - vperm2i128 ymm13, ymm15, ymm5, 49 # ymm13 = ymm15[2,3],ymm5[2,3] - vinserti128 
ymm14, ymm15, xmm5, 1 - vshufps ymm13, ymm14, ymm13, 136 # ymm13 = ymm14[0,2],ymm13[0,2],ymm14[4,6],ymm13[4,6] - vpcmpgtd ymm13, ymm0, ymm13 - vpcmpgtd ymm3, ymm0, ymm3 - vpackssdw ymm3, ymm13, ymm3 - vpcmpgtd ymm2, ymm0, ymm2 - vpcmpgtd ymm1, ymm0, ymm1 - vpackssdw ymm1, ymm2, ymm1 - vpermq ymm2, ymm3, 216 # ymm2 = ymm3[0,2,1,3] - vpermq ymm1, ymm1, 216 # ymm1 = ymm1[0,2,1,3] - vpacksswb ymm1, ymm2, ymm1 - vpand ymm6, ymm1, ymm6 - vmovd ecx, xmm6 - # implicit-def: $ymm14 - test cl, 1 - je .LBB0_297 -# %bb.774: # in Loop: Header=BB0_26 Depth=1 - vpbroadcastb ymm14, byte ptr [rdi + rdx] - vpextrb ecx, xmm6, 1 - test cl, 1 - jne .LBB0_775 -.LBB0_298: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 104] # 8-byte Reload - vpextrb ecx, xmm6, 2 - test cl, 1 - je .LBB0_300 -.LBB0_299: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rbx], 2 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_300: # in Loop: Header=BB0_26 Depth=1 - mov rsi, qword ptr [rsp + 96] # 8-byte Reload - mov r10, qword ptr [rsp + 72] # 8-byte Reload - vpextrb ecx, xmm6, 3 - test cl, 1 - je .LBB0_301 -# %bb.776: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 272] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rcx], 3 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 4 - test cl, 1 - jne .LBB0_777 -.LBB0_302: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 5 - test cl, 1 - je .LBB0_303 -.LBB0_778: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rdx], 5 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 6 - test cl, 1 - jne .LBB0_779 -.LBB0_304: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 7 - test cl, 1 - je .LBB0_305 -.LBB0_780: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r9], 7 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb 
ecx, xmm6, 8 - test cl, 1 - jne .LBB0_781 -.LBB0_306: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 88] # 8-byte Reload - vpextrb ecx, xmm6, 9 - test cl, 1 - je .LBB0_308 -.LBB0_307: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r15], 9 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_308: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 144] # 8-byte Reload - mov rsi, qword ptr [rsp + 136] # 8-byte Reload - mov rbx, qword ptr [rsp + 128] # 8-byte Reload - mov r9, qword ptr [rsp + 120] # 8-byte Reload - vpextrb ecx, xmm6, 10 - test cl, 1 - je .LBB0_309 -# %bb.782: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 10 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 11 - test cl, 1 - jne .LBB0_783 -.LBB0_310: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 12 - test cl, 1 - je .LBB0_311 -.LBB0_784: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 248] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 12 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 13 - test cl, 1 - jne .LBB0_312 - jmp .LBB0_313 - .p2align 4, 0x90 -.LBB0_297: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 1 - test cl, 1 - je .LBB0_298 -.LBB0_775: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rsi], 1 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - mov rdx, qword ptr [rsp + 104] # 8-byte Reload - vpextrb ecx, xmm6, 2 - test cl, 1 - jne .LBB0_299 - jmp .LBB0_300 - .p2align 4, 0x90 -.LBB0_301: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 4 - test cl, 1 - je .LBB0_302 -.LBB0_777: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 264] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rcx], 4 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 5 - test 
cl, 1 - jne .LBB0_778 -.LBB0_303: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 6 - test cl, 1 - je .LBB0_304 -.LBB0_779: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 6 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 7 - test cl, 1 - jne .LBB0_780 -.LBB0_305: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 8 - test cl, 1 - je .LBB0_306 -.LBB0_781: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rsi], 8 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - mov rdx, qword ptr [rsp + 88] # 8-byte Reload - vpextrb ecx, xmm6, 9 - test cl, 1 - jne .LBB0_307 - jmp .LBB0_308 - .p2align 4, 0x90 -.LBB0_309: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 11 - test cl, 1 - je .LBB0_310 -.LBB0_783: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 256] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 11 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 12 - test cl, 1 - jne .LBB0_784 -.LBB0_311: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 13 - test cl, 1 - je .LBB0_313 -.LBB0_312: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rdx], 13 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_313: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 80] # 8-byte Reload - mov rdx, qword ptr [rsp + 64] # 8-byte Reload - vpextrb ecx, xmm6, 14 - test cl, 1 - je .LBB0_315 -# %bb.314: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 14 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_315: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 15 - test cl, 1 - je .LBB0_317 -# %bb.316: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r10], 15 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_317: # in 
Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm6, 1 - vmovd eax, xmm1 - mov dword ptr [rsp + 44], eax # 4-byte Spill - test al, 1 - je .LBB0_319 -# %bb.318: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rdx], 0 - vinserti128 ymm14, ymm14, xmm2, 1 -.LBB0_319: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 56] # 8-byte Reload - vpextrb ecx, xmm1, 1 - mov dword ptr [rsp + 40], ecx # 4-byte Spill - test cl, 1 - je .LBB0_320 -# %bb.785: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rsi], 1 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 2 - mov dword ptr [rsp + 36], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_786 -.LBB0_321: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 3 - mov dword ptr [rsp + 32], ecx # 4-byte Spill - test cl, 1 - je .LBB0_322 -.LBB0_787: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + r9], 3 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 4 - mov dword ptr [rsp + 28], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_788 -.LBB0_323: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm1, 5 - mov dword ptr [rsp + 24], eax # 4-byte Spill - test al, 1 - je .LBB0_325 -.LBB0_324: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + r13], 5 - vinserti128 ymm14, ymm14, xmm2, 1 -.LBB0_325: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 112] # 8-byte Reload - vpextrb ecx, xmm1, 6 - mov dword ptr [rsp + 20], ecx # 4-byte Spill - test cl, 1 - je .LBB0_326 -# %bb.789: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 6 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r9d, xmm1, 7 - test r9b, 1 - jne .LBB0_790 -.LBB0_327: # in Loop: Header=BB0_26 Depth=1 - vpextrb edx, xmm1, 8 - test dl, 1 - je .LBB0_328 -.LBB0_791: # in Loop: Header=BB0_26 Depth=1 - 
vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 216] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 8 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 9 - test cl, 1 - jne .LBB0_792 -.LBB0_329: # in Loop: Header=BB0_26 Depth=1 - vpextrb esi, xmm1, 10 - test sil, 1 - je .LBB0_330 -.LBB0_793: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 200] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 10 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb eax, xmm1, 11 - test al, 1 - jne .LBB0_794 -.LBB0_331: # in Loop: Header=BB0_26 Depth=1 - vpextrb r13d, xmm1, 12 - test r13b, 1 - je .LBB0_332 -.LBB0_795: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 184] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 12 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r10d, xmm1, 13 - test r10b, 1 - jne .LBB0_796 -.LBB0_333: # in Loop: Header=BB0_26 Depth=1 - vpextrb r11d, xmm1, 14 - test r11b, 1 - je .LBB0_334 -.LBB0_797: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 168] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 14 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r14d, xmm1, 15 - test r14b, 1 - jne .LBB0_335 - jmp .LBB0_336 - .p2align 4, 0x90 -.LBB0_320: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 2 - mov dword ptr [rsp + 36], ecx # 4-byte Spill - test cl, 1 - je .LBB0_321 -.LBB0_786: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 2 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 3 - mov dword ptr [rsp + 32], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_787 -.LBB0_322: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 4 - mov dword ptr [rsp + 28], ecx # 4-byte Spill - test cl, 1 - je .LBB0_323 -.LBB0_788: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 4 - 
vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb eax, xmm1, 5 - mov dword ptr [rsp + 24], eax # 4-byte Spill - test al, 1 - jne .LBB0_324 - jmp .LBB0_325 - .p2align 4, 0x90 -.LBB0_326: # in Loop: Header=BB0_26 Depth=1 - vpextrb r9d, xmm1, 7 - test r9b, 1 - je .LBB0_327 -.LBB0_790: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 240] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 7 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb edx, xmm1, 8 - test dl, 1 - jne .LBB0_791 -.LBB0_328: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 9 - test cl, 1 - je .LBB0_329 -.LBB0_792: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 208] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 9 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb esi, xmm1, 10 - test sil, 1 - jne .LBB0_793 -.LBB0_330: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm1, 11 - test al, 1 - je .LBB0_331 -.LBB0_794: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 192] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 11 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r13d, xmm1, 12 - test r13b, 1 - jne .LBB0_795 -.LBB0_332: # in Loop: Header=BB0_26 Depth=1 - vpextrb r10d, xmm1, 13 - test r10b, 1 - je .LBB0_333 -.LBB0_796: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 176] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 13 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r11d, xmm1, 14 - test r11b, 1 - jne .LBB0_797 -.LBB0_334: # in Loop: Header=BB0_26 Depth=1 - vpextrb r14d, xmm1, 15 - test r14b, 1 - je .LBB0_336 -.LBB0_335: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rbx, qword ptr [rsp + 160] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rbx], 15 - vinserti128 ymm14, ymm14, xmm1, 1 -.LBB0_336: # in Loop: Header=BB0_26 Depth=1 - vpsrlw ymm1, ymm14, 3 - vpand ymm14, ymm1, 
ymmword ptr [rip + .LCPI0_4] - vmovd r15d, xmm6 - test r15b, 1 - je .LBB0_337 -# %bb.798: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm15 - vpextrb byte ptr [r8 + rbx], xmm14, 0 - vpextrb ebx, xmm6, 1 - test bl, 1 - jne .LBB0_799 -.LBB0_338: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 2 - test bl, 1 - mov r15, qword ptr [rsp + 224] # 8-byte Reload - je .LBB0_339 -.LBB0_800: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 2 - vpextrb ebx, xmm6, 3 - test bl, 1 - jne .LBB0_801 -.LBB0_340: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 4 - test bl, 1 - je .LBB0_341 -.LBB0_802: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm5 - vpextrb byte ptr [r8 + rbx], xmm14, 4 - vpextrb ebx, xmm6, 5 - test bl, 1 - jne .LBB0_803 -.LBB0_342: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 6 - test bl, 1 - je .LBB0_343 -.LBB0_804: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 6 - vpextrb ebx, xmm6, 7 - test bl, 1 - jne .LBB0_805 -.LBB0_344: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 8 - test bl, 1 - je .LBB0_345 -.LBB0_806: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm12 - vpextrb byte ptr [r8 + rbx], xmm14, 8 - vpextrb ebx, xmm6, 9 - test bl, 1 - jne .LBB0_807 -.LBB0_346: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 10 - test bl, 1 - je .LBB0_347 -.LBB0_808: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm12, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 10 - vpextrb ebx, xmm6, 11 - test bl, 1 - jne .LBB0_809 -.LBB0_348: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 12 - test bl, 1 - je .LBB0_349 -.LBB0_810: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm11 - vpextrb byte ptr [r8 + rbx], xmm14, 12 - vpextrb ebx, xmm6, 13 - test bl, 1 - jne .LBB0_811 -.LBB0_350: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 14 - test bl, 1 - je .LBB0_351 -.LBB0_812: # in 
Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 14 - vpextrb ebx, xmm6, 15 - test bl, 1 - jne .LBB0_813 -.LBB0_352: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 44], 1 # 1-byte Folded Reload - je .LBB0_353 -.LBB0_814: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm10 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 0 - test byte ptr [rsp + 40], 1 # 1-byte Folded Reload - jne .LBB0_815 -.LBB0_354: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 36], 1 # 1-byte Folded Reload - je .LBB0_355 -.LBB0_816: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vmovq rbx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 2 - test byte ptr [rsp + 32], 1 # 1-byte Folded Reload - jne .LBB0_817 -.LBB0_356: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 28], 1 # 1-byte Folded Reload - je .LBB0_357 -.LBB0_818: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm9 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 4 - test byte ptr [rsp + 24], 1 # 1-byte Folded Reload - jne .LBB0_819 -.LBB0_358: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 20], 1 # 1-byte Folded Reload - je .LBB0_359 -.LBB0_820: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vmovq rbx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 6 - test r9b, 1 - jne .LBB0_821 -.LBB0_360: # in Loop: Header=BB0_26 Depth=1 - test dl, 1 - mov rbx, qword ptr [rsp + 296] # 8-byte Reload - je .LBB0_361 -.LBB0_822: # in Loop: Header=BB0_26 Depth=1 - vmovq rdx, xmm8 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rdx], xmm1, 8 - test cl, 1 - jne .LBB0_823 -.LBB0_362: # in Loop: Header=BB0_26 Depth=1 - test sil, 1 - mov rdx, qword ptr [rsp + 304] # 8-byte Reload - je .LBB0_363 -.LBB0_824: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + 
rcx], xmm1, 10 - test al, 1 - mov rsi, qword ptr [rsp + 152] # 8-byte Reload - jne .LBB0_825 -.LBB0_364: # in Loop: Header=BB0_26 Depth=1 - test r13b, 1 - je .LBB0_365 -.LBB0_826: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm7 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 12 - test r10b, 1 - mov r13, qword ptr [rsp + 280] # 8-byte Reload - jne .LBB0_827 -.LBB0_366: # in Loop: Header=BB0_26 Depth=1 - test r11b, 1 - je .LBB0_367 -.LBB0_828: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 14 - test r14b, 1 - mov rax, qword ptr [rsp + 288] # 8-byte Reload - mov r9, qword ptr [rsp + 232] # 8-byte Reload - jne .LBB0_368 - jmp .LBB0_369 - .p2align 4, 0x90 -.LBB0_337: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 1 - test bl, 1 - je .LBB0_338 -.LBB0_799: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm15, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 1 - vpextrb ebx, xmm6, 2 - test bl, 1 - mov r15, qword ptr [rsp + 224] # 8-byte Reload - jne .LBB0_800 -.LBB0_339: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 3 - test bl, 1 - je .LBB0_340 -.LBB0_801: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 3 - vpextrb ebx, xmm6, 4 - test bl, 1 - jne .LBB0_802 -.LBB0_341: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 5 - test bl, 1 - je .LBB0_342 -.LBB0_803: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm5, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 5 - vpextrb ebx, xmm6, 6 - test bl, 1 - jne .LBB0_804 -.LBB0_343: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 7 - test bl, 1 - je .LBB0_344 -.LBB0_805: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 7 - vpextrb ebx, xmm6, 8 - test bl, 1 - jne .LBB0_806 -.LBB0_345: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 9 - test bl, 1 - je 
.LBB0_346 -.LBB0_807: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm12, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 9 - vpextrb ebx, xmm6, 10 - test bl, 1 - jne .LBB0_808 -.LBB0_347: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 11 - test bl, 1 - je .LBB0_348 -.LBB0_809: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm12, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 11 - vpextrb ebx, xmm6, 12 - test bl, 1 - jne .LBB0_810 -.LBB0_349: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 13 - test bl, 1 - je .LBB0_350 -.LBB0_811: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm11, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 13 - vpextrb ebx, xmm6, 14 - test bl, 1 - jne .LBB0_812 -.LBB0_351: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 15 - test bl, 1 - je .LBB0_352 -.LBB0_813: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 15 - test byte ptr [rsp + 44], 1 # 1-byte Folded Reload - jne .LBB0_814 -.LBB0_353: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 40], 1 # 1-byte Folded Reload - je .LBB0_354 -.LBB0_815: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm10, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 1 - test byte ptr [rsp + 36], 1 # 1-byte Folded Reload - jne .LBB0_816 -.LBB0_355: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 32], 1 # 1-byte Folded Reload - je .LBB0_356 -.LBB0_817: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vpextrq rbx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 3 - test byte ptr [rsp + 28], 1 # 1-byte Folded Reload - jne .LBB0_818 -.LBB0_357: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 24], 1 # 1-byte Folded Reload - je .LBB0_358 -.LBB0_819: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm9, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 5 - test byte ptr [rsp + 20], 1 # 1-byte Folded Reload - 
jne .LBB0_820 -.LBB0_359: # in Loop: Header=BB0_26 Depth=1 - test r9b, 1 - je .LBB0_360 -.LBB0_821: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vpextrq rbx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 7 - test dl, 1 - mov rbx, qword ptr [rsp + 296] # 8-byte Reload - jne .LBB0_822 -.LBB0_361: # in Loop: Header=BB0_26 Depth=1 - test cl, 1 - je .LBB0_362 -.LBB0_823: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm8, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 9 - test sil, 1 - mov rdx, qword ptr [rsp + 304] # 8-byte Reload - jne .LBB0_824 -.LBB0_363: # in Loop: Header=BB0_26 Depth=1 - test al, 1 - mov rsi, qword ptr [rsp + 152] # 8-byte Reload - je .LBB0_364 -.LBB0_825: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 11 - test r13b, 1 - jne .LBB0_826 -.LBB0_365: # in Loop: Header=BB0_26 Depth=1 - test r10b, 1 - mov r13, qword ptr [rsp + 280] # 8-byte Reload - je .LBB0_366 -.LBB0_827: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm7, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 13 - test r11b, 1 - jne .LBB0_828 -.LBB0_367: # in Loop: Header=BB0_26 Depth=1 - test r14b, 1 - mov rax, qword ptr [rsp + 288] # 8-byte Reload - mov r9, qword ptr [rsp + 232] # 8-byte Reload - je .LBB0_369 -.LBB0_368: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 15 -.LBB0_369: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 672] # 32-byte Reload - vpor ymm15, ymm1, ymmword ptr [rsp + 512] # 32-byte Folded Reload - vpor ymm5, ymm1, ymmword ptr [rsp + 480] # 32-byte Folded Reload - vpor ymm10, ymm1, ymmword ptr [rsp + 384] # 32-byte Folded Reload - vpor ymm9, ymm1, ymmword ptr [rsp + 352] # 32-byte Folded Reload - vpor ymm12, ymm1, ymmword ptr [rsp + 448] # 32-byte 
Folded Reload - vpor ymm11, ymm1, ymmword ptr [rsp + 416] # 32-byte Folded Reload - vpor ymm8, ymm1, ymmword ptr [rsp + 320] # 32-byte Folded Reload - vpor ymm7, ymm4, ymm1 - vperm2i128 ymm1, ymm8, ymm7, 49 # ymm1 = ymm8[2,3],ymm7[2,3] - vinserti128 ymm2, ymm8, xmm7, 1 - vshufps ymm1, ymm2, ymm1, 136 # ymm1 = ymm2[0,2],ymm1[0,2],ymm2[4,6],ymm1[4,6] - vperm2i128 ymm2, ymm12, ymm11, 49 # ymm2 = ymm12[2,3],ymm11[2,3] - vinserti128 ymm3, ymm12, xmm11, 1 - vshufps ymm2, ymm3, ymm2, 136 # ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6] - vperm2i128 ymm3, ymm10, ymm9, 49 # ymm3 = ymm10[2,3],ymm9[2,3] - vinserti128 ymm13, ymm10, xmm9, 1 - vshufps ymm3, ymm13, ymm3, 136 # ymm3 = ymm13[0,2],ymm3[0,2],ymm13[4,6],ymm3[4,6] - vperm2i128 ymm13, ymm15, ymm5, 49 # ymm13 = ymm15[2,3],ymm5[2,3] - vinserti128 ymm14, ymm15, xmm5, 1 - vshufps ymm13, ymm14, ymm13, 136 # ymm13 = ymm14[0,2],ymm13[0,2],ymm14[4,6],ymm13[4,6] - vpcmpgtd ymm13, ymm0, ymm13 - vpcmpgtd ymm3, ymm0, ymm3 - vpackssdw ymm3, ymm13, ymm3 - vpcmpgtd ymm2, ymm0, ymm2 - vpcmpgtd ymm1, ymm0, ymm1 - vpackssdw ymm1, ymm2, ymm1 - vpermq ymm2, ymm3, 216 # ymm2 = ymm3[0,2,1,3] - vpermq ymm1, ymm1, 216 # ymm1 = ymm1[0,2,1,3] - vpacksswb ymm1, ymm2, ymm1 - vpand ymm6, ymm1, ymm6 - vmovd ecx, xmm6 - # implicit-def: $ymm14 - test cl, 1 - je .LBB0_370 -# %bb.829: # in Loop: Header=BB0_26 Depth=1 - vpbroadcastb ymm14, byte ptr [rdi + rdx] - vpextrb ecx, xmm6, 1 - test cl, 1 - jne .LBB0_830 -.LBB0_371: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 104] # 8-byte Reload - vpextrb ecx, xmm6, 2 - test cl, 1 - je .LBB0_373 -.LBB0_372: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rbx], 2 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_373: # in Loop: Header=BB0_26 Depth=1 - mov rsi, qword ptr [rsp + 96] # 8-byte Reload - mov r10, qword ptr [rsp + 72] # 8-byte Reload - vpextrb ecx, xmm6, 3 - test cl, 1 - je .LBB0_374 -# %bb.831: # in Loop: Header=BB0_26 Depth=1 - mov 
rcx, qword ptr [rsp + 272] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rcx], 3 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 4 - test cl, 1 - jne .LBB0_832 -.LBB0_375: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 5 - test cl, 1 - je .LBB0_376 -.LBB0_833: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rdx], 5 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 6 - test cl, 1 - jne .LBB0_834 -.LBB0_377: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 7 - test cl, 1 - je .LBB0_378 -.LBB0_835: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r9], 7 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 8 - test cl, 1 - jne .LBB0_836 -.LBB0_379: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 88] # 8-byte Reload - vpextrb ecx, xmm6, 9 - test cl, 1 - je .LBB0_381 -.LBB0_380: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r15], 9 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_381: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 144] # 8-byte Reload - mov rsi, qword ptr [rsp + 136] # 8-byte Reload - mov rbx, qword ptr [rsp + 128] # 8-byte Reload - mov r9, qword ptr [rsp + 120] # 8-byte Reload - vpextrb ecx, xmm6, 10 - test cl, 1 - je .LBB0_382 -# %bb.837: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 10 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 11 - test cl, 1 - jne .LBB0_838 -.LBB0_383: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 12 - test cl, 1 - je .LBB0_384 -.LBB0_839: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 248] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 12 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 13 - 
test cl, 1 - jne .LBB0_385 - jmp .LBB0_386 - .p2align 4, 0x90 -.LBB0_370: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 1 - test cl, 1 - je .LBB0_371 -.LBB0_830: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rsi], 1 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - mov rdx, qword ptr [rsp + 104] # 8-byte Reload - vpextrb ecx, xmm6, 2 - test cl, 1 - jne .LBB0_372 - jmp .LBB0_373 - .p2align 4, 0x90 -.LBB0_374: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 4 - test cl, 1 - je .LBB0_375 -.LBB0_832: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 264] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rcx], 4 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 5 - test cl, 1 - jne .LBB0_833 -.LBB0_376: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 6 - test cl, 1 - je .LBB0_377 -.LBB0_834: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 6 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 7 - test cl, 1 - jne .LBB0_835 -.LBB0_378: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 8 - test cl, 1 - je .LBB0_379 -.LBB0_836: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rsi], 8 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - mov rdx, qword ptr [rsp + 88] # 8-byte Reload - vpextrb ecx, xmm6, 9 - test cl, 1 - jne .LBB0_380 - jmp .LBB0_381 - .p2align 4, 0x90 -.LBB0_382: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 11 - test cl, 1 - je .LBB0_383 -.LBB0_838: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 256] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 11 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 12 - test cl, 1 - jne .LBB0_839 -.LBB0_384: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 13 - test cl, 1 - je .LBB0_386 
-.LBB0_385: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rdx], 13 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_386: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 80] # 8-byte Reload - mov rdx, qword ptr [rsp + 64] # 8-byte Reload - vpextrb ecx, xmm6, 14 - test cl, 1 - je .LBB0_388 -# %bb.387: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 14 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_388: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 15 - test cl, 1 - je .LBB0_390 -# %bb.389: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r10], 15 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_390: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm6, 1 - vmovd eax, xmm1 - mov dword ptr [rsp + 44], eax # 4-byte Spill - test al, 1 - je .LBB0_392 -# %bb.391: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rdx], 0 - vinserti128 ymm14, ymm14, xmm2, 1 -.LBB0_392: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 56] # 8-byte Reload - vpextrb ecx, xmm1, 1 - mov dword ptr [rsp + 40], ecx # 4-byte Spill - test cl, 1 - je .LBB0_393 -# %bb.840: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rsi], 1 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 2 - mov dword ptr [rsp + 36], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_841 -.LBB0_394: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 3 - mov dword ptr [rsp + 32], ecx # 4-byte Spill - test cl, 1 - je .LBB0_395 -.LBB0_842: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + r9], 3 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 4 - mov dword ptr [rsp + 28], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_843 -.LBB0_396: # in Loop: Header=BB0_26 
Depth=1 - vpextrb eax, xmm1, 5 - mov dword ptr [rsp + 24], eax # 4-byte Spill - test al, 1 - je .LBB0_398 -.LBB0_397: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + r13], 5 - vinserti128 ymm14, ymm14, xmm2, 1 -.LBB0_398: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 112] # 8-byte Reload - vpextrb ecx, xmm1, 6 - mov dword ptr [rsp + 20], ecx # 4-byte Spill - test cl, 1 - je .LBB0_399 -# %bb.844: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 6 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r9d, xmm1, 7 - test r9b, 1 - jne .LBB0_845 -.LBB0_400: # in Loop: Header=BB0_26 Depth=1 - vpextrb edx, xmm1, 8 - test dl, 1 - je .LBB0_401 -.LBB0_846: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 216] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 8 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 9 - test cl, 1 - jne .LBB0_847 -.LBB0_402: # in Loop: Header=BB0_26 Depth=1 - vpextrb esi, xmm1, 10 - test sil, 1 - je .LBB0_403 -.LBB0_848: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 200] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 10 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb eax, xmm1, 11 - test al, 1 - jne .LBB0_849 -.LBB0_404: # in Loop: Header=BB0_26 Depth=1 - vpextrb r13d, xmm1, 12 - test r13b, 1 - je .LBB0_405 -.LBB0_850: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 184] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 12 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r10d, xmm1, 13 - test r10b, 1 - jne .LBB0_851 -.LBB0_406: # in Loop: Header=BB0_26 Depth=1 - vpextrb r11d, xmm1, 14 - test r11b, 1 - je .LBB0_407 -.LBB0_852: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 168] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi 
+ rbx], 14 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r14d, xmm1, 15 - test r14b, 1 - jne .LBB0_408 - jmp .LBB0_409 - .p2align 4, 0x90 -.LBB0_393: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 2 - mov dword ptr [rsp + 36], ecx # 4-byte Spill - test cl, 1 - je .LBB0_394 -.LBB0_841: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 2 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 3 - mov dword ptr [rsp + 32], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_842 -.LBB0_395: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 4 - mov dword ptr [rsp + 28], ecx # 4-byte Spill - test cl, 1 - je .LBB0_396 -.LBB0_843: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 4 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb eax, xmm1, 5 - mov dword ptr [rsp + 24], eax # 4-byte Spill - test al, 1 - jne .LBB0_397 - jmp .LBB0_398 - .p2align 4, 0x90 -.LBB0_399: # in Loop: Header=BB0_26 Depth=1 - vpextrb r9d, xmm1, 7 - test r9b, 1 - je .LBB0_400 -.LBB0_845: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 240] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 7 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb edx, xmm1, 8 - test dl, 1 - jne .LBB0_846 -.LBB0_401: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 9 - test cl, 1 - je .LBB0_402 -.LBB0_847: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 208] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 9 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb esi, xmm1, 10 - test sil, 1 - jne .LBB0_848 -.LBB0_403: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm1, 11 - test al, 1 - je .LBB0_404 -.LBB0_849: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 192] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 11 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb 
r13d, xmm1, 12 - test r13b, 1 - jne .LBB0_850 -.LBB0_405: # in Loop: Header=BB0_26 Depth=1 - vpextrb r10d, xmm1, 13 - test r10b, 1 - je .LBB0_406 -.LBB0_851: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 176] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 13 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r11d, xmm1, 14 - test r11b, 1 - jne .LBB0_852 -.LBB0_407: # in Loop: Header=BB0_26 Depth=1 - vpextrb r14d, xmm1, 15 - test r14b, 1 - je .LBB0_409 -.LBB0_408: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rbx, qword ptr [rsp + 160] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rbx], 15 - vinserti128 ymm14, ymm14, xmm1, 1 -.LBB0_409: # in Loop: Header=BB0_26 Depth=1 - vpsrlw ymm1, ymm14, 4 - vpand ymm14, ymm1, ymmword ptr [rip + .LCPI0_4] - vmovd r15d, xmm6 - test r15b, 1 - je .LBB0_410 -# %bb.853: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm15 - vpextrb byte ptr [r8 + rbx], xmm14, 0 - vpextrb ebx, xmm6, 1 - test bl, 1 - jne .LBB0_854 -.LBB0_411: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 2 - test bl, 1 - mov r15, qword ptr [rsp + 224] # 8-byte Reload - je .LBB0_412 -.LBB0_855: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 2 - vpextrb ebx, xmm6, 3 - test bl, 1 - jne .LBB0_856 -.LBB0_413: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 4 - test bl, 1 - je .LBB0_414 -.LBB0_857: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm5 - vpextrb byte ptr [r8 + rbx], xmm14, 4 - vpextrb ebx, xmm6, 5 - test bl, 1 - jne .LBB0_858 -.LBB0_415: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 6 - test bl, 1 - je .LBB0_416 -.LBB0_859: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 6 - vpextrb ebx, xmm6, 7 - test bl, 1 - jne .LBB0_860 -.LBB0_417: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 8 - test bl, 1 - je .LBB0_418 
-.LBB0_861: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm12 - vpextrb byte ptr [r8 + rbx], xmm14, 8 - vpextrb ebx, xmm6, 9 - test bl, 1 - jne .LBB0_862 -.LBB0_419: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 10 - test bl, 1 - je .LBB0_420 -.LBB0_863: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm12, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 10 - vpextrb ebx, xmm6, 11 - test bl, 1 - jne .LBB0_864 -.LBB0_421: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 12 - test bl, 1 - je .LBB0_422 -.LBB0_865: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm11 - vpextrb byte ptr [r8 + rbx], xmm14, 12 - vpextrb ebx, xmm6, 13 - test bl, 1 - jne .LBB0_866 -.LBB0_423: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 14 - test bl, 1 - je .LBB0_424 -.LBB0_867: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 14 - vpextrb ebx, xmm6, 15 - test bl, 1 - jne .LBB0_868 -.LBB0_425: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 44], 1 # 1-byte Folded Reload - je .LBB0_426 -.LBB0_869: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm10 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 0 - test byte ptr [rsp + 40], 1 # 1-byte Folded Reload - jne .LBB0_870 -.LBB0_427: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 36], 1 # 1-byte Folded Reload - je .LBB0_428 -.LBB0_871: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vmovq rbx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 2 - test byte ptr [rsp + 32], 1 # 1-byte Folded Reload - jne .LBB0_872 -.LBB0_429: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 28], 1 # 1-byte Folded Reload - je .LBB0_430 -.LBB0_873: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm9 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 4 - test byte ptr [rsp + 24], 1 # 1-byte Folded Reload - jne .LBB0_874 -.LBB0_431: # in Loop: Header=BB0_26 Depth=1 - 
test byte ptr [rsp + 20], 1 # 1-byte Folded Reload - je .LBB0_432 -.LBB0_875: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vmovq rbx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 6 - test r9b, 1 - jne .LBB0_876 -.LBB0_433: # in Loop: Header=BB0_26 Depth=1 - test dl, 1 - mov rbx, qword ptr [rsp + 296] # 8-byte Reload - je .LBB0_434 -.LBB0_877: # in Loop: Header=BB0_26 Depth=1 - vmovq rdx, xmm8 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rdx], xmm1, 8 - test cl, 1 - jne .LBB0_878 -.LBB0_435: # in Loop: Header=BB0_26 Depth=1 - test sil, 1 - mov rdx, qword ptr [rsp + 304] # 8-byte Reload - je .LBB0_436 -.LBB0_879: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 10 - test al, 1 - mov rsi, qword ptr [rsp + 152] # 8-byte Reload - jne .LBB0_880 -.LBB0_437: # in Loop: Header=BB0_26 Depth=1 - test r13b, 1 - je .LBB0_438 -.LBB0_881: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm7 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 12 - test r10b, 1 - mov r13, qword ptr [rsp + 280] # 8-byte Reload - jne .LBB0_882 -.LBB0_439: # in Loop: Header=BB0_26 Depth=1 - test r11b, 1 - je .LBB0_440 -.LBB0_883: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 14 - test r14b, 1 - mov rax, qword ptr [rsp + 288] # 8-byte Reload - mov r9, qword ptr [rsp + 232] # 8-byte Reload - jne .LBB0_441 - jmp .LBB0_442 - .p2align 4, 0x90 -.LBB0_410: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 1 - test bl, 1 - je .LBB0_411 -.LBB0_854: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm15, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 1 - vpextrb ebx, xmm6, 2 - test bl, 1 - mov r15, qword ptr [rsp + 224] # 8-byte Reload - jne .LBB0_855 -.LBB0_412: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 3 - test bl, 1 - je .LBB0_413 
-.LBB0_856: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 3 - vpextrb ebx, xmm6, 4 - test bl, 1 - jne .LBB0_857 -.LBB0_414: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 5 - test bl, 1 - je .LBB0_415 -.LBB0_858: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm5, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 5 - vpextrb ebx, xmm6, 6 - test bl, 1 - jne .LBB0_859 -.LBB0_416: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 7 - test bl, 1 - je .LBB0_417 -.LBB0_860: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 7 - vpextrb ebx, xmm6, 8 - test bl, 1 - jne .LBB0_861 -.LBB0_418: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 9 - test bl, 1 - je .LBB0_419 -.LBB0_862: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm12, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 9 - vpextrb ebx, xmm6, 10 - test bl, 1 - jne .LBB0_863 -.LBB0_420: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 11 - test bl, 1 - je .LBB0_421 -.LBB0_864: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm12, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 11 - vpextrb ebx, xmm6, 12 - test bl, 1 - jne .LBB0_865 -.LBB0_422: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 13 - test bl, 1 - je .LBB0_423 -.LBB0_866: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm11, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 13 - vpextrb ebx, xmm6, 14 - test bl, 1 - jne .LBB0_867 -.LBB0_424: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 15 - test bl, 1 - je .LBB0_425 -.LBB0_868: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 15 - test byte ptr [rsp + 44], 1 # 1-byte Folded Reload - jne .LBB0_869 -.LBB0_426: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 40], 1 # 1-byte Folded Reload - je .LBB0_427 -.LBB0_870: # in Loop: Header=BB0_26 
Depth=1 - vpextrq rbx, xmm10, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 1 - test byte ptr [rsp + 36], 1 # 1-byte Folded Reload - jne .LBB0_871 -.LBB0_428: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 32], 1 # 1-byte Folded Reload - je .LBB0_429 -.LBB0_872: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vpextrq rbx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 3 - test byte ptr [rsp + 28], 1 # 1-byte Folded Reload - jne .LBB0_873 -.LBB0_430: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 24], 1 # 1-byte Folded Reload - je .LBB0_431 -.LBB0_874: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm9, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 5 - test byte ptr [rsp + 20], 1 # 1-byte Folded Reload - jne .LBB0_875 -.LBB0_432: # in Loop: Header=BB0_26 Depth=1 - test r9b, 1 - je .LBB0_433 -.LBB0_876: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vpextrq rbx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 7 - test dl, 1 - mov rbx, qword ptr [rsp + 296] # 8-byte Reload - jne .LBB0_877 -.LBB0_434: # in Loop: Header=BB0_26 Depth=1 - test cl, 1 - je .LBB0_435 -.LBB0_878: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm8, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 9 - test sil, 1 - mov rdx, qword ptr [rsp + 304] # 8-byte Reload - jne .LBB0_879 -.LBB0_436: # in Loop: Header=BB0_26 Depth=1 - test al, 1 - mov rsi, qword ptr [rsp + 152] # 8-byte Reload - je .LBB0_437 -.LBB0_880: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 11 - test r13b, 1 - jne .LBB0_881 -.LBB0_438: # in Loop: Header=BB0_26 Depth=1 - test r10b, 1 - mov r13, qword ptr [rsp + 280] # 8-byte Reload - je .LBB0_439 -.LBB0_882: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm7, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb 
byte ptr [r8 + rcx], xmm1, 13 - test r11b, 1 - jne .LBB0_883 -.LBB0_440: # in Loop: Header=BB0_26 Depth=1 - test r14b, 1 - mov rax, qword ptr [rsp + 288] # 8-byte Reload - mov r9, qword ptr [rsp + 232] # 8-byte Reload - je .LBB0_442 -.LBB0_441: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 15 -.LBB0_442: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 640] # 32-byte Reload - vpor ymm15, ymm1, ymmword ptr [rsp + 512] # 32-byte Folded Reload - vpor ymm5, ymm1, ymmword ptr [rsp + 480] # 32-byte Folded Reload - vpor ymm10, ymm1, ymmword ptr [rsp + 384] # 32-byte Folded Reload - vpor ymm9, ymm1, ymmword ptr [rsp + 352] # 32-byte Folded Reload - vpor ymm12, ymm1, ymmword ptr [rsp + 448] # 32-byte Folded Reload - vpor ymm11, ymm1, ymmword ptr [rsp + 416] # 32-byte Folded Reload - vpor ymm8, ymm1, ymmword ptr [rsp + 320] # 32-byte Folded Reload - vpor ymm7, ymm4, ymm1 - vperm2i128 ymm1, ymm8, ymm7, 49 # ymm1 = ymm8[2,3],ymm7[2,3] - vinserti128 ymm2, ymm8, xmm7, 1 - vshufps ymm1, ymm2, ymm1, 136 # ymm1 = ymm2[0,2],ymm1[0,2],ymm2[4,6],ymm1[4,6] - vperm2i128 ymm2, ymm12, ymm11, 49 # ymm2 = ymm12[2,3],ymm11[2,3] - vinserti128 ymm3, ymm12, xmm11, 1 - vshufps ymm2, ymm3, ymm2, 136 # ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6] - vperm2i128 ymm3, ymm10, ymm9, 49 # ymm3 = ymm10[2,3],ymm9[2,3] - vinserti128 ymm13, ymm10, xmm9, 1 - vshufps ymm3, ymm13, ymm3, 136 # ymm3 = ymm13[0,2],ymm3[0,2],ymm13[4,6],ymm3[4,6] - vperm2i128 ymm13, ymm15, ymm5, 49 # ymm13 = ymm15[2,3],ymm5[2,3] - vinserti128 ymm14, ymm15, xmm5, 1 - vshufps ymm13, ymm14, ymm13, 136 # ymm13 = ymm14[0,2],ymm13[0,2],ymm14[4,6],ymm13[4,6] - vpcmpgtd ymm13, ymm0, ymm13 - vpcmpgtd ymm3, ymm0, ymm3 - vpackssdw ymm3, ymm13, ymm3 - vpcmpgtd ymm2, ymm0, ymm2 - vpcmpgtd ymm1, ymm0, ymm1 - vpackssdw ymm1, ymm2, ymm1 - vpermq ymm2, ymm3, 216 # ymm2 = ymm3[0,2,1,3] - vpermq ymm1, ymm1, 216 # ymm1 = 
ymm1[0,2,1,3] - vpacksswb ymm1, ymm2, ymm1 - vpand ymm6, ymm1, ymm6 - vmovd ecx, xmm6 - # implicit-def: $ymm14 - test cl, 1 - je .LBB0_443 -# %bb.884: # in Loop: Header=BB0_26 Depth=1 - vpbroadcastb ymm14, byte ptr [rdi + rdx] - vpextrb ecx, xmm6, 1 - test cl, 1 - jne .LBB0_885 -.LBB0_444: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 104] # 8-byte Reload - vpextrb ecx, xmm6, 2 - test cl, 1 - je .LBB0_446 -.LBB0_445: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rbx], 2 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_446: # in Loop: Header=BB0_26 Depth=1 - mov rsi, qword ptr [rsp + 96] # 8-byte Reload - mov r10, qword ptr [rsp + 72] # 8-byte Reload - vpextrb ecx, xmm6, 3 - test cl, 1 - je .LBB0_447 -# %bb.886: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 272] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rcx], 3 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 4 - test cl, 1 - jne .LBB0_887 -.LBB0_448: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 5 - test cl, 1 - je .LBB0_449 -.LBB0_888: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rdx], 5 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 6 - test cl, 1 - jne .LBB0_889 -.LBB0_450: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 7 - test cl, 1 - je .LBB0_451 -.LBB0_890: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r9], 7 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 8 - test cl, 1 - jne .LBB0_891 -.LBB0_452: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 88] # 8-byte Reload - vpextrb ecx, xmm6, 9 - test cl, 1 - je .LBB0_454 -.LBB0_453: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r15], 9 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_454: # in 
Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 144] # 8-byte Reload - mov rsi, qword ptr [rsp + 136] # 8-byte Reload - mov rbx, qword ptr [rsp + 128] # 8-byte Reload - mov r9, qword ptr [rsp + 120] # 8-byte Reload - vpextrb ecx, xmm6, 10 - test cl, 1 - je .LBB0_455 -# %bb.892: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 10 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 11 - test cl, 1 - jne .LBB0_893 -.LBB0_456: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 12 - test cl, 1 - je .LBB0_457 -.LBB0_894: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 248] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 12 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 13 - test cl, 1 - jne .LBB0_458 - jmp .LBB0_459 - .p2align 4, 0x90 -.LBB0_443: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 1 - test cl, 1 - je .LBB0_444 -.LBB0_885: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rsi], 1 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - mov rdx, qword ptr [rsp + 104] # 8-byte Reload - vpextrb ecx, xmm6, 2 - test cl, 1 - jne .LBB0_445 - jmp .LBB0_446 - .p2align 4, 0x90 -.LBB0_447: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 4 - test cl, 1 - je .LBB0_448 -.LBB0_887: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 264] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rcx], 4 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 5 - test cl, 1 - jne .LBB0_888 -.LBB0_449: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 6 - test cl, 1 - je .LBB0_450 -.LBB0_889: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 6 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 7 - test cl, 1 - jne .LBB0_890 -.LBB0_451: # in Loop: 
Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 8 - test cl, 1 - je .LBB0_452 -.LBB0_891: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rsi], 8 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - mov rdx, qword ptr [rsp + 88] # 8-byte Reload - vpextrb ecx, xmm6, 9 - test cl, 1 - jne .LBB0_453 - jmp .LBB0_454 - .p2align 4, 0x90 -.LBB0_455: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 11 - test cl, 1 - je .LBB0_456 -.LBB0_893: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 256] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 11 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 12 - test cl, 1 - jne .LBB0_894 -.LBB0_457: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 13 - test cl, 1 - je .LBB0_459 -.LBB0_458: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rdx], 13 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_459: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 80] # 8-byte Reload - mov rdx, qword ptr [rsp + 64] # 8-byte Reload - vpextrb ecx, xmm6, 14 - test cl, 1 - je .LBB0_461 -# %bb.460: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 14 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_461: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 15 - test cl, 1 - je .LBB0_463 -# %bb.462: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r10], 15 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_463: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm6, 1 - vmovd eax, xmm1 - mov dword ptr [rsp + 44], eax # 4-byte Spill - test al, 1 - je .LBB0_465 -# %bb.464: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rdx], 0 - vinserti128 ymm14, ymm14, xmm2, 1 -.LBB0_465: # in Loop: Header=BB0_26 Depth=1 - mov 
rax, qword ptr [rsp + 56] # 8-byte Reload - vpextrb ecx, xmm1, 1 - mov dword ptr [rsp + 40], ecx # 4-byte Spill - test cl, 1 - je .LBB0_466 -# %bb.895: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rsi], 1 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 2 - mov dword ptr [rsp + 36], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_896 -.LBB0_467: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 3 - mov dword ptr [rsp + 32], ecx # 4-byte Spill - test cl, 1 - je .LBB0_468 -.LBB0_897: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + r9], 3 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 4 - mov dword ptr [rsp + 28], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_898 -.LBB0_469: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm1, 5 - mov dword ptr [rsp + 24], eax # 4-byte Spill - test al, 1 - je .LBB0_471 -.LBB0_470: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + r13], 5 - vinserti128 ymm14, ymm14, xmm2, 1 -.LBB0_471: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 112] # 8-byte Reload - vpextrb ecx, xmm1, 6 - mov dword ptr [rsp + 20], ecx # 4-byte Spill - test cl, 1 - je .LBB0_472 -# %bb.899: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 6 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r9d, xmm1, 7 - test r9b, 1 - jne .LBB0_900 -.LBB0_473: # in Loop: Header=BB0_26 Depth=1 - vpextrb edx, xmm1, 8 - test dl, 1 - je .LBB0_474 -.LBB0_901: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 216] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 8 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 9 - test cl, 1 - jne .LBB0_902 -.LBB0_475: # in Loop: Header=BB0_26 Depth=1 - vpextrb esi, xmm1, 10 - test sil, 1 - je .LBB0_476 -.LBB0_903: # in Loop: Header=BB0_26 Depth=1 - 
vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 200] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 10 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb eax, xmm1, 11 - test al, 1 - jne .LBB0_904 -.LBB0_477: # in Loop: Header=BB0_26 Depth=1 - vpextrb r13d, xmm1, 12 - test r13b, 1 - je .LBB0_478 -.LBB0_905: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 184] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 12 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r10d, xmm1, 13 - test r10b, 1 - jne .LBB0_906 -.LBB0_479: # in Loop: Header=BB0_26 Depth=1 - vpextrb r11d, xmm1, 14 - test r11b, 1 - je .LBB0_480 -.LBB0_907: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 168] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 14 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r14d, xmm1, 15 - test r14b, 1 - jne .LBB0_481 - jmp .LBB0_482 - .p2align 4, 0x90 -.LBB0_466: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 2 - mov dword ptr [rsp + 36], ecx # 4-byte Spill - test cl, 1 - je .LBB0_467 -.LBB0_896: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 2 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 3 - mov dword ptr [rsp + 32], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_897 -.LBB0_468: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 4 - mov dword ptr [rsp + 28], ecx # 4-byte Spill - test cl, 1 - je .LBB0_469 -.LBB0_898: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 4 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb eax, xmm1, 5 - mov dword ptr [rsp + 24], eax # 4-byte Spill - test al, 1 - jne .LBB0_470 - jmp .LBB0_471 - .p2align 4, 0x90 -.LBB0_472: # in Loop: Header=BB0_26 Depth=1 - vpextrb r9d, xmm1, 7 - test r9b, 1 - je .LBB0_473 -.LBB0_900: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr 
[rsp + 240] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 7 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb edx, xmm1, 8 - test dl, 1 - jne .LBB0_901 -.LBB0_474: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 9 - test cl, 1 - je .LBB0_475 -.LBB0_902: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 208] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 9 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb esi, xmm1, 10 - test sil, 1 - jne .LBB0_903 -.LBB0_476: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm1, 11 - test al, 1 - je .LBB0_477 -.LBB0_904: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 192] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 11 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r13d, xmm1, 12 - test r13b, 1 - jne .LBB0_905 -.LBB0_478: # in Loop: Header=BB0_26 Depth=1 - vpextrb r10d, xmm1, 13 - test r10b, 1 - je .LBB0_479 -.LBB0_906: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 176] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 13 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r11d, xmm1, 14 - test r11b, 1 - jne .LBB0_907 -.LBB0_480: # in Loop: Header=BB0_26 Depth=1 - vpextrb r14d, xmm1, 15 - test r14b, 1 - je .LBB0_482 -.LBB0_481: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rbx, qword ptr [rsp + 160] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rbx], 15 - vinserti128 ymm14, ymm14, xmm1, 1 -.LBB0_482: # in Loop: Header=BB0_26 Depth=1 - vpsrlw ymm1, ymm14, 5 - vpand ymm14, ymm1, ymmword ptr [rip + .LCPI0_4] - vmovd r15d, xmm6 - test r15b, 1 - je .LBB0_483 -# %bb.908: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm15 - vpextrb byte ptr [r8 + rbx], xmm14, 0 - vpextrb ebx, xmm6, 1 - test bl, 1 - jne .LBB0_909 -.LBB0_484: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 2 - test bl, 1 - mov r15, qword ptr [rsp + 224] # 8-byte 
Reload - je .LBB0_485 -.LBB0_910: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 2 - vpextrb ebx, xmm6, 3 - test bl, 1 - jne .LBB0_911 -.LBB0_486: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 4 - test bl, 1 - je .LBB0_487 -.LBB0_912: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm5 - vpextrb byte ptr [r8 + rbx], xmm14, 4 - vpextrb ebx, xmm6, 5 - test bl, 1 - jne .LBB0_913 -.LBB0_488: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 6 - test bl, 1 - je .LBB0_489 -.LBB0_914: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 6 - vpextrb ebx, xmm6, 7 - test bl, 1 - jne .LBB0_915 -.LBB0_490: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 8 - test bl, 1 - je .LBB0_491 -.LBB0_916: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm12 - vpextrb byte ptr [r8 + rbx], xmm14, 8 - vpextrb ebx, xmm6, 9 - test bl, 1 - jne .LBB0_917 -.LBB0_492: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 10 - test bl, 1 - je .LBB0_493 -.LBB0_918: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm12, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 10 - vpextrb ebx, xmm6, 11 - test bl, 1 - jne .LBB0_919 -.LBB0_494: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 12 - test bl, 1 - je .LBB0_495 -.LBB0_920: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm11 - vpextrb byte ptr [r8 + rbx], xmm14, 12 - vpextrb ebx, xmm6, 13 - test bl, 1 - jne .LBB0_921 -.LBB0_496: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 14 - test bl, 1 - je .LBB0_497 -.LBB0_922: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 14 - vpextrb ebx, xmm6, 15 - test bl, 1 - jne .LBB0_923 -.LBB0_498: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 44], 1 # 1-byte Folded Reload - je .LBB0_499 -.LBB0_924: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm10 - 
vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 0 - test byte ptr [rsp + 40], 1 # 1-byte Folded Reload - jne .LBB0_925 -.LBB0_500: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 36], 1 # 1-byte Folded Reload - je .LBB0_501 -.LBB0_926: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vmovq rbx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 2 - test byte ptr [rsp + 32], 1 # 1-byte Folded Reload - jne .LBB0_927 -.LBB0_502: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 28], 1 # 1-byte Folded Reload - je .LBB0_503 -.LBB0_928: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm9 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 4 - test byte ptr [rsp + 24], 1 # 1-byte Folded Reload - jne .LBB0_929 -.LBB0_504: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 20], 1 # 1-byte Folded Reload - je .LBB0_505 -.LBB0_930: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vmovq rbx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 6 - test r9b, 1 - jne .LBB0_931 -.LBB0_506: # in Loop: Header=BB0_26 Depth=1 - test dl, 1 - mov rbx, qword ptr [rsp + 296] # 8-byte Reload - je .LBB0_507 -.LBB0_932: # in Loop: Header=BB0_26 Depth=1 - vmovq rdx, xmm8 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rdx], xmm1, 8 - test cl, 1 - jne .LBB0_933 -.LBB0_508: # in Loop: Header=BB0_26 Depth=1 - test sil, 1 - mov rdx, qword ptr [rsp + 304] # 8-byte Reload - je .LBB0_509 -.LBB0_934: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 10 - test al, 1 - mov rsi, qword ptr [rsp + 152] # 8-byte Reload - jne .LBB0_935 -.LBB0_510: # in Loop: Header=BB0_26 Depth=1 - test r13b, 1 - je .LBB0_511 -.LBB0_936: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm7 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 12 - test r10b, 1 - mov r13, qword ptr [rsp + 280] 
# 8-byte Reload - jne .LBB0_937 -.LBB0_512: # in Loop: Header=BB0_26 Depth=1 - test r11b, 1 - je .LBB0_513 -.LBB0_938: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 14 - test r14b, 1 - mov rax, qword ptr [rsp + 288] # 8-byte Reload - mov r9, qword ptr [rsp + 232] # 8-byte Reload - jne .LBB0_514 - jmp .LBB0_515 - .p2align 4, 0x90 -.LBB0_483: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 1 - test bl, 1 - je .LBB0_484 -.LBB0_909: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm15, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 1 - vpextrb ebx, xmm6, 2 - test bl, 1 - mov r15, qword ptr [rsp + 224] # 8-byte Reload - jne .LBB0_910 -.LBB0_485: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 3 - test bl, 1 - je .LBB0_486 -.LBB0_911: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 3 - vpextrb ebx, xmm6, 4 - test bl, 1 - jne .LBB0_912 -.LBB0_487: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 5 - test bl, 1 - je .LBB0_488 -.LBB0_913: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm5, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 5 - vpextrb ebx, xmm6, 6 - test bl, 1 - jne .LBB0_914 -.LBB0_489: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 7 - test bl, 1 - je .LBB0_490 -.LBB0_915: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 7 - vpextrb ebx, xmm6, 8 - test bl, 1 - jne .LBB0_916 -.LBB0_491: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 9 - test bl, 1 - je .LBB0_492 -.LBB0_917: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm12, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 9 - vpextrb ebx, xmm6, 10 - test bl, 1 - jne .LBB0_918 -.LBB0_493: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 11 - test bl, 1 - je .LBB0_494 -.LBB0_919: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm12, 1 - 
vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 11 - vpextrb ebx, xmm6, 12 - test bl, 1 - jne .LBB0_920 -.LBB0_495: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 13 - test bl, 1 - je .LBB0_496 -.LBB0_921: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm11, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 13 - vpextrb ebx, xmm6, 14 - test bl, 1 - jne .LBB0_922 -.LBB0_497: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 15 - test bl, 1 - je .LBB0_498 -.LBB0_923: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 15 - test byte ptr [rsp + 44], 1 # 1-byte Folded Reload - jne .LBB0_924 -.LBB0_499: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 40], 1 # 1-byte Folded Reload - je .LBB0_500 -.LBB0_925: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm10, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 1 - test byte ptr [rsp + 36], 1 # 1-byte Folded Reload - jne .LBB0_926 -.LBB0_501: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 32], 1 # 1-byte Folded Reload - je .LBB0_502 -.LBB0_927: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vpextrq rbx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 3 - test byte ptr [rsp + 28], 1 # 1-byte Folded Reload - jne .LBB0_928 -.LBB0_503: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 24], 1 # 1-byte Folded Reload - je .LBB0_504 -.LBB0_929: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm9, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 5 - test byte ptr [rsp + 20], 1 # 1-byte Folded Reload - jne .LBB0_930 -.LBB0_505: # in Loop: Header=BB0_26 Depth=1 - test r9b, 1 - je .LBB0_506 -.LBB0_931: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vpextrq rbx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 7 - test dl, 1 - mov rbx, qword ptr [rsp + 296] # 8-byte Reload - jne .LBB0_932 -.LBB0_507: # 
in Loop: Header=BB0_26 Depth=1 - test cl, 1 - je .LBB0_508 -.LBB0_933: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm8, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 9 - test sil, 1 - mov rdx, qword ptr [rsp + 304] # 8-byte Reload - jne .LBB0_934 -.LBB0_509: # in Loop: Header=BB0_26 Depth=1 - test al, 1 - mov rsi, qword ptr [rsp + 152] # 8-byte Reload - je .LBB0_510 -.LBB0_935: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 11 - test r13b, 1 - jne .LBB0_936 -.LBB0_511: # in Loop: Header=BB0_26 Depth=1 - test r10b, 1 - mov r13, qword ptr [rsp + 280] # 8-byte Reload - je .LBB0_512 -.LBB0_937: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm7, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 13 - test r11b, 1 - jne .LBB0_938 -.LBB0_513: # in Loop: Header=BB0_26 Depth=1 - test r14b, 1 - mov rax, qword ptr [rsp + 288] # 8-byte Reload - mov r9, qword ptr [rsp + 232] # 8-byte Reload - je .LBB0_515 -.LBB0_514: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 15 -.LBB0_515: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 608] # 32-byte Reload - vpor ymm15, ymm1, ymmword ptr [rsp + 512] # 32-byte Folded Reload - vpor ymm5, ymm1, ymmword ptr [rsp + 480] # 32-byte Folded Reload - vpor ymm10, ymm1, ymmword ptr [rsp + 384] # 32-byte Folded Reload - vpor ymm9, ymm1, ymmword ptr [rsp + 352] # 32-byte Folded Reload - vpor ymm12, ymm1, ymmword ptr [rsp + 448] # 32-byte Folded Reload - vpor ymm11, ymm1, ymmword ptr [rsp + 416] # 32-byte Folded Reload - vpor ymm8, ymm1, ymmword ptr [rsp + 320] # 32-byte Folded Reload - vpor ymm7, ymm4, ymm1 - vperm2i128 ymm1, ymm8, ymm7, 49 # ymm1 = ymm8[2,3],ymm7[2,3] - vinserti128 ymm2, ymm8, xmm7, 1 - vshufps ymm1, ymm2, ymm1, 136 # ymm1 = 
ymm2[0,2],ymm1[0,2],ymm2[4,6],ymm1[4,6] - vperm2i128 ymm2, ymm12, ymm11, 49 # ymm2 = ymm12[2,3],ymm11[2,3] - vinserti128 ymm3, ymm12, xmm11, 1 - vshufps ymm2, ymm3, ymm2, 136 # ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6] - vperm2i128 ymm3, ymm10, ymm9, 49 # ymm3 = ymm10[2,3],ymm9[2,3] - vinserti128 ymm13, ymm10, xmm9, 1 - vshufps ymm3, ymm13, ymm3, 136 # ymm3 = ymm13[0,2],ymm3[0,2],ymm13[4,6],ymm3[4,6] - vperm2i128 ymm13, ymm15, ymm5, 49 # ymm13 = ymm15[2,3],ymm5[2,3] - vinserti128 ymm14, ymm15, xmm5, 1 - vshufps ymm13, ymm14, ymm13, 136 # ymm13 = ymm14[0,2],ymm13[0,2],ymm14[4,6],ymm13[4,6] - vpcmpgtd ymm13, ymm0, ymm13 - vpcmpgtd ymm3, ymm0, ymm3 - vpackssdw ymm3, ymm13, ymm3 - vpcmpgtd ymm2, ymm0, ymm2 - vpcmpgtd ymm1, ymm0, ymm1 - vpackssdw ymm1, ymm2, ymm1 - vpermq ymm2, ymm3, 216 # ymm2 = ymm3[0,2,1,3] - vpermq ymm1, ymm1, 216 # ymm1 = ymm1[0,2,1,3] - vpacksswb ymm1, ymm2, ymm1 - vpand ymm6, ymm1, ymm6 - vmovd ecx, xmm6 - # implicit-def: $ymm14 - test cl, 1 - je .LBB0_516 -# %bb.939: # in Loop: Header=BB0_26 Depth=1 - vpbroadcastb ymm14, byte ptr [rdi + rdx] - vpextrb ecx, xmm6, 1 - test cl, 1 - jne .LBB0_940 -.LBB0_517: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 104] # 8-byte Reload - vpextrb ecx, xmm6, 2 - test cl, 1 - je .LBB0_519 -.LBB0_518: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rbx], 2 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_519: # in Loop: Header=BB0_26 Depth=1 - mov rsi, qword ptr [rsp + 96] # 8-byte Reload - mov r10, qword ptr [rsp + 72] # 8-byte Reload - vpextrb ecx, xmm6, 3 - test cl, 1 - je .LBB0_520 -# %bb.941: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 272] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rcx], 3 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 4 - test cl, 1 - jne .LBB0_942 -.LBB0_521: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 5 - test cl, 1 - je .LBB0_522 
-.LBB0_943: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rdx], 5 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 6 - test cl, 1 - jne .LBB0_944 -.LBB0_523: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 7 - test cl, 1 - je .LBB0_524 -.LBB0_945: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r9], 7 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 8 - test cl, 1 - jne .LBB0_946 -.LBB0_525: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 88] # 8-byte Reload - vpextrb ecx, xmm6, 9 - test cl, 1 - je .LBB0_527 -.LBB0_526: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r15], 9 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_527: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 144] # 8-byte Reload - mov rsi, qword ptr [rsp + 136] # 8-byte Reload - mov rbx, qword ptr [rsp + 128] # 8-byte Reload - mov r9, qword ptr [rsp + 120] # 8-byte Reload - vpextrb ecx, xmm6, 10 - test cl, 1 - je .LBB0_528 -# %bb.947: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 10 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 11 - test cl, 1 - jne .LBB0_948 -.LBB0_529: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 12 - test cl, 1 - je .LBB0_530 -.LBB0_949: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 248] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 12 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 13 - test cl, 1 - jne .LBB0_531 - jmp .LBB0_532 - .p2align 4, 0x90 -.LBB0_516: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 1 - test cl, 1 - je .LBB0_517 -.LBB0_940: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rsi], 1 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = 
ymm1[0,1,2,3],ymm14[4,5,6,7] - mov rdx, qword ptr [rsp + 104] # 8-byte Reload - vpextrb ecx, xmm6, 2 - test cl, 1 - jne .LBB0_518 - jmp .LBB0_519 - .p2align 4, 0x90 -.LBB0_520: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 4 - test cl, 1 - je .LBB0_521 -.LBB0_942: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 264] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rcx], 4 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 5 - test cl, 1 - jne .LBB0_943 -.LBB0_522: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 6 - test cl, 1 - je .LBB0_523 -.LBB0_944: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 6 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 7 - test cl, 1 - jne .LBB0_945 -.LBB0_524: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 8 - test cl, 1 - je .LBB0_525 -.LBB0_946: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rsi], 8 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - mov rdx, qword ptr [rsp + 88] # 8-byte Reload - vpextrb ecx, xmm6, 9 - test cl, 1 - jne .LBB0_526 - jmp .LBB0_527 - .p2align 4, 0x90 -.LBB0_528: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 11 - test cl, 1 - je .LBB0_529 -.LBB0_948: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 256] # 8-byte Reload - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 11 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] - vpextrb ecx, xmm6, 12 - test cl, 1 - jne .LBB0_949 -.LBB0_530: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 13 - test cl, 1 - je .LBB0_532 -.LBB0_531: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rdx], 13 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_532: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 80] # 8-byte Reload - mov rdx, qword ptr [rsp + 64] # 8-byte 
Reload - vpextrb ecx, xmm6, 14 - test cl, 1 - je .LBB0_534 -# %bb.533: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + rax], 14 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_534: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 15 - test cl, 1 - je .LBB0_536 -# %bb.535: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm1, xmm14, byte ptr [rdi + r10], 15 - vpblendd ymm14, ymm14, ymm1, 15 # ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] -.LBB0_536: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm6, 1 - vmovd eax, xmm1 - mov dword ptr [rsp + 44], eax # 4-byte Spill - test al, 1 - je .LBB0_538 -# %bb.537: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rdx], 0 - vinserti128 ymm14, ymm14, xmm2, 1 -.LBB0_538: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 56] # 8-byte Reload - vpextrb ecx, xmm1, 1 - mov dword ptr [rsp + 40], ecx # 4-byte Spill - test cl, 1 - je .LBB0_539 -# %bb.950: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rsi], 1 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 2 - mov dword ptr [rsp + 36], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_951 -.LBB0_540: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 3 - mov dword ptr [rsp + 32], ecx # 4-byte Spill - test cl, 1 - je .LBB0_541 -.LBB0_952: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + r9], 3 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 4 - mov dword ptr [rsp + 28], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_953 -.LBB0_542: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm1, 5 - mov dword ptr [rsp + 24], eax # 4-byte Spill - test al, 1 - je .LBB0_544 -.LBB0_543: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + r13], 5 - vinserti128 ymm14, ymm14, xmm2, 1 -.LBB0_544: # in Loop: Header=BB0_26 
Depth=1 - mov rax, qword ptr [rsp + 112] # 8-byte Reload - vpextrb ecx, xmm1, 6 - mov dword ptr [rsp + 20], ecx # 4-byte Spill - test cl, 1 - je .LBB0_545 -# %bb.954: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 6 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r9d, xmm1, 7 - test r9b, 1 - jne .LBB0_955 -.LBB0_546: # in Loop: Header=BB0_26 Depth=1 - vpextrb edx, xmm1, 8 - test dl, 1 - je .LBB0_547 -.LBB0_956: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 216] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 8 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 9 - test cl, 1 - jne .LBB0_957 -.LBB0_548: # in Loop: Header=BB0_26 Depth=1 - vpextrb esi, xmm1, 10 - test sil, 1 - je .LBB0_549 -.LBB0_958: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 200] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 10 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb eax, xmm1, 11 - test al, 1 - jne .LBB0_959 -.LBB0_550: # in Loop: Header=BB0_26 Depth=1 - vpextrb r13d, xmm1, 12 - test r13b, 1 - je .LBB0_551 -.LBB0_960: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 184] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 12 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r10d, xmm1, 13 - test r10b, 1 - jne .LBB0_961 -.LBB0_552: # in Loop: Header=BB0_26 Depth=1 - vpextrb r11d, xmm1, 14 - test r11b, 1 - je .LBB0_553 -.LBB0_962: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 168] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 14 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r14d, xmm1, 15 - test r14b, 1 - jne .LBB0_554 - jmp .LBB0_555 - .p2align 4, 0x90 -.LBB0_539: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 2 - mov dword ptr [rsp + 36], ecx # 4-byte Spill - test cl, 1 - je .LBB0_540 -.LBB0_951: # in 
Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 2 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb ecx, xmm1, 3 - mov dword ptr [rsp + 32], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_952 -.LBB0_541: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 4 - mov dword ptr [rsp + 28], ecx # 4-byte Spill - test cl, 1 - je .LBB0_542 -.LBB0_953: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 4 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb eax, xmm1, 5 - mov dword ptr [rsp + 24], eax # 4-byte Spill - test al, 1 - jne .LBB0_543 - jmp .LBB0_544 - .p2align 4, 0x90 -.LBB0_545: # in Loop: Header=BB0_26 Depth=1 - vpextrb r9d, xmm1, 7 - test r9b, 1 - je .LBB0_546 -.LBB0_955: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 240] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 7 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb edx, xmm1, 8 - test dl, 1 - jne .LBB0_956 -.LBB0_547: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm1, 9 - test cl, 1 - je .LBB0_548 -.LBB0_957: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rax, qword ptr [rsp + 208] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rax], 9 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb esi, xmm1, 10 - test sil, 1 - jne .LBB0_958 -.LBB0_549: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm1, 11 - test al, 1 - je .LBB0_550 -.LBB0_959: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 192] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr [rdi + rbx], 11 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r13d, xmm1, 12 - test r13b, 1 - jne .LBB0_960 -.LBB0_551: # in Loop: Header=BB0_26 Depth=1 - vpextrb r10d, xmm1, 13 - test r10b, 1 - je .LBB0_552 -.LBB0_961: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm2, ymm14, 1 - mov rbx, qword ptr [rsp + 176] # 8-byte Reload - vpinsrb xmm2, xmm2, byte ptr 
[rdi + rbx], 13 - vinserti128 ymm14, ymm14, xmm2, 1 - vpextrb r11d, xmm1, 14 - test r11b, 1 - jne .LBB0_962 -.LBB0_553: # in Loop: Header=BB0_26 Depth=1 - vpextrb r14d, xmm1, 15 - test r14b, 1 - je .LBB0_555 -.LBB0_554: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm14, 1 - mov rbx, qword ptr [rsp + 160] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rbx], 15 - vinserti128 ymm14, ymm14, xmm1, 1 -.LBB0_555: # in Loop: Header=BB0_26 Depth=1 - vpsrlw ymm1, ymm14, 6 - vpand ymm14, ymm1, ymmword ptr [rip + .LCPI0_4] - vmovd r15d, xmm6 - test r15b, 1 - je .LBB0_556 -# %bb.963: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm15 - vpextrb byte ptr [r8 + rbx], xmm14, 0 - vpextrb ebx, xmm6, 1 - test bl, 1 - jne .LBB0_964 -.LBB0_557: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 2 - test bl, 1 - mov r15, qword ptr [rsp + 224] # 8-byte Reload - je .LBB0_558 -.LBB0_965: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 2 - vpextrb ebx, xmm6, 3 - test bl, 1 - jne .LBB0_966 -.LBB0_559: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 4 - test bl, 1 - je .LBB0_560 -.LBB0_967: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm5 - vpextrb byte ptr [r8 + rbx], xmm14, 4 - vpextrb ebx, xmm6, 5 - test bl, 1 - jne .LBB0_968 -.LBB0_561: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 6 - test bl, 1 - je .LBB0_562 -.LBB0_969: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 6 - vpextrb ebx, xmm6, 7 - test bl, 1 - jne .LBB0_970 -.LBB0_563: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 8 - test bl, 1 - je .LBB0_564 -.LBB0_971: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm12 - vpextrb byte ptr [r8 + rbx], xmm14, 8 - vpextrb ebx, xmm6, 9 - test bl, 1 - jne .LBB0_972 -.LBB0_565: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 10 - test bl, 1 - je .LBB0_566 -.LBB0_973: # in Loop: Header=BB0_26 Depth=1 - 
vextracti128 xmm1, ymm12, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 10 - vpextrb ebx, xmm6, 11 - test bl, 1 - jne .LBB0_974 -.LBB0_567: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 12 - test bl, 1 - je .LBB0_568 -.LBB0_975: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm11 - vpextrb byte ptr [r8 + rbx], xmm14, 12 - vpextrb ebx, xmm6, 13 - test bl, 1 - jne .LBB0_976 -.LBB0_569: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 14 - test bl, 1 - je .LBB0_570 -.LBB0_977: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm14, 14 - vpextrb ebx, xmm6, 15 - test bl, 1 - jne .LBB0_978 -.LBB0_571: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 44], 1 # 1-byte Folded Reload - je .LBB0_572 -.LBB0_979: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm10 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 0 - test byte ptr [rsp + 40], 1 # 1-byte Folded Reload - jne .LBB0_980 -.LBB0_573: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 36], 1 # 1-byte Folded Reload - je .LBB0_574 -.LBB0_981: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vmovq rbx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 2 - test byte ptr [rsp + 32], 1 # 1-byte Folded Reload - jne .LBB0_982 -.LBB0_575: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 28], 1 # 1-byte Folded Reload - je .LBB0_576 -.LBB0_983: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm9 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 4 - test byte ptr [rsp + 24], 1 # 1-byte Folded Reload - jne .LBB0_984 -.LBB0_577: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 20], 1 # 1-byte Folded Reload - je .LBB0_578 -.LBB0_985: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vmovq rbx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 6 - test r9b, 1 - jne .LBB0_986 -.LBB0_579: # in Loop: Header=BB0_26 
Depth=1 - test dl, 1 - mov rbx, qword ptr [rsp + 296] # 8-byte Reload - je .LBB0_580 -.LBB0_987: # in Loop: Header=BB0_26 Depth=1 - vmovq rdx, xmm8 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rdx], xmm1, 8 - test cl, 1 - jne .LBB0_988 -.LBB0_581: # in Loop: Header=BB0_26 Depth=1 - test sil, 1 - mov rdx, qword ptr [rsp + 304] # 8-byte Reload - je .LBB0_582 -.LBB0_989: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 10 - test al, 1 - mov rsi, qword ptr [rsp + 152] # 8-byte Reload - jne .LBB0_990 -.LBB0_583: # in Loop: Header=BB0_26 Depth=1 - test r13b, 1 - je .LBB0_584 -.LBB0_991: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm7 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 12 - test r10b, 1 - mov r13, qword ptr [rsp + 280] # 8-byte Reload - jne .LBB0_992 -.LBB0_585: # in Loop: Header=BB0_26 Depth=1 - test r11b, 1 - je .LBB0_586 -.LBB0_993: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 14 - test r14b, 1 - mov rax, qword ptr [rsp + 288] # 8-byte Reload - mov r9, qword ptr [rsp + 232] # 8-byte Reload - jne .LBB0_587 - jmp .LBB0_588 - .p2align 4, 0x90 -.LBB0_556: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 1 - test bl, 1 - je .LBB0_557 -.LBB0_964: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm15, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 1 - vpextrb ebx, xmm6, 2 - test bl, 1 - mov r15, qword ptr [rsp + 224] # 8-byte Reload - jne .LBB0_965 -.LBB0_558: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 3 - test bl, 1 - je .LBB0_559 -.LBB0_966: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 3 - vpextrb ebx, xmm6, 4 - test bl, 1 - jne .LBB0_967 -.LBB0_560: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 5 - test bl, 1 - je .LBB0_561 -.LBB0_968: # 
in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm5, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 5 - vpextrb ebx, xmm6, 6 - test bl, 1 - jne .LBB0_969 -.LBB0_562: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 7 - test bl, 1 - je .LBB0_563 -.LBB0_970: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 7 - vpextrb ebx, xmm6, 8 - test bl, 1 - jne .LBB0_971 -.LBB0_564: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 9 - test bl, 1 - je .LBB0_565 -.LBB0_972: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm12, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 9 - vpextrb ebx, xmm6, 10 - test bl, 1 - jne .LBB0_973 -.LBB0_566: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 11 - test bl, 1 - je .LBB0_567 -.LBB0_974: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm12, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 11 - vpextrb ebx, xmm6, 12 - test bl, 1 - jne .LBB0_975 -.LBB0_568: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 13 - test bl, 1 - je .LBB0_569 -.LBB0_976: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm11, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 13 - vpextrb ebx, xmm6, 14 - test bl, 1 - jne .LBB0_977 -.LBB0_570: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm6, 15 - test bl, 1 - je .LBB0_571 -.LBB0_978: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm14, 15 - test byte ptr [rsp + 44], 1 # 1-byte Folded Reload - jne .LBB0_979 -.LBB0_572: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 40], 1 # 1-byte Folded Reload - je .LBB0_573 -.LBB0_980: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm10, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 1 - test byte ptr [rsp + 36], 1 # 1-byte Folded Reload - jne .LBB0_981 -.LBB0_574: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 32], 1 # 1-byte Folded Reload - je .LBB0_575 -.LBB0_982: # in Loop: 
Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vpextrq rbx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 3 - test byte ptr [rsp + 28], 1 # 1-byte Folded Reload - jne .LBB0_983 -.LBB0_576: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 24], 1 # 1-byte Folded Reload - je .LBB0_577 -.LBB0_984: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm9, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 5 - test byte ptr [rsp + 20], 1 # 1-byte Folded Reload - jne .LBB0_985 -.LBB0_578: # in Loop: Header=BB0_26 Depth=1 - test r9b, 1 - je .LBB0_579 -.LBB0_986: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vpextrq rbx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 7 - test dl, 1 - mov rbx, qword ptr [rsp + 296] # 8-byte Reload - jne .LBB0_987 -.LBB0_580: # in Loop: Header=BB0_26 Depth=1 - test cl, 1 - je .LBB0_581 -.LBB0_988: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm8, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 9 - test sil, 1 - mov rdx, qword ptr [rsp + 304] # 8-byte Reload - jne .LBB0_989 -.LBB0_582: # in Loop: Header=BB0_26 Depth=1 - test al, 1 - mov rsi, qword ptr [rsp + 152] # 8-byte Reload - je .LBB0_583 -.LBB0_990: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 11 - test r13b, 1 - jne .LBB0_991 -.LBB0_584: # in Loop: Header=BB0_26 Depth=1 - test r10b, 1 - mov r13, qword ptr [rsp + 280] # 8-byte Reload - je .LBB0_585 -.LBB0_992: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm7, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 13 - test r11b, 1 - jne .LBB0_993 -.LBB0_586: # in Loop: Header=BB0_26 Depth=1 - test r14b, 1 - mov rax, qword ptr [rsp + 288] # 8-byte Reload - mov r9, qword ptr [rsp + 232] # 8-byte Reload - je .LBB0_588 -.LBB0_587: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - 
vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm14, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 15 -.LBB0_588: # in Loop: Header=BB0_26 Depth=1 - vmovdqa ymm1, ymmword ptr [rsp + 576] # 32-byte Reload - vpor ymm11, ymm1, ymmword ptr [rsp + 512] # 32-byte Folded Reload - vpor ymm10, ymm1, ymmword ptr [rsp + 480] # 32-byte Folded Reload - vpor ymm8, ymm1, ymmword ptr [rsp + 384] # 32-byte Folded Reload - vpor ymm7, ymm1, ymmword ptr [rsp + 352] # 32-byte Folded Reload - vpor ymm9, ymm1, ymmword ptr [rsp + 448] # 32-byte Folded Reload - vpor ymm5, ymm1, ymmword ptr [rsp + 416] # 32-byte Folded Reload - vpor ymm2, ymm1, ymmword ptr [rsp + 320] # 32-byte Folded Reload - vpor ymm15, ymm4, ymm1 - vperm2i128 ymm3, ymm2, ymm15, 49 # ymm3 = ymm2[2,3],ymm15[2,3] - vinserti128 ymm4, ymm2, xmm15, 1 - vshufps ymm3, ymm4, ymm3, 136 # ymm3 = ymm4[0,2],ymm3[0,2],ymm4[4,6],ymm3[4,6] - vperm2i128 ymm4, ymm9, ymm5, 49 # ymm4 = ymm9[2,3],ymm5[2,3] - vinserti128 ymm12, ymm9, xmm5, 1 - vshufps ymm4, ymm12, ymm4, 136 # ymm4 = ymm12[0,2],ymm4[0,2],ymm12[4,6],ymm4[4,6] - vperm2i128 ymm12, ymm8, ymm7, 49 # ymm12 = ymm8[2,3],ymm7[2,3] - vinserti128 ymm13, ymm8, xmm7, 1 - vshufps ymm12, ymm13, ymm12, 136 # ymm12 = ymm13[0,2],ymm12[0,2],ymm13[4,6],ymm12[4,6] - vperm2i128 ymm13, ymm11, ymm10, 49 # ymm13 = ymm11[2,3],ymm10[2,3] - vinserti128 ymm14, ymm11, xmm10, 1 - vshufps ymm13, ymm14, ymm13, 136 # ymm13 = ymm14[0,2],ymm13[0,2],ymm14[4,6],ymm13[4,6] - vpcmpgtd ymm13, ymm0, ymm13 - vpcmpgtd ymm12, ymm0, ymm12 - vpackssdw ymm12, ymm13, ymm12 - vpermq ymm12, ymm12, 216 # ymm12 = ymm12[0,2,1,3] - vpcmpgtd ymm4, ymm0, ymm4 - vpcmpgtd ymm3, ymm0, ymm3 - vpackssdw ymm3, ymm4, ymm3 - vpermq ymm3, ymm3, 216 # ymm3 = ymm3[0,2,1,3] - vpacksswb ymm3, ymm12, ymm3 - vpand ymm3, ymm3, ymm6 - vmovd ecx, xmm3 - # implicit-def: $ymm4 - test cl, 1 - je .LBB0_589 -# %bb.994: # in Loop: Header=BB0_26 Depth=1 - vpbroadcastb ymm4, byte ptr [rdi + rdx] - vpextrb ecx, xmm3, 1 - test cl, 1 - jne .LBB0_995 -.LBB0_590: # in Loop: 
Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 104] # 8-byte Reload - vpextrb ecx, xmm3, 2 - test cl, 1 - je .LBB0_592 -.LBB0_591: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm6, xmm4, byte ptr [rdi + rbx], 2 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -.LBB0_592: # in Loop: Header=BB0_26 Depth=1 - mov rsi, qword ptr [rsp + 96] # 8-byte Reload - mov r10, qword ptr [rsp + 72] # 8-byte Reload - vpextrb ecx, xmm3, 3 - test cl, 1 - je .LBB0_593 -# %bb.996: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 272] # 8-byte Reload - vpinsrb xmm6, xmm4, byte ptr [rdi + rcx], 3 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] - vpextrb ecx, xmm3, 4 - test cl, 1 - jne .LBB0_997 -.LBB0_594: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm3, 5 - test cl, 1 - je .LBB0_595 -.LBB0_998: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm6, xmm4, byte ptr [rdi + rdx], 5 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] - vpextrb ecx, xmm3, 6 - test cl, 1 - jne .LBB0_999 -.LBB0_596: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm3, 7 - test cl, 1 - je .LBB0_597 -.LBB0_1000: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm6, xmm4, byte ptr [rdi + r9], 7 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] - vpextrb ecx, xmm3, 8 - test cl, 1 - jne .LBB0_1001 -.LBB0_598: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 88] # 8-byte Reload - vpextrb ecx, xmm3, 9 - test cl, 1 - je .LBB0_600 -.LBB0_599: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm6, xmm4, byte ptr [rdi + r15], 9 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -.LBB0_600: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 144] # 8-byte Reload - mov rsi, qword ptr [rsp + 136] # 8-byte Reload - mov rbx, qword ptr [rsp + 128] # 8-byte Reload - mov r9, qword ptr [rsp + 120] # 8-byte Reload - vpextrb ecx, xmm3, 10 - test cl, 1 - je .LBB0_601 -# %bb.1002: # in Loop: Header=BB0_26 Depth=1 - 
vpinsrb xmm6, xmm4, byte ptr [rdi + rax], 10 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] - vpextrb ecx, xmm3, 11 - test cl, 1 - jne .LBB0_1003 -.LBB0_602: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm3, 12 - test cl, 1 - je .LBB0_603 -.LBB0_1004: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 248] # 8-byte Reload - vpinsrb xmm6, xmm4, byte ptr [rdi + rax], 12 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] - vpextrb ecx, xmm3, 13 - test cl, 1 - jne .LBB0_604 - jmp .LBB0_605 - .p2align 4, 0x90 -.LBB0_589: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm3, 1 - test cl, 1 - je .LBB0_590 -.LBB0_995: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm6, xmm4, byte ptr [rdi + rsi], 1 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] - mov rdx, qword ptr [rsp + 104] # 8-byte Reload - vpextrb ecx, xmm3, 2 - test cl, 1 - jne .LBB0_591 - jmp .LBB0_592 - .p2align 4, 0x90 -.LBB0_593: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm3, 4 - test cl, 1 - je .LBB0_594 -.LBB0_997: # in Loop: Header=BB0_26 Depth=1 - mov rcx, qword ptr [rsp + 264] # 8-byte Reload - vpinsrb xmm6, xmm4, byte ptr [rdi + rcx], 4 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] - vpextrb ecx, xmm3, 5 - test cl, 1 - jne .LBB0_998 -.LBB0_595: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm3, 6 - test cl, 1 - je .LBB0_596 -.LBB0_999: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm6, xmm4, byte ptr [rdi + rax], 6 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] - vpextrb ecx, xmm3, 7 - test cl, 1 - jne .LBB0_1000 -.LBB0_597: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm3, 8 - test cl, 1 - je .LBB0_598 -.LBB0_1001: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm6, xmm4, byte ptr [rdi + rsi], 8 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] - mov rdx, qword ptr [rsp + 88] # 8-byte Reload - vpextrb ecx, xmm3, 9 - test cl, 1 - jne .LBB0_599 - jmp .LBB0_600 - 
.p2align 4, 0x90 -.LBB0_601: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm3, 11 - test cl, 1 - je .LBB0_602 -.LBB0_1003: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 256] # 8-byte Reload - vpinsrb xmm6, xmm4, byte ptr [rdi + rax], 11 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] - vpextrb ecx, xmm3, 12 - test cl, 1 - jne .LBB0_1004 -.LBB0_603: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm3, 13 - test cl, 1 - je .LBB0_605 -.LBB0_604: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm6, xmm4, byte ptr [rdi + rdx], 13 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -.LBB0_605: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 80] # 8-byte Reload - mov rdx, qword ptr [rsp + 64] # 8-byte Reload - vpextrb ecx, xmm3, 14 - test cl, 1 - je .LBB0_607 -# %bb.606: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm6, xmm4, byte ptr [rdi + rax], 14 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -.LBB0_607: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm3, 15 - test cl, 1 - je .LBB0_609 -# %bb.608: # in Loop: Header=BB0_26 Depth=1 - vpinsrb xmm6, xmm4, byte ptr [rdi + r10], 15 - vpblendd ymm4, ymm4, ymm6, 15 # ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -.LBB0_609: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm6, ymm3, 1 - vmovd eax, xmm6 - mov dword ptr [rsp + 512], eax # 4-byte Spill - test al, 1 - je .LBB0_611 -# %bb.610: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rdx], 0 - vinserti128 ymm4, ymm4, xmm1, 1 -.LBB0_611: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 56] # 8-byte Reload - vpextrb ecx, xmm6, 1 - mov dword ptr [rsp + 480], ecx # 4-byte Spill - test cl, 1 - je .LBB0_612 -# %bb.1005: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rsi], 1 - vinserti128 ymm4, ymm4, xmm1, 1 - vpextrb ecx, xmm6, 2 - mov dword ptr [rsp + 448], ecx # 4-byte Spill - test cl, 
1 - jne .LBB0_1006 -.LBB0_613: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 3 - mov dword ptr [rsp + 416], ecx # 4-byte Spill - test cl, 1 - je .LBB0_614 -.LBB0_1007: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + r9], 3 - vinserti128 ymm4, ymm4, xmm1, 1 - vpextrb ecx, xmm6, 4 - mov dword ptr [rsp + 384], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_1008 -.LBB0_615: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm6, 5 - mov dword ptr [rsp + 352], eax # 4-byte Spill - test al, 1 - je .LBB0_617 -.LBB0_616: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + r13], 5 - vinserti128 ymm4, ymm4, xmm1, 1 -.LBB0_617: # in Loop: Header=BB0_26 Depth=1 - mov rax, qword ptr [rsp + 112] # 8-byte Reload - mov rbx, qword ptr [rsp + 184] # 8-byte Reload - mov rdx, qword ptr [rsp + 176] # 8-byte Reload - vpextrb ecx, xmm6, 6 - mov dword ptr [rsp + 320], ecx # 4-byte Spill - test cl, 1 - je .LBB0_618 -# %bb.1009: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 6 - vinserti128 ymm4, ymm4, xmm1, 1 - vpextrb eax, xmm6, 7 - mov dword ptr [rsp + 152], eax # 4-byte Spill - test al, 1 - jne .LBB0_1010 -.LBB0_619: # in Loop: Header=BB0_26 Depth=1 - vpextrb r9d, xmm6, 8 - test r9b, 1 - je .LBB0_620 -.LBB0_1011: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - mov rax, qword ptr [rsp + 216] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 8 - vinserti128 ymm4, ymm4, xmm1, 1 - vpextrb ecx, xmm6, 9 - test cl, 1 - jne .LBB0_1012 -.LBB0_621: # in Loop: Header=BB0_26 Depth=1 - vpextrb r11d, xmm6, 10 - test r11b, 1 - je .LBB0_622 -.LBB0_1013: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - mov rax, qword ptr [rsp + 200] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 10 - vinserti128 ymm4, ymm4, xmm1, 1 - vpextrb eax, xmm6, 11 - test al, 1 - jne .LBB0_1014 -.LBB0_623: # in Loop: 
Header=BB0_26 Depth=1 - vpextrb esi, xmm6, 12 - test sil, 1 - je .LBB0_624 -.LBB0_1015: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rbx], 12 - vinserti128 ymm4, ymm4, xmm1, 1 - vpextrb r10d, xmm6, 13 - test r10b, 1 - jne .LBB0_1016 -.LBB0_625: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 168] # 8-byte Reload - vpextrb r13d, xmm6, 14 - test r13b, 1 - je .LBB0_626 -.LBB0_1017: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rdx], 14 - vinserti128 ymm4, ymm4, xmm1, 1 - mov rdx, qword ptr [rsp + 160] # 8-byte Reload - vpextrb r14d, xmm6, 15 - test r14b, 1 - jne .LBB0_627 - jmp .LBB0_628 - .p2align 4, 0x90 -.LBB0_612: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 2 - mov dword ptr [rsp + 448], ecx # 4-byte Spill - test cl, 1 - je .LBB0_613 -.LBB0_1006: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rbx], 2 - vinserti128 ymm4, ymm4, xmm1, 1 - vpextrb ecx, xmm6, 3 - mov dword ptr [rsp + 416], ecx # 4-byte Spill - test cl, 1 - jne .LBB0_1007 -.LBB0_614: # in Loop: Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 4 - mov dword ptr [rsp + 384], ecx # 4-byte Spill - test cl, 1 - je .LBB0_615 -.LBB0_1008: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 4 - vinserti128 ymm4, ymm4, xmm1, 1 - vpextrb eax, xmm6, 5 - mov dword ptr [rsp + 352], eax # 4-byte Spill - test al, 1 - jne .LBB0_616 - jmp .LBB0_617 - .p2align 4, 0x90 -.LBB0_618: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm6, 7 - mov dword ptr [rsp + 152], eax # 4-byte Spill - test al, 1 - je .LBB0_619 -.LBB0_1010: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - mov rax, qword ptr [rsp + 240] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 7 - vinserti128 ymm4, ymm4, xmm1, 1 - vpextrb r9d, xmm6, 8 - test r9b, 1 - jne .LBB0_1011 -.LBB0_620: # in Loop: 
Header=BB0_26 Depth=1 - vpextrb ecx, xmm6, 9 - test cl, 1 - je .LBB0_621 -.LBB0_1012: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - mov rax, qword ptr [rsp + 208] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 9 - vinserti128 ymm4, ymm4, xmm1, 1 - vpextrb r11d, xmm6, 10 - test r11b, 1 - jne .LBB0_1013 -.LBB0_622: # in Loop: Header=BB0_26 Depth=1 - vpextrb eax, xmm6, 11 - test al, 1 - je .LBB0_623 -.LBB0_1014: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - mov rsi, qword ptr [rsp + 192] # 8-byte Reload - vpinsrb xmm1, xmm1, byte ptr [rdi + rsi], 11 - vinserti128 ymm4, ymm4, xmm1, 1 - vpextrb esi, xmm6, 12 - test sil, 1 - jne .LBB0_1015 -.LBB0_624: # in Loop: Header=BB0_26 Depth=1 - vpextrb r10d, xmm6, 13 - test r10b, 1 - je .LBB0_625 -.LBB0_1016: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rdx], 13 - vinserti128 ymm4, ymm4, xmm1, 1 - mov rdx, qword ptr [rsp + 168] # 8-byte Reload - vpextrb r13d, xmm6, 14 - test r13b, 1 - jne .LBB0_1017 -.LBB0_626: # in Loop: Header=BB0_26 Depth=1 - mov rdx, qword ptr [rsp + 160] # 8-byte Reload - vpextrb r14d, xmm6, 15 - test r14b, 1 - je .LBB0_628 -.LBB0_627: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm4, 1 - vpinsrb xmm1, xmm1, byte ptr [rdi + rdx], 15 - vinserti128 ymm4, ymm4, xmm1, 1 -.LBB0_628: # in Loop: Header=BB0_26 Depth=1 - vpsrlw ymm1, ymm4, 7 - vpand ymm4, ymm1, ymmword ptr [rip + .LCPI0_4] - vmovd r15d, xmm3 - test r15b, 1 - je .LBB0_629 -# %bb.1018: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm11 - vpextrb byte ptr [r8 + rbx], xmm4, 0 - vpextrb ebx, xmm3, 1 - test bl, 1 - jne .LBB0_1019 -.LBB0_630: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 2 - test bl, 1 - je .LBB0_631 -.LBB0_1020: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm4, 2 - vpextrb ebx, xmm3, 3 - test bl, 1 - jne .LBB0_1021 -.LBB0_632: # in Loop: Header=BB0_26 
Depth=1 - vpextrb ebx, xmm3, 4 - test bl, 1 - je .LBB0_633 -.LBB0_1022: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm10 - vpextrb byte ptr [r8 + rbx], xmm4, 4 - vpextrb ebx, xmm3, 5 - test bl, 1 - jne .LBB0_1023 -.LBB0_634: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 6 - test bl, 1 - je .LBB0_635 -.LBB0_1024: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm4, 6 - vpextrb ebx, xmm3, 7 - test bl, 1 - jne .LBB0_1025 -.LBB0_636: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 8 - test bl, 1 - je .LBB0_637 -.LBB0_1026: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm9 - vpextrb byte ptr [r8 + rbx], xmm4, 8 - vpextrb ebx, xmm3, 9 - test bl, 1 - jne .LBB0_1027 -.LBB0_638: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 10 - test bl, 1 - je .LBB0_639 -.LBB0_1028: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm4, 10 - vpextrb ebx, xmm3, 11 - test bl, 1 - jne .LBB0_1029 -.LBB0_640: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 12 - test bl, 1 - je .LBB0_641 -.LBB0_1030: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm5 - vpextrb byte ptr [r8 + rbx], xmm4, 12 - vpextrb ebx, xmm3, 13 - test bl, 1 - vmovdqa ymm9, ymmword ptr [rsp + 896] # 32-byte Reload - jne .LBB0_1031 -.LBB0_642: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 14 - test bl, 1 - je .LBB0_643 -.LBB0_1032: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vmovq rbx, xmm1 - vpextrb byte ptr [r8 + rbx], xmm4, 14 - vpextrb ebx, xmm3, 15 - test bl, 1 - jne .LBB0_1033 -.LBB0_644: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 512], 1 # 1-byte Folded Reload - vmovdqa ymm3, ymmword ptr [rsp + 832] # 32-byte Reload - je .LBB0_645 -.LBB0_1034: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm8 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 0 - test byte ptr [rsp + 480], 1 # 1-byte Folded Reload - jne 
.LBB0_1035 -.LBB0_646: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 448], 1 # 1-byte Folded Reload - je .LBB0_647 -.LBB0_1036: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vmovq rbx, xmm1 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 2 - test byte ptr [rsp + 416], 1 # 1-byte Folded Reload - jne .LBB0_1037 -.LBB0_648: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 384], 1 # 1-byte Folded Reload - je .LBB0_649 -.LBB0_1038: # in Loop: Header=BB0_26 Depth=1 - vmovq rbx, xmm7 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 4 - test byte ptr [rsp + 352], 1 # 1-byte Folded Reload - vmovdqa ymm8, ymmword ptr [rsp + 864] # 32-byte Reload - jne .LBB0_1039 -.LBB0_650: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 320], 1 # 1-byte Folded Reload - je .LBB0_651 -.LBB0_1040: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vmovq rbx, xmm1 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 6 - test byte ptr [rsp + 152], 1 # 1-byte Folded Reload - jne .LBB0_1041 -.LBB0_652: # in Loop: Header=BB0_26 Depth=1 - test r9b, 1 - mov r9d, dword ptr [rsp + 16] # 4-byte Reload - je .LBB0_653 -.LBB0_1042: # in Loop: Header=BB0_26 Depth=1 - vmovq rdx, xmm2 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rdx], xmm1, 8 - test cl, 1 - jne .LBB0_1043 -.LBB0_654: # in Loop: Header=BB0_26 Depth=1 - test r11b, 1 - mov r11, qword ptr [rsp + 304] # 8-byte Reload - je .LBB0_655 -.LBB0_1044: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm2, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 10 - test al, 1 - jne .LBB0_1045 -.LBB0_656: # in Loop: Header=BB0_26 Depth=1 - test sil, 1 - je .LBB0_657 -.LBB0_1046: # in Loop: Header=BB0_26 Depth=1 - vmovq rcx, xmm15 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 12 - test r10b, 1 - vmovdqa ymm2, ymmword ptr [rsp + 800] # 32-byte Reload - jne .LBB0_1047 -.LBB0_658: # in Loop: 
Header=BB0_26 Depth=1 - test r13b, 1 - mov r10, qword ptr [rsp + 48] # 8-byte Reload - je .LBB0_659 -.LBB0_1048: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vmovq rcx, xmm1 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 14 - test r14b, 1 - je .LBB0_25 - jmp .LBB0_1049 - .p2align 4, 0x90 -.LBB0_629: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 1 - test bl, 1 - je .LBB0_630 -.LBB0_1019: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm11, 1 - vpextrb byte ptr [r8 + rbx], xmm4, 1 - vpextrb ebx, xmm3, 2 - test bl, 1 - jne .LBB0_1020 -.LBB0_631: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 3 - test bl, 1 - je .LBB0_632 -.LBB0_1021: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm11, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm4, 3 - vpextrb ebx, xmm3, 4 - test bl, 1 - jne .LBB0_1022 -.LBB0_633: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 5 - test bl, 1 - je .LBB0_634 -.LBB0_1023: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm10, 1 - vpextrb byte ptr [r8 + rbx], xmm4, 5 - vpextrb ebx, xmm3, 6 - test bl, 1 - jne .LBB0_1024 -.LBB0_635: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 7 - test bl, 1 - je .LBB0_636 -.LBB0_1025: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm10, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm4, 7 - vpextrb ebx, xmm3, 8 - test bl, 1 - jne .LBB0_1026 -.LBB0_637: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 9 - test bl, 1 - je .LBB0_638 -.LBB0_1027: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm9, 1 - vpextrb byte ptr [r8 + rbx], xmm4, 9 - vpextrb ebx, xmm3, 10 - test bl, 1 - jne .LBB0_1028 -.LBB0_639: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 11 - test bl, 1 - je .LBB0_640 -.LBB0_1029: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm9, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm4, 11 - vpextrb ebx, xmm3, 12 - test bl, 1 - jne .LBB0_1030 -.LBB0_641: # in Loop: 
Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 13 - test bl, 1 - vmovdqa ymm9, ymmword ptr [rsp + 896] # 32-byte Reload - je .LBB0_642 -.LBB0_1031: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm5, 1 - vpextrb byte ptr [r8 + rbx], xmm4, 13 - vpextrb ebx, xmm3, 14 - test bl, 1 - jne .LBB0_1032 -.LBB0_643: # in Loop: Header=BB0_26 Depth=1 - vpextrb ebx, xmm3, 15 - test bl, 1 - je .LBB0_644 -.LBB0_1033: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm5, 1 - vpextrq rbx, xmm1, 1 - vpextrb byte ptr [r8 + rbx], xmm4, 15 - test byte ptr [rsp + 512], 1 # 1-byte Folded Reload - vmovdqa ymm3, ymmword ptr [rsp + 832] # 32-byte Reload - jne .LBB0_1034 -.LBB0_645: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 480], 1 # 1-byte Folded Reload - je .LBB0_646 -.LBB0_1035: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm8, 1 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 1 - test byte ptr [rsp + 448], 1 # 1-byte Folded Reload - jne .LBB0_1036 -.LBB0_647: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 416], 1 # 1-byte Folded Reload - je .LBB0_648 -.LBB0_1037: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm8, 1 - vpextrq rbx, xmm1, 1 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 3 - test byte ptr [rsp + 384], 1 # 1-byte Folded Reload - jne .LBB0_1038 -.LBB0_649: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 352], 1 # 1-byte Folded Reload - vmovdqa ymm8, ymmword ptr [rsp + 864] # 32-byte Reload - je .LBB0_650 -.LBB0_1039: # in Loop: Header=BB0_26 Depth=1 - vpextrq rbx, xmm7, 1 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 5 - test byte ptr [rsp + 320], 1 # 1-byte Folded Reload - jne .LBB0_1040 -.LBB0_651: # in Loop: Header=BB0_26 Depth=1 - test byte ptr [rsp + 152], 1 # 1-byte Folded Reload - je .LBB0_652 -.LBB0_1041: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm7, 1 - vpextrq rbx, xmm1, 1 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rbx], xmm1, 7 - test 
r9b, 1 - mov r9d, dword ptr [rsp + 16] # 4-byte Reload - jne .LBB0_1042 -.LBB0_653: # in Loop: Header=BB0_26 Depth=1 - test cl, 1 - je .LBB0_654 -.LBB0_1043: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm2, 1 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 9 - test r11b, 1 - mov r11, qword ptr [rsp + 304] # 8-byte Reload - jne .LBB0_1044 -.LBB0_655: # in Loop: Header=BB0_26 Depth=1 - test al, 1 - je .LBB0_656 -.LBB0_1045: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm2, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 11 - test sil, 1 - jne .LBB0_1046 -.LBB0_657: # in Loop: Header=BB0_26 Depth=1 - test r10b, 1 - vmovdqa ymm2, ymmword ptr [rsp + 800] # 32-byte Reload - je .LBB0_658 -.LBB0_1047: # in Loop: Header=BB0_26 Depth=1 - vpextrq rcx, xmm15, 1 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 13 - test r13b, 1 - mov r10, qword ptr [rsp + 48] # 8-byte Reload - jne .LBB0_1048 -.LBB0_659: # in Loop: Header=BB0_26 Depth=1 - test r14b, 1 - je .LBB0_25 -.LBB0_1049: # in Loop: Header=BB0_26 Depth=1 - vextracti128 xmm1, ymm15, 1 - vpextrq rcx, xmm1, 1 - vextracti128 xmm1, ymm4, 1 - vpextrb byte ptr [r8 + rcx], xmm1, 15 - jmp .LBB0_25 -.LBB0_1050: - cmp r12, r10 - jne .LBB0_1055 -.LBB0_1051: - lea rsp, [rbp - 40] - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 + mov r8d, esi + shl r8, 3 + xor r10d, r10d + jmp .LBB0_2 + .p2align 4, 0x90 +.LBB0_4: # in Loop: Header=BB0_2 Depth=1 + add r10, 8 + add rdi, 1 + cmp r8, r10 + je .LBB0_5 +.LBB0_2: # =>This Inner Loop Header: Depth=1 + cmp r10d, ecx + jge .LBB0_4 +# %bb.3: # in Loop: Header=BB0_2 Depth=1 + mov r9d, r10d + movzx eax, byte ptr [rdi] + and al, 1 + mov byte ptr [rdx + r9], al + mov rsi, r9 + or rsi, 1 + cmp esi, ecx + jge .LBB0_4 +# %bb.6: # in Loop: Header=BB0_2 Depth=1 + movzx eax, byte ptr [rdi] + shr al + and al, 1 + mov byte ptr [rdx + rsi], al + mov rsi, r9 + or rsi, 2 + cmp esi, ecx + jge .LBB0_4 +# %bb.7: # in 
Loop: Header=BB0_2 Depth=1 + movzx eax, byte ptr [rdi] + shr al, 2 + and al, 1 + mov byte ptr [rdx + rsi], al + mov rsi, r9 + or rsi, 3 + cmp esi, ecx + jge .LBB0_4 +# %bb.8: # in Loop: Header=BB0_2 Depth=1 + movzx eax, byte ptr [rdi] + shr al, 3 + and al, 1 + mov byte ptr [rdx + rsi], al + mov rsi, r9 + or rsi, 4 + cmp esi, ecx + jge .LBB0_4 +# %bb.9: # in Loop: Header=BB0_2 Depth=1 + movzx eax, byte ptr [rdi] + shr al, 4 + and al, 1 + mov byte ptr [rdx + rsi], al + mov rsi, r9 + or rsi, 5 + cmp esi, ecx + jge .LBB0_4 +# %bb.10: # in Loop: Header=BB0_2 Depth=1 + movzx eax, byte ptr [rdi] + shr al, 5 + and al, 1 + mov byte ptr [rdx + rsi], al + mov rsi, r9 + or rsi, 6 + cmp esi, ecx + jge .LBB0_4 +# %bb.11: # in Loop: Header=BB0_2 Depth=1 + movzx eax, byte ptr [rdi] + shr al, 6 + and al, 1 + mov byte ptr [rdx + rsi], al + or r9, 7 + cmp r9d, ecx + jge .LBB0_4 +# %bb.12: # in Loop: Header=BB0_2 Depth=1 + movzx eax, byte ptr [rdi] + shr al, 7 + mov byte ptr [rdx + r9], al + jmp .LBB0_4 +.LBB0_5: + mov rsp, rbp pop rbp - vzeroupper ret -.LBB0_1052: - mov r9d, dword ptr [rsp + 16] # 4-byte Reload - mov r10, qword ptr [rsp + 48] # 8-byte Reload - jmp .LBB0_1055 -.LBB0_1054: - mov r9d, dword ptr [rsp + 16] # 4-byte Reload - jmp .LBB0_1055 .Lfunc_end0: .size bytes_to_bools_avx2, .Lfunc_end0-bytes_to_bools_avx2 # -- End function - .ident "Ubuntu clang version 11.1.0-++20210204121720+1fdec59bffc1-1~exp1~20210203232336.162" + .ident "Debian clang version 11.1.0-++20210428103820+1fdec59bffc1-1~exp1~20210428204437.162" .section ".note.GNU-stack","",@progbits .addrsig diff --git a/go/parquet/internal/utils/_lib/unpack_bool_sse4.s b/go/parquet/internal/utils/_lib/unpack_bool_sse4.s index 18caa0473df..6719771b865 100644 --- a/go/parquet/internal/utils/_lib/unpack_bool_sse4.s +++ b/go/parquet/internal/utils/_lib/unpack_bool_sse4.s @@ -99,6 +99,6 @@ bytes_to_bools_sse4: # @bytes_to_bools_sse4 .Lfunc_end0: .size bytes_to_bools_sse4, .Lfunc_end0-bytes_to_bools_sse4 # -- End function 
- .ident "Ubuntu clang version 11.1.0-++20210204121720+1fdec59bffc1-1~exp1~20210203232336.162" + .ident "Debian clang version 11.1.0-++20210428103820+1fdec59bffc1-1~exp1~20210428204437.162" .section ".note.GNU-stack","",@progbits .addrsig diff --git a/go/parquet/internal/utils/min_max_avx2_amd64.s b/go/parquet/internal/utils/min_max_avx2_amd64.s index 6a1bb18fde6..a54758ba1ed 100644 --- a/go/parquet/internal/utils/min_max_avx2_amd64.s +++ b/go/parquet/internal/utils/min_max_avx2_amd64.s @@ -4,364 +4,188 @@ DATA LCDATA1<>+0x000(SB)/8, $0x7fffffff80000000 GLOBL LCDATA1<>(SB), 8, $8 -TEXT ·_int32_max_min_avx2(SB), $72-32 +TEXT ·_int32_max_min_avx2(SB), $0-32 MOVQ values+0(FP), DI MOVQ length+8(FP), SI MOVQ minout+16(FP), DX MOVQ maxout+24(FP), CX - ADDQ $8, SP LEAQ LCDATA1<>(SB), BP WORD $0xf685 // test esi, esi JLE LBB0_1 WORD $0x8941; BYTE $0xf0 // mov r8d, esi WORD $0xfe83; BYTE $0x1f // cmp esi, 31 - JA LBB0_6 - LONG $0x000000b8; BYTE $0x80 // mov eax, -2147483648 - LONG $0xffffb941; WORD $0x7fff // mov r9d, 2147483647 - WORD $0x3145; BYTE $0xdb // xor r11d, r11d - JMP LBB0_4 + JA LBB0_4 + LONG $0x0000ba41; WORD $0x8000 // mov r10d, -2147483648 + LONG $0xffffffb8; BYTE $0x7f // mov eax, 2147483647 + WORD $0x3145; BYTE $0xc9 // xor r9d, r9d + JMP LBB0_7 LBB0_1: - LONG $0xffffb941; WORD $0x7fff // mov r9d, 2147483647 - LONG $0x000000b8; BYTE $0x80 // mov eax, -2147483648 - JMP LBB0_14 + LONG $0xffffffb8; BYTE $0x7f // mov eax, 2147483647 + LONG $0x000000be; BYTE $0x80 // mov esi, -2147483648 + JMP LBB0_8 -LBB0_6: - WORD $0x8945; BYTE $0xc3 // mov r11d, r8d - LONG $0xe0e38341 // and r11d, -32 - LONG $0xe0438d49 // lea rax, [r11 - 32] - WORD $0x8949; BYTE $0xc2 // mov r10, rax - LONG $0x05eac149 // shr r10, 5 - LONG $0x01c28349 // add r10, 1 - WORD $0x8945; BYTE $0xd1 // mov r9d, r10d - LONG $0x03e18341 // and r9d, 3 - LONG $0x60f88348 // cmp rax, 96 - JAE LBB0_8 - LONG $0x587de2c4; WORD $0x0045 // vpbroadcastd ymm0, dword 0[rbp] /* [rip + .LCPI0_0] */ - LONG 
$0x587de2c4; WORD $0x044d // vpbroadcastd ymm1, dword 4[rbp] /* [rip + .LCPI0_1] */ - WORD $0xc031 // xor eax, eax - LONG $0xd16ffdc5 // vmovdqa ymm2, ymm1 - LONG $0xe16ffdc5 // vmovdqa ymm4, ymm1 - LONG $0xf16ffdc5 // vmovdqa ymm6, ymm1 - LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 - LONG $0xe86ffdc5 // vmovdqa ymm5, ymm0 - LONG $0xf86ffdc5 // vmovdqa ymm7, ymm0 - JMP LBB0_10 - -LBB0_8: - LONG $0xfce28349 // and r10, -4 - LONG $0x587de2c4; WORD $0x0045 // vpbroadcastd ymm0, dword 0[rbp] /* [rip + .LCPI0_0] */ - WORD $0xf749; BYTE $0xda // neg r10 - LONG $0x587de2c4; WORD $0x044d // vpbroadcastd ymm1, dword 4[rbp] /* [rip + .LCPI0_1] */ +LBB0_4: + WORD $0x8945; BYTE $0xc1 // mov r9d, r8d + LONG $0x587de2c4; WORD $0x0065 // vpbroadcastd ymm4, dword 0[rbp] /* [rip + .LCPI0_0] */ + LONG $0xe0e18341 // and r9d, -32 + LONG $0x587de2c4; WORD $0x0445 // vpbroadcastd ymm0, dword 4[rbp] /* [rip + .LCPI0_1] */ WORD $0xc031 // xor eax, eax - LONG $0xd16ffdc5 // vmovdqa ymm2, ymm1 - LONG $0xe16ffdc5 // vmovdqa ymm4, ymm1 - LONG $0xf16ffdc5 // vmovdqa ymm6, ymm1 + LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 + LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 - LONG $0xe86ffdc5 // vmovdqa ymm5, ymm0 - LONG $0xf86ffdc5 // vmovdqa ymm7, ymm0 + LONG $0xec6ffdc5 // vmovdqa ymm5, ymm4 + LONG $0xf46ffdc5 // vmovdqa ymm6, ymm4 + LONG $0xfc6ffdc5 // vmovdqa ymm7, ymm4 -LBB0_9: - LONG $0x046f7ec5; BYTE $0x87 // vmovdqu ymm8, yword [rdi + 4*rax] - LONG $0x4c6f7ec5; WORD $0x2087 // vmovdqu ymm9, yword [rdi + 4*rax + 32] - LONG $0x546f7ec5; WORD $0x4087 // vmovdqu ymm10, yword [rdi + 4*rax + 64] - LONG $0x5c6f7ec5; WORD $0x6087 // vmovdqu ymm11, yword [rdi + 4*rax + 96] - LONG $0x394dc2c4; BYTE $0xf3 // vpminsd ymm6, ymm6, ymm11 - LONG $0x395dc2c4; BYTE $0xe2 // vpminsd ymm4, ymm4, ymm10 - LONG $0x3975c2c4; BYTE $0xc8 // vpminsd ymm1, ymm1, ymm8 - LONG $0x396dc2c4; BYTE $0xd1 // vpminsd ymm2, ymm2, ymm9 - LONG $0x3d45c2c4; BYTE $0xfb // vpmaxsd ymm7, ymm7, ymm11 - LONG 
$0x3d55c2c4; BYTE $0xea // vpmaxsd ymm5, ymm5, ymm10 - LONG $0x3d7dc2c4; BYTE $0xc0 // vpmaxsd ymm0, ymm0, ymm8 - LONG $0x3d65c2c4; BYTE $0xd9 // vpmaxsd ymm3, ymm3, ymm9 - QUAD $0x0000e087846f7ec5; BYTE $0x00 // vmovdqu ymm8, yword [rdi + 4*rax + 224] - QUAD $0x0000c0878c6f7ec5; BYTE $0x00 // vmovdqu ymm9, yword [rdi + 4*rax + 192] - QUAD $0x00008087946f7ec5; BYTE $0x00 // vmovdqu ymm10, yword [rdi + 4*rax + 128] - QUAD $0x0000a0879c6f7ec5; BYTE $0x00 // vmovdqu ymm11, yword [rdi + 4*rax + 160] - QUAD $0x00010087a46f7ec5; BYTE $0x00 // vmovdqu ymm12, yword [rdi + 4*rax + 256] - QUAD $0x00014087ac6f7ec5; BYTE $0x00 // vmovdqu ymm13, yword [rdi + 4*rax + 320] - QUAD $0x00016087b46f7ec5; BYTE $0x00 // vmovdqu ymm14, yword [rdi + 4*rax + 352] - LONG $0x393d42c4; BYTE $0xfe // vpminsd ymm15, ymm8, ymm14 - LONG $0x394dc2c4; BYTE $0xf7 // vpminsd ymm6, ymm6, ymm15 - LONG $0x347ffdc5; BYTE $0x24 // vmovdqa yword [rsp], ymm6 - LONG $0x393542c4; BYTE $0xfd // vpminsd ymm15, ymm9, ymm13 - LONG $0x395dc2c4; BYTE $0xe7 // vpminsd ymm4, ymm4, ymm15 - LONG $0x392d42c4; BYTE $0xfc // vpminsd ymm15, ymm10, ymm12 - LONG $0x3975c2c4; BYTE $0xcf // vpminsd ymm1, ymm1, ymm15 - QUAD $0x00012087bc6f7ec5; BYTE $0x00 // vmovdqu ymm15, yword [rdi + 4*rax + 288] - LONG $0x3925c2c4; BYTE $0xf7 // vpminsd ymm6, ymm11, ymm15 - LONG $0x396de2c4; BYTE $0xd6 // vpminsd ymm2, ymm2, ymm6 - LONG $0x3d3dc2c4; BYTE $0xf6 // vpmaxsd ymm6, ymm8, ymm14 - LONG $0x3d45e2c4; BYTE $0xfe // vpmaxsd ymm7, ymm7, ymm6 - LONG $0x3d35c2c4; BYTE $0xf5 // vpmaxsd ymm6, ymm9, ymm13 - LONG $0x3d55e2c4; BYTE $0xee // vpmaxsd ymm5, ymm5, ymm6 - LONG $0x3d2dc2c4; BYTE $0xf4 // vpmaxsd ymm6, ymm10, ymm12 - LONG $0x3d7de2c4; BYTE $0xc6 // vpmaxsd ymm0, ymm0, ymm6 - LONG $0x3d25c2c4; BYTE $0xf7 // vpmaxsd ymm6, ymm11, ymm15 - LONG $0x3d65e2c4; BYTE $0xde // vpmaxsd ymm3, ymm3, ymm6 - QUAD $0x0001a087b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 4*rax + 416] - LONG $0x396de2c4; BYTE $0xd6 // vpminsd ymm2, ymm2, ymm6 - 
LONG $0x3d65e2c4; BYTE $0xde // vpmaxsd ymm3, ymm3, ymm6 - QUAD $0x00018087b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 4*rax + 384] - LONG $0x3975e2c4; BYTE $0xce // vpminsd ymm1, ymm1, ymm6 - LONG $0x3d7de2c4; BYTE $0xc6 // vpmaxsd ymm0, ymm0, ymm6 - QUAD $0x0001c087b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 4*rax + 448] - LONG $0x395de2c4; BYTE $0xe6 // vpminsd ymm4, ymm4, ymm6 - LONG $0x3d55e2c4; BYTE $0xee // vpmaxsd ymm5, ymm5, ymm6 - QUAD $0x0001e087846f7ec5; BYTE $0x00 // vmovdqu ymm8, yword [rdi + 4*rax + 480] - LONG $0x393de2c4; WORD $0x2434 // vpminsd ymm6, ymm8, yword [rsp] - LONG $0x3d45c2c4; BYTE $0xf8 // vpmaxsd ymm7, ymm7, ymm8 - LONG $0x80e88348 // sub rax, -128 - LONG $0x04c28349 // add r10, 4 - JNE LBB0_9 - -LBB0_10: - WORD $0x854d; BYTE $0xc9 // test r9, r9 - JE LBB0_13 - LONG $0x87048d48 // lea rax, [rdi + 4*rax] - WORD $0xf749; BYTE $0xd9 // neg r9 - -LBB0_12: - LONG $0x006f7ec5 // vmovdqu ymm8, yword [rax] - LONG $0x486f7ec5; BYTE $0x20 // vmovdqu ymm9, yword [rax + 32] - LONG $0x506f7ec5; BYTE $0x40 // vmovdqu ymm10, yword [rax + 64] - LONG $0x586f7ec5; BYTE $0x60 // vmovdqu ymm11, yword [rax + 96] - LONG $0x396dc2c4; BYTE $0xd1 // vpminsd ymm2, ymm2, ymm9 - LONG $0x3975c2c4; BYTE $0xc8 // vpminsd ymm1, ymm1, ymm8 - LONG $0x395dc2c4; BYTE $0xe2 // vpminsd ymm4, ymm4, ymm10 - LONG $0x394dc2c4; BYTE $0xf3 // vpminsd ymm6, ymm6, ymm11 - LONG $0x3d65c2c4; BYTE $0xd9 // vpmaxsd ymm3, ymm3, ymm9 - LONG $0x3d7dc2c4; BYTE $0xc0 // vpmaxsd ymm0, ymm0, ymm8 - LONG $0x3d55c2c4; BYTE $0xea // vpmaxsd ymm5, ymm5, ymm10 - LONG $0x3d45c2c4; BYTE $0xfb // vpmaxsd ymm7, ymm7, ymm11 - LONG $0x80e88348 // sub rax, -128 - WORD $0xff49; BYTE $0xc1 // inc r9 - JNE LBB0_12 - -LBB0_13: - LONG $0x396de2c4; BYTE $0xd6 // vpminsd ymm2, ymm2, ymm6 - LONG $0x3975e2c4; BYTE $0xcc // vpminsd ymm1, ymm1, ymm4 - LONG $0x3975e2c4; BYTE $0xca // vpminsd ymm1, ymm1, ymm2 - LONG $0x3d65e2c4; BYTE $0xd7 // vpmaxsd ymm2, ymm3, ymm7 - LONG $0x3d7de2c4; BYTE $0xc5 // 
vpmaxsd ymm0, ymm0, ymm5 - LONG $0x3d7de2c4; BYTE $0xc2 // vpmaxsd ymm0, ymm0, ymm2 - LONG $0x397de3c4; WORD $0x01c2 // vextracti128 xmm2, ymm0, 1 - LONG $0x3d79e2c4; BYTE $0xc2 // vpmaxsd xmm0, xmm0, xmm2 - LONG $0xd070f9c5; BYTE $0x4e // vpshufd xmm2, xmm0, 78 - LONG $0x3d79e2c4; BYTE $0xc2 // vpmaxsd xmm0, xmm0, xmm2 - LONG $0xd070f9c5; BYTE $0xe5 // vpshufd xmm2, xmm0, 229 - LONG $0x3d79e2c4; BYTE $0xc2 // vpmaxsd xmm0, xmm0, xmm2 - LONG $0xc07ef9c5 // vmovd eax, xmm0 - LONG $0x397de3c4; WORD $0x01c8 // vextracti128 xmm0, ymm1, 1 - LONG $0x3971e2c4; BYTE $0xc0 // vpminsd xmm0, xmm1, xmm0 +LBB0_5: + LONG $0x046f7ec5; BYTE $0x87 // vmovdqu ymm8, yword [rdi + 4*rax] + LONG $0x4c6f7ec5; WORD $0x2087 // vmovdqu ymm9, yword [rdi + 4*rax + 32] + LONG $0x546f7ec5; WORD $0x4087 // vmovdqu ymm10, yword [rdi + 4*rax + 64] + LONG $0x5c6f7ec5; WORD $0x6087 // vmovdqu ymm11, yword [rdi + 4*rax + 96] + LONG $0x397dc2c4; BYTE $0xc0 // vpminsd ymm0, ymm0, ymm8 + LONG $0x3975c2c4; BYTE $0xc9 // vpminsd ymm1, ymm1, ymm9 + LONG $0x396dc2c4; BYTE $0xd2 // vpminsd ymm2, ymm2, ymm10 + LONG $0x3965c2c4; BYTE $0xdb // vpminsd ymm3, ymm3, ymm11 + LONG $0x3d5dc2c4; BYTE $0xe0 // vpmaxsd ymm4, ymm4, ymm8 + LONG $0x3d55c2c4; BYTE $0xe9 // vpmaxsd ymm5, ymm5, ymm9 + LONG $0x3d4dc2c4; BYTE $0xf2 // vpmaxsd ymm6, ymm6, ymm10 + LONG $0x3d45c2c4; BYTE $0xfb // vpmaxsd ymm7, ymm7, ymm11 + LONG $0x20c08348 // add rax, 32 + WORD $0x3949; BYTE $0xc1 // cmp r9, rax + JNE LBB0_5 + LONG $0x3d5de2c4; BYTE $0xe5 // vpmaxsd ymm4, ymm4, ymm5 + LONG $0x3d5de2c4; BYTE $0xe6 // vpmaxsd ymm4, ymm4, ymm6 + LONG $0x3d5de2c4; BYTE $0xe7 // vpmaxsd ymm4, ymm4, ymm7 + LONG $0x397de3c4; WORD $0x01e5 // vextracti128 xmm5, ymm4, 1 + LONG $0x3d59e2c4; BYTE $0xe5 // vpmaxsd xmm4, xmm4, xmm5 + LONG $0xec70f9c5; BYTE $0x4e // vpshufd xmm5, xmm4, 78 + LONG $0x3d59e2c4; BYTE $0xe5 // vpmaxsd xmm4, xmm4, xmm5 + LONG $0xec70f9c5; BYTE $0xe5 // vpshufd xmm5, xmm4, 229 + LONG $0x3d59e2c4; BYTE $0xe5 // vpmaxsd xmm4, xmm4, xmm5 
+ LONG $0x7e79c1c4; BYTE $0xe2 // vmovd r10d, xmm4 + LONG $0x397de2c4; BYTE $0xc1 // vpminsd ymm0, ymm0, ymm1 + LONG $0x397de2c4; BYTE $0xc2 // vpminsd ymm0, ymm0, ymm2 + LONG $0x397de2c4; BYTE $0xc3 // vpminsd ymm0, ymm0, ymm3 + LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 + LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0x4e // vpshufd xmm1, xmm0, 78 LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xe5 // vpshufd xmm1, xmm0, 229 LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1 - LONG $0x7e79c1c4; BYTE $0xc1 // vmovd r9d, xmm0 - WORD $0x394d; BYTE $0xc3 // cmp r11, r8 - JE LBB0_14 - -LBB0_4: - WORD $0xc689 // mov esi, eax + LONG $0xc07ef9c5 // vmovd eax, xmm0 + WORD $0x8944; BYTE $0xd6 // mov esi, r10d + WORD $0x394d; BYTE $0xc1 // cmp r9, r8 + JE LBB0_8 -LBB0_5: - LONG $0x9f048b42 // mov eax, dword [rdi + 4*r11] - WORD $0x3941; BYTE $0xc1 // cmp r9d, eax - LONG $0xc84f0f44 // cmovg r9d, eax - WORD $0xc639 // cmp esi, eax - WORD $0x4d0f; BYTE $0xc6 // cmovge eax, esi - LONG $0x01c38349 // add r11, 1 - WORD $0xc689 // mov esi, eax - WORD $0x394d; BYTE $0xd8 // cmp r8, r11 - JNE LBB0_5 +LBB0_7: + LONG $0x8f348b42 // mov esi, dword [rdi + 4*r9] + WORD $0xf039 // cmp eax, esi + WORD $0x4f0f; BYTE $0xc6 // cmovg eax, esi + WORD $0x3941; BYTE $0xf2 // cmp r10d, esi + LONG $0xf24d0f41 // cmovge esi, r10d + LONG $0x01c18349 // add r9, 1 + WORD $0x8941; BYTE $0xf2 // mov r10d, esi + WORD $0x394d; BYTE $0xc8 // cmp r8, r9 + JNE LBB0_7 -LBB0_14: - WORD $0x0189 // mov dword [rcx], eax - WORD $0x8944; BYTE $0x0a // mov dword [rdx], r9d - SUBQ $8, SP +LBB0_8: + WORD $0x3189 // mov dword [rcx], esi + WORD $0x0289 // mov dword [rdx], eax VZEROUPPER RET -TEXT ·_uint32_max_min_avx2(SB), $72-32 +TEXT ·_uint32_max_min_avx2(SB), $0-32 MOVQ values+0(FP), DI MOVQ length+8(FP), SI MOVQ minout+16(FP), DX MOVQ maxout+24(FP), CX - ADDQ $8, SP - WORD $0xf685 // test esi, esi + WORD $0xf685 // test esi, 
esi JLE LBB1_1 - WORD $0x8941; BYTE $0xf0 // mov r8d, esi - WORD $0xfe83; BYTE $0x1f // cmp esi, 31 - JA LBB1_6 - WORD $0x3145; BYTE $0xdb // xor r11d, r11d - LONG $0xffffb941; WORD $0xffff // mov r9d, -1 - WORD $0xf631 // xor esi, esi - JMP LBB1_4 + WORD $0x8941; BYTE $0xf0 // mov r8d, esi + WORD $0xfe83; BYTE $0x1f // cmp esi, 31 + JA LBB1_4 + WORD $0x3145; BYTE $0xc9 // xor r9d, r9d + LONG $0xffffffb8; BYTE $0xff // mov eax, -1 + WORD $0x3145; BYTE $0xd2 // xor r10d, r10d + JMP LBB1_7 LBB1_1: - LONG $0xffffb941; WORD $0xffff // mov r9d, -1 - WORD $0xf631 // xor esi, esi - JMP LBB1_14 + LONG $0xffffffb8; BYTE $0xff // mov eax, -1 + WORD $0xf631 // xor esi, esi + JMP LBB1_8 -LBB1_6: - WORD $0x8945; BYTE $0xc3 // mov r11d, r8d - LONG $0xe0e38341 // and r11d, -32 - LONG $0xe0438d49 // lea rax, [r11 - 32] - WORD $0x8949; BYTE $0xc2 // mov r10, rax - LONG $0x05eac149 // shr r10, 5 - LONG $0x01c28349 // add r10, 1 - WORD $0x8945; BYTE $0xd1 // mov r9d, r10d - LONG $0x03e18341 // and r9d, 3 - LONG $0x60f88348 // cmp rax, 96 - JAE LBB1_8 - LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 - LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1 +LBB1_4: + WORD $0x8945; BYTE $0xc1 // mov r9d, r8d + LONG $0xe0e18341 // and r9d, -32 + LONG $0xe4efd9c5 // vpxor xmm4, xmm4, xmm4 + LONG $0xc076fdc5 // vpcmpeqd ymm0, ymm0, ymm0 WORD $0xc031 // xor eax, eax - LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2 - LONG $0xe476ddc5 // vpcmpeqd ymm4, ymm4, ymm4 - LONG $0xf676cdc5 // vpcmpeqd ymm6, ymm6, ymm6 - LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 - LONG $0xedefd1c5 // vpxor xmm5, xmm5, xmm5 - LONG $0xffefc1c5 // vpxor xmm7, xmm7, xmm7 - JMP LBB1_10 - -LBB1_8: - LONG $0xfce28349 // and r10, -4 - WORD $0xf749; BYTE $0xda // neg r10 - LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1 - WORD $0xc031 // xor eax, eax LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2 - LONG $0xe476ddc5 // vpcmpeqd ymm4, ymm4, ymm4 - LONG $0xf676cdc5 // vpcmpeqd ymm6, ymm6, ymm6 - LONG 
$0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + LONG $0xdb76e5c5 // vpcmpeqd ymm3, ymm3, ymm3 LONG $0xedefd1c5 // vpxor xmm5, xmm5, xmm5 + LONG $0xf6efc9c5 // vpxor xmm6, xmm6, xmm6 LONG $0xffefc1c5 // vpxor xmm7, xmm7, xmm7 -LBB1_9: - LONG $0x046f7ec5; BYTE $0x87 // vmovdqu ymm8, yword [rdi + 4*rax] - LONG $0x4c6f7ec5; WORD $0x2087 // vmovdqu ymm9, yword [rdi + 4*rax + 32] - LONG $0x546f7ec5; WORD $0x4087 // vmovdqu ymm10, yword [rdi + 4*rax + 64] - LONG $0x5c6f7ec5; WORD $0x6087 // vmovdqu ymm11, yword [rdi + 4*rax + 96] - LONG $0x3b4dc2c4; BYTE $0xf3 // vpminud ymm6, ymm6, ymm11 - LONG $0x3b5dc2c4; BYTE $0xe2 // vpminud ymm4, ymm4, ymm10 - LONG $0x3b75c2c4; BYTE $0xc8 // vpminud ymm1, ymm1, ymm8 - LONG $0x3b6dc2c4; BYTE $0xd1 // vpminud ymm2, ymm2, ymm9 - LONG $0x3f45c2c4; BYTE $0xfb // vpmaxud ymm7, ymm7, ymm11 - LONG $0x3f55c2c4; BYTE $0xea // vpmaxud ymm5, ymm5, ymm10 - LONG $0x3f7dc2c4; BYTE $0xc0 // vpmaxud ymm0, ymm0, ymm8 - LONG $0x3f65c2c4; BYTE $0xd9 // vpmaxud ymm3, ymm3, ymm9 - QUAD $0x0000e087846f7ec5; BYTE $0x00 // vmovdqu ymm8, yword [rdi + 4*rax + 224] - QUAD $0x0000c0878c6f7ec5; BYTE $0x00 // vmovdqu ymm9, yword [rdi + 4*rax + 192] - QUAD $0x00008087946f7ec5; BYTE $0x00 // vmovdqu ymm10, yword [rdi + 4*rax + 128] - QUAD $0x0000a0879c6f7ec5; BYTE $0x00 // vmovdqu ymm11, yword [rdi + 4*rax + 160] - QUAD $0x00010087a46f7ec5; BYTE $0x00 // vmovdqu ymm12, yword [rdi + 4*rax + 256] - QUAD $0x00014087ac6f7ec5; BYTE $0x00 // vmovdqu ymm13, yword [rdi + 4*rax + 320] - QUAD $0x00016087b46f7ec5; BYTE $0x00 // vmovdqu ymm14, yword [rdi + 4*rax + 352] - LONG $0x3b3d42c4; BYTE $0xfe // vpminud ymm15, ymm8, ymm14 - LONG $0x3b4dc2c4; BYTE $0xf7 // vpminud ymm6, ymm6, ymm15 - LONG $0x347ffdc5; BYTE $0x24 // vmovdqa yword [rsp], ymm6 - LONG $0x3b3542c4; BYTE $0xfd // vpminud ymm15, ymm9, ymm13 - LONG $0x3b5dc2c4; BYTE $0xe7 // vpminud ymm4, ymm4, ymm15 - LONG $0x3b2d42c4; BYTE $0xfc // vpminud ymm15, ymm10, ymm12 - LONG $0x3b75c2c4; BYTE $0xcf // vpminud ymm1, ymm1, ymm15 
- QUAD $0x00012087bc6f7ec5; BYTE $0x00 // vmovdqu ymm15, yword [rdi + 4*rax + 288] - LONG $0x3b25c2c4; BYTE $0xf7 // vpminud ymm6, ymm11, ymm15 - LONG $0x3b6de2c4; BYTE $0xd6 // vpminud ymm2, ymm2, ymm6 - LONG $0x3f3dc2c4; BYTE $0xf6 // vpmaxud ymm6, ymm8, ymm14 - LONG $0x3f45e2c4; BYTE $0xfe // vpmaxud ymm7, ymm7, ymm6 - LONG $0x3f35c2c4; BYTE $0xf5 // vpmaxud ymm6, ymm9, ymm13 - LONG $0x3f55e2c4; BYTE $0xee // vpmaxud ymm5, ymm5, ymm6 - LONG $0x3f2dc2c4; BYTE $0xf4 // vpmaxud ymm6, ymm10, ymm12 - LONG $0x3f7de2c4; BYTE $0xc6 // vpmaxud ymm0, ymm0, ymm6 - LONG $0x3f25c2c4; BYTE $0xf7 // vpmaxud ymm6, ymm11, ymm15 - LONG $0x3f65e2c4; BYTE $0xde // vpmaxud ymm3, ymm3, ymm6 - QUAD $0x0001a087b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 4*rax + 416] - LONG $0x3b6de2c4; BYTE $0xd6 // vpminud ymm2, ymm2, ymm6 - LONG $0x3f65e2c4; BYTE $0xde // vpmaxud ymm3, ymm3, ymm6 - QUAD $0x00018087b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 4*rax + 384] - LONG $0x3b75e2c4; BYTE $0xce // vpminud ymm1, ymm1, ymm6 - LONG $0x3f7de2c4; BYTE $0xc6 // vpmaxud ymm0, ymm0, ymm6 - QUAD $0x0001c087b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 4*rax + 448] - LONG $0x3b5de2c4; BYTE $0xe6 // vpminud ymm4, ymm4, ymm6 - LONG $0x3f55e2c4; BYTE $0xee // vpmaxud ymm5, ymm5, ymm6 - QUAD $0x0001e087846f7ec5; BYTE $0x00 // vmovdqu ymm8, yword [rdi + 4*rax + 480] - LONG $0x3b3de2c4; WORD $0x2434 // vpminud ymm6, ymm8, yword [rsp] - LONG $0x3f45c2c4; BYTE $0xf8 // vpmaxud ymm7, ymm7, ymm8 - LONG $0x80e88348 // sub rax, -128 - LONG $0x04c28349 // add r10, 4 - JNE LBB1_9 - -LBB1_10: - WORD $0x854d; BYTE $0xc9 // test r9, r9 - JE LBB1_13 - LONG $0x87048d48 // lea rax, [rdi + 4*rax] - WORD $0xf749; BYTE $0xd9 // neg r9 - -LBB1_12: - LONG $0x006f7ec5 // vmovdqu ymm8, yword [rax] - LONG $0x486f7ec5; BYTE $0x20 // vmovdqu ymm9, yword [rax + 32] - LONG $0x506f7ec5; BYTE $0x40 // vmovdqu ymm10, yword [rax + 64] - LONG $0x586f7ec5; BYTE $0x60 // vmovdqu ymm11, yword [rax + 96] - LONG $0x3b6dc2c4; 
BYTE $0xd1 // vpminud ymm2, ymm2, ymm9 - LONG $0x3b75c2c4; BYTE $0xc8 // vpminud ymm1, ymm1, ymm8 - LONG $0x3b5dc2c4; BYTE $0xe2 // vpminud ymm4, ymm4, ymm10 - LONG $0x3b4dc2c4; BYTE $0xf3 // vpminud ymm6, ymm6, ymm11 - LONG $0x3f65c2c4; BYTE $0xd9 // vpmaxud ymm3, ymm3, ymm9 - LONG $0x3f7dc2c4; BYTE $0xc0 // vpmaxud ymm0, ymm0, ymm8 - LONG $0x3f55c2c4; BYTE $0xea // vpmaxud ymm5, ymm5, ymm10 - LONG $0x3f45c2c4; BYTE $0xfb // vpmaxud ymm7, ymm7, ymm11 - LONG $0x80e88348 // sub rax, -128 - WORD $0xff49; BYTE $0xc1 // inc r9 - JNE LBB1_12 - -LBB1_13: - LONG $0x3b6de2c4; BYTE $0xd6 // vpminud ymm2, ymm2, ymm6 - LONG $0x3b75e2c4; BYTE $0xcc // vpminud ymm1, ymm1, ymm4 - LONG $0x3b75e2c4; BYTE $0xca // vpminud ymm1, ymm1, ymm2 - LONG $0x3f65e2c4; BYTE $0xd7 // vpmaxud ymm2, ymm3, ymm7 - LONG $0x3f7de2c4; BYTE $0xc5 // vpmaxud ymm0, ymm0, ymm5 - LONG $0x3f7de2c4; BYTE $0xc2 // vpmaxud ymm0, ymm0, ymm2 - LONG $0x397de3c4; WORD $0x01c2 // vextracti128 xmm2, ymm0, 1 - LONG $0x3f79e2c4; BYTE $0xc2 // vpmaxud xmm0, xmm0, xmm2 - LONG $0xd070f9c5; BYTE $0x4e // vpshufd xmm2, xmm0, 78 - LONG $0x3f79e2c4; BYTE $0xc2 // vpmaxud xmm0, xmm0, xmm2 - LONG $0xd070f9c5; BYTE $0xe5 // vpshufd xmm2, xmm0, 229 - LONG $0x3f79e2c4; BYTE $0xc2 // vpmaxud xmm0, xmm0, xmm2 - LONG $0xc67ef9c5 // vmovd esi, xmm0 - LONG $0x397de3c4; WORD $0x01c8 // vextracti128 xmm0, ymm1, 1 - LONG $0x3b71e2c4; BYTE $0xc0 // vpminud xmm0, xmm1, xmm0 +LBB1_5: + LONG $0x046f7ec5; BYTE $0x87 // vmovdqu ymm8, yword [rdi + 4*rax] + LONG $0x4c6f7ec5; WORD $0x2087 // vmovdqu ymm9, yword [rdi + 4*rax + 32] + LONG $0x546f7ec5; WORD $0x4087 // vmovdqu ymm10, yword [rdi + 4*rax + 64] + LONG $0x5c6f7ec5; WORD $0x6087 // vmovdqu ymm11, yword [rdi + 4*rax + 96] + LONG $0x3b7dc2c4; BYTE $0xc0 // vpminud ymm0, ymm0, ymm8 + LONG $0x3b75c2c4; BYTE $0xc9 // vpminud ymm1, ymm1, ymm9 + LONG $0x3b6dc2c4; BYTE $0xd2 // vpminud ymm2, ymm2, ymm10 + LONG $0x3b65c2c4; BYTE $0xdb // vpminud ymm3, ymm3, ymm11 + LONG $0x3f5dc2c4; BYTE $0xe0 // 
vpmaxud ymm4, ymm4, ymm8 + LONG $0x3f55c2c4; BYTE $0xe9 // vpmaxud ymm5, ymm5, ymm9 + LONG $0x3f4dc2c4; BYTE $0xf2 // vpmaxud ymm6, ymm6, ymm10 + LONG $0x3f45c2c4; BYTE $0xfb // vpmaxud ymm7, ymm7, ymm11 + LONG $0x20c08348 // add rax, 32 + WORD $0x3949; BYTE $0xc1 // cmp r9, rax + JNE LBB1_5 + LONG $0x3f5de2c4; BYTE $0xe5 // vpmaxud ymm4, ymm4, ymm5 + LONG $0x3f5de2c4; BYTE $0xe6 // vpmaxud ymm4, ymm4, ymm6 + LONG $0x3f5de2c4; BYTE $0xe7 // vpmaxud ymm4, ymm4, ymm7 + LONG $0x397de3c4; WORD $0x01e5 // vextracti128 xmm5, ymm4, 1 + LONG $0x3f59e2c4; BYTE $0xe5 // vpmaxud xmm4, xmm4, xmm5 + LONG $0xec70f9c5; BYTE $0x4e // vpshufd xmm5, xmm4, 78 + LONG $0x3f59e2c4; BYTE $0xe5 // vpmaxud xmm4, xmm4, xmm5 + LONG $0xec70f9c5; BYTE $0xe5 // vpshufd xmm5, xmm4, 229 + LONG $0x3f59e2c4; BYTE $0xe5 // vpmaxud xmm4, xmm4, xmm5 + LONG $0x7e79c1c4; BYTE $0xe2 // vmovd r10d, xmm4 + LONG $0x3b7de2c4; BYTE $0xc1 // vpminud ymm0, ymm0, ymm1 + LONG $0x3b7de2c4; BYTE $0xc2 // vpminud ymm0, ymm0, ymm2 + LONG $0x3b7de2c4; BYTE $0xc3 // vpminud ymm0, ymm0, ymm3 + LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 + LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0x4e // vpshufd xmm1, xmm0, 78 LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xe5 // vpshufd xmm1, xmm0, 229 LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1 - LONG $0x7e79c1c4; BYTE $0xc1 // vmovd r9d, xmm0 - WORD $0x394d; BYTE $0xc3 // cmp r11, r8 - JE LBB1_14 - -LBB1_4: - WORD $0xf089 // mov eax, esi + LONG $0xc07ef9c5 // vmovd eax, xmm0 + WORD $0x8944; BYTE $0xd6 // mov esi, r10d + WORD $0x394d; BYTE $0xc1 // cmp r9, r8 + JE LBB1_8 -LBB1_5: - LONG $0x9f348b42 // mov esi, dword [rdi + 4*r11] - WORD $0x3941; BYTE $0xf1 // cmp r9d, esi - LONG $0xce430f44 // cmovae r9d, esi +LBB1_7: + LONG $0x8f348b42 // mov esi, dword [rdi + 4*r9] WORD $0xf039 // cmp eax, esi - WORD $0x470f; BYTE $0xf0 // cmova esi, eax - LONG $0x01c38349 // add r11, 1 - WORD 
$0xf089 // mov eax, esi - WORD $0x394d; BYTE $0xd8 // cmp r8, r11 - JNE LBB1_5 + WORD $0x430f; BYTE $0xc6 // cmovae eax, esi + WORD $0x3941; BYTE $0xf2 // cmp r10d, esi + LONG $0xf2470f41 // cmova esi, r10d + LONG $0x01c18349 // add r9, 1 + WORD $0x8941; BYTE $0xf2 // mov r10d, esi + WORD $0x394d; BYTE $0xc8 // cmp r8, r9 + JNE LBB1_7 -LBB1_14: - WORD $0x3189 // mov dword [rcx], esi - WORD $0x8944; BYTE $0x0a // mov dword [rdx], r9d - SUBQ $8, SP +LBB1_8: + WORD $0x3189 // mov dword [rcx], esi + WORD $0x0289 // mov dword [rdx], eax VZEROUPPER RET @@ -369,984 +193,251 @@ DATA LCDATA2<>+0x000(SB)/8, $0x8000000000000000 DATA LCDATA2<>+0x008(SB)/8, $0x7fffffffffffffff GLOBL LCDATA2<>(SB), 8, $16 -TEXT ·_int64_max_min_avx2(SB), $232-32 +TEXT ·_int64_max_min_avx2(SB), $0-32 MOVQ values+0(FP), DI MOVQ length+8(FP), SI MOVQ minout+16(FP), DX MOVQ maxout+24(FP), CX - ADDQ $8, SP LEAQ LCDATA2<>(SB), BP - QUAD $0xffffffffffffb949; WORD $0x7fff // mov r9, 9223372036854775807 + QUAD $0xffffffffffffb848; WORD $0x7fff // mov rax, 9223372036854775807 WORD $0xf685 // test esi, esi JLE LBB2_1 WORD $0x8941; BYTE $0xf0 // mov r8d, esi - WORD $0xfe83; BYTE $0x1f // cmp esi, 31 - JA LBB2_6 - LONG $0x01718d49 // lea rsi, [r9 + 1] - WORD $0x3145; BYTE $0xdb // xor r11d, r11d - JMP LBB2_4 + WORD $0xfe83; BYTE $0x0f // cmp esi, 15 + JA LBB2_4 + LONG $0x01508d4c // lea r10, [rax + 1] + WORD $0x3145; BYTE $0xc9 // xor r9d, r9d + JMP LBB2_7 LBB2_1: - LONG $0x01718d49 // lea rsi, [r9 + 1] - JMP LBB2_14 + LONG $0x01708d48 // lea rsi, [rax + 1] + JMP LBB2_8 -LBB2_6: - WORD $0x8945; BYTE $0xc3 // mov r11d, r8d - LONG $0xe0e38341 // and r11d, -32 - LONG $0xe0438d49 // lea rax, [r11 - 32] - WORD $0x8949; BYTE $0xc2 // mov r10, rax - LONG $0x05eac149 // shr r10, 5 - LONG $0x01c28349 // add r10, 1 - WORD $0x8945; BYTE $0xd1 // mov r9d, r10d - LONG $0x03e18341 // and r9d, 3 - LONG $0x60f88348 // cmp rax, 96 - JAE LBB2_8 - LONG $0x597d62c4; WORD $0x007d // vpbroadcastq ymm15, qword 0[rbp] /* [rip + 
.LCPI2_0] */ - LONG $0x597d62c4; WORD $0x085d // vpbroadcastq ymm11, qword 8[rbp] /* [rip + .LCPI2_1] */ - WORD $0xc031 // xor eax, eax - LONG $0x5c7f7dc5; WORD $0x2024 // vmovdqa yword [rsp + 32], ymm11 - LONG $0x6f7dc1c4; BYTE $0xdb // vmovdqa ymm3, ymm11 - LONG $0x6f7d41c4; BYTE $0xcb // vmovdqa ymm9, ymm11 - LONG $0x6f7dc1c4; BYTE $0xeb // vmovdqa ymm5, ymm11 - LONG $0x6f7dc1c4; BYTE $0xe3 // vmovdqa ymm4, ymm11 - LONG $0x6f7dc1c4; BYTE $0xf3 // vmovdqa ymm6, ymm11 - LONG $0x5c7f7dc5; WORD $0x6024 // vmovdqa yword [rsp + 96], ymm11 - LONG $0x7c7f7dc5; WORD $0x4024 // vmovdqa yword [rsp + 64], ymm15 - LONG $0x6f7dc1c4; BYTE $0xd7 // vmovdqa ymm2, ymm15 - LONG $0x6f7d41c4; BYTE $0xc7 // vmovdqa ymm8, ymm15 - LONG $0x6f7d41c4; BYTE $0xe7 // vmovdqa ymm12, ymm15 - LONG $0x6f7d41c4; BYTE $0xef // vmovdqa ymm13, ymm15 - LONG $0x6f7d41c4; BYTE $0xf7 // vmovdqa ymm14, ymm15 - LONG $0x3c7f7dc5; BYTE $0x24 // vmovdqa yword [rsp], ymm15 - JMP LBB2_10 - -LBB2_8: - LONG $0xfce28349 // and r10, -4 - LONG $0x597d62c4; WORD $0x007d // vpbroadcastq ymm15, qword 0[rbp] /* [rip + .LCPI2_0] */ - WORD $0xf749; BYTE $0xda // neg r10 - LONG $0x597d62c4; WORD $0x085d // vpbroadcastq ymm11, qword 8[rbp] /* [rip + .LCPI2_1] */ +LBB2_4: + WORD $0x8945; BYTE $0xc1 // mov r9d, r8d + LONG $0x597de2c4; WORD $0x0065 // vpbroadcastq ymm4, qword 0[rbp] /* [rip + .LCPI2_0] */ + LONG $0xf0e18341 // and r9d, -16 + LONG $0x597de2c4; WORD $0x0845 // vpbroadcastq ymm0, qword 8[rbp] /* [rip + .LCPI2_1] */ WORD $0xc031 // xor eax, eax - LONG $0x5c7f7dc5; WORD $0x2024 // vmovdqa yword [rsp + 32], ymm11 - LONG $0x6f7dc1c4; BYTE $0xdb // vmovdqa ymm3, ymm11 - LONG $0x6f7d41c4; BYTE $0xcb // vmovdqa ymm9, ymm11 - LONG $0x6f7dc1c4; BYTE $0xeb // vmovdqa ymm5, ymm11 - LONG $0x6f7dc1c4; BYTE $0xe3 // vmovdqa ymm4, ymm11 - LONG $0x6f7dc1c4; BYTE $0xf3 // vmovdqa ymm6, ymm11 - LONG $0x5c7f7dc5; WORD $0x6024 // vmovdqa yword [rsp + 96], ymm11 - LONG $0x7c7f7dc5; WORD $0x4024 // vmovdqa yword [rsp + 64], ymm15 - 
LONG $0x6f7dc1c4; BYTE $0xd7 // vmovdqa ymm2, ymm15 - LONG $0x6f7d41c4; BYTE $0xc7 // vmovdqa ymm8, ymm15 - LONG $0x6f7d41c4; BYTE $0xe7 // vmovdqa ymm12, ymm15 - LONG $0x6f7d41c4; BYTE $0xef // vmovdqa ymm13, ymm15 - LONG $0x6f7d41c4; BYTE $0xf7 // vmovdqa ymm14, ymm15 - LONG $0x3c7f7dc5; BYTE $0x24 // vmovdqa yword [rsp], ymm15 - -LBB2_9: - QUAD $0x0000e0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 224] - LONG $0x6f7d41c4; BYTE $0xd0 // vmovdqa ymm10, ymm8 - LONG $0xc26f7dc5 // vmovdqa ymm8, ymm2 - LONG $0xd36ffdc5 // vmovdqa ymm2, ymm3 - LONG $0x6f7dc1c4; BYTE $0xd9 // vmovdqa ymm3, ymm9 - LONG $0x377d42c4; BYTE $0xcb // vpcmpgtq ymm9, ymm0, ymm11 - LONG $0x4b7dc3c4; WORD $0x90cb // vblendvpd ymm1, ymm0, ymm11, ymm9 - QUAD $0x0000a0248c29fdc5; BYTE $0x00 // vmovapd yword [rsp + 160], ymm1 - LONG $0x370562c4; BYTE $0xc8 // vpcmpgtq ymm9, ymm15, ymm0 - LONG $0x4b7dc3c4; WORD $0x90c7 // vblendvpd ymm0, ymm0, ymm15, ymm9 - QUAD $0x000080248429fdc5; BYTE $0x00 // vmovapd yword [rsp + 128], ymm0 - QUAD $0x0000c0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 192] - LONG $0x377d62c4; BYTE $0xce // vpcmpgtq ymm9, ymm0, ymm6 - LONG $0x4b7de3c4; WORD $0x90fe // vblendvpd ymm7, ymm0, ymm6, ymm9 - LONG $0x370d62c4; BYTE $0xc8 // vpcmpgtq ymm9, ymm14, ymm0 - LONG $0x4b7d43c4; WORD $0x90f6 // vblendvpd ymm14, ymm0, ymm14, ymm9 - QUAD $0x0000a0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 160] - LONG $0x377d62c4; BYTE $0xcc // vpcmpgtq ymm9, ymm0, ymm4 - LONG $0x4b7de3c4; WORD $0x90f4 // vblendvpd ymm6, ymm0, ymm4, ymm9 - LONG $0x371562c4; BYTE $0xc8 // vpcmpgtq ymm9, ymm13, ymm0 - LONG $0x4b7d43c4; WORD $0x90ed // vblendvpd ymm13, ymm0, ymm13, ymm9 - QUAD $0x000080c78c6f7ec5; BYTE $0x00 // vmovdqu ymm9, yword [rdi + 8*rax + 128] - LONG $0x3735e2c4; BYTE $0xc5 // vpcmpgtq ymm0, ymm9, ymm5 - LONG $0x4b35e3c4; WORD $0x00cd // vblendvpd ymm1, ymm9, ymm5, ymm0 - LONG $0x371dc2c4; BYTE $0xe9 // vpcmpgtq ymm5, ymm12, ymm9 - LONG 
$0x4b3543c4; WORD $0x50e4 // vblendvpd ymm12, ymm9, ymm12, ymm5 - LONG $0x6c6ffec5; WORD $0x60c7 // vmovdqu ymm5, yword [rdi + 8*rax + 96] - LONG $0x375562c4; BYTE $0xcb // vpcmpgtq ymm9, ymm5, ymm3 - LONG $0x4b5563c4; WORD $0x90cb // vblendvpd ymm9, ymm5, ymm3, ymm9 - LONG $0x372de2c4; BYTE $0xe5 // vpcmpgtq ymm4, ymm10, ymm5 - LONG $0x4b5543c4; WORD $0x40d2 // vblendvpd ymm10, ymm5, ymm10, ymm4 - LONG $0x646ffec5; WORD $0x40c7 // vmovdqu ymm4, yword [rdi + 8*rax + 64] - LONG $0x375de2c4; BYTE $0xea // vpcmpgtq ymm5, ymm4, ymm2 - LONG $0x4b5de3c4; WORD $0x50ea // vblendvpd ymm5, ymm4, ymm2, ymm5 - LONG $0x373de2c4; BYTE $0xdc // vpcmpgtq ymm3, ymm8, ymm4 - LONG $0x4b5dc3c4; WORD $0x30c0 // vblendvpd ymm0, ymm4, ymm8, ymm3 - LONG $0x146ffec5; BYTE $0xc7 // vmovdqu ymm2, yword [rdi + 8*rax] - LONG $0x646ffdc5; WORD $0x6024 // vmovdqa ymm4, yword [rsp + 96] - LONG $0x376de2c4; BYTE $0xdc // vpcmpgtq ymm3, ymm2, ymm4 - LONG $0x4b6de3c4; WORD $0x30dc // vblendvpd ymm3, ymm2, ymm4, ymm3 - LONG $0x1c6f7dc5; BYTE $0x24 // vmovdqa ymm11, yword [rsp] - LONG $0x3725e2c4; BYTE $0xe2 // vpcmpgtq ymm4, ymm11, ymm2 - LONG $0x4b6dc3c4; WORD $0x40e3 // vblendvpd ymm4, ymm2, ymm11, ymm4 - LONG $0x546ffec5; WORD $0x20c7 // vmovdqu ymm2, yword [rdi + 8*rax + 32] - LONG $0x7c6f7dc5; WORD $0x2024 // vmovdqa ymm15, yword [rsp + 32] - LONG $0x376d42c4; BYTE $0xdf // vpcmpgtq ymm11, ymm2, ymm15 - LONG $0x4b6d43c4; WORD $0xb0df // vblendvpd ymm11, ymm2, ymm15, ymm11 - LONG $0x446f7dc5; WORD $0x4024 // vmovdqa ymm8, yword [rsp + 64] - LONG $0x373d62c4; BYTE $0xfa // vpcmpgtq ymm15, ymm8, ymm2 - LONG $0x4b6dc3c4; WORD $0xf0d0 // vblendvpd ymm2, ymm2, ymm8, ymm15 - QUAD $0x000120c7846f7ec5; BYTE $0x00 // vmovdqu ymm8, yword [rdi + 8*rax + 288] - LONG $0x373d42c4; BYTE $0xfb // vpcmpgtq ymm15, ymm8, ymm11 - LONG $0x4b3d43c4; WORD $0xf0db // vblendvpd ymm11, ymm8, ymm11, ymm15 - LONG $0x5c297dc5; WORD $0x2024 // vmovapd yword [rsp + 32], ymm11 - LONG $0x376d42c4; BYTE $0xd8 // vpcmpgtq ymm11, 
ymm2, ymm8 - LONG $0x4b3de3c4; WORD $0xb0d2 // vblendvpd ymm2, ymm8, ymm2, ymm11 - LONG $0x1429fdc5; BYTE $0x24 // vmovapd yword [rsp], ymm2 - QUAD $0x000100c79c6f7ec5; BYTE $0x00 // vmovdqu ymm11, yword [rdi + 8*rax + 256] - LONG $0x3725e2c4; BYTE $0xd3 // vpcmpgtq ymm2, ymm11, ymm3 - LONG $0x4b2563c4; WORD $0x20c3 // vblendvpd ymm8, ymm11, ymm3, ymm2 - LONG $0x375dc2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm4, ymm11 - LONG $0x4b25e3c4; WORD $0x30dc // vblendvpd ymm3, ymm11, ymm4, ymm3 - QUAD $0x000140c79c6f7ec5; BYTE $0x00 // vmovdqu ymm11, yword [rdi + 8*rax + 320] - LONG $0x3725e2c4; BYTE $0xe5 // vpcmpgtq ymm4, ymm11, ymm5 - LONG $0x4b25e3c4; WORD $0x40e5 // vblendvpd ymm4, ymm11, ymm5, ymm4 - LONG $0x377dc2c4; BYTE $0xeb // vpcmpgtq ymm5, ymm0, ymm11 - LONG $0x4b25e3c4; WORD $0x50e8 // vblendvpd ymm5, ymm11, ymm0, ymm5 - QUAD $0x000160c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 352] - LONG $0x377d42c4; BYTE $0xd9 // vpcmpgtq ymm11, ymm0, ymm9 - LONG $0x4b7d43c4; WORD $0xb0c9 // vblendvpd ymm9, ymm0, ymm9, ymm11 - LONG $0x372d62c4; BYTE $0xd8 // vpcmpgtq ymm11, ymm10, ymm0 - LONG $0x4b7d43c4; WORD $0xb0d2 // vblendvpd ymm10, ymm0, ymm10, ymm11 - QUAD $0x000180c79c6f7ec5; BYTE $0x00 // vmovdqu ymm11, yword [rdi + 8*rax + 384] - LONG $0x3725e2c4; BYTE $0xc1 // vpcmpgtq ymm0, ymm11, ymm1 - LONG $0x4b25e3c4; WORD $0x00d1 // vblendvpd ymm2, ymm11, ymm1, ymm0 - LONG $0x371dc2c4; BYTE $0xcb // vpcmpgtq ymm1, ymm12, ymm11 - LONG $0x4b2543c4; WORD $0x10e4 // vblendvpd ymm12, ymm11, ymm12, ymm1 - QUAD $0x0001a0c78c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdi + 8*rax + 416] - LONG $0x377562c4; BYTE $0xde // vpcmpgtq ymm11, ymm1, ymm6 - LONG $0x4b75e3c4; WORD $0xb0f6 // vblendvpd ymm6, ymm1, ymm6, ymm11 - LONG $0x371562c4; BYTE $0xd9 // vpcmpgtq ymm11, ymm13, ymm1 - LONG $0x4b75c3c4; WORD $0xb0cd // vblendvpd ymm1, ymm1, ymm13, ymm11 - QUAD $0x0001c0c79c6f7ec5; BYTE $0x00 // vmovdqu ymm11, yword [rdi + 8*rax + 448] - LONG $0x372562c4; BYTE $0xef // vpcmpgtq 
ymm13, ymm11, ymm7 - LONG $0x4b25e3c4; WORD $0xd0ff // vblendvpd ymm7, ymm11, ymm7, ymm13 - LONG $0x370d42c4; BYTE $0xeb // vpcmpgtq ymm13, ymm14, ymm11 - LONG $0x4b2543c4; WORD $0xd0ee // vblendvpd ymm13, ymm11, ymm14, ymm13 - QUAD $0x0001e0c79c6f7ec5; BYTE $0x00 // vmovdqu ymm11, yword [rdi + 8*rax + 480] - QUAD $0x0000a024846ffdc5; BYTE $0x00 // vmovdqa ymm0, yword [rsp + 160] - LONG $0x372562c4; BYTE $0xf0 // vpcmpgtq ymm14, ymm11, ymm0 - LONG $0x4b2563c4; WORD $0xe0f0 // vblendvpd ymm14, ymm11, ymm0, ymm14 - QUAD $0x00008024846ffdc5; BYTE $0x00 // vmovdqa ymm0, yword [rsp + 128] - LONG $0x377d42c4; BYTE $0xfb // vpcmpgtq ymm15, ymm0, ymm11 - LONG $0x4b2563c4; WORD $0xf0f8 // vblendvpd ymm15, ymm11, ymm0, ymm15 - QUAD $0x0002e0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 736] - LONG $0x377d42c4; BYTE $0xde // vpcmpgtq ymm11, ymm0, ymm14 - LONG $0x4b7d43c4; WORD $0xb0de // vblendvpd ymm11, ymm0, ymm14, ymm11 - QUAD $0x0000a0249c297dc5; BYTE $0x00 // vmovapd yword [rsp + 160], ymm11 - LONG $0x370562c4; BYTE $0xf0 // vpcmpgtq ymm14, ymm15, ymm0 - LONG $0x4b7dc3c4; WORD $0xe0c7 // vblendvpd ymm0, ymm0, ymm15, ymm14 - QUAD $0x000080248429fdc5; BYTE $0x00 // vmovapd yword [rsp + 128], ymm0 - QUAD $0x0002c0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 704] - LONG $0x377d62c4; BYTE $0xf7 // vpcmpgtq ymm14, ymm0, ymm7 - LONG $0x4b7de3c4; WORD $0xe0ff // vblendvpd ymm7, ymm0, ymm7, ymm14 - LONG $0x371562c4; BYTE $0xf0 // vpcmpgtq ymm14, ymm13, ymm0 - LONG $0x4b7d43c4; WORD $0xe0f5 // vblendvpd ymm14, ymm0, ymm13, ymm14 - QUAD $0x0002a0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 672] - LONG $0x377d62c4; BYTE $0xee // vpcmpgtq ymm13, ymm0, ymm6 - LONG $0x4b7de3c4; WORD $0xd0f6 // vblendvpd ymm6, ymm0, ymm6, ymm13 - LONG $0x377562c4; BYTE $0xe8 // vpcmpgtq ymm13, ymm1, ymm0 - LONG $0x4b7d63c4; WORD $0xd0e9 // vblendvpd ymm13, ymm0, ymm1, ymm13 - QUAD $0x000280c78c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdi + 8*rax + 640] - 
LONG $0x3775e2c4; BYTE $0xc2 // vpcmpgtq ymm0, ymm1, ymm2 - LONG $0x4b75e3c4; WORD $0x00c2 // vblendvpd ymm0, ymm1, ymm2, ymm0 - LONG $0x371de2c4; BYTE $0xd1 // vpcmpgtq ymm2, ymm12, ymm1 - LONG $0x4b7543c4; WORD $0x20e4 // vblendvpd ymm12, ymm1, ymm12, ymm2 - QUAD $0x000260c78c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdi + 8*rax + 608] - LONG $0x3775c2c4; BYTE $0xd1 // vpcmpgtq ymm2, ymm1, ymm9 - LONG $0x4b7543c4; WORD $0x20c9 // vblendvpd ymm9, ymm1, ymm9, ymm2 - LONG $0x372de2c4; BYTE $0xd1 // vpcmpgtq ymm2, ymm10, ymm1 - LONG $0x4b7543c4; WORD $0x20d2 // vblendvpd ymm10, ymm1, ymm10, ymm2 - QUAD $0x000240c78c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdi + 8*rax + 576] - LONG $0x3775e2c4; BYTE $0xd4 // vpcmpgtq ymm2, ymm1, ymm4 - LONG $0x4b75e3c4; WORD $0x20d4 // vblendvpd ymm2, ymm1, ymm4, ymm2 - LONG $0x3755e2c4; BYTE $0xe1 // vpcmpgtq ymm4, ymm5, ymm1 - LONG $0x4b75e3c4; WORD $0x40cd // vblendvpd ymm1, ymm1, ymm5, ymm4 - QUAD $0x000200c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 512] - LONG $0x375dc2c4; BYTE $0xe8 // vpcmpgtq ymm5, ymm4, ymm8 - LONG $0x4b5dc3c4; WORD $0x50e8 // vblendvpd ymm5, ymm4, ymm8, ymm5 - LONG $0x376562c4; BYTE $0xc4 // vpcmpgtq ymm8, ymm3, ymm4 - LONG $0x4b5de3c4; WORD $0x80db // vblendvpd ymm3, ymm4, ymm3, ymm8 - QUAD $0x000220c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 544] - LONG $0x5c6f7dc5; WORD $0x2024 // vmovdqa ymm11, yword [rsp + 32] - LONG $0x375d42c4; BYTE $0xc3 // vpcmpgtq ymm8, ymm4, ymm11 - LONG $0x4b5d43c4; WORD $0x80c3 // vblendvpd ymm8, ymm4, ymm11, ymm8 - LONG $0x3c6f7dc5; BYTE $0x24 // vmovdqa ymm15, yword [rsp] - LONG $0x370562c4; BYTE $0xdc // vpcmpgtq ymm11, ymm15, ymm4 - LONG $0x4b5dc3c4; WORD $0xb0e7 // vblendvpd ymm4, ymm4, ymm15, ymm11 - QUAD $0x000320c79c6f7ec5; BYTE $0x00 // vmovdqu ymm11, yword [rdi + 8*rax + 800] - LONG $0x372542c4; BYTE $0xf8 // vpcmpgtq ymm15, ymm11, ymm8 - LONG $0x4b2543c4; WORD $0xf0c0 // vblendvpd ymm8, ymm11, ymm8, ymm15 - LONG $0x44297dc5; WORD 
$0x2024 // vmovapd yword [rsp + 32], ymm8 - LONG $0x375d42c4; BYTE $0xc3 // vpcmpgtq ymm8, ymm4, ymm11 - LONG $0x4b25e3c4; WORD $0x80e4 // vblendvpd ymm4, ymm11, ymm4, ymm8 - LONG $0x6429fdc5; WORD $0x4024 // vmovapd yword [rsp + 64], ymm4 - QUAD $0x000300c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 768] - LONG $0x375d62c4; BYTE $0xdd // vpcmpgtq ymm11, ymm4, ymm5 - LONG $0x4b5de3c4; WORD $0xb0ed // vblendvpd ymm5, ymm4, ymm5, ymm11 - LONG $0x6c29fdc5; WORD $0x6024 // vmovapd yword [rsp + 96], ymm5 - LONG $0x3765e2c4; BYTE $0xec // vpcmpgtq ymm5, ymm3, ymm4 - LONG $0x4b5de3c4; WORD $0x50db // vblendvpd ymm3, ymm4, ymm3, ymm5 - LONG $0x1c29fdc5; BYTE $0x24 // vmovapd yword [rsp], ymm3 - QUAD $0x000340c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 832] - LONG $0x375de2c4; BYTE $0xda // vpcmpgtq ymm3, ymm4, ymm2 - LONG $0x4b5de3c4; WORD $0x30da // vblendvpd ymm3, ymm4, ymm2, ymm3 - LONG $0x3775e2c4; BYTE $0xd4 // vpcmpgtq ymm2, ymm1, ymm4 - LONG $0x4b5de3c4; WORD $0x20d1 // vblendvpd ymm2, ymm4, ymm1, ymm2 - QUAD $0x000360c78c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdi + 8*rax + 864] - LONG $0x3775c2c4; BYTE $0xe1 // vpcmpgtq ymm4, ymm1, ymm9 - LONG $0x4b7543c4; WORD $0x40c9 // vblendvpd ymm9, ymm1, ymm9, ymm4 - LONG $0x372de2c4; BYTE $0xe9 // vpcmpgtq ymm5, ymm10, ymm1 - LONG $0x4b7543c4; WORD $0x50c2 // vblendvpd ymm8, ymm1, ymm10, ymm5 - QUAD $0x000380c78c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdi + 8*rax + 896] - LONG $0x3775e2c4; BYTE $0xe8 // vpcmpgtq ymm5, ymm1, ymm0 - LONG $0x4b75e3c4; WORD $0x50e8 // vblendvpd ymm5, ymm1, ymm0, ymm5 - LONG $0x371de2c4; BYTE $0xc1 // vpcmpgtq ymm0, ymm12, ymm1 - LONG $0x4b7543c4; WORD $0x00e4 // vblendvpd ymm12, ymm1, ymm12, ymm0 - QUAD $0x0003a0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 928] - LONG $0x377de2c4; BYTE $0xce // vpcmpgtq ymm1, ymm0, ymm6 - LONG $0x4b7de3c4; WORD $0x10e6 // vblendvpd ymm4, ymm0, ymm6, ymm1 - LONG $0x3715e2c4; BYTE $0xc8 // vpcmpgtq ymm1, 
ymm13, ymm0 - LONG $0x4b7d43c4; WORD $0x10ed // vblendvpd ymm13, ymm0, ymm13, ymm1 - QUAD $0x0003c0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 960] - LONG $0x377de2c4; BYTE $0xcf // vpcmpgtq ymm1, ymm0, ymm7 - LONG $0x4b7de3c4; WORD $0x10f7 // vblendvpd ymm6, ymm0, ymm7, ymm1 - LONG $0x370de2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm14, ymm0 - LONG $0x4b7d43c4; WORD $0x10f6 // vblendvpd ymm14, ymm0, ymm14, ymm1 - QUAD $0x0003e0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 992] - QUAD $0x0000a024bc6ffdc5; BYTE $0x00 // vmovdqa ymm7, yword [rsp + 160] - LONG $0x377de2c4; BYTE $0xcf // vpcmpgtq ymm1, ymm0, ymm7 - LONG $0x4b7d63c4; WORD $0x10df // vblendvpd ymm11, ymm0, ymm7, ymm1 - QUAD $0x00008024bc6ffdc5; BYTE $0x00 // vmovdqa ymm7, yword [rsp + 128] - LONG $0x3745e2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm7, ymm0 - LONG $0x4b7d63c4; WORD $0x10ff // vblendvpd ymm15, ymm0, ymm7, ymm1 - LONG $0x80e88348 // sub rax, -128 - LONG $0x04c28349 // add r10, 4 - JNE LBB2_9 - -LBB2_10: - WORD $0x854d; BYTE $0xc9 // test r9, r9 - LONG $0xfd6ffdc5 // vmovdqa ymm7, ymm5 - LONG $0x6f7dc1c4; BYTE $0xe9 // vmovdqa ymm5, ymm9 - LONG $0x4c6f7dc5; WORD $0x6024 // vmovdqa ymm9, yword [rsp + 96] - LONG $0xd36f7dc5 // vmovdqa ymm10, ymm3 - JE LBB2_13 - LONG $0xc7048d48 // lea rax, [rdi + 8*rax] - WORD $0xf749; BYTE $0xd9 // neg r9 - -LBB2_12: - LONG $0x406ffec5; BYTE $0x20 // vmovdqu ymm0, yword [rax + 32] - LONG $0x5c6ffdc5; WORD $0x2024 // vmovdqa ymm3, yword [rsp + 32] - LONG $0x377de2c4; BYTE $0xcb // vpcmpgtq ymm1, ymm0, ymm3 - LONG $0x4b7de3c4; WORD $0x10db // vblendvpd ymm3, ymm0, ymm3, ymm1 - LONG $0x5c29fdc5; WORD $0x2024 // vmovapd yword [rsp + 32], ymm3 - LONG $0x5c6ffdc5; WORD $0x4024 // vmovdqa ymm3, yword [rsp + 64] - LONG $0x3765e2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm3, ymm0 - LONG $0x4b7de3c4; WORD $0x10db // vblendvpd ymm3, ymm0, ymm3, ymm1 - LONG $0x5c29fdc5; WORD $0x4024 // vmovapd yword [rsp + 64], ymm3 - LONG $0x006ffec5 // vmovdqu ymm0, yword 
[rax] - LONG $0x377dc2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm0, ymm9 - LONG $0x4b7d43c4; WORD $0x10c9 // vblendvpd ymm9, ymm0, ymm9, ymm1 - LONG $0x1c6ffdc5; BYTE $0x24 // vmovdqa ymm3, yword [rsp] - LONG $0x3765e2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm3, ymm0 - LONG $0x4b7de3c4; WORD $0x10db // vblendvpd ymm3, ymm0, ymm3, ymm1 - LONG $0x1c29fdc5; BYTE $0x24 // vmovapd yword [rsp], ymm3 - LONG $0x406ffec5; BYTE $0x40 // vmovdqu ymm0, yword [rax + 64] - LONG $0x377dc2c4; BYTE $0xca // vpcmpgtq ymm1, ymm0, ymm10 - LONG $0x4b7d43c4; WORD $0x10d2 // vblendvpd ymm10, ymm0, ymm10, ymm1 - LONG $0x376de2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm2, ymm0 - LONG $0x4b7de3c4; WORD $0x10d2 // vblendvpd ymm2, ymm0, ymm2, ymm1 - LONG $0x406ffec5; BYTE $0x60 // vmovdqu ymm0, yword [rax + 96] - LONG $0x377de2c4; BYTE $0xcd // vpcmpgtq ymm1, ymm0, ymm5 - LONG $0x4b7de3c4; WORD $0x10ed // vblendvpd ymm5, ymm0, ymm5, ymm1 - LONG $0x373de2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm8, ymm0 - LONG $0x4b7d43c4; WORD $0x10c0 // vblendvpd ymm8, ymm0, ymm8, ymm1 - QUAD $0x00000080806ffec5 // vmovdqu ymm0, yword [rax + 128] - LONG $0x377de2c4; BYTE $0xcf // vpcmpgtq ymm1, ymm0, ymm7 - LONG $0x4b7de3c4; WORD $0x10ff // vblendvpd ymm7, ymm0, ymm7, ymm1 - LONG $0x371de2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm12, ymm0 - LONG $0x4b7d43c4; WORD $0x10e4 // vblendvpd ymm12, ymm0, ymm12, ymm1 - QUAD $0x000000a0806ffec5 // vmovdqu ymm0, yword [rax + 160] - LONG $0x377de2c4; BYTE $0xcc // vpcmpgtq ymm1, ymm0, ymm4 - LONG $0x4b7de3c4; WORD $0x10e4 // vblendvpd ymm4, ymm0, ymm4, ymm1 - LONG $0x3715e2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm13, ymm0 - LONG $0x4b7d43c4; WORD $0x10ed // vblendvpd ymm13, ymm0, ymm13, ymm1 - QUAD $0x000000c0806ffec5 // vmovdqu ymm0, yword [rax + 192] - LONG $0x377de2c4; BYTE $0xce // vpcmpgtq ymm1, ymm0, ymm6 - LONG $0x4b7de3c4; WORD $0x10f6 // vblendvpd ymm6, ymm0, ymm6, ymm1 - LONG $0x370de2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm14, ymm0 - LONG $0x4b7d43c4; WORD $0x10f6 // vblendvpd ymm14, ymm0, ymm14, 
ymm1 - QUAD $0x000000e0806ffec5 // vmovdqu ymm0, yword [rax + 224] - LONG $0x377dc2c4; BYTE $0xcb // vpcmpgtq ymm1, ymm0, ymm11 - LONG $0x4b7d43c4; WORD $0x10db // vblendvpd ymm11, ymm0, ymm11, ymm1 - LONG $0x3705e2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm15, ymm0 - LONG $0x4b7d43c4; WORD $0x10ff // vblendvpd ymm15, ymm0, ymm15, ymm1 - LONG $0x01000548; WORD $0x0000 // add rax, 256 - WORD $0xff49; BYTE $0xc1 // inc r9 - JNE LBB2_12 + LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 + LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 + LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 + LONG $0xfc6ffdc5 // vmovdqa ymm7, ymm4 + LONG $0xf46ffdc5 // vmovdqa ymm6, ymm4 + LONG $0xec6ffdc5 // vmovdqa ymm5, ymm4 -LBB2_13: - LONG $0x4c6ffdc5; WORD $0x4024 // vmovdqa ymm1, yword [rsp + 64] - LONG $0x3775c2c4; BYTE $0xc5 // vpcmpgtq ymm0, ymm1, ymm13 - LONG $0x4b15e3c4; WORD $0x00c1 // vblendvpd ymm0, ymm13, ymm1, ymm0 - LONG $0x373dc2c4; BYTE $0xcf // vpcmpgtq ymm1, ymm8, ymm15 - LONG $0x4b05c3c4; WORD $0x10c8 // vblendvpd ymm1, ymm15, ymm8, ymm1 - LONG $0x1c6ffdc5; BYTE $0x24 // vmovdqa ymm3, yword [rsp] - LONG $0x376542c4; BYTE $0xc4 // vpcmpgtq ymm8, ymm3, ymm12 - LONG $0x4b1d63c4; WORD $0x80c3 // vblendvpd ymm8, ymm12, ymm3, ymm8 - LONG $0x6f7dc1c4; BYTE $0xd9 // vmovdqa ymm3, ymm9 - LONG $0x376d42c4; BYTE $0xce // vpcmpgtq ymm9, ymm2, ymm14 - LONG $0x4b0de3c4; WORD $0x90d2 // vblendvpd ymm2, ymm14, ymm2, ymm9 - LONG $0x373d62c4; BYTE $0xca // vpcmpgtq ymm9, ymm8, ymm2 - LONG $0x4b6dc3c4; WORD $0x90d0 // vblendvpd ymm2, ymm2, ymm8, ymm9 - LONG $0x377d62c4; BYTE $0xc1 // vpcmpgtq ymm8, ymm0, ymm1 - LONG $0x4b75e3c4; WORD $0x80c0 // vblendvpd ymm0, ymm1, ymm0, ymm8 - LONG $0x376de2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm2, ymm0 - LONG $0x4b7de3c4; WORD $0x10c2 // vblendvpd ymm0, ymm0, ymm2, ymm1 +LBB2_5: + LONG $0x046f7ec5; BYTE $0xc7 // vmovdqu ymm8, yword [rdi + 8*rax] + LONG $0x373d62c4; BYTE $0xc8 // vpcmpgtq ymm9, ymm8, ymm0 + LONG $0x4b3de3c4; WORD $0x90c0 // vblendvpd ymm0, ymm8, ymm0, ymm9 + LONG 
$0x4c6f7ec5; WORD $0x20c7 // vmovdqu ymm9, yword [rdi + 8*rax + 32] + LONG $0x373562c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm9, ymm3 + LONG $0x4b35e3c4; WORD $0xa0db // vblendvpd ymm3, ymm9, ymm3, ymm10 + LONG $0x546f7ec5; WORD $0x40c7 // vmovdqu ymm10, yword [rdi + 8*rax + 64] + LONG $0x372d62c4; BYTE $0xda // vpcmpgtq ymm11, ymm10, ymm2 + LONG $0x4b2de3c4; WORD $0xb0d2 // vblendvpd ymm2, ymm10, ymm2, ymm11 + LONG $0x5c6f7ec5; WORD $0x60c7 // vmovdqu ymm11, yword [rdi + 8*rax + 96] + LONG $0x372562c4; BYTE $0xe1 // vpcmpgtq ymm12, ymm11, ymm1 + LONG $0x4b25e3c4; WORD $0xc0c9 // vblendvpd ymm1, ymm11, ymm1, ymm12 + LONG $0x375d42c4; BYTE $0xe0 // vpcmpgtq ymm12, ymm4, ymm8 + LONG $0x4b3de3c4; WORD $0xc0e4 // vblendvpd ymm4, ymm8, ymm4, ymm12 + LONG $0x374542c4; BYTE $0xc1 // vpcmpgtq ymm8, ymm7, ymm9 + LONG $0x4b35e3c4; WORD $0x80ff // vblendvpd ymm7, ymm9, ymm7, ymm8 + LONG $0x374d42c4; BYTE $0xc2 // vpcmpgtq ymm8, ymm6, ymm10 + LONG $0x4b2de3c4; WORD $0x80f6 // vblendvpd ymm6, ymm10, ymm6, ymm8 + LONG $0x375542c4; BYTE $0xc3 // vpcmpgtq ymm8, ymm5, ymm11 + LONG $0x4b25e3c4; WORD $0x80ed // vblendvpd ymm5, ymm11, ymm5, ymm8 + LONG $0x10c08348 // add rax, 16 + WORD $0x3949; BYTE $0xc1 // cmp r9, rax + JNE LBB2_5 + LONG $0x375d62c4; BYTE $0xc7 // vpcmpgtq ymm8, ymm4, ymm7 + LONG $0x4b45e3c4; WORD $0x80e4 // vblendvpd ymm4, ymm7, ymm4, ymm8 + LONG $0x375de2c4; BYTE $0xfe // vpcmpgtq ymm7, ymm4, ymm6 + LONG $0x4b4de3c4; WORD $0x70e4 // vblendvpd ymm4, ymm6, ymm4, ymm7 + LONG $0x375de2c4; BYTE $0xf5 // vpcmpgtq ymm6, ymm4, ymm5 + LONG $0x4b55e3c4; WORD $0x60e4 // vblendvpd ymm4, ymm5, ymm4, ymm6 + LONG $0x197de3c4; WORD $0x01e5 // vextractf128 xmm5, ymm4, 1 + LONG $0x3759e2c4; BYTE $0xf5 // vpcmpgtq xmm6, xmm4, xmm5 + LONG $0x4b51e3c4; WORD $0x60e4 // vblendvpd xmm4, xmm5, xmm4, xmm6 + LONG $0x0479e3c4; WORD $0x4eec // vpermilps xmm5, xmm4, 78 + LONG $0x3759e2c4; BYTE $0xf5 // vpcmpgtq xmm6, xmm4, xmm5 + LONG $0x4b51e3c4; WORD $0x60e4 // vblendvpd xmm4, xmm5, xmm4, xmm6 + 
LONG $0x7ef9c1c4; BYTE $0xe2 // vmovq r10, xmm4 + LONG $0x3765e2c4; BYTE $0xe0 // vpcmpgtq ymm4, ymm3, ymm0 + LONG $0x4b65e3c4; WORD $0x40c0 // vblendvpd ymm0, ymm3, ymm0, ymm4 + LONG $0x376de2c4; BYTE $0xd8 // vpcmpgtq ymm3, ymm2, ymm0 + LONG $0x4b6de3c4; WORD $0x30c0 // vblendvpd ymm0, ymm2, ymm0, ymm3 + LONG $0x3775e2c4; BYTE $0xd0 // vpcmpgtq ymm2, ymm1, ymm0 + LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 - LONG $0x3779e2c4; BYTE $0xd1 // vpcmpgtq xmm2, xmm0, xmm1 + LONG $0x3771e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm1, xmm0 LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2 LONG $0x0479e3c4; WORD $0x4ec8 // vpermilps xmm1, xmm0, 78 - LONG $0x3779e2c4; BYTE $0xd1 // vpcmpgtq xmm2, xmm0, xmm1 + LONG $0x3771e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm1, xmm0 LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2 - LONG $0x546ffdc5; WORD $0x2024 // vmovdqa ymm2, yword [rsp + 32] - LONG $0x375de2c4; BYTE $0xca // vpcmpgtq ymm1, ymm4, ymm2 - LONG $0x4b5de3c4; WORD $0x10ca // vblendvpd ymm1, ymm4, ymm2, ymm1 - LONG $0x3725e2c4; BYTE $0xd5 // vpcmpgtq ymm2, ymm11, ymm5 - LONG $0x4b25e3c4; WORD $0x20d5 // vblendvpd ymm2, ymm11, ymm5, ymm2 - LONG $0x3745e2c4; BYTE $0xe3 // vpcmpgtq ymm4, ymm7, ymm3 - LONG $0x4b45e3c4; WORD $0x40e3 // vblendvpd ymm4, ymm7, ymm3, ymm4 - LONG $0x374dc2c4; BYTE $0xea // vpcmpgtq ymm5, ymm6, ymm10 - LONG $0x4b4dc3c4; WORD $0x50da // vblendvpd ymm3, ymm6, ymm10, ymm5 - LONG $0x3765e2c4; BYTE $0xec // vpcmpgtq ymm5, ymm3, ymm4 - LONG $0x4b65e3c4; WORD $0x50dc // vblendvpd ymm3, ymm3, ymm4, ymm5 - LONG $0x376de2c4; BYTE $0xe1 // vpcmpgtq ymm4, ymm2, ymm1 - LONG $0x4b6de3c4; WORD $0x40c9 // vblendvpd ymm1, ymm2, ymm1, ymm4 - LONG $0x3775e2c4; BYTE $0xd3 // vpcmpgtq ymm2, ymm1, ymm3 - LONG $0x4b75e3c4; WORD $0x20cb // vblendvpd ymm1, ymm1, ymm3, ymm2 - LONG $0x197de3c4; WORD $0x01ca // vextractf128 xmm2, ymm1, 1 - LONG $0x3769e2c4; BYTE $0xd9 // 
vpcmpgtq xmm3, xmm2, xmm1 - LONG $0x4b69e3c4; WORD $0x30c9 // vblendvpd xmm1, xmm2, xmm1, xmm3 - LONG $0x0479e3c4; WORD $0x4ed1 // vpermilps xmm2, xmm1, 78 - LONG $0x3769e2c4; BYTE $0xd9 // vpcmpgtq xmm3, xmm2, xmm1 - LONG $0x4b69e3c4; WORD $0x30c9 // vblendvpd xmm1, xmm2, xmm1, xmm3 - LONG $0x7ef9e1c4; BYTE $0xc6 // vmovq rsi, xmm0 - LONG $0x7ef9c1c4; BYTE $0xc9 // vmovq r9, xmm1 - WORD $0x394d; BYTE $0xc3 // cmp r11, r8 - JE LBB2_14 - -LBB2_4: - WORD $0x8948; BYTE $0xf0 // mov rax, rsi + LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 + WORD $0x894c; BYTE $0xd6 // mov rsi, r10 + WORD $0x394d; BYTE $0xc1 // cmp r9, r8 + JE LBB2_8 -LBB2_5: - LONG $0xdf348b4a // mov rsi, qword [rdi + 8*r11] - WORD $0x3949; BYTE $0xf1 // cmp r9, rsi - LONG $0xce4f0f4c // cmovg r9, rsi +LBB2_7: + LONG $0xcf348b4a // mov rsi, qword [rdi + 8*r9] WORD $0x3948; BYTE $0xf0 // cmp rax, rsi - LONG $0xf04d0f48 // cmovge rsi, rax - LONG $0x01c38349 // add r11, 1 - WORD $0x8948; BYTE $0xf0 // mov rax, rsi - WORD $0x394d; BYTE $0xd8 // cmp r8, r11 - JNE LBB2_5 + LONG $0xc64f0f48 // cmovg rax, rsi + WORD $0x3949; BYTE $0xf2 // cmp r10, rsi + LONG $0xf24d0f49 // cmovge rsi, r10 + LONG $0x01c18349 // add r9, 1 + WORD $0x8949; BYTE $0xf2 // mov r10, rsi + WORD $0x394d; BYTE $0xc8 // cmp r8, r9 + JNE LBB2_7 -LBB2_14: +LBB2_8: WORD $0x8948; BYTE $0x31 // mov qword [rcx], rsi - WORD $0x894c; BYTE $0x0a // mov qword [rdx], r9 - SUBQ $8, SP + WORD $0x8948; BYTE $0x02 // mov qword [rdx], rax VZEROUPPER RET DATA LCDATA3<>+0x000(SB)/8, $0x8000000000000000 GLOBL LCDATA3<>(SB), 8, $8 -TEXT ·_uint64_max_min_avx2(SB), $296-32 +TEXT ·_uint64_max_min_avx2(SB), $0-32 MOVQ values+0(FP), DI MOVQ length+8(FP), SI MOVQ minout+16(FP), DX MOVQ maxout+24(FP), CX - ADDQ $8, SP LEAQ LCDATA3<>(SB), BP WORD $0xf685 // test esi, esi JLE LBB3_1 WORD $0x8941; BYTE $0xf0 // mov r8d, esi - WORD $0xfe83; BYTE $0x1f // cmp esi, 31 - JA LBB3_6 - LONG $0xffc1c749; WORD $0xffff; BYTE $0xff // mov r9, -1 - WORD $0x3145; BYTE $0xdb // 
xor r11d, r11d - WORD $0xf631 // xor esi, esi - JMP LBB3_4 + WORD $0xfe83; BYTE $0x0f // cmp esi, 15 + JA LBB3_4 + LONG $0xffc0c748; WORD $0xffff; BYTE $0xff // mov rax, -1 + WORD $0x3145; BYTE $0xc9 // xor r9d, r9d + WORD $0x3145; BYTE $0xd2 // xor r10d, r10d + JMP LBB3_7 LBB3_1: - LONG $0xffc1c749; WORD $0xffff; BYTE $0xff // mov r9, -1 + LONG $0xffc0c748; WORD $0xffff; BYTE $0xff // mov rax, -1 WORD $0xf631 // xor esi, esi - JMP LBB3_14 - -LBB3_6: - WORD $0x8945; BYTE $0xc3 // mov r11d, r8d - LONG $0xe0e38341 // and r11d, -32 - LONG $0xe0438d49 // lea rax, [r11 - 32] - WORD $0x8949; BYTE $0xc2 // mov r10, rax - LONG $0x05eac149 // shr r10, 5 - LONG $0x01c28349 // add r10, 1 - WORD $0x8945; BYTE $0xd1 // mov r9d, r10d - LONG $0x03e18341 // and r9d, 3 - LONG $0x60f88348 // cmp rax, 96 - JAE LBB3_8 - LONG $0xe4efd9c5 // vpxor xmm4, xmm4, xmm4 - LONG $0xc076fdc5 // vpcmpeqd ymm0, ymm0, ymm0 - LONG $0x447ffdc5; WORD $0x4024 // vmovdqa yword [rsp + 64], ymm0 - WORD $0xc031 // xor eax, eax - LONG $0xc076fdc5 // vpcmpeqd ymm0, ymm0, ymm0 - LONG $0x447ffdc5; WORD $0x6024 // vmovdqa yword [rsp + 96], ymm0 - LONG $0xed76d5c5 // vpcmpeqd ymm5, ymm5, ymm5 - LONG $0xff76c5c5 // vpcmpeqd ymm7, ymm7, ymm7 - LONG $0x761d41c4; BYTE $0xe4 // vpcmpeqd ymm12, ymm12, ymm12 - LONG $0x762d41c4; BYTE $0xd2 // vpcmpeqd ymm10, ymm10, ymm10 - LONG $0x762541c4; BYTE $0xdb // vpcmpeqd ymm11, ymm11, ymm11 - LONG $0x761541c4; BYTE $0xed // vpcmpeqd ymm13, ymm13, ymm13 - LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 - LONG $0x447ffdc5; WORD $0x2024 // vmovdqa yword [rsp + 32], ymm0 - LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 - LONG $0x047ffdc5; BYTE $0x24 // vmovdqa yword [rsp], ymm0 - LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 - LONG $0xef3141c4; BYTE $0xc9 // vpxor xmm9, xmm9, xmm9 - LONG $0xef3941c4; BYTE $0xc0 // vpxor xmm8, xmm8, xmm8 - LONG $0xef0141c4; BYTE $0xff // vpxor xmm15, xmm15, xmm15 - LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 - JMP LBB3_10 + JMP LBB3_8 -LBB3_8: - LONG $0xfce28349 
// and r10, -4 - WORD $0xf749; BYTE $0xda // neg r10 - LONG $0xe4efd9c5 // vpxor xmm4, xmm4, xmm4 - LONG $0xc076fdc5 // vpcmpeqd ymm0, ymm0, ymm0 - LONG $0x447ffdc5; WORD $0x4024 // vmovdqa yword [rsp + 64], ymm0 +LBB3_4: + WORD $0x8945; BYTE $0xc1 // mov r9d, r8d + LONG $0xf0e18341 // and r9d, -16 + LONG $0xedefd1c5 // vpxor xmm5, xmm5, xmm5 + LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1 WORD $0xc031 // xor eax, eax - LONG $0x597d62c4; WORD $0x0075 // vpbroadcastq ymm14, qword 0[rbp] /* [rip + .LCPI3_0] */ - LONG $0xc076fdc5 // vpcmpeqd ymm0, ymm0, ymm0 - LONG $0x447ffdc5; WORD $0x6024 // vmovdqa yword [rsp + 96], ymm0 - LONG $0xed76d5c5 // vpcmpeqd ymm5, ymm5, ymm5 - LONG $0xff76c5c5 // vpcmpeqd ymm7, ymm7, ymm7 - LONG $0x761d41c4; BYTE $0xe4 // vpcmpeqd ymm12, ymm12, ymm12 - LONG $0x762d41c4; BYTE $0xd2 // vpcmpeqd ymm10, ymm10, ymm10 - LONG $0x762541c4; BYTE $0xdb // vpcmpeqd ymm11, ymm11, ymm11 - LONG $0x761541c4; BYTE $0xed // vpcmpeqd ymm13, ymm13, ymm13 - LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 - LONG $0x447ffdc5; WORD $0x2024 // vmovdqa yword [rsp + 32], ymm0 - LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 - LONG $0x047ffdc5; BYTE $0x24 // vmovdqa yword [rsp], ymm0 - LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 - LONG $0xef3141c4; BYTE $0xc9 // vpxor xmm9, xmm9, xmm9 + LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI3_0] */ + LONG $0xe476ddc5 // vpcmpeqd ymm4, ymm4, ymm4 + LONG $0xdb76e5c5 // vpcmpeqd ymm3, ymm3, ymm3 + LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2 LONG $0xef3941c4; BYTE $0xc0 // vpxor xmm8, xmm8, xmm8 - LONG $0xef0141c4; BYTE $0xff // vpxor xmm15, xmm15, xmm15 - LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 - -LBB3_9: - QUAD $0x0000e0c78c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdi + 8*rax + 224] - LONG $0xd1ef8dc5 // vpxor ymm2, ymm14, ymm1 - LONG $0xf36ffdc5 // vmovdqa ymm6, ymm3 - LONG $0xef15c1c4; BYTE $0xde // vpxor ymm3, ymm13, ymm14 - LONG $0x376de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm2, ymm3 - LONG 
$0x4b75c3c4; WORD $0x30dd // vblendvpd ymm3, ymm1, ymm13, ymm3 - QUAD $0x000080249c29fdc5; BYTE $0x00 // vmovapd yword [rsp + 128], ymm3 - LONG $0xd8ef8dc5 // vpxor ymm3, ymm14, ymm0 - LONG $0x3765e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm3, ymm2 - LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 - QUAD $0x0000e0248429fdc5; BYTE $0x00 // vmovapd yword [rsp + 224], ymm0 - QUAD $0x0000c0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 192] - LONG $0xc8ef8dc5 // vpxor ymm1, ymm14, ymm0 - LONG $0xef25c1c4; BYTE $0xd6 // vpxor ymm2, ymm11, ymm14 - LONG $0x3775e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm1, ymm2 - LONG $0x4b7dc3c4; WORD $0x20d3 // vblendvpd ymm2, ymm0, ymm11, ymm2 - QUAD $0x0000a0249429fdc5; BYTE $0x00 // vmovapd yword [rsp + 160], ymm2 - LONG $0xef05c1c4; BYTE $0xd6 // vpxor ymm2, ymm15, ymm14 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b7dc3c4; WORD $0x10c7 // vblendvpd ymm0, ymm0, ymm15, ymm1 - QUAD $0x0000c0248429fdc5; BYTE $0x00 // vmovapd yword [rsp + 192], ymm0 - QUAD $0x0000a0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 160] - LONG $0xc8ef8dc5 // vpxor ymm1, ymm14, ymm0 - LONG $0xef2dc1c4; BYTE $0xd6 // vpxor ymm2, ymm10, ymm14 - LONG $0x3775e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm1, ymm2 - LONG $0x6f7dc1c4; BYTE $0xd8 // vmovdqa ymm3, ymm8 - LONG $0x4b7d43c4; WORD $0x20c2 // vblendvpd ymm8, ymm0, ymm10, ymm2 - LONG $0xd3ef8dc5 // vpxor ymm2, ymm14, ymm3 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b7d63c4; WORD $0x10eb // vblendvpd ymm13, ymm0, ymm3, ymm1 - QUAD $0x000080c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 128] - LONG $0xd0ef8dc5 // vpxor ymm2, ymm14, ymm0 - LONG $0xef1dc1c4; BYTE $0xce // vpxor ymm1, ymm12, ymm14 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b7dc3c4; WORD $0x10cc // vblendvpd ymm1, ymm0, ymm12, ymm1 - LONG $0xef35c1c4; BYTE $0xde // vpxor ymm3, ymm9, ymm14 - LONG $0x3765e2c4; BYTE $0xd2 // 
vpcmpgtq ymm2, ymm3, ymm2 - LONG $0x4b7d43c4; WORD $0x20e1 // vblendvpd ymm12, ymm0, ymm9, ymm2 - LONG $0x546ffec5; WORD $0x60c7 // vmovdqu ymm2, yword [rdi + 8*rax + 96] - LONG $0xc7ef8dc5 // vpxor ymm0, ymm14, ymm7 - LONG $0xdaef8dc5 // vpxor ymm3, ymm14, ymm2 - LONG $0x3765e2c4; BYTE $0xc0 // vpcmpgtq ymm0, ymm3, ymm0 - LONG $0x4b6de3c4; WORD $0x00c7 // vblendvpd ymm0, ymm2, ymm7, ymm0 - LONG $0xfc6f7dc5 // vmovdqa ymm15, ymm4 - LONG $0xe6ef8dc5 // vpxor ymm4, ymm14, ymm6 - LONG $0x375de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm4, ymm3 - LONG $0x4b6d63c4; WORD $0x30d6 // vblendvpd ymm10, ymm2, ymm6, ymm3 - LONG $0x546ffec5; WORD $0x40c7 // vmovdqu ymm2, yword [rdi + 8*rax + 64] - LONG $0xddef8dc5 // vpxor ymm3, ymm14, ymm5 - LONG $0xe2ef8dc5 // vpxor ymm4, ymm14, ymm2 - LONG $0x375de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm4, ymm3 - LONG $0x4b6de3c4; WORD $0x30ed // vblendvpd ymm5, ymm2, ymm5, ymm3 - LONG $0x346ffdc5; BYTE $0x24 // vmovdqa ymm6, yword [rsp] - LONG $0xdeef8dc5 // vpxor ymm3, ymm14, ymm6 - LONG $0x3765e2c4; BYTE $0xdc // vpcmpgtq ymm3, ymm3, ymm4 - LONG $0x4b6d63c4; WORD $0x30ce // vblendvpd ymm9, ymm2, ymm6, ymm3 - LONG $0x146ffec5; BYTE $0xc7 // vmovdqu ymm2, yword [rdi + 8*rax] - LONG $0x7c6ffdc5; WORD $0x4024 // vmovdqa ymm7, yword [rsp + 64] - LONG $0xdfef8dc5 // vpxor ymm3, ymm14, ymm7 - LONG $0xe2ef8dc5 // vpxor ymm4, ymm14, ymm2 - LONG $0x375de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm4, ymm3 - LONG $0x4b6de3c4; WORD $0x30df // vblendvpd ymm3, ymm2, ymm7, ymm3 - LONG $0xef0541c4; BYTE $0xde // vpxor ymm11, ymm15, ymm14 - LONG $0x3725e2c4; BYTE $0xe4 // vpcmpgtq ymm4, ymm11, ymm4 - LONG $0x4b6dc3c4; WORD $0x40e7 // vblendvpd ymm4, ymm2, ymm15, ymm4 - LONG $0x546ffec5; WORD $0x20c7 // vmovdqu ymm2, yword [rdi + 8*rax + 32] - LONG $0x7c6f7dc5; WORD $0x6024 // vmovdqa ymm15, yword [rsp + 96] - LONG $0xef0541c4; BYTE $0xde // vpxor ymm11, ymm15, ymm14 - LONG $0xfaef8dc5 // vpxor ymm7, ymm14, ymm2 - LONG $0x374542c4; BYTE $0xdb // vpcmpgtq ymm11, ymm7, ymm11 
- LONG $0x4b6d43c4; WORD $0xb0df // vblendvpd ymm11, ymm2, ymm15, ymm11 - LONG $0x746ffdc5; WORD $0x2024 // vmovdqa ymm6, yword [rsp + 32] - LONG $0xfeef0dc5 // vpxor ymm15, ymm14, ymm6 - LONG $0x3705e2c4; BYTE $0xff // vpcmpgtq ymm7, ymm15, ymm7 - LONG $0x4b6de3c4; WORD $0x70d6 // vblendvpd ymm2, ymm2, ymm6, ymm7 - QUAD $0x000120c7b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 8*rax + 288] - LONG $0x5725c1c4; BYTE $0xfe // vxorpd ymm7, ymm11, ymm14 - LONG $0xfeef0dc5 // vpxor ymm15, ymm14, ymm6 - LONG $0x3705e2c4; BYTE $0xff // vpcmpgtq ymm7, ymm15, ymm7 - LONG $0x4b4dc3c4; WORD $0x70fb // vblendvpd ymm7, ymm6, ymm11, ymm7 - LONG $0x7c29fdc5; WORD $0x6024 // vmovapd yword [rsp + 96], ymm7 - LONG $0xfa578dc5 // vxorpd ymm7, ymm14, ymm2 - LONG $0x3745c2c4; BYTE $0xff // vpcmpgtq ymm7, ymm7, ymm15 - LONG $0x4b4de3c4; WORD $0x70d2 // vblendvpd ymm2, ymm6, ymm2, ymm7 - LONG $0x5429fdc5; WORD $0x4024 // vmovapd yword [rsp + 64], ymm2 - QUAD $0x000100c7b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 8*rax + 256] - LONG $0xfb578dc5 // vxorpd ymm7, ymm14, ymm3 - LONG $0xdeef0dc5 // vpxor ymm11, ymm14, ymm6 - LONG $0x3725e2c4; BYTE $0xff // vpcmpgtq ymm7, ymm11, ymm7 - LONG $0x4b4de3c4; WORD $0x70d3 // vblendvpd ymm2, ymm6, ymm3, ymm7 - LONG $0x1429fdc5; BYTE $0x24 // vmovapd yword [rsp], ymm2 - LONG $0xfc578dc5 // vxorpd ymm7, ymm14, ymm4 - LONG $0x3745c2c4; BYTE $0xfb // vpcmpgtq ymm7, ymm7, ymm11 - LONG $0x4b4de3c4; WORD $0x70d4 // vblendvpd ymm2, ymm6, ymm4, ymm7 - LONG $0x5429fdc5; WORD $0x2024 // vmovapd yword [rsp + 32], ymm2 - QUAD $0x000140c7b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 8*rax + 320] - LONG $0xfd578dc5 // vxorpd ymm7, ymm14, ymm5 - LONG $0xdeef0dc5 // vpxor ymm11, ymm14, ymm6 - LONG $0x3725e2c4; BYTE $0xff // vpcmpgtq ymm7, ymm11, ymm7 - LONG $0x4b4de3c4; WORD $0x70ed // vblendvpd ymm5, ymm6, ymm5, ymm7 - LONG $0x5735c1c4; BYTE $0xfe // vxorpd ymm7, ymm9, ymm14 - LONG $0x3745c2c4; BYTE $0xfb // vpcmpgtq ymm7, ymm7, ymm11 - LONG 
$0x4b4dc3c4; WORD $0x70f9 // vblendvpd ymm7, ymm6, ymm9, ymm7 - QUAD $0x000160c7b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 8*rax + 352] - LONG $0xc8570dc5 // vxorpd ymm9, ymm14, ymm0 - LONG $0xdeef0dc5 // vpxor ymm11, ymm14, ymm6 - LONG $0x372542c4; BYTE $0xc9 // vpcmpgtq ymm9, ymm11, ymm9 - LONG $0x4b4d63c4; WORD $0x90c8 // vblendvpd ymm9, ymm6, ymm0, ymm9 - LONG $0x572dc1c4; BYTE $0xc6 // vxorpd ymm0, ymm10, ymm14 - LONG $0x377dc2c4; BYTE $0xc3 // vpcmpgtq ymm0, ymm0, ymm11 - LONG $0x4b4d43c4; WORD $0x00d2 // vblendvpd ymm10, ymm6, ymm10, ymm0 - QUAD $0x000180c7b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 8*rax + 384] - LONG $0xc1578dc5 // vxorpd ymm0, ymm14, ymm1 - LONG $0xdeef0dc5 // vpxor ymm11, ymm14, ymm6 - LONG $0x3725e2c4; BYTE $0xc0 // vpcmpgtq ymm0, ymm11, ymm0 - LONG $0x4b4de3c4; WORD $0x00e1 // vblendvpd ymm4, ymm6, ymm1, ymm0 - LONG $0x571dc1c4; BYTE $0xce // vxorpd ymm1, ymm12, ymm14 - LONG $0x3775c2c4; BYTE $0xcb // vpcmpgtq ymm1, ymm1, ymm11 - LONG $0x4b4dc3c4; WORD $0x10dc // vblendvpd ymm3, ymm6, ymm12, ymm1 - QUAD $0x0001a0c79c6f7ec5; BYTE $0x00 // vmovdqu ymm11, yword [rdi + 8*rax + 416] - LONG $0x573dc1c4; BYTE $0xf6 // vxorpd ymm6, ymm8, ymm14 - LONG $0xef2541c4; BYTE $0xe6 // vpxor ymm12, ymm11, ymm14 - LONG $0x371de2c4; BYTE $0xf6 // vpcmpgtq ymm6, ymm12, ymm6 - LONG $0x4b25c3c4; WORD $0x60f0 // vblendvpd ymm6, ymm11, ymm8, ymm6 - LONG $0x571541c4; BYTE $0xc6 // vxorpd ymm8, ymm13, ymm14 - LONG $0x373d42c4; BYTE $0xc4 // vpcmpgtq ymm8, ymm8, ymm12 - LONG $0x4b2543c4; WORD $0x80e5 // vblendvpd ymm12, ymm11, ymm13, ymm8 - QUAD $0x0001c0c79c6f7ec5; BYTE $0x00 // vmovdqu ymm11, yword [rdi + 8*rax + 448] - QUAD $0x0000a024846ffdc5; BYTE $0x00 // vmovdqa ymm0, yword [rsp + 160] - LONG $0xc0ef0dc5 // vpxor ymm8, ymm14, ymm0 - LONG $0xef2541c4; BYTE $0xee // vpxor ymm13, ymm11, ymm14 - LONG $0x371542c4; BYTE $0xc0 // vpcmpgtq ymm8, ymm13, ymm8 - LONG $0x4b2563c4; WORD $0x80c0 // vblendvpd ymm8, ymm11, ymm0, ymm8 - QUAD 
$0x0000c024846ffdc5; BYTE $0x00 // vmovdqa ymm0, yword [rsp + 192] - LONG $0xf8ef0dc5 // vpxor ymm15, ymm14, ymm0 - LONG $0x370542c4; BYTE $0xed // vpcmpgtq ymm13, ymm15, ymm13 - LONG $0x4b2563c4; WORD $0xd0e8 // vblendvpd ymm13, ymm11, ymm0, ymm13 - QUAD $0x0001e0c79c6f7ec5; BYTE $0x00 // vmovdqu ymm11, yword [rdi + 8*rax + 480] - QUAD $0x000080248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 128] - LONG $0xf9ef0dc5 // vpxor ymm15, ymm14, ymm1 - LONG $0xef25c1c4; BYTE $0xc6 // vpxor ymm0, ymm11, ymm14 - LONG $0x377d42c4; BYTE $0xff // vpcmpgtq ymm15, ymm0, ymm15 - LONG $0x4b25e3c4; WORD $0xf0c9 // vblendvpd ymm1, ymm11, ymm1, ymm15 - QUAD $0x0000e024946ffdc5; BYTE $0x00 // vmovdqa ymm2, yword [rsp + 224] - LONG $0xfaef0dc5 // vpxor ymm15, ymm14, ymm2 - LONG $0x3705e2c4; BYTE $0xc0 // vpcmpgtq ymm0, ymm15, ymm0 - LONG $0x4b2563c4; WORD $0x00fa // vblendvpd ymm15, ymm11, ymm2, ymm0 - QUAD $0x0002e0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 736] - LONG $0xd9570dc5 // vxorpd ymm11, ymm14, ymm1 - LONG $0xd0ef8dc5 // vpxor ymm2, ymm14, ymm0 - LONG $0x376d42c4; BYTE $0xdb // vpcmpgtq ymm11, ymm2, ymm11 - LONG $0x4b7de3c4; WORD $0xb0c9 // vblendvpd ymm1, ymm0, ymm1, ymm11 - QUAD $0x000080248c29fdc5; BYTE $0x00 // vmovapd yword [rsp + 128], ymm1 - LONG $0x5705c1c4; BYTE $0xce // vxorpd ymm1, ymm15, ymm14 - LONG $0x3775e2c4; BYTE $0xca // vpcmpgtq ymm1, ymm1, ymm2 - LONG $0x4b7dc3c4; WORD $0x10c7 // vblendvpd ymm0, ymm0, ymm15, ymm1 - QUAD $0x0000e0248429fdc5; BYTE $0x00 // vmovapd yword [rsp + 224], ymm0 - QUAD $0x0002c0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 704] - LONG $0x573dc1c4; BYTE $0xce // vxorpd ymm1, ymm8, ymm14 - LONG $0xd0ef8dc5 // vpxor ymm2, ymm14, ymm0 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b7dc3c4; WORD $0x10c8 // vblendvpd ymm1, ymm0, ymm8, ymm1 - QUAD $0x0000a0248c29fdc5; BYTE $0x00 // vmovapd yword [rsp + 160], ymm1 - LONG $0x5715c1c4; BYTE $0xce // vxorpd ymm1, ymm13, ymm14 - 
LONG $0x3775e2c4; BYTE $0xca // vpcmpgtq ymm1, ymm1, ymm2 - LONG $0x4b7dc3c4; WORD $0x10c5 // vblendvpd ymm0, ymm0, ymm13, ymm1 - QUAD $0x0000c0248429fdc5; BYTE $0x00 // vmovapd yword [rsp + 192], ymm0 - QUAD $0x0002a0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 672] - LONG $0xce578dc5 // vxorpd ymm1, ymm14, ymm6 - LONG $0xd0ef8dc5 // vpxor ymm2, ymm14, ymm0 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b7d63c4; WORD $0x10fe // vblendvpd ymm15, ymm0, ymm6, ymm1 - LONG $0x571dc1c4; BYTE $0xce // vxorpd ymm1, ymm12, ymm14 - LONG $0x3775e2c4; BYTE $0xca // vpcmpgtq ymm1, ymm1, ymm2 - LONG $0x4b7d43c4; WORD $0x10ec // vblendvpd ymm13, ymm0, ymm12, ymm1 - QUAD $0x000280c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 640] - LONG $0xcc578dc5 // vxorpd ymm1, ymm14, ymm4 - LONG $0xd0ef8dc5 // vpxor ymm2, ymm14, ymm0 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b7d63c4; WORD $0x10e4 // vblendvpd ymm12, ymm0, ymm4, ymm1 - LONG $0xcb578dc5 // vxorpd ymm1, ymm14, ymm3 - LONG $0x3775e2c4; BYTE $0xca // vpcmpgtq ymm1, ymm1, ymm2 - LONG $0x4b7d63c4; WORD $0x10c3 // vblendvpd ymm8, ymm0, ymm3, ymm1 - QUAD $0x000260c7946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdi + 8*rax + 608] - LONG $0x5735c1c4; BYTE $0xce // vxorpd ymm1, ymm9, ymm14 - LONG $0xdaef8dc5 // vpxor ymm3, ymm14, ymm2 - LONG $0x3765e2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm3, ymm1 - LONG $0x4b6dc3c4; WORD $0x10c9 // vblendvpd ymm1, ymm2, ymm9, ymm1 - LONG $0x572dc1c4; BYTE $0xe6 // vxorpd ymm4, ymm10, ymm14 - LONG $0x375de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm4, ymm3 - LONG $0x4b6d43c4; WORD $0x30d2 // vblendvpd ymm10, ymm2, ymm10, ymm3 - QUAD $0x000240c7946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdi + 8*rax + 576] - LONG $0xdd578dc5 // vxorpd ymm3, ymm14, ymm5 - LONG $0xe2ef8dc5 // vpxor ymm4, ymm14, ymm2 - LONG $0x375de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm4, ymm3 - LONG $0x4b6de3c4; WORD $0x30ed // vblendvpd ymm5, ymm2, ymm5, ymm3 - LONG 
$0xdf578dc5 // vxorpd ymm3, ymm14, ymm7 - LONG $0x3765e2c4; BYTE $0xdc // vpcmpgtq ymm3, ymm3, ymm4 - LONG $0x4b6d63c4; WORD $0x30cf // vblendvpd ymm9, ymm2, ymm7, ymm3 - QUAD $0x000200c7946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdi + 8*rax + 512] - LONG $0x046ffdc5; BYTE $0x24 // vmovdqa ymm0, yword [rsp] - LONG $0xd8ef8dc5 // vpxor ymm3, ymm14, ymm0 - LONG $0xe2ef8dc5 // vpxor ymm4, ymm14, ymm2 - LONG $0x375de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm4, ymm3 - LONG $0x4b6de3c4; WORD $0x30d8 // vblendvpd ymm3, ymm2, ymm0, ymm3 - LONG $0x446ffdc5; WORD $0x2024 // vmovdqa ymm0, yword [rsp + 32] - LONG $0xf0ef8dc5 // vpxor ymm6, ymm14, ymm0 - LONG $0x374de2c4; BYTE $0xe4 // vpcmpgtq ymm4, ymm6, ymm4 - LONG $0x4b6de3c4; WORD $0x40e0 // vblendvpd ymm4, ymm2, ymm0, ymm4 - QUAD $0x000220c7946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdi + 8*rax + 544] - LONG $0x446ffdc5; WORD $0x6024 // vmovdqa ymm0, yword [rsp + 96] - LONG $0xf0ef8dc5 // vpxor ymm6, ymm14, ymm0 - LONG $0xfaef8dc5 // vpxor ymm7, ymm14, ymm2 - LONG $0x3745e2c4; BYTE $0xf6 // vpcmpgtq ymm6, ymm7, ymm6 - LONG $0x4b6de3c4; WORD $0x60f0 // vblendvpd ymm6, ymm2, ymm0, ymm6 - LONG $0x446ffdc5; WORD $0x4024 // vmovdqa ymm0, yword [rsp + 64] - LONG $0xd8ef0dc5 // vpxor ymm11, ymm14, ymm0 - LONG $0x3725e2c4; BYTE $0xff // vpcmpgtq ymm7, ymm11, ymm7 - LONG $0x4b6de3c4; WORD $0x70d0 // vblendvpd ymm2, ymm2, ymm0, ymm7 - QUAD $0x000320c7bc6ffec5; BYTE $0x00 // vmovdqu ymm7, yword [rdi + 8*rax + 800] - LONG $0xde570dc5 // vxorpd ymm11, ymm14, ymm6 - LONG $0xc7ef8dc5 // vpxor ymm0, ymm14, ymm7 - LONG $0x377d42c4; BYTE $0xdb // vpcmpgtq ymm11, ymm0, ymm11 - LONG $0x4b45e3c4; WORD $0xb0f6 // vblendvpd ymm6, ymm7, ymm6, ymm11 - LONG $0x7429fdc5; WORD $0x6024 // vmovapd yword [rsp + 96], ymm6 - LONG $0xf2578dc5 // vxorpd ymm6, ymm14, ymm2 - LONG $0x374de2c4; BYTE $0xc0 // vpcmpgtq ymm0, ymm6, ymm0 - LONG $0x4b45e3c4; WORD $0x00c2 // vblendvpd ymm0, ymm7, ymm2, ymm0 - LONG $0x4429fdc5; WORD $0x2024 // vmovapd yword [rsp + 
32], ymm0 - QUAD $0x000300c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 768] - LONG $0xd3578dc5 // vxorpd ymm2, ymm14, ymm3 - LONG $0xf8ef8dc5 // vpxor ymm7, ymm14, ymm0 - LONG $0x3745e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm7, ymm2 - LONG $0x4b7de3c4; WORD $0x20d3 // vblendvpd ymm2, ymm0, ymm3, ymm2 - LONG $0x5429fdc5; WORD $0x4024 // vmovapd yword [rsp + 64], ymm2 - LONG $0xd4578dc5 // vxorpd ymm2, ymm14, ymm4 - LONG $0x376de2c4; BYTE $0xd7 // vpcmpgtq ymm2, ymm2, ymm7 - LONG $0x4b7de3c4; WORD $0x20e4 // vblendvpd ymm4, ymm0, ymm4, ymm2 - QUAD $0x000340c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 832] - LONG $0xd5578dc5 // vxorpd ymm2, ymm14, ymm5 - LONG $0xd8ef8dc5 // vpxor ymm3, ymm14, ymm0 - LONG $0x3765e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm3, ymm2 - LONG $0x4b7de3c4; WORD $0x20ed // vblendvpd ymm5, ymm0, ymm5, ymm2 - LONG $0x5735c1c4; BYTE $0xd6 // vxorpd ymm2, ymm9, ymm14 - LONG $0x376de2c4; BYTE $0xd3 // vpcmpgtq ymm2, ymm2, ymm3 - LONG $0x4b7dc3c4; WORD $0x20c1 // vblendvpd ymm0, ymm0, ymm9, ymm2 - LONG $0x0429fdc5; BYTE $0x24 // vmovapd yword [rsp], ymm0 - QUAD $0x000360c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 864] - LONG $0xd1578dc5 // vxorpd ymm2, ymm14, ymm1 - LONG $0xd8ef8dc5 // vpxor ymm3, ymm14, ymm0 - LONG $0x3765e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm3, ymm2 - LONG $0x4b7de3c4; WORD $0x20f9 // vblendvpd ymm7, ymm0, ymm1, ymm2 - LONG $0x572dc1c4; BYTE $0xce // vxorpd ymm1, ymm10, ymm14 - LONG $0x3775e2c4; BYTE $0xcb // vpcmpgtq ymm1, ymm1, ymm3 - LONG $0x4b7dc3c4; WORD $0x10da // vblendvpd ymm3, ymm0, ymm10, ymm1 - QUAD $0x000380c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 896] - LONG $0x571dc1c4; BYTE $0xce // vxorpd ymm1, ymm12, ymm14 - LONG $0xd0ef8dc5 // vpxor ymm2, ymm14, ymm0 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b7d43c4; WORD $0x10e4 // vblendvpd ymm12, ymm0, ymm12, ymm1 - LONG $0x573dc1c4; BYTE $0xce // vxorpd ymm1, ymm8, ymm14 - LONG 
$0x3775e2c4; BYTE $0xca // vpcmpgtq ymm1, ymm1, ymm2 - LONG $0x4b7d43c4; WORD $0x10c8 // vblendvpd ymm9, ymm0, ymm8, ymm1 - QUAD $0x0003a0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 928] - LONG $0x5705c1c4; BYTE $0xce // vxorpd ymm1, ymm15, ymm14 - LONG $0xd0ef8dc5 // vpxor ymm2, ymm14, ymm0 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b7d43c4; WORD $0x10d7 // vblendvpd ymm10, ymm0, ymm15, ymm1 - LONG $0x5715c1c4; BYTE $0xce // vxorpd ymm1, ymm13, ymm14 - LONG $0x3775e2c4; BYTE $0xca // vpcmpgtq ymm1, ymm1, ymm2 - LONG $0x4b7d43c4; WORD $0x10c5 // vblendvpd ymm8, ymm0, ymm13, ymm1 - QUAD $0x0003c0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 960] - QUAD $0x0000a024b46ffdc5; BYTE $0x00 // vmovdqa ymm6, yword [rsp + 160] - LONG $0xceef8dc5 // vpxor ymm1, ymm14, ymm6 - LONG $0xd0ef8dc5 // vpxor ymm2, ymm14, ymm0 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b7d63c4; WORD $0x10de // vblendvpd ymm11, ymm0, ymm6, ymm1 - QUAD $0x0000c024b46ffdc5; BYTE $0x00 // vmovdqa ymm6, yword [rsp + 192] - LONG $0xceef8dc5 // vpxor ymm1, ymm14, ymm6 - LONG $0x3775e2c4; BYTE $0xca // vpcmpgtq ymm1, ymm1, ymm2 - LONG $0x4b7d63c4; WORD $0x10fe // vblendvpd ymm15, ymm0, ymm6, ymm1 - QUAD $0x0003e0c7846ffec5; BYTE $0x00 // vmovdqu ymm0, yword [rdi + 8*rax + 992] - QUAD $0x00008024b46ffdc5; BYTE $0x00 // vmovdqa ymm6, yword [rsp + 128] - LONG $0xceef8dc5 // vpxor ymm1, ymm14, ymm6 - LONG $0xd0ef8dc5 // vpxor ymm2, ymm14, ymm0 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b7d63c4; WORD $0x10ee // vblendvpd ymm13, ymm0, ymm6, ymm1 - QUAD $0x0000e024b46ffdc5; BYTE $0x00 // vmovdqa ymm6, yword [rsp + 224] - LONG $0xceef8dc5 // vpxor ymm1, ymm14, ymm6 - LONG $0x3775e2c4; BYTE $0xca // vpcmpgtq ymm1, ymm1, ymm2 - LONG $0x4b7de3c4; WORD $0x10c6 // vblendvpd ymm0, ymm0, ymm6, ymm1 - LONG $0x80e88348 // sub rax, -128 - LONG $0x04c28349 // add r10, 4 - JNE LBB3_9 - -LBB3_10: - QUAD 
$0x0000802494297cc5; BYTE $0x00 // vmovaps yword [rsp + 128], ymm10 - WORD $0x854d; BYTE $0xc9 // test r9, r9 - LONG $0x6f7d41c4; BYTE $0xd4 // vmovdqa ymm10, ymm12 - LONG $0xe36f7dc5 // vmovdqa ymm12, ymm3 - JE LBB3_13 - LONG $0xc7048d48 // lea rax, [rdi + 8*rax] - WORD $0xf749; BYTE $0xd9 // neg r9 - LONG $0x597d62c4; WORD $0x0075 // vpbroadcastq ymm14, qword 0[rbp] /* [rip + .LCPI3_0] */ - -LBB3_12: - LONG $0x486ffec5; BYTE $0x20 // vmovdqu ymm1, yword [rax + 32] - LONG $0xf76ffdc5 // vmovdqa ymm6, ymm7 - LONG $0xfd6ffdc5 // vmovdqa ymm7, ymm5 - LONG $0xec6ffdc5 // vmovdqa ymm5, ymm4 - LONG $0x646ffdc5; WORD $0x6024 // vmovdqa ymm4, yword [rsp + 96] - LONG $0xd4ef8dc5 // vpxor ymm2, ymm14, ymm4 - LONG $0xd9ef8dc5 // vpxor ymm3, ymm14, ymm1 - LONG $0x3765e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm3, ymm2 - LONG $0x4b75e3c4; WORD $0x20e4 // vblendvpd ymm4, ymm1, ymm4, ymm2 - LONG $0x6429fdc5; WORD $0x6024 // vmovapd yword [rsp + 96], ymm4 - LONG $0x646ffdc5; WORD $0x2024 // vmovdqa ymm4, yword [rsp + 32] - LONG $0xd4ef8dc5 // vpxor ymm2, ymm14, ymm4 - LONG $0x376de2c4; BYTE $0xd3 // vpcmpgtq ymm2, ymm2, ymm3 - LONG $0x4b75e3c4; WORD $0x20e4 // vblendvpd ymm4, ymm1, ymm4, ymm2 - LONG $0x6429fdc5; WORD $0x2024 // vmovapd yword [rsp + 32], ymm4 - LONG $0x086ffec5 // vmovdqu ymm1, yword [rax] - LONG $0x646ffdc5; WORD $0x4024 // vmovdqa ymm4, yword [rsp + 64] - LONG $0xd4ef8dc5 // vpxor ymm2, ymm14, ymm4 - LONG $0xd9ef8dc5 // vpxor ymm3, ymm14, ymm1 - LONG $0x3765e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm3, ymm2 - LONG $0x4b75e3c4; WORD $0x20e4 // vblendvpd ymm4, ymm1, ymm4, ymm2 - LONG $0x6429fdc5; WORD $0x4024 // vmovapd yword [rsp + 64], ymm4 - LONG $0xe56ffdc5 // vmovdqa ymm4, ymm5 - LONG $0xef6ffdc5 // vmovdqa ymm5, ymm7 - LONG $0xfe6ffdc5 // vmovdqa ymm7, ymm6 - LONG $0xd4ef8dc5 // vpxor ymm2, ymm14, ymm4 - LONG $0x376de2c4; BYTE $0xd3 // vpcmpgtq ymm2, ymm2, ymm3 - LONG $0x586ffec5; BYTE $0x40 // vmovdqu ymm3, yword [rax + 64] - LONG $0x4b75e3c4; WORD $0x20e4 // 
vblendvpd ymm4, ymm1, ymm4, ymm2 - LONG $0xcbef8dc5 // vpxor ymm1, ymm14, ymm3 - LONG $0xd5ef8dc5 // vpxor ymm2, ymm14, ymm5 - LONG $0x3775e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm1, ymm2 - LONG $0x4b65e3c4; WORD $0x20ed // vblendvpd ymm5, ymm3, ymm5, ymm2 - LONG $0x346ffdc5; BYTE $0x24 // vmovdqa ymm6, yword [rsp] - LONG $0xd6ef8dc5 // vpxor ymm2, ymm14, ymm6 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b65e3c4; WORD $0x10f6 // vblendvpd ymm6, ymm3, ymm6, ymm1 - LONG $0x3429fdc5; BYTE $0x24 // vmovapd yword [rsp], ymm6 - LONG $0x486ffec5; BYTE $0x60 // vmovdqu ymm1, yword [rax + 96] - LONG $0xd1ef8dc5 // vpxor ymm2, ymm14, ymm1 - LONG $0xdfef8dc5 // vpxor ymm3, ymm14, ymm7 - LONG $0x376de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm2, ymm3 - LONG $0x4b75e3c4; WORD $0x30ff // vblendvpd ymm7, ymm1, ymm7, ymm3 - LONG $0xef1dc1c4; BYTE $0xde // vpxor ymm3, ymm12, ymm14 - LONG $0x3765e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm3, ymm2 - QUAD $0x00000080986ffec5 // vmovdqu ymm3, yword [rax + 128] - LONG $0x4b7543c4; WORD $0x20e4 // vblendvpd ymm12, ymm1, ymm12, ymm2 - LONG $0xcbef8dc5 // vpxor ymm1, ymm14, ymm3 - LONG $0xef2dc1c4; BYTE $0xd6 // vpxor ymm2, ymm10, ymm14 - LONG $0x3775e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm1, ymm2 - LONG $0x4b6543c4; WORD $0x20d2 // vblendvpd ymm10, ymm3, ymm10, ymm2 - LONG $0xef35c1c4; BYTE $0xd6 // vpxor ymm2, ymm9, ymm14 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b6543c4; WORD $0x10c9 // vblendvpd ymm9, ymm3, ymm9, ymm1 - QUAD $0x000000a0886ffec5 // vmovdqu ymm1, yword [rax + 160] - LONG $0xd1ef8dc5 // vpxor ymm2, ymm14, ymm1 - QUAD $0x00008024b46ffdc5; BYTE $0x00 // vmovdqa ymm6, yword [rsp + 128] - LONG $0xdeef8dc5 // vpxor ymm3, ymm14, ymm6 - LONG $0x376de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm2, ymm3 - LONG $0x4b75e3c4; WORD $0x30f6 // vblendvpd ymm6, ymm1, ymm6, ymm3 - QUAD $0x00008024b429fdc5; BYTE $0x00 // vmovapd yword [rsp + 128], ymm6 - LONG $0xef3dc1c4; BYTE $0xde // vpxor ymm3, ymm8, ymm14 
- LONG $0x3765e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm3, ymm2 - QUAD $0x000000c0986ffec5 // vmovdqu ymm3, yword [rax + 192] - LONG $0x4b7543c4; WORD $0x20c0 // vblendvpd ymm8, ymm1, ymm8, ymm2 - LONG $0xcbef8dc5 // vpxor ymm1, ymm14, ymm3 - LONG $0xef25c1c4; BYTE $0xd6 // vpxor ymm2, ymm11, ymm14 - LONG $0x3775e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm1, ymm2 - LONG $0x4b6543c4; WORD $0x20db // vblendvpd ymm11, ymm3, ymm11, ymm2 - LONG $0xef05c1c4; BYTE $0xd6 // vpxor ymm2, ymm15, ymm14 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b6543c4; WORD $0x10ff // vblendvpd ymm15, ymm3, ymm15, ymm1 - QUAD $0x000000e0886ffec5 // vmovdqu ymm1, yword [rax + 224] - LONG $0xd1ef8dc5 // vpxor ymm2, ymm14, ymm1 - LONG $0xef15c1c4; BYTE $0xde // vpxor ymm3, ymm13, ymm14 - LONG $0x376de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm2, ymm3 - LONG $0x4b7543c4; WORD $0x30ed // vblendvpd ymm13, ymm1, ymm13, ymm3 - LONG $0xd8ef8dc5 // vpxor ymm3, ymm14, ymm0 - LONG $0x3765e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm3, ymm2 - LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 - LONG $0x01000548; WORD $0x0000 // add rax, 256 - WORD $0xff49; BYTE $0xc1 // inc r9 - JNE LBB3_12 - -LBB3_13: - LONG $0x597d62c4; WORD $0x0075 // vpbroadcastq ymm14, qword 0[rbp] /* [rip + .LCPI3_0] */ - LONG $0x1c6ffdc5; BYTE $0x24 // vmovdqa ymm3, yword [rsp] - LONG $0xcbef8dc5 // vpxor ymm1, ymm14, ymm3 - LONG $0xef05c1c4; BYTE $0xd6 // vpxor ymm2, ymm15, ymm14 - LONG $0x3775e2c4; BYTE $0xca // vpcmpgtq ymm1, ymm1, ymm2 - LONG $0x4b05e3c4; WORD $0x10cb // vblendvpd ymm1, ymm15, ymm3, ymm1 - LONG $0xd4ef8dc5 // vpxor ymm2, ymm14, ymm4 - LONG $0xef35c1c4; BYTE $0xde // vpxor ymm3, ymm9, ymm14 - LONG $0x376de2c4; BYTE $0xd3 // vpcmpgtq ymm2, ymm2, ymm3 - LONG $0x4b35e3c4; WORD $0x20d4 // vblendvpd ymm2, ymm9, ymm4, ymm2 - LONG $0xef1dc1c4; BYTE $0xde // vpxor ymm3, ymm12, ymm14 - LONG $0xc8ef0dc5 // vpxor ymm9, ymm14, ymm0 - LONG $0x3765c2c4; BYTE $0xd9 // vpcmpgtq ymm3, ymm3, ymm9 - LONG 
$0x4b7dc3c4; WORD $0x30c4 // vblendvpd ymm0, ymm0, ymm12, ymm3 - LONG $0x646ffdc5; WORD $0x2024 // vmovdqa ymm4, yword [rsp + 32] - LONG $0xdcef8dc5 // vpxor ymm3, ymm14, ymm4 - LONG $0xef3d41c4; BYTE $0xce // vpxor ymm9, ymm8, ymm14 - LONG $0x3765c2c4; BYTE $0xd9 // vpcmpgtq ymm3, ymm3, ymm9 - LONG $0x4b3de3c4; WORD $0x30dc // vblendvpd ymm3, ymm8, ymm4, ymm3 - LONG $0xf3578dc5 // vxorpd ymm6, ymm14, ymm3 - LONG $0xc8570dc5 // vxorpd ymm9, ymm14, ymm0 - LONG $0x374dc2c4; BYTE $0xf1 // vpcmpgtq ymm6, ymm6, ymm9 - LONG $0x4b7de3c4; WORD $0x60c3 // vblendvpd ymm0, ymm0, ymm3, ymm6 - LONG $0xda578dc5 // vxorpd ymm3, ymm14, ymm2 - LONG $0xf1578dc5 // vxorpd ymm6, ymm14, ymm1 - LONG $0x3765e2c4; BYTE $0xde // vpcmpgtq ymm3, ymm3, ymm6 - LONG $0x4b75e3c4; WORD $0x30ca // vblendvpd ymm1, ymm1, ymm2, ymm3 - LONG $0xd1578dc5 // vxorpd ymm2, ymm14, ymm1 - LONG $0xd8578dc5 // vxorpd ymm3, ymm14, ymm0 - LONG $0x376de2c4; BYTE $0xd3 // vpcmpgtq ymm2, ymm2, ymm3 - LONG $0x4b7de3c4; WORD $0x20c1 // vblendvpd ymm0, ymm0, ymm1, ymm2 - LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 - LONG $0xd15789c5 // vxorpd xmm2, xmm14, xmm1 - LONG $0xd85789c5 // vxorpd xmm3, xmm14, xmm0 - LONG $0x3761e2c4; BYTE $0xd2 // vpcmpgtq xmm2, xmm3, xmm2 - LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2 - LONG $0x0479e3c4; WORD $0x4ec8 // vpermilps xmm1, xmm0, 78 - LONG $0xd05789c5 // vxorpd xmm2, xmm14, xmm0 - LONG $0xd95789c5 // vxorpd xmm3, xmm14, xmm1 - LONG $0x3769e2c4; BYTE $0xd3 // vpcmpgtq xmm2, xmm2, xmm3 - LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2 - LONG $0xcdef8dc5 // vpxor ymm1, ymm14, ymm5 - LONG $0xef25c1c4; BYTE $0xd6 // vpxor ymm2, ymm11, ymm14 - LONG $0x376de2c4; BYTE $0xc9 // vpcmpgtq ymm1, ymm2, ymm1 - LONG $0x4b25e3c4; WORD $0x10cd // vblendvpd ymm1, ymm11, ymm5, ymm1 - LONG $0x646ffdc5; WORD $0x4024 // vmovdqa ymm4, yword [rsp + 64] - LONG $0xd4ef8dc5 // vpxor ymm2, ymm14, ymm4 - LONG $0xef2dc1c4; BYTE $0xde // vpxor ymm3, 
ymm10, ymm14 - LONG $0x3765e2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm3, ymm2 - LONG $0x4b2de3c4; WORD $0x20d4 // vblendvpd ymm2, ymm10, ymm4, ymm2 - LONG $0xdfef8dc5 // vpxor ymm3, ymm14, ymm7 - LONG $0xef15c1c4; BYTE $0xee // vpxor ymm5, ymm13, ymm14 - LONG $0x3755e2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm5, ymm3 - LONG $0x4b15e3c4; WORD $0x30df // vblendvpd ymm3, ymm13, ymm7, ymm3 - LONG $0x746ffdc5; WORD $0x6024 // vmovdqa ymm6, yword [rsp + 96] - LONG $0xe6ef8dc5 // vpxor ymm4, ymm14, ymm6 - QUAD $0x00008024bc6ffdc5; BYTE $0x00 // vmovdqa ymm7, yword [rsp + 128] - LONG $0xefef8dc5 // vpxor ymm5, ymm14, ymm7 - LONG $0x3755e2c4; BYTE $0xe4 // vpcmpgtq ymm4, ymm5, ymm4 - LONG $0x4b45e3c4; WORD $0x40e6 // vblendvpd ymm4, ymm7, ymm6, ymm4 - LONG $0xec578dc5 // vxorpd ymm5, ymm14, ymm4 - LONG $0xf3578dc5 // vxorpd ymm6, ymm14, ymm3 - LONG $0x374de2c4; BYTE $0xed // vpcmpgtq ymm5, ymm6, ymm5 - LONG $0x4b65e3c4; WORD $0x50dc // vblendvpd ymm3, ymm3, ymm4, ymm5 - LONG $0xe2578dc5 // vxorpd ymm4, ymm14, ymm2 - LONG $0xe9578dc5 // vxorpd ymm5, ymm14, ymm1 - LONG $0x3755e2c4; BYTE $0xe4 // vpcmpgtq ymm4, ymm5, ymm4 - LONG $0x4b75e3c4; WORD $0x40ca // vblendvpd ymm1, ymm1, ymm2, ymm4 - LONG $0xd1578dc5 // vxorpd ymm2, ymm14, ymm1 - LONG $0xe3578dc5 // vxorpd ymm4, ymm14, ymm3 - LONG $0x375de2c4; BYTE $0xd2 // vpcmpgtq ymm2, ymm4, ymm2 - LONG $0x4b65e3c4; WORD $0x20c9 // vblendvpd ymm1, ymm3, ymm1, ymm2 - LONG $0x197de3c4; WORD $0x01ca // vextractf128 xmm2, ymm1, 1 - LONG $0xd95789c5 // vxorpd xmm3, xmm14, xmm1 - LONG $0xe25789c5 // vxorpd xmm4, xmm14, xmm2 - LONG $0x3759e2c4; BYTE $0xdb // vpcmpgtq xmm3, xmm4, xmm3 - LONG $0x4b69e3c4; WORD $0x30c9 // vblendvpd xmm1, xmm2, xmm1, xmm3 - LONG $0x0479e3c4; WORD $0x4ed1 // vpermilps xmm2, xmm1, 78 - LONG $0xd95789c5 // vxorpd xmm3, xmm14, xmm1 - LONG $0xe25789c5 // vxorpd xmm4, xmm14, xmm2 - LONG $0x3759e2c4; BYTE $0xdb // vpcmpgtq xmm3, xmm4, xmm3 - LONG $0x4b69e3c4; WORD $0x30c9 // vblendvpd xmm1, xmm2, xmm1, xmm3 - LONG $0x7ef9e1c4; 
BYTE $0xc6 // vmovq rsi, xmm0 - LONG $0x7ef9c1c4; BYTE $0xc9 // vmovq r9, xmm1 - WORD $0x394d; BYTE $0xc3 // cmp r11, r8 - JE LBB3_14 - -LBB3_4: - WORD $0x8948; BYTE $0xf0 // mov rax, rsi + LONG $0xffefc1c5 // vpxor xmm7, xmm7, xmm7 + LONG $0xf6efc9c5 // vpxor xmm6, xmm6, xmm6 LBB3_5: - LONG $0xdf348b4a // mov rsi, qword [rdi + 8*r11] - WORD $0x3949; BYTE $0xf1 // cmp r9, rsi - LONG $0xce430f4c // cmovae r9, rsi - WORD $0x3948; BYTE $0xf0 // cmp rax, rsi - LONG $0xf0470f48 // cmova rsi, rax - LONG $0x01c38349 // add r11, 1 - WORD $0x8948; BYTE $0xf0 // mov rax, rsi - WORD $0x394d; BYTE $0xd8 // cmp r8, r11 + LONG $0x0c6f7ec5; BYTE $0xc7 // vmovdqu ymm9, yword [rdi + 8*rax] + LONG $0xd0ef75c5 // vpxor ymm10, ymm1, ymm0 + LONG $0xd8ef35c5 // vpxor ymm11, ymm9, ymm0 + LONG $0x372542c4; BYTE $0xd2 // vpcmpgtq ymm10, ymm11, ymm10 + LONG $0x4b35e3c4; WORD $0xa0c9 // vblendvpd ymm1, ymm9, ymm1, ymm10 + LONG $0xd0ef55c5 // vpxor ymm10, ymm5, ymm0 + LONG $0x372d42c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm10, ymm11 + LONG $0x4b35e3c4; WORD $0xa0ed // vblendvpd ymm5, ymm9, ymm5, ymm10 + LONG $0x4c6f7ec5; WORD $0x20c7 // vmovdqu ymm9, yword [rdi + 8*rax + 32] + LONG $0xd0ef5dc5 // vpxor ymm10, ymm4, ymm0 + LONG $0xd8ef35c5 // vpxor ymm11, ymm9, ymm0 + LONG $0x372542c4; BYTE $0xd2 // vpcmpgtq ymm10, ymm11, ymm10 + LONG $0x4b35e3c4; WORD $0xa0e4 // vblendvpd ymm4, ymm9, ymm4, ymm10 + LONG $0xd0ef3dc5 // vpxor ymm10, ymm8, ymm0 + LONG $0x372d42c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm10, ymm11 + LONG $0x5c6f7ec5; WORD $0x40c7 // vmovdqu ymm11, yword [rdi + 8*rax + 64] + LONG $0x4b3543c4; WORD $0xa0c0 // vblendvpd ymm8, ymm9, ymm8, ymm10 + LONG $0xc8ef65c5 // vpxor ymm9, ymm3, ymm0 + LONG $0xd0ef25c5 // vpxor ymm10, ymm11, ymm0 + LONG $0x372d42c4; BYTE $0xc9 // vpcmpgtq ymm9, ymm10, ymm9 + LONG $0x4b25e3c4; WORD $0x90db // vblendvpd ymm3, ymm11, ymm3, ymm9 + LONG $0xc8ef45c5 // vpxor ymm9, ymm7, ymm0 + LONG $0x373542c4; BYTE $0xca // vpcmpgtq ymm9, ymm9, ymm10 + LONG $0x4b25e3c4; WORD 
$0x90ff // vblendvpd ymm7, ymm11, ymm7, ymm9 + LONG $0x4c6f7ec5; WORD $0x60c7 // vmovdqu ymm9, yword [rdi + 8*rax + 96] + LONG $0xd0ef6dc5 // vpxor ymm10, ymm2, ymm0 + LONG $0xd8ef35c5 // vpxor ymm11, ymm9, ymm0 + LONG $0x372542c4; BYTE $0xd2 // vpcmpgtq ymm10, ymm11, ymm10 + LONG $0x4b35e3c4; WORD $0xa0d2 // vblendvpd ymm2, ymm9, ymm2, ymm10 + LONG $0xd0ef4dc5 // vpxor ymm10, ymm6, ymm0 + LONG $0x372d42c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm10, ymm11 + LONG $0x4b35e3c4; WORD $0xa0f6 // vblendvpd ymm6, ymm9, ymm6, ymm10 + LONG $0x10c08348 // add rax, 16 + WORD $0x3949; BYTE $0xc1 // cmp r9, rax JNE LBB3_5 + LONG $0xc8ef3dc5 // vpxor ymm9, ymm8, ymm0 + LONG $0xd0ef55c5 // vpxor ymm10, ymm5, ymm0 + LONG $0x372d42c4; BYTE $0xc9 // vpcmpgtq ymm9, ymm10, ymm9 + LONG $0x4b3de3c4; WORD $0x90ed // vblendvpd ymm5, ymm8, ymm5, ymm9 + LONG $0xc05755c5 // vxorpd ymm8, ymm5, ymm0 + LONG $0xc8ef45c5 // vpxor ymm9, ymm7, ymm0 + LONG $0x373d42c4; BYTE $0xc1 // vpcmpgtq ymm8, ymm8, ymm9 + LONG $0x4b45e3c4; WORD $0x80ed // vblendvpd ymm5, ymm7, ymm5, ymm8 + LONG $0xf857d5c5 // vxorpd ymm7, ymm5, ymm0 + LONG $0xc0ef4dc5 // vpxor ymm8, ymm6, ymm0 + LONG $0x3745c2c4; BYTE $0xf8 // vpcmpgtq ymm7, ymm7, ymm8 + LONG $0x4b4de3c4; WORD $0x70ed // vblendvpd ymm5, ymm6, ymm5, ymm7 + LONG $0x197de3c4; WORD $0x01ee // vextractf128 xmm6, ymm5, 1 + LONG $0xc05749c5 // vxorpd xmm8, xmm6, xmm0 + LONG $0xf857d1c5 // vxorpd xmm7, xmm5, xmm0 + LONG $0x3741c2c4; BYTE $0xf8 // vpcmpgtq xmm7, xmm7, xmm8 + LONG $0x4b49e3c4; WORD $0x70ed // vblendvpd xmm5, xmm6, xmm5, xmm7 + LONG $0x0479e3c4; WORD $0x4ef5 // vpermilps xmm6, xmm5, 78 + LONG $0xc05751c5 // vxorpd xmm8, xmm5, xmm0 + LONG $0xf857c9c5 // vxorpd xmm7, xmm6, xmm0 + LONG $0x3739e2c4; BYTE $0xff // vpcmpgtq xmm7, xmm8, xmm7 + LONG $0x4b49e3c4; WORD $0x70ed // vblendvpd xmm5, xmm6, xmm5, xmm7 + LONG $0xf0eff5c5 // vpxor ymm6, ymm1, ymm0 + LONG $0xf8efddc5 // vpxor ymm7, ymm4, ymm0 + LONG $0x3745e2c4; BYTE $0xf6 // vpcmpgtq ymm6, ymm7, ymm6 + LONG 
$0x4b5de3c4; WORD $0x60c9 // vblendvpd ymm1, ymm4, ymm1, ymm6 + LONG $0xe057f5c5 // vxorpd ymm4, ymm1, ymm0 + LONG $0xf0efe5c5 // vpxor ymm6, ymm3, ymm0 + LONG $0x374de2c4; BYTE $0xe4 // vpcmpgtq ymm4, ymm6, ymm4 + LONG $0x4b65e3c4; WORD $0x40c9 // vblendvpd ymm1, ymm3, ymm1, ymm4 + LONG $0x7ef9c1c4; BYTE $0xea // vmovq r10, xmm5 + LONG $0xd857f5c5 // vxorpd ymm3, ymm1, ymm0 + LONG $0xe0efedc5 // vpxor ymm4, ymm2, ymm0 + LONG $0x375de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm4, ymm3 + LONG $0x4b6de3c4; WORD $0x30c9 // vblendvpd ymm1, ymm2, ymm1, ymm3 + LONG $0x197de3c4; WORD $0x01ca // vextractf128 xmm2, ymm1, 1 + LONG $0xd857f1c5 // vxorpd xmm3, xmm1, xmm0 + LONG $0xe057e9c5 // vxorpd xmm4, xmm2, xmm0 + LONG $0x3759e2c4; BYTE $0xdb // vpcmpgtq xmm3, xmm4, xmm3 + LONG $0x4b69e3c4; WORD $0x30c9 // vblendvpd xmm1, xmm2, xmm1, xmm3 + LONG $0x0479e3c4; WORD $0x4ed1 // vpermilps xmm2, xmm1, 78 + LONG $0xd857f1c5 // vxorpd xmm3, xmm1, xmm0 + LONG $0xc057e9c5 // vxorpd xmm0, xmm2, xmm0 + LONG $0x3779e2c4; BYTE $0xc3 // vpcmpgtq xmm0, xmm0, xmm3 + LONG $0x4b69e3c4; WORD $0x00c1 // vblendvpd xmm0, xmm2, xmm1, xmm0 + LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 + WORD $0x894c; BYTE $0xd6 // mov rsi, r10 + WORD $0x394d; BYTE $0xc1 // cmp r9, r8 + JE LBB3_8 + +LBB3_7: + LONG $0xcf348b4a // mov rsi, qword [rdi + 8*r9] + WORD $0x3948; BYTE $0xf0 // cmp rax, rsi + LONG $0xc6430f48 // cmovae rax, rsi + WORD $0x3949; BYTE $0xf2 // cmp r10, rsi + LONG $0xf2470f49 // cmova rsi, r10 + LONG $0x01c18349 // add r9, 1 + WORD $0x8949; BYTE $0xf2 // mov r10, rsi + WORD $0x394d; BYTE $0xc8 // cmp r8, r9 + JNE LBB3_7 -LBB3_14: +LBB3_8: WORD $0x8948; BYTE $0x31 // mov qword [rcx], rsi - WORD $0x894c; BYTE $0x0a // mov qword [rdx], r9 - SUBQ $8, SP + WORD $0x8948; BYTE $0x02 // mov qword [rdx], rax VZEROUPPER RET diff --git a/go/parquet/internal/utils/unpack_bool_avx2_amd64.s b/go/parquet/internal/utils/unpack_bool_avx2_amd64.s index 99c2cc88265..459ff78675d 100644 --- 
a/go/parquet/internal/utils/unpack_bool_avx2_amd64.s +++ b/go/parquet/internal/utils/unpack_bool_avx2_amd64.s @@ -1,6961 +1,88 @@ //+build !noasm !appengine // AUTO-GENERATED BY C2GOASM -- DO NOT EDIT -DATA LCDATA1<>+0x000(SB)/8, $0x0000001900000018 -DATA LCDATA1<>+0x008(SB)/8, $0x0000001b0000001a -DATA LCDATA1<>+0x010(SB)/8, $0x0000001d0000001c -DATA LCDATA1<>+0x018(SB)/8, $0x0000001f0000001e -DATA LCDATA1<>+0x020(SB)/8, $0x0000001100000010 -DATA LCDATA1<>+0x028(SB)/8, $0x0000001300000012 -DATA LCDATA1<>+0x030(SB)/8, $0x0000001500000014 -DATA LCDATA1<>+0x038(SB)/8, $0x0000001700000016 -DATA LCDATA1<>+0x040(SB)/8, $0x0000000900000008 -DATA LCDATA1<>+0x048(SB)/8, $0x0000000b0000000a -DATA LCDATA1<>+0x050(SB)/8, $0x0000000d0000000c -DATA LCDATA1<>+0x058(SB)/8, $0x0000000f0000000e -DATA LCDATA1<>+0x060(SB)/8, $0x0000000100000000 -DATA LCDATA1<>+0x068(SB)/8, $0x0000000300000002 -DATA LCDATA1<>+0x070(SB)/8, $0x0000000500000004 -DATA LCDATA1<>+0x078(SB)/8, $0x0000000700000006 -DATA LCDATA1<>+0x080(SB)/8, $0x0101010101010101 -DATA LCDATA1<>+0x088(SB)/8, $0x0101010101010101 -DATA LCDATA1<>+0x090(SB)/8, $0x0101010101010101 -DATA LCDATA1<>+0x098(SB)/8, $0x0101010101010101 -DATA LCDATA1<>+0x0a0(SB)/8, $0x0000000000000001 -DATA LCDATA1<>+0x0a8(SB)/8, $0x0000000000000002 -DATA LCDATA1<>+0x0b0(SB)/8, $0x0000000000000003 -DATA LCDATA1<>+0x0b8(SB)/8, $0x0000000000000004 -DATA LCDATA1<>+0x0c0(SB)/8, $0x0000000000000005 -DATA LCDATA1<>+0x0c8(SB)/8, $0x0000000000000006 -DATA LCDATA1<>+0x0d0(SB)/8, $0x0000000000000007 -DATA LCDATA1<>+0x0d8(SB)/8, $0x0000000000000020 -GLOBL LCDATA1<>(SB), 8, $224 - -TEXT ·_bytes_to_bools_avx2(SB), $1000-32 +TEXT ·_bytes_to_bools_avx2(SB), $0-32 MOVQ in+0(FP), DI MOVQ len+8(FP), SI MOVQ out+16(FP), DX MOVQ outlen+24(FP), CX - MOVQ SP, BP - ADDQ $32, SP - ANDQ $-32, SP - MOVQ BP, 960(SP) - LEAQ LCDATA1<>(SB), BP WORD $0xf685 // test esi, esi - JLE LBB0_1051 - WORD $0x8941; BYTE $0xc9 // mov r9d, ecx - WORD $0x8949; BYTE $0xd0 // mov r8, rdx - WORD 
$0x8941; BYTE $0xf2 // mov r10d, esi - WORD $0xfe83; BYTE $0x20 // cmp esi, 32 - JAE LBB0_3 + JLE LBB0_5 + WORD $0x8941; BYTE $0xf0 // mov r8d, esi + LONG $0x03e0c149 // shl r8, 3 + WORD $0x3145; BYTE $0xd2 // xor r10d, r10d + JMP LBB0_2 + +LBB0_4: + LONG $0x08c28349 // add r10, 8 + LONG $0x01c78348 // add rdi, 1 + WORD $0x394d; BYTE $0xd0 // cmp r8, r10 + JE LBB0_5 LBB0_2: - WORD $0x3145; BYTE $0xe4 // xor r12d, r12d - -LBB0_1055: - QUAD $0x00000000e50c8d42 // lea ecx, [8*r12] - JMP LBB0_1057 - -LBB0_1056: - LONG $0x01c48349 // add r12, 1 - WORD $0xc183; BYTE $0x08 // add ecx, 8 - WORD $0x394d; BYTE $0xe2 // cmp r10, r12 - JE LBB0_1051 - -LBB0_1057: - WORD $0xca89 // mov edx, ecx - WORD $0xc989 // mov ecx, ecx - WORD $0x3944; BYTE $0xca // cmp edx, r9d - JGE LBB0_1056 - LONG $0x14b60f42; BYTE $0x27 // movzx edx, byte [rdi + r12] - WORD $0xe280; BYTE $0x01 // and dl, 1 - LONG $0x08148841 // mov byte [r8 + rcx], dl - WORD $0x8948; BYTE $0xca // mov rdx, rcx - LONG $0x01ca8348 // or rdx, 1 - WORD $0x3944; BYTE $0xca // cmp edx, r9d - JGE LBB0_1056 - LONG $0x1cb60f42; BYTE $0x27 // movzx ebx, byte [rdi + r12] - WORD $0xebd0 // shr bl, 1 - WORD $0xe380; BYTE $0x01 // and bl, 1 - LONG $0x101c8841 // mov byte [r8 + rdx], bl - WORD $0x8948; BYTE $0xca // mov rdx, rcx - LONG $0x02ca8348 // or rdx, 2 - WORD $0x3944; BYTE $0xca // cmp edx, r9d - JGE LBB0_1056 - LONG $0x1cb60f42; BYTE $0x27 // movzx ebx, byte [rdi + r12] - WORD $0xebc0; BYTE $0x02 // shr bl, 2 - WORD $0xe380; BYTE $0x01 // and bl, 1 - LONG $0x101c8841 // mov byte [r8 + rdx], bl - WORD $0x8948; BYTE $0xca // mov rdx, rcx - LONG $0x03ca8348 // or rdx, 3 - WORD $0x3944; BYTE $0xca // cmp edx, r9d - JGE LBB0_1056 - LONG $0x1cb60f42; BYTE $0x27 // movzx ebx, byte [rdi + r12] - WORD $0xebc0; BYTE $0x03 // shr bl, 3 - WORD $0xe380; BYTE $0x01 // and bl, 1 - LONG $0x101c8841 // mov byte [r8 + rdx], bl - WORD $0x8948; BYTE $0xca // mov rdx, rcx - LONG $0x04ca8348 // or rdx, 4 - WORD $0x3944; BYTE $0xca // cmp edx, r9d 
- JGE LBB0_1056 - LONG $0x1cb60f42; BYTE $0x27 // movzx ebx, byte [rdi + r12] - WORD $0xebc0; BYTE $0x04 // shr bl, 4 - WORD $0xe380; BYTE $0x01 // and bl, 1 - LONG $0x101c8841 // mov byte [r8 + rdx], bl - WORD $0x8948; BYTE $0xca // mov rdx, rcx - LONG $0x05ca8348 // or rdx, 5 - WORD $0x3944; BYTE $0xca // cmp edx, r9d - JGE LBB0_1056 - LONG $0x1cb60f42; BYTE $0x27 // movzx ebx, byte [rdi + r12] - WORD $0xebc0; BYTE $0x05 // shr bl, 5 - WORD $0xe380; BYTE $0x01 // and bl, 1 - LONG $0x101c8841 // mov byte [r8 + rdx], bl - WORD $0x8948; BYTE $0xca // mov rdx, rcx - LONG $0x06ca8348 // or rdx, 6 - WORD $0x3944; BYTE $0xca // cmp edx, r9d - JGE LBB0_1056 - LONG $0x1cb60f42; BYTE $0x27 // movzx ebx, byte [rdi + r12] - WORD $0xebc0; BYTE $0x06 // shr bl, 6 - WORD $0xe380; BYTE $0x01 // and bl, 1 - LONG $0x101c8841 // mov byte [r8 + rdx], bl - WORD $0x8948; BYTE $0xca // mov rdx, rcx - LONG $0x07ca8348 // or rdx, 7 - WORD $0x3944; BYTE $0xca // cmp edx, r9d - JGE LBB0_1056 - LONG $0x1cb60f42; BYTE $0x27 // movzx ebx, byte [rdi + r12] - WORD $0xebc0; BYTE $0x07 // shr bl, 7 - LONG $0x101c8841 // mov byte [r8 + rdx], bl - JMP LBB0_1056 - -LBB0_3: - LONG $0x244c8944; BYTE $0x10 // mov dword [rsp + 16], r9d - LONG $0x2454894c; BYTE $0x30 // mov qword [rsp + 48], r10 - LONG $0xff728d49 // lea rsi, [r10 - 1] - LONG $0x000008b9; BYTE $0x00 // mov ecx, 8 - WORD $0xf089 // mov eax, esi - WORD $0xe1f7 // mul ecx - LONG $0xd6900f41 // seto r14b - WORD $0x8948; BYTE $0xf3 // mov rbx, rsi - LONG $0x20ebc148 // shr rbx, 32 - LONG $0x06488d49 // lea rcx, [r8 + 6] - LONG $0x000008ba; BYTE $0x00 // mov edx, 8 - WORD $0x8948; BYTE $0xf0 // mov rax, rsi - WORD $0xf748; BYTE $0xe2 // mul rdx - LONG $0xd6900f40 // seto sil - WORD $0x0148; BYTE $0xc1 // add rcx, rax - WORD $0x920f; BYTE $0xd2 // setb dl - LONG $0x07488d49 // lea rcx, [r8 + 7] - WORD $0x0148; BYTE $0xc1 // add rcx, rax - LONG $0xd5920f41 // setb r13b - LONG $0x05488d49 // lea rcx, [r8 + 5] - WORD $0x0148; BYTE $0xc1 // add 
rcx, rax - LONG $0xd1920f41 // setb r9b - LONG $0x04488d49 // lea rcx, [r8 + 4] - WORD $0x0148; BYTE $0xc1 // add rcx, rax - LONG $0xd7920f41 // setb r15b - LONG $0x03488d49 // lea rcx, [r8 + 3] - WORD $0x0148; BYTE $0xc1 // add rcx, rax - LONG $0xd3920f41 // setb r11b - LONG $0x02488d49 // lea rcx, [r8 + 2] - WORD $0x0148; BYTE $0xc1 // add rcx, rax - LONG $0xd2920f41 // setb r10b - LONG $0x01488d49 // lea rcx, [r8 + 1] - WORD $0x0148; BYTE $0xc1 // add rcx, rax - WORD $0x920f; BYTE $0xd1 // setb cl - WORD $0x014c; BYTE $0xc0 // add rax, r8 - WORD $0x920f; BYTE $0xd0 // setb al - WORD $0x3145; BYTE $0xe4 // xor r12d, r12d - WORD $0x8548; BYTE $0xdb // test rbx, rbx - JNE LBB0_1052 - WORD $0x8445; BYTE $0xf6 // test r14b, r14b - JNE LBB0_1052 - WORD $0xd284 // test dl, dl - JNE LBB0_1052 - WORD $0x8440; BYTE $0xf6 // test sil, sil - JNE LBB0_1052 - WORD $0x8445; BYTE $0xed // test r13b, r13b - JNE LBB0_1052 - WORD $0x8440; BYTE $0xf6 // test sil, sil - JNE LBB0_1052 - WORD $0x8445; BYTE $0xc9 // test r9b, r9b - JNE LBB0_1052 - WORD $0x8440; BYTE $0xf6 // test sil, sil - JNE LBB0_1052 - WORD $0x8445; BYTE $0xff // test r15b, r15b - JNE LBB0_1052 - WORD $0x8440; BYTE $0xf6 // test sil, sil - JNE LBB0_1052 - WORD $0x8445; BYTE $0xdb // test r11b, r11b - JNE LBB0_1052 - WORD $0x8440; BYTE $0xf6 // test sil, sil - JNE LBB0_1052 - WORD $0x8445; BYTE $0xd2 // test r10b, r10b - JNE LBB0_1052 - WORD $0x8440; BYTE $0xf6 // test sil, sil - LONG $0x24548b4c; BYTE $0x30 // mov r10, qword [rsp + 48] - JNE LBB0_1054 - WORD $0xc984 // test cl, cl - JNE LBB0_1054 - WORD $0x8440; BYTE $0xf6 // test sil, sil - LONG $0x244c8b44; BYTE $0x10 // mov r9d, dword [rsp + 16] - JNE LBB0_1055 - WORD $0xc084 // test al, al - JNE LBB0_1055 - WORD $0x8440; BYTE $0xf6 // test sil, sil - JNE LBB0_1055 - LONG $0xd0048d4b // lea rax, [r8 + 8*r10] - WORD $0x3948; BYTE $0xf8 // cmp rax, rdi - JBE LBB0_24 - LONG $0x17048d4a // lea rax, [rdi + r10] - WORD $0x394c; BYTE $0xc0 // cmp rax, r8 - JA LBB0_2 - 
-LBB0_24: - WORD $0x8945; BYTE $0xd4 // mov r12d, r10d - LONG $0xe0e48341 // and r12d, -32 - LONG $0x6e79c1c4; BYTE $0xc1 // vmovd xmm0, r9d - LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 - LONG $0x4d6f7dc5; BYTE $0x00 // vmovdqa ymm9, yword 0[rbp] /* [rip + .LCPI0_0] */ - LONG $0x456f7dc5; BYTE $0x20 // vmovdqa ymm8, yword 32[rbp] /* [rip + .LCPI0_1] */ - LONG $0x5d6ffdc5; BYTE $0x40 // vmovdqa ymm3, yword 64[rbp] /* [rip + .LCPI0_2] */ - LONG $0x556ffdc5; BYTE $0x60 // vmovdqa ymm2, yword 96[rbp] /* [rip + .LCPI0_3] */ - WORD $0x3145; BYTE $0xdb // xor r11d, r11d - QUAD $0x0000a08d197de2c4; BYTE $0x00 // vbroadcastsd ymm1, qword 160[rbp] /* [rip + .LCPI0_5] */ - QUAD $0x000300248c29fcc5; BYTE $0x00 // vmovaps yword [rsp + 768], ymm1 - QUAD $0x0000a88d197de2c4; BYTE $0x00 // vbroadcastsd ymm1, qword 168[rbp] /* [rip + .LCPI0_6] */ - QUAD $0x0002e0248c29fcc5; BYTE $0x00 // vmovaps yword [rsp + 736], ymm1 - QUAD $0x0000b08d197de2c4; BYTE $0x00 // vbroadcastsd ymm1, qword 176[rbp] /* [rip + .LCPI0_7] */ - QUAD $0x0002c0248c29fcc5; BYTE $0x00 // vmovaps yword [rsp + 704], ymm1 - QUAD $0x0000b88d197de2c4; BYTE $0x00 // vbroadcastsd ymm1, qword 184[rbp] /* [rip + .LCPI0_8] */ - QUAD $0x0002a0248c29fcc5; BYTE $0x00 // vmovaps yword [rsp + 672], ymm1 - QUAD $0x0000c08d197de2c4; BYTE $0x00 // vbroadcastsd ymm1, qword 192[rbp] /* [rip + .LCPI0_9] */ - QUAD $0x000280248c29fcc5; BYTE $0x00 // vmovaps yword [rsp + 640], ymm1 - QUAD $0x0000c88d197de2c4; BYTE $0x00 // vbroadcastsd ymm1, qword 200[rbp] /* [rip + .LCPI0_10] */ - QUAD $0x000260248c29fcc5; BYTE $0x00 // vmovaps yword [rsp + 608], ymm1 - QUAD $0x0000d08d197de2c4; BYTE $0x00 // vbroadcastsd ymm1, qword 208[rbp] /* [rip + .LCPI0_11] */ - QUAD $0x000240248c29fcc5; BYTE $0x00 // vmovaps yword [rsp + 576], ymm1 - QUAD $0x0000d88d587de2c4; BYTE $0x00 // vpbroadcastd ymm1, dword 216[rbp] /* [rip + .LCPI0_12] */ - QUAD $0x000220248c7ffdc5; BYTE $0x00 // vmovdqa yword [rsp + 544], ymm1 - JMP LBB0_26 - -LBB0_25: - 
LONG $0x20c38349 // add r11, 32 - QUAD $0x000220248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 544] - LONG $0xd1feedc5 // vpaddd ymm2, ymm2, ymm1 - LONG $0xd9fee5c5 // vpaddd ymm3, ymm3, ymm1 - LONG $0xc1fe3dc5 // vpaddd ymm8, ymm8, ymm1 - LONG $0xc9fe35c5 // vpaddd ymm9, ymm9, ymm1 - WORD $0x394d; BYTE $0xe3 // cmp r11, r12 - JE LBB0_1050 - -LBB0_26: - QUAD $0x00032024947ffdc5; BYTE $0x00 // vmovdqa yword [rsp + 800], ymm2 - LONG $0xf272f5c5; BYTE $0x03 // vpslld ymm1, ymm2, 3 - LONG $0xd166f9c5 // vpcmpgtd xmm2, xmm0, xmm1 - LONG $0xd17ef9c5 // vmovd ecx, xmm2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_28 - LONG $0x787da2c4; WORD $0x1f24 // vpbroadcastb ymm4, byte [rdi + r11] - -LBB0_28: - WORD $0x894d; BYTE $0xda // mov r10, r11 - LONG $0x01ca8349 // or r10, 1 - LONG $0xd166f9c5 // vpcmpgtd xmm2, xmm0, xmm1 - LONG $0xd26be9c5 // vpackssdw xmm2, xmm2, xmm2 - LONG $0xd263e9c5 // vpacksswb xmm2, xmm2, xmm2 - LONG $0x1479e3c4; WORD $0x01d1 // vpextrb ecx, xmm2, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_30 - LONG $0x2059a3c4; WORD $0x1714; BYTE $0x01 // vpinsrb xmm2, xmm4, byte [rdi + r10], 1 - LONG $0x025de3c4; WORD $0x0fe2 // vpblendd ymm4, ymm4, ymm2, 15 - -LBB0_30: - WORD $0x894d; BYTE $0xde // mov r14, r11 - LONG $0x02ce8349 // or r14, 2 - LONG $0xd166f9c5 // vpcmpgtd xmm2, xmm0, xmm1 - LONG $0xd26be9c5 // vpackssdw xmm2, xmm2, xmm2 - LONG $0xd263e9c5 // vpacksswb xmm2, xmm2, xmm2 - LONG $0x1479e3c4; WORD $0x02d1 // vpextrb ecx, xmm2, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_32 - LONG $0x2059a3c4; WORD $0x3714; BYTE $0x02 // vpinsrb xmm2, xmm4, byte [rdi + r14], 2 - LONG $0x025de3c4; WORD $0x0fe2 // vpblendd ymm4, ymm4, ymm2, 15 - -LBB0_32: - LONG $0x397dc3c4; WORD $0x01cd // vextracti128 xmm13, ymm1, 1 - WORD $0x894c; BYTE $0xda // mov rdx, r11 - LONG $0x03ca8348 // or rdx, 3 - LONG $0xd166f9c5 // vpcmpgtd xmm2, xmm0, xmm1 - LONG $0xd26be9c5 // vpackssdw xmm2, xmm2, xmm2 - LONG $0xd263e9c5 // vpacksswb xmm2, xmm2, xmm2 - 
LONG $0x1479e3c4; WORD $0x03d1 // vpextrb ecx, xmm2, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_34 - LONG $0x2059e3c4; WORD $0x1714; BYTE $0x03 // vpinsrb xmm2, xmm4, byte [rdi + rdx], 3 - LONG $0x025de3c4; WORD $0x0fe2 // vpblendd ymm4, ymm4, ymm2, 15 - -LBB0_34: - WORD $0x894c; BYTE $0xd9 // mov rcx, r11 - LONG $0x04c98348 // or rcx, 4 - LONG $0x397de3c4; WORD $0x01c7 // vextracti128 xmm7, ymm0, 1 - LONG $0x6641c1c4; BYTE $0xd5 // vpcmpgtd xmm2, xmm7, xmm13 - LONG $0x1479c3c4; WORD $0x00d1 // vpextrb r9d, xmm2, 0 - LONG $0x01c1f641 // test r9b, 1 - QUAD $0x0000011024948948 // mov qword [rsp + 272], rdx - QUAD $0x00000108248c8948 // mov qword [rsp + 264], rcx - JE LBB0_36 - LONG $0x2059e3c4; WORD $0x0f14; BYTE $0x04 // vpinsrb xmm2, xmm4, byte [rdi + rcx], 4 - LONG $0x025de3c4; WORD $0x0fe2 // vpblendd ymm4, ymm4, ymm2, 15 - -LBB0_36: - WORD $0x894d; BYTE $0xdf // mov r15, r11 - LONG $0x05cf8349 // or r15, 5 - LONG $0xf166fdc5 // vpcmpgtd ymm6, ymm0, ymm1 - LONG $0xd06bcdc5 // vpackssdw ymm2, ymm6, ymm0 - LONG $0x397de3c4; WORD $0x01d2 // vextracti128 xmm2, ymm2, 1 - LONG $0x5879e2c4; BYTE $0xd2 // vpbroadcastd xmm2, xmm2 - LONG $0xd263e9c5 // vpacksswb xmm2, xmm2, xmm2 - LONG $0x1479e3c4; WORD $0x05d1 // vpextrb ecx, xmm2, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_38 - LONG $0x2059a3c4; WORD $0x3f14; BYTE $0x05 // vpinsrb xmm2, xmm4, byte [rdi + r15], 5 - LONG $0x025de3c4; WORD $0x0fe2 // vpblendd ymm4, ymm4, ymm2, 15 - -LBB0_38: - WORD $0x894c; BYTE $0xdb // mov rbx, r11 - LONG $0x06cb8348 // or rbx, 6 - LONG $0xd06bcdc5 // vpackssdw ymm2, ymm6, ymm0 - LONG $0x00fde3c4; WORD $0xe8d2 // vpermq ymm2, ymm2, 232 - LONG $0xd263e9c5 // vpacksswb xmm2, xmm2, xmm2 - LONG $0x1479e3c4; WORD $0x06d1 // vpextrb ecx, xmm2, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_40 - LONG $0x2059e3c4; WORD $0x1f14; BYTE $0x06 // vpinsrb xmm2, xmm4, byte [rdi + rbx], 6 - LONG $0x025de3c4; WORD $0x0fe2 // vpblendd ymm4, ymm4, ymm2, 15 - -LBB0_40: - LONG 
$0xf372edc5; BYTE $0x03 // vpslld ymm2, ymm3, 3 - WORD $0x894c; BYTE $0xd8 // mov rax, r11 - LONG $0x07c88348 // or rax, 7 - LONG $0xe86bcdc5 // vpackssdw ymm5, ymm6, ymm0 - LONG $0x00fde3c4; WORD $0xe8ed // vpermq ymm5, ymm5, 232 - LONG $0xed63d1c5 // vpacksswb xmm5, xmm5, xmm5 - LONG $0x1479e3c4; WORD $0x07e9 // vpextrb ecx, xmm5, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_42 - LONG $0x2059e3c4; WORD $0x072c; BYTE $0x07 // vpinsrb xmm5, xmm4, byte [rdi + rax], 7 - LONG $0x025de3c4; WORD $0x0fe5 // vpblendd ymm4, ymm4, ymm5, 15 - -LBB0_42: - WORD $0x894c; BYTE $0xde // mov rsi, r11 - LONG $0x08ce8348 // or rsi, 8 - LONG $0xea66f9c5 // vpcmpgtd xmm5, xmm0, xmm2 - LONG $0x1479e3c4; WORD $0x00e9 // vpextrb ecx, xmm5, 0 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_44 - LONG $0x2059e3c4; WORD $0x372c; BYTE $0x08 // vpinsrb xmm5, xmm4, byte [rdi + rsi], 8 - LONG $0x025de3c4; WORD $0x0fe5 // vpblendd ymm4, ymm4, ymm5, 15 - -LBB0_44: - WORD $0x894c; BYTE $0xda // mov rdx, r11 - LONG $0x09ca8348 // or rdx, 9 - LONG $0xea66f9c5 // vpcmpgtd xmm5, xmm0, xmm2 - LONG $0xed6bd1c5 // vpackssdw xmm5, xmm5, xmm5 - LONG $0xed63d1c5 // vpacksswb xmm5, xmm5, xmm5 - LONG $0x1479e3c4; WORD $0x09e9 // vpextrb ecx, xmm5, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x000000e024948948 // mov qword [rsp + 224], rdx - JE LBB0_46 - LONG $0x2059e3c4; WORD $0x172c; BYTE $0x09 // vpinsrb xmm5, xmm4, byte [rdi + rdx], 9 - LONG $0x025de3c4; WORD $0x0fe5 // vpblendd ymm4, ymm4, ymm5, 15 - -LBB0_46: - WORD $0x894c; BYTE $0xda // mov rdx, r11 - LONG $0x0aca8348 // or rdx, 10 - LONG $0xea66f9c5 // vpcmpgtd xmm5, xmm0, xmm2 - LONG $0xed6bd1c5 // vpackssdw xmm5, xmm5, xmm5 - LONG $0xed63d1c5 // vpacksswb xmm5, xmm5, xmm5 - LONG $0x1479e3c4; WORD $0x0ae9 // vpextrb ecx, xmm5, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x000340249c7ffdc5; BYTE $0x00 // vmovdqa yword [rsp + 832], ymm3 - LONG $0x24748948; BYTE $0x60 // mov qword [rsp + 96], rsi - JE LBB0_48 - LONG $0x2059e3c4; 
WORD $0x172c; BYTE $0x0a // vpinsrb xmm5, xmm4, byte [rdi + rdx], 10 - LONG $0x025de3c4; WORD $0x0fe5 // vpblendd ymm4, ymm4, ymm5, 15 - -LBB0_48: - LONG $0x397de3c4; WORD $0x01d5 // vextracti128 xmm5, ymm2, 1 - WORD $0x894c; BYTE $0xde // mov rsi, r11 - LONG $0x0bce8348 // or rsi, 11 - LONG $0xda66f9c5 // vpcmpgtd xmm3, xmm0, xmm2 - LONG $0xdb6be1c5 // vpackssdw xmm3, xmm3, xmm3 - LONG $0xdb63e1c5 // vpacksswb xmm3, xmm3, xmm3 - LONG $0x1479e3c4; WORD $0x0bd9 // vpextrb ecx, xmm3, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x000000982494894c // mov qword [rsp + 152], r10 - QUAD $0x0000012824b4894c // mov qword [rsp + 296], r14 - LONG $0x247c894c; BYTE $0x68 // mov qword [rsp + 104], r15 - QUAD $0x00000120249c8948 // mov qword [rsp + 288], rbx - QUAD $0x000000e824848948 // mov qword [rsp + 232], rax - JE LBB0_50 - LONG $0x2059e3c4; WORD $0x371c; BYTE $0x0b // vpinsrb xmm3, xmm4, byte [rdi + rsi], 11 - LONG $0x025de3c4; WORD $0x0fe3 // vpblendd ymm4, ymm4, ymm3, 15 - -LBB0_50: - WORD $0x894c; BYTE $0xd9 // mov rcx, r11 - LONG $0x0cc98348 // or rcx, 12 - LONG $0xdd66c1c5 // vpcmpgtd xmm3, xmm7, xmm5 - LONG $0x1479c3c4; WORD $0x00de // vpextrb r14d, xmm3, 0 - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000010024b48948 // mov qword [rsp + 256], rsi - QUAD $0x000000f8248c8948 // mov qword [rsp + 248], rcx - JE LBB0_52 - LONG $0x2059e3c4; WORD $0x0f1c; BYTE $0x0c // vpinsrb xmm3, xmm4, byte [rdi + rcx], 12 - LONG $0x025de3c4; WORD $0x0fe3 // vpblendd ymm4, ymm4, ymm3, 15 - -LBB0_52: - WORD $0x894c; BYTE $0xd8 // mov rax, r11 - LONG $0x0dc88348 // or rax, 13 - LONG $0xfa66fdc5 // vpcmpgtd ymm7, ymm0, ymm2 - LONG $0xd86bc5c5 // vpackssdw ymm3, ymm7, ymm0 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x5879e2c4; BYTE $0xdb // vpbroadcastd xmm3, xmm3 - LONG $0xdb63e1c5 // vpacksswb xmm3, xmm3, xmm3 - LONG $0x1479e3c4; WORD $0x0dd9 // vpextrb ecx, xmm3, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_54 - LONG $0x2059e3c4; WORD $0x071c; 
BYTE $0x0d // vpinsrb xmm3, xmm4, byte [rdi + rax], 13 - LONG $0x025de3c4; WORD $0x0fe3 // vpblendd ymm4, ymm4, ymm3, 15 - -LBB0_54: - WORD $0x894c; BYTE $0xdb // mov rbx, r11 - LONG $0x0ecb8348 // or rbx, 14 - LONG $0xd86bc5c5 // vpackssdw ymm3, ymm7, ymm0 - LONG $0x00fde3c4; WORD $0xe8db // vpermq ymm3, ymm3, 232 - LONG $0xdb63e1c5 // vpacksswb xmm3, xmm3, xmm3 - LONG $0x1479e3c4; WORD $0x0ed9 // vpextrb ecx, xmm3, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - LONG $0x245c8948; BYTE $0x50 // mov qword [rsp + 80], rbx - JE LBB0_56 - LONG $0x2059e3c4; WORD $0x1f1c; BYTE $0x0e // vpinsrb xmm3, xmm4, byte [rdi + rbx], 14 - LONG $0x025de3c4; WORD $0x0fe3 // vpblendd ymm4, ymm4, ymm3, 15 - -LBB0_56: - LONG $0x722dc1c4; WORD $0x03f0 // vpslld ymm10, ymm8, 3 - WORD $0x894c; BYTE $0xde // mov rsi, r11 - LONG $0x0fce8348 // or rsi, 15 - LONG $0xd86bc5c5 // vpackssdw ymm3, ymm7, ymm0 - LONG $0x00fde3c4; WORD $0xe8db // vpermq ymm3, ymm3, 232 - LONG $0xdb63e1c5 // vpacksswb xmm3, xmm3, xmm3 - LONG $0x1479e3c4; WORD $0x0fd9 // vpextrb ecx, xmm3, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_58 - LONG $0x2059e3c4; WORD $0x371c; BYTE $0x0f // vpinsrb xmm3, xmm4, byte [rdi + rsi], 15 - LONG $0x025de3c4; WORD $0x0fe3 // vpblendd ymm4, ymm4, ymm3, 15 - -LBB0_58: - WORD $0x894d; BYTE $0xdf // mov r15, r11 - LONG $0x10cf8349 // or r15, 16 - LONG $0x6679c1c4; BYTE $0xda // vpcmpgtd xmm3, xmm0, xmm10 - LONG $0xd97ef9c5 // vmovd ecx, xmm3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - LONG $0x247c894c; BYTE $0x40 // mov qword [rsp + 64], r15 - LONG $0x24748948; BYTE $0x48 // mov qword [rsp + 72], rsi - JE LBB0_60 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061a3c4; WORD $0x3f1c; BYTE $0x00 // vpinsrb xmm3, xmm3, byte [rdi + r15], 0 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_60: - WORD $0x894c; BYTE $0xde // mov rsi, r11 - LONG $0x11ce8348 // or rsi, 17 - LONG $0x6679c1c4; BYTE $0xda // vpcmpgtd xmm3, xmm0, xmm10 - LONG 
$0xdb6be1c5 // vpackssdw xmm3, xmm3, xmm3 - LONG $0x00fde3c4; WORD $0xd4db // vpermq ymm3, ymm3, 212 - LONG $0xd863e5c5 // vpacksswb ymm3, ymm3, ymm0 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x01d9 // vpextrb ecx, xmm3, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_62 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061e3c4; WORD $0x371c; BYTE $0x01 // vpinsrb xmm3, xmm3, byte [rdi + rsi], 1 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_62: - WORD $0x894c; BYTE $0xdb // mov rbx, r11 - LONG $0x12cb8348 // or rbx, 18 - LONG $0x6679c1c4; BYTE $0xda // vpcmpgtd xmm3, xmm0, xmm10 - LONG $0xdb6be1c5 // vpackssdw xmm3, xmm3, xmm3 - LONG $0x00fde3c4; WORD $0xd4db // vpermq ymm3, ymm3, 212 - LONG $0xd863e5c5 // vpacksswb ymm3, ymm3, ymm0 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x02d9 // vpextrb ecx, xmm3, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_64 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061e3c4; WORD $0x1f1c; BYTE $0x02 // vpinsrb xmm3, xmm3, byte [rdi + rbx], 2 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_64: - WORD $0x894d; BYTE $0xdf // mov r15, r11 - LONG $0x13cf8349 // or r15, 19 - LONG $0x6679c1c4; BYTE $0xda // vpcmpgtd xmm3, xmm0, xmm10 - LONG $0xdb6be1c5 // vpackssdw xmm3, xmm3, xmm3 - LONG $0x00fde3c4; WORD $0xd4db // vpermq ymm3, ymm3, 212 - LONG $0xd863e5c5 // vpacksswb ymm3, ymm3, ymm0 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x03d9 // vpextrb ecx, xmm3, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x00036024847f7dc5; BYTE $0x00 // vmovdqa yword [rsp + 864], ymm8 - JE LBB0_66 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061a3c4; WORD $0x3f1c; BYTE $0x03 // vpinsrb xmm3, xmm3, byte [rdi + r15], 3 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 
ymm4, ymm4, xmm3, 1 - -LBB0_66: - WORD $0x894d; BYTE $0xdd // mov r13, r11 - LONG $0x14cd8349 // or r13, 20 - LONG $0x667d41c4; BYTE $0xc2 // vpcmpgtd ymm8, ymm0, ymm10 - LONG $0x6b7dc1c4; BYTE $0xd8 // vpackssdw ymm3, ymm0, ymm8 - LONG $0xd863e5c5 // vpacksswb ymm3, ymm3, ymm0 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x04d9 // vpextrb ecx, xmm3, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - LONG $0x246c894c; BYTE $0x38 // mov qword [rsp + 56], r13 - JE LBB0_68 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061a3c4; WORD $0x2f1c; BYTE $0x04 // vpinsrb xmm3, xmm3, byte [rdi + r13], 4 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_68: - WORD $0x894d; BYTE $0xdd // mov r13, r11 - LONG $0x15cd8349 // or r13, 21 - LONG $0x6b7dc1c4; BYTE $0xd8 // vpackssdw ymm3, ymm0, ymm8 - LONG $0xd863e5c5 // vpacksswb ymm3, ymm3, ymm0 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x05d9 // vpextrb ecx, xmm3, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x00000080249c8948 // mov qword [rsp + 128], rbx - JE LBB0_70 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061a3c4; WORD $0x2f1c; BYTE $0x05 // vpinsrb xmm3, xmm3, byte [rdi + r13], 5 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_70: - WORD $0x894d; BYTE $0xda // mov r10, r11 - LONG $0x16ca8349 // or r10, 22 - LONG $0x6b7dc1c4; BYTE $0xd8 // vpackssdw ymm3, ymm0, ymm8 - LONG $0xd863e5c5 // vpacksswb ymm3, ymm3, ymm0 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x06d9 // vpextrb ecx, xmm3, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_72 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061a3c4; WORD $0x171c; BYTE $0x06 // vpinsrb xmm3, xmm3, byte [rdi + r10], 6 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_72: - LONG 
$0x7225c1c4; WORD $0x03f1 // vpslld ymm11, ymm9, 3 - WORD $0x894c; BYTE $0xdb // mov rbx, r11 - LONG $0x17cb8348 // or rbx, 23 - LONG $0x6b7dc1c4; BYTE $0xd8 // vpackssdw ymm3, ymm0, ymm8 - LONG $0xd863e5c5 // vpacksswb ymm3, ymm3, ymm0 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x07d9 // vpextrb ecx, xmm3, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x000000f0249c8948 // mov qword [rsp + 240], rbx - QUAD $0x000380248c7f7dc5; BYTE $0x00 // vmovdqa yword [rsp + 896], ymm9 - JE LBB0_74 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061e3c4; WORD $0x1f1c; BYTE $0x07 // vpinsrb xmm3, xmm3, byte [rdi + rbx], 7 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_74: - WORD $0x894c; BYTE $0xdb // mov rbx, r11 - LONG $0x18cb8348 // or rbx, 24 - LONG $0x667d41c4; BYTE $0xcb // vpcmpgtd ymm9, ymm0, ymm11 - LONG $0x00fd43c4; WORD $0x44e1 // vpermq ymm12, ymm9, 68 - LONG $0x637dc1c4; BYTE $0xdc // vpacksswb ymm3, ymm0, ymm12 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x08d9 // vpextrb ecx, xmm3, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x000000d8249c8948 // mov qword [rsp + 216], rbx - JE LBB0_76 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061e3c4; WORD $0x1f1c; BYTE $0x08 // vpinsrb xmm3, xmm3, byte [rdi + rbx], 8 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_76: - WORD $0x894c; BYTE $0xdb // mov rbx, r11 - LONG $0x19cb8348 // or rbx, 25 - LONG $0x6679c1c4; BYTE $0xdb // vpcmpgtd xmm3, xmm0, xmm11 - LONG $0xdb6be1c5 // vpackssdw xmm3, xmm3, xmm3 - LONG $0x00fde3c4; WORD $0xd4db // vpermq ymm3, ymm3, 212 - LONG $0xdb63fdc5 // vpacksswb ymm3, ymm0, ymm3 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x09d9 // vpextrb ecx, xmm3, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x000000d0249c8948 // mov qword [rsp + 
208], rbx - JE LBB0_78 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061e3c4; WORD $0x1f1c; BYTE $0x09 // vpinsrb xmm3, xmm3, byte [rdi + rbx], 9 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_78: - WORD $0x894c; BYTE $0xdb // mov rbx, r11 - LONG $0x1acb8348 // or rbx, 26 - LONG $0x6679c1c4; BYTE $0xdb // vpcmpgtd xmm3, xmm0, xmm11 - LONG $0xdb6be1c5 // vpackssdw xmm3, xmm3, xmm3 - LONG $0x00fde3c4; WORD $0xd4db // vpermq ymm3, ymm3, 212 - LONG $0xdb63fdc5 // vpacksswb ymm3, ymm0, ymm3 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x0ad9 // vpextrb ecx, xmm3, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x000000c8249c8948 // mov qword [rsp + 200], rbx - JE LBB0_80 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061e3c4; WORD $0x1f1c; BYTE $0x0a // vpinsrb xmm3, xmm3, byte [rdi + rbx], 10 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_80: - WORD $0x894c; BYTE $0xdb // mov rbx, r11 - LONG $0x1bcb8348 // or rbx, 27 - LONG $0x6679c1c4; BYTE $0xdb // vpcmpgtd xmm3, xmm0, xmm11 - LONG $0xdb6be1c5 // vpackssdw xmm3, xmm3, xmm3 - LONG $0x00fde3c4; WORD $0xd4db // vpermq ymm3, ymm3, 212 - LONG $0xdb63fdc5 // vpacksswb ymm3, ymm0, ymm3 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x0bd9 // vpextrb ecx, xmm3, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x000000c0249c8948 // mov qword [rsp + 192], rbx - QUAD $0x0000009024948948 // mov qword [rsp + 144], rdx - LONG $0x24448948; BYTE $0x58 // mov qword [rsp + 88], rax - JE LBB0_82 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061e3c4; WORD $0x1f1c; BYTE $0x0b // vpinsrb xmm3, xmm3, byte [rdi + rbx], 11 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_82: - WORD $0x894c; BYTE $0xda // mov rdx, r11 - LONG $0x1cca8348 // or rdx, 28 - LONG $0x6b7dc1c4; BYTE $0xd9 // 
vpackssdw ymm3, ymm0, ymm9 - LONG $0xdb63fdc5 // vpacksswb ymm3, ymm0, ymm3 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x0cd9 // vpextrb ecx, xmm3, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_84 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061e3c4; WORD $0x171c; BYTE $0x0c // vpinsrb xmm3, xmm3, byte [rdi + rdx], 12 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_84: - WORD $0x894c; BYTE $0xdb // mov rbx, r11 - LONG $0x1dcb8348 // or rbx, 29 - LONG $0x6b7dc1c4; BYTE $0xd9 // vpackssdw ymm3, ymm0, ymm9 - LONG $0xdb63fdc5 // vpacksswb ymm3, ymm0, ymm3 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x0dd9 // vpextrb ecx, xmm3, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x000000b0249c8948 // mov qword [rsp + 176], rbx - JE LBB0_86 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061e3c4; WORD $0x1f1c; BYTE $0x0d // vpinsrb xmm3, xmm3, byte [rdi + rbx], 13 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_86: - WORD $0x894c; BYTE $0xdb // mov rbx, r11 - LONG $0x1ecb8348 // or rbx, 30 - LONG $0x6b7dc1c4; BYTE $0xd9 // vpackssdw ymm3, ymm0, ymm9 - LONG $0xdb63fdc5 // vpacksswb ymm3, ymm0, ymm3 - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x0ed9 // vpextrb ecx, xmm3, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x000000a8249c8948 // mov qword [rsp + 168], rbx - JE LBB0_88 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061e3c4; WORD $0x1f1c; BYTE $0x0e // vpinsrb xmm3, xmm3, byte [rdi + rbx], 14 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_88: - WORD $0x894c; BYTE $0xdb // mov rbx, r11 - LONG $0x1fcb8348 // or rbx, 31 - LONG $0x6b7dc1c4; BYTE $0xd9 // vpackssdw ymm3, ymm0, ymm9 - LONG $0xdb63fdc5 // vpacksswb ymm3, ymm0, ymm3 - LONG $0x397de3c4; WORD 
$0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x1479e3c4; WORD $0x0fd9 // vpextrb ecx, xmm3, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x000000a0249c8948 // mov qword [rsp + 160], rbx - JE LBB0_90 - LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 - LONG $0x2061e3c4; WORD $0x1f1c; BYTE $0x0f // vpinsrb xmm3, xmm3, byte [rdi + rbx], 15 - LONG $0x385de3c4; WORD $0x01e3 // vinserti128 ymm4, ymm4, xmm3, 1 - -LBB0_90: - LONG $0x357de2c4; BYTE $0xd9 // vpmovzxdq ymm3, xmm1 - QUAD $0x000200249c7ffdc5; BYTE $0x00 // vmovdqa yword [rsp + 512], ymm3 - QUAD $0x00000080bddb5dc5 // vpand ymm15, ymm4, yword 128[rbp] /* [rip + .LCPI0_4] */ - LONG $0xd966f9c5 // vpcmpgtd xmm3, xmm0, xmm1 - LONG $0xd97ef9c5 // vmovd ecx, xmm3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_92 - QUAD $0x000200249c6ffdc5; BYTE $0x00 // vmovdqa ymm3, yword [rsp + 512] - LONG $0x7ef9e1c4; BYTE $0xd9 // vmovq rcx, xmm3 - LONG $0x147943c4; WORD $0x083c; BYTE $0x00 // vpextrb byte [r8 + rcx], xmm15, 0 - -LBB0_92: - LONG $0xd966f9c5 // vpcmpgtd xmm3, xmm0, xmm1 - LONG $0xdb6be1c5 // vpackssdw xmm3, xmm3, xmm3 - LONG $0xdb63e1c5 // vpacksswb xmm3, xmm3, xmm3 - LONG $0x1479e3c4; WORD $0x01d9 // vpextrb ecx, xmm3, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_94 - QUAD $0x000200249c6ffdc5; BYTE $0x00 // vmovdqa ymm3, yword [rsp + 512] - LONG $0x16f9e3c4; WORD $0x01d9 // vpextrq rcx, xmm3, 1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x01 // vpextrb byte [r8 + rcx], xmm15, 1 - -LBB0_94: - LONG $0xd966f9c5 // vpcmpgtd xmm3, xmm0, xmm1 - LONG $0xdb6be1c5 // vpackssdw xmm3, xmm3, xmm3 - LONG $0xdb63e1c5 // vpacksswb xmm3, xmm3, xmm3 - LONG $0x1479e3c4; WORD $0x02d9 // vpextrb ecx, xmm3, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_96 - QUAD $0x000200249c6ffdc5; BYTE $0x00 // vmovdqa ymm3, yword [rsp + 512] - LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 - LONG $0x7ef9e1c4; BYTE $0xd9 // vmovq rcx, xmm3 - LONG $0x147943c4; WORD $0x083c; BYTE $0x02 // vpextrb byte 
[r8 + rcx], xmm15, 2 - -LBB0_96: - LONG $0xc966f9c5 // vpcmpgtd xmm1, xmm0, xmm1 - LONG $0xc96bf1c5 // vpackssdw xmm1, xmm1, xmm1 - LONG $0xc963f1c5 // vpacksswb xmm1, xmm1, xmm1 - LONG $0x1479e3c4; WORD $0x03c9 // vpextrb ecx, xmm1, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_98 - QUAD $0x000200248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 512] - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x03 // vpextrb byte [r8 + rcx], xmm15, 3 - -LBB0_98: - LONG $0x357dc2c4; BYTE $0xcd // vpmovzxdq ymm1, xmm13 - QUAD $0x0001e0248c7ffdc5; BYTE $0x00 // vmovdqa yword [rsp + 480], ymm1 - LONG $0x01c1f641 // test r9b, 1 - JE LBB0_100 - QUAD $0x0001e0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 480] - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x04 // vpextrb byte [r8 + rcx], xmm15, 4 - -LBB0_100: - LONG $0xc86bcdc5 // vpackssdw ymm1, ymm6, ymm0 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x5879e2c4; BYTE $0xc9 // vpbroadcastd xmm1, xmm1 - LONG $0xc963f1c5 // vpacksswb xmm1, xmm1, xmm1 - LONG $0x1479e3c4; WORD $0x05c9 // vpextrb ecx, xmm1, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_102 - QUAD $0x0001e0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 480] - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x05 // vpextrb byte [r8 + rcx], xmm15, 5 - -LBB0_102: - LONG $0xc86bcdc5 // vpackssdw ymm1, ymm6, ymm0 - LONG $0x00fde3c4; WORD $0xe8c9 // vpermq ymm1, ymm1, 232 - LONG $0xc963f1c5 // vpacksswb xmm1, xmm1, xmm1 - LONG $0x1479e3c4; WORD $0x06c9 // vpextrb ecx, xmm1, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_104 - QUAD $0x0001e0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 480] - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG 
$0x147943c4; WORD $0x083c; BYTE $0x06 // vpextrb byte [r8 + rcx], xmm15, 6 - -LBB0_104: - LONG $0xc86bcdc5 // vpackssdw ymm1, ymm6, ymm0 - LONG $0x00fde3c4; WORD $0xe8c9 // vpermq ymm1, ymm1, 232 - LONG $0xc963f1c5 // vpacksswb xmm1, xmm1, xmm1 - LONG $0x1479e3c4; WORD $0x07c9 // vpextrb ecx, xmm1, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_106 - QUAD $0x0001e0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 480] - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x07 // vpextrb byte [r8 + rcx], xmm15, 7 - -LBB0_106: - LONG $0x357de2c4; BYTE $0xca // vpmovzxdq ymm1, xmm2 - QUAD $0x0001c0248c7ffdc5; BYTE $0x00 // vmovdqa yword [rsp + 448], ymm1 - LONG $0xca66f9c5 // vpcmpgtd xmm1, xmm0, xmm2 - LONG $0x1479e3c4; WORD $0x00c9 // vpextrb ecx, xmm1, 0 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_108 - QUAD $0x0001c0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 448] - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x08 // vpextrb byte [r8 + rcx], xmm15, 8 - -LBB0_108: - LONG $0xca66f9c5 // vpcmpgtd xmm1, xmm0, xmm2 - LONG $0xc96bf1c5 // vpackssdw xmm1, xmm1, xmm1 - LONG $0xc963f1c5 // vpacksswb xmm1, xmm1, xmm1 - LONG $0x1479e3c4; WORD $0x09c9 // vpextrb ecx, xmm1, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_110 - QUAD $0x0001c0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 448] - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x09 // vpextrb byte [r8 + rcx], xmm15, 9 - -LBB0_110: - LONG $0xca66f9c5 // vpcmpgtd xmm1, xmm0, xmm2 - LONG $0xc96bf1c5 // vpackssdw xmm1, xmm1, xmm1 - LONG $0xc963f1c5 // vpacksswb xmm1, xmm1, xmm1 - LONG $0x1479e3c4; WORD $0x0ac9 // vpextrb ecx, xmm1, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_112 - QUAD $0x0001c0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 448] - LONG $0x397de3c4; WORD 
$0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x0a // vpextrb byte [r8 + rcx], xmm15, 10 - -LBB0_112: - LONG $0xca66f9c5 // vpcmpgtd xmm1, xmm0, xmm2 - LONG $0xc96bf1c5 // vpackssdw xmm1, xmm1, xmm1 - LONG $0xc963f1c5 // vpacksswb xmm1, xmm1, xmm1 - LONG $0x1479e3c4; WORD $0x0bc9 // vpextrb ecx, xmm1, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_114 - QUAD $0x0001c0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 448] - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x0b // vpextrb byte [r8 + rcx], xmm15, 11 - -LBB0_114: - QUAD $0x0000008824b48948 // mov qword [rsp + 136], rsi - LONG $0x357de2c4; BYTE $0xcd // vpmovzxdq ymm1, xmm5 - QUAD $0x0001a0248c7ffdc5; BYTE $0x00 // vmovdqa yword [rsp + 416], ymm1 - LONG $0x01c6f641 // test r14b, 1 - JE LBB0_116 - QUAD $0x0001a0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 416] - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x0c // vpextrb byte [r8 + rcx], xmm15, 12 - -LBB0_116: - LONG $0xc86bc5c5 // vpackssdw ymm1, ymm7, ymm0 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x5879e2c4; BYTE $0xc9 // vpbroadcastd xmm1, xmm1 - LONG $0xc963f1c5 // vpacksswb xmm1, xmm1, xmm1 - LONG $0x1479e3c4; WORD $0x0dc9 // vpextrb ecx, xmm1, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - QUAD $0x00000098248c8b4c // mov r9, qword [rsp + 152] - QUAD $0x0000012824b48b48 // mov rsi, qword [rsp + 296] - LONG $0x24748b4c; BYTE $0x68 // mov r14, qword [rsp + 104] - QUAD $0x0000012024848b48 // mov rax, qword [rsp + 288] - JE LBB0_118 - QUAD $0x0001a0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 416] - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x0d // vpextrb byte [r8 + rcx], xmm15, 13 - -LBB0_118: - LONG $0xc86bc5c5 // 
vpackssdw ymm1, ymm7, ymm0 - LONG $0x00fde3c4; WORD $0xe8c9 // vpermq ymm1, ymm1, 232 - LONG $0xc963f1c5 // vpacksswb xmm1, xmm1, xmm1 - LONG $0x1479e3c4; WORD $0x0ec9 // vpextrb ecx, xmm1, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_120 - QUAD $0x0001a0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 416] - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x0e // vpextrb byte [r8 + rcx], xmm15, 14 - -LBB0_120: - LONG $0xc86bc5c5 // vpackssdw ymm1, ymm7, ymm0 - LONG $0x00fde3c4; WORD $0xe8c9 // vpermq ymm1, ymm1, 232 - LONG $0xc963f1c5 // vpacksswb xmm1, xmm1, xmm1 - LONG $0x1479e3c4; WORD $0x0fc9 // vpextrb ecx, xmm1, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_122 - QUAD $0x0001a0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 416] - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x147943c4; WORD $0x083c; BYTE $0x0f // vpextrb byte [r8 + rcx], xmm15, 15 - -LBB0_122: - LONG $0x357dc2c4; BYTE $0xca // vpmovzxdq ymm1, xmm10 - QUAD $0x000180248c7ffdc5; BYTE $0x00 // vmovdqa yword [rsp + 384], ymm1 - LONG $0x6679c1c4; BYTE $0xca // vpcmpgtd xmm1, xmm0, xmm10 - LONG $0xc97ef9c5 // vmovd ecx, xmm1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_124 - QUAD $0x000180248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 384] - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x00 // vpextrb byte [r8 + rcx], xmm1, 0 - -LBB0_124: - LONG $0x6679c1c4; BYTE $0xca // vpcmpgtd xmm1, xmm0, xmm10 - LONG $0xc96bf1c5 // vpackssdw xmm1, xmm1, xmm1 - LONG $0x00fde3c4; WORD $0xd4c9 // vpermq ymm1, ymm1, 212 - LONG $0xc863f5c5 // vpacksswb ymm1, ymm1, ymm0 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x01c9 // vpextrb ecx, xmm1, 1 - 
WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_126 - QUAD $0x000180248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 384] - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x01 // vpextrb byte [r8 + rcx], xmm1, 1 - -LBB0_126: - LONG $0x6679c1c4; BYTE $0xca // vpcmpgtd xmm1, xmm0, xmm10 - LONG $0xc96bf1c5 // vpackssdw xmm1, xmm1, xmm1 - LONG $0x00fde3c4; WORD $0xd4c9 // vpermq ymm1, ymm1, 212 - LONG $0xc863f5c5 // vpacksswb ymm1, ymm1, ymm0 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x02c9 // vpextrb ecx, xmm1, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_128 - QUAD $0x000180248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 384] - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x02 // vpextrb byte [r8 + rcx], xmm1, 2 - -LBB0_128: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x6679c1c4; BYTE $0xd2 // vpcmpgtd xmm2, xmm0, xmm10 - LONG $0xd26be9c5 // vpackssdw xmm2, xmm2, xmm2 - LONG $0x00fde3c4; WORD $0xd4d2 // vpermq ymm2, ymm2, 212 - LONG $0xd063edc5 // vpacksswb ymm2, ymm2, ymm0 - LONG $0x397de3c4; WORD $0x01d2 // vextracti128 xmm2, ymm2, 1 - LONG $0x1479e3c4; WORD $0x03d1 // vpextrb ecx, xmm2, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_130 - QUAD $0x00018024946ffdc5; BYTE $0x00 // vmovdqa ymm2, yword [rsp + 384] - LONG $0x397de3c4; WORD $0x01d2 // vextracti128 xmm2, ymm2, 1 - LONG $0x16f9e3c4; WORD $0x01d1 // vpextrq rcx, xmm2, 1 - LONG $0x397d63c4; WORD $0x01fa // vextracti128 xmm2, ymm15, 1 - LONG $0x1479c3c4; WORD $0x0814; BYTE $0x03 // vpextrb byte [r8 + rcx], xmm2, 3 - -LBB0_130: - LONG $0x357de2c4; BYTE $0xc9 // vpmovzxdq ymm1, xmm1 - QUAD $0x000160248c7ffdc5; BYTE $0x00 // vmovdqa yword 
[rsp + 352], ymm1 - LONG $0x6b7dc1c4; BYTE $0xc8 // vpackssdw ymm1, ymm0, ymm8 - LONG $0xc863f5c5 // vpacksswb ymm1, ymm1, ymm0 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x04c9 // vpextrb ecx, xmm1, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_132 - QUAD $0x000160248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 352] - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x04 // vpextrb byte [r8 + rcx], xmm1, 4 - -LBB0_132: - LONG $0x6b7dc1c4; BYTE $0xc8 // vpackssdw ymm1, ymm0, ymm8 - LONG $0xc863f5c5 // vpacksswb ymm1, ymm1, ymm0 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x05c9 // vpextrb ecx, xmm1, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_134 - QUAD $0x000160248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 352] - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x05 // vpextrb byte [r8 + rcx], xmm1, 5 - -LBB0_134: - LONG $0x6b7dc1c4; BYTE $0xc8 // vpackssdw ymm1, ymm0, ymm8 - LONG $0xc863f5c5 // vpacksswb ymm1, ymm1, ymm0 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x06c9 // vpextrb ecx, xmm1, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_136 - QUAD $0x000160248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 352] - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x06 // vpextrb byte [r8 + rcx], xmm1, 6 - -LBB0_136: - LONG $0x6b7dc1c4; BYTE $0xc8 // vpackssdw ymm1, ymm0, ymm8 - LONG $0xc863f5c5 // vpacksswb ymm1, ymm1, ymm0 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x07c9 // 
vpextrb ecx, xmm1, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_138 - QUAD $0x000160248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 352] - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x07 // vpextrb byte [r8 + rcx], xmm1, 7 - -LBB0_138: - LONG $0x357dc2c4; BYTE $0xcb // vpmovzxdq ymm1, xmm11 - QUAD $0x000140248c7ffdc5; BYTE $0x00 // vmovdqa yword [rsp + 320], ymm1 - LONG $0x637dc1c4; BYTE $0xcc // vpacksswb ymm1, ymm0, ymm12 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x08c9 // vpextrb ecx, xmm1, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_140 - QUAD $0x000140248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 320] - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x08 // vpextrb byte [r8 + rcx], xmm1, 8 - -LBB0_140: - LONG $0x6679c1c4; BYTE $0xcb // vpcmpgtd xmm1, xmm0, xmm11 - LONG $0xc96bf1c5 // vpackssdw xmm1, xmm1, xmm1 - LONG $0x00fde3c4; WORD $0xd4c9 // vpermq ymm1, ymm1, 212 - LONG $0xc963fdc5 // vpacksswb ymm1, ymm0, ymm1 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x09c9 // vpextrb ecx, xmm1, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_142 - QUAD $0x000140248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 320] - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x09 // vpextrb byte [r8 + rcx], xmm1, 9 - -LBB0_142: - LONG $0x6679c1c4; BYTE $0xcb // vpcmpgtd xmm1, xmm0, xmm11 - LONG $0xc96bf1c5 // vpackssdw xmm1, xmm1, xmm1 - LONG $0x00fde3c4; WORD $0xd4c9 // vpermq ymm1, ymm1, 212 - LONG $0xc963fdc5 // vpacksswb ymm1, ymm0, ymm1 - LONG $0x397de3c4; 
WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x0ac9 // vpextrb ecx, xmm1, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_144 - QUAD $0x000140248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 320] - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0a // vpextrb byte [r8 + rcx], xmm1, 10 - -LBB0_144: - LONG $0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x6679c1c4; BYTE $0xe3 // vpcmpgtd xmm4, xmm0, xmm11 - LONG $0xe46bd9c5 // vpackssdw xmm4, xmm4, xmm4 - LONG $0x00fde3c4; WORD $0xd4e4 // vpermq ymm4, ymm4, 212 - LONG $0xe463fdc5 // vpacksswb ymm4, ymm0, ymm4 - LONG $0x397de3c4; WORD $0x01e4 // vextracti128 xmm4, ymm4, 1 - LONG $0x1479e3c4; WORD $0x0be1 // vpextrb ecx, xmm4, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_146 - QUAD $0x00014024946ffdc5; BYTE $0x00 // vmovdqa ymm2, yword [rsp + 320] - LONG $0x397de3c4; WORD $0x01d4 // vextracti128 xmm4, ymm2, 1 - LONG $0x16f9e3c4; WORD $0x01e1 // vpextrq rcx, xmm4, 1 - LONG $0x397d63c4; WORD $0x01fc // vextracti128 xmm4, ymm15, 1 - LONG $0x1479c3c4; WORD $0x0824; BYTE $0x0b // vpextrb byte [r8 + rcx], xmm4, 11 - -LBB0_146: - LONG $0x357de2c4; BYTE $0xe1 // vpmovzxdq ymm4, xmm1 - LONG $0x6b7dc1c4; BYTE $0xc9 // vpackssdw ymm1, ymm0, ymm9 - LONG $0xc963fdc5 // vpacksswb ymm1, ymm0, ymm1 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x0cc9 // vpextrb ecx, xmm1, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_148 - LONG $0x7ef9e1c4; BYTE $0xe1 // vmovq rcx, xmm4 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0c // vpextrb byte [r8 + rcx], xmm1, 12 - -LBB0_148: - LONG $0x6b7dc1c4; BYTE $0xc9 // vpackssdw ymm1, ymm0, ymm9 - LONG $0xc963fdc5 // vpacksswb ymm1, ymm0, ymm1 - LONG $0x397de3c4; WORD 
$0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x0dc9 // vpextrb ecx, xmm1, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_150 - LONG $0x16f9e3c4; WORD $0x01e1 // vpextrq rcx, xmm4, 1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0d // vpextrb byte [r8 + rcx], xmm1, 13 - -LBB0_150: - LONG $0x6b7dc1c4; BYTE $0xc9 // vpackssdw ymm1, ymm0, ymm9 - LONG $0xc963fdc5 // vpacksswb ymm1, ymm0, ymm1 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x0ec9 // vpextrb ecx, xmm1, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_152 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0e // vpextrb byte [r8 + rcx], xmm1, 14 - -LBB0_152: - LONG $0x6b7dc1c4; BYTE $0xc9 // vpackssdw ymm1, ymm0, ymm9 - LONG $0xc963fdc5 // vpacksswb ymm1, ymm0, ymm1 - LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 - LONG $0x1479e3c4; WORD $0x0fc9 // vpextrb ecx, xmm1, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_154 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0f // vpextrb byte [r8 + rcx], xmm1, 15 - -LBB0_154: - LONG $0x6b4dc1c4; BYTE $0xc8 // vpackssdw ymm1, ymm6, ymm8 - LONG $0x00fde3c4; WORD $0xd8c9 // vpermq ymm1, ymm1, 216 - LONG $0x6b45c1c4; BYTE $0xe9 // vpackssdw ymm5, ymm7, ymm9 - LONG $0x00fde3c4; WORD $0xd8ed // vpermq ymm5, ymm5, 216 - LONG $0xcd63f5c5 // vpacksswb ymm1, ymm1, ymm5 - QUAD $0x00030024946ffdc5; BYTE $0x00 // vmovdqa ymm2, yword [rsp + 768] - QUAD $0x00020024bceb6dc5; BYTE $0x00 // vpor ymm15, ymm2, yword [rsp + 512] - QUAD $0x0001e024acebedc5; BYTE $0x00 // vpor ymm5, ymm2, yword 
[rsp + 480] - QUAD $0x0001802494eb6dc5; BYTE $0x00 // vpor ymm10, ymm2, yword [rsp + 384] - QUAD $0x000160248ceb6dc5; BYTE $0x00 // vpor ymm9, ymm2, yword [rsp + 352] - QUAD $0x0001c024a4eb6dc5; BYTE $0x00 // vpor ymm12, ymm2, yword [rsp + 448] - QUAD $0x0001a0249ceb6dc5; BYTE $0x00 // vpor ymm11, ymm2, yword [rsp + 416] - QUAD $0x0001402484eb6dc5; BYTE $0x00 // vpor ymm8, ymm2, yword [rsp + 320] - LONG $0xfaebddc5 // vpor ymm7, ymm4, ymm2 - LONG $0x463de3c4; WORD $0x31f7 // vperm2i128 ymm6, ymm8, ymm7, 49 - LONG $0x383d63c4; WORD $0x01ef // vinserti128 ymm13, ymm8, xmm7, 1 - LONG $0xf6c694c5; BYTE $0x88 // vshufps ymm6, ymm13, ymm6, 136 - LONG $0x461d43c4; WORD $0x31eb // vperm2i128 ymm13, ymm12, ymm11, 49 - LONG $0x381d43c4; WORD $0x01f3 // vinserti128 ymm14, ymm12, xmm11, 1 - LONG $0xc60c41c4; WORD $0x88ed // vshufps ymm13, ymm14, ymm13, 136 - LONG $0x462d43c4; WORD $0x31f1 // vperm2i128 ymm14, ymm10, ymm9, 49 - LONG $0x382dc3c4; WORD $0x01d1 // vinserti128 ymm2, ymm10, xmm9, 1 - LONG $0xc66cc1c4; WORD $0x88d6 // vshufps ymm2, ymm2, ymm14, 136 - LONG $0x460563c4; WORD $0x31f5 // vperm2i128 ymm14, ymm15, ymm5, 49 - LONG $0x3805e3c4; WORD $0x01dd // vinserti128 ymm3, ymm15, xmm5, 1 - LONG $0xc664c1c4; WORD $0x88de // vshufps ymm3, ymm3, ymm14, 136 - LONG $0xdb66fdc5 // vpcmpgtd ymm3, ymm0, ymm3 - LONG $0xd266fdc5 // vpcmpgtd ymm2, ymm0, ymm2 - LONG $0xd26be5c5 // vpackssdw ymm2, ymm3, ymm2 - LONG $0x667dc1c4; BYTE $0xdd // vpcmpgtd ymm3, ymm0, ymm13 - LONG $0xf666fdc5 // vpcmpgtd ymm6, ymm0, ymm6 - LONG $0xde6be5c5 // vpackssdw ymm3, ymm3, ymm6 - LONG $0x00fde3c4; WORD $0xd8d2 // vpermq ymm2, ymm2, 216 - LONG $0x00fde3c4; WORD $0xd8db // vpermq ymm3, ymm3, 216 - LONG $0xd363edc5 // vpacksswb ymm2, ymm2, ymm3 - LONG $0xf1dbedc5 // vpand ymm6, ymm2, ymm1 - LONG $0xf17ef9c5 // vmovd ecx, xmm6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_155 - LONG $0x787d22c4; WORD $0x1f34 // vpbroadcastb ymm14, byte [rdi + r11] - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, 
xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_661 - -LBB0_156: - QUAD $0x000000e0249c8b48 // mov rbx, qword [rsp + 224] - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_157 - -LBB0_662: - LONG $0x2009e3c4; WORD $0x370c; BYTE $0x02 // vpinsrb xmm1, xmm14, byte [rdi + rsi], 2 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x03f1 // vpextrb ecx, xmm6, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_663 - -LBB0_158: - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_159 - -LBB0_664: - QUAD $0x00000108248c8b48 // mov rcx, qword [rsp + 264] - LONG $0x2009e3c4; WORD $0x0f0c; BYTE $0x04 // vpinsrb xmm1, xmm14, byte [rdi + rcx], 4 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_665 - -LBB0_160: - QUAD $0x000000e824b48b48 // mov rsi, qword [rsp + 232] - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_161 - -LBB0_666: - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x06 // vpinsrb xmm1, xmm14, byte [rdi + rax], 6 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_667 - -LBB0_162: - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_163 - -LBB0_668: - LONG $0x24448b48; BYTE $0x60 // mov rax, qword [rsp + 96] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x08 // vpinsrb xmm1, xmm14, byte [rdi + rax], 8 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_669 - -LBB0_164: - LONG $0x1479e3c4; WORD $0x0af1 // vpextrb ecx, xmm6, 10 - 
WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_165 - -LBB0_670: - QUAD $0x0000009024848b48 // mov rax, qword [rsp + 144] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0a // vpinsrb xmm1, xmm14, byte [rdi + rax], 10 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_671 - -LBB0_166: - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_167 - -LBB0_672: - QUAD $0x000000f824848b48 // mov rax, qword [rsp + 248] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0c // vpinsrb xmm1, xmm14, byte [rdi + rax], 12 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_673 - -LBB0_168: - LONG $0x1479e3c4; WORD $0x0ef1 // vpextrb ecx, xmm6, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_169 - -LBB0_674: - LONG $0x24448b48; BYTE $0x50 // mov rax, qword [rsp + 80] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0e // vpinsrb xmm1, xmm14, byte [rdi + rax], 14 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0ff1 // vpextrb ecx, xmm6, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_170 - JMP LBB0_171 - -LBB0_155: - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_156 - -LBB0_661: - LONG $0x2009a3c4; WORD $0x0f0c; BYTE $0x01 // vpinsrb xmm1, xmm14, byte [rdi + r9], 1 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - QUAD $0x000000e0249c8b48 // mov rbx, qword [rsp + 224] - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_662 - -LBB0_157: - LONG $0x1479e3c4; WORD $0x03f1 // vpextrb ecx, xmm6, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_158 - -LBB0_663: - QUAD $0x00000110248c8b48 // mov rcx, qword 
[rsp + 272] - LONG $0x2009e3c4; WORD $0x0f0c; BYTE $0x03 // vpinsrb xmm1, xmm14, byte [rdi + rcx], 3 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_664 - -LBB0_159: - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_160 - -LBB0_665: - LONG $0x2009a3c4; WORD $0x370c; BYTE $0x05 // vpinsrb xmm1, xmm14, byte [rdi + r14], 5 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - QUAD $0x000000e824b48b48 // mov rsi, qword [rsp + 232] - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_666 - -LBB0_161: - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_162 - -LBB0_667: - LONG $0x2009e3c4; WORD $0x370c; BYTE $0x07 // vpinsrb xmm1, xmm14, byte [rdi + rsi], 7 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_668 - -LBB0_163: - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_164 - -LBB0_669: - LONG $0x2009e3c4; WORD $0x1f0c; BYTE $0x09 // vpinsrb xmm1, xmm14, byte [rdi + rbx], 9 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0af1 // vpextrb ecx, xmm6, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_670 - -LBB0_165: - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_166 - -LBB0_671: - QUAD $0x0000010024848b48 // mov rax, qword [rsp + 256] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0b // vpinsrb xmm1, xmm14, byte [rdi + rax], 11 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE 
$0x01 // test cl, 1 - JNE LBB0_672 - -LBB0_167: - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_168 - -LBB0_673: - LONG $0x24448b48; BYTE $0x58 // mov rax, qword [rsp + 88] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0d // vpinsrb xmm1, xmm14, byte [rdi + rax], 13 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0ef1 // vpextrb ecx, xmm6, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_674 - -LBB0_169: - LONG $0x1479e3c4; WORD $0x0ff1 // vpextrb ecx, xmm6, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_171 - -LBB0_170: - LONG $0x24448b48; BYTE $0x48 // mov rax, qword [rsp + 72] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0f // vpinsrb xmm1, xmm14, byte [rdi + rax], 15 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_171: - QUAD $0x000000d0248c8b48 // mov rcx, qword [rsp + 208] - LONG $0x397dc3c4; WORD $0x01f5 // vextracti128 xmm13, ymm6, 1 - LONG $0xe87e79c5 // vmovd eax, xmm13 - LONG $0x2c244489 // mov dword [rsp + 44], eax - WORD $0x01a8 // test al, 1 - JE LBB0_172 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x24448b48; BYTE $0x40 // mov rax, qword [rsp + 64] - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x00 // vpinsrb xmm1, xmm1, byte [rdi + rax], 0 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - LONG $0x147963c4; WORD $0x01e8 // vpextrb eax, xmm13, 1 - LONG $0x28244489 // mov dword [rsp + 40], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_676 - -LBB0_173: - LONG $0x147963c4; WORD $0x02e8 // vpextrb eax, xmm13, 2 - LONG $0x24244489 // mov dword [rsp + 36], eax - WORD $0x01a8 // test al, 1 - JE LBB0_174 - -LBB0_677: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - QUAD $0x0000008024848b48 // mov rax, qword [rsp + 128] - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x02 // vpinsrb xmm1, xmm1, byte [rdi + rax], 2 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, 
ymm14, xmm1, 1 - LONG $0x147963c4; WORD $0x03e8 // vpextrb eax, xmm13, 3 - LONG $0x20244489 // mov dword [rsp + 32], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_678 - -LBB0_175: - LONG $0x147963c4; WORD $0x04e8 // vpextrb eax, xmm13, 4 - LONG $0x1c244489 // mov dword [rsp + 28], eax - WORD $0x01a8 // test al, 1 - JE LBB0_176 - -LBB0_679: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x24448b48; BYTE $0x38 // mov rax, qword [rsp + 56] - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x04 // vpinsrb xmm1, xmm1, byte [rdi + rax], 4 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - LONG $0x147963c4; WORD $0x05e8 // vpextrb eax, xmm13, 5 - LONG $0x18244489 // mov dword [rsp + 24], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_680 - -LBB0_177: - LONG $0x147963c4; WORD $0x06e8 // vpextrb eax, xmm13, 6 - LONG $0x14244489 // mov dword [rsp + 20], eax - WORD $0x01a8 // test al, 1 - JE LBB0_178 - -LBB0_681: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x2071a3c4; WORD $0x170c; BYTE $0x06 // vpinsrb xmm1, xmm1, byte [rdi + r10], 6 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - LONG $0x147963c4; WORD $0x07e8 // vpextrb eax, xmm13, 7 - LONG $0x3c248489; WORD $0x0001; BYTE $0x00 // mov dword [rsp + 316], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_682 - -LBB0_179: - QUAD $0x000000d824848b48 // mov rax, qword [rsp + 216] - LONG $0x147963c4; WORD $0x08eb // vpextrb ebx, xmm13, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_181 - -LBB0_180: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x08 // vpinsrb xmm1, xmm1, byte [rdi + rax], 8 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - -LBB0_181: - LONG $0x147943c4; WORD $0x09e9 // vpextrb r9d, xmm13, 9 - LONG $0x01c1f641 // test r9b, 1 - QUAD $0x0000011824ac894c // mov qword [rsp + 280], r13 - LONG $0x2454894c; BYTE $0x70 // mov qword [rsp + 112], r10 - QUAD 
$0x000000b824948948 // mov qword [rsp + 184], rdx - JE LBB0_183 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x2071e3c4; WORD $0x0f0c; BYTE $0x09 // vpinsrb xmm1, xmm1, byte [rdi + rcx], 9 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - -LBB0_183: - QUAD $0x000000c824848b48 // mov rax, qword [rsp + 200] - QUAD $0x000000c0248c8b48 // mov rcx, qword [rsp + 192] - LONG $0x147943c4; WORD $0x0aed // vpextrb r13d, xmm13, 10 - LONG $0x01c5f641 // test r13b, 1 - JE LBB0_184 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x0a // vpinsrb xmm1, xmm1, byte [rdi + rax], 10 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - LONG $0x147963c4; WORD $0x0be8 // vpextrb eax, xmm13, 11 - WORD $0x01a8 // test al, 1 - LONG $0x247c894c; BYTE $0x78 // mov qword [rsp + 120], r15 - JNE LBB0_684 - -LBB0_185: - LONG $0x147943c4; WORD $0x0cef // vpextrb r15d, xmm13, 12 - LONG $0x01c7f641 // test r15b, 1 - QUAD $0x00000130249c894c // mov qword [rsp + 304], r11 - JE LBB0_186 - -LBB0_685: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - QUAD $0x000000b8248c8b48 // mov rcx, qword [rsp + 184] - LONG $0x2071e3c4; WORD $0x0f0c; BYTE $0x0c // vpinsrb xmm1, xmm1, byte [rdi + rcx], 12 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - LONG $0x147963c4; WORD $0x0dea // vpextrb edx, xmm13, 13 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - JNE LBB0_686 - -LBB0_187: - LONG $0x147963c4; WORD $0x0eee // vpextrb esi, xmm13, 14 - LONG $0x01c6f640 // test sil, 1 - JE LBB0_188 - -LBB0_687: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - QUAD $0x000000a8248c8b48 // mov rcx, qword [rsp + 168] - LONG $0x2071e3c4; WORD $0x0f0c; BYTE $0x0e // vpinsrb xmm1, xmm1, byte [rdi + rcx], 14 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - LONG $0x147943c4; WORD $0x0fee // vpextrb r14d, xmm13, 15 - LONG $0x01c6f641 // test 
r14b, 1 - JNE LBB0_189 - JMP LBB0_190 - -LBB0_172: - LONG $0x147963c4; WORD $0x01e8 // vpextrb eax, xmm13, 1 - LONG $0x28244489 // mov dword [rsp + 40], eax - WORD $0x01a8 // test al, 1 - JE LBB0_173 - -LBB0_676: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - QUAD $0x0000008824848b48 // mov rax, qword [rsp + 136] - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x01 // vpinsrb xmm1, xmm1, byte [rdi + rax], 1 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - LONG $0x147963c4; WORD $0x02e8 // vpextrb eax, xmm13, 2 - LONG $0x24244489 // mov dword [rsp + 36], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_677 - -LBB0_174: - LONG $0x147963c4; WORD $0x03e8 // vpextrb eax, xmm13, 3 - LONG $0x20244489 // mov dword [rsp + 32], eax - WORD $0x01a8 // test al, 1 - JE LBB0_175 - -LBB0_678: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x2071a3c4; WORD $0x3f0c; BYTE $0x03 // vpinsrb xmm1, xmm1, byte [rdi + r15], 3 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - LONG $0x147963c4; WORD $0x04e8 // vpextrb eax, xmm13, 4 - LONG $0x1c244489 // mov dword [rsp + 28], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_679 - -LBB0_176: - LONG $0x147963c4; WORD $0x05e8 // vpextrb eax, xmm13, 5 - LONG $0x18244489 // mov dword [rsp + 24], eax - WORD $0x01a8 // test al, 1 - JE LBB0_177 - -LBB0_680: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x2071a3c4; WORD $0x2f0c; BYTE $0x05 // vpinsrb xmm1, xmm1, byte [rdi + r13], 5 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - LONG $0x147963c4; WORD $0x06e8 // vpextrb eax, xmm13, 6 - LONG $0x14244489 // mov dword [rsp + 20], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_681 - -LBB0_178: - LONG $0x147963c4; WORD $0x07e8 // vpextrb eax, xmm13, 7 - LONG $0x3c248489; WORD $0x0001; BYTE $0x00 // mov dword [rsp + 316], eax - WORD $0x01a8 // test al, 1 - JE LBB0_179 - -LBB0_682: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 
1 - QUAD $0x000000f024848b48 // mov rax, qword [rsp + 240] - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x07 // vpinsrb xmm1, xmm1, byte [rdi + rax], 7 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - QUAD $0x000000d824848b48 // mov rax, qword [rsp + 216] - LONG $0x147963c4; WORD $0x08eb // vpextrb ebx, xmm13, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_180 - JMP LBB0_181 - -LBB0_184: - LONG $0x147963c4; WORD $0x0be8 // vpextrb eax, xmm13, 11 - WORD $0x01a8 // test al, 1 - LONG $0x247c894c; BYTE $0x78 // mov qword [rsp + 120], r15 - JE LBB0_185 - -LBB0_684: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x2071e3c4; WORD $0x0f0c; BYTE $0x0b // vpinsrb xmm1, xmm1, byte [rdi + rcx], 11 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - LONG $0x147943c4; WORD $0x0cef // vpextrb r15d, xmm13, 12 - LONG $0x01c7f641 // test r15b, 1 - QUAD $0x00000130249c894c // mov qword [rsp + 304], r11 - JNE LBB0_685 - -LBB0_186: - LONG $0x147963c4; WORD $0x0dea // vpextrb edx, xmm13, 13 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - JE LBB0_187 - -LBB0_686: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - QUAD $0x000000b0248c8b48 // mov rcx, qword [rsp + 176] - LONG $0x2071e3c4; WORD $0x0f0c; BYTE $0x0d // vpinsrb xmm1, xmm1, byte [rdi + rcx], 13 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - LONG $0x147963c4; WORD $0x0eee // vpextrb esi, xmm13, 14 - LONG $0x01c6f640 // test sil, 1 - JNE LBB0_687 - -LBB0_188: - LONG $0x147943c4; WORD $0x0fee // vpextrb r14d, xmm13, 15 - LONG $0x01c6f641 // test r14b, 1 - JE LBB0_190 - -LBB0_189: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - QUAD $0x000000a0248c8b48 // mov rcx, qword [rsp + 160] - LONG $0x2071e3c4; WORD $0x0f0c; BYTE $0x0f // vpinsrb xmm1, xmm1, byte [rdi + rcx], 15 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - -LBB0_190: - LONG $0x7175c1c4; WORD $0x01d6 // vpsrlw ymm1, ymm14, 1 - 
QUAD $0x00000080b5db75c5 // vpand ymm14, ymm1, yword 128[rbp] /* [rip + .LCPI0_4] */ - LONG $0x7e79c1c4; BYTE $0xf2 // vmovd r10d, xmm6 - LONG $0x01c2f641 // test r10b, 1 - JE LBB0_191 - LONG $0x7ef961c4; BYTE $0xf9 // vmovq rcx, xmm15 - LONG $0x147943c4; WORD $0x0834; BYTE $0x00 // vpextrb byte [r8 + rcx], xmm14, 0 - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_689 - -LBB0_192: - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_193 - -LBB0_690: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x147943c4; WORD $0x0834; BYTE $0x02 // vpextrb byte [r8 + rcx], xmm14, 2 - LONG $0x1479e3c4; WORD $0x03f1 // vpextrb ecx, xmm6, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_691 - -LBB0_194: - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_195 - -LBB0_692: - LONG $0x7ef9e1c4; BYTE $0xe9 // vmovq rcx, xmm5 - LONG $0x147943c4; WORD $0x0834; BYTE $0x04 // vpextrb byte [r8 + rcx], xmm14, 4 - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_693 - -LBB0_196: - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_197 - -LBB0_694: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x147943c4; WORD $0x0834; BYTE $0x06 // vpextrb byte [r8 + rcx], xmm14, 6 - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_695 - -LBB0_198: - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_199 - -LBB0_696: - LONG $0x7ef961c4; BYTE $0xe1 // vmovq rcx, xmm12 - LONG $0x147943c4; WORD $0x0834; BYTE $0x08 // vpextrb byte [r8 + rcx], xmm14, 8 - LONG $0x1479e3c4; 
WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_697 - -LBB0_200: - LONG $0x1479e3c4; WORD $0x0af1 // vpextrb ecx, xmm6, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_201 - -LBB0_698: - LONG $0x397d63c4; WORD $0x01e1 // vextracti128 xmm1, ymm12, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x147943c4; WORD $0x0834; BYTE $0x0a // vpextrb byte [r8 + rcx], xmm14, 10 - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_699 - -LBB0_202: - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_203 - -LBB0_700: - LONG $0x7ef961c4; BYTE $0xd9 // vmovq rcx, xmm11 - LONG $0x147943c4; WORD $0x0834; BYTE $0x0c // vpextrb byte [r8 + rcx], xmm14, 12 - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_701 - -LBB0_204: - LONG $0x1479e3c4; WORD $0x0ef1 // vpextrb ecx, xmm6, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_205 - -LBB0_702: - LONG $0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x147943c4; WORD $0x0834; BYTE $0x0e // vpextrb byte [r8 + rcx], xmm14, 14 - LONG $0x1479e3c4; WORD $0x0ff1 // vpextrb ecx, xmm6, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_703 - -LBB0_206: - LONG $0x2c2444f6; BYTE $0x01 // test byte [rsp + 44], 1 - JE LBB0_207 - -LBB0_704: - LONG $0x7ef961c4; BYTE $0xd1 // vmovq rcx, xmm10 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x00 // vpextrb byte [r8 + rcx], xmm1, 0 - LONG $0x282444f6; BYTE $0x01 // test byte [rsp + 40], 1 - JNE LBB0_705 - -LBB0_208: - LONG $0x242444f6; BYTE $0x01 // test byte [rsp + 36], 1 - JE LBB0_209 - -LBB0_706: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // 
vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x02 // vpextrb byte [r8 + rcx], xmm1, 2 - LONG $0x202444f6; BYTE $0x01 // test byte [rsp + 32], 1 - JNE LBB0_707 - -LBB0_210: - LONG $0x1c2444f6; BYTE $0x01 // test byte [rsp + 28], 1 - JE LBB0_211 - -LBB0_708: - LONG $0x7ef961c4; BYTE $0xc9 // vmovq rcx, xmm9 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x04 // vpextrb byte [r8 + rcx], xmm1, 4 - LONG $0x182444f6; BYTE $0x01 // test byte [rsp + 24], 1 - JNE LBB0_709 - -LBB0_212: - LONG $0x142444f6; BYTE $0x01 // test byte [rsp + 20], 1 - JE LBB0_213 - -LBB0_710: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x06 // vpextrb byte [r8 + rcx], xmm1, 6 - QUAD $0x010000013c2484f6 // test byte [rsp + 316], 1 - JNE LBB0_711 - -LBB0_214: - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_215 - -LBB0_712: - LONG $0x7ef961c4; BYTE $0xc1 // vmovq rcx, xmm8 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x08 // vpextrb byte [r8 + rcx], xmm1, 8 - LONG $0x01c1f641 // test r9b, 1 - QUAD $0x000000e024948b4c // mov r10, qword [rsp + 224] - QUAD $0x00000090249c8b4c // mov r11, qword [rsp + 144] - JNE LBB0_713 - -LBB0_216: - LONG $0x01c5f641 // test r13b, 1 - QUAD $0x00000128249c8b48 // mov rbx, qword [rsp + 296] - JE LBB0_217 - -LBB0_714: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0a // vpextrb byte [r8 + rcx], xmm1, 10 - WORD $0x01a8 // test al, 1 - QUAD $0x00000120248c8b4c // mov r9, qword [rsp + 288] - QUAD $0x000000e824848b48 // mov rax, qword [rsp + 232] - JNE LBB0_715 - -LBB0_218: - LONG $0x01c7f641 // 
test r15b, 1 - JE LBB0_219 - -LBB0_716: - LONG $0x7ef9e1c4; BYTE $0xf9 // vmovq rcx, xmm7 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0c // vpextrb byte [r8 + rcx], xmm1, 12 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - QUAD $0x0000008824ac8b4c // mov r13, qword [rsp + 136] - QUAD $0x0000008024bc8b4c // mov r15, qword [rsp + 128] - JNE LBB0_717 - -LBB0_220: - LONG $0x01c6f640 // test sil, 1 - QUAD $0x0000013024948b48 // mov rdx, qword [rsp + 304] - JE LBB0_221 - -LBB0_718: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0e // vpextrb byte [r8 + rcx], xmm1, 14 - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000009824b48b48 // mov rsi, qword [rsp + 152] - JNE LBB0_222 - JMP LBB0_223 - -LBB0_191: - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_192 - -LBB0_689: - LONG $0x16f963c4; WORD $0x01f9 // vpextrq rcx, xmm15, 1 - LONG $0x147943c4; WORD $0x0834; BYTE $0x01 // vpextrb byte [r8 + rcx], xmm14, 1 - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_690 - -LBB0_193: - LONG $0x1479e3c4; WORD $0x03f1 // vpextrb ecx, xmm6, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_194 - -LBB0_691: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x147943c4; WORD $0x0834; BYTE $0x03 // vpextrb byte [r8 + rcx], xmm14, 3 - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_692 - -LBB0_195: - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_196 - -LBB0_693: - LONG $0x16f9e3c4; WORD $0x01e9 // vpextrq rcx, xmm5, 1 - LONG $0x147943c4; WORD $0x0834; BYTE 
$0x05 // vpextrb byte [r8 + rcx], xmm14, 5 - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_694 - -LBB0_197: - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_198 - -LBB0_695: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x147943c4; WORD $0x0834; BYTE $0x07 // vpextrb byte [r8 + rcx], xmm14, 7 - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_696 - -LBB0_199: - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_200 - -LBB0_697: - LONG $0x16f963c4; WORD $0x01e1 // vpextrq rcx, xmm12, 1 - LONG $0x147943c4; WORD $0x0834; BYTE $0x09 // vpextrb byte [r8 + rcx], xmm14, 9 - LONG $0x1479e3c4; WORD $0x0af1 // vpextrb ecx, xmm6, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_698 - -LBB0_201: - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_202 - -LBB0_699: - LONG $0x397d63c4; WORD $0x01e1 // vextracti128 xmm1, ymm12, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x147943c4; WORD $0x0834; BYTE $0x0b // vpextrb byte [r8 + rcx], xmm14, 11 - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_700 - -LBB0_203: - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_204 - -LBB0_701: - LONG $0x16f963c4; WORD $0x01d9 // vpextrq rcx, xmm11, 1 - LONG $0x147943c4; WORD $0x0834; BYTE $0x0d // vpextrb byte [r8 + rcx], xmm14, 13 - LONG $0x1479e3c4; WORD $0x0ef1 // vpextrb ecx, xmm6, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_702 - -LBB0_205: - LONG $0x1479e3c4; WORD $0x0ff1 // vpextrb ecx, xmm6, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_206 - -LBB0_703: - LONG 
$0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x147943c4; WORD $0x0834; BYTE $0x0f // vpextrb byte [r8 + rcx], xmm14, 15 - LONG $0x2c2444f6; BYTE $0x01 // test byte [rsp + 44], 1 - JNE LBB0_704 - -LBB0_207: - LONG $0x282444f6; BYTE $0x01 // test byte [rsp + 40], 1 - JE LBB0_208 - -LBB0_705: - LONG $0x16f963c4; WORD $0x01d1 // vpextrq rcx, xmm10, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x01 // vpextrb byte [r8 + rcx], xmm1, 1 - LONG $0x242444f6; BYTE $0x01 // test byte [rsp + 36], 1 - JNE LBB0_706 - -LBB0_209: - LONG $0x202444f6; BYTE $0x01 // test byte [rsp + 32], 1 - JE LBB0_210 - -LBB0_707: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x03 // vpextrb byte [r8 + rcx], xmm1, 3 - LONG $0x1c2444f6; BYTE $0x01 // test byte [rsp + 28], 1 - JNE LBB0_708 - -LBB0_211: - LONG $0x182444f6; BYTE $0x01 // test byte [rsp + 24], 1 - JE LBB0_212 - -LBB0_709: - LONG $0x16f963c4; WORD $0x01c9 // vpextrq rcx, xmm9, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x05 // vpextrb byte [r8 + rcx], xmm1, 5 - LONG $0x142444f6; BYTE $0x01 // test byte [rsp + 20], 1 - JNE LBB0_710 - -LBB0_213: - QUAD $0x010000013c2484f6 // test byte [rsp + 316], 1 - JE LBB0_214 - -LBB0_711: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x07 // vpextrb byte [r8 + rcx], xmm1, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_712 - -LBB0_215: - LONG $0x01c1f641 // test r9b, 1 - QUAD $0x000000e024948b4c // mov r10, qword [rsp + 224] - QUAD 
$0x00000090249c8b4c // mov r11, qword [rsp + 144] - JE LBB0_216 - -LBB0_713: - LONG $0x16f963c4; WORD $0x01c1 // vpextrq rcx, xmm8, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x09 // vpextrb byte [r8 + rcx], xmm1, 9 - LONG $0x01c5f641 // test r13b, 1 - QUAD $0x00000128249c8b48 // mov rbx, qword [rsp + 296] - JNE LBB0_714 - -LBB0_217: - WORD $0x01a8 // test al, 1 - QUAD $0x00000120248c8b4c // mov r9, qword [rsp + 288] - QUAD $0x000000e824848b48 // mov rax, qword [rsp + 232] - JE LBB0_218 - -LBB0_715: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0b // vpextrb byte [r8 + rcx], xmm1, 11 - LONG $0x01c7f641 // test r15b, 1 - JNE LBB0_716 - -LBB0_219: - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - QUAD $0x0000008824ac8b4c // mov r13, qword [rsp + 136] - QUAD $0x0000008024bc8b4c // mov r15, qword [rsp + 128] - JE LBB0_220 - -LBB0_717: - LONG $0x16f9e3c4; WORD $0x01f9 // vpextrq rcx, xmm7, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0d // vpextrb byte [r8 + rcx], xmm1, 13 - LONG $0x01c6f640 // test sil, 1 - QUAD $0x0000013024948b48 // mov rdx, qword [rsp + 304] - JNE LBB0_718 - -LBB0_221: - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000009824b48b48 // mov rsi, qword [rsp + 152] - JE LBB0_223 - -LBB0_222: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0f // vpextrb byte [r8 + rcx], xmm1, 15 - -LBB0_223: - QUAD $0x0002e0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 736] - QUAD $0x00020024bceb75c5; BYTE $0x00 // vpor ymm15, ymm1, yword [rsp + 512] - QUAD $0x0001e024acebf5c5; BYTE $0x00 // vpor 
ymm5, ymm1, yword [rsp + 480] - QUAD $0x0001802494eb75c5; BYTE $0x00 // vpor ymm10, ymm1, yword [rsp + 384] - QUAD $0x000160248ceb75c5; BYTE $0x00 // vpor ymm9, ymm1, yword [rsp + 352] - QUAD $0x0001c024a4eb75c5; BYTE $0x00 // vpor ymm12, ymm1, yword [rsp + 448] - QUAD $0x0001a0249ceb75c5; BYTE $0x00 // vpor ymm11, ymm1, yword [rsp + 416] - QUAD $0x0001402484eb75c5; BYTE $0x00 // vpor ymm8, ymm1, yword [rsp + 320] - LONG $0xf9ebddc5 // vpor ymm7, ymm4, ymm1 - LONG $0x463de3c4; WORD $0x31cf // vperm2i128 ymm1, ymm8, ymm7, 49 - LONG $0x383de3c4; WORD $0x01d7 // vinserti128 ymm2, ymm8, xmm7, 1 - LONG $0xc9c6ecc5; BYTE $0x88 // vshufps ymm1, ymm2, ymm1, 136 - LONG $0x461dc3c4; WORD $0x31d3 // vperm2i128 ymm2, ymm12, ymm11, 49 - LONG $0x381dc3c4; WORD $0x01db // vinserti128 ymm3, ymm12, xmm11, 1 - LONG $0xd2c6e4c5; BYTE $0x88 // vshufps ymm2, ymm3, ymm2, 136 - LONG $0x462dc3c4; WORD $0x31d9 // vperm2i128 ymm3, ymm10, ymm9, 49 - LONG $0x382d43c4; WORD $0x01e9 // vinserti128 ymm13, ymm10, xmm9, 1 - LONG $0xdbc694c5; BYTE $0x88 // vshufps ymm3, ymm13, ymm3, 136 - LONG $0x460563c4; WORD $0x31ed // vperm2i128 ymm13, ymm15, ymm5, 49 - LONG $0x380563c4; WORD $0x01f5 // vinserti128 ymm14, ymm15, xmm5, 1 - LONG $0xc60c41c4; WORD $0x88ed // vshufps ymm13, ymm14, ymm13, 136 - LONG $0x667d41c4; BYTE $0xed // vpcmpgtd ymm13, ymm0, ymm13 - LONG $0xdb66fdc5 // vpcmpgtd ymm3, ymm0, ymm3 - LONG $0xdb6b95c5 // vpackssdw ymm3, ymm13, ymm3 - LONG $0xd266fdc5 // vpcmpgtd ymm2, ymm0, ymm2 - LONG $0xc966fdc5 // vpcmpgtd ymm1, ymm0, ymm1 - LONG $0xc96bedc5 // vpackssdw ymm1, ymm2, ymm1 - LONG $0x00fde3c4; WORD $0xd8d3 // vpermq ymm2, ymm3, 216 - LONG $0x00fde3c4; WORD $0xd8c9 // vpermq ymm1, ymm1, 216 - LONG $0xc963edc5 // vpacksswb ymm1, ymm2, ymm1 - LONG $0xf6dbf5c5 // vpand ymm6, ymm1, ymm6 - LONG $0xf17ef9c5 // vmovd ecx, xmm6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_224 - LONG $0x787d62c4; WORD $0x1734 // vpbroadcastb ymm14, byte [rdi + rdx] - LONG $0x1479e3c4; WORD $0x01f1 // 
vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_720 - -LBB0_225: - LONG $0x24548b48; BYTE $0x68 // mov rdx, qword [rsp + 104] - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_227 - -LBB0_226: - LONG $0x2009e3c4; WORD $0x1f0c; BYTE $0x02 // vpinsrb xmm1, xmm14, byte [rdi + rbx], 2 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_227: - LONG $0x24748b48; BYTE $0x60 // mov rsi, qword [rsp + 96] - LONG $0x245c8b48; BYTE $0x48 // mov rbx, qword [rsp + 72] - LONG $0x1479e3c4; WORD $0x03f1 // vpextrb ecx, xmm6, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_228 - QUAD $0x00000110248c8b48 // mov rcx, qword [rsp + 272] - LONG $0x2009e3c4; WORD $0x0f0c; BYTE $0x03 // vpinsrb xmm1, xmm14, byte [rdi + rcx], 3 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_722 - -LBB0_229: - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_230 - -LBB0_723: - LONG $0x2009e3c4; WORD $0x170c; BYTE $0x05 // vpinsrb xmm1, xmm14, byte [rdi + rdx], 5 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_724 - -LBB0_231: - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_232 - -LBB0_725: - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x07 // vpinsrb xmm1, xmm14, byte [rdi + rax], 7 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_726 - -LBB0_233: - LONG $0x24548b48; BYTE $0x58 // mov rdx, qword [rsp + 88] - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE 
LBB0_234 - -LBB0_727: - LONG $0x2009a3c4; WORD $0x170c; BYTE $0x09 // vpinsrb xmm1, xmm14, byte [rdi + r10], 9 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0af1 // vpextrb ecx, xmm6, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_728 - -LBB0_235: - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_236 - -LBB0_729: - QUAD $0x0000010024848b48 // mov rax, qword [rsp + 256] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0b // vpinsrb xmm1, xmm14, byte [rdi + rax], 11 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_730 - -LBB0_237: - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_239 - -LBB0_238: - LONG $0x2009e3c4; WORD $0x170c; BYTE $0x0d // vpinsrb xmm1, xmm14, byte [rdi + rdx], 13 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_239: - LONG $0x24448b48; BYTE $0x50 // mov rax, qword [rsp + 80] - LONG $0x24548b48; BYTE $0x40 // mov rdx, qword [rsp + 64] - LONG $0x1479e3c4; WORD $0x0ef1 // vpextrb ecx, xmm6, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_241 - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0e // vpinsrb xmm1, xmm14, byte [rdi + rax], 14 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_241: - LONG $0x1479e3c4; WORD $0x0ff1 // vpextrb ecx, xmm6, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_243 - LONG $0x2009e3c4; WORD $0x1f0c; BYTE $0x0f // vpinsrb xmm1, xmm14, byte [rdi + rbx], 15 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_243: - LONG $0x397de3c4; WORD $0x01f1 // vextracti128 xmm1, ymm6, 1 - LONG $0xc87ef9c5 // vmovd eax, xmm1 - LONG $0x2c244489 // mov dword [rsp + 44], eax - WORD $0x01a8 // test al, 1 - JE LBB0_245 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 
xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x1714; BYTE $0x00 // vpinsrb xmm2, xmm2, byte [rdi + rdx], 0 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - -LBB0_245: - LONG $0x244c8b48; BYTE $0x38 // mov rcx, qword [rsp + 56] - LONG $0x1479e3c4; WORD $0x01c8 // vpextrb eax, xmm1, 1 - LONG $0x28244489 // mov dword [rsp + 40], eax - WORD $0x01a8 // test al, 1 - JE LBB0_247 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069a3c4; WORD $0x2f14; BYTE $0x01 // vpinsrb xmm2, xmm2, byte [rdi + r13], 1 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - -LBB0_247: - QUAD $0x0000011824948b48 // mov rdx, qword [rsp + 280] - LONG $0x24748b48; BYTE $0x70 // mov rsi, qword [rsp + 112] - LONG $0x1479e3c4; WORD $0x02c8 // vpextrb eax, xmm1, 2 - LONG $0x24244489 // mov dword [rsp + 36], eax - WORD $0x01a8 // test al, 1 - JE LBB0_249 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069a3c4; WORD $0x3f14; BYTE $0x02 // vpinsrb xmm2, xmm2, byte [rdi + r15], 2 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - -LBB0_249: - LONG $0x24448b48; BYTE $0x78 // mov rax, qword [rsp + 120] - LONG $0x1479e3c4; WORD $0x03cb // vpextrb ebx, xmm1, 3 - LONG $0x20245c89 // mov dword [rsp + 32], ebx - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_250 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x03 // vpinsrb xmm2, xmm2, byte [rdi + rax], 3 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x04c8 // vpextrb eax, xmm1, 4 - LONG $0x1c244489 // mov dword [rsp + 28], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_732 - -LBB0_251: - LONG $0x1479e3c4; WORD $0x05c8 // vpextrb eax, xmm1, 5 - LONG $0x18244489 // mov dword [rsp + 24], eax - WORD $0x01a8 // test al, 1 - JE LBB0_252 - -LBB0_733: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x1714; BYTE 
$0x05 // vpinsrb xmm2, xmm2, byte [rdi + rdx], 5 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x06c8 // vpextrb eax, xmm1, 6 - LONG $0x14244489 // mov dword [rsp + 20], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_734 - -LBB0_253: - LONG $0x1479c3c4; WORD $0x07c9 // vpextrb r9d, xmm1, 7 - LONG $0x01c1f641 // test r9b, 1 - JE LBB0_254 - -LBB0_735: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000f024848b48 // mov rax, qword [rsp + 240] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x07 // vpinsrb xmm2, xmm2, byte [rdi + rax], 7 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x08ca // vpextrb edx, xmm1, 8 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - JNE LBB0_736 - -LBB0_255: - LONG $0x1479e3c4; WORD $0x09c9 // vpextrb ecx, xmm1, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_256 - -LBB0_737: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000d024848b48 // mov rax, qword [rsp + 208] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x09 // vpinsrb xmm2, xmm2, byte [rdi + rax], 9 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x0ace // vpextrb esi, xmm1, 10 - LONG $0x01c6f640 // test sil, 1 - JNE LBB0_738 - -LBB0_257: - LONG $0x1479e3c4; WORD $0x0bc8 // vpextrb eax, xmm1, 11 - WORD $0x01a8 // test al, 1 - JE LBB0_258 - -LBB0_739: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000c0249c8b48 // mov rbx, qword [rsp + 192] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0b // vpinsrb xmm2, xmm2, byte [rdi + rbx], 11 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0ccd // vpextrb r13d, xmm1, 12 - LONG $0x01c5f641 // test r13b, 1 - JNE LBB0_740 - -LBB0_259: - LONG $0x1479c3c4; WORD $0x0dca // vpextrb r10d, xmm1, 13 - LONG $0x01c2f641 // test r10b, 1 - JE LBB0_260 - -LBB0_741: - LONG $0x397d63c4; WORD $0x01f2 
// vextracti128 xmm2, ymm14, 1 - QUAD $0x000000b0249c8b48 // mov rbx, qword [rsp + 176] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0d // vpinsrb xmm2, xmm2, byte [rdi + rbx], 13 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0ecb // vpextrb r11d, xmm1, 14 - LONG $0x01c3f641 // test r11b, 1 - JNE LBB0_742 - -LBB0_261: - LONG $0x1479c3c4; WORD $0x0fce // vpextrb r14d, xmm1, 15 - LONG $0x01c6f641 // test r14b, 1 - JE LBB0_263 - -LBB0_262: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - QUAD $0x000000a0249c8b48 // mov rbx, qword [rsp + 160] - LONG $0x2071e3c4; WORD $0x1f0c; BYTE $0x0f // vpinsrb xmm1, xmm1, byte [rdi + rbx], 15 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - -LBB0_263: - LONG $0x7175c1c4; WORD $0x02d6 // vpsrlw ymm1, ymm14, 2 - QUAD $0x00000080b5db75c5 // vpand ymm14, ymm1, yword 128[rbp] /* [rip + .LCPI0_4] */ - LONG $0x7e79c1c4; BYTE $0xf7 // vmovd r15d, xmm6 - LONG $0x01c7f641 // test r15b, 1 - JE LBB0_264 - LONG $0x7ef961c4; BYTE $0xfb // vmovq rbx, xmm15 - LONG $0x147943c4; WORD $0x1834; BYTE $0x00 // vpextrb byte [r8 + rbx], xmm14, 0 - LONG $0x1479e3c4; WORD $0x01f3 // vpextrb ebx, xmm6, 1 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_744 - -LBB0_265: - LONG $0x1479e3c4; WORD $0x02f3 // vpextrb ebx, xmm6, 2 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - QUAD $0x000000e024bc8b4c // mov r15, qword [rsp + 224] - JE LBB0_266 - -LBB0_745: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x02 // vpextrb byte [r8 + rbx], xmm14, 2 - LONG $0x1479e3c4; WORD $0x03f3 // vpextrb ebx, xmm6, 3 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_746 - -LBB0_267: - LONG $0x1479e3c4; WORD $0x04f3 // vpextrb ebx, xmm6, 4 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_268 - -LBB0_747: - LONG $0x7ef9e1c4; BYTE $0xeb // vmovq rbx, xmm5 - LONG $0x147943c4; WORD $0x1834; 
BYTE $0x04 // vpextrb byte [r8 + rbx], xmm14, 4 - LONG $0x1479e3c4; WORD $0x05f3 // vpextrb ebx, xmm6, 5 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_748 - -LBB0_269: - LONG $0x1479e3c4; WORD $0x06f3 // vpextrb ebx, xmm6, 6 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_270 - -LBB0_749: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x06 // vpextrb byte [r8 + rbx], xmm14, 6 - LONG $0x1479e3c4; WORD $0x07f3 // vpextrb ebx, xmm6, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_750 - -LBB0_271: - LONG $0x1479e3c4; WORD $0x08f3 // vpextrb ebx, xmm6, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_272 - -LBB0_751: - LONG $0x7ef961c4; BYTE $0xe3 // vmovq rbx, xmm12 - LONG $0x147943c4; WORD $0x1834; BYTE $0x08 // vpextrb byte [r8 + rbx], xmm14, 8 - LONG $0x1479e3c4; WORD $0x09f3 // vpextrb ebx, xmm6, 9 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_752 - -LBB0_273: - LONG $0x1479e3c4; WORD $0x0af3 // vpextrb ebx, xmm6, 10 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_274 - -LBB0_753: - LONG $0x397d63c4; WORD $0x01e1 // vextracti128 xmm1, ymm12, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0a // vpextrb byte [r8 + rbx], xmm14, 10 - LONG $0x1479e3c4; WORD $0x0bf3 // vpextrb ebx, xmm6, 11 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_754 - -LBB0_275: - LONG $0x1479e3c4; WORD $0x0cf3 // vpextrb ebx, xmm6, 12 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_276 - -LBB0_755: - LONG $0x7ef961c4; BYTE $0xdb // vmovq rbx, xmm11 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0c // vpextrb byte [r8 + rbx], xmm14, 12 - LONG $0x1479e3c4; WORD $0x0df3 // vpextrb ebx, xmm6, 13 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_756 - -LBB0_277: - LONG $0x1479e3c4; WORD $0x0ef3 // vpextrb ebx, xmm6, 14 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_278 - -LBB0_757: - LONG $0x397d63c4; WORD $0x01d9 
// vextracti128 xmm1, ymm11, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0e // vpextrb byte [r8 + rbx], xmm14, 14 - LONG $0x1479e3c4; WORD $0x0ff3 // vpextrb ebx, xmm6, 15 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_758 - -LBB0_279: - LONG $0x2c2444f6; BYTE $0x01 // test byte [rsp + 44], 1 - JE LBB0_280 - -LBB0_759: - LONG $0x7ef961c4; BYTE $0xd3 // vmovq rbx, xmm10 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x00 // vpextrb byte [r8 + rbx], xmm1, 0 - LONG $0x282444f6; BYTE $0x01 // test byte [rsp + 40], 1 - JNE LBB0_760 - -LBB0_281: - LONG $0x242444f6; BYTE $0x01 // test byte [rsp + 36], 1 - JE LBB0_282 - -LBB0_761: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x02 // vpextrb byte [r8 + rbx], xmm1, 2 - LONG $0x202444f6; BYTE $0x01 // test byte [rsp + 32], 1 - JNE LBB0_762 - -LBB0_283: - LONG $0x1c2444f6; BYTE $0x01 // test byte [rsp + 28], 1 - JE LBB0_284 - -LBB0_763: - LONG $0x7ef961c4; BYTE $0xcb // vmovq rbx, xmm9 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x04 // vpextrb byte [r8 + rbx], xmm1, 4 - LONG $0x182444f6; BYTE $0x01 // test byte [rsp + 24], 1 - JNE LBB0_764 - -LBB0_285: - LONG $0x142444f6; BYTE $0x01 // test byte [rsp + 20], 1 - JE LBB0_286 - -LBB0_765: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x06 // vpextrb byte [r8 + rbx], xmm1, 6 - LONG $0x01c1f641 // test r9b, 1 - JNE LBB0_766 - -LBB0_287: - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - QUAD $0x00000128249c8b48 // mov rbx, qword [rsp + 296] - JE LBB0_288 - -LBB0_767: - LONG 
$0x7ef961c4; BYTE $0xc2 // vmovq rdx, xmm8 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x100c; BYTE $0x08 // vpextrb byte [r8 + rdx], xmm1, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_768 - -LBB0_289: - LONG $0x01c6f640 // test sil, 1 - QUAD $0x0000013024948b48 // mov rdx, qword [rsp + 304] - JE LBB0_290 - -LBB0_769: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0a // vpextrb byte [r8 + rcx], xmm1, 10 - WORD $0x01a8 // test al, 1 - QUAD $0x0000009824b48b48 // mov rsi, qword [rsp + 152] - JNE LBB0_770 - -LBB0_291: - LONG $0x01c5f641 // test r13b, 1 - JE LBB0_292 - -LBB0_771: - LONG $0x7ef9e1c4; BYTE $0xf9 // vmovq rcx, xmm7 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0c // vpextrb byte [r8 + rcx], xmm1, 12 - LONG $0x01c2f641 // test r10b, 1 - QUAD $0x0000011824ac8b4c // mov r13, qword [rsp + 280] - JNE LBB0_772 - -LBB0_293: - LONG $0x01c3f641 // test r11b, 1 - JE LBB0_294 - -LBB0_773: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0e // vpextrb byte [r8 + rcx], xmm1, 14 - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000012024848b48 // mov rax, qword [rsp + 288] - QUAD $0x000000e8248c8b4c // mov r9, qword [rsp + 232] - JNE LBB0_295 - JMP LBB0_296 - -LBB0_224: - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_225 - -LBB0_720: - LONG $0x2009e3c4; WORD $0x370c; BYTE $0x01 // vpinsrb xmm1, xmm14, byte [rdi + rsi], 1 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x24548b48; BYTE $0x68 // mov rdx, qword [rsp + 104] - LONG 
$0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_226 - JMP LBB0_227 - -LBB0_228: - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_229 - -LBB0_722: - QUAD $0x00000108248c8b48 // mov rcx, qword [rsp + 264] - LONG $0x2009e3c4; WORD $0x0f0c; BYTE $0x04 // vpinsrb xmm1, xmm14, byte [rdi + rcx], 4 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_723 - -LBB0_230: - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_231 - -LBB0_724: - LONG $0x2009a3c4; WORD $0x0f0c; BYTE $0x06 // vpinsrb xmm1, xmm14, byte [rdi + r9], 6 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_725 - -LBB0_232: - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_233 - -LBB0_726: - LONG $0x2009e3c4; WORD $0x370c; BYTE $0x08 // vpinsrb xmm1, xmm14, byte [rdi + rsi], 8 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x24548b48; BYTE $0x58 // mov rdx, qword [rsp + 88] - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_727 - -LBB0_234: - LONG $0x1479e3c4; WORD $0x0af1 // vpextrb ecx, xmm6, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_235 - -LBB0_728: - LONG $0x2009a3c4; WORD $0x1f0c; BYTE $0x0a // vpinsrb xmm1, xmm14, byte [rdi + r11], 10 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_729 - -LBB0_236: - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_237 - 
-LBB0_730: - QUAD $0x000000f824848b48 // mov rax, qword [rsp + 248] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0c // vpinsrb xmm1, xmm14, byte [rdi + rax], 12 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_238 - JMP LBB0_239 - -LBB0_250: - LONG $0x1479e3c4; WORD $0x04c8 // vpextrb eax, xmm1, 4 - LONG $0x1c244489 // mov dword [rsp + 28], eax - WORD $0x01a8 // test al, 1 - JE LBB0_251 - -LBB0_732: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x0f14; BYTE $0x04 // vpinsrb xmm2, xmm2, byte [rdi + rcx], 4 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x05c8 // vpextrb eax, xmm1, 5 - LONG $0x18244489 // mov dword [rsp + 24], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_733 - -LBB0_252: - LONG $0x1479e3c4; WORD $0x06c8 // vpextrb eax, xmm1, 6 - LONG $0x14244489 // mov dword [rsp + 20], eax - WORD $0x01a8 // test al, 1 - JE LBB0_253 - -LBB0_734: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x3714; BYTE $0x06 // vpinsrb xmm2, xmm2, byte [rdi + rsi], 6 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x07c9 // vpextrb r9d, xmm1, 7 - LONG $0x01c1f641 // test r9b, 1 - JNE LBB0_735 - -LBB0_254: - LONG $0x1479e3c4; WORD $0x08ca // vpextrb edx, xmm1, 8 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - JE LBB0_255 - -LBB0_736: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000d824848b48 // mov rax, qword [rsp + 216] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x08 // vpinsrb xmm2, xmm2, byte [rdi + rax], 8 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x09c9 // vpextrb ecx, xmm1, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_737 - -LBB0_256: - LONG $0x1479e3c4; WORD $0x0ace // vpextrb esi, xmm1, 
10 - LONG $0x01c6f640 // test sil, 1 - JE LBB0_257 - -LBB0_738: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000c824848b48 // mov rax, qword [rsp + 200] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x0a // vpinsrb xmm2, xmm2, byte [rdi + rax], 10 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x0bc8 // vpextrb eax, xmm1, 11 - WORD $0x01a8 // test al, 1 - JNE LBB0_739 - -LBB0_258: - LONG $0x1479c3c4; WORD $0x0ccd // vpextrb r13d, xmm1, 12 - LONG $0x01c5f641 // test r13b, 1 - JE LBB0_259 - -LBB0_740: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000b8249c8b48 // mov rbx, qword [rsp + 184] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0c // vpinsrb xmm2, xmm2, byte [rdi + rbx], 12 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0dca // vpextrb r10d, xmm1, 13 - LONG $0x01c2f641 // test r10b, 1 - JNE LBB0_741 - -LBB0_260: - LONG $0x1479c3c4; WORD $0x0ecb // vpextrb r11d, xmm1, 14 - LONG $0x01c3f641 // test r11b, 1 - JE LBB0_261 - -LBB0_742: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000a8249c8b48 // mov rbx, qword [rsp + 168] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0e // vpinsrb xmm2, xmm2, byte [rdi + rbx], 14 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0fce // vpextrb r14d, xmm1, 15 - LONG $0x01c6f641 // test r14b, 1 - JNE LBB0_262 - JMP LBB0_263 - -LBB0_264: - LONG $0x1479e3c4; WORD $0x01f3 // vpextrb ebx, xmm6, 1 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_265 - -LBB0_744: - LONG $0x16f963c4; WORD $0x01fb // vpextrq rbx, xmm15, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x01 // vpextrb byte [r8 + rbx], xmm14, 1 - LONG $0x1479e3c4; WORD $0x02f3 // vpextrb ebx, xmm6, 2 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - QUAD $0x000000e024bc8b4c // mov r15, qword [rsp + 224] - JNE LBB0_745 - -LBB0_266: - LONG $0x1479e3c4; WORD 
$0x03f3 // vpextrb ebx, xmm6, 3 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_267 - -LBB0_746: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x03 // vpextrb byte [r8 + rbx], xmm14, 3 - LONG $0x1479e3c4; WORD $0x04f3 // vpextrb ebx, xmm6, 4 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_747 - -LBB0_268: - LONG $0x1479e3c4; WORD $0x05f3 // vpextrb ebx, xmm6, 5 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_269 - -LBB0_748: - LONG $0x16f9e3c4; WORD $0x01eb // vpextrq rbx, xmm5, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x05 // vpextrb byte [r8 + rbx], xmm14, 5 - LONG $0x1479e3c4; WORD $0x06f3 // vpextrb ebx, xmm6, 6 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_749 - -LBB0_270: - LONG $0x1479e3c4; WORD $0x07f3 // vpextrb ebx, xmm6, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_271 - -LBB0_750: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x07 // vpextrb byte [r8 + rbx], xmm14, 7 - LONG $0x1479e3c4; WORD $0x08f3 // vpextrb ebx, xmm6, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_751 - -LBB0_272: - LONG $0x1479e3c4; WORD $0x09f3 // vpextrb ebx, xmm6, 9 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_273 - -LBB0_752: - LONG $0x16f963c4; WORD $0x01e3 // vpextrq rbx, xmm12, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x09 // vpextrb byte [r8 + rbx], xmm14, 9 - LONG $0x1479e3c4; WORD $0x0af3 // vpextrb ebx, xmm6, 10 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_753 - -LBB0_274: - LONG $0x1479e3c4; WORD $0x0bf3 // vpextrb ebx, xmm6, 11 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_275 - -LBB0_754: - LONG $0x397d63c4; WORD $0x01e1 // vextracti128 xmm1, ymm12, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0b // vpextrb byte [r8 + rbx], xmm14, 11 - 
LONG $0x1479e3c4; WORD $0x0cf3 // vpextrb ebx, xmm6, 12 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_755 - -LBB0_276: - LONG $0x1479e3c4; WORD $0x0df3 // vpextrb ebx, xmm6, 13 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_277 - -LBB0_756: - LONG $0x16f963c4; WORD $0x01db // vpextrq rbx, xmm11, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0d // vpextrb byte [r8 + rbx], xmm14, 13 - LONG $0x1479e3c4; WORD $0x0ef3 // vpextrb ebx, xmm6, 14 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_757 - -LBB0_278: - LONG $0x1479e3c4; WORD $0x0ff3 // vpextrb ebx, xmm6, 15 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_279 - -LBB0_758: - LONG $0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0f // vpextrb byte [r8 + rbx], xmm14, 15 - LONG $0x2c2444f6; BYTE $0x01 // test byte [rsp + 44], 1 - JNE LBB0_759 - -LBB0_280: - LONG $0x282444f6; BYTE $0x01 // test byte [rsp + 40], 1 - JE LBB0_281 - -LBB0_760: - LONG $0x16f963c4; WORD $0x01d3 // vpextrq rbx, xmm10, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x01 // vpextrb byte [r8 + rbx], xmm1, 1 - LONG $0x242444f6; BYTE $0x01 // test byte [rsp + 36], 1 - JNE LBB0_761 - -LBB0_282: - LONG $0x202444f6; BYTE $0x01 // test byte [rsp + 32], 1 - JE LBB0_283 - -LBB0_762: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x03 // vpextrb byte [r8 + rbx], xmm1, 3 - LONG $0x1c2444f6; BYTE $0x01 // test byte [rsp + 28], 1 - JNE LBB0_763 - -LBB0_284: - LONG $0x182444f6; BYTE $0x01 // test byte [rsp + 24], 1 - JE LBB0_285 - -LBB0_764: - LONG $0x16f963c4; WORD $0x01cb // vpextrq rbx, xmm9, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x05 // 
vpextrb byte [r8 + rbx], xmm1, 5 - LONG $0x142444f6; BYTE $0x01 // test byte [rsp + 20], 1 - JNE LBB0_765 - -LBB0_286: - LONG $0x01c1f641 // test r9b, 1 - JE LBB0_287 - -LBB0_766: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x07 // vpextrb byte [r8 + rbx], xmm1, 7 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - QUAD $0x00000128249c8b48 // mov rbx, qword [rsp + 296] - JNE LBB0_767 - -LBB0_288: - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_289 - -LBB0_768: - LONG $0x16f963c4; WORD $0x01c1 // vpextrq rcx, xmm8, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x09 // vpextrb byte [r8 + rcx], xmm1, 9 - LONG $0x01c6f640 // test sil, 1 - QUAD $0x0000013024948b48 // mov rdx, qword [rsp + 304] - JNE LBB0_769 - -LBB0_290: - WORD $0x01a8 // test al, 1 - QUAD $0x0000009824b48b48 // mov rsi, qword [rsp + 152] - JE LBB0_291 - -LBB0_770: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0b // vpextrb byte [r8 + rcx], xmm1, 11 - LONG $0x01c5f641 // test r13b, 1 - JNE LBB0_771 - -LBB0_292: - LONG $0x01c2f641 // test r10b, 1 - QUAD $0x0000011824ac8b4c // mov r13, qword [rsp + 280] - JE LBB0_293 - -LBB0_772: - LONG $0x16f9e3c4; WORD $0x01f9 // vpextrq rcx, xmm7, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0d // vpextrb byte [r8 + rcx], xmm1, 13 - LONG $0x01c3f641 // test r11b, 1 - JNE LBB0_773 - -LBB0_294: - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000012024848b48 // mov rax, qword [rsp + 288] - QUAD $0x000000e8248c8b4c // mov r9, qword [rsp + 232] - JE LBB0_296 - -LBB0_295: - LONG $0x397de3c4; WORD $0x01f9 // 
vextracti128 xmm1, ymm7, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0f // vpextrb byte [r8 + rcx], xmm1, 15 - -LBB0_296: - QUAD $0x0002c0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 704] - QUAD $0x00020024bceb75c5; BYTE $0x00 // vpor ymm15, ymm1, yword [rsp + 512] - QUAD $0x0001e024acebf5c5; BYTE $0x00 // vpor ymm5, ymm1, yword [rsp + 480] - QUAD $0x0001802494eb75c5; BYTE $0x00 // vpor ymm10, ymm1, yword [rsp + 384] - QUAD $0x000160248ceb75c5; BYTE $0x00 // vpor ymm9, ymm1, yword [rsp + 352] - QUAD $0x0001c024a4eb75c5; BYTE $0x00 // vpor ymm12, ymm1, yword [rsp + 448] - QUAD $0x0001a0249ceb75c5; BYTE $0x00 // vpor ymm11, ymm1, yword [rsp + 416] - QUAD $0x0001402484eb75c5; BYTE $0x00 // vpor ymm8, ymm1, yword [rsp + 320] - LONG $0xf9ebddc5 // vpor ymm7, ymm4, ymm1 - LONG $0x463de3c4; WORD $0x31cf // vperm2i128 ymm1, ymm8, ymm7, 49 - LONG $0x383de3c4; WORD $0x01d7 // vinserti128 ymm2, ymm8, xmm7, 1 - LONG $0xc9c6ecc5; BYTE $0x88 // vshufps ymm1, ymm2, ymm1, 136 - LONG $0x461dc3c4; WORD $0x31d3 // vperm2i128 ymm2, ymm12, ymm11, 49 - LONG $0x381dc3c4; WORD $0x01db // vinserti128 ymm3, ymm12, xmm11, 1 - LONG $0xd2c6e4c5; BYTE $0x88 // vshufps ymm2, ymm3, ymm2, 136 - LONG $0x462dc3c4; WORD $0x31d9 // vperm2i128 ymm3, ymm10, ymm9, 49 - LONG $0x382d43c4; WORD $0x01e9 // vinserti128 ymm13, ymm10, xmm9, 1 - LONG $0xdbc694c5; BYTE $0x88 // vshufps ymm3, ymm13, ymm3, 136 - LONG $0x460563c4; WORD $0x31ed // vperm2i128 ymm13, ymm15, ymm5, 49 - LONG $0x380563c4; WORD $0x01f5 // vinserti128 ymm14, ymm15, xmm5, 1 - LONG $0xc60c41c4; WORD $0x88ed // vshufps ymm13, ymm14, ymm13, 136 - LONG $0x667d41c4; BYTE $0xed // vpcmpgtd ymm13, ymm0, ymm13 - LONG $0xdb66fdc5 // vpcmpgtd ymm3, ymm0, ymm3 - LONG $0xdb6b95c5 // vpackssdw ymm3, ymm13, ymm3 - LONG $0xd266fdc5 // vpcmpgtd ymm2, ymm0, ymm2 - LONG $0xc966fdc5 // vpcmpgtd ymm1, ymm0, ymm1 - LONG $0xc96bedc5 // 
vpackssdw ymm1, ymm2, ymm1 - LONG $0x00fde3c4; WORD $0xd8d3 // vpermq ymm2, ymm3, 216 - LONG $0x00fde3c4; WORD $0xd8c9 // vpermq ymm1, ymm1, 216 - LONG $0xc963edc5 // vpacksswb ymm1, ymm2, ymm1 - LONG $0xf6dbf5c5 // vpand ymm6, ymm1, ymm6 - LONG $0xf17ef9c5 // vmovd ecx, xmm6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_297 - LONG $0x787d62c4; WORD $0x1734 // vpbroadcastb ymm14, byte [rdi + rdx] - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_775 - -LBB0_298: - LONG $0x24548b48; BYTE $0x68 // mov rdx, qword [rsp + 104] - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_300 - -LBB0_299: - LONG $0x2009e3c4; WORD $0x1f0c; BYTE $0x02 // vpinsrb xmm1, xmm14, byte [rdi + rbx], 2 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_300: - LONG $0x24748b48; BYTE $0x60 // mov rsi, qword [rsp + 96] - LONG $0x24548b4c; BYTE $0x48 // mov r10, qword [rsp + 72] - LONG $0x1479e3c4; WORD $0x03f1 // vpextrb ecx, xmm6, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_301 - QUAD $0x00000110248c8b48 // mov rcx, qword [rsp + 272] - LONG $0x2009e3c4; WORD $0x0f0c; BYTE $0x03 // vpinsrb xmm1, xmm14, byte [rdi + rcx], 3 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_777 - -LBB0_302: - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_303 - -LBB0_778: - LONG $0x2009e3c4; WORD $0x170c; BYTE $0x05 // vpinsrb xmm1, xmm14, byte [rdi + rdx], 5 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_779 - -LBB0_304: - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_305 - -LBB0_780: - LONG 
$0x2009a3c4; WORD $0x0f0c; BYTE $0x07 // vpinsrb xmm1, xmm14, byte [rdi + r9], 7 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_781 - -LBB0_306: - LONG $0x24548b48; BYTE $0x58 // mov rdx, qword [rsp + 88] - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_308 - -LBB0_307: - LONG $0x2009a3c4; WORD $0x3f0c; BYTE $0x09 // vpinsrb xmm1, xmm14, byte [rdi + r15], 9 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_308: - QUAD $0x0000009024848b48 // mov rax, qword [rsp + 144] - QUAD $0x0000008824b48b48 // mov rsi, qword [rsp + 136] - QUAD $0x00000080249c8b48 // mov rbx, qword [rsp + 128] - LONG $0x244c8b4c; BYTE $0x78 // mov r9, qword [rsp + 120] - LONG $0x1479e3c4; WORD $0x0af1 // vpextrb ecx, xmm6, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_309 - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0a // vpinsrb xmm1, xmm14, byte [rdi + rax], 10 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_783 - -LBB0_310: - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_311 - -LBB0_784: - QUAD $0x000000f824848b48 // mov rax, qword [rsp + 248] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0c // vpinsrb xmm1, xmm14, byte [rdi + rax], 12 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_312 - JMP LBB0_313 - -LBB0_297: - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_298 - -LBB0_775: - LONG $0x2009e3c4; WORD $0x370c; BYTE $0x01 // vpinsrb xmm1, xmm14, byte [rdi + rsi], 1 - LONG $0x020d63c4; WORD $0x0ff1 // 
vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x24548b48; BYTE $0x68 // mov rdx, qword [rsp + 104] - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_299 - JMP LBB0_300 - -LBB0_301: - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_302 - -LBB0_777: - QUAD $0x00000108248c8b48 // mov rcx, qword [rsp + 264] - LONG $0x2009e3c4; WORD $0x0f0c; BYTE $0x04 // vpinsrb xmm1, xmm14, byte [rdi + rcx], 4 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_778 - -LBB0_303: - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_304 - -LBB0_779: - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x06 // vpinsrb xmm1, xmm14, byte [rdi + rax], 6 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_780 - -LBB0_305: - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_306 - -LBB0_781: - LONG $0x2009e3c4; WORD $0x370c; BYTE $0x08 // vpinsrb xmm1, xmm14, byte [rdi + rsi], 8 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x24548b48; BYTE $0x58 // mov rdx, qword [rsp + 88] - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_307 - JMP LBB0_308 - -LBB0_309: - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_310 - -LBB0_783: - QUAD $0x0000010024848b48 // mov rax, qword [rsp + 256] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0b // vpinsrb xmm1, xmm14, byte [rdi + rax], 11 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD 
$0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_784 - -LBB0_311: - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_313 - -LBB0_312: - LONG $0x2009e3c4; WORD $0x170c; BYTE $0x0d // vpinsrb xmm1, xmm14, byte [rdi + rdx], 13 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_313: - LONG $0x24448b48; BYTE $0x50 // mov rax, qword [rsp + 80] - LONG $0x24548b48; BYTE $0x40 // mov rdx, qword [rsp + 64] - LONG $0x1479e3c4; WORD $0x0ef1 // vpextrb ecx, xmm6, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_315 - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0e // vpinsrb xmm1, xmm14, byte [rdi + rax], 14 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_315: - LONG $0x1479e3c4; WORD $0x0ff1 // vpextrb ecx, xmm6, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_317 - LONG $0x2009a3c4; WORD $0x170c; BYTE $0x0f // vpinsrb xmm1, xmm14, byte [rdi + r10], 15 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_317: - LONG $0x397de3c4; WORD $0x01f1 // vextracti128 xmm1, ymm6, 1 - LONG $0xc87ef9c5 // vmovd eax, xmm1 - LONG $0x2c244489 // mov dword [rsp + 44], eax - WORD $0x01a8 // test al, 1 - JE LBB0_319 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x1714; BYTE $0x00 // vpinsrb xmm2, xmm2, byte [rdi + rdx], 0 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - -LBB0_319: - LONG $0x24448b48; BYTE $0x38 // mov rax, qword [rsp + 56] - LONG $0x1479e3c4; WORD $0x01c9 // vpextrb ecx, xmm1, 1 - LONG $0x28244c89 // mov dword [rsp + 40], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_320 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x3714; BYTE $0x01 // vpinsrb xmm2, xmm2, byte [rdi + rsi], 1 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x02c9 // vpextrb ecx, xmm1, 2 - LONG $0x24244c89 // mov 
dword [rsp + 36], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_786 - -LBB0_321: - LONG $0x1479e3c4; WORD $0x03c9 // vpextrb ecx, xmm1, 3 - LONG $0x20244c89 // mov dword [rsp + 32], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_322 - -LBB0_787: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069a3c4; WORD $0x0f14; BYTE $0x03 // vpinsrb xmm2, xmm2, byte [rdi + r9], 3 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x04c9 // vpextrb ecx, xmm1, 4 - LONG $0x1c244c89 // mov dword [rsp + 28], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_788 - -LBB0_323: - LONG $0x1479e3c4; WORD $0x05c8 // vpextrb eax, xmm1, 5 - LONG $0x18244489 // mov dword [rsp + 24], eax - WORD $0x01a8 // test al, 1 - JE LBB0_325 - -LBB0_324: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069a3c4; WORD $0x2f14; BYTE $0x05 // vpinsrb xmm2, xmm2, byte [rdi + r13], 5 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - -LBB0_325: - LONG $0x24448b48; BYTE $0x70 // mov rax, qword [rsp + 112] - LONG $0x1479e3c4; WORD $0x06c9 // vpextrb ecx, xmm1, 6 - LONG $0x14244c89 // mov dword [rsp + 20], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_326 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x06 // vpinsrb xmm2, xmm2, byte [rdi + rax], 6 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x07c9 // vpextrb r9d, xmm1, 7 - LONG $0x01c1f641 // test r9b, 1 - JNE LBB0_790 - -LBB0_327: - LONG $0x1479e3c4; WORD $0x08ca // vpextrb edx, xmm1, 8 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - JE LBB0_328 - -LBB0_791: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000d824848b48 // mov rax, qword [rsp + 216] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x08 // vpinsrb xmm2, xmm2, byte [rdi + rax], 8 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 
ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x09c9 // vpextrb ecx, xmm1, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_792 - -LBB0_329: - LONG $0x1479e3c4; WORD $0x0ace // vpextrb esi, xmm1, 10 - LONG $0x01c6f640 // test sil, 1 - JE LBB0_330 - -LBB0_793: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000c824848b48 // mov rax, qword [rsp + 200] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x0a // vpinsrb xmm2, xmm2, byte [rdi + rax], 10 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x0bc8 // vpextrb eax, xmm1, 11 - WORD $0x01a8 // test al, 1 - JNE LBB0_794 - -LBB0_331: - LONG $0x1479c3c4; WORD $0x0ccd // vpextrb r13d, xmm1, 12 - LONG $0x01c5f641 // test r13b, 1 - JE LBB0_332 - -LBB0_795: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000b8249c8b48 // mov rbx, qword [rsp + 184] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0c // vpinsrb xmm2, xmm2, byte [rdi + rbx], 12 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0dca // vpextrb r10d, xmm1, 13 - LONG $0x01c2f641 // test r10b, 1 - JNE LBB0_796 - -LBB0_333: - LONG $0x1479c3c4; WORD $0x0ecb // vpextrb r11d, xmm1, 14 - LONG $0x01c3f641 // test r11b, 1 - JE LBB0_334 - -LBB0_797: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000a8249c8b48 // mov rbx, qword [rsp + 168] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0e // vpinsrb xmm2, xmm2, byte [rdi + rbx], 14 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0fce // vpextrb r14d, xmm1, 15 - LONG $0x01c6f641 // test r14b, 1 - JNE LBB0_335 - JMP LBB0_336 - -LBB0_320: - LONG $0x1479e3c4; WORD $0x02c9 // vpextrb ecx, xmm1, 2 - LONG $0x24244c89 // mov dword [rsp + 36], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_321 - -LBB0_786: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x1f14; BYTE 
$0x02 // vpinsrb xmm2, xmm2, byte [rdi + rbx], 2 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x03c9 // vpextrb ecx, xmm1, 3 - LONG $0x20244c89 // mov dword [rsp + 32], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_787 - -LBB0_322: - LONG $0x1479e3c4; WORD $0x04c9 // vpextrb ecx, xmm1, 4 - LONG $0x1c244c89 // mov dword [rsp + 28], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_323 - -LBB0_788: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x04 // vpinsrb xmm2, xmm2, byte [rdi + rax], 4 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x05c8 // vpextrb eax, xmm1, 5 - LONG $0x18244489 // mov dword [rsp + 24], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_324 - JMP LBB0_325 - -LBB0_326: - LONG $0x1479c3c4; WORD $0x07c9 // vpextrb r9d, xmm1, 7 - LONG $0x01c1f641 // test r9b, 1 - JE LBB0_327 - -LBB0_790: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000f024848b48 // mov rax, qword [rsp + 240] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x07 // vpinsrb xmm2, xmm2, byte [rdi + rax], 7 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x08ca // vpextrb edx, xmm1, 8 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - JNE LBB0_791 - -LBB0_328: - LONG $0x1479e3c4; WORD $0x09c9 // vpextrb ecx, xmm1, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_329 - -LBB0_792: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000d024848b48 // mov rax, qword [rsp + 208] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x09 // vpinsrb xmm2, xmm2, byte [rdi + rax], 9 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x0ace // vpextrb esi, xmm1, 10 - LONG $0x01c6f640 // test sil, 1 - JNE LBB0_793 - -LBB0_330: - LONG $0x1479e3c4; WORD $0x0bc8 // vpextrb eax, xmm1, 11 - WORD $0x01a8 // test al, 1 
- JE LBB0_331 - -LBB0_794: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000c0249c8b48 // mov rbx, qword [rsp + 192] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0b // vpinsrb xmm2, xmm2, byte [rdi + rbx], 11 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0ccd // vpextrb r13d, xmm1, 12 - LONG $0x01c5f641 // test r13b, 1 - JNE LBB0_795 - -LBB0_332: - LONG $0x1479c3c4; WORD $0x0dca // vpextrb r10d, xmm1, 13 - LONG $0x01c2f641 // test r10b, 1 - JE LBB0_333 - -LBB0_796: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000b0249c8b48 // mov rbx, qword [rsp + 176] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0d // vpinsrb xmm2, xmm2, byte [rdi + rbx], 13 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0ecb // vpextrb r11d, xmm1, 14 - LONG $0x01c3f641 // test r11b, 1 - JNE LBB0_797 - -LBB0_334: - LONG $0x1479c3c4; WORD $0x0fce // vpextrb r14d, xmm1, 15 - LONG $0x01c6f641 // test r14b, 1 - JE LBB0_336 - -LBB0_335: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - QUAD $0x000000a0249c8b48 // mov rbx, qword [rsp + 160] - LONG $0x2071e3c4; WORD $0x1f0c; BYTE $0x0f // vpinsrb xmm1, xmm1, byte [rdi + rbx], 15 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - -LBB0_336: - LONG $0x7175c1c4; WORD $0x03d6 // vpsrlw ymm1, ymm14, 3 - QUAD $0x00000080b5db75c5 // vpand ymm14, ymm1, yword 128[rbp] /* [rip + .LCPI0_4] */ - LONG $0x7e79c1c4; BYTE $0xf7 // vmovd r15d, xmm6 - LONG $0x01c7f641 // test r15b, 1 - JE LBB0_337 - LONG $0x7ef961c4; BYTE $0xfb // vmovq rbx, xmm15 - LONG $0x147943c4; WORD $0x1834; BYTE $0x00 // vpextrb byte [r8 + rbx], xmm14, 0 - LONG $0x1479e3c4; WORD $0x01f3 // vpextrb ebx, xmm6, 1 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_799 - -LBB0_338: - LONG $0x1479e3c4; WORD $0x02f3 // vpextrb ebx, xmm6, 2 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - QUAD $0x000000e024bc8b4c // 
mov r15, qword [rsp + 224] - JE LBB0_339 - -LBB0_800: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x02 // vpextrb byte [r8 + rbx], xmm14, 2 - LONG $0x1479e3c4; WORD $0x03f3 // vpextrb ebx, xmm6, 3 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_801 - -LBB0_340: - LONG $0x1479e3c4; WORD $0x04f3 // vpextrb ebx, xmm6, 4 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_341 - -LBB0_802: - LONG $0x7ef9e1c4; BYTE $0xeb // vmovq rbx, xmm5 - LONG $0x147943c4; WORD $0x1834; BYTE $0x04 // vpextrb byte [r8 + rbx], xmm14, 4 - LONG $0x1479e3c4; WORD $0x05f3 // vpextrb ebx, xmm6, 5 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_803 - -LBB0_342: - LONG $0x1479e3c4; WORD $0x06f3 // vpextrb ebx, xmm6, 6 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_343 - -LBB0_804: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x06 // vpextrb byte [r8 + rbx], xmm14, 6 - LONG $0x1479e3c4; WORD $0x07f3 // vpextrb ebx, xmm6, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_805 - -LBB0_344: - LONG $0x1479e3c4; WORD $0x08f3 // vpextrb ebx, xmm6, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_345 - -LBB0_806: - LONG $0x7ef961c4; BYTE $0xe3 // vmovq rbx, xmm12 - LONG $0x147943c4; WORD $0x1834; BYTE $0x08 // vpextrb byte [r8 + rbx], xmm14, 8 - LONG $0x1479e3c4; WORD $0x09f3 // vpextrb ebx, xmm6, 9 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_807 - -LBB0_346: - LONG $0x1479e3c4; WORD $0x0af3 // vpextrb ebx, xmm6, 10 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_347 - -LBB0_808: - LONG $0x397d63c4; WORD $0x01e1 // vextracti128 xmm1, ymm12, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0a // vpextrb byte [r8 + rbx], xmm14, 10 - LONG $0x1479e3c4; WORD $0x0bf3 // vpextrb ebx, xmm6, 11 - WORD $0xc3f6; BYTE $0x01 // 
test bl, 1 - JNE LBB0_809 - -LBB0_348: - LONG $0x1479e3c4; WORD $0x0cf3 // vpextrb ebx, xmm6, 12 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_349 - -LBB0_810: - LONG $0x7ef961c4; BYTE $0xdb // vmovq rbx, xmm11 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0c // vpextrb byte [r8 + rbx], xmm14, 12 - LONG $0x1479e3c4; WORD $0x0df3 // vpextrb ebx, xmm6, 13 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_811 - -LBB0_350: - LONG $0x1479e3c4; WORD $0x0ef3 // vpextrb ebx, xmm6, 14 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_351 - -LBB0_812: - LONG $0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0e // vpextrb byte [r8 + rbx], xmm14, 14 - LONG $0x1479e3c4; WORD $0x0ff3 // vpextrb ebx, xmm6, 15 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_813 - -LBB0_352: - LONG $0x2c2444f6; BYTE $0x01 // test byte [rsp + 44], 1 - JE LBB0_353 - -LBB0_814: - LONG $0x7ef961c4; BYTE $0xd3 // vmovq rbx, xmm10 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x00 // vpextrb byte [r8 + rbx], xmm1, 0 - LONG $0x282444f6; BYTE $0x01 // test byte [rsp + 40], 1 - JNE LBB0_815 - -LBB0_354: - LONG $0x242444f6; BYTE $0x01 // test byte [rsp + 36], 1 - JE LBB0_355 - -LBB0_816: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x02 // vpextrb byte [r8 + rbx], xmm1, 2 - LONG $0x202444f6; BYTE $0x01 // test byte [rsp + 32], 1 - JNE LBB0_817 - -LBB0_356: - LONG $0x1c2444f6; BYTE $0x01 // test byte [rsp + 28], 1 - JE LBB0_357 - -LBB0_818: - LONG $0x7ef961c4; BYTE $0xcb // vmovq rbx, xmm9 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x04 // vpextrb byte [r8 + rbx], xmm1, 4 - LONG $0x182444f6; BYTE $0x01 // test byte 
[rsp + 24], 1 - JNE LBB0_819 - -LBB0_358: - LONG $0x142444f6; BYTE $0x01 // test byte [rsp + 20], 1 - JE LBB0_359 - -LBB0_820: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x06 // vpextrb byte [r8 + rbx], xmm1, 6 - LONG $0x01c1f641 // test r9b, 1 - JNE LBB0_821 - -LBB0_360: - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - QUAD $0x00000128249c8b48 // mov rbx, qword [rsp + 296] - JE LBB0_361 - -LBB0_822: - LONG $0x7ef961c4; BYTE $0xc2 // vmovq rdx, xmm8 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x100c; BYTE $0x08 // vpextrb byte [r8 + rdx], xmm1, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_823 - -LBB0_362: - LONG $0x01c6f640 // test sil, 1 - QUAD $0x0000013024948b48 // mov rdx, qword [rsp + 304] - JE LBB0_363 - -LBB0_824: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0a // vpextrb byte [r8 + rcx], xmm1, 10 - WORD $0x01a8 // test al, 1 - QUAD $0x0000009824b48b48 // mov rsi, qword [rsp + 152] - JNE LBB0_825 - -LBB0_364: - LONG $0x01c5f641 // test r13b, 1 - JE LBB0_365 - -LBB0_826: - LONG $0x7ef9e1c4; BYTE $0xf9 // vmovq rcx, xmm7 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0c // vpextrb byte [r8 + rcx], xmm1, 12 - LONG $0x01c2f641 // test r10b, 1 - QUAD $0x0000011824ac8b4c // mov r13, qword [rsp + 280] - JNE LBB0_827 - -LBB0_366: - LONG $0x01c3f641 // test r11b, 1 - JE LBB0_367 - -LBB0_828: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0e // vpextrb 
byte [r8 + rcx], xmm1, 14 - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000012024848b48 // mov rax, qword [rsp + 288] - QUAD $0x000000e8248c8b4c // mov r9, qword [rsp + 232] - JNE LBB0_368 - JMP LBB0_369 - -LBB0_337: - LONG $0x1479e3c4; WORD $0x01f3 // vpextrb ebx, xmm6, 1 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_338 - -LBB0_799: - LONG $0x16f963c4; WORD $0x01fb // vpextrq rbx, xmm15, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x01 // vpextrb byte [r8 + rbx], xmm14, 1 - LONG $0x1479e3c4; WORD $0x02f3 // vpextrb ebx, xmm6, 2 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - QUAD $0x000000e024bc8b4c // mov r15, qword [rsp + 224] - JNE LBB0_800 - -LBB0_339: - LONG $0x1479e3c4; WORD $0x03f3 // vpextrb ebx, xmm6, 3 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_340 - -LBB0_801: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x03 // vpextrb byte [r8 + rbx], xmm14, 3 - LONG $0x1479e3c4; WORD $0x04f3 // vpextrb ebx, xmm6, 4 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_802 - -LBB0_341: - LONG $0x1479e3c4; WORD $0x05f3 // vpextrb ebx, xmm6, 5 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_342 - -LBB0_803: - LONG $0x16f9e3c4; WORD $0x01eb // vpextrq rbx, xmm5, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x05 // vpextrb byte [r8 + rbx], xmm14, 5 - LONG $0x1479e3c4; WORD $0x06f3 // vpextrb ebx, xmm6, 6 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_804 - -LBB0_343: - LONG $0x1479e3c4; WORD $0x07f3 // vpextrb ebx, xmm6, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_344 - -LBB0_805: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x07 // vpextrb byte [r8 + rbx], xmm14, 7 - LONG $0x1479e3c4; WORD $0x08f3 // vpextrb ebx, xmm6, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_806 - -LBB0_345: - LONG $0x1479e3c4; WORD $0x09f3 // 
vpextrb ebx, xmm6, 9 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_346 - -LBB0_807: - LONG $0x16f963c4; WORD $0x01e3 // vpextrq rbx, xmm12, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x09 // vpextrb byte [r8 + rbx], xmm14, 9 - LONG $0x1479e3c4; WORD $0x0af3 // vpextrb ebx, xmm6, 10 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_808 - -LBB0_347: - LONG $0x1479e3c4; WORD $0x0bf3 // vpextrb ebx, xmm6, 11 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_348 - -LBB0_809: - LONG $0x397d63c4; WORD $0x01e1 // vextracti128 xmm1, ymm12, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0b // vpextrb byte [r8 + rbx], xmm14, 11 - LONG $0x1479e3c4; WORD $0x0cf3 // vpextrb ebx, xmm6, 12 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_810 - -LBB0_349: - LONG $0x1479e3c4; WORD $0x0df3 // vpextrb ebx, xmm6, 13 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_350 - -LBB0_811: - LONG $0x16f963c4; WORD $0x01db // vpextrq rbx, xmm11, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0d // vpextrb byte [r8 + rbx], xmm14, 13 - LONG $0x1479e3c4; WORD $0x0ef3 // vpextrb ebx, xmm6, 14 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_812 - -LBB0_351: - LONG $0x1479e3c4; WORD $0x0ff3 // vpextrb ebx, xmm6, 15 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_352 - -LBB0_813: - LONG $0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0f // vpextrb byte [r8 + rbx], xmm14, 15 - LONG $0x2c2444f6; BYTE $0x01 // test byte [rsp + 44], 1 - JNE LBB0_814 - -LBB0_353: - LONG $0x282444f6; BYTE $0x01 // test byte [rsp + 40], 1 - JE LBB0_354 - -LBB0_815: - LONG $0x16f963c4; WORD $0x01d3 // vpextrq rbx, xmm10, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x01 // vpextrb byte [r8 + rbx], xmm1, 1 - LONG $0x242444f6; BYTE $0x01 // test byte [rsp + 36], 1 - JNE LBB0_816 - -LBB0_355: - 
LONG $0x202444f6; BYTE $0x01 // test byte [rsp + 32], 1 - JE LBB0_356 - -LBB0_817: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x03 // vpextrb byte [r8 + rbx], xmm1, 3 - LONG $0x1c2444f6; BYTE $0x01 // test byte [rsp + 28], 1 - JNE LBB0_818 - -LBB0_357: - LONG $0x182444f6; BYTE $0x01 // test byte [rsp + 24], 1 - JE LBB0_358 - -LBB0_819: - LONG $0x16f963c4; WORD $0x01cb // vpextrq rbx, xmm9, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x05 // vpextrb byte [r8 + rbx], xmm1, 5 - LONG $0x142444f6; BYTE $0x01 // test byte [rsp + 20], 1 - JNE LBB0_820 - -LBB0_359: - LONG $0x01c1f641 // test r9b, 1 - JE LBB0_360 - -LBB0_821: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x07 // vpextrb byte [r8 + rbx], xmm1, 7 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - QUAD $0x00000128249c8b48 // mov rbx, qword [rsp + 296] - JNE LBB0_822 - -LBB0_361: - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_362 - -LBB0_823: - LONG $0x16f963c4; WORD $0x01c1 // vpextrq rcx, xmm8, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x09 // vpextrb byte [r8 + rcx], xmm1, 9 - LONG $0x01c6f640 // test sil, 1 - QUAD $0x0000013024948b48 // mov rdx, qword [rsp + 304] - JNE LBB0_824 - -LBB0_363: - WORD $0x01a8 // test al, 1 - QUAD $0x0000009824b48b48 // mov rsi, qword [rsp + 152] - JE LBB0_364 - -LBB0_825: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0b // 
vpextrb byte [r8 + rcx], xmm1, 11 - LONG $0x01c5f641 // test r13b, 1 - JNE LBB0_826 - -LBB0_365: - LONG $0x01c2f641 // test r10b, 1 - QUAD $0x0000011824ac8b4c // mov r13, qword [rsp + 280] - JE LBB0_366 - -LBB0_827: - LONG $0x16f9e3c4; WORD $0x01f9 // vpextrq rcx, xmm7, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0d // vpextrb byte [r8 + rcx], xmm1, 13 - LONG $0x01c3f641 // test r11b, 1 - JNE LBB0_828 - -LBB0_367: - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000012024848b48 // mov rax, qword [rsp + 288] - QUAD $0x000000e8248c8b4c // mov r9, qword [rsp + 232] - JE LBB0_369 - -LBB0_368: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0f // vpextrb byte [r8 + rcx], xmm1, 15 - -LBB0_369: - QUAD $0x0002a0248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 672] - QUAD $0x00020024bceb75c5; BYTE $0x00 // vpor ymm15, ymm1, yword [rsp + 512] - QUAD $0x0001e024acebf5c5; BYTE $0x00 // vpor ymm5, ymm1, yword [rsp + 480] - QUAD $0x0001802494eb75c5; BYTE $0x00 // vpor ymm10, ymm1, yword [rsp + 384] - QUAD $0x000160248ceb75c5; BYTE $0x00 // vpor ymm9, ymm1, yword [rsp + 352] - QUAD $0x0001c024a4eb75c5; BYTE $0x00 // vpor ymm12, ymm1, yword [rsp + 448] - QUAD $0x0001a0249ceb75c5; BYTE $0x00 // vpor ymm11, ymm1, yword [rsp + 416] - QUAD $0x0001402484eb75c5; BYTE $0x00 // vpor ymm8, ymm1, yword [rsp + 320] - LONG $0xf9ebddc5 // vpor ymm7, ymm4, ymm1 - LONG $0x463de3c4; WORD $0x31cf // vperm2i128 ymm1, ymm8, ymm7, 49 - LONG $0x383de3c4; WORD $0x01d7 // vinserti128 ymm2, ymm8, xmm7, 1 - LONG $0xc9c6ecc5; BYTE $0x88 // vshufps ymm1, ymm2, ymm1, 136 - LONG $0x461dc3c4; WORD $0x31d3 // vperm2i128 ymm2, ymm12, ymm11, 49 - LONG $0x381dc3c4; WORD $0x01db // vinserti128 ymm3, ymm12, xmm11, 1 - LONG $0xd2c6e4c5; BYTE $0x88 // vshufps ymm2, ymm3, ymm2, 
136 - LONG $0x462dc3c4; WORD $0x31d9 // vperm2i128 ymm3, ymm10, ymm9, 49 - LONG $0x382d43c4; WORD $0x01e9 // vinserti128 ymm13, ymm10, xmm9, 1 - LONG $0xdbc694c5; BYTE $0x88 // vshufps ymm3, ymm13, ymm3, 136 - LONG $0x460563c4; WORD $0x31ed // vperm2i128 ymm13, ymm15, ymm5, 49 - LONG $0x380563c4; WORD $0x01f5 // vinserti128 ymm14, ymm15, xmm5, 1 - LONG $0xc60c41c4; WORD $0x88ed // vshufps ymm13, ymm14, ymm13, 136 - LONG $0x667d41c4; BYTE $0xed // vpcmpgtd ymm13, ymm0, ymm13 - LONG $0xdb66fdc5 // vpcmpgtd ymm3, ymm0, ymm3 - LONG $0xdb6b95c5 // vpackssdw ymm3, ymm13, ymm3 - LONG $0xd266fdc5 // vpcmpgtd ymm2, ymm0, ymm2 - LONG $0xc966fdc5 // vpcmpgtd ymm1, ymm0, ymm1 - LONG $0xc96bedc5 // vpackssdw ymm1, ymm2, ymm1 - LONG $0x00fde3c4; WORD $0xd8d3 // vpermq ymm2, ymm3, 216 - LONG $0x00fde3c4; WORD $0xd8c9 // vpermq ymm1, ymm1, 216 - LONG $0xc963edc5 // vpacksswb ymm1, ymm2, ymm1 - LONG $0xf6dbf5c5 // vpand ymm6, ymm1, ymm6 - LONG $0xf17ef9c5 // vmovd ecx, xmm6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_370 - LONG $0x787d62c4; WORD $0x1734 // vpbroadcastb ymm14, byte [rdi + rdx] - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_830 - -LBB0_371: - LONG $0x24548b48; BYTE $0x68 // mov rdx, qword [rsp + 104] - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_373 - -LBB0_372: - LONG $0x2009e3c4; WORD $0x1f0c; BYTE $0x02 // vpinsrb xmm1, xmm14, byte [rdi + rbx], 2 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_373: - LONG $0x24748b48; BYTE $0x60 // mov rsi, qword [rsp + 96] - LONG $0x24548b4c; BYTE $0x48 // mov r10, qword [rsp + 72] - LONG $0x1479e3c4; WORD $0x03f1 // vpextrb ecx, xmm6, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_374 - QUAD $0x00000110248c8b48 // mov rcx, qword [rsp + 272] - LONG $0x2009e3c4; WORD $0x0f0c; BYTE $0x03 // vpinsrb xmm1, xmm14, byte [rdi + rcx], 3 - LONG $0x020d63c4; WORD $0x0ff1 // 
vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_832 - -LBB0_375: - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_376 - -LBB0_833: - LONG $0x2009e3c4; WORD $0x170c; BYTE $0x05 // vpinsrb xmm1, xmm14, byte [rdi + rdx], 5 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_834 - -LBB0_377: - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_378 - -LBB0_835: - LONG $0x2009a3c4; WORD $0x0f0c; BYTE $0x07 // vpinsrb xmm1, xmm14, byte [rdi + r9], 7 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_836 - -LBB0_379: - LONG $0x24548b48; BYTE $0x58 // mov rdx, qword [rsp + 88] - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_381 - -LBB0_380: - LONG $0x2009a3c4; WORD $0x3f0c; BYTE $0x09 // vpinsrb xmm1, xmm14, byte [rdi + r15], 9 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_381: - QUAD $0x0000009024848b48 // mov rax, qword [rsp + 144] - QUAD $0x0000008824b48b48 // mov rsi, qword [rsp + 136] - QUAD $0x00000080249c8b48 // mov rbx, qword [rsp + 128] - LONG $0x244c8b4c; BYTE $0x78 // mov r9, qword [rsp + 120] - LONG $0x1479e3c4; WORD $0x0af1 // vpextrb ecx, xmm6, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_382 - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0a // vpinsrb xmm1, xmm14, byte [rdi + rax], 10 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_838 - -LBB0_383: - LONG $0x1479e3c4; WORD $0x0cf1 // 
vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_384 - -LBB0_839: - QUAD $0x000000f824848b48 // mov rax, qword [rsp + 248] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0c // vpinsrb xmm1, xmm14, byte [rdi + rax], 12 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_385 - JMP LBB0_386 - -LBB0_370: - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_371 - -LBB0_830: - LONG $0x2009e3c4; WORD $0x370c; BYTE $0x01 // vpinsrb xmm1, xmm14, byte [rdi + rsi], 1 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x24548b48; BYTE $0x68 // mov rdx, qword [rsp + 104] - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_372 - JMP LBB0_373 - -LBB0_374: - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_375 - -LBB0_832: - QUAD $0x00000108248c8b48 // mov rcx, qword [rsp + 264] - LONG $0x2009e3c4; WORD $0x0f0c; BYTE $0x04 // vpinsrb xmm1, xmm14, byte [rdi + rcx], 4 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_833 - -LBB0_376: - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_377 - -LBB0_834: - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x06 // vpinsrb xmm1, xmm14, byte [rdi + rax], 6 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_835 - -LBB0_378: - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_379 - -LBB0_836: - LONG $0x2009e3c4; WORD $0x370c; BYTE $0x08 // vpinsrb xmm1, xmm14, 
byte [rdi + rsi], 8 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x24548b48; BYTE $0x58 // mov rdx, qword [rsp + 88] - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_380 - JMP LBB0_381 - -LBB0_382: - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_383 - -LBB0_838: - QUAD $0x0000010024848b48 // mov rax, qword [rsp + 256] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0b // vpinsrb xmm1, xmm14, byte [rdi + rax], 11 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_839 - -LBB0_384: - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_386 - -LBB0_385: - LONG $0x2009e3c4; WORD $0x170c; BYTE $0x0d // vpinsrb xmm1, xmm14, byte [rdi + rdx], 13 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_386: - LONG $0x24448b48; BYTE $0x50 // mov rax, qword [rsp + 80] - LONG $0x24548b48; BYTE $0x40 // mov rdx, qword [rsp + 64] - LONG $0x1479e3c4; WORD $0x0ef1 // vpextrb ecx, xmm6, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_388 - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0e // vpinsrb xmm1, xmm14, byte [rdi + rax], 14 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_388: - LONG $0x1479e3c4; WORD $0x0ff1 // vpextrb ecx, xmm6, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_390 - LONG $0x2009a3c4; WORD $0x170c; BYTE $0x0f // vpinsrb xmm1, xmm14, byte [rdi + r10], 15 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_390: - LONG $0x397de3c4; WORD $0x01f1 // vextracti128 xmm1, ymm6, 1 - LONG $0xc87ef9c5 // vmovd eax, xmm1 - LONG $0x2c244489 // mov dword [rsp + 44], eax - WORD $0x01a8 // test al, 1 - JE LBB0_392 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - 
LONG $0x2069e3c4; WORD $0x1714; BYTE $0x00 // vpinsrb xmm2, xmm2, byte [rdi + rdx], 0 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - -LBB0_392: - LONG $0x24448b48; BYTE $0x38 // mov rax, qword [rsp + 56] - LONG $0x1479e3c4; WORD $0x01c9 // vpextrb ecx, xmm1, 1 - LONG $0x28244c89 // mov dword [rsp + 40], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_393 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x3714; BYTE $0x01 // vpinsrb xmm2, xmm2, byte [rdi + rsi], 1 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x02c9 // vpextrb ecx, xmm1, 2 - LONG $0x24244c89 // mov dword [rsp + 36], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_841 - -LBB0_394: - LONG $0x1479e3c4; WORD $0x03c9 // vpextrb ecx, xmm1, 3 - LONG $0x20244c89 // mov dword [rsp + 32], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_395 - -LBB0_842: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069a3c4; WORD $0x0f14; BYTE $0x03 // vpinsrb xmm2, xmm2, byte [rdi + r9], 3 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x04c9 // vpextrb ecx, xmm1, 4 - LONG $0x1c244c89 // mov dword [rsp + 28], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_843 - -LBB0_396: - LONG $0x1479e3c4; WORD $0x05c8 // vpextrb eax, xmm1, 5 - LONG $0x18244489 // mov dword [rsp + 24], eax - WORD $0x01a8 // test al, 1 - JE LBB0_398 - -LBB0_397: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069a3c4; WORD $0x2f14; BYTE $0x05 // vpinsrb xmm2, xmm2, byte [rdi + r13], 5 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - -LBB0_398: - LONG $0x24448b48; BYTE $0x70 // mov rax, qword [rsp + 112] - LONG $0x1479e3c4; WORD $0x06c9 // vpextrb ecx, xmm1, 6 - LONG $0x14244c89 // mov dword [rsp + 20], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_399 - LONG $0x397d63c4; WORD 
$0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x06 // vpinsrb xmm2, xmm2, byte [rdi + rax], 6 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x07c9 // vpextrb r9d, xmm1, 7 - LONG $0x01c1f641 // test r9b, 1 - JNE LBB0_845 - -LBB0_400: - LONG $0x1479e3c4; WORD $0x08ca // vpextrb edx, xmm1, 8 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - JE LBB0_401 - -LBB0_846: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000d824848b48 // mov rax, qword [rsp + 216] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x08 // vpinsrb xmm2, xmm2, byte [rdi + rax], 8 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x09c9 // vpextrb ecx, xmm1, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_847 - -LBB0_402: - LONG $0x1479e3c4; WORD $0x0ace // vpextrb esi, xmm1, 10 - LONG $0x01c6f640 // test sil, 1 - JE LBB0_403 - -LBB0_848: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000c824848b48 // mov rax, qword [rsp + 200] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x0a // vpinsrb xmm2, xmm2, byte [rdi + rax], 10 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x0bc8 // vpextrb eax, xmm1, 11 - WORD $0x01a8 // test al, 1 - JNE LBB0_849 - -LBB0_404: - LONG $0x1479c3c4; WORD $0x0ccd // vpextrb r13d, xmm1, 12 - LONG $0x01c5f641 // test r13b, 1 - JE LBB0_405 - -LBB0_850: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000b8249c8b48 // mov rbx, qword [rsp + 184] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0c // vpinsrb xmm2, xmm2, byte [rdi + rbx], 12 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0dca // vpextrb r10d, xmm1, 13 - LONG $0x01c2f641 // test r10b, 1 - JNE LBB0_851 - -LBB0_406: - LONG $0x1479c3c4; WORD $0x0ecb // vpextrb r11d, xmm1, 14 - LONG $0x01c3f641 // test r11b, 1 - JE LBB0_407 - 
-LBB0_852: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000a8249c8b48 // mov rbx, qword [rsp + 168] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0e // vpinsrb xmm2, xmm2, byte [rdi + rbx], 14 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0fce // vpextrb r14d, xmm1, 15 - LONG $0x01c6f641 // test r14b, 1 - JNE LBB0_408 - JMP LBB0_409 - -LBB0_393: - LONG $0x1479e3c4; WORD $0x02c9 // vpextrb ecx, xmm1, 2 - LONG $0x24244c89 // mov dword [rsp + 36], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_394 - -LBB0_841: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x02 // vpinsrb xmm2, xmm2, byte [rdi + rbx], 2 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x03c9 // vpextrb ecx, xmm1, 3 - LONG $0x20244c89 // mov dword [rsp + 32], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_842 - -LBB0_395: - LONG $0x1479e3c4; WORD $0x04c9 // vpextrb ecx, xmm1, 4 - LONG $0x1c244c89 // mov dword [rsp + 28], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_396 - -LBB0_843: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x04 // vpinsrb xmm2, xmm2, byte [rdi + rax], 4 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x05c8 // vpextrb eax, xmm1, 5 - LONG $0x18244489 // mov dword [rsp + 24], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_397 - JMP LBB0_398 - -LBB0_399: - LONG $0x1479c3c4; WORD $0x07c9 // vpextrb r9d, xmm1, 7 - LONG $0x01c1f641 // test r9b, 1 - JE LBB0_400 - -LBB0_845: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000f024848b48 // mov rax, qword [rsp + 240] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x07 // vpinsrb xmm2, xmm2, byte [rdi + rax], 7 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x08ca 
// vpextrb edx, xmm1, 8 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - JNE LBB0_846 - -LBB0_401: - LONG $0x1479e3c4; WORD $0x09c9 // vpextrb ecx, xmm1, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_402 - -LBB0_847: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000d024848b48 // mov rax, qword [rsp + 208] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x09 // vpinsrb xmm2, xmm2, byte [rdi + rax], 9 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x0ace // vpextrb esi, xmm1, 10 - LONG $0x01c6f640 // test sil, 1 - JNE LBB0_848 - -LBB0_403: - LONG $0x1479e3c4; WORD $0x0bc8 // vpextrb eax, xmm1, 11 - WORD $0x01a8 // test al, 1 - JE LBB0_404 - -LBB0_849: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000c0249c8b48 // mov rbx, qword [rsp + 192] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0b // vpinsrb xmm2, xmm2, byte [rdi + rbx], 11 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0ccd // vpextrb r13d, xmm1, 12 - LONG $0x01c5f641 // test r13b, 1 - JNE LBB0_850 - -LBB0_405: - LONG $0x1479c3c4; WORD $0x0dca // vpextrb r10d, xmm1, 13 - LONG $0x01c2f641 // test r10b, 1 - JE LBB0_406 - -LBB0_851: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000b0249c8b48 // mov rbx, qword [rsp + 176] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0d // vpinsrb xmm2, xmm2, byte [rdi + rbx], 13 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0ecb // vpextrb r11d, xmm1, 14 - LONG $0x01c3f641 // test r11b, 1 - JNE LBB0_852 - -LBB0_407: - LONG $0x1479c3c4; WORD $0x0fce // vpextrb r14d, xmm1, 15 - LONG $0x01c6f641 // test r14b, 1 - JE LBB0_409 - -LBB0_408: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - QUAD $0x000000a0249c8b48 // mov rbx, qword [rsp + 160] - LONG $0x2071e3c4; WORD $0x1f0c; BYTE $0x0f // vpinsrb xmm1, xmm1, byte [rdi + rbx], 15 - LONG 
$0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - -LBB0_409: - LONG $0x7175c1c4; WORD $0x04d6 // vpsrlw ymm1, ymm14, 4 - QUAD $0x00000080b5db75c5 // vpand ymm14, ymm1, yword 128[rbp] /* [rip + .LCPI0_4] */ - LONG $0x7e79c1c4; BYTE $0xf7 // vmovd r15d, xmm6 - LONG $0x01c7f641 // test r15b, 1 - JE LBB0_410 - LONG $0x7ef961c4; BYTE $0xfb // vmovq rbx, xmm15 - LONG $0x147943c4; WORD $0x1834; BYTE $0x00 // vpextrb byte [r8 + rbx], xmm14, 0 - LONG $0x1479e3c4; WORD $0x01f3 // vpextrb ebx, xmm6, 1 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_854 - -LBB0_411: - LONG $0x1479e3c4; WORD $0x02f3 // vpextrb ebx, xmm6, 2 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - QUAD $0x000000e024bc8b4c // mov r15, qword [rsp + 224] - JE LBB0_412 - -LBB0_855: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x02 // vpextrb byte [r8 + rbx], xmm14, 2 - LONG $0x1479e3c4; WORD $0x03f3 // vpextrb ebx, xmm6, 3 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_856 - -LBB0_413: - LONG $0x1479e3c4; WORD $0x04f3 // vpextrb ebx, xmm6, 4 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_414 - -LBB0_857: - LONG $0x7ef9e1c4; BYTE $0xeb // vmovq rbx, xmm5 - LONG $0x147943c4; WORD $0x1834; BYTE $0x04 // vpextrb byte [r8 + rbx], xmm14, 4 - LONG $0x1479e3c4; WORD $0x05f3 // vpextrb ebx, xmm6, 5 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_858 - -LBB0_415: - LONG $0x1479e3c4; WORD $0x06f3 // vpextrb ebx, xmm6, 6 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_416 - -LBB0_859: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x06 // vpextrb byte [r8 + rbx], xmm14, 6 - LONG $0x1479e3c4; WORD $0x07f3 // vpextrb ebx, xmm6, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_860 - -LBB0_417: - LONG $0x1479e3c4; WORD $0x08f3 // vpextrb ebx, xmm6, 8 - WORD $0xc3f6; BYTE $0x01 // 
test bl, 1 - JE LBB0_418 - -LBB0_861: - LONG $0x7ef961c4; BYTE $0xe3 // vmovq rbx, xmm12 - LONG $0x147943c4; WORD $0x1834; BYTE $0x08 // vpextrb byte [r8 + rbx], xmm14, 8 - LONG $0x1479e3c4; WORD $0x09f3 // vpextrb ebx, xmm6, 9 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_862 - -LBB0_419: - LONG $0x1479e3c4; WORD $0x0af3 // vpextrb ebx, xmm6, 10 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_420 - -LBB0_863: - LONG $0x397d63c4; WORD $0x01e1 // vextracti128 xmm1, ymm12, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0a // vpextrb byte [r8 + rbx], xmm14, 10 - LONG $0x1479e3c4; WORD $0x0bf3 // vpextrb ebx, xmm6, 11 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_864 - -LBB0_421: - LONG $0x1479e3c4; WORD $0x0cf3 // vpextrb ebx, xmm6, 12 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_422 - -LBB0_865: - LONG $0x7ef961c4; BYTE $0xdb // vmovq rbx, xmm11 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0c // vpextrb byte [r8 + rbx], xmm14, 12 - LONG $0x1479e3c4; WORD $0x0df3 // vpextrb ebx, xmm6, 13 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_866 - -LBB0_423: - LONG $0x1479e3c4; WORD $0x0ef3 // vpextrb ebx, xmm6, 14 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_424 - -LBB0_867: - LONG $0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0e // vpextrb byte [r8 + rbx], xmm14, 14 - LONG $0x1479e3c4; WORD $0x0ff3 // vpextrb ebx, xmm6, 15 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_868 - -LBB0_425: - LONG $0x2c2444f6; BYTE $0x01 // test byte [rsp + 44], 1 - JE LBB0_426 - -LBB0_869: - LONG $0x7ef961c4; BYTE $0xd3 // vmovq rbx, xmm10 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x00 // vpextrb byte [r8 + rbx], xmm1, 0 - LONG $0x282444f6; BYTE $0x01 // test byte [rsp + 40], 1 - JNE LBB0_870 - -LBB0_427: - LONG $0x242444f6; BYTE $0x01 // test byte [rsp 
+ 36], 1 - JE LBB0_428 - -LBB0_871: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x02 // vpextrb byte [r8 + rbx], xmm1, 2 - LONG $0x202444f6; BYTE $0x01 // test byte [rsp + 32], 1 - JNE LBB0_872 - -LBB0_429: - LONG $0x1c2444f6; BYTE $0x01 // test byte [rsp + 28], 1 - JE LBB0_430 - -LBB0_873: - LONG $0x7ef961c4; BYTE $0xcb // vmovq rbx, xmm9 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x04 // vpextrb byte [r8 + rbx], xmm1, 4 - LONG $0x182444f6; BYTE $0x01 // test byte [rsp + 24], 1 - JNE LBB0_874 - -LBB0_431: - LONG $0x142444f6; BYTE $0x01 // test byte [rsp + 20], 1 - JE LBB0_432 - -LBB0_875: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x06 // vpextrb byte [r8 + rbx], xmm1, 6 - LONG $0x01c1f641 // test r9b, 1 - JNE LBB0_876 - -LBB0_433: - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - QUAD $0x00000128249c8b48 // mov rbx, qword [rsp + 296] - JE LBB0_434 - -LBB0_877: - LONG $0x7ef961c4; BYTE $0xc2 // vmovq rdx, xmm8 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x100c; BYTE $0x08 // vpextrb byte [r8 + rdx], xmm1, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_878 - -LBB0_435: - LONG $0x01c6f640 // test sil, 1 - QUAD $0x0000013024948b48 // mov rdx, qword [rsp + 304] - JE LBB0_436 - -LBB0_879: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0a // vpextrb byte [r8 + rcx], xmm1, 10 - WORD $0x01a8 // test al, 1 - QUAD $0x0000009824b48b48 // mov rsi, qword [rsp + 
152] - JNE LBB0_880 - -LBB0_437: - LONG $0x01c5f641 // test r13b, 1 - JE LBB0_438 - -LBB0_881: - LONG $0x7ef9e1c4; BYTE $0xf9 // vmovq rcx, xmm7 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0c // vpextrb byte [r8 + rcx], xmm1, 12 - LONG $0x01c2f641 // test r10b, 1 - QUAD $0x0000011824ac8b4c // mov r13, qword [rsp + 280] - JNE LBB0_882 - -LBB0_439: - LONG $0x01c3f641 // test r11b, 1 - JE LBB0_440 - -LBB0_883: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0e // vpextrb byte [r8 + rcx], xmm1, 14 - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000012024848b48 // mov rax, qword [rsp + 288] - QUAD $0x000000e8248c8b4c // mov r9, qword [rsp + 232] - JNE LBB0_441 - JMP LBB0_442 - -LBB0_410: - LONG $0x1479e3c4; WORD $0x01f3 // vpextrb ebx, xmm6, 1 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_411 - -LBB0_854: - LONG $0x16f963c4; WORD $0x01fb // vpextrq rbx, xmm15, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x01 // vpextrb byte [r8 + rbx], xmm14, 1 - LONG $0x1479e3c4; WORD $0x02f3 // vpextrb ebx, xmm6, 2 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - QUAD $0x000000e024bc8b4c // mov r15, qword [rsp + 224] - JNE LBB0_855 - -LBB0_412: - LONG $0x1479e3c4; WORD $0x03f3 // vpextrb ebx, xmm6, 3 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_413 - -LBB0_856: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x03 // vpextrb byte [r8 + rbx], xmm14, 3 - LONG $0x1479e3c4; WORD $0x04f3 // vpextrb ebx, xmm6, 4 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_857 - -LBB0_414: - LONG $0x1479e3c4; WORD $0x05f3 // vpextrb ebx, xmm6, 5 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_415 - -LBB0_858: - LONG $0x16f9e3c4; WORD $0x01eb // vpextrq rbx, 
xmm5, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x05 // vpextrb byte [r8 + rbx], xmm14, 5 - LONG $0x1479e3c4; WORD $0x06f3 // vpextrb ebx, xmm6, 6 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_859 - -LBB0_416: - LONG $0x1479e3c4; WORD $0x07f3 // vpextrb ebx, xmm6, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_417 - -LBB0_860: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x07 // vpextrb byte [r8 + rbx], xmm14, 7 - LONG $0x1479e3c4; WORD $0x08f3 // vpextrb ebx, xmm6, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_861 - -LBB0_418: - LONG $0x1479e3c4; WORD $0x09f3 // vpextrb ebx, xmm6, 9 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_419 - -LBB0_862: - LONG $0x16f963c4; WORD $0x01e3 // vpextrq rbx, xmm12, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x09 // vpextrb byte [r8 + rbx], xmm14, 9 - LONG $0x1479e3c4; WORD $0x0af3 // vpextrb ebx, xmm6, 10 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_863 - -LBB0_420: - LONG $0x1479e3c4; WORD $0x0bf3 // vpextrb ebx, xmm6, 11 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_421 - -LBB0_864: - LONG $0x397d63c4; WORD $0x01e1 // vextracti128 xmm1, ymm12, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0b // vpextrb byte [r8 + rbx], xmm14, 11 - LONG $0x1479e3c4; WORD $0x0cf3 // vpextrb ebx, xmm6, 12 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_865 - -LBB0_422: - LONG $0x1479e3c4; WORD $0x0df3 // vpextrb ebx, xmm6, 13 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_423 - -LBB0_866: - LONG $0x16f963c4; WORD $0x01db // vpextrq rbx, xmm11, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0d // vpextrb byte [r8 + rbx], xmm14, 13 - LONG $0x1479e3c4; WORD $0x0ef3 // vpextrb ebx, xmm6, 14 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_867 - -LBB0_424: - LONG $0x1479e3c4; WORD $0x0ff3 // vpextrb ebx, xmm6, 15 - WORD $0xc3f6; BYTE $0x01 // 
test bl, 1 - JE LBB0_425 - -LBB0_868: - LONG $0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0f // vpextrb byte [r8 + rbx], xmm14, 15 - LONG $0x2c2444f6; BYTE $0x01 // test byte [rsp + 44], 1 - JNE LBB0_869 - -LBB0_426: - LONG $0x282444f6; BYTE $0x01 // test byte [rsp + 40], 1 - JE LBB0_427 - -LBB0_870: - LONG $0x16f963c4; WORD $0x01d3 // vpextrq rbx, xmm10, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x01 // vpextrb byte [r8 + rbx], xmm1, 1 - LONG $0x242444f6; BYTE $0x01 // test byte [rsp + 36], 1 - JNE LBB0_871 - -LBB0_428: - LONG $0x202444f6; BYTE $0x01 // test byte [rsp + 32], 1 - JE LBB0_429 - -LBB0_872: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x03 // vpextrb byte [r8 + rbx], xmm1, 3 - LONG $0x1c2444f6; BYTE $0x01 // test byte [rsp + 28], 1 - JNE LBB0_873 - -LBB0_430: - LONG $0x182444f6; BYTE $0x01 // test byte [rsp + 24], 1 - JE LBB0_431 - -LBB0_874: - LONG $0x16f963c4; WORD $0x01cb // vpextrq rbx, xmm9, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x05 // vpextrb byte [r8 + rbx], xmm1, 5 - LONG $0x142444f6; BYTE $0x01 // test byte [rsp + 20], 1 - JNE LBB0_875 - -LBB0_432: - LONG $0x01c1f641 // test r9b, 1 - JE LBB0_433 - -LBB0_876: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x07 // vpextrb byte [r8 + rbx], xmm1, 7 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - QUAD $0x00000128249c8b48 // mov rbx, qword [rsp + 296] - JNE LBB0_877 - -LBB0_434: - WORD $0xc1f6; BYTE $0x01 // test 
cl, 1 - JE LBB0_435 - -LBB0_878: - LONG $0x16f963c4; WORD $0x01c1 // vpextrq rcx, xmm8, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x09 // vpextrb byte [r8 + rcx], xmm1, 9 - LONG $0x01c6f640 // test sil, 1 - QUAD $0x0000013024948b48 // mov rdx, qword [rsp + 304] - JNE LBB0_879 - -LBB0_436: - WORD $0x01a8 // test al, 1 - QUAD $0x0000009824b48b48 // mov rsi, qword [rsp + 152] - JE LBB0_437 - -LBB0_880: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0b // vpextrb byte [r8 + rcx], xmm1, 11 - LONG $0x01c5f641 // test r13b, 1 - JNE LBB0_881 - -LBB0_438: - LONG $0x01c2f641 // test r10b, 1 - QUAD $0x0000011824ac8b4c // mov r13, qword [rsp + 280] - JE LBB0_439 - -LBB0_882: - LONG $0x16f9e3c4; WORD $0x01f9 // vpextrq rcx, xmm7, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0d // vpextrb byte [r8 + rcx], xmm1, 13 - LONG $0x01c3f641 // test r11b, 1 - JNE LBB0_883 - -LBB0_440: - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000012024848b48 // mov rax, qword [rsp + 288] - QUAD $0x000000e8248c8b4c // mov r9, qword [rsp + 232] - JE LBB0_442 - -LBB0_441: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0f // vpextrb byte [r8 + rcx], xmm1, 15 - -LBB0_442: - QUAD $0x000280248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 640] - QUAD $0x00020024bceb75c5; BYTE $0x00 // vpor ymm15, ymm1, yword [rsp + 512] - QUAD $0x0001e024acebf5c5; BYTE $0x00 // vpor ymm5, ymm1, yword [rsp + 480] - QUAD $0x0001802494eb75c5; BYTE $0x00 // vpor ymm10, ymm1, yword [rsp + 384] - QUAD $0x000160248ceb75c5; BYTE $0x00 // vpor ymm9, 
ymm1, yword [rsp + 352] - QUAD $0x0001c024a4eb75c5; BYTE $0x00 // vpor ymm12, ymm1, yword [rsp + 448] - QUAD $0x0001a0249ceb75c5; BYTE $0x00 // vpor ymm11, ymm1, yword [rsp + 416] - QUAD $0x0001402484eb75c5; BYTE $0x00 // vpor ymm8, ymm1, yword [rsp + 320] - LONG $0xf9ebddc5 // vpor ymm7, ymm4, ymm1 - LONG $0x463de3c4; WORD $0x31cf // vperm2i128 ymm1, ymm8, ymm7, 49 - LONG $0x383de3c4; WORD $0x01d7 // vinserti128 ymm2, ymm8, xmm7, 1 - LONG $0xc9c6ecc5; BYTE $0x88 // vshufps ymm1, ymm2, ymm1, 136 - LONG $0x461dc3c4; WORD $0x31d3 // vperm2i128 ymm2, ymm12, ymm11, 49 - LONG $0x381dc3c4; WORD $0x01db // vinserti128 ymm3, ymm12, xmm11, 1 - LONG $0xd2c6e4c5; BYTE $0x88 // vshufps ymm2, ymm3, ymm2, 136 - LONG $0x462dc3c4; WORD $0x31d9 // vperm2i128 ymm3, ymm10, ymm9, 49 - LONG $0x382d43c4; WORD $0x01e9 // vinserti128 ymm13, ymm10, xmm9, 1 - LONG $0xdbc694c5; BYTE $0x88 // vshufps ymm3, ymm13, ymm3, 136 - LONG $0x460563c4; WORD $0x31ed // vperm2i128 ymm13, ymm15, ymm5, 49 - LONG $0x380563c4; WORD $0x01f5 // vinserti128 ymm14, ymm15, xmm5, 1 - LONG $0xc60c41c4; WORD $0x88ed // vshufps ymm13, ymm14, ymm13, 136 - LONG $0x667d41c4; BYTE $0xed // vpcmpgtd ymm13, ymm0, ymm13 - LONG $0xdb66fdc5 // vpcmpgtd ymm3, ymm0, ymm3 - LONG $0xdb6b95c5 // vpackssdw ymm3, ymm13, ymm3 - LONG $0xd266fdc5 // vpcmpgtd ymm2, ymm0, ymm2 - LONG $0xc966fdc5 // vpcmpgtd ymm1, ymm0, ymm1 - LONG $0xc96bedc5 // vpackssdw ymm1, ymm2, ymm1 - LONG $0x00fde3c4; WORD $0xd8d3 // vpermq ymm2, ymm3, 216 - LONG $0x00fde3c4; WORD $0xd8c9 // vpermq ymm1, ymm1, 216 - LONG $0xc963edc5 // vpacksswb ymm1, ymm2, ymm1 - LONG $0xf6dbf5c5 // vpand ymm6, ymm1, ymm6 - LONG $0xf17ef9c5 // vmovd ecx, xmm6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_443 - LONG $0x787d62c4; WORD $0x1734 // vpbroadcastb ymm14, byte [rdi + rdx] - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_885 - -LBB0_444: - LONG $0x24548b48; BYTE $0x68 // mov rdx, qword [rsp + 104] - LONG 
$0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_446 - -LBB0_445: - LONG $0x2009e3c4; WORD $0x1f0c; BYTE $0x02 // vpinsrb xmm1, xmm14, byte [rdi + rbx], 2 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_446: - LONG $0x24748b48; BYTE $0x60 // mov rsi, qword [rsp + 96] - LONG $0x24548b4c; BYTE $0x48 // mov r10, qword [rsp + 72] - LONG $0x1479e3c4; WORD $0x03f1 // vpextrb ecx, xmm6, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_447 - QUAD $0x00000110248c8b48 // mov rcx, qword [rsp + 272] - LONG $0x2009e3c4; WORD $0x0f0c; BYTE $0x03 // vpinsrb xmm1, xmm14, byte [rdi + rcx], 3 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_887 - -LBB0_448: - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_449 - -LBB0_888: - LONG $0x2009e3c4; WORD $0x170c; BYTE $0x05 // vpinsrb xmm1, xmm14, byte [rdi + rdx], 5 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_889 - -LBB0_450: - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_451 - -LBB0_890: - LONG $0x2009a3c4; WORD $0x0f0c; BYTE $0x07 // vpinsrb xmm1, xmm14, byte [rdi + r9], 7 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_891 - -LBB0_452: - LONG $0x24548b48; BYTE $0x58 // mov rdx, qword [rsp + 88] - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_454 - -LBB0_453: - LONG $0x2009a3c4; WORD $0x3f0c; BYTE $0x09 // vpinsrb xmm1, xmm14, byte [rdi + r15], 9 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd 
ymm14, ymm14, ymm1, 15 - -LBB0_454: - QUAD $0x0000009024848b48 // mov rax, qword [rsp + 144] - QUAD $0x0000008824b48b48 // mov rsi, qword [rsp + 136] - QUAD $0x00000080249c8b48 // mov rbx, qword [rsp + 128] - LONG $0x244c8b4c; BYTE $0x78 // mov r9, qword [rsp + 120] - LONG $0x1479e3c4; WORD $0x0af1 // vpextrb ecx, xmm6, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_455 - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0a // vpinsrb xmm1, xmm14, byte [rdi + rax], 10 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_893 - -LBB0_456: - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_457 - -LBB0_894: - QUAD $0x000000f824848b48 // mov rax, qword [rsp + 248] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0c // vpinsrb xmm1, xmm14, byte [rdi + rax], 12 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_458 - JMP LBB0_459 - -LBB0_443: - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_444 - -LBB0_885: - LONG $0x2009e3c4; WORD $0x370c; BYTE $0x01 // vpinsrb xmm1, xmm14, byte [rdi + rsi], 1 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x24548b48; BYTE $0x68 // mov rdx, qword [rsp + 104] - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_445 - JMP LBB0_446 - -LBB0_447: - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_448 - -LBB0_887: - QUAD $0x00000108248c8b48 // mov rcx, qword [rsp + 264] - LONG $0x2009e3c4; WORD $0x0f0c; BYTE $0x04 // vpinsrb xmm1, xmm14, byte [rdi + rcx], 4 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD 
$0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_888 - -LBB0_449: - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_450 - -LBB0_889: - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x06 // vpinsrb xmm1, xmm14, byte [rdi + rax], 6 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_890 - -LBB0_451: - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_452 - -LBB0_891: - LONG $0x2009e3c4; WORD $0x370c; BYTE $0x08 // vpinsrb xmm1, xmm14, byte [rdi + rsi], 8 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x24548b48; BYTE $0x58 // mov rdx, qword [rsp + 88] - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_453 - JMP LBB0_454 - -LBB0_455: - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_456 - -LBB0_893: - QUAD $0x0000010024848b48 // mov rax, qword [rsp + 256] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0b // vpinsrb xmm1, xmm14, byte [rdi + rax], 11 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_894 - -LBB0_457: - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_459 - -LBB0_458: - LONG $0x2009e3c4; WORD $0x170c; BYTE $0x0d // vpinsrb xmm1, xmm14, byte [rdi + rdx], 13 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_459: - LONG $0x24448b48; BYTE $0x50 // mov rax, qword [rsp + 80] - LONG $0x24548b48; BYTE $0x40 // mov rdx, qword [rsp + 64] - LONG $0x1479e3c4; WORD $0x0ef1 // vpextrb ecx, xmm6, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_461 - LONG 
$0x2009e3c4; WORD $0x070c; BYTE $0x0e // vpinsrb xmm1, xmm14, byte [rdi + rax], 14 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_461: - LONG $0x1479e3c4; WORD $0x0ff1 // vpextrb ecx, xmm6, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_463 - LONG $0x2009a3c4; WORD $0x170c; BYTE $0x0f // vpinsrb xmm1, xmm14, byte [rdi + r10], 15 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_463: - LONG $0x397de3c4; WORD $0x01f1 // vextracti128 xmm1, ymm6, 1 - LONG $0xc87ef9c5 // vmovd eax, xmm1 - LONG $0x2c244489 // mov dword [rsp + 44], eax - WORD $0x01a8 // test al, 1 - JE LBB0_465 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x1714; BYTE $0x00 // vpinsrb xmm2, xmm2, byte [rdi + rdx], 0 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - -LBB0_465: - LONG $0x24448b48; BYTE $0x38 // mov rax, qword [rsp + 56] - LONG $0x1479e3c4; WORD $0x01c9 // vpextrb ecx, xmm1, 1 - LONG $0x28244c89 // mov dword [rsp + 40], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_466 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x3714; BYTE $0x01 // vpinsrb xmm2, xmm2, byte [rdi + rsi], 1 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x02c9 // vpextrb ecx, xmm1, 2 - LONG $0x24244c89 // mov dword [rsp + 36], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_896 - -LBB0_467: - LONG $0x1479e3c4; WORD $0x03c9 // vpextrb ecx, xmm1, 3 - LONG $0x20244c89 // mov dword [rsp + 32], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_468 - -LBB0_897: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069a3c4; WORD $0x0f14; BYTE $0x03 // vpinsrb xmm2, xmm2, byte [rdi + r9], 3 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x04c9 // vpextrb ecx, xmm1, 4 - LONG $0x1c244c89 // mov dword [rsp + 28], ecx - WORD 
$0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_898 - -LBB0_469: - LONG $0x1479e3c4; WORD $0x05c8 // vpextrb eax, xmm1, 5 - LONG $0x18244489 // mov dword [rsp + 24], eax - WORD $0x01a8 // test al, 1 - JE LBB0_471 - -LBB0_470: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069a3c4; WORD $0x2f14; BYTE $0x05 // vpinsrb xmm2, xmm2, byte [rdi + r13], 5 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - -LBB0_471: - LONG $0x24448b48; BYTE $0x70 // mov rax, qword [rsp + 112] - LONG $0x1479e3c4; WORD $0x06c9 // vpextrb ecx, xmm1, 6 - LONG $0x14244c89 // mov dword [rsp + 20], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_472 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x06 // vpinsrb xmm2, xmm2, byte [rdi + rax], 6 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x07c9 // vpextrb r9d, xmm1, 7 - LONG $0x01c1f641 // test r9b, 1 - JNE LBB0_900 - -LBB0_473: - LONG $0x1479e3c4; WORD $0x08ca // vpextrb edx, xmm1, 8 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - JE LBB0_474 - -LBB0_901: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000d824848b48 // mov rax, qword [rsp + 216] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x08 // vpinsrb xmm2, xmm2, byte [rdi + rax], 8 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x09c9 // vpextrb ecx, xmm1, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_902 - -LBB0_475: - LONG $0x1479e3c4; WORD $0x0ace // vpextrb esi, xmm1, 10 - LONG $0x01c6f640 // test sil, 1 - JE LBB0_476 - -LBB0_903: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000c824848b48 // mov rax, qword [rsp + 200] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x0a // vpinsrb xmm2, xmm2, byte [rdi + rax], 10 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x0bc8 // vpextrb eax, 
xmm1, 11 - WORD $0x01a8 // test al, 1 - JNE LBB0_904 - -LBB0_477: - LONG $0x1479c3c4; WORD $0x0ccd // vpextrb r13d, xmm1, 12 - LONG $0x01c5f641 // test r13b, 1 - JE LBB0_478 - -LBB0_905: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000b8249c8b48 // mov rbx, qword [rsp + 184] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0c // vpinsrb xmm2, xmm2, byte [rdi + rbx], 12 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0dca // vpextrb r10d, xmm1, 13 - LONG $0x01c2f641 // test r10b, 1 - JNE LBB0_906 - -LBB0_479: - LONG $0x1479c3c4; WORD $0x0ecb // vpextrb r11d, xmm1, 14 - LONG $0x01c3f641 // test r11b, 1 - JE LBB0_480 - -LBB0_907: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000a8249c8b48 // mov rbx, qword [rsp + 168] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0e // vpinsrb xmm2, xmm2, byte [rdi + rbx], 14 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0fce // vpextrb r14d, xmm1, 15 - LONG $0x01c6f641 // test r14b, 1 - JNE LBB0_481 - JMP LBB0_482 - -LBB0_466: - LONG $0x1479e3c4; WORD $0x02c9 // vpextrb ecx, xmm1, 2 - LONG $0x24244c89 // mov dword [rsp + 36], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_467 - -LBB0_896: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x02 // vpinsrb xmm2, xmm2, byte [rdi + rbx], 2 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x03c9 // vpextrb ecx, xmm1, 3 - LONG $0x20244c89 // mov dword [rsp + 32], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_897 - -LBB0_468: - LONG $0x1479e3c4; WORD $0x04c9 // vpextrb ecx, xmm1, 4 - LONG $0x1c244c89 // mov dword [rsp + 28], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_469 - -LBB0_898: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x04 // vpinsrb xmm2, 
xmm2, byte [rdi + rax], 4 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x05c8 // vpextrb eax, xmm1, 5 - LONG $0x18244489 // mov dword [rsp + 24], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_470 - JMP LBB0_471 - -LBB0_472: - LONG $0x1479c3c4; WORD $0x07c9 // vpextrb r9d, xmm1, 7 - LONG $0x01c1f641 // test r9b, 1 - JE LBB0_473 - -LBB0_900: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000f024848b48 // mov rax, qword [rsp + 240] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x07 // vpinsrb xmm2, xmm2, byte [rdi + rax], 7 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x08ca // vpextrb edx, xmm1, 8 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - JNE LBB0_901 - -LBB0_474: - LONG $0x1479e3c4; WORD $0x09c9 // vpextrb ecx, xmm1, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_475 - -LBB0_902: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000d024848b48 // mov rax, qword [rsp + 208] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x09 // vpinsrb xmm2, xmm2, byte [rdi + rax], 9 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x0ace // vpextrb esi, xmm1, 10 - LONG $0x01c6f640 // test sil, 1 - JNE LBB0_903 - -LBB0_476: - LONG $0x1479e3c4; WORD $0x0bc8 // vpextrb eax, xmm1, 11 - WORD $0x01a8 // test al, 1 - JE LBB0_477 - -LBB0_904: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000c0249c8b48 // mov rbx, qword [rsp + 192] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0b // vpinsrb xmm2, xmm2, byte [rdi + rbx], 11 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0ccd // vpextrb r13d, xmm1, 12 - LONG $0x01c5f641 // test r13b, 1 - JNE LBB0_905 - -LBB0_478: - LONG $0x1479c3c4; WORD $0x0dca // vpextrb r10d, xmm1, 13 - LONG $0x01c2f641 // test r10b, 1 - JE LBB0_479 - -LBB0_906: - LONG $0x397d63c4; WORD $0x01f2 // 
vextracti128 xmm2, ymm14, 1 - QUAD $0x000000b0249c8b48 // mov rbx, qword [rsp + 176] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0d // vpinsrb xmm2, xmm2, byte [rdi + rbx], 13 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0ecb // vpextrb r11d, xmm1, 14 - LONG $0x01c3f641 // test r11b, 1 - JNE LBB0_907 - -LBB0_480: - LONG $0x1479c3c4; WORD $0x0fce // vpextrb r14d, xmm1, 15 - LONG $0x01c6f641 // test r14b, 1 - JE LBB0_482 - -LBB0_481: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - QUAD $0x000000a0249c8b48 // mov rbx, qword [rsp + 160] - LONG $0x2071e3c4; WORD $0x1f0c; BYTE $0x0f // vpinsrb xmm1, xmm1, byte [rdi + rbx], 15 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - -LBB0_482: - LONG $0x7175c1c4; WORD $0x05d6 // vpsrlw ymm1, ymm14, 5 - QUAD $0x00000080b5db75c5 // vpand ymm14, ymm1, yword 128[rbp] /* [rip + .LCPI0_4] */ - LONG $0x7e79c1c4; BYTE $0xf7 // vmovd r15d, xmm6 - LONG $0x01c7f641 // test r15b, 1 - JE LBB0_483 - LONG $0x7ef961c4; BYTE $0xfb // vmovq rbx, xmm15 - LONG $0x147943c4; WORD $0x1834; BYTE $0x00 // vpextrb byte [r8 + rbx], xmm14, 0 - LONG $0x1479e3c4; WORD $0x01f3 // vpextrb ebx, xmm6, 1 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_909 - -LBB0_484: - LONG $0x1479e3c4; WORD $0x02f3 // vpextrb ebx, xmm6, 2 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - QUAD $0x000000e024bc8b4c // mov r15, qword [rsp + 224] - JE LBB0_485 - -LBB0_910: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x02 // vpextrb byte [r8 + rbx], xmm14, 2 - LONG $0x1479e3c4; WORD $0x03f3 // vpextrb ebx, xmm6, 3 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_911 - -LBB0_486: - LONG $0x1479e3c4; WORD $0x04f3 // vpextrb ebx, xmm6, 4 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_487 - -LBB0_912: - LONG $0x7ef9e1c4; BYTE $0xeb // vmovq rbx, xmm5 - LONG $0x147943c4; WORD $0x1834; 
BYTE $0x04 // vpextrb byte [r8 + rbx], xmm14, 4 - LONG $0x1479e3c4; WORD $0x05f3 // vpextrb ebx, xmm6, 5 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_913 - -LBB0_488: - LONG $0x1479e3c4; WORD $0x06f3 // vpextrb ebx, xmm6, 6 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_489 - -LBB0_914: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x06 // vpextrb byte [r8 + rbx], xmm14, 6 - LONG $0x1479e3c4; WORD $0x07f3 // vpextrb ebx, xmm6, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_915 - -LBB0_490: - LONG $0x1479e3c4; WORD $0x08f3 // vpextrb ebx, xmm6, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_491 - -LBB0_916: - LONG $0x7ef961c4; BYTE $0xe3 // vmovq rbx, xmm12 - LONG $0x147943c4; WORD $0x1834; BYTE $0x08 // vpextrb byte [r8 + rbx], xmm14, 8 - LONG $0x1479e3c4; WORD $0x09f3 // vpextrb ebx, xmm6, 9 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_917 - -LBB0_492: - LONG $0x1479e3c4; WORD $0x0af3 // vpextrb ebx, xmm6, 10 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_493 - -LBB0_918: - LONG $0x397d63c4; WORD $0x01e1 // vextracti128 xmm1, ymm12, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0a // vpextrb byte [r8 + rbx], xmm14, 10 - LONG $0x1479e3c4; WORD $0x0bf3 // vpextrb ebx, xmm6, 11 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_919 - -LBB0_494: - LONG $0x1479e3c4; WORD $0x0cf3 // vpextrb ebx, xmm6, 12 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_495 - -LBB0_920: - LONG $0x7ef961c4; BYTE $0xdb // vmovq rbx, xmm11 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0c // vpextrb byte [r8 + rbx], xmm14, 12 - LONG $0x1479e3c4; WORD $0x0df3 // vpextrb ebx, xmm6, 13 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_921 - -LBB0_496: - LONG $0x1479e3c4; WORD $0x0ef3 // vpextrb ebx, xmm6, 14 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_497 - -LBB0_922: - LONG $0x397d63c4; WORD $0x01d9 
// vextracti128 xmm1, ymm11, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0e // vpextrb byte [r8 + rbx], xmm14, 14 - LONG $0x1479e3c4; WORD $0x0ff3 // vpextrb ebx, xmm6, 15 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_923 - -LBB0_498: - LONG $0x2c2444f6; BYTE $0x01 // test byte [rsp + 44], 1 - JE LBB0_499 - -LBB0_924: - LONG $0x7ef961c4; BYTE $0xd3 // vmovq rbx, xmm10 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x00 // vpextrb byte [r8 + rbx], xmm1, 0 - LONG $0x282444f6; BYTE $0x01 // test byte [rsp + 40], 1 - JNE LBB0_925 - -LBB0_500: - LONG $0x242444f6; BYTE $0x01 // test byte [rsp + 36], 1 - JE LBB0_501 - -LBB0_926: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x02 // vpextrb byte [r8 + rbx], xmm1, 2 - LONG $0x202444f6; BYTE $0x01 // test byte [rsp + 32], 1 - JNE LBB0_927 - -LBB0_502: - LONG $0x1c2444f6; BYTE $0x01 // test byte [rsp + 28], 1 - JE LBB0_503 - -LBB0_928: - LONG $0x7ef961c4; BYTE $0xcb // vmovq rbx, xmm9 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x04 // vpextrb byte [r8 + rbx], xmm1, 4 - LONG $0x182444f6; BYTE $0x01 // test byte [rsp + 24], 1 - JNE LBB0_929 - -LBB0_504: - LONG $0x142444f6; BYTE $0x01 // test byte [rsp + 20], 1 - JE LBB0_505 - -LBB0_930: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x06 // vpextrb byte [r8 + rbx], xmm1, 6 - LONG $0x01c1f641 // test r9b, 1 - JNE LBB0_931 - -LBB0_506: - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - QUAD $0x00000128249c8b48 // mov rbx, qword [rsp + 296] - JE LBB0_507 - -LBB0_932: - LONG 
$0x7ef961c4; BYTE $0xc2 // vmovq rdx, xmm8 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x100c; BYTE $0x08 // vpextrb byte [r8 + rdx], xmm1, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_933 - -LBB0_508: - LONG $0x01c6f640 // test sil, 1 - QUAD $0x0000013024948b48 // mov rdx, qword [rsp + 304] - JE LBB0_509 - -LBB0_934: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0a // vpextrb byte [r8 + rcx], xmm1, 10 - WORD $0x01a8 // test al, 1 - QUAD $0x0000009824b48b48 // mov rsi, qword [rsp + 152] - JNE LBB0_935 - -LBB0_510: - LONG $0x01c5f641 // test r13b, 1 - JE LBB0_511 - -LBB0_936: - LONG $0x7ef9e1c4; BYTE $0xf9 // vmovq rcx, xmm7 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0c // vpextrb byte [r8 + rcx], xmm1, 12 - LONG $0x01c2f641 // test r10b, 1 - QUAD $0x0000011824ac8b4c // mov r13, qword [rsp + 280] - JNE LBB0_937 - -LBB0_512: - LONG $0x01c3f641 // test r11b, 1 - JE LBB0_513 - -LBB0_938: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0e // vpextrb byte [r8 + rcx], xmm1, 14 - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000012024848b48 // mov rax, qword [rsp + 288] - QUAD $0x000000e8248c8b4c // mov r9, qword [rsp + 232] - JNE LBB0_514 - JMP LBB0_515 - -LBB0_483: - LONG $0x1479e3c4; WORD $0x01f3 // vpextrb ebx, xmm6, 1 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_484 - -LBB0_909: - LONG $0x16f963c4; WORD $0x01fb // vpextrq rbx, xmm15, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x01 // vpextrb byte [r8 + rbx], xmm14, 1 - LONG $0x1479e3c4; WORD $0x02f3 // vpextrb ebx, xmm6, 2 - WORD $0xc3f6; BYTE $0x01 // test bl, 
1 - QUAD $0x000000e024bc8b4c // mov r15, qword [rsp + 224] - JNE LBB0_910 - -LBB0_485: - LONG $0x1479e3c4; WORD $0x03f3 // vpextrb ebx, xmm6, 3 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_486 - -LBB0_911: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x03 // vpextrb byte [r8 + rbx], xmm14, 3 - LONG $0x1479e3c4; WORD $0x04f3 // vpextrb ebx, xmm6, 4 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_912 - -LBB0_487: - LONG $0x1479e3c4; WORD $0x05f3 // vpextrb ebx, xmm6, 5 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_488 - -LBB0_913: - LONG $0x16f9e3c4; WORD $0x01eb // vpextrq rbx, xmm5, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x05 // vpextrb byte [r8 + rbx], xmm14, 5 - LONG $0x1479e3c4; WORD $0x06f3 // vpextrb ebx, xmm6, 6 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_914 - -LBB0_489: - LONG $0x1479e3c4; WORD $0x07f3 // vpextrb ebx, xmm6, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_490 - -LBB0_915: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x07 // vpextrb byte [r8 + rbx], xmm14, 7 - LONG $0x1479e3c4; WORD $0x08f3 // vpextrb ebx, xmm6, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_916 - -LBB0_491: - LONG $0x1479e3c4; WORD $0x09f3 // vpextrb ebx, xmm6, 9 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_492 - -LBB0_917: - LONG $0x16f963c4; WORD $0x01e3 // vpextrq rbx, xmm12, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x09 // vpextrb byte [r8 + rbx], xmm14, 9 - LONG $0x1479e3c4; WORD $0x0af3 // vpextrb ebx, xmm6, 10 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_918 - -LBB0_493: - LONG $0x1479e3c4; WORD $0x0bf3 // vpextrb ebx, xmm6, 11 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_494 - -LBB0_919: - LONG $0x397d63c4; WORD $0x01e1 // vextracti128 xmm1, ymm12, 1 - LONG $0x16f9e3c4; WORD $0x01cb 
// vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0b // vpextrb byte [r8 + rbx], xmm14, 11 - LONG $0x1479e3c4; WORD $0x0cf3 // vpextrb ebx, xmm6, 12 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_920 - -LBB0_495: - LONG $0x1479e3c4; WORD $0x0df3 // vpextrb ebx, xmm6, 13 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_496 - -LBB0_921: - LONG $0x16f963c4; WORD $0x01db // vpextrq rbx, xmm11, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0d // vpextrb byte [r8 + rbx], xmm14, 13 - LONG $0x1479e3c4; WORD $0x0ef3 // vpextrb ebx, xmm6, 14 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_922 - -LBB0_497: - LONG $0x1479e3c4; WORD $0x0ff3 // vpextrb ebx, xmm6, 15 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_498 - -LBB0_923: - LONG $0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0f // vpextrb byte [r8 + rbx], xmm14, 15 - LONG $0x2c2444f6; BYTE $0x01 // test byte [rsp + 44], 1 - JNE LBB0_924 - -LBB0_499: - LONG $0x282444f6; BYTE $0x01 // test byte [rsp + 40], 1 - JE LBB0_500 - -LBB0_925: - LONG $0x16f963c4; WORD $0x01d3 // vpextrq rbx, xmm10, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x01 // vpextrb byte [r8 + rbx], xmm1, 1 - LONG $0x242444f6; BYTE $0x01 // test byte [rsp + 36], 1 - JNE LBB0_926 - -LBB0_501: - LONG $0x202444f6; BYTE $0x01 // test byte [rsp + 32], 1 - JE LBB0_502 - -LBB0_927: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x03 // vpextrb byte [r8 + rbx], xmm1, 3 - LONG $0x1c2444f6; BYTE $0x01 // test byte [rsp + 28], 1 - JNE LBB0_928 - -LBB0_503: - LONG $0x182444f6; BYTE $0x01 // test byte [rsp + 24], 1 - JE LBB0_504 - -LBB0_929: - LONG $0x16f963c4; WORD $0x01cb // vpextrq rbx, xmm9, 1 - 
LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x05 // vpextrb byte [r8 + rbx], xmm1, 5 - LONG $0x142444f6; BYTE $0x01 // test byte [rsp + 20], 1 - JNE LBB0_930 - -LBB0_505: - LONG $0x01c1f641 // test r9b, 1 - JE LBB0_506 - -LBB0_931: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x07 // vpextrb byte [r8 + rbx], xmm1, 7 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - QUAD $0x00000128249c8b48 // mov rbx, qword [rsp + 296] - JNE LBB0_932 - -LBB0_507: - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_508 - -LBB0_933: - LONG $0x16f963c4; WORD $0x01c1 // vpextrq rcx, xmm8, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x09 // vpextrb byte [r8 + rcx], xmm1, 9 - LONG $0x01c6f640 // test sil, 1 - QUAD $0x0000013024948b48 // mov rdx, qword [rsp + 304] - JNE LBB0_934 - -LBB0_509: - WORD $0x01a8 // test al, 1 - QUAD $0x0000009824b48b48 // mov rsi, qword [rsp + 152] - JE LBB0_510 - -LBB0_935: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0b // vpextrb byte [r8 + rcx], xmm1, 11 - LONG $0x01c5f641 // test r13b, 1 - JNE LBB0_936 - -LBB0_511: - LONG $0x01c2f641 // test r10b, 1 - QUAD $0x0000011824ac8b4c // mov r13, qword [rsp + 280] - JE LBB0_512 - -LBB0_937: - LONG $0x16f9e3c4; WORD $0x01f9 // vpextrq rcx, xmm7, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0d // vpextrb byte [r8 + rcx], xmm1, 13 - LONG $0x01c3f641 // test r11b, 1 - JNE LBB0_938 - -LBB0_513: - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000012024848b48 // mov rax, qword [rsp + 288] - QUAD 
$0x000000e8248c8b4c // mov r9, qword [rsp + 232] - JE LBB0_515 - -LBB0_514: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0f // vpextrb byte [r8 + rcx], xmm1, 15 - -LBB0_515: - QUAD $0x000260248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 608] - QUAD $0x00020024bceb75c5; BYTE $0x00 // vpor ymm15, ymm1, yword [rsp + 512] - QUAD $0x0001e024acebf5c5; BYTE $0x00 // vpor ymm5, ymm1, yword [rsp + 480] - QUAD $0x0001802494eb75c5; BYTE $0x00 // vpor ymm10, ymm1, yword [rsp + 384] - QUAD $0x000160248ceb75c5; BYTE $0x00 // vpor ymm9, ymm1, yword [rsp + 352] - QUAD $0x0001c024a4eb75c5; BYTE $0x00 // vpor ymm12, ymm1, yword [rsp + 448] - QUAD $0x0001a0249ceb75c5; BYTE $0x00 // vpor ymm11, ymm1, yword [rsp + 416] - QUAD $0x0001402484eb75c5; BYTE $0x00 // vpor ymm8, ymm1, yword [rsp + 320] - LONG $0xf9ebddc5 // vpor ymm7, ymm4, ymm1 - LONG $0x463de3c4; WORD $0x31cf // vperm2i128 ymm1, ymm8, ymm7, 49 - LONG $0x383de3c4; WORD $0x01d7 // vinserti128 ymm2, ymm8, xmm7, 1 - LONG $0xc9c6ecc5; BYTE $0x88 // vshufps ymm1, ymm2, ymm1, 136 - LONG $0x461dc3c4; WORD $0x31d3 // vperm2i128 ymm2, ymm12, ymm11, 49 - LONG $0x381dc3c4; WORD $0x01db // vinserti128 ymm3, ymm12, xmm11, 1 - LONG $0xd2c6e4c5; BYTE $0x88 // vshufps ymm2, ymm3, ymm2, 136 - LONG $0x462dc3c4; WORD $0x31d9 // vperm2i128 ymm3, ymm10, ymm9, 49 - LONG $0x382d43c4; WORD $0x01e9 // vinserti128 ymm13, ymm10, xmm9, 1 - LONG $0xdbc694c5; BYTE $0x88 // vshufps ymm3, ymm13, ymm3, 136 - LONG $0x460563c4; WORD $0x31ed // vperm2i128 ymm13, ymm15, ymm5, 49 - LONG $0x380563c4; WORD $0x01f5 // vinserti128 ymm14, ymm15, xmm5, 1 - LONG $0xc60c41c4; WORD $0x88ed // vshufps ymm13, ymm14, ymm13, 136 - LONG $0x667d41c4; BYTE $0xed // vpcmpgtd ymm13, ymm0, ymm13 - LONG $0xdb66fdc5 // vpcmpgtd ymm3, ymm0, ymm3 - LONG $0xdb6b95c5 // vpackssdw ymm3, ymm13, ymm3 - LONG 
$0xd266fdc5 // vpcmpgtd ymm2, ymm0, ymm2 - LONG $0xc966fdc5 // vpcmpgtd ymm1, ymm0, ymm1 - LONG $0xc96bedc5 // vpackssdw ymm1, ymm2, ymm1 - LONG $0x00fde3c4; WORD $0xd8d3 // vpermq ymm2, ymm3, 216 - LONG $0x00fde3c4; WORD $0xd8c9 // vpermq ymm1, ymm1, 216 - LONG $0xc963edc5 // vpacksswb ymm1, ymm2, ymm1 - LONG $0xf6dbf5c5 // vpand ymm6, ymm1, ymm6 - LONG $0xf17ef9c5 // vmovd ecx, xmm6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_516 - LONG $0x787d62c4; WORD $0x1734 // vpbroadcastb ymm14, byte [rdi + rdx] - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_940 - -LBB0_517: - LONG $0x24548b48; BYTE $0x68 // mov rdx, qword [rsp + 104] - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_519 - -LBB0_518: - LONG $0x2009e3c4; WORD $0x1f0c; BYTE $0x02 // vpinsrb xmm1, xmm14, byte [rdi + rbx], 2 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_519: - LONG $0x24748b48; BYTE $0x60 // mov rsi, qword [rsp + 96] - LONG $0x24548b4c; BYTE $0x48 // mov r10, qword [rsp + 72] - LONG $0x1479e3c4; WORD $0x03f1 // vpextrb ecx, xmm6, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_520 - QUAD $0x00000110248c8b48 // mov rcx, qword [rsp + 272] - LONG $0x2009e3c4; WORD $0x0f0c; BYTE $0x03 // vpinsrb xmm1, xmm14, byte [rdi + rcx], 3 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_942 - -LBB0_521: - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_522 - -LBB0_943: - LONG $0x2009e3c4; WORD $0x170c; BYTE $0x05 // vpinsrb xmm1, xmm14, byte [rdi + rdx], 5 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_944 - -LBB0_523: - LONG $0x1479e3c4; 
WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_524 - -LBB0_945: - LONG $0x2009a3c4; WORD $0x0f0c; BYTE $0x07 // vpinsrb xmm1, xmm14, byte [rdi + r9], 7 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_946 - -LBB0_525: - LONG $0x24548b48; BYTE $0x58 // mov rdx, qword [rsp + 88] - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_527 - -LBB0_526: - LONG $0x2009a3c4; WORD $0x3f0c; BYTE $0x09 // vpinsrb xmm1, xmm14, byte [rdi + r15], 9 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_527: - QUAD $0x0000009024848b48 // mov rax, qword [rsp + 144] - QUAD $0x0000008824b48b48 // mov rsi, qword [rsp + 136] - QUAD $0x00000080249c8b48 // mov rbx, qword [rsp + 128] - LONG $0x244c8b4c; BYTE $0x78 // mov r9, qword [rsp + 120] - LONG $0x1479e3c4; WORD $0x0af1 // vpextrb ecx, xmm6, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_528 - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0a // vpinsrb xmm1, xmm14, byte [rdi + rax], 10 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_948 - -LBB0_529: - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_530 - -LBB0_949: - QUAD $0x000000f824848b48 // mov rax, qword [rsp + 248] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0c // vpinsrb xmm1, xmm14, byte [rdi + rax], 12 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_531 - JMP LBB0_532 - -LBB0_516: - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_517 - -LBB0_940: - LONG $0x2009e3c4; 
WORD $0x370c; BYTE $0x01 // vpinsrb xmm1, xmm14, byte [rdi + rsi], 1 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x24548b48; BYTE $0x68 // mov rdx, qword [rsp + 104] - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_518 - JMP LBB0_519 - -LBB0_520: - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_521 - -LBB0_942: - QUAD $0x00000108248c8b48 // mov rcx, qword [rsp + 264] - LONG $0x2009e3c4; WORD $0x0f0c; BYTE $0x04 // vpinsrb xmm1, xmm14, byte [rdi + rcx], 4 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x05f1 // vpextrb ecx, xmm6, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_943 - -LBB0_522: - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_523 - -LBB0_944: - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x06 // vpinsrb xmm1, xmm14, byte [rdi + rax], 6 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x07f1 // vpextrb ecx, xmm6, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_945 - -LBB0_524: - LONG $0x1479e3c4; WORD $0x08f1 // vpextrb ecx, xmm6, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_525 - -LBB0_946: - LONG $0x2009e3c4; WORD $0x370c; BYTE $0x08 // vpinsrb xmm1, xmm14, byte [rdi + rsi], 8 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x24548b48; BYTE $0x58 // mov rdx, qword [rsp + 88] - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_526 - JMP LBB0_527 - -LBB0_528: - LONG $0x1479e3c4; WORD $0x0bf1 // vpextrb ecx, xmm6, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_529 - -LBB0_948: - QUAD $0x0000010024848b48 // mov rax, qword [rsp + 256] - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0b // vpinsrb xmm1, xmm14, byte [rdi + rax], 11 - LONG $0x020d63c4; WORD 
$0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - LONG $0x1479e3c4; WORD $0x0cf1 // vpextrb ecx, xmm6, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_949 - -LBB0_530: - LONG $0x1479e3c4; WORD $0x0df1 // vpextrb ecx, xmm6, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_532 - -LBB0_531: - LONG $0x2009e3c4; WORD $0x170c; BYTE $0x0d // vpinsrb xmm1, xmm14, byte [rdi + rdx], 13 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_532: - LONG $0x24448b48; BYTE $0x50 // mov rax, qword [rsp + 80] - LONG $0x24548b48; BYTE $0x40 // mov rdx, qword [rsp + 64] - LONG $0x1479e3c4; WORD $0x0ef1 // vpextrb ecx, xmm6, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_534 - LONG $0x2009e3c4; WORD $0x070c; BYTE $0x0e // vpinsrb xmm1, xmm14, byte [rdi + rax], 14 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_534: - LONG $0x1479e3c4; WORD $0x0ff1 // vpextrb ecx, xmm6, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_536 - LONG $0x2009a3c4; WORD $0x170c; BYTE $0x0f // vpinsrb xmm1, xmm14, byte [rdi + r10], 15 - LONG $0x020d63c4; WORD $0x0ff1 // vpblendd ymm14, ymm14, ymm1, 15 - -LBB0_536: - LONG $0x397de3c4; WORD $0x01f1 // vextracti128 xmm1, ymm6, 1 - LONG $0xc87ef9c5 // vmovd eax, xmm1 - LONG $0x2c244489 // mov dword [rsp + 44], eax - WORD $0x01a8 // test al, 1 - JE LBB0_538 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x1714; BYTE $0x00 // vpinsrb xmm2, xmm2, byte [rdi + rdx], 0 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - -LBB0_538: - LONG $0x24448b48; BYTE $0x38 // mov rax, qword [rsp + 56] - LONG $0x1479e3c4; WORD $0x01c9 // vpextrb ecx, xmm1, 1 - LONG $0x28244c89 // mov dword [rsp + 40], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_539 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x3714; BYTE $0x01 // vpinsrb xmm2, xmm2, byte [rdi + rsi], 1 - LONG $0x380d63c4; WORD $0x01f2 // 
vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x02c9 // vpextrb ecx, xmm1, 2 - LONG $0x24244c89 // mov dword [rsp + 36], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_951 - -LBB0_540: - LONG $0x1479e3c4; WORD $0x03c9 // vpextrb ecx, xmm1, 3 - LONG $0x20244c89 // mov dword [rsp + 32], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_541 - -LBB0_952: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069a3c4; WORD $0x0f14; BYTE $0x03 // vpinsrb xmm2, xmm2, byte [rdi + r9], 3 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x04c9 // vpextrb ecx, xmm1, 4 - LONG $0x1c244c89 // mov dword [rsp + 28], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_953 - -LBB0_542: - LONG $0x1479e3c4; WORD $0x05c8 // vpextrb eax, xmm1, 5 - LONG $0x18244489 // mov dword [rsp + 24], eax - WORD $0x01a8 // test al, 1 - JE LBB0_544 - -LBB0_543: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069a3c4; WORD $0x2f14; BYTE $0x05 // vpinsrb xmm2, xmm2, byte [rdi + r13], 5 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - -LBB0_544: - LONG $0x24448b48; BYTE $0x70 // mov rax, qword [rsp + 112] - LONG $0x1479e3c4; WORD $0x06c9 // vpextrb ecx, xmm1, 6 - LONG $0x14244c89 // mov dword [rsp + 20], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_545 - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x06 // vpinsrb xmm2, xmm2, byte [rdi + rax], 6 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x07c9 // vpextrb r9d, xmm1, 7 - LONG $0x01c1f641 // test r9b, 1 - JNE LBB0_955 - -LBB0_546: - LONG $0x1479e3c4; WORD $0x08ca // vpextrb edx, xmm1, 8 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - JE LBB0_547 - -LBB0_956: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000d824848b48 // mov rax, qword [rsp + 216] - LONG $0x2069e3c4; 
WORD $0x0714; BYTE $0x08 // vpinsrb xmm2, xmm2, byte [rdi + rax], 8 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x09c9 // vpextrb ecx, xmm1, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_957 - -LBB0_548: - LONG $0x1479e3c4; WORD $0x0ace // vpextrb esi, xmm1, 10 - LONG $0x01c6f640 // test sil, 1 - JE LBB0_549 - -LBB0_958: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000c824848b48 // mov rax, qword [rsp + 200] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x0a // vpinsrb xmm2, xmm2, byte [rdi + rax], 10 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x0bc8 // vpextrb eax, xmm1, 11 - WORD $0x01a8 // test al, 1 - JNE LBB0_959 - -LBB0_550: - LONG $0x1479c3c4; WORD $0x0ccd // vpextrb r13d, xmm1, 12 - LONG $0x01c5f641 // test r13b, 1 - JE LBB0_551 - -LBB0_960: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000b8249c8b48 // mov rbx, qword [rsp + 184] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0c // vpinsrb xmm2, xmm2, byte [rdi + rbx], 12 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0dca // vpextrb r10d, xmm1, 13 - LONG $0x01c2f641 // test r10b, 1 - JNE LBB0_961 - -LBB0_552: - LONG $0x1479c3c4; WORD $0x0ecb // vpextrb r11d, xmm1, 14 - LONG $0x01c3f641 // test r11b, 1 - JE LBB0_553 - -LBB0_962: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000a8249c8b48 // mov rbx, qword [rsp + 168] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0e // vpinsrb xmm2, xmm2, byte [rdi + rbx], 14 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0fce // vpextrb r14d, xmm1, 15 - LONG $0x01c6f641 // test r14b, 1 - JNE LBB0_554 - JMP LBB0_555 - -LBB0_539: - LONG $0x1479e3c4; WORD $0x02c9 // vpextrb ecx, xmm1, 2 - LONG $0x24244c89 // mov dword [rsp + 36], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_540 
- -LBB0_951: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x02 // vpinsrb xmm2, xmm2, byte [rdi + rbx], 2 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x03c9 // vpextrb ecx, xmm1, 3 - LONG $0x20244c89 // mov dword [rsp + 32], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_952 - -LBB0_541: - LONG $0x1479e3c4; WORD $0x04c9 // vpextrb ecx, xmm1, 4 - LONG $0x1c244c89 // mov dword [rsp + 28], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_542 - -LBB0_953: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x04 // vpinsrb xmm2, xmm2, byte [rdi + rax], 4 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x05c8 // vpextrb eax, xmm1, 5 - LONG $0x18244489 // mov dword [rsp + 24], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_543 - JMP LBB0_544 - -LBB0_545: - LONG $0x1479c3c4; WORD $0x07c9 // vpextrb r9d, xmm1, 7 - LONG $0x01c1f641 // test r9b, 1 - JE LBB0_546 - -LBB0_955: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000f024848b48 // mov rax, qword [rsp + 240] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x07 // vpinsrb xmm2, xmm2, byte [rdi + rax], 7 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x08ca // vpextrb edx, xmm1, 8 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - JNE LBB0_956 - -LBB0_547: - LONG $0x1479e3c4; WORD $0x09c9 // vpextrb ecx, xmm1, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_548 - -LBB0_957: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000d024848b48 // mov rax, qword [rsp + 208] - LONG $0x2069e3c4; WORD $0x0714; BYTE $0x09 // vpinsrb xmm2, xmm2, byte [rdi + rax], 9 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479e3c4; WORD $0x0ace // vpextrb esi, xmm1, 10 - LONG $0x01c6f640 // test sil, 1 
- JNE LBB0_958 - -LBB0_549: - LONG $0x1479e3c4; WORD $0x0bc8 // vpextrb eax, xmm1, 11 - WORD $0x01a8 // test al, 1 - JE LBB0_550 - -LBB0_959: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000c0249c8b48 // mov rbx, qword [rsp + 192] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0b // vpinsrb xmm2, xmm2, byte [rdi + rbx], 11 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0ccd // vpextrb r13d, xmm1, 12 - LONG $0x01c5f641 // test r13b, 1 - JNE LBB0_960 - -LBB0_551: - LONG $0x1479c3c4; WORD $0x0dca // vpextrb r10d, xmm1, 13 - LONG $0x01c2f641 // test r10b, 1 - JE LBB0_552 - -LBB0_961: - LONG $0x397d63c4; WORD $0x01f2 // vextracti128 xmm2, ymm14, 1 - QUAD $0x000000b0249c8b48 // mov rbx, qword [rsp + 176] - LONG $0x2069e3c4; WORD $0x1f14; BYTE $0x0d // vpinsrb xmm2, xmm2, byte [rdi + rbx], 13 - LONG $0x380d63c4; WORD $0x01f2 // vinserti128 ymm14, ymm14, xmm2, 1 - LONG $0x1479c3c4; WORD $0x0ecb // vpextrb r11d, xmm1, 14 - LONG $0x01c3f641 // test r11b, 1 - JNE LBB0_962 - -LBB0_553: - LONG $0x1479c3c4; WORD $0x0fce // vpextrb r14d, xmm1, 15 - LONG $0x01c6f641 // test r14b, 1 - JE LBB0_555 - -LBB0_554: - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - QUAD $0x000000a0249c8b48 // mov rbx, qword [rsp + 160] - LONG $0x2071e3c4; WORD $0x1f0c; BYTE $0x0f // vpinsrb xmm1, xmm1, byte [rdi + rbx], 15 - LONG $0x380d63c4; WORD $0x01f1 // vinserti128 ymm14, ymm14, xmm1, 1 - -LBB0_555: - LONG $0x7175c1c4; WORD $0x06d6 // vpsrlw ymm1, ymm14, 6 - QUAD $0x00000080b5db75c5 // vpand ymm14, ymm1, yword 128[rbp] /* [rip + .LCPI0_4] */ - LONG $0x7e79c1c4; BYTE $0xf7 // vmovd r15d, xmm6 - LONG $0x01c7f641 // test r15b, 1 - JE LBB0_556 - LONG $0x7ef961c4; BYTE $0xfb // vmovq rbx, xmm15 - LONG $0x147943c4; WORD $0x1834; BYTE $0x00 // vpextrb byte [r8 + rbx], xmm14, 0 - LONG $0x1479e3c4; WORD $0x01f3 // vpextrb ebx, xmm6, 1 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_964 - -LBB0_557: - LONG 
$0x1479e3c4; WORD $0x02f3 // vpextrb ebx, xmm6, 2 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - QUAD $0x000000e024bc8b4c // mov r15, qword [rsp + 224] - JE LBB0_558 - -LBB0_965: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x02 // vpextrb byte [r8 + rbx], xmm14, 2 - LONG $0x1479e3c4; WORD $0x03f3 // vpextrb ebx, xmm6, 3 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_966 - -LBB0_559: - LONG $0x1479e3c4; WORD $0x04f3 // vpextrb ebx, xmm6, 4 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_560 - -LBB0_967: - LONG $0x7ef9e1c4; BYTE $0xeb // vmovq rbx, xmm5 - LONG $0x147943c4; WORD $0x1834; BYTE $0x04 // vpextrb byte [r8 + rbx], xmm14, 4 - LONG $0x1479e3c4; WORD $0x05f3 // vpextrb ebx, xmm6, 5 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_968 - -LBB0_561: - LONG $0x1479e3c4; WORD $0x06f3 // vpextrb ebx, xmm6, 6 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_562 - -LBB0_969: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x06 // vpextrb byte [r8 + rbx], xmm14, 6 - LONG $0x1479e3c4; WORD $0x07f3 // vpextrb ebx, xmm6, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_970 - -LBB0_563: - LONG $0x1479e3c4; WORD $0x08f3 // vpextrb ebx, xmm6, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_564 - -LBB0_971: - LONG $0x7ef961c4; BYTE $0xe3 // vmovq rbx, xmm12 - LONG $0x147943c4; WORD $0x1834; BYTE $0x08 // vpextrb byte [r8 + rbx], xmm14, 8 - LONG $0x1479e3c4; WORD $0x09f3 // vpextrb ebx, xmm6, 9 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_972 - -LBB0_565: - LONG $0x1479e3c4; WORD $0x0af3 // vpextrb ebx, xmm6, 10 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_566 - -LBB0_973: - LONG $0x397d63c4; WORD $0x01e1 // vextracti128 xmm1, ymm12, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0a // 
vpextrb byte [r8 + rbx], xmm14, 10 - LONG $0x1479e3c4; WORD $0x0bf3 // vpextrb ebx, xmm6, 11 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_974 - -LBB0_567: - LONG $0x1479e3c4; WORD $0x0cf3 // vpextrb ebx, xmm6, 12 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_568 - -LBB0_975: - LONG $0x7ef961c4; BYTE $0xdb // vmovq rbx, xmm11 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0c // vpextrb byte [r8 + rbx], xmm14, 12 - LONG $0x1479e3c4; WORD $0x0df3 // vpextrb ebx, xmm6, 13 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_976 - -LBB0_569: - LONG $0x1479e3c4; WORD $0x0ef3 // vpextrb ebx, xmm6, 14 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_570 - -LBB0_977: - LONG $0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0e // vpextrb byte [r8 + rbx], xmm14, 14 - LONG $0x1479e3c4; WORD $0x0ff3 // vpextrb ebx, xmm6, 15 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_978 - -LBB0_571: - LONG $0x2c2444f6; BYTE $0x01 // test byte [rsp + 44], 1 - JE LBB0_572 - -LBB0_979: - LONG $0x7ef961c4; BYTE $0xd3 // vmovq rbx, xmm10 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x00 // vpextrb byte [r8 + rbx], xmm1, 0 - LONG $0x282444f6; BYTE $0x01 // test byte [rsp + 40], 1 - JNE LBB0_980 - -LBB0_573: - LONG $0x242444f6; BYTE $0x01 // test byte [rsp + 36], 1 - JE LBB0_574 - -LBB0_981: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x02 // vpextrb byte [r8 + rbx], xmm1, 2 - LONG $0x202444f6; BYTE $0x01 // test byte [rsp + 32], 1 - JNE LBB0_982 - -LBB0_575: - LONG $0x1c2444f6; BYTE $0x01 // test byte [rsp + 28], 1 - JE LBB0_576 - -LBB0_983: - LONG $0x7ef961c4; BYTE $0xcb // vmovq rbx, xmm9 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - 
LONG $0x1479c3c4; WORD $0x180c; BYTE $0x04 // vpextrb byte [r8 + rbx], xmm1, 4 - LONG $0x182444f6; BYTE $0x01 // test byte [rsp + 24], 1 - JNE LBB0_984 - -LBB0_577: - LONG $0x142444f6; BYTE $0x01 // test byte [rsp + 20], 1 - JE LBB0_578 - -LBB0_985: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x06 // vpextrb byte [r8 + rbx], xmm1, 6 - LONG $0x01c1f641 // test r9b, 1 - JNE LBB0_986 - -LBB0_579: - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - QUAD $0x00000128249c8b48 // mov rbx, qword [rsp + 296] - JE LBB0_580 - -LBB0_987: - LONG $0x7ef961c4; BYTE $0xc2 // vmovq rdx, xmm8 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x100c; BYTE $0x08 // vpextrb byte [r8 + rdx], xmm1, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_988 - -LBB0_581: - LONG $0x01c6f640 // test sil, 1 - QUAD $0x0000013024948b48 // mov rdx, qword [rsp + 304] - JE LBB0_582 - -LBB0_989: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0a // vpextrb byte [r8 + rcx], xmm1, 10 - WORD $0x01a8 // test al, 1 - QUAD $0x0000009824b48b48 // mov rsi, qword [rsp + 152] - JNE LBB0_990 - -LBB0_583: - LONG $0x01c5f641 // test r13b, 1 - JE LBB0_584 - -LBB0_991: - LONG $0x7ef9e1c4; BYTE $0xf9 // vmovq rcx, xmm7 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0c // vpextrb byte [r8 + rcx], xmm1, 12 - LONG $0x01c2f641 // test r10b, 1 - QUAD $0x0000011824ac8b4c // mov r13, qword [rsp + 280] - JNE LBB0_992 - -LBB0_585: - LONG $0x01c3f641 // test r11b, 1 - JE LBB0_586 - -LBB0_993: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, 
xmm1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0e // vpextrb byte [r8 + rcx], xmm1, 14 - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000012024848b48 // mov rax, qword [rsp + 288] - QUAD $0x000000e8248c8b4c // mov r9, qword [rsp + 232] - JNE LBB0_587 - JMP LBB0_588 - -LBB0_556: - LONG $0x1479e3c4; WORD $0x01f3 // vpextrb ebx, xmm6, 1 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_557 - -LBB0_964: - LONG $0x16f963c4; WORD $0x01fb // vpextrq rbx, xmm15, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x01 // vpextrb byte [r8 + rbx], xmm14, 1 - LONG $0x1479e3c4; WORD $0x02f3 // vpextrb ebx, xmm6, 2 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - QUAD $0x000000e024bc8b4c // mov r15, qword [rsp + 224] - JNE LBB0_965 - -LBB0_558: - LONG $0x1479e3c4; WORD $0x03f3 // vpextrb ebx, xmm6, 3 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_559 - -LBB0_966: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x03 // vpextrb byte [r8 + rbx], xmm14, 3 - LONG $0x1479e3c4; WORD $0x04f3 // vpextrb ebx, xmm6, 4 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_967 - -LBB0_560: - LONG $0x1479e3c4; WORD $0x05f3 // vpextrb ebx, xmm6, 5 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_561 - -LBB0_968: - LONG $0x16f9e3c4; WORD $0x01eb // vpextrq rbx, xmm5, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x05 // vpextrb byte [r8 + rbx], xmm14, 5 - LONG $0x1479e3c4; WORD $0x06f3 // vpextrb ebx, xmm6, 6 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_969 - -LBB0_562: - LONG $0x1479e3c4; WORD $0x07f3 // vpextrb ebx, xmm6, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_563 - -LBB0_970: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x07 // vpextrb byte [r8 + rbx], xmm14, 7 - LONG $0x1479e3c4; WORD $0x08f3 // 
vpextrb ebx, xmm6, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_971 - -LBB0_564: - LONG $0x1479e3c4; WORD $0x09f3 // vpextrb ebx, xmm6, 9 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_565 - -LBB0_972: - LONG $0x16f963c4; WORD $0x01e3 // vpextrq rbx, xmm12, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x09 // vpextrb byte [r8 + rbx], xmm14, 9 - LONG $0x1479e3c4; WORD $0x0af3 // vpextrb ebx, xmm6, 10 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_973 - -LBB0_566: - LONG $0x1479e3c4; WORD $0x0bf3 // vpextrb ebx, xmm6, 11 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_567 - -LBB0_974: - LONG $0x397d63c4; WORD $0x01e1 // vextracti128 xmm1, ymm12, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0b // vpextrb byte [r8 + rbx], xmm14, 11 - LONG $0x1479e3c4; WORD $0x0cf3 // vpextrb ebx, xmm6, 12 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_975 - -LBB0_568: - LONG $0x1479e3c4; WORD $0x0df3 // vpextrb ebx, xmm6, 13 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_569 - -LBB0_976: - LONG $0x16f963c4; WORD $0x01db // vpextrq rbx, xmm11, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0d // vpextrb byte [r8 + rbx], xmm14, 13 - LONG $0x1479e3c4; WORD $0x0ef3 // vpextrb ebx, xmm6, 14 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_977 - -LBB0_570: - LONG $0x1479e3c4; WORD $0x0ff3 // vpextrb ebx, xmm6, 15 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_571 - -LBB0_978: - LONG $0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x147943c4; WORD $0x1834; BYTE $0x0f // vpextrb byte [r8 + rbx], xmm14, 15 - LONG $0x2c2444f6; BYTE $0x01 // test byte [rsp + 44], 1 - JNE LBB0_979 - -LBB0_572: - LONG $0x282444f6; BYTE $0x01 // test byte [rsp + 40], 1 - JE LBB0_573 - -LBB0_980: - LONG $0x16f963c4; WORD $0x01d3 // vpextrq rbx, xmm10, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE 
$0x01 // vpextrb byte [r8 + rbx], xmm1, 1 - LONG $0x242444f6; BYTE $0x01 // test byte [rsp + 36], 1 - JNE LBB0_981 - -LBB0_574: - LONG $0x202444f6; BYTE $0x01 // test byte [rsp + 32], 1 - JE LBB0_575 - -LBB0_982: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x03 // vpextrb byte [r8 + rbx], xmm1, 3 - LONG $0x1c2444f6; BYTE $0x01 // test byte [rsp + 28], 1 - JNE LBB0_983 - -LBB0_576: - LONG $0x182444f6; BYTE $0x01 // test byte [rsp + 24], 1 - JE LBB0_577 - -LBB0_984: - LONG $0x16f963c4; WORD $0x01cb // vpextrq rbx, xmm9, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x05 // vpextrb byte [r8 + rbx], xmm1, 5 - LONG $0x142444f6; BYTE $0x01 // test byte [rsp + 20], 1 - JNE LBB0_985 - -LBB0_578: - LONG $0x01c1f641 // test r9b, 1 - JE LBB0_579 - -LBB0_986: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x07 // vpextrb byte [r8 + rbx], xmm1, 7 - WORD $0xc2f6; BYTE $0x01 // test dl, 1 - QUAD $0x00000128249c8b48 // mov rbx, qword [rsp + 296] - JNE LBB0_987 - -LBB0_580: - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_581 - -LBB0_988: - LONG $0x16f963c4; WORD $0x01c1 // vpextrq rcx, xmm8, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x09 // vpextrb byte [r8 + rcx], xmm1, 9 - LONG $0x01c6f640 // test sil, 1 - QUAD $0x0000013024948b48 // mov rdx, qword [rsp + 304] - JNE LBB0_989 - -LBB0_582: - WORD $0x01a8 // test al, 1 - QUAD $0x0000009824b48b48 // mov rsi, qword [rsp + 152] - JE LBB0_583 - -LBB0_990: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq 
rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0b // vpextrb byte [r8 + rcx], xmm1, 11 - LONG $0x01c5f641 // test r13b, 1 - JNE LBB0_991 - -LBB0_584: - LONG $0x01c2f641 // test r10b, 1 - QUAD $0x0000011824ac8b4c // mov r13, qword [rsp + 280] - JE LBB0_585 - -LBB0_992: - LONG $0x16f9e3c4; WORD $0x01f9 // vpextrq rcx, xmm7, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0d // vpextrb byte [r8 + rcx], xmm1, 13 - LONG $0x01c3f641 // test r11b, 1 - JNE LBB0_993 - -LBB0_586: - LONG $0x01c6f641 // test r14b, 1 - QUAD $0x0000012024848b48 // mov rax, qword [rsp + 288] - QUAD $0x000000e8248c8b4c // mov r9, qword [rsp + 232] - JE LBB0_588 - -LBB0_587: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397d63c4; WORD $0x01f1 // vextracti128 xmm1, ymm14, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0f // vpextrb byte [r8 + rcx], xmm1, 15 - -LBB0_588: - QUAD $0x000240248c6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword [rsp + 576] - QUAD $0x000200249ceb75c5; BYTE $0x00 // vpor ymm11, ymm1, yword [rsp + 512] - QUAD $0x0001e02494eb75c5; BYTE $0x00 // vpor ymm10, ymm1, yword [rsp + 480] - QUAD $0x0001802484eb75c5; BYTE $0x00 // vpor ymm8, ymm1, yword [rsp + 384] - QUAD $0x00016024bcebf5c5; BYTE $0x00 // vpor ymm7, ymm1, yword [rsp + 352] - QUAD $0x0001c0248ceb75c5; BYTE $0x00 // vpor ymm9, ymm1, yword [rsp + 448] - QUAD $0x0001a024acebf5c5; BYTE $0x00 // vpor ymm5, ymm1, yword [rsp + 416] - QUAD $0x0001402494ebf5c5; BYTE $0x00 // vpor ymm2, ymm1, yword [rsp + 320] - LONG $0xf9eb5dc5 // vpor ymm15, ymm4, ymm1 - LONG $0x466dc3c4; WORD $0x31df // vperm2i128 ymm3, ymm2, ymm15, 49 - LONG $0x386dc3c4; WORD $0x01e7 // vinserti128 ymm4, ymm2, xmm15, 1 - LONG $0xdbc6dcc5; BYTE $0x88 // vshufps ymm3, ymm4, ymm3, 136 - LONG $0x4635e3c4; WORD $0x31e5 // vperm2i128 ymm4, ymm9, ymm5, 49 - LONG 
$0x383563c4; WORD $0x01e5 // vinserti128 ymm12, ymm9, xmm5, 1 - LONG $0xe4c69cc5; BYTE $0x88 // vshufps ymm4, ymm12, ymm4, 136 - LONG $0x463d63c4; WORD $0x31e7 // vperm2i128 ymm12, ymm8, ymm7, 49 - LONG $0x383d63c4; WORD $0x01ef // vinserti128 ymm13, ymm8, xmm7, 1 - LONG $0xc61441c4; WORD $0x88e4 // vshufps ymm12, ymm13, ymm12, 136 - LONG $0x462543c4; WORD $0x31ea // vperm2i128 ymm13, ymm11, ymm10, 49 - LONG $0x382543c4; WORD $0x01f2 // vinserti128 ymm14, ymm11, xmm10, 1 - LONG $0xc60c41c4; WORD $0x88ed // vshufps ymm13, ymm14, ymm13, 136 - LONG $0x667d41c4; BYTE $0xed // vpcmpgtd ymm13, ymm0, ymm13 - LONG $0x667d41c4; BYTE $0xe4 // vpcmpgtd ymm12, ymm0, ymm12 - LONG $0x6b1541c4; BYTE $0xe4 // vpackssdw ymm12, ymm13, ymm12 - LONG $0x00fd43c4; WORD $0xd8e4 // vpermq ymm12, ymm12, 216 - LONG $0xe466fdc5 // vpcmpgtd ymm4, ymm0, ymm4 - LONG $0xdb66fdc5 // vpcmpgtd ymm3, ymm0, ymm3 - LONG $0xdb6bddc5 // vpackssdw ymm3, ymm4, ymm3 - LONG $0x00fde3c4; WORD $0xd8db // vpermq ymm3, ymm3, 216 - LONG $0xdb639dc5 // vpacksswb ymm3, ymm12, ymm3 - LONG $0xdedbe5c5 // vpand ymm3, ymm3, ymm6 - LONG $0xd97ef9c5 // vmovd ecx, xmm3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_589 - LONG $0x787de2c4; WORD $0x1724 // vpbroadcastb ymm4, byte [rdi + rdx] - LONG $0x1479e3c4; WORD $0x01d9 // vpextrb ecx, xmm3, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_995 - -LBB0_590: - LONG $0x24548b48; BYTE $0x68 // mov rdx, qword [rsp + 104] - LONG $0x1479e3c4; WORD $0x02d9 // vpextrb ecx, xmm3, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_592 - -LBB0_591: - LONG $0x2059e3c4; WORD $0x1f34; BYTE $0x02 // vpinsrb xmm6, xmm4, byte [rdi + rbx], 2 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - -LBB0_592: - LONG $0x24748b48; BYTE $0x60 // mov rsi, qword [rsp + 96] - LONG $0x24548b4c; BYTE $0x48 // mov r10, qword [rsp + 72] - LONG $0x1479e3c4; WORD $0x03d9 // vpextrb ecx, xmm3, 3 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_593 - QUAD $0x00000110248c8b48 // 
mov rcx, qword [rsp + 272] - LONG $0x2059e3c4; WORD $0x0f34; BYTE $0x03 // vpinsrb xmm6, xmm4, byte [rdi + rcx], 3 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - LONG $0x1479e3c4; WORD $0x04d9 // vpextrb ecx, xmm3, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_997 - -LBB0_594: - LONG $0x1479e3c4; WORD $0x05d9 // vpextrb ecx, xmm3, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_595 - -LBB0_998: - LONG $0x2059e3c4; WORD $0x1734; BYTE $0x05 // vpinsrb xmm6, xmm4, byte [rdi + rdx], 5 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - LONG $0x1479e3c4; WORD $0x06d9 // vpextrb ecx, xmm3, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_999 - -LBB0_596: - LONG $0x1479e3c4; WORD $0x07d9 // vpextrb ecx, xmm3, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_597 - -LBB0_1000: - LONG $0x2059a3c4; WORD $0x0f34; BYTE $0x07 // vpinsrb xmm6, xmm4, byte [rdi + r9], 7 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - LONG $0x1479e3c4; WORD $0x08d9 // vpextrb ecx, xmm3, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_1001 - -LBB0_598: - LONG $0x24548b48; BYTE $0x58 // mov rdx, qword [rsp + 88] - LONG $0x1479e3c4; WORD $0x09d9 // vpextrb ecx, xmm3, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_600 - -LBB0_599: - LONG $0x2059a3c4; WORD $0x3f34; BYTE $0x09 // vpinsrb xmm6, xmm4, byte [rdi + r15], 9 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - -LBB0_600: - QUAD $0x0000009024848b48 // mov rax, qword [rsp + 144] - QUAD $0x0000008824b48b48 // mov rsi, qword [rsp + 136] - QUAD $0x00000080249c8b48 // mov rbx, qword [rsp + 128] - LONG $0x244c8b4c; BYTE $0x78 // mov r9, qword [rsp + 120] - LONG $0x1479e3c4; WORD $0x0ad9 // vpextrb ecx, xmm3, 10 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_601 - LONG $0x2059e3c4; WORD $0x0734; BYTE $0x0a // vpinsrb xmm6, xmm4, byte [rdi + rax], 10 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - LONG $0x1479e3c4; WORD 
$0x0bd9 // vpextrb ecx, xmm3, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_1003 - -LBB0_602: - LONG $0x1479e3c4; WORD $0x0cd9 // vpextrb ecx, xmm3, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_603 - -LBB0_1004: - QUAD $0x000000f824848b48 // mov rax, qword [rsp + 248] - LONG $0x2059e3c4; WORD $0x0734; BYTE $0x0c // vpinsrb xmm6, xmm4, byte [rdi + rax], 12 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - LONG $0x1479e3c4; WORD $0x0dd9 // vpextrb ecx, xmm3, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_604 - JMP LBB0_605 - -LBB0_589: - LONG $0x1479e3c4; WORD $0x01d9 // vpextrb ecx, xmm3, 1 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_590 - -LBB0_995: - LONG $0x2059e3c4; WORD $0x3734; BYTE $0x01 // vpinsrb xmm6, xmm4, byte [rdi + rsi], 1 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - LONG $0x24548b48; BYTE $0x68 // mov rdx, qword [rsp + 104] - LONG $0x1479e3c4; WORD $0x02d9 // vpextrb ecx, xmm3, 2 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_591 - JMP LBB0_592 - -LBB0_593: - LONG $0x1479e3c4; WORD $0x04d9 // vpextrb ecx, xmm3, 4 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_594 - -LBB0_997: - QUAD $0x00000108248c8b48 // mov rcx, qword [rsp + 264] - LONG $0x2059e3c4; WORD $0x0f34; BYTE $0x04 // vpinsrb xmm6, xmm4, byte [rdi + rcx], 4 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - LONG $0x1479e3c4; WORD $0x05d9 // vpextrb ecx, xmm3, 5 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_998 - -LBB0_595: - LONG $0x1479e3c4; WORD $0x06d9 // vpextrb ecx, xmm3, 6 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_596 - -LBB0_999: - LONG $0x2059e3c4; WORD $0x0734; BYTE $0x06 // vpinsrb xmm6, xmm4, byte [rdi + rax], 6 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - LONG $0x1479e3c4; WORD $0x07d9 // vpextrb ecx, xmm3, 7 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_1000 - -LBB0_597: - LONG $0x1479e3c4; WORD $0x08d9 // vpextrb ecx, xmm3, 8 - WORD 
$0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_598 - -LBB0_1001: - LONG $0x2059e3c4; WORD $0x3734; BYTE $0x08 // vpinsrb xmm6, xmm4, byte [rdi + rsi], 8 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - LONG $0x24548b48; BYTE $0x58 // mov rdx, qword [rsp + 88] - LONG $0x1479e3c4; WORD $0x09d9 // vpextrb ecx, xmm3, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_599 - JMP LBB0_600 - -LBB0_601: - LONG $0x1479e3c4; WORD $0x0bd9 // vpextrb ecx, xmm3, 11 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_602 - -LBB0_1003: - QUAD $0x0000010024848b48 // mov rax, qword [rsp + 256] - LONG $0x2059e3c4; WORD $0x0734; BYTE $0x0b // vpinsrb xmm6, xmm4, byte [rdi + rax], 11 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - LONG $0x1479e3c4; WORD $0x0cd9 // vpextrb ecx, xmm3, 12 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_1004 - -LBB0_603: - LONG $0x1479e3c4; WORD $0x0dd9 // vpextrb ecx, xmm3, 13 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_605 - -LBB0_604: - LONG $0x2059e3c4; WORD $0x1734; BYTE $0x0d // vpinsrb xmm6, xmm4, byte [rdi + rdx], 13 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - -LBB0_605: - LONG $0x24448b48; BYTE $0x50 // mov rax, qword [rsp + 80] - LONG $0x24548b48; BYTE $0x40 // mov rdx, qword [rsp + 64] - LONG $0x1479e3c4; WORD $0x0ed9 // vpextrb ecx, xmm3, 14 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_607 - LONG $0x2059e3c4; WORD $0x0734; BYTE $0x0e // vpinsrb xmm6, xmm4, byte [rdi + rax], 14 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - -LBB0_607: - LONG $0x1479e3c4; WORD $0x0fd9 // vpextrb ecx, xmm3, 15 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_609 - LONG $0x2059a3c4; WORD $0x1734; BYTE $0x0f // vpinsrb xmm6, xmm4, byte [rdi + r10], 15 - LONG $0x025de3c4; WORD $0x0fe6 // vpblendd ymm4, ymm4, ymm6, 15 - -LBB0_609: - LONG $0x397de3c4; WORD $0x01de // vextracti128 xmm6, ymm3, 1 - LONG $0xf07ef9c5 // vmovd eax, xmm6 - LONG $0x00248489; WORD $0x0002; BYTE 
$0x00 // mov dword [rsp + 512], eax - WORD $0x01a8 // test al, 1 - JE LBB0_611 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x2071e3c4; WORD $0x170c; BYTE $0x00 // vpinsrb xmm1, xmm1, byte [rdi + rdx], 0 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - -LBB0_611: - LONG $0x24448b48; BYTE $0x38 // mov rax, qword [rsp + 56] - LONG $0x1479e3c4; WORD $0x01f1 // vpextrb ecx, xmm6, 1 - LONG $0xe0248c89; WORD $0x0001; BYTE $0x00 // mov dword [rsp + 480], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_612 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x2071e3c4; WORD $0x370c; BYTE $0x01 // vpinsrb xmm1, xmm1, byte [rdi + rsi], 1 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - LONG $0xc0248c89; WORD $0x0001; BYTE $0x00 // mov dword [rsp + 448], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_1006 - -LBB0_613: - LONG $0x1479e3c4; WORD $0x03f1 // vpextrb ecx, xmm6, 3 - LONG $0xa0248c89; WORD $0x0001; BYTE $0x00 // mov dword [rsp + 416], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_614 - -LBB0_1007: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x2071a3c4; WORD $0x0f0c; BYTE $0x03 // vpinsrb xmm1, xmm1, byte [rdi + r9], 3 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - LONG $0x80248c89; WORD $0x0001; BYTE $0x00 // mov dword [rsp + 384], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_1008 - -LBB0_615: - LONG $0x1479e3c4; WORD $0x05f0 // vpextrb eax, xmm6, 5 - LONG $0x60248489; WORD $0x0001; BYTE $0x00 // mov dword [rsp + 352], eax - WORD $0x01a8 // test al, 1 - JE LBB0_617 - -LBB0_616: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x2071a3c4; WORD $0x2f0c; BYTE $0x05 // vpinsrb xmm1, xmm1, byte [rdi + r13], 5 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, 
xmm1, 1 - -LBB0_617: - LONG $0x24448b48; BYTE $0x70 // mov rax, qword [rsp + 112] - QUAD $0x000000b8249c8b48 // mov rbx, qword [rsp + 184] - QUAD $0x000000b024948b48 // mov rdx, qword [rsp + 176] - LONG $0x1479e3c4; WORD $0x06f1 // vpextrb ecx, xmm6, 6 - LONG $0x40248c89; WORD $0x0001; BYTE $0x00 // mov dword [rsp + 320], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_618 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x06 // vpinsrb xmm1, xmm1, byte [rdi + rax], 6 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - LONG $0x1479e3c4; WORD $0x07f0 // vpextrb eax, xmm6, 7 - LONG $0x98248489; WORD $0x0000; BYTE $0x00 // mov dword [rsp + 152], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_1010 - -LBB0_619: - LONG $0x1479c3c4; WORD $0x08f1 // vpextrb r9d, xmm6, 8 - LONG $0x01c1f641 // test r9b, 1 - JE LBB0_620 - -LBB0_1011: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - QUAD $0x000000d824848b48 // mov rax, qword [rsp + 216] - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x08 // vpinsrb xmm1, xmm1, byte [rdi + rax], 8 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_1012 - -LBB0_621: - LONG $0x1479c3c4; WORD $0x0af3 // vpextrb r11d, xmm6, 10 - LONG $0x01c3f641 // test r11b, 1 - JE LBB0_622 - -LBB0_1013: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - QUAD $0x000000c824848b48 // mov rax, qword [rsp + 200] - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x0a // vpinsrb xmm1, xmm1, byte [rdi + rax], 10 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - LONG $0x1479e3c4; WORD $0x0bf0 // vpextrb eax, xmm6, 11 - WORD $0x01a8 // test al, 1 - JNE LBB0_1014 - -LBB0_623: - LONG $0x1479e3c4; WORD $0x0cf6 // vpextrb esi, xmm6, 12 - LONG $0x01c6f640 // test sil, 1 - JE LBB0_624 - -LBB0_1015: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 
xmm1, ymm4, 1 - LONG $0x2071e3c4; WORD $0x1f0c; BYTE $0x0c // vpinsrb xmm1, xmm1, byte [rdi + rbx], 12 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - LONG $0x1479c3c4; WORD $0x0df2 // vpextrb r10d, xmm6, 13 - LONG $0x01c2f641 // test r10b, 1 - JNE LBB0_1016 - -LBB0_625: - QUAD $0x000000a824948b48 // mov rdx, qword [rsp + 168] - LONG $0x1479c3c4; WORD $0x0ef5 // vpextrb r13d, xmm6, 14 - LONG $0x01c5f641 // test r13b, 1 - JE LBB0_626 - -LBB0_1017: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x2071e3c4; WORD $0x170c; BYTE $0x0e // vpinsrb xmm1, xmm1, byte [rdi + rdx], 14 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - QUAD $0x000000a024948b48 // mov rdx, qword [rsp + 160] - LONG $0x1479c3c4; WORD $0x0ff6 // vpextrb r14d, xmm6, 15 - LONG $0x01c6f641 // test r14b, 1 - JNE LBB0_627 - JMP LBB0_628 - -LBB0_612: - LONG $0x1479e3c4; WORD $0x02f1 // vpextrb ecx, xmm6, 2 - LONG $0xc0248c89; WORD $0x0001; BYTE $0x00 // mov dword [rsp + 448], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_613 - -LBB0_1006: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x2071e3c4; WORD $0x1f0c; BYTE $0x02 // vpinsrb xmm1, xmm1, byte [rdi + rbx], 2 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - LONG $0x1479e3c4; WORD $0x03f1 // vpextrb ecx, xmm6, 3 - LONG $0xa0248c89; WORD $0x0001; BYTE $0x00 // mov dword [rsp + 416], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_1007 - -LBB0_614: - LONG $0x1479e3c4; WORD $0x04f1 // vpextrb ecx, xmm6, 4 - LONG $0x80248c89; WORD $0x0001; BYTE $0x00 // mov dword [rsp + 384], ecx - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_615 - -LBB0_1008: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x04 // vpinsrb xmm1, xmm1, byte [rdi + rax], 4 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - LONG $0x1479e3c4; WORD $0x05f0 // vpextrb eax, xmm6, 5 - LONG 
$0x60248489; WORD $0x0001; BYTE $0x00 // mov dword [rsp + 352], eax - WORD $0x01a8 // test al, 1 - JNE LBB0_616 - JMP LBB0_617 - -LBB0_618: - LONG $0x1479e3c4; WORD $0x07f0 // vpextrb eax, xmm6, 7 - LONG $0x98248489; WORD $0x0000; BYTE $0x00 // mov dword [rsp + 152], eax - WORD $0x01a8 // test al, 1 - JE LBB0_619 - -LBB0_1010: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - QUAD $0x000000f024848b48 // mov rax, qword [rsp + 240] - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x07 // vpinsrb xmm1, xmm1, byte [rdi + rax], 7 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - LONG $0x1479c3c4; WORD $0x08f1 // vpextrb r9d, xmm6, 8 - LONG $0x01c1f641 // test r9b, 1 - JNE LBB0_1011 - -LBB0_620: - LONG $0x1479e3c4; WORD $0x09f1 // vpextrb ecx, xmm6, 9 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_621 - -LBB0_1012: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - QUAD $0x000000d024848b48 // mov rax, qword [rsp + 208] - LONG $0x2071e3c4; WORD $0x070c; BYTE $0x09 // vpinsrb xmm1, xmm1, byte [rdi + rax], 9 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - LONG $0x1479c3c4; WORD $0x0af3 // vpextrb r11d, xmm6, 10 - LONG $0x01c3f641 // test r11b, 1 - JNE LBB0_1013 - -LBB0_622: - LONG $0x1479e3c4; WORD $0x0bf0 // vpextrb eax, xmm6, 11 - WORD $0x01a8 // test al, 1 - JE LBB0_623 - -LBB0_1014: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - QUAD $0x000000c024b48b48 // mov rsi, qword [rsp + 192] - LONG $0x2071e3c4; WORD $0x370c; BYTE $0x0b // vpinsrb xmm1, xmm1, byte [rdi + rsi], 11 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - LONG $0x1479e3c4; WORD $0x0cf6 // vpextrb esi, xmm6, 12 - LONG $0x01c6f640 // test sil, 1 - JNE LBB0_1015 - -LBB0_624: - LONG $0x1479c3c4; WORD $0x0df2 // vpextrb r10d, xmm6, 13 - LONG $0x01c2f641 // test r10b, 1 - JE LBB0_625 - -LBB0_1016: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x2071e3c4; WORD $0x170c; BYTE $0x0d // 
vpinsrb xmm1, xmm1, byte [rdi + rdx], 13 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - QUAD $0x000000a824948b48 // mov rdx, qword [rsp + 168] - LONG $0x1479c3c4; WORD $0x0ef5 // vpextrb r13d, xmm6, 14 - LONG $0x01c5f641 // test r13b, 1 - JNE LBB0_1017 - -LBB0_626: - QUAD $0x000000a024948b48 // mov rdx, qword [rsp + 160] - LONG $0x1479c3c4; WORD $0x0ff6 // vpextrb r14d, xmm6, 15 - LONG $0x01c6f641 // test r14b, 1 - JE LBB0_628 - -LBB0_627: - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x2071e3c4; WORD $0x170c; BYTE $0x0f // vpinsrb xmm1, xmm1, byte [rdi + rdx], 15 - LONG $0x385de3c4; WORD $0x01e1 // vinserti128 ymm4, ymm4, xmm1, 1 - -LBB0_628: - LONG $0xd471f5c5; BYTE $0x07 // vpsrlw ymm1, ymm4, 7 - QUAD $0x00000080a5dbf5c5 // vpand ymm4, ymm1, yword 128[rbp] /* [rip + .LCPI0_4] */ - LONG $0x7e79c1c4; BYTE $0xdf // vmovd r15d, xmm3 - LONG $0x01c7f641 // test r15b, 1 - JE LBB0_629 - LONG $0x7ef961c4; BYTE $0xdb // vmovq rbx, xmm11 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x00 // vpextrb byte [r8 + rbx], xmm4, 0 - LONG $0x1479e3c4; WORD $0x01db // vpextrb ebx, xmm3, 1 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_1019 - -LBB0_630: - LONG $0x1479e3c4; WORD $0x02db // vpextrb ebx, xmm3, 2 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_631 - -LBB0_1020: - LONG $0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x02 // vpextrb byte [r8 + rbx], xmm4, 2 - LONG $0x1479e3c4; WORD $0x03db // vpextrb ebx, xmm3, 3 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_1021 - -LBB0_632: - LONG $0x1479e3c4; WORD $0x04db // vpextrb ebx, xmm3, 4 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_633 - -LBB0_1022: - LONG $0x7ef961c4; BYTE $0xd3 // vmovq rbx, xmm10 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x04 // vpextrb byte [r8 + rbx], xmm4, 4 - LONG $0x1479e3c4; WORD $0x05db // vpextrb ebx, xmm3, 5 - WORD $0xc3f6; BYTE $0x01 // test 
bl, 1 - JNE LBB0_1023 - -LBB0_634: - LONG $0x1479e3c4; WORD $0x06db // vpextrb ebx, xmm3, 6 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_635 - -LBB0_1024: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x06 // vpextrb byte [r8 + rbx], xmm4, 6 - LONG $0x1479e3c4; WORD $0x07db // vpextrb ebx, xmm3, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_1025 - -LBB0_636: - LONG $0x1479e3c4; WORD $0x08db // vpextrb ebx, xmm3, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_637 - -LBB0_1026: - LONG $0x7ef961c4; BYTE $0xcb // vmovq rbx, xmm9 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x08 // vpextrb byte [r8 + rbx], xmm4, 8 - LONG $0x1479e3c4; WORD $0x09db // vpextrb ebx, xmm3, 9 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_1027 - -LBB0_638: - LONG $0x1479e3c4; WORD $0x0adb // vpextrb ebx, xmm3, 10 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_639 - -LBB0_1028: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x0a // vpextrb byte [r8 + rbx], xmm4, 10 - LONG $0x1479e3c4; WORD $0x0bdb // vpextrb ebx, xmm3, 11 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_1029 - -LBB0_640: - LONG $0x1479e3c4; WORD $0x0cdb // vpextrb ebx, xmm3, 12 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_641 - -LBB0_1030: - LONG $0x7ef9e1c4; BYTE $0xeb // vmovq rbx, xmm5 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x0c // vpextrb byte [r8 + rbx], xmm4, 12 - LONG $0x1479e3c4; WORD $0x0ddb // vpextrb ebx, xmm3, 13 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - QUAD $0x000380248c6f7dc5; BYTE $0x00 // vmovdqa ymm9, yword [rsp + 896] - JNE LBB0_1031 - -LBB0_642: - LONG $0x1479e3c4; WORD $0x0edb // vpextrb ebx, xmm3, 14 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_643 - -LBB0_1032: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x7ef9e1c4; BYTE $0xcb 
// vmovq rbx, xmm1 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x0e // vpextrb byte [r8 + rbx], xmm4, 14 - LONG $0x1479e3c4; WORD $0x0fdb // vpextrb ebx, xmm3, 15 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_1033 - -LBB0_644: - QUAD $0x01000002002484f6 // test byte [rsp + 512], 1 - QUAD $0x000340249c6ffdc5; BYTE $0x00 // vmovdqa ymm3, yword [rsp + 832] - JE LBB0_645 - -LBB0_1034: - LONG $0x7ef961c4; BYTE $0xc3 // vmovq rbx, xmm8 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x00 // vpextrb byte [r8 + rbx], xmm1, 0 - QUAD $0x01000001e02484f6 // test byte [rsp + 480], 1 - JNE LBB0_1035 - -LBB0_646: - QUAD $0x01000001c02484f6 // test byte [rsp + 448], 1 - JE LBB0_647 - -LBB0_1036: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x02 // vpextrb byte [r8 + rbx], xmm1, 2 - QUAD $0x01000001a02484f6 // test byte [rsp + 416], 1 - JNE LBB0_1037 - -LBB0_648: - QUAD $0x01000001802484f6 // test byte [rsp + 384], 1 - JE LBB0_649 - -LBB0_1038: - LONG $0x7ef9e1c4; BYTE $0xfb // vmovq rbx, xmm7 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x04 // vpextrb byte [r8 + rbx], xmm1, 4 - QUAD $0x01000001602484f6 // test byte [rsp + 352], 1 - QUAD $0x00036024846f7dc5; BYTE $0x00 // vmovdqa ymm8, yword [rsp + 864] - JNE LBB0_1039 - -LBB0_650: - QUAD $0x01000001402484f6 // test byte [rsp + 320], 1 - JE LBB0_651 - -LBB0_1040: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x7ef9e1c4; BYTE $0xcb // vmovq rbx, xmm1 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x06 // vpextrb byte [r8 + rbx], xmm1, 6 - QUAD $0x01000000982484f6 // test byte [rsp + 152], 1 - JNE LBB0_1041 - -LBB0_652: - LONG $0x01c1f641 // test r9b, 1 - LONG 
$0x244c8b44; BYTE $0x10 // mov r9d, dword [rsp + 16] - JE LBB0_653 - -LBB0_1042: - LONG $0x7ef9e1c4; BYTE $0xd2 // vmovq rdx, xmm2 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x100c; BYTE $0x08 // vpextrb byte [r8 + rdx], xmm1, 8 - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JNE LBB0_1043 - -LBB0_654: - LONG $0x01c3f641 // test r11b, 1 - QUAD $0x00000130249c8b4c // mov r11, qword [rsp + 304] - JE LBB0_655 - -LBB0_1044: - LONG $0x397de3c4; WORD $0x01d1 // vextracti128 xmm1, ymm2, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0a // vpextrb byte [r8 + rcx], xmm1, 10 - WORD $0x01a8 // test al, 1 - JNE LBB0_1045 - -LBB0_656: - LONG $0x01c6f640 // test sil, 1 - JE LBB0_657 - -LBB0_1046: - LONG $0x7ef961c4; BYTE $0xf9 // vmovq rcx, xmm15 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0c // vpextrb byte [r8 + rcx], xmm1, 12 - LONG $0x01c2f641 // test r10b, 1 - QUAD $0x00032024946ffdc5; BYTE $0x00 // vmovdqa ymm2, yword [rsp + 800] - JNE LBB0_1047 - -LBB0_658: - LONG $0x01c5f641 // test r13b, 1 - LONG $0x24548b4c; BYTE $0x30 // mov r10, qword [rsp + 48] - JE LBB0_659 - -LBB0_1048: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x7ef9e1c4; BYTE $0xc9 // vmovq rcx, xmm1 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0e // vpextrb byte [r8 + rcx], xmm1, 14 - LONG $0x01c6f641 // test r14b, 1 - JE LBB0_25 - JMP LBB0_1049 - -LBB0_629: - LONG $0x1479e3c4; WORD $0x01db // vpextrb ebx, xmm3, 1 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_630 - -LBB0_1019: - LONG $0x16f963c4; WORD $0x01db // vpextrq rbx, xmm11, 1 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x01 // vpextrb byte [r8 + rbx], xmm4, 1 - LONG $0x1479e3c4; WORD $0x02db // vpextrb ebx, xmm3, 2 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 
- JNE LBB0_1020 - -LBB0_631: - LONG $0x1479e3c4; WORD $0x03db // vpextrb ebx, xmm3, 3 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_632 - -LBB0_1021: - LONG $0x397d63c4; WORD $0x01d9 // vextracti128 xmm1, ymm11, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x03 // vpextrb byte [r8 + rbx], xmm4, 3 - LONG $0x1479e3c4; WORD $0x04db // vpextrb ebx, xmm3, 4 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_1022 - -LBB0_633: - LONG $0x1479e3c4; WORD $0x05db // vpextrb ebx, xmm3, 5 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_634 - -LBB0_1023: - LONG $0x16f963c4; WORD $0x01d3 // vpextrq rbx, xmm10, 1 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x05 // vpextrb byte [r8 + rbx], xmm4, 5 - LONG $0x1479e3c4; WORD $0x06db // vpextrb ebx, xmm3, 6 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_1024 - -LBB0_635: - LONG $0x1479e3c4; WORD $0x07db // vpextrb ebx, xmm3, 7 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_636 - -LBB0_1025: - LONG $0x397d63c4; WORD $0x01d1 // vextracti128 xmm1, ymm10, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x07 // vpextrb byte [r8 + rbx], xmm4, 7 - LONG $0x1479e3c4; WORD $0x08db // vpextrb ebx, xmm3, 8 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_1026 - -LBB0_637: - LONG $0x1479e3c4; WORD $0x09db // vpextrb ebx, xmm3, 9 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_638 - -LBB0_1027: - LONG $0x16f963c4; WORD $0x01cb // vpextrq rbx, xmm9, 1 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x09 // vpextrb byte [r8 + rbx], xmm4, 9 - LONG $0x1479e3c4; WORD $0x0adb // vpextrb ebx, xmm3, 10 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_1028 - -LBB0_639: - LONG $0x1479e3c4; WORD $0x0bdb // vpextrb ebx, xmm3, 11 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_640 - -LBB0_1029: - LONG $0x397d63c4; WORD $0x01c9 // vextracti128 xmm1, ymm9, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x1479c3c4; WORD 
$0x1824; BYTE $0x0b // vpextrb byte [r8 + rbx], xmm4, 11 - LONG $0x1479e3c4; WORD $0x0cdb // vpextrb ebx, xmm3, 12 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_1030 - -LBB0_641: - LONG $0x1479e3c4; WORD $0x0ddb // vpextrb ebx, xmm3, 13 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - QUAD $0x000380248c6f7dc5; BYTE $0x00 // vmovdqa ymm9, yword [rsp + 896] - JE LBB0_642 - -LBB0_1031: - LONG $0x16f9e3c4; WORD $0x01eb // vpextrq rbx, xmm5, 1 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x0d // vpextrb byte [r8 + rbx], xmm4, 13 - LONG $0x1479e3c4; WORD $0x0edb // vpextrb ebx, xmm3, 14 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JNE LBB0_1032 - -LBB0_643: - LONG $0x1479e3c4; WORD $0x0fdb // vpextrb ebx, xmm3, 15 - WORD $0xc3f6; BYTE $0x01 // test bl, 1 - JE LBB0_644 - -LBB0_1033: - LONG $0x397de3c4; WORD $0x01e9 // vextracti128 xmm1, ymm5, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x1479c3c4; WORD $0x1824; BYTE $0x0f // vpextrb byte [r8 + rbx], xmm4, 15 - QUAD $0x01000002002484f6 // test byte [rsp + 512], 1 - QUAD $0x000340249c6ffdc5; BYTE $0x00 // vmovdqa ymm3, yword [rsp + 832] - JNE LBB0_1034 - -LBB0_645: - QUAD $0x01000001e02484f6 // test byte [rsp + 480], 1 - JE LBB0_646 - -LBB0_1035: - LONG $0x16f963c4; WORD $0x01c3 // vpextrq rbx, xmm8, 1 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x01 // vpextrb byte [r8 + rbx], xmm1, 1 - QUAD $0x01000001c02484f6 // test byte [rsp + 448], 1 - JNE LBB0_1036 - -LBB0_647: - QUAD $0x01000001a02484f6 // test byte [rsp + 416], 1 - JE LBB0_648 - -LBB0_1037: - LONG $0x397d63c4; WORD $0x01c1 // vextracti128 xmm1, ymm8, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x03 // vpextrb byte [r8 + rbx], xmm1, 3 - QUAD $0x01000001802484f6 // test byte [rsp + 384], 1 - JNE LBB0_1038 - -LBB0_649: - QUAD $0x01000001602484f6 // test byte [rsp + 352], 1 - 
QUAD $0x00036024846f7dc5; BYTE $0x00 // vmovdqa ymm8, yword [rsp + 864] - JE LBB0_650 - -LBB0_1039: - LONG $0x16f9e3c4; WORD $0x01fb // vpextrq rbx, xmm7, 1 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x05 // vpextrb byte [r8 + rbx], xmm1, 5 - QUAD $0x01000001402484f6 // test byte [rsp + 320], 1 - JNE LBB0_1040 - -LBB0_651: - QUAD $0x01000000982484f6 // test byte [rsp + 152], 1 - JE LBB0_652 - -LBB0_1041: - LONG $0x397de3c4; WORD $0x01f9 // vextracti128 xmm1, ymm7, 1 - LONG $0x16f9e3c4; WORD $0x01cb // vpextrq rbx, xmm1, 1 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x180c; BYTE $0x07 // vpextrb byte [r8 + rbx], xmm1, 7 - LONG $0x01c1f641 // test r9b, 1 - LONG $0x244c8b44; BYTE $0x10 // mov r9d, dword [rsp + 16] - JNE LBB0_1042 - -LBB0_653: - WORD $0xc1f6; BYTE $0x01 // test cl, 1 - JE LBB0_654 - -LBB0_1043: - LONG $0x16f9e3c4; WORD $0x01d1 // vpextrq rcx, xmm2, 1 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x09 // vpextrb byte [r8 + rcx], xmm1, 9 - LONG $0x01c3f641 // test r11b, 1 - QUAD $0x00000130249c8b4c // mov r11, qword [rsp + 304] - JNE LBB0_1044 - -LBB0_655: - WORD $0x01a8 // test al, 1 - JE LBB0_656 - -LBB0_1045: - LONG $0x397de3c4; WORD $0x01d1 // vextracti128 xmm1, ymm2, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0b // vpextrb byte [r8 + rcx], xmm1, 11 - LONG $0x01c6f640 // test sil, 1 - JNE LBB0_1046 - -LBB0_657: - LONG $0x01c2f641 // test r10b, 1 - QUAD $0x00032024946ffdc5; BYTE $0x00 // vmovdqa ymm2, yword [rsp + 800] - JE LBB0_658 - -LBB0_1047: - LONG $0x16f963c4; WORD $0x01f9 // vpextrq rcx, xmm15, 1 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0d // vpextrb byte [r8 + rcx], xmm1, 13 - LONG $0x01c5f641 // test 
r13b, 1 - LONG $0x24548b4c; BYTE $0x30 // mov r10, qword [rsp + 48] - JNE LBB0_1048 - -LBB0_659: - LONG $0x01c6f641 // test r14b, 1 - JE LBB0_25 - -LBB0_1049: - LONG $0x397d63c4; WORD $0x01f9 // vextracti128 xmm1, ymm15, 1 - LONG $0x16f9e3c4; WORD $0x01c9 // vpextrq rcx, xmm1, 1 - LONG $0x397de3c4; WORD $0x01e1 // vextracti128 xmm1, ymm4, 1 - LONG $0x1479c3c4; WORD $0x080c; BYTE $0x0f // vpextrb byte [r8 + rcx], xmm1, 15 - JMP LBB0_25 - -LBB0_1050: - WORD $0x394d; BYTE $0xd4 // cmp r12, r10 - JNE LBB0_1055 - -LBB0_1051: - MOVQ 960(SP), SP - VZEROUPPER + WORD $0x3941; BYTE $0xca // cmp r10d, ecx + JGE LBB0_4 + WORD $0x8945; BYTE $0xd1 // mov r9d, r10d + WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi] + WORD $0x0124 // and al, 1 + LONG $0x0a048842 // mov byte [rdx + r9], al + WORD $0x894c; BYTE $0xce // mov rsi, r9 + LONG $0x01ce8348 // or rsi, 1 + WORD $0xce39 // cmp esi, ecx + JGE LBB0_4 + WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi] + WORD $0xe8d0 // shr al, 1 + WORD $0x0124 // and al, 1 + WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al + WORD $0x894c; BYTE $0xce // mov rsi, r9 + LONG $0x02ce8348 // or rsi, 2 + WORD $0xce39 // cmp esi, ecx + JGE LBB0_4 + WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi] + WORD $0xe8c0; BYTE $0x02 // shr al, 2 + WORD $0x0124 // and al, 1 + WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al + WORD $0x894c; BYTE $0xce // mov rsi, r9 + LONG $0x03ce8348 // or rsi, 3 + WORD $0xce39 // cmp esi, ecx + JGE LBB0_4 + WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi] + WORD $0xe8c0; BYTE $0x03 // shr al, 3 + WORD $0x0124 // and al, 1 + WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al + WORD $0x894c; BYTE $0xce // mov rsi, r9 + LONG $0x04ce8348 // or rsi, 4 + WORD $0xce39 // cmp esi, ecx + JGE LBB0_4 + WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi] + WORD $0xe8c0; BYTE $0x04 // shr al, 4 + WORD $0x0124 // and al, 1 + WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al + WORD $0x894c; BYTE $0xce // mov rsi, r9 + LONG 
$0x05ce8348 // or rsi, 5 + WORD $0xce39 // cmp esi, ecx + JGE LBB0_4 + WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi] + WORD $0xe8c0; BYTE $0x05 // shr al, 5 + WORD $0x0124 // and al, 1 + WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al + WORD $0x894c; BYTE $0xce // mov rsi, r9 + LONG $0x06ce8348 // or rsi, 6 + WORD $0xce39 // cmp esi, ecx + JGE LBB0_4 + WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi] + WORD $0xe8c0; BYTE $0x06 // shr al, 6 + WORD $0x0124 // and al, 1 + WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al + LONG $0x07c98349 // or r9, 7 + WORD $0x3941; BYTE $0xc9 // cmp r9d, ecx + JGE LBB0_4 + WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi] + WORD $0xe8c0; BYTE $0x07 // shr al, 7 + LONG $0x0a048842 // mov byte [rdx + r9], al + JMP LBB0_4 + +LBB0_5: RET - -LBB0_1052: - LONG $0x244c8b44; BYTE $0x10 // mov r9d, dword [rsp + 16] - LONG $0x24548b4c; BYTE $0x30 // mov r10, qword [rsp + 48] - JMP LBB0_1055 - -LBB0_1054: - LONG $0x244c8b44; BYTE $0x10 // mov r9d, dword [rsp + 16] - JMP LBB0_1055 From ade330cb627772ddcf1c1e2e02f325621023b325 Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Sat, 22 May 2021 15:15:58 -0400 Subject: [PATCH 02/17] add encoding package --- go/parquet/doc.go | 4 +- go/parquet/go.mod | 6 +- go/parquet/go.sum | 18 +- .../internal/encoding/boolean_decoder.go | 98 ++ .../internal/encoding/boolean_encoder.go | 112 ++ .../internal/encoding/byte_array_decoder.go | 84 + .../internal/encoding/byte_array_encoder.go | 111 ++ go/parquet/internal/encoding/decoder.go | 178 ++ .../internal/encoding/delta_bit_packing.go | 514 ++++++ .../internal/encoding/delta_byte_array.go | 203 +++ .../encoding/delta_length_byte_array.go | 142 ++ go/parquet/internal/encoding/encoder.go | 300 ++++ .../encoding/encoding_benchmarks_test.go | 461 ++++++ go/parquet/internal/encoding/encoding_test.go | 684 ++++++++ .../encoding/fixed_len_byte_array_decoder.go | 66 + .../encoding/fixed_len_byte_array_encoder.go | 92 ++ 
go/parquet/internal/encoding/levels.go | 284 ++++ go/parquet/internal/encoding/levels_test.go | 288 ++++ go/parquet/internal/encoding/memo_table.go | 380 +++++ .../internal/encoding/memo_table_test.go | 284 ++++ .../internal/encoding/memo_table_types.gen.go | 366 ++++ .../encoding/memo_table_types.gen.go.tmpl | 115 ++ .../internal/encoding/physical_types.tmpldata | 52 + .../encoding/plain_encoder_types.gen.go | 553 +++++++ .../encoding/plain_encoder_types.gen.go.tmpl | 132 ++ .../internal/encoding/typed_encoder.gen.go | 1467 +++++++++++++++++ .../encoding/typed_encoder.gen.go.tmpl | 341 ++++ go/parquet/internal/encoding/types.go | 437 +++++ go/parquet/internal/hashing/hashing_test.go | 114 ++ go/parquet/internal/hashing/types.tmpldata | 18 + .../internal/hashing/xxh3_memo_table.gen.go | 1009 ++++++++++++ .../hashing/xxh3_memo_table.gen.go.tmpl | 290 ++++ .../internal/hashing/xxh3_memo_table.go | 386 +++++ go/parquet/internal/testutils/utils.go | 42 + 34 files changed, 9622 insertions(+), 9 deletions(-) create mode 100644 go/parquet/internal/encoding/boolean_decoder.go create mode 100644 go/parquet/internal/encoding/boolean_encoder.go create mode 100644 go/parquet/internal/encoding/byte_array_decoder.go create mode 100644 go/parquet/internal/encoding/byte_array_encoder.go create mode 100644 go/parquet/internal/encoding/decoder.go create mode 100644 go/parquet/internal/encoding/delta_bit_packing.go create mode 100644 go/parquet/internal/encoding/delta_byte_array.go create mode 100644 go/parquet/internal/encoding/delta_length_byte_array.go create mode 100644 go/parquet/internal/encoding/encoder.go create mode 100644 go/parquet/internal/encoding/encoding_benchmarks_test.go create mode 100644 go/parquet/internal/encoding/encoding_test.go create mode 100644 go/parquet/internal/encoding/fixed_len_byte_array_decoder.go create mode 100644 go/parquet/internal/encoding/fixed_len_byte_array_encoder.go create mode 100644 go/parquet/internal/encoding/levels.go create mode 100644 
go/parquet/internal/encoding/levels_test.go create mode 100644 go/parquet/internal/encoding/memo_table.go create mode 100644 go/parquet/internal/encoding/memo_table_test.go create mode 100644 go/parquet/internal/encoding/memo_table_types.gen.go create mode 100644 go/parquet/internal/encoding/memo_table_types.gen.go.tmpl create mode 100644 go/parquet/internal/encoding/physical_types.tmpldata create mode 100644 go/parquet/internal/encoding/plain_encoder_types.gen.go create mode 100644 go/parquet/internal/encoding/plain_encoder_types.gen.go.tmpl create mode 100644 go/parquet/internal/encoding/typed_encoder.gen.go create mode 100644 go/parquet/internal/encoding/typed_encoder.gen.go.tmpl create mode 100644 go/parquet/internal/encoding/types.go create mode 100644 go/parquet/internal/hashing/hashing_test.go create mode 100644 go/parquet/internal/hashing/types.tmpldata create mode 100644 go/parquet/internal/hashing/xxh3_memo_table.gen.go create mode 100644 go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl create mode 100644 go/parquet/internal/hashing/xxh3_memo_table.go create mode 100644 go/parquet/internal/testutils/utils.go diff --git a/go/parquet/doc.go b/go/parquet/doc.go index cf87b81826e..87a592836a9 100644 --- a/go/parquet/doc.go +++ b/go/parquet/doc.go @@ -29,8 +29,8 @@ // go get -u github.com/apache/arrow/go/parquet // // In addition, two cli utilities are provided: -// go install github.factset.com/mtopol/parquet-go/cmd/parquet_reader -// go install github.factset.com/mtopol/parquet-go/cmd/parquet_schema +// go install github.com/apache/arrow/go/parquet/cmd/parquet_reader +// go install github.com/apache/arrow/go/parquet/cmd/parquet_schema // // Modules // diff --git a/go/parquet/go.mod b/go/parquet/go.mod index 9c415931191..cf2be66aba0 100644 --- a/go/parquet/go.mod +++ b/go/parquet/go.mod @@ -19,15 +19,17 @@ module github.com/apache/arrow/go/parquet go 1.15 require ( + github.com/JohnCGriffin/overflow v0.0.0-20170615021017-4d914c927216 
github.com/andybalholm/brotli v1.0.1 - github.com/apache/arrow/go/arrow v0.0.0-20210310173904-5de02e3697aa + github.com/apache/arrow/go/arrow v0.0.0-20210520144409-d07f30ada677 github.com/apache/thrift/lib/go/thrift v0.0.0-20210120171102-e27e82c46ba4 github.com/golang/snappy v0.0.3 github.com/klauspost/asmfmt v1.2.3 - github.com/klauspost/compress v1.11.12 + github.com/klauspost/compress v1.12.2 github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 github.com/stretchr/testify v1.7.0 + github.com/zeebo/xxh3 v0.10.0 golang.org/x/exp v0.0.0-20210220032938-85be41e4509f golang.org/x/sys v0.0.0-20210309074719-68d13333faf2 golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 diff --git a/go/parquet/go.sum b/go/parquet/go.sum index be02835cc89..bebc1ff48a1 100644 --- a/go/parquet/go.sum +++ b/go/parquet/go.sum @@ -2,11 +2,13 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT dmitri.shuralyov.com/gpu/mtl v0.0.0-20201218220906-28db891af037/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/JohnCGriffin/overflow v0.0.0-20170615021017-4d914c927216 h1:2ZboyJ8vl75fGesnG9NpMTD2DyQI3FzMXy4x752rGF0= +github.com/JohnCGriffin/overflow v0.0.0-20170615021017-4d914c927216/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk= github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= github.com/andybalholm/brotli v1.0.1 h1:KqhlKozYbRtJvsPrrEeXcO+N2l6NYT5A2QAFmSULpEc= github.com/andybalholm/brotli v1.0.1/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu3qAvBg8x/Y= -github.com/apache/arrow/go/arrow v0.0.0-20210310173904-5de02e3697aa h1:0Bhiab9ep1wmbD1Lm17uqPkzgYhcBIZf1CsvrMhFMGI= -github.com/apache/arrow/go/arrow 
v0.0.0-20210310173904-5de02e3697aa/go.mod h1:c9sxoIT3YgLxH4UhLOCKaBlEojuMhVYpk4Ntv3opUTQ= +github.com/apache/arrow/go/arrow v0.0.0-20210520144409-d07f30ada677 h1:F7HiqIf4aBsF4YUBcLolXZ8duSEideNnZnr3lBGa2sA= +github.com/apache/arrow/go/arrow v0.0.0-20210520144409-d07f30ada677/go.mod h1:R4hW3Ug0s+n4CUsWHKOj00Pu01ZqU4x/hSF5kXUcXKQ= github.com/apache/thrift/lib/go/thrift v0.0.0-20210120171102-e27e82c46ba4 h1:orNYqmQGnSjgOauLWjHEp9/qIDT98xv/0Aa4Zet3/Y8= github.com/apache/thrift/lib/go/thrift v0.0.0-20210120171102-e27e82c46ba4/go.mod h1:V/LzksIyqd3KZuQ2SunvReTG/UkArhII1dAWY5U1sCE= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= @@ -42,15 +44,19 @@ github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= github.com/klauspost/asmfmt v1.2.3 h1:qEM7SLDo6DXXXz5yTpqUoxhsrtwH30nNR2riO2ZjznY= github.com/klauspost/asmfmt v1.2.3/go.mod h1:RAoUvqkWr2rUa2I19qKMEVZQe4BVtcHGTMCUOcCU2Lg= -github.com/klauspost/compress v1.11.12 h1:famVnQVu7QwryBN4jNseQdUKES71ZAOnB6UQQJPZvqk= -github.com/klauspost/compress v1.11.12/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= +github.com/klauspost/compress v1.11.13/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= +github.com/klauspost/compress v1.12.2 h1:2KCfW3I9M7nSc5wOqXAlW2v2U6v+w6cbjvbfp+OykW8= +github.com/klauspost/compress v1.12.2/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 
h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE= +github.com/pierrec/lz4/v4 v4.1.4 h1:PjkB+qEooc9nw4F6Pxe/e0xaRdWz3suItXWxWqAO1QE= +github.com/pierrec/lz4/v4 v4.1.4/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -58,6 +64,8 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/testify v1.2.0/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/zeebo/xxh3 v0.10.0 h1:1+2Mov9zfxTNUeoDG9k9i13VfxTR0p1JQu8L0vikxB0= +github.com/zeebo/xxh3 v0.10.0/go.mod h1:AQY73TOrhF3jNsdiM9zZOb8MThrYbZONHj7ryDBaLpg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -97,6 +105,7 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200727154430-2d971f7391a4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200909081042-eff7692f9009/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210309074719-68d13333faf2 h1:46ULzRKLh1CwgRq2dC5SlBzEqqNCi8rreOZnNrbqcIY= golang.org/x/sys v0.0.0-20210309074719-68d13333faf2/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -135,7 +144,6 @@ google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyac google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.32.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= -google.golang.org/grpc/cmd/protoc-gen-go-grpc v0.0.0-20200910201057-6591123024b3/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= diff --git a/go/parquet/internal/encoding/boolean_decoder.go b/go/parquet/internal/encoding/boolean_decoder.go new file mode 100644 index 00000000000..48c320fc345 --- /dev/null +++ b/go/parquet/internal/encoding/boolean_decoder.go @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "github.com/apache/arrow/go/arrow/bitutil" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" +) + +// PlainBooleanDecoder is for the Plain Encoding type, there is no +// dictionary decoding for bools. +type PlainBooleanDecoder struct { + decoder + + bitOffset int +} + +// Type for the PlainBooleanDecoder is parquet.Types.Boolean +func (PlainBooleanDecoder) Type() parquet.Type { + return parquet.Types.Boolean +} + +// Decode fills out with bools decoded from the data at the current point +// or until we reach the end of the data. +// +// Returns the number of values decoded +func (dec *PlainBooleanDecoder) Decode(out []bool) (int, error) { + max := utils.MinInt(len(out), dec.nvals) + + // if we aren't at a byte boundary, then get bools until we hit + // a byte boundary with the bit offset. + i := 0 + for dec.bitOffset != 0 && dec.bitOffset < 8 && i < max { + out[i] = (dec.data[0] & byte(1< 0 { // only go in here if there's at least one full byte to decode + if i > 0 { // skip our data forward if we decoded anything above + dec.data = dec.data[1:] + out = out[i:] + } + // determine the number of aligned bytes we can grab using SIMD optimized + // functions to improve performance. 
+ alignedBytes := bitutil.BytesForBits(int64(batch)) + utils.BytesToBools(dec.data[:alignedBytes], out) + dec.data = dec.data[alignedBytes:] + out = out[alignedBytes*8:] + } + + // grab any trailing bits now that we've got our aligned bytes. + for ; dec.bitOffset < (bitsRemain - batch); dec.bitOffset++ { + out[dec.bitOffset] = (dec.data[0] & byte(1< 0 { + toRead := len(out) - nullCount + valuesRead, err := dec.Decode(out[:toRead]) + if err != nil { + return 0, err + } + if valuesRead != toRead { + panic("parquet: number of values / definition levels read did not match") + } + return spacedExpand(out, nullCount, validBits, validBitsOffset), nil + } + return dec.Decode(out) +} diff --git a/go/parquet/internal/encoding/boolean_encoder.go b/go/parquet/internal/encoding/boolean_encoder.go new file mode 100644 index 00000000000..fe31f118dc1 --- /dev/null +++ b/go/parquet/internal/encoding/boolean_encoder.go @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package encoding + +import ( + "github.com/apache/arrow/go/arrow/bitutil" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" +) + +const boolBufSize = 1024 + +// PlainBooleanEncoder encodes bools as a bitmap as per the Plain Encoding +type PlainBooleanEncoder struct { + encoder + nbits int + bitsBuffer []byte +} + +// Type for the PlainBooleanEncoder is parquet.Types.Boolean +func (PlainBooleanEncoder) Type() parquet.Type { + return parquet.Types.Boolean +} + +// Put encodes the contents of in into the underlying data buffer. +func (enc *PlainBooleanEncoder) Put(in []bool) { + if enc.bitsBuffer == nil { + enc.bitsBuffer = make([]byte, boolBufSize) + } + + bitOffset := 0 + // first check if we are in the middle of a byte due to previous + // encoding of data and finish out that byte's bits. + if enc.nbits > 0 { + bitsToWrite := utils.MinInt(enc.nbits, len(in)) + beg := (boolBufSize * 8) - enc.nbits + for i, val := range in[:bitsToWrite] { + bitmask := uint8(1 << uint((beg+i)%8)) + if val { + enc.bitsBuffer[(beg+i)/8] |= bitmask + } else { + enc.bitsBuffer[(beg+i)/8] &= bitmask ^ 0xFF + } + } + enc.nbits -= bitsToWrite + bitOffset = bitsToWrite + if enc.nbits == 0 { + enc.append(enc.bitsBuffer) + } + } + + // now that we're aligned, write the rest of our bits + bitsRemain := len(in) - bitOffset + for bitOffset < len(in) { + enc.nbits = boolBufSize * 8 + bitsToWrite := utils.MinInt(bitsRemain, enc.nbits) + for i, val := range in[bitOffset : bitOffset+bitsToWrite] { + bitmask := uint8(1 << uint(i%8)) + if val { + enc.bitsBuffer[i/8] |= bitmask + } else { + enc.bitsBuffer[i/8] &= bitmask ^ 0xFF + } + } + bitOffset += bitsToWrite + enc.nbits -= bitsToWrite + bitsRemain -= bitsToWrite + if enc.nbits == 0 { + enc.append(enc.bitsBuffer) + } + } +} + +// PutSpaced will use the validBits bitmap to determine which values are nulls +// and can be left out from the slice, and the encoded without those nulls. 
+func (enc *PlainBooleanEncoder) PutSpaced(in []bool, validBits []byte, validBitsOffset int64) { + bufferOut := make([]bool, len(in)) + nvalid := spacedCompress(in, bufferOut, validBits, validBitsOffset) + enc.Put(bufferOut[:nvalid]) +} + +// EstimatedDataEncodedSize returns the current number of bytes that have +// been buffered so far +func (enc *PlainBooleanEncoder) EstimatedDataEncodedSize() int64 { + return int64(enc.sink.Len() + (boolBufSize * 8) - enc.nbits) +} + +// FlushValues returns the buffered data, the responsibility is on the caller +// to release the buffer memory +func (enc *PlainBooleanEncoder) FlushValues() Buffer { + if enc.nbits > 0 { + toFlush := (boolBufSize * 8) - enc.nbits + enc.append(enc.bitsBuffer[:bitutil.BytesForBits(int64(toFlush))]) + enc.nbits = boolBufSize * 8 + } + + return enc.sink.Finish() +} diff --git a/go/parquet/internal/encoding/byte_array_decoder.go b/go/parquet/internal/encoding/byte_array_decoder.go new file mode 100644 index 00000000000..09eacf97551 --- /dev/null +++ b/go/parquet/internal/encoding/byte_array_decoder.go @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package encoding + +import ( + "encoding/binary" + + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" + "golang.org/x/xerrors" +) + +// PlainByteArrayDecoder decodes a data chunk for bytearrays according to +// the plain encoding. The byte arrays will use slices to reference the +// data rather than copying it. +type PlainByteArrayDecoder struct { + decoder +} + +// Type returns parquet.Types.ByteArray for this decoder +func (PlainByteArrayDecoder) Type() parquet.Type { + return parquet.Types.ByteArray +} + +// Decode will populate the slice of bytearrays in full or until the number +// of values is emptied. +// +// Returns the number of values that were decoded. +func (pbad *PlainByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { + max := utils.MinInt(len(out), pbad.nvals) + + for i := 0; i < max; i++ { + // there should always be at least four bytes which is the length of the + // next value in the data. + if len(pbad.data) < 4 { + return i, xerrors.New("parquet: eof reading bytearray") + } + + // the first 4 bytes are a little endian uint32 length + nbytes := int32(binary.LittleEndian.Uint32(pbad.data[:4])) + if nbytes < 0 { + return i, xerrors.New("parquet: invalid BYTE_ARRAY value") + } + + if int64(len(pbad.data)) < int64(nbytes)+4 { + return i, xerrors.New("parquet: eof reading bytearray") + } + + out[i] = pbad.data[4 : nbytes+4] + pbad.data = pbad.data[nbytes+4:] + } + + pbad.nvals -= max + return max, nil +} + +// DecodeSpaced is like Decode, but expands the slice out to leave empty values +// where the validBits bitmap has 0s +func (pbad *PlainByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + toRead := len(out) - nullCount + valuesRead, err := pbad.Decode(out[:toRead]) + if err != nil { + return valuesRead, err + } + if valuesRead != toRead { + return valuesRead, xerrors.New("parquet: number of values / definition 
levels read did not match") + } + + return spacedExpand(out, nullCount, validBits, validBitsOffset), nil +} diff --git a/go/parquet/internal/encoding/byte_array_encoder.go b/go/parquet/internal/encoding/byte_array_encoder.go new file mode 100644 index 00000000000..1fe1bfcccd5 --- /dev/null +++ b/go/parquet/internal/encoding/byte_array_encoder.go @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "encoding/binary" + "unsafe" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" +) + +// PlainByteArrayEncoder encodes byte arrays according to the spec for Plain encoding +// by encoding the length as a uint32 followed by the bytes of the value. 
+type PlainByteArrayEncoder struct { + encoder +} + +// PutByteArray writes out the 4 bytes for the length followed by the data +func (enc *PlainByteArrayEncoder) PutByteArray(val parquet.ByteArray) { + inc := val.Len() + arrow.Uint32SizeBytes + enc.sink.Reserve(inc) + vlen := uint32(val.Len()) + enc.sink.UnsafeWrite((*(*[4]byte)(unsafe.Pointer(&vlen)))[:]) + enc.sink.UnsafeWrite(val) +} + +// Put writes out all of the values in this slice to the buffer +func (enc *PlainByteArrayEncoder) Put(in []parquet.ByteArray) { + for _, val := range in { + enc.PutByteArray(val) + } +} + +// PutSpaced uses the bitmap of validBits to leave out anything that is null according +// to the bitmap. +// +// If validBits is nil, this is equivalent to calling Put +func (enc *PlainByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { + if validBits != nil { + data := make([]parquet.ByteArray, len(in)) + nvalid := spacedCompress(in, data, validBits, validBitsOffset) + enc.Put(data[:nvalid]) + } else { + enc.Put(in) + } +} + +// Type returns parquet.Types.ByteArray for the bytearray encoder +func (PlainByteArrayEncoder) Type() parquet.Type { + return parquet.Types.ByteArray +} + +// WriteDict writes the dictionary out to the provided slice, out should be +// at least DictEncodedSize() bytes +func (enc *DictByteArrayEncoder) WriteDict(out []byte) { + enc.memo.(BinaryMemoTable).VisitValues(0, func(v []byte) { + binary.LittleEndian.PutUint32(out, uint32(len(v))) + out = out[arrow.Uint32SizeBytes:] + copy(out, v) + out = out[len(v):] + }) +} + +// PutByteArray adds a single byte array to buffer, updating the dictionary +// and encoded size if it's a new value +func (enc *DictByteArrayEncoder) PutByteArray(in parquet.ByteArray) { + if in == nil { + in = empty[:] + } + memoIdx, found, err := enc.memo.GetOrInsert(in) + if err != nil { + panic(err) + } + if !found { + enc.dictEncodedSize += in.Len() + arrow.Uint32SizeBytes + } + enc.addIndex(memoIdx) +} + 
+// Put takes a slice of ByteArrays to add and encode. +func (enc *DictByteArrayEncoder) Put(in []parquet.ByteArray) { + for _, val := range in { + enc.PutByteArray(val) + } +} + +// PutSpaced like with the non-dict encoder leaves out the values where the validBits bitmap is 0 +func (enc *DictByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { + utils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error { + for i := int64(0); i < length; i++ { + enc.PutByteArray(in[i+pos]) + } + return nil + }) +} diff --git a/go/parquet/internal/encoding/decoder.go b/go/parquet/internal/encoding/decoder.go new file mode 100644 index 00000000000..abfa3867e0b --- /dev/null +++ b/go/parquet/internal/encoding/decoder.go @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package encoding + +import ( + "bytes" + "reflect" + + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" + "github.com/apache/arrow/go/parquet/schema" +) + +// DecoderTraits provides an interface for more easily interacting with types +// to generate decoders for specific types. +type DecoderTraits interface { + Decoder(parquet.Encoding, *schema.Column, bool, memory.Allocator) TypedDecoder + BytesRequired(int) int +} + +// NewDecoder constructs a decoder for a given type and encoding +func NewDecoder(t parquet.Type, e parquet.Encoding, descr *schema.Column, mem memory.Allocator) TypedDecoder { + traits := getDecodingTraits(t) + if traits == nil { + return nil + } + + return traits.Decoder(e, descr, false, mem) +} + +// NewDictDecoder is like NewDecoder but for dictionary encodings, panics if type is bool. +// +// if mem is nil, memory.DefaultAllocator will be used +func NewDictDecoder(t parquet.Type, descr *schema.Column, mem memory.Allocator) DictDecoder { + traits := getDecodingTraits(t) + if traits == nil { + return nil + } + + if mem == nil { + mem = memory.DefaultAllocator + } + + return traits.Decoder(parquet.Encodings.RLEDict, descr, true, mem).(DictDecoder) +} + +type decoder struct { + descr *schema.Column + encoding format.Encoding + nvals int + data []byte + typeLen int +} + +// newDecoderBase constructs the base decoding object that is embedded in the +// type specific decoders. +func newDecoderBase(e format.Encoding, descr *schema.Column) decoder { + typeLen := -1 + if descr != nil && descr.PhysicalType() == parquet.Types.FixedLenByteArray { + typeLen = int(descr.TypeLength()) + } + + return decoder{ + descr: descr, + encoding: e, + typeLen: typeLen, + } +} + +// SetData sets the data for decoding into the decoder to update the available +// data bytes and number of values available. 
+func (d *decoder) SetData(nvals int, data []byte) { + d.data = data + d.nvals = nvals +} + +// ValuesLeft returns the number of remaining values that can be decoded +func (d *decoder) ValuesLeft() int { return d.nvals } + +// Encoding returns the encoding type used by this decoder to decode the bytes. +func (d *decoder) Encoding() parquet.Encoding { return parquet.Encoding(d.encoding) } + +type dictDecoder struct { + decoder + mem memory.Allocator + dictValueDecoder utils.DictionaryConverter + idxDecoder *utils.RleDecoder +} + +// SetDict sets a decoder that can be used to decode the dictionary that is +// used for this column in order to return the proper values. +func (d *dictDecoder) SetDict(dict TypedDecoder) { + if dict.Type() != d.descr.PhysicalType() { + panic("parquet: mismatch dictionary and column data type") + } + + d.dictValueDecoder = NewDictConverter(dict) +} + +// SetData sets the index value data into the decoder. +func (d *dictDecoder) SetData(nvals int, data []byte) { + d.nvals = nvals + if len(data) == 0 { + d.idxDecoder = utils.NewRleDecoder(bytes.NewReader(data), 1) + return + } + + width := uint8(data[0]) + if width >= 64 { + panic("parquet: invalid or corrupted bit width") + } + + d.idxDecoder = utils.NewRleDecoder(bytes.NewReader(data[1:]), int(width)) +} + +func (d *dictDecoder) decode(out interface{}) (int, error) { + return d.idxDecoder.GetBatchWithDict(d.dictValueDecoder, out) +} + +func (d *dictDecoder) decodeSpaced(out interface{}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + return d.idxDecoder.GetBatchWithDictSpaced(d.dictValueDecoder, out, nullCount, validBits, validBitsOffset) +} + +var empty = [1]byte{0} + +// spacedExpand is used to take a slice of data and utilize the bitmap provided to fill in nulls into the +// correct slots according to the bitmap in order to produce a fully expanded result slice with nulls +// in the correct slots. 
+func spacedExpand(buffer interface{}, nullCount int, validBits []byte, validBitsOffset int64) int { + bufferRef := reflect.ValueOf(buffer) + if bufferRef.Kind() != reflect.Slice { + panic("invalid spacedexpand type, not slice") + } + + var ( + numValues int = bufferRef.Len() + ) + + idxDecode := int32(numValues - nullCount) + if idxDecode == 0 { // if there's nothing to decode there's nothing to do. + return numValues + } + + // read the bitmap in reverse grabbing runs of valid bits where possible. + rdr := utils.NewReverseSetBitRunReader(validBits, validBitsOffset, int64(numValues)) + for { + run := rdr.NextRun() + if run.Length == 0 { + break + } + + // copy data from the end of the slice to it's proper location in the slice after accounting for the nulls + // because we technically don't care what is in the null slots we don't actually have to clean + // up after ourselves because we're doing this in reverse to guarantee that we'll always simply + // overwrite any existing data with the correctly spaced data. Any data that happens to be left in the null + // slots is fine since it shouldn't matter and saves us work. + idxDecode -= int32(run.Length) + reflect.Copy(bufferRef.Slice(int(run.Pos), bufferRef.Len()), bufferRef.Slice(int(idxDecode), int(int64(idxDecode)+run.Length))) + } + + return numValues +} diff --git a/go/parquet/internal/encoding/delta_bit_packing.go b/go/parquet/internal/encoding/delta_bit_packing.go new file mode 100644 index 00000000000..986d862f592 --- /dev/null +++ b/go/parquet/internal/encoding/delta_bit_packing.go @@ -0,0 +1,514 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "bytes" + "math" + "math/bits" + "reflect" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" + "golang.org/x/xerrors" +) + +// see the deltaBitPack encoder for a description of the encoding format that is +// used for delta-bitpacking. +type deltaBitPackDecoder struct { + decoder + + mem memory.Allocator + + usedFirst bool + bitdecoder *utils.BitReader + blockSize uint64 + currentBlockVals uint64 + miniBlocks uint64 + valsPerMini uint64 + currentMiniBlockVals uint64 + minDelta int64 + miniBlockIdx uint64 + + deltaBitWidths *memory.Buffer + deltaBitWidth byte + + lastVal int64 +} + +// returns the number of bytes read so far +func (d *deltaBitPackDecoder) bytesRead() int64 { + return d.bitdecoder.CurOffset() +} + +func (d *deltaBitPackDecoder) Allocator() memory.Allocator { return d.mem } + +// SetData sets in the data to be decoded and the expected number of values to decode +func (d *deltaBitPackDecoder) SetData(nvalues int, data []byte) { + // set our data into the underlying decoder for the type + d.decoder.SetData(nvalues, data) + // create a bit reader for our decoder's values + d.bitdecoder = utils.NewBitReader(bytes.NewReader(d.data)) + d.currentBlockVals = 0 + d.currentMiniBlockVals = 0 + if d.deltaBitWidths == nil { + d.deltaBitWidths = memory.NewResizableBuffer(d.mem) + } + + var ok bool + d.blockSize, ok = d.bitdecoder.GetVlqInt() + if !ok { + panic("parquet: eof exception") + } + + 
if d.miniBlocks, ok = d.bitdecoder.GetVlqInt(); !ok { + panic("parquet: eof exception") + } + + var totalValues uint64 + if totalValues, ok = d.bitdecoder.GetVlqInt(); !ok { + panic("parquet: eof exception") + } + + if int(totalValues) != d.nvals { + panic("parquet: mismatch between number of values and count in data header") + } + + if d.lastVal, ok = d.bitdecoder.GetZigZagVlqInt(); !ok { + panic("parquet: eof exception") + } + + d.valsPerMini = uint64(d.blockSize / d.miniBlocks) +} + +// initialize a block to decode +func (d *deltaBitPackDecoder) initBlock() error { + // first we grab the min delta value that we'll start from + var ok bool + if d.minDelta, ok = d.bitdecoder.GetZigZagVlqInt(); !ok { + return xerrors.New("parquet: eof exception") + } + + // ensure we have enough space for our miniblocks to decode the widths + d.deltaBitWidths.Resize(int(d.miniBlocks)) + + var err error + for i := uint64(0); i < d.miniBlocks; i++ { + if d.deltaBitWidths.Bytes()[i], err = d.bitdecoder.ReadByte(); err != nil { + return err + } + } + + d.miniBlockIdx = 0 + d.deltaBitWidth = d.deltaBitWidths.Bytes()[0] + d.currentBlockVals = d.blockSize + return nil +} + +// DeltaBitPackInt32Decoder decodes Int32 values which are packed using the Delta BitPacking algorithm. 
+type DeltaBitPackInt32Decoder struct { + *deltaBitPackDecoder + + miniBlockValues []int32 +} + +func (d *DeltaBitPackInt32Decoder) unpackNextMini() error { + if d.miniBlockValues == nil { + d.miniBlockValues = make([]int32, 0, int(d.valsPerMini)) + } else { + d.miniBlockValues = d.miniBlockValues[:0] + } + d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)] + d.currentMiniBlockVals = d.valsPerMini + + for j := 0; j < int(d.valsPerMini); j++ { + delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth)) + if !ok { + return xerrors.New("parquet: eof exception") + } + + d.lastVal += int64(delta) + int64(d.minDelta) + d.miniBlockValues = append(d.miniBlockValues, int32(d.lastVal)) + } + d.miniBlockIdx++ + return nil +} + +// Decode retrieves min(remaining values, len(out)) values from the data and returns the number +// of values actually decoded and any errors encountered. +func (d *DeltaBitPackInt32Decoder) Decode(out []int32) (int, error) { + max := utils.MinInt(len(out), d.nvals) + if max == 0 { + return 0, nil + } + + out = out[:max] + if !d.usedFirst { // starting value to calculate deltas against + out[0] = int32(d.lastVal) + out = out[1:] + d.usedFirst = true + } + + var err error + for len(out) > 0 { // unpack mini blocks until we get all the values we need + if d.currentBlockVals == 0 { + err = d.initBlock() + } + if d.currentMiniBlockVals == 0 { + err = d.unpackNextMini() + } + if err != nil { + return 0, err + } + + // copy as many values from our mini block as we can into out + start := int(d.valsPerMini - d.currentMiniBlockVals) + end := utils.MinInt(int(d.valsPerMini), len(out)) + copy(out, d.miniBlockValues[start:end]) + + numCopied := end - start + out = out[numCopied:] + d.currentBlockVals -= uint64(numCopied) + d.currentMiniBlockVals -= uint64(numCopied) + } + return max, nil +} + +// DecodeSpaced is like Decode, but the result is spaced out appropriately based on the passed in bitmap +func (d *DeltaBitPackInt32Decoder) DecodeSpaced(out 
[]int32, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + toread := len(out) - nullCount + values, err := d.Decode(out[:toread]) + if err != nil { + return values, err + } + if values != toread { + return values, xerrors.New("parquet: number of values / definition levels read did not match") + } + + return spacedExpand(out, nullCount, validBits, validBitsOffset), nil +} + +// Type returns the physical parquet type that this decoder decodes, in this case Int32 +func (DeltaBitPackInt32Decoder) Type() parquet.Type { + return parquet.Types.Int32 +} + +// DeltaBitPackInt64Decoder decodes a delta bit packed int64 column of data. +type DeltaBitPackInt64Decoder struct { + *deltaBitPackDecoder + + miniBlockValues []int64 +} + +func (d *DeltaBitPackInt64Decoder) unpackNextMini() error { + if d.miniBlockValues == nil { + d.miniBlockValues = make([]int64, 0, int(d.valsPerMini)) + } else { + d.miniBlockValues = d.miniBlockValues[:0] + } + + d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)] + d.currentMiniBlockVals = d.valsPerMini + + for j := 0; j < int(d.valsPerMini); j++ { + delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth)) + if !ok { + return xerrors.New("parquet: eof exception") + } + + d.lastVal += int64(delta) + int64(d.minDelta) + d.miniBlockValues = append(d.miniBlockValues, d.lastVal) + } + d.miniBlockIdx++ + return nil +} + +// Decode retrieves min(remaining values, len(out)) values from the data and returns the number +// of values actually decoded and any errors encountered. 
+func (d *DeltaBitPackInt64Decoder) Decode(out []int64) (int, error) { + max := utils.MinInt(len(out), d.nvals) + if max == 0 { + return 0, nil + } + + out = out[:max] + if !d.usedFirst { + out[0] = d.lastVal + out = out[1:] + d.usedFirst = true + } + + var err error + for len(out) > 0 { + if d.currentBlockVals == 0 { + err = d.initBlock() + } + if d.currentMiniBlockVals == 0 { + err = d.unpackNextMini() + } + + if err != nil { + return 0, err + } + + start := int(d.valsPerMini - d.currentMiniBlockVals) + end := utils.MinInt(int(d.valsPerMini), len(out)) + copy(out, d.miniBlockValues[start:end]) + + numCopied := end - start + out = out[numCopied:] + d.currentBlockVals -= uint64(numCopied) + d.currentMiniBlockVals -= uint64(numCopied) + } + return max, nil +} + +// Type returns the physical parquet type that this decoder decodes, in this case Int64 +func (DeltaBitPackInt64Decoder) Type() parquet.Type { + return parquet.Types.Int64 +} + +// DecodeSpaced is like Decode, but the result is spaced out appropriately based on the passed in bitmap +func (d DeltaBitPackInt64Decoder) DecodeSpaced(out []int64, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + toread := len(out) - nullCount + values, err := d.Decode(out[:toread]) + if err != nil { + return values, err + } + if values != toread { + return values, xerrors.New("parquet: number of values / definition levels read did not match") + } + + return spacedExpand(out, nullCount, validBits, validBitsOffset), nil +} + +const ( + // block size must be a multiple of 128 + defaultBlockSize = 128 + defaultNumMiniBlocks = 4 + // block size / number of mini blocks must result in a multiple of 32 + defaultNumValuesPerMini = 32 + // max size of the header for the delta blocks + maxHeaderWriterSize = 32 +) + +// deltaBitPackEncoder is an encoder for the DeltaBinary Packing format +// as per the parquet spec. +// +// Consists of a header followed by blocks of delta encoded values binary packed. 
+// +// Format +// [header] [block 1] [block 2] ... [block N] +// +// Header +// [block size] [number of mini blocks per block] [total value count] [first value] +// +// Block +// [min delta] [list of bitwidths of the miniblocks] [miniblocks...] +// +// Sets aside bytes at the start of the internal buffer where the header will be written, +// and only writes the header when FlushValues is called before returning it. +type deltaBitPackEncoder struct { + encoder + + bitWriter *utils.BitWriter + totalVals uint64 + firstVal int64 + currentVal int64 + + blockSize uint64 + miniBlockSize uint64 + numMiniBlocks uint64 + deltas []int64 +} + +// flushBlock flushes out a finished block for writing to the underlying encoder +func (enc *deltaBitPackEncoder) flushBlock() { + if len(enc.deltas) == 0 { + return + } + + // determine the minimum delta value + minDelta := int64(math.MaxInt64) + for _, delta := range enc.deltas { + if delta < minDelta { + minDelta = delta + } + } + + enc.bitWriter.WriteZigZagVlqInt(minDelta) + // reserve enough bytes to write out our miniblock deltas + offset := enc.bitWriter.ReserveBytes(int(enc.numMiniBlocks)) + + valuesToWrite := int64(len(enc.deltas)) + for i := 0; i < int(enc.numMiniBlocks); i++ { + n := utils.Min(int64(enc.miniBlockSize), valuesToWrite) + if n == 0 { + break + } + + maxDelta := int64(math.MinInt64) + start := i * int(enc.miniBlockSize) + for _, val := range enc.deltas[start : start+int(n)] { + maxDelta = utils.Max(maxDelta, val) + } + + // compute bit width to store (max_delta - min_delta) + width := uint(bits.Len64(uint64(maxDelta - minDelta))) + // write out the bit width we used into the bytes we reserved earlier + enc.bitWriter.WriteAt([]byte{byte(width)}, int64(offset+i)) + + // write out our deltas + for _, val := range enc.deltas[start : start+int(n)] { + enc.bitWriter.WriteValue(uint64(val-minDelta), width) + } + + valuesToWrite -= n + + // pad the last block if n < miniBlockSize + for ; n < int64(enc.miniBlockSize); n++ 
{ + enc.bitWriter.WriteValue(0, width) + } + } + enc.deltas = enc.deltas[:0] +} + +// putInternal is the implementation for actually writing data which must be +// integral data as int, int8, int32, or int64. +func (enc *deltaBitPackEncoder) putInternal(data interface{}) { + v := reflect.ValueOf(data) + if v.Len() == 0 { + return + } + + idx := 0 + if enc.totalVals == 0 { + enc.blockSize = defaultBlockSize + enc.numMiniBlocks = defaultNumMiniBlocks + enc.miniBlockSize = defaultNumValuesPerMini + + enc.firstVal = v.Index(0).Int() + enc.currentVal = enc.firstVal + idx = 1 + + enc.bitWriter = utils.NewBitWriter(enc.sink) + } + + enc.totalVals += uint64(v.Len()) + for ; idx < v.Len(); idx++ { + val := v.Index(idx).Int() + enc.deltas = append(enc.deltas, val-enc.currentVal) + enc.currentVal = val + if len(enc.deltas) == int(enc.blockSize) { + enc.flushBlock() + } + } +} + +// FlushValues flushes any remaining data and returns the finished encoded buffer +func (enc *deltaBitPackEncoder) FlushValues() Buffer { + if enc.bitWriter != nil { + // write any remaining values + enc.flushBlock() + enc.bitWriter.Flush(true) + } else { + enc.blockSize = defaultBlockSize + enc.numMiniBlocks = defaultNumMiniBlocks + enc.miniBlockSize = defaultNumValuesPerMini + } + + buffer := make([]byte, maxHeaderWriterSize) + headerWriter := utils.NewBitWriter(utils.NewWriterAtBuffer(buffer)) + + headerWriter.WriteVlqInt(uint64(enc.blockSize)) + headerWriter.WriteVlqInt(uint64(enc.numMiniBlocks)) + headerWriter.WriteVlqInt(uint64(enc.totalVals)) + headerWriter.WriteZigZagVlqInt(int64(enc.firstVal)) + headerWriter.Flush(false) + + buffer = buffer[:headerWriter.Written()] + enc.totalVals = 0 + + if enc.bitWriter != nil { + flushed := enc.sink.Finish() + defer flushed.Release() + + buffer = append(buffer, flushed.Buf()[:enc.bitWriter.Written()]...) 
+ } + return poolBuffer{memory.NewBufferBytes(buffer)} +} + +// EstimatedDataEncodedSize returns the current amount of data actually flushed out and written +func (enc *deltaBitPackEncoder) EstimatedDataEncodedSize() int64 { + return int64(enc.bitWriter.Written()) +} + +// DeltaBitPackInt32Encoder is an encoder for the delta bitpacking encoding for int32 data. +type DeltaBitPackInt32Encoder struct { + *deltaBitPackEncoder +} + +// Put writes the values from the provided slice of int32 to the encoder +func (enc DeltaBitPackInt32Encoder) Put(in []int32) { + enc.putInternal(in) +} + +// PutSpaced takes a slice of int32 along with a bitmap that describes the nulls and an offset into the bitmap +// in order to write spaced data to the encoder. +func (enc DeltaBitPackInt32Encoder) PutSpaced(in []int32, validBits []byte, validBitsOffset int64) { + buffer := memory.NewResizableBuffer(enc.mem) + buffer.Reserve(arrow.Int32Traits.BytesRequired(len(in))) + defer buffer.Release() + + data := arrow.Int32Traits.CastFromBytes(buffer.Buf()) + nvalid := spacedCompress(in, data, validBits, validBitsOffset) + enc.Put(data[:nvalid]) +} + +// Type returns the underlying physical type this encoder works with, in this case Int32 +func (DeltaBitPackInt32Encoder) Type() parquet.Type { + return parquet.Types.Int32 +} + +// DeltaBitPackInt32Encoder is an encoder for the delta bitpacking encoding for int32 data. +type DeltaBitPackInt64Encoder struct { + *deltaBitPackEncoder +} + +// Put writes the values from the provided slice of int64 to the encoder +func (enc DeltaBitPackInt64Encoder) Put(in []int64) { + enc.putInternal(in) +} + +// PutSpaced takes a slice of int64 along with a bitmap that describes the nulls and an offset into the bitmap +// in order to write spaced data to the encoder. 
+func (enc DeltaBitPackInt64Encoder) PutSpaced(in []int64, validBits []byte, validBitsOffset int64) { + buffer := memory.NewResizableBuffer(enc.mem) + buffer.Reserve(arrow.Int64Traits.BytesRequired(len(in))) + defer buffer.Release() + + data := arrow.Int64Traits.CastFromBytes(buffer.Buf()) + nvalid := spacedCompress(in, data, validBits, validBitsOffset) + enc.Put(data[:nvalid]) +} + +// Type returns the underlying physical type this encoder works with, in this case Int64 +func (DeltaBitPackInt64Encoder) Type() parquet.Type { + return parquet.Types.Int64 +} diff --git a/go/parquet/internal/encoding/delta_byte_array.go b/go/parquet/internal/encoding/delta_byte_array.go new file mode 100644 index 00000000000..bc2cc638e70 --- /dev/null +++ b/go/parquet/internal/encoding/delta_byte_array.go @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" + "golang.org/x/xerrors" +) + +// DeltaByteArrayEncoder is an encoder for writing bytearrays which are delta encoded +// this is also known as incremental encoding or front compression. 
For each element +// in a sequence of strings, we store the prefix length of the previous entry plus the suffix +// see https://en.wikipedia.org/wiki/Incremental_encoding for a longer description. +// +// This is stored as a sequence of delta-encoded prefix lengths followed by the suffixes +// encoded as delta length byte arrays. +type DeltaByteArrayEncoder struct { + encoder + + prefixEncoder *DeltaBitPackInt32Encoder + suffixEncoder *DeltaLengthByteArrayEncoder + + lastVal parquet.ByteArray +} + +func (enc *DeltaByteArrayEncoder) initEncoders() { + enc.prefixEncoder = &DeltaBitPackInt32Encoder{ + deltaBitPackEncoder: &deltaBitPackEncoder{encoder: newEncoderBase(enc.encoding, nil, enc.mem)}} + enc.suffixEncoder = &DeltaLengthByteArrayEncoder{ + newEncoderBase(enc.encoding, nil, enc.mem), + &DeltaBitPackInt32Encoder{ + deltaBitPackEncoder: &deltaBitPackEncoder{encoder: newEncoderBase(enc.encoding, nil, enc.mem)}}} +} + +// Type returns the underlying physical type this operates on, in this case ByteArrays only +func (DeltaByteArrayEncoder) Type() parquet.Type { return parquet.Types.ByteArray } + +// Put writes a slice of ByteArrays to the encoder +func (enc *DeltaByteArrayEncoder) Put(in []parquet.ByteArray) { + if len(in) == 0 { + return + } + + var suf [1]parquet.ByteArray + if enc.prefixEncoder == nil { // initialize our encoders if we haven't yet + enc.initEncoders() + enc.prefixEncoder.Put([]int32{0}) + suf[0] = in[0] + enc.lastVal = append([]byte(nil), in[0]...) + enc.suffixEncoder.Put(suf[:]) + in = in[1:] + } + + // for each value, figure out the common prefix with the previous value + // and then write the prefix length and the suffix. + for _, val := range in { + l1 := enc.lastVal.Len() + l2 := val.Len() + j := 0 + for j < l1 && j < l2 { + if enc.lastVal[j] != val[j] { + break + } + j++ + } + enc.prefixEncoder.Put([]int32{int32(j)}) + suf[0] = val[j:] + enc.suffixEncoder.Put(suf[:]) + enc.lastVal = append([]byte(nil), val...) 
+ } +} + +// PutSpaced is like Put, but assumes the data is already spaced for nulls and uses the bitmap provided and offset +// to compress the data before writing it without the null slots. +func (enc *DeltaByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { + if validBits != nil { + data := make([]parquet.ByteArray, len(in)) + nvalid := spacedCompress(in, data, validBits, validBitsOffset) + enc.Put(data[:nvalid]) + } else { + enc.Put(in) + } +} + +// Flush flushes any remaining data out and returns the finished encoded buffer. +func (enc *DeltaByteArrayEncoder) FlushValues() Buffer { + if enc.prefixEncoder == nil { + enc.initEncoders() + } + prefixBuf := enc.prefixEncoder.FlushValues() + defer prefixBuf.Release() + + suffixBuf := enc.suffixEncoder.FlushValues() + defer suffixBuf.Release() + + ret := bufferPool.Get().(*memory.Buffer) + ret.ResizeNoShrink(prefixBuf.Len() + suffixBuf.Len()) + copy(ret.Bytes(), prefixBuf.Bytes()) + copy(ret.Bytes()[prefixBuf.Len():], suffixBuf.Bytes()) + return poolBuffer{ret} +} + +// DeltaByteArrayDecoder is a decoder for a column of data encoded using incremental or prefix encoding. +type DeltaByteArrayDecoder struct { + *DeltaLengthByteArrayDecoder + + prefixLengths []int32 + lastVal parquet.ByteArray +} + +// Type returns the underlying physical type this decoder operates on, in this case ByteArrays only +func (DeltaByteArrayDecoder) Type() parquet.Type { + return parquet.Types.ByteArray +} + +func (d *DeltaByteArrayDecoder) Allocator() memory.Allocator { return d.mem } + +// SetData expects the data passed in to be the prefix lengths, followed by the +// blocks of suffix data in order to initialize the decoder. 
+func (d *DeltaByteArrayDecoder) SetData(nvalues int, data []byte) { + prefixLenDec := DeltaBitPackInt32Decoder{ + deltaBitPackDecoder: &deltaBitPackDecoder{ + decoder: newDecoderBase(d.encoding, d.descr), + mem: d.mem}} + + prefixLenDec.SetData(nvalues, data) + d.prefixLengths = make([]int32, nvalues) + // decode all the prefix lengths first so we know how many bytes it took to get the + // prefix lengths for nvalues + prefixLenDec.Decode(d.prefixLengths) + + // now that we know how many bytes we needed for the prefix lengths, the rest are the + // delta length byte array encoding. + d.DeltaLengthByteArrayDecoder.SetData(nvalues, data[int(prefixLenDec.bytesRead()):]) +} + +// Decode decodes byte arrays into the slice provided and returns the number of values actually decoded +func (d *DeltaByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { + max := utils.MinInt(len(out), d.nvals) + if max == 0 { + return 0, nil + } + out = out[:max] + + var err error + if d.lastVal == nil { + _, err = d.DeltaLengthByteArrayDecoder.Decode(out[:1]) + if err != nil { + return 0, err + } + d.lastVal = out[0] + out = out[1:] + d.prefixLengths = d.prefixLengths[1:] + } + + var prefixLen int32 + suffixHolder := make([]parquet.ByteArray, 1) + for len(out) > 0 { + prefixLen, d.prefixLengths = d.prefixLengths[0], d.prefixLengths[1:] + + prefix := d.lastVal[:prefixLen] + _, err = d.DeltaLengthByteArrayDecoder.Decode(suffixHolder) + if err != nil { + return 0, err + } + + d.lastVal = make([]byte, 0, int(prefixLen)+len(suffixHolder[0])) + d.lastVal = append([]byte{}, prefix...) + d.lastVal = append(d.lastVal, suffixHolder[0]...) + out[0], out = d.lastVal, out[1:] + } + return max, nil +} + +// DecodeSpaced is like decode, but the result is spaced out based on the bitmap provided. 
+func (d *DeltaByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + toread := len(out) - nullCount + values, err := d.Decode(out[:toread]) + if err != nil { + return values, err + } + if values != toread { + return values, xerrors.New("parquet: number of values / definition levels read did not match") + } + + return spacedExpand(out, nullCount, validBits, validBitsOffset), nil +} diff --git a/go/parquet/internal/encoding/delta_length_byte_array.go b/go/parquet/internal/encoding/delta_length_byte_array.go new file mode 100644 index 00000000000..61309d97c85 --- /dev/null +++ b/go/parquet/internal/encoding/delta_length_byte_array.go @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" + "golang.org/x/xerrors" +) + +// DeltaLengthByteArrayEncoder encodes data using by taking all of the byte array lengths +// and encoding them in front using delta encoding, followed by all of the binary data +// concatenated back to back. 
The expected savings is from the cost of encoding the lengths +// and possibly better compression in the data which will no longer be interleaved with the lengths. +// +// This encoding is always preferred over PLAIN for byte array columns where possible. +// +// For example, if the data was "Hello", "World", "Foobar", "ABCDEF" the encoded data would be: +// DeltaEncoding(5, 5, 6, 6) "HelloWorldFoobarABCDEF" +type DeltaLengthByteArrayEncoder struct { + encoder + + lengthEncoder *DeltaBitPackInt32Encoder +} + +// Put writes the provided slice of byte arrays to the encoder +func (enc *DeltaLengthByteArrayEncoder) Put(in []parquet.ByteArray) { + lengths := make([]int32, len(in)) + totalLen := int(0) + for idx, val := range in { + lengths[idx] = int32(val.Len()) + totalLen += val.Len() + } + + enc.lengthEncoder.Put(lengths) + enc.sink.Reserve(totalLen) + for _, val := range in { + enc.sink.UnsafeWrite(val) + } +} + +// PutSpaced is like Put, but the data is spaced out according to the bitmap provided and is compressed +// accordingly before it is written to drop the null data from the write. +func (enc *DeltaLengthByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { + if validBits != nil { + data := make([]parquet.ByteArray, len(in)) + nvalid := spacedCompress(in, data, validBits, validBitsOffset) + enc.Put(data[:nvalid]) + } else { + enc.Put(in) + } +} + +// Type returns the underlying type which is handled by this encoder, ByteArrays only. +func (DeltaLengthByteArrayEncoder) Type() parquet.Type { + return parquet.Types.ByteArray +} + +// FlushValues flushes any remaining data and returns the final encoded buffer of data. 
+func (enc *DeltaLengthByteArrayEncoder) FlushValues() Buffer { + ret := enc.lengthEncoder.FlushValues() + defer ret.Release() + + data := enc.sink.Finish() + defer data.Release() + + output := bufferPool.Get().(*memory.Buffer) + output.ResizeNoShrink(ret.Len() + data.Len()) + copy(output.Bytes(), ret.Bytes()) + copy(output.Bytes()[ret.Len():], data.Bytes()) + return poolBuffer{output} +} + +// DeltaLengthByteArrayDecoder is a decoder for handling data produced by the corresponding +// encoder which expects delta packed lengths followed by the bytes of data. +type DeltaLengthByteArrayDecoder struct { + decoder + + mem memory.Allocator + lengths []int32 +} + +// Type returns the underlying type which is handled by this encoder, ByteArrays only. +func (DeltaLengthByteArrayDecoder) Type() parquet.Type { + return parquet.Types.ByteArray +} + +func (d *DeltaLengthByteArrayDecoder) Allocator() memory.Allocator { return d.mem } + +// SetData sets in the expected data to the decoder which should be nvalues delta packed lengths +// followed by the rest of the byte array data immediately after. +func (d *DeltaLengthByteArrayDecoder) SetData(nvalues int, data []byte) { + dec := DeltaBitPackInt32Decoder{ + deltaBitPackDecoder: &deltaBitPackDecoder{ + decoder: newDecoderBase(d.encoding, d.descr), + mem: d.mem}} + + dec.SetData(nvalues, data) + d.lengths = make([]int32, nvalues) + dec.Decode(d.lengths) + + d.decoder.SetData(nvalues, data[int(dec.bytesRead()):]) +} + +// Decode populates the passed in slice with data decoded until it hits the length of out +// or runs out of values in the column to decode, then returns the number of values actually decoded. 
+func (d *DeltaLengthByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { + max := utils.MinInt(len(out), d.nvals) + for i := 0; i < max; i++ { + out[i] = d.data[:d.lengths[i]] + d.data = d.data[d.lengths[i]:] + } + d.nvals -= max + d.lengths = d.lengths[max:] + return max, nil +} + +// DecodeSpaced is like Decode, but for spaced data using the provided bitmap to determine where the nulls should be inserted. +func (d *DeltaLengthByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + toread := len(out) - nullCount + values, _ := d.Decode(out[:toread]) + if values != toread { + return values, xerrors.New("parquet: number of values / definition levels read did not match") + } + + return spacedExpand(out, nullCount, validBits, validBitsOffset), nil +} diff --git a/go/parquet/internal/encoding/encoder.go b/go/parquet/internal/encoding/encoder.go new file mode 100644 index 00000000000..81d1a15f49c --- /dev/null +++ b/go/parquet/internal/encoding/encoder.go @@ -0,0 +1,300 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package encoding + +import ( + "math/bits" + "reflect" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/bitutil" + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" + "github.com/apache/arrow/go/parquet/schema" +) + +//go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata plain_encoder_types.gen.go.tmpl typed_encoder.gen.go.tmpl + +// EncoderTraits is an interface for the different types to make it more +// convenient to construct encoders for specific types. +type EncoderTraits interface { + Encoder(format.Encoding, bool, *schema.Column, memory.Allocator) TypedEncoder +} + +// NewEncoder will return the appropriately typed encoder for the requested physical type +// and encoding. +// +// If mem is nil, memory.DefaultAllocator will be used. +func NewEncoder(t parquet.Type, e parquet.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { + traits := getEncodingTraits(t) + if traits == nil { + return nil + } + + if mem == nil { + mem = memory.DefaultAllocator + } + return traits.Encoder(format.Encoding(e), useDict, descr, mem) +} + +type encoder struct { + descr *schema.Column + encoding format.Encoding + typeLen int + mem memory.Allocator + + sink *PooledBufferWriter +} + +// newEncoderBase constructs a new base encoder for embedding on the typed encoders +// encapsulating the common functionality. 
+func newEncoderBase(e format.Encoding, descr *schema.Column, mem memory.Allocator) encoder { + typelen := -1 + if descr != nil && descr.PhysicalType() == parquet.Types.FixedLenByteArray { + typelen = int(descr.TypeLength()) + } + return encoder{ + descr: descr, + encoding: e, + mem: mem, + typeLen: typelen, + sink: NewPooledBufferWriter(1024), + } +} + +// ReserveForWrite allocates n bytes so that the next n bytes written do not require new allocations. +func (e *encoder) ReserveForWrite(n int) { e.sink.Reserve(n) } +func (e *encoder) EstimatedDataEncodedSize() int64 { return int64(e.sink.Len()) } +func (e *encoder) Encoding() parquet.Encoding { return parquet.Encoding(e.encoding) } +func (e *encoder) Allocator() memory.Allocator { return e.mem } +func (e *encoder) append(data []byte) { e.sink.Write(data) } + +// FlushValues flushes any unwritten data to the buffer and returns the finished encoded buffer of data. +// This also clears the encoder, ownership of the data belongs to whomever called FlushValues, Release +// should be called on the resulting Buffer when done. +func (e *encoder) FlushValues() Buffer { return e.sink.Finish() } + +// Bytes returns the current bytes that have been written to the encoder's buffer but doesn't transfer ownership. +func (e *encoder) Bytes() []byte { return e.sink.Bytes() } + +// Reset drops the data currently in the encoder and resets for new use. +func (e *encoder) Reset() { e.sink.Reset(0) } + +type dictEncoder struct { + encoder + + dictEncodedSize int + idxBuffer *memory.Buffer + idxValues []int32 + memo MemoTable +} + +// newDictEncoderBase constructs and returns a dictionary encoder for the appropriate type using the passed +// in memo table for constructing the index. 
// addIndex appends the passed index to the index buffer.
//
// d.idxValues is an []int32 view laid over the bytes of d.idxBuffer. When the
// view is full (len == cap) the buffer is resized to hold the next power of
// two of (current length + 1) int32 values, and the view is rebuilt over the
// resized buffer before the new index is appended.
func (d *dictEncoder) addIndex(idx int) {
	if len(d.idxValues) == cap(d.idxValues) {
		curLen := len(d.idxValues)
		// grow the backing buffer; sized in bytes for NextPowerOf2(curLen+1) int32s
		d.idxBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(bitutil.NextPowerOf2(curLen + 1)))
		// re-derive the view: length stays curLen, capacity becomes the number
		// of int32s that fit in the buffer's current length, so the append
		// below writes into idxBuffer instead of allocating a separate array.
		// NOTE(review): this relies on ResizeNoShrink preserving the bytes
		// already written — confirm against memory.Buffer.
		d.idxValues = arrow.Int32Traits.CastFromBytes(d.idxBuffer.Buf())[: curLen : d.idxBuffer.Len()/arrow.Int32SizeBytes]
	}
	d.idxValues = append(d.idxValues, int32(idx))
}
+func (d *dictEncoder) NumEntries() int { + return d.memo.Size() +} + +// BitWidth returns the max bitwidth that would be necessary for encoding the index values currently +// in the dictionary based on the size of the dictionary index. +func (d *dictEncoder) BitWidth() int { + switch d.NumEntries() { + case 0: + return 0 + case 1: + return 1 + default: + return bits.Len32(uint32(d.NumEntries() - 1)) + } +} + +// WriteDict writes the dictionary index to the given byte slice. +func (d *dictEncoder) WriteDict(out []byte) { + d.memo.CopyValues(out) +} + +// WriteIndices performs Run Length encoding on the indexes and the writes the encoded +// index value data to the provided byte slice, returning the number of bytes actually written. +func (d *dictEncoder) WriteIndices(out []byte) int { + out[0] = byte(d.BitWidth()) + + enc := utils.NewRleEncoder(utils.NewWriterAtBuffer(out[1:]), d.BitWidth()) + for _, idx := range d.idxValues { + if !enc.Put(uint64(idx)) { + return -1 + } + } + nbytes := enc.Flush() + + d.idxValues = d.idxValues[:0] + return nbytes + 1 +} + +// Put adds a value to the dictionary data column, inserting the value if it +// didn't already exist in the dictionary. +func (d *dictEncoder) Put(v interface{}) { + memoIdx, found, err := d.memo.GetOrInsert(v) + if err != nil { + panic(err) + } + if !found { + d.dictEncodedSize += int(reflect.TypeOf(v).Size()) + } + d.addIndex(memoIdx) +} + +// DictEncodedSize returns the current size of the encoded dictionary +func (d *dictEncoder) DictEncodedSize() int { + return d.dictEncodedSize +} + +// spacedCompress is a helper function for encoders to remove the slots in the slices passed in according +// to the bitmap which are null into an output slice that is no longer spaced out with slots for nulls. 
+func spacedCompress(src, out interface{}, validBits []byte, validBitsOffset int64) int { + nvalid := 0 + + // for efficiency we use a type switch because the copy runs significantly faster when typed + // than calling reflect.Copy + switch s := src.(type) { + case []int32: + o := out.([]int32) + reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) + for { + run := reader.NextRun() + if run.Length == 0 { + break + } + copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) + nvalid += int(run.Length) + } + case []int64: + o := out.([]int64) + reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) + for { + run := reader.NextRun() + if run.Length == 0 { + break + } + copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) + nvalid += int(run.Length) + } + case []float32: + o := out.([]float32) + reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) + for { + run := reader.NextRun() + if run.Length == 0 { + break + } + copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) + nvalid += int(run.Length) + } + case []float64: + o := out.([]float64) + reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) + for { + run := reader.NextRun() + if run.Length == 0 { + break + } + copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) + nvalid += int(run.Length) + } + case []parquet.ByteArray: + o := out.([]parquet.ByteArray) + reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) + for { + run := reader.NextRun() + if run.Length == 0 { + break + } + copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) + nvalid += int(run.Length) + } + case []parquet.FixedLenByteArray: + o := out.([]parquet.FixedLenByteArray) + reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) + for { + run := reader.NextRun() + if run.Length == 0 { + break + } + copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) + nvalid += int(run.Length) + } 
+ case []bool: + o := out.([]bool) + reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) + for { + run := reader.NextRun() + if run.Length == 0 { + break + } + copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) + nvalid += int(run.Length) + } + } + + return nvalid +} diff --git a/go/parquet/internal/encoding/encoding_benchmarks_test.go b/go/parquet/internal/encoding/encoding_benchmarks_test.go new file mode 100644 index 00000000000..13d8b0dd9bc --- /dev/null +++ b/go/parquet/internal/encoding/encoding_benchmarks_test.go @@ -0,0 +1,461 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package encoding_test + +import ( + "fmt" + "math" + "testing" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/encoding" + "github.com/apache/arrow/go/parquet/internal/hashing" + "github.com/apache/arrow/go/parquet/internal/testutils" + "github.com/apache/arrow/go/parquet/schema" +) + +const ( + MINSIZE = 1024 + MAXSIZE = 65536 +) + +func BenchmarkPlainEncodingBoolean(b *testing.B) { + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + values := make([]bool, sz) + for idx := range values { + values[idx] = true + } + encoder := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain, + false, nil, memory.DefaultAllocator).(encoding.BooleanEncoder) + b.ResetTimer() + b.SetBytes(int64(len(values))) + for n := 0; n < b.N; n++ { + encoder.Put(values) + encoder.FlushValues().Release() + } + }) + } +} + +func BenchmarkPlainEncodingInt32(b *testing.B) { + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + values := make([]int32, sz) + for idx := range values { + values[idx] = 64 + } + encoder := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain, + false, nil, memory.DefaultAllocator).(encoding.Int32Encoder) + b.ResetTimer() + b.SetBytes(int64(len(values) * arrow.Int32SizeBytes)) + for n := 0; n < b.N; n++ { + encoder.Put(values) + encoder.FlushValues().Release() + } + }) + } +} + +func BenchmarkPlainEncodingInt64(b *testing.B) { + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + values := make([]int64, sz) + for idx := range values { + values[idx] = 64 + } + encoder := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.Plain, + false, nil, memory.DefaultAllocator).(encoding.Int64Encoder) + b.ResetTimer() + 
b.SetBytes(int64(len(values) * arrow.Int64SizeBytes)) + for n := 0; n < b.N; n++ { + encoder.Put(values) + encoder.FlushValues().Release() + } + }) + } +} + +func BenchmarkPlainEncodingFloat32(b *testing.B) { + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + values := make([]float32, sz) + for idx := range values { + values[idx] = 64.0 + } + encoder := encoding.NewEncoder(parquet.Types.Float, parquet.Encodings.Plain, + false, nil, memory.DefaultAllocator).(encoding.Float32Encoder) + b.ResetTimer() + b.SetBytes(int64(len(values) * arrow.Float32SizeBytes)) + for n := 0; n < b.N; n++ { + encoder.Put(values) + encoder.FlushValues().Release() + } + }) + } +} + +func BenchmarkPlainEncodingFloat64(b *testing.B) { + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + values := make([]float64, sz) + for idx := range values { + values[idx] = 64 + } + encoder := encoding.NewEncoder(parquet.Types.Double, parquet.Encodings.Plain, + false, nil, memory.DefaultAllocator).(encoding.Float64Encoder) + b.ResetTimer() + b.SetBytes(int64(len(values) * arrow.Float64SizeBytes)) + for n := 0; n < b.N; n++ { + encoder.Put(values) + encoder.FlushValues().Release() + } + }) + } +} + +func BenchmarkPlainDecodingBoolean(b *testing.B) { + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + output := make([]bool, sz) + values := make([]bool, sz) + for idx := range values { + values[idx] = true + } + encoder := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain, + false, nil, memory.DefaultAllocator).(encoding.BooleanEncoder) + encoder.Put(values) + buf := encoder.FlushValues() + defer buf.Release() + + decoder := encoding.NewDecoder(parquet.Types.Boolean, parquet.Encodings.Plain, nil, memory.DefaultAllocator) + b.ResetTimer() + b.SetBytes(int64(len(values))) + for n := 0; n < b.N; n++ { + decoder.SetData(sz, buf.Bytes()) + 
decoder.(encoding.BooleanDecoder).Decode(output) + } + }) + } +} + +func BenchmarkPlainDecodingInt32(b *testing.B) { + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + output := make([]int32, sz) + values := make([]int32, sz) + for idx := range values { + values[idx] = 64 + } + encoder := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain, + false, nil, memory.DefaultAllocator).(encoding.Int32Encoder) + encoder.Put(values) + buf := encoder.FlushValues() + defer buf.Release() + + decoder := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.Plain, nil, memory.DefaultAllocator) + b.ResetTimer() + b.SetBytes(int64(len(values))) + for n := 0; n < b.N; n++ { + decoder.SetData(sz, buf.Bytes()) + decoder.(encoding.Int32Decoder).Decode(output) + } + }) + } +} + +func BenchmarkMemoTableFloat64(b *testing.B) { + tests := []struct { + nunique int32 + nvalues int64 + }{ + {100, 65535}, + {1000, 65535}, + {5000, 65535}, + } + + for _, tt := range tests { + b.Run(fmt.Sprintf("%d unique n %d", tt.nunique, tt.nvalues), func(b *testing.B) { + rag := testutils.NewRandomArrayGenerator(0) + dict := rag.Float64(int64(tt.nunique), 0) + indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0) + + values := make([]float64, tt.nvalues) + for idx := range values { + values[idx] = dict.Value(int(indices.Value(idx))) + } + + b.ResetTimer() + b.Run("go map", func(b *testing.B) { + for i := 0; i < b.N; i++ { + tbl := encoding.NewFloat64MemoTable(memory.DefaultAllocator) + for _, v := range values { + tbl.GetOrInsert(v) + } + if tbl.Size() != int(tt.nunique) { + b.Fatal(tbl.Size(), tt.nunique) + } + } + }) + b.ResetTimer() + b.Run("xxh3", func(b *testing.B) { + for i := 0; i < b.N; i++ { + tbl := hashing.NewFloat64MemoTable(0) + for _, v := range values { + tbl.GetOrInsert(v) + } + if tbl.Size() != int(tt.nunique) { + b.Fatal(tbl.Size(), tt.nunique) + } + } + }) + }) + } +} + +func BenchmarkMemoTableInt32(b 
// BenchmarkMemoTable compares hashing.NewBinaryMemoTable (sub-benchmark
// "xxh3") against encoding.NewBinaryMemoTable (sub-benchmark "go map") when
// inserting byte-array values drawn from a dictionary of tt.nunique distinct
// entries. Each iteration builds a fresh table, inserts every value, checks
// that the table ends up with exactly tt.nunique entries, and releases it.
func BenchmarkMemoTable(b *testing.B) {
	// each case: number of distinct values, min/max value length, total values inserted
	tests := []struct {
		nunique int32
		minLen  int32
		maxLen  int32
		nvalues int64
	}{
		{100, 32, 32, 65535},
		{100, 8, 32, 65535},
		{1000, 32, 32, 65535},
		{1000, 8, 32, 65535},
		{5000, 32, 32, 65535},
		{5000, 8, 32, 65535},
	}

	for _, tt := range tests {
		b.Run(fmt.Sprintf("%d unique len %d-%d n %d", tt.nunique, tt.minLen, tt.maxLen, tt.nvalues), func(b *testing.B) {

			// NOTE(review): the 0 passed here is presumably a fixed seed — confirm in testutils
			rag := testutils.NewRandomArrayGenerator(0)
			dict := rag.ByteArray(int64(tt.nunique), tt.minLen, tt.maxLen, 0).(*array.String)
			indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0)

			// expand the dictionary into the full value list by looking up each generated index
			values := make([]parquet.ByteArray, tt.nvalues)
			for idx := range values {
				values[idx] = []byte(dict.Value(int(indices.Value(idx))))
			}

			b.ResetTimer()

			b.Run("xxh3", func(b *testing.B) {
				for i := 0; i < b.N; i++ {
					tbl := hashing.NewBinaryMemoTable(memory.DefaultAllocator, 0, -1)
					for _, v := range values {
						tbl.GetOrInsert(v)
					}
					// sanity check: every dictionary entry must have been seen exactly once as a key
					if tbl.Size() != int(tt.nunique) {
						b.Fatal(tbl.Size(), tt.nunique)
					}
					tbl.Release()
				}
			})
			b.ResetTimer()
			b.Run("go map", func(b *testing.B) {
				for i := 0; i < b.N; i++ {
					tbl := encoding.NewBinaryMemoTable(memory.DefaultAllocator)
					for _, v := range values {
						tbl.GetOrInsert(v)
					}
					// sanity check: every dictionary entry must have been seen exactly once as a key
					if tbl.Size() != int(tt.nunique) {
						b.Fatal(tbl.Size(), tt.nunique)
					}
					tbl.Release()
				}
			})
		})
	}
}
// BenchmarkDecodeDictByteArray measures decoding dictionary-encoded byte
// arrays. The setup encodes nvalues values drawn from nunique distinct
// 32-byte entries and writes out the dictionary and index buffers once; each
// iteration then builds a plain decoder over the dictionary bytes, wraps it in
// a dictionary decoder, and decodes all nvalues values.
func BenchmarkDecodeDictByteArray(b *testing.B) {
	const (
		nunique = 100
		minLen  = 32
		maxLen  = 32
		nvalues = 65535
	)

	rag := testutils.NewRandomArrayGenerator(0)
	dict := rag.ByteArray(nunique, minLen, maxLen, 0).(*array.String)
	indices := rag.Int32(nvalues, 0, nunique-1, 0)

	// expand the dictionary into the full value list by looking up each generated index
	values := make([]parquet.ByteArray, nvalues)
	for idx := range values {
		values[idx] = []byte(dict.Value(int(indices.Value(idx))))
	}

	col := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0)
	enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.PlainDict, true, col, memory.DefaultAllocator).(*encoding.DictByteArrayEncoder)
	enc.Put(values)

	// serialized dictionary page contents
	dictBuf := make([]byte, enc.DictEncodedSize())
	enc.WriteDict(dictBuf)

	// serialized index data, sized from the encoder's own estimate
	idxBuf := make([]byte, enc.EstimatedDataEncodedSize())
	enc.WriteIndices(idxBuf)

	out := make([]parquet.ByteArray, nvalues)

	// exclude all of the encoding setup above from the measurement
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		// decoder construction is deliberately inside the loop, so it is part of the measured cost
		dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.Plain, col, memory.DefaultAllocator)
		dec.SetData(nunique, dictBuf)
		dictDec := encoding.NewDictDecoder(parquet.Types.ByteArray, col, memory.DefaultAllocator).(*encoding.DictByteArrayDecoder)
		dictDec.SetDict(dec)
		dictDec.SetData(nvalues, idxBuf)

		dictDec.Decode(out)
	}
}
00000000000..d4aa12b5b94 --- /dev/null +++ b/go/parquet/internal/encoding/encoding_test.go @@ -0,0 +1,684 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding_test + +import ( + "fmt" + "reflect" + "testing" + "unsafe" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/bitutil" + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/encoding" + "github.com/apache/arrow/go/parquet/internal/testutils" + "github.com/apache/arrow/go/parquet/schema" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/suite" +) + +type nodeFactory func(string, parquet.Repetition, int32) *schema.PrimitiveNode + +func createNodeFactory(t reflect.Type) nodeFactory { + switch t { + case reflect.TypeOf(true): + return schema.NewBooleanNode + case reflect.TypeOf(int32(0)): + return schema.NewInt32Node + case reflect.TypeOf(int64(0)): + return schema.NewInt64Node + case reflect.TypeOf(parquet.Int96{}): + return schema.NewInt96Node + case reflect.TypeOf(float32(0)): + return schema.NewFloat32Node + case reflect.TypeOf(float64(0)): + return schema.NewFloat64Node + case reflect.TypeOf(parquet.ByteArray{}): + return 
schema.NewByteArrayNode + case reflect.TypeOf(parquet.FixedLenByteArray{}): + return func(name string, rep parquet.Repetition, field int32) *schema.PrimitiveNode { + return schema.NewFixedLenByteArrayNode(name, rep, 12, field) + } + } + return nil +} + +func initdata(t reflect.Type, drawbuf, decodebuf []byte, nvals, repeats int, heap *memory.Buffer) (interface{}, interface{}) { + switch t { + case reflect.TypeOf(true): + draws := *(*[]bool)(unsafe.Pointer(&drawbuf)) + decode := *(*[]bool)(unsafe.Pointer(&decodebuf)) + testutils.InitValues(draws[:nvals], heap) + + for j := 1; j < repeats; j++ { + for k := 0; k < nvals; k++ { + draws[nvals*j+k] = draws[k] + } + } + + return draws[:nvals*repeats], decode[:nvals*repeats] + case reflect.TypeOf(int32(0)): + draws := arrow.Int32Traits.CastFromBytes(drawbuf) + decode := arrow.Int32Traits.CastFromBytes(decodebuf) + testutils.InitValues(draws[:nvals], heap) + + for j := 1; j < repeats; j++ { + for k := 0; k < nvals; k++ { + draws[nvals*j+k] = draws[k] + } + } + + return draws[:nvals*repeats], decode[:nvals*repeats] + case reflect.TypeOf(int64(0)): + draws := arrow.Int64Traits.CastFromBytes(drawbuf) + decode := arrow.Int64Traits.CastFromBytes(decodebuf) + testutils.InitValues(draws[:nvals], heap) + + for j := 1; j < repeats; j++ { + for k := 0; k < nvals; k++ { + draws[nvals*j+k] = draws[k] + } + } + + return draws[:nvals*repeats], decode[:nvals*repeats] + case reflect.TypeOf(parquet.Int96{}): + draws := parquet.Int96Traits.CastFromBytes(drawbuf) + decode := parquet.Int96Traits.CastFromBytes(decodebuf) + testutils.InitValues(draws[:nvals], heap) + + for j := 1; j < repeats; j++ { + for k := 0; k < nvals; k++ { + draws[nvals*j+k] = draws[k] + } + } + + return draws[:nvals*repeats], decode[:nvals*repeats] + case reflect.TypeOf(float32(0)): + draws := arrow.Float32Traits.CastFromBytes(drawbuf) + decode := arrow.Float32Traits.CastFromBytes(decodebuf) + testutils.InitValues(draws[:nvals], heap) + + for j := 1; j < repeats; j++ { + 
for k := 0; k < nvals; k++ { + draws[nvals*j+k] = draws[k] + } + } + + return draws[:nvals*repeats], decode[:nvals*repeats] + case reflect.TypeOf(float64(0)): + draws := arrow.Float64Traits.CastFromBytes(drawbuf) + decode := arrow.Float64Traits.CastFromBytes(decodebuf) + testutils.InitValues(draws[:nvals], heap) + + for j := 1; j < repeats; j++ { + for k := 0; k < nvals; k++ { + draws[nvals*j+k] = draws[k] + } + } + + return draws[:nvals*repeats], decode[:nvals*repeats] + case reflect.TypeOf(parquet.ByteArray{}): + draws := parquet.ByteArrayTraits.CastFromBytes(drawbuf) + decode := parquet.ByteArrayTraits.CastFromBytes(decodebuf) + testutils.InitValues(draws[:nvals], heap) + + for j := 1; j < repeats; j++ { + for k := 0; k < nvals; k++ { + draws[nvals*j+k] = draws[k] + } + } + + return draws[:nvals*repeats], decode[:nvals*repeats] + case reflect.TypeOf(parquet.FixedLenByteArray{}): + draws := parquet.FixedLenByteArrayTraits.CastFromBytes(drawbuf) + decode := parquet.FixedLenByteArrayTraits.CastFromBytes(decodebuf) + testutils.InitValues(draws[:nvals], heap) + + for j := 1; j < repeats; j++ { + for k := 0; k < nvals; k++ { + draws[nvals*j+k] = draws[k] + } + } + + return draws[:nvals*repeats], decode[:nvals*repeats] + } + return nil, nil +} + +func encode(enc encoding.TypedEncoder, vals interface{}) { + switch v := vals.(type) { + case []bool: + enc.(encoding.BooleanEncoder).Put(v) + case []int32: + enc.(encoding.Int32Encoder).Put(v) + case []int64: + enc.(encoding.Int64Encoder).Put(v) + case []parquet.Int96: + enc.(encoding.Int96Encoder).Put(v) + case []float32: + enc.(encoding.Float32Encoder).Put(v) + case []float64: + enc.(encoding.Float64Encoder).Put(v) + case []parquet.ByteArray: + enc.(encoding.ByteArrayEncoder).Put(v) + case []parquet.FixedLenByteArray: + enc.(encoding.FixedLenByteArrayEncoder).Put(v) + } +} + +func encodeSpaced(enc encoding.TypedEncoder, vals interface{}, validBits []byte, validBitsOffset int64) { + switch v := vals.(type) { + case []bool: + 
enc.(encoding.BooleanEncoder).PutSpaced(v, validBits, validBitsOffset) + case []int32: + enc.(encoding.Int32Encoder).PutSpaced(v, validBits, validBitsOffset) + case []int64: + enc.(encoding.Int64Encoder).PutSpaced(v, validBits, validBitsOffset) + case []parquet.Int96: + enc.(encoding.Int96Encoder).PutSpaced(v, validBits, validBitsOffset) + case []float32: + enc.(encoding.Float32Encoder).PutSpaced(v, validBits, validBitsOffset) + case []float64: + enc.(encoding.Float64Encoder).PutSpaced(v, validBits, validBitsOffset) + case []parquet.ByteArray: + enc.(encoding.ByteArrayEncoder).PutSpaced(v, validBits, validBitsOffset) + case []parquet.FixedLenByteArray: + enc.(encoding.FixedLenByteArrayEncoder).PutSpaced(v, validBits, validBitsOffset) + } +} + +func decode(dec encoding.TypedDecoder, out interface{}) (int, error) { + switch v := out.(type) { + case []bool: + return dec.(encoding.BooleanDecoder).Decode(v) + case []int32: + return dec.(encoding.Int32Decoder).Decode(v) + case []int64: + return dec.(encoding.Int64Decoder).Decode(v) + case []parquet.Int96: + return dec.(encoding.Int96Decoder).Decode(v) + case []float32: + return dec.(encoding.Float32Decoder).Decode(v) + case []float64: + return dec.(encoding.Float64Decoder).Decode(v) + case []parquet.ByteArray: + return dec.(encoding.ByteArrayDecoder).Decode(v) + case []parquet.FixedLenByteArray: + return dec.(encoding.FixedLenByteArrayDecoder).Decode(v) + } + return 0, nil +} + +func decodeSpaced(dec encoding.TypedDecoder, out interface{}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + switch v := out.(type) { + case []bool: + return dec.(encoding.BooleanDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) + case []int32: + return dec.(encoding.Int32Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) + case []int64: + return dec.(encoding.Int64Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) + case []parquet.Int96: + return 
dec.(encoding.Int96Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) + case []float32: + return dec.(encoding.Float32Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) + case []float64: + return dec.(encoding.Float64Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) + case []parquet.ByteArray: + return dec.(encoding.ByteArrayDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) + case []parquet.FixedLenByteArray: + return dec.(encoding.FixedLenByteArrayDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) + } + return 0, nil +} + +type BaseEncodingTestSuite struct { + suite.Suite + + descr *schema.Column + typeLen int + mem memory.Allocator + typ reflect.Type + + nvalues int + heap *memory.Buffer + inputBytes *memory.Buffer + outputBytes *memory.Buffer + nodeFactory nodeFactory + + draws interface{} + decodeBuf interface{} +} + +func (b *BaseEncodingTestSuite) SetupSuite() { + b.mem = memory.DefaultAllocator + b.inputBytes = memory.NewResizableBuffer(b.mem) + b.outputBytes = memory.NewResizableBuffer(b.mem) + b.heap = memory.NewResizableBuffer(b.mem) + b.nodeFactory = createNodeFactory(b.typ) +} + +func (b *BaseEncodingTestSuite) TearDownSuite() { + b.inputBytes.Release() + b.outputBytes.Release() + b.heap.Release() +} + +func (b *BaseEncodingTestSuite) SetupTest() { + b.descr = schema.NewColumn(b.nodeFactory("name", parquet.Repetitions.Optional, -1), 0, 0) + b.typeLen = int(b.descr.TypeLength()) +} + +func (b *BaseEncodingTestSuite) initData(nvalues, repeats int) { + b.nvalues = nvalues * repeats + b.inputBytes.ResizeNoShrink(b.nvalues * int(b.typ.Size())) + b.outputBytes.ResizeNoShrink(b.nvalues * int(b.typ.Size())) + memory.Set(b.inputBytes.Buf(), 0) + memory.Set(b.outputBytes.Buf(), 0) + + b.draws, b.decodeBuf = initdata(b.typ, b.inputBytes.Buf(), b.outputBytes.Buf(), nvalues, repeats, b.heap) +} + +func (b *BaseEncodingTestSuite) encodeTestData(e parquet.Encoding) encoding.Buffer { + enc := 
encoding.NewEncoder(testutils.TypeToParquetType(b.typ), e, false, b.descr, memory.DefaultAllocator) + b.Equal(e, enc.Encoding()) + b.Equal(b.descr.PhysicalType(), enc.Type()) + encode(enc, reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface()) + return enc.FlushValues() +} + +func (b *BaseEncodingTestSuite) decodeTestData(e parquet.Encoding, buf []byte) { + dec := encoding.NewDecoder(testutils.TypeToParquetType(b.typ), e, b.descr, b.mem) + b.Equal(e, dec.Encoding()) + b.Equal(b.descr.PhysicalType(), dec.Type()) + + dec.SetData(b.nvalues, buf) + decoded, _ := decode(dec, b.decodeBuf) + b.Equal(b.nvalues, decoded) + b.Equal(reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface(), reflect.ValueOf(b.decodeBuf).Slice(0, b.nvalues).Interface()) +} + +func (b *BaseEncodingTestSuite) encodeTestDataSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) encoding.Buffer { + enc := encoding.NewEncoder(testutils.TypeToParquetType(b.typ), e, false, b.descr, memory.DefaultAllocator) + encodeSpaced(enc, reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface(), validBits, validBitsOffset) + return enc.FlushValues() +} + +func (b *BaseEncodingTestSuite) decodeTestDataSpaced(e parquet.Encoding, nullCount int, buf []byte, validBits []byte, validBitsOffset int64) { + dec := encoding.NewDecoder(testutils.TypeToParquetType(b.typ), e, b.descr, b.mem) + dec.SetData(b.nvalues-nullCount, buf) + decoded, _ := decodeSpaced(dec, b.decodeBuf, nullCount, validBits, validBitsOffset) + b.Equal(b.nvalues, decoded) + + drawval := reflect.ValueOf(b.draws) + decodeval := reflect.ValueOf(b.decodeBuf) + for j := 0; j < b.nvalues; j++ { + if bitutil.BitIsSet(validBits, int(validBitsOffset)+j) { + b.Equal(drawval.Index(j).Interface(), decodeval.Index(j).Interface()) + } + } +} + +func (b *BaseEncodingTestSuite) checkRoundTrip(e parquet.Encoding) { + buf := b.encodeTestData(e) + defer buf.Release() + b.decodeTestData(e, buf.Bytes()) +} + +func (b *BaseEncodingTestSuite) 
checkRoundTripSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) { + buf := b.encodeTestDataSpaced(e, validBits, validBitsOffset) + defer buf.Release() + + nullCount := 0 + for i := 0; i < b.nvalues; i++ { + if bitutil.BitIsNotSet(validBits, int(validBitsOffset)+i) { + nullCount++ + } + } + b.decodeTestDataSpaced(e, nullCount, buf.Bytes(), validBits, validBitsOffset) +} + +func (b *BaseEncodingTestSuite) TestBasicRoundTrip() { + b.initData(10000, 1) + b.checkRoundTrip(parquet.Encodings.Plain) +} + +// func (b *BaseEncodingTestSuite) TestDeltaEncodingRoundTrip() { +// b.initData(10000, 1) + +// switch b.typ { +// case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)): +// b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) +// default: +// b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) }) +// } +// } + +func (b *BaseEncodingTestSuite) TestDeltaLengthByteArrayRoundTrip() { + b.initData(10000, 1) + + switch b.typ { + case reflect.TypeOf(parquet.ByteArray{}): + b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) + default: + b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) }) + } +} + +// func (b *BaseEncodingTestSuite) TestDeltaByteArrayRoundTrip() { +// b.initData(10000, 1) + +// switch b.typ { +// case reflect.TypeOf(parquet.ByteArray{}): +// b.checkRoundTrip(parquet.Encodings.DeltaByteArray) +// default: +// b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) }) +// } +// } + +func (b *BaseEncodingTestSuite) TestSpacedRoundTrip() { + exec := func(vals, repeats int, validBitsOffset int64, nullProb float64) { + b.Run(fmt.Sprintf("%d vals %d repeats %d offset %0.3f null", vals, repeats, validBitsOffset, 1-nullProb), func() { + b.initData(vals, repeats) + + size := int64(b.nvalues) + validBitsOffset + r := testutils.NewRandomArrayGenerator(1923) + arr := r.Uint8(size, 0, 100, 1-nullProb) + validBits := arr.NullBitmapBytes() + if validBits != nil { + 
b.checkRoundTripSpaced(parquet.Encodings.Plain, validBits, validBitsOffset) + switch b.typ { + case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)): + b.checkRoundTripSpaced(parquet.Encodings.DeltaBinaryPacked, validBits, validBitsOffset) + case reflect.TypeOf(parquet.ByteArray{}): + b.checkRoundTripSpaced(parquet.Encodings.DeltaLengthByteArray, validBits, validBitsOffset) + // b.checkRoundTripSpaced(parquet.Encodings.DeltaByteArray, validBits, validBitsOffset) + } + } + }) + } + + const ( + avx512Size = 64 + simdSize = avx512Size + multiSimdSize = simdSize * 33 + ) + + for _, nullProb := range []float64{0.001, 0.1, 0.5, 0.9, 0.999} { + // Test with both size and offset up to 3 simd block + for i := 1; i < simdSize*3; i++ { + exec(i, 1, 0, nullProb) + exec(i, 1, int64(i+1), nullProb) + } + // large block and offset + exec(multiSimdSize, 1, 0, nullProb) + exec(multiSimdSize+33, 1, 0, nullProb) + exec(multiSimdSize, 1, 33, nullProb) + exec(multiSimdSize+33, 1, 33, nullProb) + } +} + +func TestEncoding(t *testing.T) { + tests := []struct { + name string + typ reflect.Type + }{ + {"Bool", reflect.TypeOf(true)}, + {"Int32", reflect.TypeOf(int32(0))}, + {"Int64", reflect.TypeOf(int64(0))}, + {"Float32", reflect.TypeOf(float32(0))}, + {"Float64", reflect.TypeOf(float64(0))}, + {"Int96", reflect.TypeOf(parquet.Int96{})}, + {"ByteArray", reflect.TypeOf(parquet.ByteArray{})}, + {"FixedLenByteArray", reflect.TypeOf(parquet.FixedLenByteArray{})}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + suite.Run(t, &BaseEncodingTestSuite{typ: tt.typ}) + }) + } +} + +type DictionaryEncodingTestSuite struct { + BaseEncodingTestSuite +} + +func (d *DictionaryEncodingTestSuite) encodeTestDataDict(e parquet.Encoding) (dictBuffer, indices encoding.Buffer, numEntries int) { + enc := encoding.NewEncoder(testutils.TypeToParquetType(d.typ), e, true, d.descr, memory.DefaultAllocator).(encoding.DictEncoder) + + d.Equal(parquet.Encodings.PlainDict, enc.Encoding()) + 
d.Equal(d.descr.PhysicalType(), enc.Type()) + encode(enc, reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface()) + dictBuffer = memory.NewResizableBuffer(d.mem) + dictBuffer.Resize(enc.DictEncodedSize()) + enc.WriteDict(dictBuffer.Bytes()) + indices = enc.FlushValues() + numEntries = enc.NumEntries() + return +} + +func (d *DictionaryEncodingTestSuite) encodeTestDataDictSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) (dictBuffer, indices encoding.Buffer, numEntries int) { + enc := encoding.NewEncoder(testutils.TypeToParquetType(d.typ), e, true, d.descr, memory.DefaultAllocator).(encoding.DictEncoder) + d.Equal(d.descr.PhysicalType(), enc.Type()) + + encodeSpaced(enc, reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), validBits, validBitsOffset) + dictBuffer = memory.NewResizableBuffer(d.mem) + dictBuffer.Resize(enc.DictEncodedSize()) + enc.WriteDict(dictBuffer.Bytes()) + indices = enc.FlushValues() + numEntries = enc.NumEntries() + return +} + +func (d *DictionaryEncodingTestSuite) checkRoundTrip() { + dictBuffer, indices, numEntries := d.encodeTestDataDict(parquet.Encodings.Plain) + defer dictBuffer.Release() + defer indices.Release() + validBits := make([]byte, int(bitutil.BytesForBits(int64(d.nvalues)))+1) + memory.Set(validBits, 255) + + spacedBuffer, indicesSpaced, _ := d.encodeTestDataDictSpaced(parquet.Encodings.Plain, validBits, 0) + defer spacedBuffer.Release() + defer indicesSpaced.Release() + d.Equal(indices.Bytes(), indicesSpaced.Bytes()) + + dictDecoder := encoding.NewDecoder(testutils.TypeToParquetType(d.typ), parquet.Encodings.Plain, d.descr, d.mem) + d.Equal(d.descr.PhysicalType(), dictDecoder.Type()) + dictDecoder.SetData(numEntries, dictBuffer.Bytes()) + decoder := encoding.NewDictDecoder(testutils.TypeToParquetType(d.typ), d.descr, d.mem) + decoder.SetDict(dictDecoder) + decoder.SetData(d.nvalues, indices.Bytes()) + + decoded, _ := decode(decoder, d.decodeBuf) + d.Equal(d.nvalues, decoded) + 
d.Equal(reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), reflect.ValueOf(d.decodeBuf).Slice(0, d.nvalues).Interface()) + + decoder.SetData(d.nvalues, indices.Bytes()) + decoded, _ = decodeSpaced(decoder, d.decodeBuf, 0, validBits, 0) + d.Equal(d.nvalues, decoded) + d.Equal(reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), reflect.ValueOf(d.decodeBuf).Slice(0, d.nvalues).Interface()) +} + +func (d *DictionaryEncodingTestSuite) TestBasicRoundTrip() { + d.initData(2500, 2) + d.checkRoundTrip() +} + +func TestDictEncoding(t *testing.T) { + tests := []struct { + name string + typ reflect.Type + }{ + {"Int32", reflect.TypeOf(int32(0))}, + {"Int64", reflect.TypeOf(int64(0))}, + {"Float32", reflect.TypeOf(float32(0))}, + {"Float64", reflect.TypeOf(float64(0))}, + {"ByteArray", reflect.TypeOf(parquet.ByteArray{})}, + {"FixedLenByteArray", reflect.TypeOf(parquet.FixedLenByteArray{})}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + suite.Run(t, &DictionaryEncodingTestSuite{BaseEncodingTestSuite{typ: tt.typ}}) + }) + } +} + +func TestWriteDeltaBitPackedInt32(t *testing.T) { + column := schema.NewColumn(schema.NewInt32Node("int32", parquet.Repetitions.Required, -1), 0, 0) + + tests := []struct { + name string + toencode []int32 + expected []byte + }{ + {"simple 12345", []int32{1, 2, 3, 4, 5}, []byte{128, 1, 4, 5, 2, 2, 0, 0, 0, 0}}, + {"odd vals", []int32{7, 5, 3, 1, 2, 3, 4, 5}, []byte{128, 1, 4, 8, 14, 3, 2, 0, 0, 0, 192, 63, 0, 0, 0, 0, 0, 0}}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + enc := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) + + enc.(encoding.Int32Encoder).Put(tt.toencode) + buf := enc.FlushValues() + defer buf.Release() + + assert.Equal(t, tt.expected, buf.Bytes()) + + dec := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) + + 
dec.(encoding.Int32Decoder).SetData(len(tt.toencode), tt.expected) + out := make([]int32, len(tt.toencode)) + dec.(encoding.Int32Decoder).Decode(out) + assert.Equal(t, tt.toencode, out) + }) + } + + t.Run("test progressive decoding", func(t *testing.T) { + values := make([]int32, 1000) + testutils.FillRandomInt32(0, values) + + enc := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) + enc.(encoding.Int32Encoder).Put(values) + buf := enc.FlushValues() + defer buf.Release() + + dec := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) + dec.(encoding.Int32Decoder).SetData(len(values), buf.Bytes()) + + valueBuf := make([]int32, 100) + for i, j := 0, len(valueBuf); j <= len(values); i, j = i+len(valueBuf), j+len(valueBuf) { + dec.(encoding.Int32Decoder).Decode(valueBuf) + assert.Equalf(t, values[i:j], valueBuf, "indexes %d:%d", i, j) + } + }) +} + +func TestWriteDeltaBitPackedInt64(t *testing.T) { + column := schema.NewColumn(schema.NewInt64Node("int64", parquet.Repetitions.Required, -1), 0, 0) + + tests := []struct { + name string + toencode []int64 + expected []byte + }{ + {"simple 12345", []int64{1, 2, 3, 4, 5}, []byte{128, 1, 4, 5, 2, 2, 0, 0, 0, 0}}, + {"odd vals", []int64{7, 5, 3, 1, 2, 3, 4, 5}, []byte{128, 1, 4, 8, 14, 3, 2, 0, 0, 0, 192, 63, 0, 0, 0, 0, 0, 0}}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) + + enc.(encoding.Int64Encoder).Put(tt.toencode) + buf := enc.FlushValues() + defer buf.Release() + + assert.Equal(t, tt.expected, buf.Bytes()) + + dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) + + dec.(encoding.Int64Decoder).SetData(len(tt.toencode), tt.expected) + out := make([]int64, len(tt.toencode)) + 
dec.(encoding.Int64Decoder).Decode(out) + assert.Equal(t, tt.toencode, out) + }) + } + + t.Run("test progressive decoding", func(t *testing.T) { + values := make([]int64, 1000) + testutils.FillRandomInt64(0, values) + + enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) + enc.(encoding.Int64Encoder).Put(values) + buf := enc.FlushValues() + defer buf.Release() + + dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) + dec.(encoding.Int64Decoder).SetData(len(values), buf.Bytes()) + + valueBuf := make([]int64, 100) + for i, j := 0, len(valueBuf); j <= len(values); i, j = i+len(valueBuf), j+len(valueBuf) { + decoded, _ := dec.(encoding.Int64Decoder).Decode(valueBuf) + assert.Equal(t, len(valueBuf), decoded) + assert.Equalf(t, values[i:j], valueBuf, "indexes %d:%d", i, j) + } + }) +} + +func TestDeltaLengthByteArrayEncoding(t *testing.T) { + column := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0) + + test := []parquet.ByteArray{[]byte("Hello"), []byte("World"), []byte("Foobar"), []byte("ABCDEF")} + expected := []byte{128, 1, 4, 4, 10, 0, 1, 0, 0, 0, 2, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, 70, 111, 111, 98, 97, 114, 65, 66, 67, 68, 69, 70} + + enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, false, column, memory.DefaultAllocator) + enc.(encoding.ByteArrayEncoder).Put(test) + buf := enc.FlushValues() + defer buf.Release() + + assert.Equal(t, expected, buf.Bytes()) + + dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, column, nil) + dec.SetData(len(test), expected) + out := make([]parquet.ByteArray, len(test)) + decoded, _ := dec.(encoding.ByteArrayDecoder).Decode(out) + assert.Equal(t, len(test), decoded) + assert.Equal(t, test, out) +} + +func TestDeltaByteArrayEncoding(t 
*testing.T) { + test := []parquet.ByteArray{[]byte("Hello"), []byte("World"), []byte("Foobar"), []byte("ABCDEF")} + expected := []byte{128, 1, 4, 4, 0, 0, 0, 0, 0, 0, 128, 1, 4, 4, 10, 0, 1, 0, 0, 0, 2, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, 70, 111, 111, 98, 97, 114, 65, 66, 67, 68, 69, 70} + + enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.DeltaByteArray, false, nil, nil) + enc.(encoding.ByteArrayEncoder).Put(test) + buf := enc.FlushValues() + defer buf.Release() + + assert.Equal(t, expected, buf.Bytes()) + + dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaByteArray, nil, nil) + dec.SetData(len(test), expected) + out := make([]parquet.ByteArray, len(test)) + decoded, _ := dec.(encoding.ByteArrayDecoder).Decode(out) + assert.Equal(t, len(test), decoded) + assert.Equal(t, test, out) +} diff --git a/go/parquet/internal/encoding/fixed_len_byte_array_decoder.go b/go/parquet/internal/encoding/fixed_len_byte_array_decoder.go new file mode 100644 index 00000000000..a23489290c8 --- /dev/null +++ b/go/parquet/internal/encoding/fixed_len_byte_array_decoder.go @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package encoding + +import ( + "math" + + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" + "golang.org/x/xerrors" +) + +// PlainFixedLenByteArrayDecoder is a plain encoding decoder for Fixed Length Byte Arrays +type PlainFixedLenByteArrayDecoder struct { + decoder +} + +// Type returns the physical type this decoder operates on, FixedLength Byte Arrays +func (PlainFixedLenByteArrayDecoder) Type() parquet.Type { + return parquet.Types.FixedLenByteArray +} + +// Decode populates out with fixed length byte array values until either there are no more +// values to decode or the length of out has been filled. Then returns the total number of values +// that were decoded. +func (pflba *PlainFixedLenByteArrayDecoder) Decode(out []parquet.FixedLenByteArray) (int, error) { + max := utils.MinInt(len(out), pflba.nvals) + numBytesNeeded := max * pflba.typeLen + if numBytesNeeded > len(pflba.data) || numBytesNeeded > math.MaxInt32 { + return 0, xerrors.New("parquet: eof exception") + } + + for idx := range out[:max] { + out[idx] = pflba.data[:pflba.typeLen] + pflba.data = pflba.data[pflba.typeLen:] + } + return max, nil +} + +// DecodeSpaced does the same as Decode but spaces out the resulting slice according to the bitmap leaving space for null values +func (pflba *PlainFixedLenByteArrayDecoder) DecodeSpaced(out []parquet.FixedLenByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + toRead := len(out) - nullCount + valuesRead, err := pflba.Decode(out[:toRead]) + if err != nil { + return valuesRead, err + } + if valuesRead != toRead { + return valuesRead, xerrors.New("parquet: number of values / definitions levels read did not match") + } + + return spacedExpand(out, nullCount, validBits, validBitsOffset), nil +} diff --git a/go/parquet/internal/encoding/fixed_len_byte_array_encoder.go b/go/parquet/internal/encoding/fixed_len_byte_array_encoder.go new file mode 100644 index 00000000000..519efc8f029 --- 
/dev/null +++ b/go/parquet/internal/encoding/fixed_len_byte_array_encoder.go @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" +) + +// PlainFixedLenByteArrayEncoder writes the raw bytes of the byte array +// always writing typeLength bytes for each value. 
+type PlainFixedLenByteArrayEncoder struct { + encoder +} + +// Put writes the provided values to the encoder +func (enc *PlainFixedLenByteArrayEncoder) Put(in []parquet.FixedLenByteArray) { + typeLen := enc.descr.TypeLength() + if typeLen == 0 { + return + } + + bytesNeeded := len(in) * typeLen + enc.sink.Reserve(bytesNeeded) + for _, val := range in { + if val == nil { + panic("value cannot be nil") + } + enc.sink.UnsafeWrite(val[:typeLen]) + } +} + +// PutSpaced is like Put but works with data that is spaced out according to the passed in bitmap +func (enc *PlainFixedLenByteArrayEncoder) PutSpaced(in []parquet.FixedLenByteArray, validBits []byte, validBitsOffset int64) { + if validBits != nil { + data := make([]parquet.FixedLenByteArray, len(in)) + nvalid := spacedCompress(in, data, validBits, validBitsOffset) + enc.Put(data[:nvalid]) + } else { + enc.Put(in) + } +} + +// Type returns the underlying physical type this encoder works with, Fixed Length byte arrays. +func (PlainFixedLenByteArrayEncoder) Type() parquet.Type { + return parquet.Types.FixedLenByteArray +} + +// WriteDict overrides the embedded WriteDict function to call a specialized function +// for copying out the Fixed length values from the dictionary more efficiently. 
+func (enc *DictFixedLenByteArrayEncoder) WriteDict(out []byte) { + enc.memo.(BinaryMemoTable).CopyFixedWidthValues(0, enc.typeLen, out) +} + +// Put writes fixed length values to a dictionary encoded column +func (enc *DictFixedLenByteArrayEncoder) Put(in []parquet.FixedLenByteArray) { + for _, v := range in { + if v == nil { + v = empty[:] + } + memoIdx, found, err := enc.memo.GetOrInsert(v) + if err != nil { + panic(err) + } + if !found { + enc.dictEncodedSize += enc.typeLen + } + enc.addIndex(memoIdx) + } +} + +// PutSpaced is like Put but leaves space for nulls +func (enc *DictFixedLenByteArrayEncoder) PutSpaced(in []parquet.FixedLenByteArray, validBits []byte, validBitsOffset int64) { + utils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error { + enc.Put(in[pos : pos+length]) + return nil + }) +} diff --git a/go/parquet/internal/encoding/levels.go b/go/parquet/internal/encoding/levels.go new file mode 100644 index 00000000000..c45858d4653 --- /dev/null +++ b/go/parquet/internal/encoding/levels.go @@ -0,0 +1,284 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package encoding + +import ( + "bytes" + "encoding/binary" + "io" + "math/bits" + + "github.com/JohnCGriffin/overflow" + "github.com/apache/arrow/go/arrow/bitutil" + "github.com/apache/arrow/go/parquet" + format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" +) + +// LevelEncoder is for handling the encoding of Definition and Repetition levels +// to parquet files. +type LevelEncoder struct { + bitWidth int + rleLen int + encoding format.Encoding + rle *utils.RleEncoder + bit *utils.BitWriter +} + +// LevelEncodingMaxBufferSize estimates the max number of bytes needed to encode data with the +// specified encoding given the max level and number of buffered values provided. +func LevelEncodingMaxBufferSize(encoding parquet.Encoding, maxLvl int16, nbuffered int) int { + bitWidth := bits.Len64(uint64(maxLvl)) + nbytes := 0 + switch encoding { + case parquet.Encodings.RLE: + nbytes = utils.MaxBufferSize(bitWidth, nbuffered) + utils.MinBufferSize(bitWidth) + case parquet.Encodings.BitPacked: + nbytes = int(bitutil.BytesForBits(int64(nbuffered * bitWidth))) + default: + panic("parquet: unknown encoding type for levels") + } + return nbytes +} + +// Reset resets the encoder allowing it to be reused and updating the maxlevel to the new +// specified value. +func (l *LevelEncoder) Reset(maxLvl int16) { + l.bitWidth = bits.Len64(uint64(maxLvl)) + switch l.encoding { + case format.Encoding_RLE: + l.rle.Clear() + l.rle.BitWidth = l.bitWidth + case format.Encoding_BIT_PACKED: + l.bit.Clear() + default: + panic("parquet: unknown encoding type") + } +} + +// Init is called to set up the desired encoding type, max level and underlying writer for a +// level encoder to control where the resulting encoded buffer will end up. 
+func (l *LevelEncoder) Init(encoding parquet.Encoding, maxLvl int16, w io.WriterAt) { + l.bitWidth = bits.Len64(uint64(maxLvl)) + l.encoding = format.Encoding(encoding) + switch l.encoding { + case format.Encoding_RLE: + l.rle = utils.NewRleEncoder(w, l.bitWidth) + case format.Encoding_BIT_PACKED: + l.bit = utils.NewBitWriter(w) + default: + panic("parquet: unknown encoding type for levels") + } +} + +// EncodeNoFlush encodes the provided levels in the encoder, but doesn't flush +// the buffer and return it yet, appending these encoded values. Returns the number +// of values encoded. +func (l *LevelEncoder) EncodeNoFlush(lvls []int16) int { + nencoded := 0 + if l.rle == nil && l.bit == nil { + panic("parquet: level encoders are not initialized") + } + + switch l.encoding { + case format.Encoding_RLE: + for _, level := range lvls { + if !l.rle.Put(uint64(level)) { + break + } + nencoded++ + } + default: + for _, level := range lvls { + if l.bit.WriteValue(uint64(level), uint(l.bitWidth)) != nil { + break + } + nencoded++ + } + } + return nencoded +} + +// Flush flushes out any encoded data to the underlying writer. +func (l *LevelEncoder) Flush() { + if l.rle == nil && l.bit == nil { + panic("parquet: level encoders are not initialized") + } + + switch l.encoding { + case format.Encoding_RLE: + l.rleLen = l.rle.Flush() + default: + l.bit.Flush(false) + } +} + +// Encode encodes the slice of definition or repetition levels based on +// the currently configured encoding type and returns the number of +// values that were encoded. 
+func (l *LevelEncoder) Encode(lvls []int16) int { + nencoded := 0 + if l.rle == nil && l.bit == nil { + panic("parquet: level encoders are not initialized") + } + + switch l.encoding { + case format.Encoding_RLE: + for _, level := range lvls { + if !l.rle.Put(uint64(level)) { + break + } + nencoded++ + } + l.rleLen = l.rle.Flush() + default: + for _, level := range lvls { + if l.bit.WriteValue(uint64(level), uint(l.bitWidth)) != nil { + break + } + nencoded++ + } + l.bit.Flush(false) + } + return nencoded +} + +// Len returns the number of bytes that were written as Run Length encoded +// levels, this is only valid for run length encoding and will panic if using +// deprecated bit packed encoding. +func (l *LevelEncoder) Len() int { + if l.encoding != format.Encoding_RLE { + panic("parquet: level encoder, only implemented for RLE") + } + return l.rleLen +} + +// LevelDecoder handles the decoding of repetition and definition levels from a +// parquet file supporting bit packed and run length encoded values. +type LevelDecoder struct { + bitWidth int + remaining int + maxLvl int16 + encoding format.Encoding + rle *utils.RleDecoder + bit *utils.BitReader +} + +// SetData sets in the data to be decoded by subsequent calls by specifying the encoding type +// the maximum level (which is what determines the bit width), the number of values expected +// and the raw bytes to decode. Returns the number of bytes expected to be decoded. 
+func (l *LevelDecoder) SetData(encoding parquet.Encoding, maxLvl int16, nbuffered int, data []byte) int {
+	l.maxLvl = maxLvl
+	l.encoding = format.Encoding(encoding)
+	l.remaining = nbuffered
+	l.bitWidth = bits.Len64(uint64(maxLvl))
+
+	switch encoding {
+	case parquet.Encodings.RLE:
+		// RLE levels are prefixed with a 4 byte little endian length.
+		if len(data) < 4 {
+			panic("parquet: received invalid levels (corrupt data page?)")
+		}
+
+		nbytes := int32(binary.LittleEndian.Uint32(data[:4]))
+		// compare as int64 so a very large len(data) cannot wrap when narrowed
+		if nbytes < 0 || int64(nbytes) > int64(len(data)-4) {
+			panic("parquet: received invalid number of bytes (corrupt data page?)")
+		}
+
+		// only expose the declared level bytes to the decoder so it can never
+		// run on into whatever follows the levels in the page
+		buf := data[4 : 4+int(nbytes)]
+		if l.rle == nil {
+			l.rle = utils.NewRleDecoder(bytes.NewReader(buf), l.bitWidth)
+		} else {
+			l.rle.Reset(bytes.NewReader(buf), l.bitWidth)
+		}
+		return int(nbytes) + 4
+	case parquet.Encodings.BitPacked:
+		nbits, ok := overflow.Mul(nbuffered, l.bitWidth)
+		if !ok {
+			panic("parquet: number of buffered values too large (corrupt data page?)")
+		}
+
+		nbytes := bitutil.BytesForBits(int64(nbits))
+		if nbytes < 0 || nbytes > int64(len(data)) {
+			panic("parquet: recieved invalid number of bytes (corrupt data page?)")
+		}
+
+		// as above, bound the reader to the bytes that actually hold levels
+		buf := data[:nbytes]
+		if l.bit == nil {
+			l.bit = utils.NewBitReader(bytes.NewReader(buf))
+		} else {
+			l.bit.Reset(bytes.NewReader(buf))
+		}
+		return int(nbytes)
+	default:
+		panic("parquet: unknown encoding type for levels")
+	}
+}
+
+// SetDataV2 is the same as SetData but only for DataPageV2 pages and only supports
+// run length encoding.
+func (l *LevelDecoder) SetDataV2(nbytes int32, maxLvl int16, nbuffered int, data []byte) { + if nbytes < 0 { + panic("parquet: invalid page header (corrupt data page?)") + } + + l.maxLvl = maxLvl + l.encoding = format.Encoding_RLE + l.remaining = nbuffered + l.bitWidth = bits.Len64(uint64(maxLvl)) + + if l.rle == nil { + l.rle = utils.NewRleDecoder(bytes.NewReader(data), l.bitWidth) + } else { + l.rle.Reset(bytes.NewReader(data), l.bitWidth) + } +} + +// Decode decodes the bytes that were set with SetData into the slice of levels +// returning the total number of levels that were decoded and the number of +// values which had a level equal to the max level, indicating how many physical +// values exist to be read. +func (l *LevelDecoder) Decode(levels []int16) (int, int64) { + var ( + buf [1024]uint64 + totaldecoded int + decoded int + valsToRead int64 + ) + + n := utils.Min(int64(l.remaining), int64(len(levels))) + for n > 0 { + batch := utils.Min(1024, n) + switch l.encoding { + case format.Encoding_RLE: + decoded = l.rle.GetBatch(buf[:batch]) + case format.Encoding_BIT_PACKED: + decoded, _ = l.bit.GetBatch(uint(l.bitWidth), buf[:batch]) + } + l.remaining -= decoded + totaldecoded += decoded + n -= batch + + for idx, val := range buf[:decoded] { + lvl := int16(val) + levels[idx] = lvl + if lvl == l.maxLvl { + valsToRead++ + } + } + levels = levels[decoded:] + } + + return totaldecoded, valsToRead +} diff --git a/go/parquet/internal/encoding/levels_test.go b/go/parquet/internal/encoding/levels_test.go new file mode 100644 index 00000000000..f39c1d558d8 --- /dev/null +++ b/go/parquet/internal/encoding/levels_test.go @@ -0,0 +1,288 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
+// The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package encoding_test
+
+import (
+	"encoding/binary"
+	"strconv"
+	"testing"
+
+	"github.com/apache/arrow/go/arrow"
+	"github.com/apache/arrow/go/arrow/memory"
+	"github.com/apache/arrow/go/parquet"
+	"github.com/apache/arrow/go/parquet/internal/encoding"
+	"github.com/stretchr/testify/assert"
+)
+
+// generateLevels builds a level sequence for the tests. For every rep in
+// [minRepeat, maxRepeat] it emits runs of 1<<rep copies of each of the values
+// 0, 1, 3, 7, ... (2^k - 1) for as long as the value is <= maxLevel.
+func generateLevels(minRepeat, maxRepeat int, maxLevel int16) []int16 {
+	// for each repetition count up to max repeat
+	ret := make([]int16, 0)
+	for rep := minRepeat; rep <= maxRepeat; rep++ {
+		var (
+			repCount       = 1 << rep
+			val      int16 = 0
+			bwidth         = 0
+		)
+		// generate levels for repetition count up to max level
+		for val <= maxLevel {
+			for i := 0; i < repCount; i++ {
+				ret = append(ret, val)
+			}
+			val = int16((2 << bwidth) - 1)
+			bwidth++
+		}
+	}
+	return ret
+}
+
+// encodeLevels encodes input with a fresh LevelEncoder and returns the bytes
+// of the buffer it wrote to, asserting that numLevels values were encoded.
+// For RLE the first arrow.Int32SizeBytes bytes are skipped while encoding and
+// afterwards filled in with encoder.Len() as the length prefix.
+func encodeLevels(t *testing.T, enc parquet.Encoding, maxLvl int16, numLevels int, input []int16) []byte {
+	var (
+		encoder  encoding.LevelEncoder
+		lvlCount = 0
+		buf      = encoding.NewBufferWriter(2*numLevels, memory.DefaultAllocator)
+	)
+
+	if enc == parquet.Encodings.RLE {
+		buf.SetOffset(arrow.Int32SizeBytes)
+		// leave space to write the rle length value
+		encoder.Init(enc, maxLvl, buf)
+		lvlCount = encoder.Encode(input)
+		buf.SetOffset(0)
+		arrow.Int32Traits.CastFromBytes(buf.Bytes())[0] = int32(encoder.Len())
+	} else {
+		encoder.Init(enc, maxLvl, buf)
+		lvlCount = encoder.Encode(input)
+	}
+
+	assert.Equal(t, numLevels, lvlCount)
+	return buf.Bytes()
+}
+
+// verifyDecodingLvls decodes buf after a single SetData call using four
+// equally sized Decode calls plus one for any remainder, comparing each chunk
+// against input, and finally checks that a further Decode yields zero levels.
+func verifyDecodingLvls(t *testing.T, enc parquet.Encoding, maxLvl int16, input []int16, buf []byte) {
+	var (
+		decoder        encoding.LevelDecoder
+		lvlCount       = 0
+		numLevels      = len(input)
+		output         = make([]int16, numLevels)
+		decodeCount    = 4
+		numInnerLevels = numLevels / decodeCount
+	)
+
+	// decode levels and test with multiple decode calls
+	decoder.SetData(enc, maxLvl, numLevels, buf)
+	// try multiple decoding on a single setdata call
+	for ct := 0; ct < decodeCount; ct++ {
+		offset := ct * numInnerLevels
+		lvlCount, _ = decoder.Decode(output[:numInnerLevels])
+		assert.Equal(t, numInnerLevels, lvlCount)
+		assert.Equal(t, input[offset:offset+numInnerLevels], output[:numInnerLevels])
+	}
+
+	// check the remaining levels
+	var (
+		levelsCompleted = decodeCount * (numLevels / decodeCount)
+		remaining       = numLevels - levelsCompleted
+	)
+
+	if remaining > 0 {
+		lvlCount, _ = decoder.Decode(output[:remaining])
+		assert.Equal(t, remaining, lvlCount)
+		assert.Equal(t, input[levelsCompleted:], output[:remaining])
+	}
+	// test decode zero values
+	lvlCount, _ = decoder.Decode(output[:1])
+	assert.Zero(t, lvlCount)
+}
+
+// verifyDecodingMultipleSetData reuses one LevelDecoder across len(buf)
+// SetData calls, each followed by a single Decode of len(input)/len(buf)
+// levels that is compared against the matching slice of input.
+func verifyDecodingMultipleSetData(t *testing.T, enc parquet.Encoding, max int16, input []int16, buf [][]byte) {
+	var (
+		decoder      encoding.LevelDecoder
+		lvlCount     = 0
+		setdataCount = len(buf)
+		numLevels    = len(input) / setdataCount
+		output       = make([]int16, numLevels)
+	)
+
+	for ct := 0; ct < setdataCount; ct++ {
+		offset := ct * numLevels
+		assert.Len(t, output, numLevels)
+		decoder.SetData(enc, max, numLevels, buf[ct])
+		lvlCount, _ = decoder.Decode(output)
+		assert.Equal(t, numLevels, lvlCount)
+		assert.Equal(t, input[offset:offset+numLevels], output)
+	}
+}
+
+// TestLevelsDecodeMultipleBitWidth round-trips generated levels through both
+// encodings for every max bit width from 1 to 8.
+func TestLevelsDecodeMultipleBitWidth(t *testing.T) {
+	t.Parallel()
+	// Test levels with maximum bit-width from 1 to 8
+	// increase the repetition count for each iteration by a factor of 2
+	var (
+		minRepeat   = 0
+		maxRepeat   = 7 // 128
+		maxBitWidth = 8
+		input       []int16
+		buf         []byte
+		encodings   = [2]parquet.Encoding{parquet.Encodings.RLE, parquet.Encodings.BitPacked}
+	)
+
+	for _, enc := range encodings {
+		t.Run(enc.String(), func(t *testing.T) {
+			// bitpacked requires a sequence of at least 8
+			// (minRepeat is shared, so it stays 3 once the bit packed run sets it)
+			if enc == parquet.Encodings.BitPacked {
+				minRepeat = 3
+			}
+			// for each max bit width
+			for bitWidth := 1; bitWidth <= maxBitWidth; bitWidth++ {
+				t.Run(strconv.Itoa(bitWidth), func(t *testing.T) {
+					max := int16((1 << bitWidth) - 1)
+					// generate levels
+					input = generateLevels(minRepeat, maxRepeat, max)
+					assert.NotPanics(t, func() {
+						buf = encodeLevels(t, enc, max, len(input), input)
+					})
+					assert.NotPanics(t, func() {
+						verifyDecodingLvls(t, enc, max, input, buf)
+					})
+				})
+			}
+		})
+	}
+}
+
+// TestLevelsDecodeMultipleSetData splits the generated levels into eight
+// equally sized pieces, encodes each piece separately and decodes them with a
+// single decoder that gets a new SetData call per piece.
+func TestLevelsDecodeMultipleSetData(t *testing.T) {
+	t.Parallel()
+
+	var (
+		minRepeat = 3
+		maxRepeat = 7
+		bitWidth  = 8
+		maxLevel  = int16((1 << bitWidth) - 1)
+		encodings = [2]parquet.Encoding{parquet.Encodings.RLE, parquet.Encodings.BitPacked}
+	)
+
+	input := generateLevels(minRepeat, maxRepeat, maxLevel)
+
+	var (
+		numLevels      = len(input)
+		setdataFactor  = 8
+		splitLevelSize = numLevels / setdataFactor
+		buf            = make([][]byte, setdataFactor)
+	)
+
+	for _, enc := range encodings {
+		t.Run(enc.String(), func(t *testing.T) {
+			for rf := 0; rf < setdataFactor; rf++ {
+				offset := rf * splitLevelSize
+				assert.NotPanics(t, func() {
+					buf[rf] = encodeLevels(t, enc, maxLevel, splitLevelSize, input[offset:offset+splitLevelSize])
+				})
+			}
+			assert.NotPanics(t, func() {
+				verifyDecodingMultipleSetData(t, enc, maxLevel, input, buf)
+			})
+		})
+	}
+}
+
+// TestMinimumBufferSize encodes 1024 levels (0 at every ninth index, 1
+// elsewhere) into a writer created with size 0 and expects all of them to be
+// accepted.
+func TestMinimumBufferSize(t *testing.T) {
+	t.Parallel()
+
+	const numToEncode = 1024
+	levels := make([]int16, numToEncode)
+
+	for idx := range levels {
+		if idx%9 == 0 {
+			levels[idx] = 0
+		} else {
+			levels[idx] = 1
+		}
+	}
+
+	output := encoding.NewBufferWriter(0, memory.DefaultAllocator)
+
+	var encoder encoding.LevelEncoder
+	encoder.Init(parquet.Encodings.RLE, 1, output)
+	count := encoder.Encode(levels)
+	assert.Equal(t, numToEncode, count)
+}
+
+// TestMinimumBufferSize2 encodes an alternating literal/repeated run pattern
+// into a writer created with size 0 and expects every level to be accepted.
+func TestMinimumBufferSize2(t *testing.T) {
+	t.Parallel()
+
+	// test the worst case for bit_width=2 consisting of
+	// LiteralRun(size=8)
+	// RepeatedRun(size=8)
+	// LiteralRun(size=8)
+	// ...
+	const numToEncode = 1024
+	levels := make([]int16, numToEncode)
+
+	for idx := range levels {
+		// This forces a literal run of 00000001
+		// followed by eight 1s
+		if (idx % 16) < 7 {
+			levels[idx] = 0
+		} else {
+			levels[idx] = 1
+		}
+	}
+
+	// NOTE(review): bitWidth is passed to Init as the max level, not as a
+	// bit width - confirm this is intended.
+	for bitWidth := int16(1); bitWidth <= 8; bitWidth++ {
+		output := encoding.NewBufferWriter(0, memory.DefaultAllocator)
+
+		var encoder encoding.LevelEncoder
+		encoder.Init(parquet.Encodings.RLE, bitWidth, output)
+		count := encoder.Encode(levels)
+		assert.Equal(t, numToEncode, count)
+	}
+}
+
+// TestEncodeDecodeLevels round-trips 2048 RLE levels and checks the decoded
+// total, the number of levels equal to the max level (1) and the decoded
+// values themselves.
+func TestEncodeDecodeLevels(t *testing.T) {
+	t.Parallel()
+	const numToEncode = 2048
+	levels := make([]int16, numToEncode)
+	numones := 0
+	for idx := range levels {
+		if (idx % 16) < 7 {
+			levels[idx] = 0
+		} else {
+			levels[idx] = 1
+			numones++
+		}
+	}
+
+	output := encoding.NewBufferWriter(0, memory.DefaultAllocator)
+
+	var encoder encoding.LevelEncoder
+	encoder.Init(parquet.Encodings.RLE, 1, output)
+	count := encoder.Encode(levels)
+	assert.Equal(t, numToEncode, count)
+	encoder.Flush()
+
+	// SetData expects a 4 byte little endian length in front of RLE levels
+	buf := output.Bytes()
+	var prefix [4]byte
+	binary.LittleEndian.PutUint32(prefix[:], uint32(len(buf)))
+
+	var decoder encoding.LevelDecoder
+	decoder.SetData(parquet.Encodings.RLE, 1, numToEncode, append(prefix[:], buf...))
+	var levelOut [numToEncode]int16
+	total, vals := decoder.Decode(levelOut[:])
+	assert.EqualValues(t, numToEncode, total)
+	assert.EqualValues(t, numones, vals)
+	assert.Equal(t, levels, levelOut[:])
+}
diff --git a/go/parquet/internal/encoding/memo_table.go b/go/parquet/internal/encoding/memo_table.go
new file mode 100644
index 00000000000..9a04e6e0d02
--- /dev/null
+++ b/go/parquet/internal/encoding/memo_table.go
@@ -0,0 +1,380 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package encoding
+
+import (
+	"math"
+	"unsafe"
+
+	"github.com/apache/arrow/go/arrow"
+	"github.com/apache/arrow/go/arrow/array"
+	"github.com/apache/arrow/go/arrow/memory"
+	"github.com/apache/arrow/go/parquet"
+	"github.com/apache/arrow/go/parquet/internal/hashing"
+)
+
+//go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata memo_table_types.gen.go.tmpl
+
+// MemoTable is the interface used to swap out implementations of the hash table
+// used for handling dictionary encoding. Dictionary encoding is built against this interface
+// to make it easy for code generation and changing implementations.
+//
+// Implementations should remember the order in which values are inserted so that a
+// valid dictionary index can be generated.
+type MemoTable interface {
+	// Reset drops everything in the table allowing it to be reused
+	Reset()
+	// Size returns the current number of unique values stored in the table
+	// including whether or not a null value has been passed in using GetOrInsertNull
+	Size() int
+	// CopyValues populates out with the values currently in the table, out must
+	// be a slice of the appropriate type for the table type.
+	CopyValues(out interface{})
+	// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+	// at the indicated index.
+	CopyValuesSubset(start int, out interface{})
+	// Get returns the index of the table the specified value is, and a boolean indicating
+	// whether or not the value was found in the table. Will panic if val is not the appropriate
+	// type for the underlying table.
+	Get(val interface{}) (int, bool)
+	// GetOrInsert is the same as Get, except if the value is not currently in the table it will
+	// be inserted into the table.
+	GetOrInsert(val interface{}) (idx int, existed bool, err error)
+	// GetNull returns the index of the null value and whether or not it was found in the table
+	GetNull() (int, bool)
+	// GetOrInsertNull returns the index of the null value, if it didn't already exist in the table,
+	// it is inserted.
+	GetOrInsertNull() (idx int, existed bool)
+}
+
+// BinaryMemoTable is an extension of the MemoTable interface adding extra methods
+// for handling byte arrays/strings/fixed length byte arrays.
+type BinaryMemoTable interface {
+	MemoTable
+	// ValuesSize returns the total number of bytes needed to copy all of the values
+	// from this table.
+	ValuesSize() int
+	// CopyOffsets populates out with the start and end offsets of each value in the
+	// table data. Out should be sized to Size()+1 to accommodate all of the offsets.
+	//
+	// NOTE(review): offsets are int8, so an offset above 127 cannot be
+	// represented - confirm this width is intended.
+	CopyOffsets(out []int8)
+	// CopyOffsetsSubset is like CopyOffsets but only gets a subset of the offsets
+	// starting at the specified index.
+	CopyOffsetsSubset(start int, out []int8)
+	// CopyFixedWidthValues exists to cope with the fact that the table doesn't track
+	// the fixed width when inserting the null value into the databuffer populating
+	// a zero length byte slice for the null value (if found).
+	CopyFixedWidthValues(start int, width int, out []byte)
+	// VisitValues calls visitFn on each value in the table starting with the index specified
+	VisitValues(start int, visitFn func([]byte))
+	// Retain increases the reference count of the separately stored binary data that is
+	// kept alongside the table which contains all of the values in the table. This is
+	// safe to call simultaneously across multiple goroutines.
+	Retain()
+	// Release decreases the reference count by 1 of the separately stored binary data
+	// kept alongside the table containing the values. When the reference count goes to
+	// 0, the memory is freed. This is safe to call across multiple goroutines simultaneously.
+	Release()
+}
+
+// NewInt32Dictionary returns a memotable interface for use with Int32 values only
+func NewInt32Dictionary() MemoTable {
+	return hashing.NewInt32MemoTable(0)
+}
+
+// NewInt64Dictionary returns a memotable interface for use with Int64 values only
+func NewInt64Dictionary() MemoTable {
+	return hashing.NewInt64MemoTable(0)
+}
+
+// NewFloat32Dictionary returns a memotable interface for use with Float32 values only
+func NewFloat32Dictionary() MemoTable {
+	return hashing.NewFloat32MemoTable(0)
+}
+
+// NewFloat64Dictionary returns a memotable interface for use with Float64 values only
+func NewFloat64Dictionary() MemoTable {
+	return hashing.NewFloat64MemoTable(0)
+}
+
+// NewBinaryDictionary returns a memotable interface for use with strings, byte slices,
+// parquet.ByteArray and parquet.FixedLengthByteArray only.
+func NewBinaryDictionary(mem memory.Allocator) BinaryMemoTable {
+	return hashing.NewBinaryMemoTable(mem, 0, -1)
+}
+
+// keyNotFound is the sentinel index the map based tables in this file use for
+// "no null / no NaN stored yet" and for failed lookups.
+const keyNotFound = hashing.KeyNotFound
+
+// standard map based implementation of a binary memotable which is only kept around
+// currently to be used as a benchmark against the memotables in the internal/hashing
+// module as a baseline comparison.
+
+// NewBinaryMemoTable returns a BinaryMemoTable backed by a Go map for lookups
+// and an arrow BinaryBuilder (allocating from mem) that stores the values in
+// insertion order.
+func NewBinaryMemoTable(mem memory.Allocator) BinaryMemoTable {
+	return &binaryMemoTableImpl{
+		table:     make(map[string]int),
+		nullIndex: keyNotFound,
+		builder:   array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary),
+	}
+}
+
+// binaryMemoTableImpl maps each distinct value to its insertion index; the
+// builder holds the values themselves, one entry per index (an empty entry
+// for null).
+type binaryMemoTableImpl struct {
+	table     map[string]int
+	builder   *array.BinaryBuilder
+	nullIndex int
+}
+
+// Reset drops all entries and discards the data accumulated in the builder.
+func (m *binaryMemoTableImpl) Reset() {
+	m.table = make(map[string]int)
+	m.nullIndex = keyNotFound
+	// NewArray hands off (and thereby resets) the builder's contents
+	m.builder.NewArray().Release()
+}
+
+// CopyValues copies the bytes of every stored value, back to back, into out
+// which must be a []byte.
+func (m *binaryMemoTableImpl) CopyValues(out interface{}) {
+	m.CopyValuesSubset(0, out)
+}
+
+// GetNull returns the index of the null entry and whether one was inserted.
+func (m *binaryMemoTableImpl) GetNull() (int, bool) {
+	return m.nullIndex, m.nullIndex != keyNotFound
+}
+
+// ValuesSize returns the total number of value bytes held by the builder.
+func (m *binaryMemoTableImpl) ValuesSize() int {
+	return m.builder.DataLen()
+}
+
+// Size returns the number of entries in the table, counting null if present.
+func (m *binaryMemoTableImpl) Size() int {
+	sz := len(m.table)
+	if _, ok := m.GetNull(); ok {
+		sz++
+	}
+	return sz
+}
+
+// valAsString views val as a string without copying. For the byte slice based
+// types the result shares memory with val, so it must not be retained past
+// the call unless it is copied first. Panics for unsupported types.
+func (m *binaryMemoTableImpl) valAsString(val interface{}) string {
+	switch v := val.(type) {
+	case string:
+		return v
+	case []byte:
+		return *(*string)(unsafe.Pointer(&v))
+	case parquet.ByteArray:
+		return *(*string)(unsafe.Pointer(&v))
+	case parquet.FixedLenByteArray:
+		return *(*string)(unsafe.Pointer(&v))
+	default:
+		panic("invalid type for value in binarymemotable")
+	}
+}
+
+// Get returns the index of val and true if it is in the table, otherwise
+// keyNotFound and false.
+func (m *binaryMemoTableImpl) Get(val interface{}) (int, bool) {
+	key := m.valAsString(val)
+	if p, ok := m.table[key]; ok {
+		return p, true
+	}
+	return keyNotFound, false
+}
+
+// GetOrInsert returns the index of val, inserting it at the next free index
+// if it is not present yet. found reports whether it already existed; err is
+// always nil.
+func (m *binaryMemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+	key := m.valAsString(val)
+	idx, found = m.table[key]
+	if !found {
+		idx = m.Size()
+		m.builder.AppendString(key)
+		// key may alias the caller's byte slice; store an owned copy so a
+		// later mutation of that slice cannot corrupt the map
+		m.table[string([]byte(key))] = idx
+	}
+	return
+}
+
+// GetOrInsertNull returns the index of the null entry, appending one if it
+// does not exist yet. found reports whether it already existed.
+func (m *binaryMemoTableImpl) GetOrInsertNull() (idx int, found bool) {
+	idx, found = m.GetNull()
+	if !found {
+		idx = m.Size()
+		m.nullIndex = idx
+		m.builder.AppendNull()
+	}
+	return
+}
+
+// CopyValuesSubset copies the bytes of the values from index start onwards,
+// back to back, into out which must be a []byte. Copying stops once out is
+// full.
+func (m *binaryMemoTableImpl) CopyValuesSubset(start int, out interface{}) {
+	outval := out.([]byte)
+	pos := 0
+	for i := start; i < m.builder.Len() && pos < len(outval); i++ {
+		pos += copy(outval[pos:], m.builder.Value(i))
+	}
+}
+
+// CopyFixedWidthValues copies the values from index start onwards into out
+// like CopyValuesSubset, except that width bytes are skipped for the null
+// entry (which is stored with zero length). The skipped bytes of out are left
+// untouched. Copying stops once out is full.
+func (m *binaryMemoTableImpl) CopyFixedWidthValues(start, width int, out []byte) {
+	nullIdx, hasNull := m.GetNull()
+	pos := 0
+	for i := start; i < m.builder.Len() && pos < len(out); i++ {
+		if hasNull && i == nullIdx {
+			pos += width
+			continue
+		}
+		pos += copy(out[pos:], m.builder.Value(i))
+	}
+}
+
+// CopyOffsetsSubset writes the offsets of the values from index start onwards
+// into out, relative to the first copied value, followed by one final entry
+// holding the total length of those values. out needs Size()-start+1 entries.
+// Nothing is written when start is at or beyond the number of entries.
+func (m *binaryMemoTableImpl) CopyOffsetsSubset(start int, out []int8) {
+	n := m.builder.Len()
+	if n <= start {
+		return
+	}
+
+	// accumulate value lengths rather than comparing raw addresses
+	offset := 0
+	for i := start; i < n; i++ {
+		out[i-start] = int8(offset)
+		offset += len(m.builder.Value(i))
+	}
+	out[n-start] = int8(offset)
+}
+
+// CopyOffsets writes the offsets of all values into out, see CopyOffsetsSubset.
+func (m *binaryMemoTableImpl) CopyOffsets(out []int8) {
+	m.CopyOffsetsSubset(0, out)
+}
+
+// VisitValues calls visitFn with each stored value in insertion order,
+// beginning at index start.
+func (m *binaryMemoTableImpl) VisitValues(start int, visitFn func([]byte)) {
+	for i := int(start); i < m.Size(); i++ {
+		visitFn(m.builder.Value(i))
+	}
+}
+
+// Release forwards to the builder's Release.
+func (m *binaryMemoTableImpl) Release() {
+	m.builder.Release()
+}
+
+// Retain forwards to the builder's Retain.
+func (m *binaryMemoTableImpl) Retain() {
+	m.builder.Retain()
+}
+
+// standard map based implementation of a float64 memotable which is only kept around
+// currently to be used as a benchmark against the memotables in the internal/hashing
+// module as a baseline comparison.
+
+// NewFloat64MemoTable returns a MemoTable for float64 values backed by a Go
+// map. The allocator argument is unused.
+func NewFloat64MemoTable(memory.Allocator) MemoTable {
+	return &float64MemoTableImpl{
+		table: make(map[float64]struct {
+			value     float64
+			memoIndex int
+		}),
+		nullIndex: keyNotFound,
+		nanIndex:  keyNotFound,
+	}
+}
+
+// float64MemoTableImpl maps each distinct value to its insertion index. NaN
+// never compares equal to itself and so cannot be used as a map key; its
+// index is tracked separately in nanIndex, as is the null entry in nullIndex.
+type float64MemoTableImpl struct {
+	table map[float64]struct {
+		value     float64
+		memoIndex int
+	}
+	nullIndex int
+	nanIndex  int
+}
+
+// Reset drops every entry, including the null and NaN entries.
+func (m *float64MemoTableImpl) Reset() {
+	m.table = make(map[float64]struct {
+		value     float64
+		memoIndex int
+	})
+	m.nullIndex = keyNotFound
+	m.nanIndex = keyNotFound
+}
+
+// GetNull returns the index of the null entry and whether one was inserted.
+func (m *float64MemoTableImpl) GetNull() (int, bool) {
+	return m.nullIndex, m.nullIndex != keyNotFound
+}
+
+// Size returns the number of entries, counting the null and NaN entries if
+// they are present.
+func (m *float64MemoTableImpl) Size() int {
+	sz := len(m.table)
+	if _, ok := m.GetNull(); ok {
+		sz++
+	}
+	if m.nanIndex != keyNotFound {
+		sz++
+	}
+	return sz
+}
+
+// GetOrInsertNull returns the index of the null entry, assigning it the next
+// free index if it does not exist yet. found reports whether it already
+// existed.
+func (m *float64MemoTableImpl) GetOrInsertNull() (idx int, found bool) {
+	idx, found = m.GetNull()
+	if !found {
+		idx = m.Size()
+		m.nullIndex = idx
+	}
+	return
+}
+
+// Get returns the index of val (which must be a float64) and true if it is in
+// the table, otherwise keyNotFound and false.
+func (m *float64MemoTableImpl) Get(val interface{}) (int, bool) {
+	v := val.(float64)
+	if p, ok := m.table[v]; ok {
+		return p.memoIndex, true
+	}
+	if math.IsNaN(v) && m.nanIndex != keyNotFound {
+		return m.nanIndex, true
+	}
+	return keyNotFound, false
+}
+
+// GetOrInsert returns the index of val (which must be a float64), inserting
+// it at the next free index if it is not present yet. found is true only when
+// the value already existed; err is always nil.
+func (m *float64MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+	v := val.(float64)
+	if math.IsNaN(v) {
+		if m.nanIndex == keyNotFound {
+			idx = m.Size()
+			m.nanIndex = idx
+		} else {
+			idx = m.nanIndex
+			found = true
+		}
+		return
+	}
+
+	if p, ok := m.table[v]; ok {
+		return p.memoIndex, true, nil
+	}
+
+	// new value: it takes the next free index and is reported as not found
+	idx = m.Size()
+	m.table[v] = struct {
+		value     float64
+		memoIndex int
+	}{value: v, memoIndex: idx}
+	return idx, false, nil
+}
+
+// CopyValues writes every stored value into out (a []float64) at the position
+// of its index.
+func (m *float64MemoTableImpl) CopyValues(out interface{}) {
+	m.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset writes the values whose index is >= start into out (a
+// []float64) at position index-start. The slot of a null entry is not
+// written.
+func (m *float64MemoTableImpl) CopyValuesSubset(start int, out interface{}) {
+	outval := out.([]float64)
+	for _, v := range m.table {
+		if idx := v.memoIndex - start; idx >= 0 {
+			outval[idx] = v.value
+		}
+	}
+	// NaN lives outside the map; shift it by start like every other entry
+	if m.nanIndex != keyNotFound && m.nanIndex >= start {
+		outval[m.nanIndex-start] = math.NaN()
+	}
+}
diff --git a/go/parquet/internal/encoding/memo_table_test.go b/go/parquet/internal/encoding/memo_table_test.go
new file mode 100644
index 00000000000..96f1c22733b
--- /dev/null
+++ b/go/parquet/internal/encoding/memo_table_test.go
@@ -0,0 +1,284 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package encoding_test
+
+import (
+	"math"
+	"testing"
+
+	"github.com/apache/arrow/go/arrow/memory"
+	"github.com/apache/arrow/go/parquet/internal/encoding"
+	"github.com/apache/arrow/go/parquet/internal/hashing"
+	"github.com/stretchr/testify/suite"
+)
+
+// MemoTableTestSuite groups the memo table tests and their assertion helpers.
+type MemoTableTestSuite struct {
+	suite.Suite
+}
+
+// TestMemoTable runs every test method of MemoTableTestSuite.
+func TestMemoTable(t *testing.T) {
+	suite.Run(t, new(MemoTableTestSuite))
+}
+
+// assertGetNotFound asserts that Get reports v as absent.
+func (m *MemoTableTestSuite) assertGetNotFound(table encoding.MemoTable, v interface{}) {
+	_, ok := table.Get(v)
+	m.False(ok)
+}
+
+// assertGet asserts that Get finds v at index expected.
+func (m *MemoTableTestSuite) assertGet(table encoding.MemoTable, v interface{}, expected int) {
+	idx, ok := table.Get(v)
+	m.Equal(expected, idx)
+	m.True(ok)
+}
+
+// assertGetOrInsert asserts that GetOrInsert succeeds and yields index
+// expected; whether the value already existed is not checked.
+func (m *MemoTableTestSuite) assertGetOrInsert(table encoding.MemoTable, v interface{}, expected int) {
+	idx, _, err := table.GetOrInsert(v)
+	m.NoError(err)
+	m.Equal(expected, idx)
+}
+
+// assertGetNullNotFound asserts that the table holds no null entry.
+func (m *MemoTableTestSuite) assertGetNullNotFound(table encoding.MemoTable) {
+	_, ok := table.GetNull()
+	m.False(ok)
+}
+
+// assertGetNull asserts that the null entry exists at index expected.
+func (m *MemoTableTestSuite) assertGetNull(table encoding.MemoTable, expected int) {
+	idx, ok := table.GetNull()
+	m.Equal(expected, idx)
+	m.True(ok)
+}
+
+// assertGetOrInsertNull asserts that GetOrInsertNull yields index expected;
+// whether null already existed is not checked.
+func (m *MemoTableTestSuite) assertGetOrInsertNull(table encoding.MemoTable, expected int) {
+	idx, _ := table.GetOrInsertNull()
+	m.Equal(expected, idx)
+}
+
+// TestInt64 checks that the hashing int64 memo table hands out indexes in
+// insertion order (null included), returns the same index on re-insertion and
+// copies its values out in index order.
+func (m *MemoTableTestSuite) TestInt64() {
+	const (
+		A int64 = 1234
+		B int64 = 0
+		C int64 = -98765321
+		D int64 = 12345678901234
+		E int64 = -1
+		F int64 = 1
+		G int64 = 9223372036854775807
+		H int64 = -9223372036854775807 - 1
+	)
+
+	// table := encoding.NewInt64MemoTable(nil)
+	table := hashing.NewInt64MemoTable(0)
+	m.Zero(table.Size())
+	m.assertGetNotFound(table, A)
+	m.assertGetNullNotFound(table)
+	m.assertGetOrInsert(table, A, 0)
+	m.assertGetNotFound(table, B)
+	m.assertGetOrInsert(table, B, 1)
+	m.assertGetOrInsert(table, C, 2)
+	m.assertGetOrInsert(table, D, 3)
+	m.assertGetOrInsert(table, E, 4)
+	m.assertGetOrInsertNull(table, 5)
+
+	m.assertGet(table, A, 0)
+	m.assertGetOrInsert(table, A, 0)
+	m.assertGet(table, E, 4)
+	m.assertGetOrInsert(table, E, 4)
+
+	m.assertGetOrInsert(table, F, 6)
+	m.assertGetOrInsert(table, G, 7)
+	m.assertGetOrInsert(table, H, 8)
+
+	// re-inserting in reverse order must not change any index
+	m.assertGetOrInsert(table, G, 7)
+	m.assertGetOrInsert(table, F, 6)
+	m.assertGetOrInsertNull(table, 5)
+	m.assertGetOrInsert(table, E, 4)
+	m.assertGetOrInsert(table, D, 3)
+	m.assertGetOrInsert(table, C, 2)
+	m.assertGetOrInsert(table, B, 1)
+	m.assertGetOrInsert(table, A, 0)
+
+	const sz int = 9
+	m.Equal(sz, table.Size())
+	m.Panics(func() {
+		values := make([]int32, sz)
+		table.CopyValues(values)
+	}, "should panic because wrong type")
+	m.Panics(func() {
+		values := make([]int64, sz-3)
+		table.CopyValues(values)
+	}, "should panic because out of bounds")
+
+	{
+		// the null entry (index 5) is expected to come out as 0
+		values := make([]int64, sz)
+		table.CopyValues(values)
+		m.Equal([]int64{A, B, C, D, E, 0, F, G, H}, values)
+	}
+	{
+		const offset = 3
+		values := make([]int64, sz-offset)
+		table.CopyValuesSubset(offset, values)
+		m.Equal([]int64{D, E, 0, F, G, H}, values)
+	}
+}
+
+// TestFloat64 checks insertion order indexes for the hashing float64 memo
+// table, including +Inf, -Inf and NaN, and that the values copy out in index
+// order.
+func (m *MemoTableTestSuite) TestFloat64() {
+	const (
+		A float64 = 0.0
+		B float64 = 1.5
+		C float64 = -0.1
+	)
+	var (
+		D = math.Inf(1)
+		E = -D
+		F = math.NaN()
+	)
+
+	// table := encoding.NewFloat64MemoTable(nil)
+	// table := &hashing.Float64MemoTable{hashing.NewScalarMemoTable(0)}
+	table := hashing.NewFloat64MemoTable(0)
+	m.Zero(table.Size())
+	m.assertGetNotFound(table, A)
+	m.assertGetNullNotFound(table)
+	m.assertGetOrInsert(table, A, 0)
+	m.assertGetNotFound(table, B)
+	m.assertGetOrInsert(table, B, 1)
+	m.assertGetOrInsert(table, C, 2)
+	m.assertGetOrInsert(table, D, 3)
+	m.assertGetOrInsert(table, E, 4)
+	m.assertGetOrInsert(table, F, 5)
+
+	m.assertGet(table, A, 0)
+	m.assertGetOrInsert(table, A, 0)
+	m.assertGetOrInsert(table, B, 1)
+	m.assertGetOrInsert(table, C, 2)
+	m.assertGetOrInsert(table, D, 3)
+	m.assertGet(table, E, 4)
+	m.assertGetOrInsert(table, E, 4)
+	m.assertGet(table, F, 5)
+	m.assertGetOrInsert(table, F, 5)
+
+	m.Equal(6, table.Size())
+	expected := []float64{A, B, C, D, E, F}
+	m.Panics(func() {
+		values := make([]int32, 6)
+		table.CopyValues(values)
+	}, "should panic because wrong type")
+	m.Panics(func() {
+		values := make([]float64, 3)
+		table.CopyValues(values)
+	}, "should panic because out of bounds")
+
+	values := make([]float64, len(expected))
+	table.CopyValues(values)
+	for idx, ex := range expected {
+		// NaN never compares equal, so it has to be checked with IsNaN
+		if math.IsNaN(ex) {
+			m.True(math.IsNaN(values[idx]))
+		} else {
+			m.Equal(ex, values[idx])
+		}
+	}
+}
+
+// TestBinaryBasics checks insertion order indexes, offsets, value copying and
+// value visiting for the hashing binary memo table, including the empty
+// string, values containing NUL bytes and a null entry.
+func (m *MemoTableTestSuite) TestBinaryBasics() {
+	const (
+		A = ""
+		B = "a"
+		C = "foo"
+		D = "bar"
+		E = "\000"
+		F = "\000trailing"
+	)
+
+	table := hashing.NewBinaryMemoTable(memory.DefaultAllocator, 0, -1)
+	defer table.Release()
+
+	m.Zero(table.Size())
+	m.assertGetNotFound(table, A)
+	m.assertGetNullNotFound(table)
+	m.assertGetOrInsert(table, A, 0)
+	m.assertGetNotFound(table, B)
+	m.assertGetOrInsert(table, B, 1)
+	m.assertGetOrInsert(table, C, 2)
+	m.assertGetOrInsert(table, D, 3)
+	m.assertGetOrInsert(table, E, 4)
+	m.assertGetOrInsert(table, F, 5)
+	m.assertGetOrInsertNull(table, 6)
+
+	m.assertGet(table, A, 0)
+	m.assertGetOrInsert(table, A, 0)
+	m.assertGet(table, B, 1)
+	m.assertGetOrInsert(table, B, 1)
+	m.assertGetOrInsert(table, C, 2)
+	m.assertGetOrInsert(table, D, 3)
+	m.assertGetOrInsert(table, E, 4)
+	m.assertGet(table, F, 5)
+	m.assertGetOrInsert(table, F, 5)
+	m.assertGetNull(table, 6)
+	m.assertGetOrInsertNull(table, 6)
+
+	m.Equal(7, table.Size())
+	// 0 + 1 + 3 + 3 + 1 + 9 bytes for A..F; null adds none
+	m.Equal(17, table.ValuesSize())
+
+	size := table.Size()
+	{
+		offsets := make([]int8, size+1)
+		table.CopyOffsets(offsets)
+		m.Equal([]int8{0, 0, 1, 4, 7, 8, 17, 17}, offsets)
+
+		expectedValues := "afoobar"
+		expectedValues += "\000"
+		expectedValues += "\000"
+		expectedValues += "trailing"
+		values := make([]byte, 17)
+		table.CopyValues(values)
+		m.Equal(expectedValues, string(values))
+	}
+
+	{
+		// offsets of a subset are relative to the first value of the subset
+		startOffset := 4
+		offsets := make([]int8, size+1-int(startOffset))
+		table.CopyOffsetsSubset(startOffset, offsets)
+		m.Equal([]int8{0, 1, 10, 10}, offsets)
+
+		expectedValues := ""
+		expectedValues += "\000"
+		expectedValues += "\000"
+		expectedValues += "trailing"
+
+		values := make([]byte, 10)
+		table.CopyValuesSubset(startOffset, values)
+		m.Equal(expectedValues, string(values))
+	}
+
+	{
+		// the trailing "" is the null entry
+		startOffset := 1
+		values := make([]string, 0)
+		table.VisitValues(startOffset, func(b []byte) {
+			values = append(values, string(b))
+		})
+		m.Equal([]string{B, C, D, E, F, ""}, values)
+	}
+}
+
+// TestBinaryEmpty checks that CopyOffsetsSubset on an empty map based binary
+// memo table leaves a zeroed offsets slice untouched.
+func (m *MemoTableTestSuite) TestBinaryEmpty() {
+	table := encoding.NewBinaryMemoTable(memory.DefaultAllocator)
+	defer table.Release()
+
+	m.Zero(table.Size())
+	offsets := make([]int8, 1)
+	table.CopyOffsetsSubset(0, offsets)
+	m.Equal(int8(0), offsets[0])
+}
diff --git a/go/parquet/internal/encoding/memo_table_types.gen.go b/go/parquet/internal/encoding/memo_table_types.gen.go
new file mode 100644
index 00000000000..5c4812cbbeb
--- /dev/null
+++ b/go/parquet/internal/encoding/memo_table_types.gen.go
@@ -0,0 +1,366 @@
+// Code generated by memo_table_types.gen.go.tmpl. DO NOT EDIT.
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package encoding + +import ( + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" +) + +// standard map based implementation of memo tables which can be more efficient +// in some cases based on the uniqueness / amount / size of the data. +// these are left here for now for use in the benchmarks to compare against the +// custom hash table implementation in the internal/hashing package as a base +// benchmark comparison. + +func NewInt32MemoTable(memory.Allocator) MemoTable { + return &int32MemoTableImpl{ + table: make(map[int32]struct { + value int32 + memoIndex int + }), + nullIndex: keyNotFound, + } +} + +type int32MemoTableImpl struct { + table map[int32]struct { + value int32 + memoIndex int + } + nullIndex int +} + +func (m *int32MemoTableImpl) Reset() { + m.table = make(map[int32]struct { + value int32 + memoIndex int + }) + m.nullIndex = keyNotFound +} + +func (m *int32MemoTableImpl) GetNull() (int, bool) { + return m.nullIndex, m.nullIndex != keyNotFound +} + +func (m *int32MemoTableImpl) Size() int { + sz := len(m.table) + if _, ok := m.GetNull(); ok { + sz++ + } + return sz +} + +func (m *int32MemoTableImpl) GetOrInsertNull() (idx int, found bool) { + idx, found = m.GetNull() + if !found { + idx = m.Size() + m.nullIndex = idx + } + return +} + +func (m *int32MemoTableImpl) Get(val interface{}) (int, bool) { + v := val.(int32) + if p, ok := m.table[v]; ok { + return p.memoIndex, true + } + return keyNotFound, false +} + +func (m *int32MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { + v := val.(int32) + p, ok := m.table[v] + if ok { + idx = p.memoIndex + } else { + idx = m.Size() + p.value = v + p.memoIndex = idx + m.table[v] = p + found = true + } + return +} + +func (m *int32MemoTableImpl) CopyValues(out interface{}) { + m.CopyValuesSubset(0, out) +} + +func (m *int32MemoTableImpl) CopyValuesSubset(start int, out interface{}) { + outval := out.([]int32) + for _, v := range m.table { + idx := 
v.memoIndex - start + if idx >= 0 { + outval[idx] = v.value + } + } +} + +func NewInt64MemoTable(memory.Allocator) MemoTable { + return &int64MemoTableImpl{ + table: make(map[int64]struct { + value int64 + memoIndex int + }), + nullIndex: keyNotFound, + } +} + +type int64MemoTableImpl struct { + table map[int64]struct { + value int64 + memoIndex int + } + nullIndex int +} + +func (m *int64MemoTableImpl) Reset() { + m.table = make(map[int64]struct { + value int64 + memoIndex int + }) + m.nullIndex = keyNotFound +} + +func (m *int64MemoTableImpl) GetNull() (int, bool) { + return m.nullIndex, m.nullIndex != keyNotFound +} + +func (m *int64MemoTableImpl) Size() int { + sz := len(m.table) + if _, ok := m.GetNull(); ok { + sz++ + } + return sz +} + +func (m *int64MemoTableImpl) GetOrInsertNull() (idx int, found bool) { + idx, found = m.GetNull() + if !found { + idx = m.Size() + m.nullIndex = idx + } + return +} + +func (m *int64MemoTableImpl) Get(val interface{}) (int, bool) { + v := val.(int64) + if p, ok := m.table[v]; ok { + return p.memoIndex, true + } + return keyNotFound, false +} + +func (m *int64MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { + v := val.(int64) + p, ok := m.table[v] + if ok { + idx = p.memoIndex + } else { + idx = m.Size() + p.value = v + p.memoIndex = idx + m.table[v] = p + found = true + } + return +} + +func (m *int64MemoTableImpl) CopyValues(out interface{}) { + m.CopyValuesSubset(0, out) +} + +func (m *int64MemoTableImpl) CopyValuesSubset(start int, out interface{}) { + outval := out.([]int64) + for _, v := range m.table { + idx := v.memoIndex - start + if idx >= 0 { + outval[idx] = v.value + } + } +} + +func NewInt96MemoTable(memory.Allocator) MemoTable { + return &int96MemoTableImpl{ + table: make(map[parquet.Int96]struct { + value parquet.Int96 + memoIndex int + }), + nullIndex: keyNotFound, + } +} + +type int96MemoTableImpl struct { + table map[parquet.Int96]struct { + value parquet.Int96 + memoIndex int + 
} + nullIndex int +} + +func (m *int96MemoTableImpl) Reset() { + m.table = make(map[parquet.Int96]struct { + value parquet.Int96 + memoIndex int + }) + m.nullIndex = keyNotFound +} + +func (m *int96MemoTableImpl) GetNull() (int, bool) { + return m.nullIndex, m.nullIndex != keyNotFound +} + +func (m *int96MemoTableImpl) Size() int { + sz := len(m.table) + if _, ok := m.GetNull(); ok { + sz++ + } + return sz +} + +func (m *int96MemoTableImpl) GetOrInsertNull() (idx int, found bool) { + idx, found = m.GetNull() + if !found { + idx = m.Size() + m.nullIndex = idx + } + return +} + +func (m *int96MemoTableImpl) Get(val interface{}) (int, bool) { + v := val.(parquet.Int96) + if p, ok := m.table[v]; ok { + return p.memoIndex, true + } + return keyNotFound, false +} + +func (m *int96MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { + v := val.(parquet.Int96) + p, ok := m.table[v] + if ok { + idx = p.memoIndex + } else { + idx = m.Size() + p.value = v + p.memoIndex = idx + m.table[v] = p + found = true + } + return +} + +func (m *int96MemoTableImpl) CopyValues(out interface{}) { + m.CopyValuesSubset(0, out) +} + +func (m *int96MemoTableImpl) CopyValuesSubset(start int, out interface{}) { + outval := out.([]parquet.Int96) + for _, v := range m.table { + idx := v.memoIndex - start + if idx >= 0 { + outval[idx] = v.value + } + } +} + +func NewFloat32MemoTable(memory.Allocator) MemoTable { + return &float32MemoTableImpl{ + table: make(map[float32]struct { + value float32 + memoIndex int + }), + nullIndex: keyNotFound, + } +} + +type float32MemoTableImpl struct { + table map[float32]struct { + value float32 + memoIndex int + } + nullIndex int +} + +func (m *float32MemoTableImpl) Reset() { + m.table = make(map[float32]struct { + value float32 + memoIndex int + }) + m.nullIndex = keyNotFound +} + +func (m *float32MemoTableImpl) GetNull() (int, bool) { + return m.nullIndex, m.nullIndex != keyNotFound +} + +func (m *float32MemoTableImpl) Size() int { + sz 
:= len(m.table) + if _, ok := m.GetNull(); ok { + sz++ + } + return sz +} + +func (m *float32MemoTableImpl) GetOrInsertNull() (idx int, found bool) { + idx, found = m.GetNull() + if !found { + idx = m.Size() + m.nullIndex = idx + } + return +} + +func (m *float32MemoTableImpl) Get(val interface{}) (int, bool) { + v := val.(float32) + if p, ok := m.table[v]; ok { + return p.memoIndex, true + } + return keyNotFound, false +} + +func (m *float32MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { + v := val.(float32) + p, ok := m.table[v] + if ok { + idx = p.memoIndex + } else { + idx = m.Size() + p.value = v + p.memoIndex = idx + m.table[v] = p + found = true + } + return +} + +func (m *float32MemoTableImpl) CopyValues(out interface{}) { + m.CopyValuesSubset(0, out) +} + +func (m *float32MemoTableImpl) CopyValuesSubset(start int, out interface{}) { + outval := out.([]float32) + for _, v := range m.table { + idx := v.memoIndex - start + if idx >= 0 { + outval[idx] = v.value + } + } +} diff --git a/go/parquet/internal/encoding/memo_table_types.gen.go.tmpl b/go/parquet/internal/encoding/memo_table_types.gen.go.tmpl new file mode 100644 index 00000000000..0a0a7af2920 --- /dev/null +++ b/go/parquet/internal/encoding/memo_table_types.gen.go.tmpl @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "github.com/apache/arrow/go/parquet" +) + +// standard map based implementation of memo tables which can be more efficient +// in some cases based on the uniqueness / amount / size of the data. +// these are left here for now for use in the benchmarks to compare against the +// custom hash table implementation in the internal/hashing package as a base +// benchmark comparison. + +{{range .In}} +{{if and (ne .Name "ByteArray") (ne .Name "FixedLenByteArray") (ne .Name "Float64") (ne .Name "Boolean")}} +func New{{.Name}}MemoTable(memory.Allocator) MemoTable { + return &{{.lower}}MemoTableImpl{ + table: make(map[{{.name}}]struct{ + value {{.name}} + memoIndex int + }), + nullIndex: keyNotFound, + } +} + +type {{.lower}}MemoTableImpl struct { + table map[{{.name}}]struct{ + value {{.name}} + memoIndex int + } + nullIndex int +} + +func (m *{{.lower}}MemoTableImpl) Reset() { + m.table = make(map[{{.name}}]struct{ + value {{.name}} + memoIndex int + }) + m.nullIndex = keyNotFound +} + +func (m *{{.lower}}MemoTableImpl) GetNull() (int, bool) { + return m.nullIndex, m.nullIndex != keyNotFound +} + +func (m *{{.lower}}MemoTableImpl) Size() int { + sz := len(m.table) + if _, ok := m.GetNull(); ok { + sz++ + } + return sz +} + +func (m *{{.lower}}MemoTableImpl) GetOrInsertNull() (idx int, found bool) { + idx, found = m.GetNull() + if !found { + idx = m.Size() + m.nullIndex = idx + } + return +} + +func (m *{{.lower}}MemoTableImpl) Get(val interface{}) (int, bool) { + v := val.({{.name}}) + if p, ok := m.table[v]; ok { + return 
p.memoIndex, true + } + return keyNotFound, false +} + +func (m *{{.lower}}MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { + v := val.({{.name}}) + p, ok := m.table[v] + if ok { + idx = p.memoIndex + } else { + idx = m.Size() + p.value = v + p.memoIndex = idx + m.table[v] = p + found = true + } + return +} + +func (m *{{.lower}}MemoTableImpl) CopyValues(out interface{}) { + m.CopyValuesSubset(0, out) +} + +func (m *{{.lower}}MemoTableImpl) CopyValuesSubset(start int, out interface{}) { + outval := out.([]{{.name}}) + for _, v := range m.table { + idx := v.memoIndex - start + if idx >= 0 { + outval[idx] = v.value + } + } +} +{{end}} +{{end}} diff --git a/go/parquet/internal/encoding/physical_types.tmpldata b/go/parquet/internal/encoding/physical_types.tmpldata new file mode 100644 index 00000000000..0adeb9955bf --- /dev/null +++ b/go/parquet/internal/encoding/physical_types.tmpldata @@ -0,0 +1,52 @@ +[ + { + "Name": "Int32", + "name": "int32", + "lower": "int32", + "prefix": "arrow" + }, + { + "Name": "Int64", + "name": "int64", + "lower": "int64", + "prefix": "arrow" + }, + { + "Name": "Int96", + "name": "parquet.Int96", + "lower": "int96", + "prefix": "parquet" + }, + { + "Name": "Float32", + "name": "float32", + "lower": "float32", + "prefix": "arrow", + "physical": "Float" + }, + { + "Name": "Float64", + "name": "float64", + "lower": "float64", + "prefix": "arrow", + "physical": "Double" + }, + { + "Name": "Boolean", + "name": "bool", + "lower": "bool", + "prefix": "arrow" + }, + { + "Name": "ByteArray", + "name": "parquet.ByteArray", + "lower": "byteArray", + "prefix": "parquet" + }, + { + "Name": "FixedLenByteArray", + "name": "parquet.FixedLenByteArray", + "lower": "fixedLenByteArray", + "prefix": "parquet" + } +] diff --git a/go/parquet/internal/encoding/plain_encoder_types.gen.go b/go/parquet/internal/encoding/plain_encoder_types.gen.go new file mode 100644 index 00000000000..c48268dcca3 --- /dev/null +++ 
b/go/parquet/internal/encoding/plain_encoder_types.gen.go @@ -0,0 +1,553 @@ +// Code generated by plain_encoder_types.gen.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "math" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" + "golang.org/x/xerrors" +) + +// PlainInt32Encoder is an encoder for int32 values using Plain Encoding +// which in general is just storing the values as raw bytes of the appropriate size +type PlainInt32Encoder struct { + encoder + + bitSetReader utils.SetBitRunReader +} + +// Put encodes a slice of values into the underlying buffer +func (enc *PlainInt32Encoder) Put(in []int32) { + enc.append(arrow.Int32Traits.CastToBytes(in)) +} + +// PutSpaced encodes a slice of values into the underlying buffer which are spaced out +// including null values defined by the validBits bitmap starting at a given bit offset. 
+// the values are first compressed by having the null slots removed before writing to the buffer +func (enc *PlainInt32Encoder) PutSpaced(in []int32, validBits []byte, validBitsOffset int64) { + nbytes := arrow.Int32Traits.BytesRequired(len(in)) + enc.ReserveForWrite(nbytes) + + if enc.bitSetReader == nil { + enc.bitSetReader = utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in))) + } else { + enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in))) + } + + for { + run := enc.bitSetReader.NextRun() + if run.Length == 0 { + break + } + enc.Put(in[int(run.Pos):int(run.Pos+run.Length)]) + } +} + +// Type returns the underlying physical type this encoder is able to encode +func (PlainInt32Encoder) Type() parquet.Type { + return parquet.Types.Int32 +} + +// PlainInt32Decoder is a decoder specifically for decoding Plain Encoding data +// of int32 type. +type PlainInt32Decoder struct { + decoder + + bitSetReader utils.SetBitRunReader +} + +// Type returns the physical type this decoder is able to decode for +func (PlainInt32Decoder) Type() parquet.Type { + return parquet.Types.Int32 +} + +// Decode populates the given slice with values from the data to be decoded, +// decoding the min(len(out), remaining values). +// It returns the number of values actually decoded and any error encountered. +func (dec *PlainInt32Decoder) Decode(out []int32) (int, error) { + max := utils.MinInt(len(out), dec.nvals) + nbytes := int64(max) * int64(arrow.Int32SizeBytes) + if nbytes > int64(len(dec.data)) || nbytes > math.MaxInt32 { + return 0, xerrors.Errorf("parquet: eof exception decode plain Int32, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) + } + + copy(arrow.Int32Traits.CastToBytes(out), dec.data[:nbytes]) + dec.data = dec.data[nbytes:] + dec.nvals -= max + return max, nil +} + +// DecodeSpaced is the same as decode, except it expands the data out to leave spaces for null values +// as defined by the bitmap provided. 
+func (dec *PlainInt32Decoder) DecodeSpaced(out []int32, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + toread := len(out) - nullCount + values, err := dec.Decode(out[:toread]) + if err != nil { + return 0, err + } + if values != toread { + return 0, xerrors.New("parquet: number of values / definition levels read did not match") + } + + nvalues := len(out) + if nullCount == 0 { + return nvalues, nil + } + + idxDecode := nvalues - nullCount + if dec.bitSetReader == nil { + dec.bitSetReader = utils.NewReverseSetBitRunReader(validBits, validBitsOffset, int64(nvalues)) + } else { + dec.bitSetReader.Reset(validBits, validBitsOffset, int64(nvalues)) + } + + for { + run := dec.bitSetReader.NextRun() + if run.Length == 0 { + break + } + + idxDecode -= int(run.Length) + copy(out[int(run.Pos):], out[idxDecode:idxDecode+int(run.Length)]) + } + return nvalues, nil +} + +// PlainInt64Encoder is an encoder for int64 values using Plain Encoding +// which in general is just storing the values as raw bytes of the appropriate size +type PlainInt64Encoder struct { + encoder + + bitSetReader utils.SetBitRunReader +} + +// Put encodes a slice of values into the underlying buffer +func (enc *PlainInt64Encoder) Put(in []int64) { + enc.append(arrow.Int64Traits.CastToBytes(in)) +} + +// PutSpaced encodes a slice of values into the underlying buffer which are spaced out +// including null values defined by the validBits bitmap starting at a given bit offset. 
+// the values are first compressed by having the null slots removed before writing to the buffer +func (enc *PlainInt64Encoder) PutSpaced(in []int64, validBits []byte, validBitsOffset int64) { + nbytes := arrow.Int64Traits.BytesRequired(len(in)) + enc.ReserveForWrite(nbytes) + + if enc.bitSetReader == nil { + enc.bitSetReader = utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in))) + } else { + enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in))) + } + + for { + run := enc.bitSetReader.NextRun() + if run.Length == 0 { + break + } + enc.Put(in[int(run.Pos):int(run.Pos+run.Length)]) + } +} + +// Type returns the underlying physical type this encoder is able to encode +func (PlainInt64Encoder) Type() parquet.Type { + return parquet.Types.Int64 +} + +// PlainInt64Decoder is a decoder specifically for decoding Plain Encoding data +// of int64 type. +type PlainInt64Decoder struct { + decoder + + bitSetReader utils.SetBitRunReader +} + +// Type returns the physical type this decoder is able to decode for +func (PlainInt64Decoder) Type() parquet.Type { + return parquet.Types.Int64 +} + +// Decode populates the given slice with values from the data to be decoded, +// decoding the min(len(out), remaining values). +// It returns the number of values actually decoded and any error encountered. +func (dec *PlainInt64Decoder) Decode(out []int64) (int, error) { + max := utils.MinInt(len(out), dec.nvals) + nbytes := int64(max) * int64(arrow.Int64SizeBytes) + if nbytes > int64(len(dec.data)) || nbytes > math.MaxInt32 { + return 0, xerrors.Errorf("parquet: eof exception decode plain Int64, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) + } + + copy(arrow.Int64Traits.CastToBytes(out), dec.data[:nbytes]) + dec.data = dec.data[nbytes:] + dec.nvals -= max + return max, nil +} + +// DecodeSpaced is the same as decode, except it expands the data out to leave spaces for null values +// as defined by the bitmap provided. 
+func (dec *PlainInt64Decoder) DecodeSpaced(out []int64, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + toread := len(out) - nullCount + values, err := dec.Decode(out[:toread]) + if err != nil { + return 0, err + } + if values != toread { + return 0, xerrors.New("parquet: number of values / definition levels read did not match") + } + + nvalues := len(out) + if nullCount == 0 { + return nvalues, nil + } + + idxDecode := nvalues - nullCount + if dec.bitSetReader == nil { + dec.bitSetReader = utils.NewReverseSetBitRunReader(validBits, validBitsOffset, int64(nvalues)) + } else { + dec.bitSetReader.Reset(validBits, validBitsOffset, int64(nvalues)) + } + + for { + run := dec.bitSetReader.NextRun() + if run.Length == 0 { + break + } + + idxDecode -= int(run.Length) + copy(out[int(run.Pos):], out[idxDecode:idxDecode+int(run.Length)]) + } + return nvalues, nil +} + +// PlainInt96Encoder is an encoder for parquet.Int96 values using Plain Encoding +// which in general is just storing the values as raw bytes of the appropriate size +type PlainInt96Encoder struct { + encoder + + bitSetReader utils.SetBitRunReader +} + +// Put encodes a slice of values into the underlying buffer +func (enc *PlainInt96Encoder) Put(in []parquet.Int96) { + enc.append(parquet.Int96Traits.CastToBytes(in)) +} + +// PutSpaced encodes a slice of values into the underlying buffer which are spaced out +// including null values defined by the validBits bitmap starting at a given bit offset. 
+// the values are first compressed by having the null slots removed before writing to the buffer +func (enc *PlainInt96Encoder) PutSpaced(in []parquet.Int96, validBits []byte, validBitsOffset int64) { + nbytes := parquet.Int96Traits.BytesRequired(len(in)) + enc.ReserveForWrite(nbytes) + + if enc.bitSetReader == nil { + enc.bitSetReader = utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in))) + } else { + enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in))) + } + + for { + run := enc.bitSetReader.NextRun() + if run.Length == 0 { + break + } + enc.Put(in[int(run.Pos):int(run.Pos+run.Length)]) + } +} + +// Type returns the underlying physical type this encoder is able to encode +func (PlainInt96Encoder) Type() parquet.Type { + return parquet.Types.Int96 +} + +// PlainInt96Decoder is a decoder specifically for decoding Plain Encoding data +// of parquet.Int96 type. +type PlainInt96Decoder struct { + decoder + + bitSetReader utils.SetBitRunReader +} + +// Type returns the physical type this decoder is able to decode for +func (PlainInt96Decoder) Type() parquet.Type { + return parquet.Types.Int96 +} + +// Decode populates the given slice with values from the data to be decoded, +// decoding the min(len(out), remaining values). +// It returns the number of values actually decoded and any error encountered. 
+func (dec *PlainInt96Decoder) Decode(out []parquet.Int96) (int, error) { + max := utils.MinInt(len(out), dec.nvals) + nbytes := int64(max) * int64(parquet.Int96SizeBytes) + if nbytes > int64(len(dec.data)) || nbytes > math.MaxInt32 { + return 0, xerrors.Errorf("parquet: eof exception decode plain Int96, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) + } + + copy(parquet.Int96Traits.CastToBytes(out), dec.data[:nbytes]) + dec.data = dec.data[nbytes:] + dec.nvals -= max + return max, nil +} + +// DecodeSpaced is the same as decode, except it expands the data out to leave spaces for null values +// as defined by the bitmap provided. +func (dec *PlainInt96Decoder) DecodeSpaced(out []parquet.Int96, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + toread := len(out) - nullCount + values, err := dec.Decode(out[:toread]) + if err != nil { + return 0, err + } + if values != toread { + return 0, xerrors.New("parquet: number of values / definition levels read did not match") + } + + nvalues := len(out) + if nullCount == 0 { + return nvalues, nil + } + + idxDecode := nvalues - nullCount + if dec.bitSetReader == nil { + dec.bitSetReader = utils.NewReverseSetBitRunReader(validBits, validBitsOffset, int64(nvalues)) + } else { + dec.bitSetReader.Reset(validBits, validBitsOffset, int64(nvalues)) + } + + for { + run := dec.bitSetReader.NextRun() + if run.Length == 0 { + break + } + + idxDecode -= int(run.Length) + copy(out[int(run.Pos):], out[idxDecode:idxDecode+int(run.Length)]) + } + return nvalues, nil +} + +// PlainFloat32Encoder is an encoder for float32 values using Plain Encoding +// which in general is just storing the values as raw bytes of the appropriate size +type PlainFloat32Encoder struct { + encoder + + bitSetReader utils.SetBitRunReader +} + +// Put encodes a slice of values into the underlying buffer +func (enc *PlainFloat32Encoder) Put(in []float32) { + enc.append(arrow.Float32Traits.CastToBytes(in)) +} + +// 
PutSpaced encodes a slice of values into the underlying buffer which are spaced out +// including null values defined by the validBits bitmap starting at a given bit offset. +// the values are first compressed by having the null slots removed before writing to the buffer +func (enc *PlainFloat32Encoder) PutSpaced(in []float32, validBits []byte, validBitsOffset int64) { + nbytes := arrow.Float32Traits.BytesRequired(len(in)) + enc.ReserveForWrite(nbytes) + + if enc.bitSetReader == nil { + enc.bitSetReader = utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in))) + } else { + enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in))) + } + + for { + run := enc.bitSetReader.NextRun() + if run.Length == 0 { + break + } + enc.Put(in[int(run.Pos):int(run.Pos+run.Length)]) + } +} + +// Type returns the underlying physical type this encoder is able to encode +func (PlainFloat32Encoder) Type() parquet.Type { + return parquet.Types.Float +} + +// PlainFloat32Decoder is a decoder specifically for decoding Plain Encoding data +// of float32 type. +type PlainFloat32Decoder struct { + decoder + + bitSetReader utils.SetBitRunReader +} + +// Type returns the physical type this decoder is able to decode for +func (PlainFloat32Decoder) Type() parquet.Type { + return parquet.Types.Float +} + +// Decode populates the given slice with values from the data to be decoded, +// decoding the min(len(out), remaining values). +// It returns the number of values actually decoded and any error encountered. 
+func (dec *PlainFloat32Decoder) Decode(out []float32) (int, error) { + max := utils.MinInt(len(out), dec.nvals) + nbytes := int64(max) * int64(arrow.Float32SizeBytes) + if nbytes > int64(len(dec.data)) || nbytes > math.MaxInt32 { + return 0, xerrors.Errorf("parquet: eof exception decode plain Float32, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) + } + + copy(arrow.Float32Traits.CastToBytes(out), dec.data[:nbytes]) + dec.data = dec.data[nbytes:] + dec.nvals -= max + return max, nil +} + +// DecodeSpaced is the same as decode, except it expands the data out to leave spaces for null values +// as defined by the bitmap provided. +func (dec *PlainFloat32Decoder) DecodeSpaced(out []float32, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + toread := len(out) - nullCount + values, err := dec.Decode(out[:toread]) + if err != nil { + return 0, err + } + if values != toread { + return 0, xerrors.New("parquet: number of values / definition levels read did not match") + } + + nvalues := len(out) + if nullCount == 0 { + return nvalues, nil + } + + idxDecode := nvalues - nullCount + if dec.bitSetReader == nil { + dec.bitSetReader = utils.NewReverseSetBitRunReader(validBits, validBitsOffset, int64(nvalues)) + } else { + dec.bitSetReader.Reset(validBits, validBitsOffset, int64(nvalues)) + } + + for { + run := dec.bitSetReader.NextRun() + if run.Length == 0 { + break + } + + idxDecode -= int(run.Length) + copy(out[int(run.Pos):], out[idxDecode:idxDecode+int(run.Length)]) + } + return nvalues, nil +} + +// PlainFloat64Encoder is an encoder for float64 values using Plain Encoding +// which in general is just storing the values as raw bytes of the appropriate size +type PlainFloat64Encoder struct { + encoder + + bitSetReader utils.SetBitRunReader +} + +// Put encodes a slice of values into the underlying buffer +func (enc *PlainFloat64Encoder) Put(in []float64) { + enc.append(arrow.Float64Traits.CastToBytes(in)) +} + +// PutSpaced 
encodes a slice of values into the underlying buffer which are spaced out +// including null values defined by the validBits bitmap starting at a given bit offset. +// the values are first compressed by having the null slots removed before writing to the buffer +func (enc *PlainFloat64Encoder) PutSpaced(in []float64, validBits []byte, validBitsOffset int64) { + nbytes := arrow.Float64Traits.BytesRequired(len(in)) + enc.ReserveForWrite(nbytes) + + if enc.bitSetReader == nil { + enc.bitSetReader = utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in))) + } else { + enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in))) + } + + for { + run := enc.bitSetReader.NextRun() + if run.Length == 0 { + break + } + enc.Put(in[int(run.Pos):int(run.Pos+run.Length)]) + } +} + +// Type returns the underlying physical type this encoder is able to encode +func (PlainFloat64Encoder) Type() parquet.Type { + return parquet.Types.Double +} + +// PlainFloat64Decoder is a decoder specifically for decoding Plain Encoding data +// of float64 type. +type PlainFloat64Decoder struct { + decoder + + bitSetReader utils.SetBitRunReader +} + +// Type returns the physical type this decoder is able to decode for +func (PlainFloat64Decoder) Type() parquet.Type { + return parquet.Types.Double +} + +// Decode populates the given slice with values from the data to be decoded, +// decoding the min(len(out), remaining values). +// It returns the number of values actually decoded and any error encountered. 
+func (dec *PlainFloat64Decoder) Decode(out []float64) (int, error) { + max := utils.MinInt(len(out), dec.nvals) + nbytes := int64(max) * int64(arrow.Float64SizeBytes) + if nbytes > int64(len(dec.data)) || nbytes > math.MaxInt32 { + return 0, xerrors.Errorf("parquet: eof exception decode plain Float64, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) + } + + copy(arrow.Float64Traits.CastToBytes(out), dec.data[:nbytes]) + dec.data = dec.data[nbytes:] + dec.nvals -= max + return max, nil +} + +// DecodeSpaced is the same as decode, except it expands the data out to leave spaces for null values +// as defined by the bitmap provided. +func (dec *PlainFloat64Decoder) DecodeSpaced(out []float64, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + toread := len(out) - nullCount + values, err := dec.Decode(out[:toread]) + if err != nil { + return 0, err + } + if values != toread { + return 0, xerrors.New("parquet: number of values / definition levels read did not match") + } + + nvalues := len(out) + if nullCount == 0 { + return nvalues, nil + } + + idxDecode := nvalues - nullCount + if dec.bitSetReader == nil { + dec.bitSetReader = utils.NewReverseSetBitRunReader(validBits, validBitsOffset, int64(nvalues)) + } else { + dec.bitSetReader.Reset(validBits, validBitsOffset, int64(nvalues)) + } + + for { + run := dec.bitSetReader.NextRun() + if run.Length == 0 { + break + } + + idxDecode -= int(run.Length) + copy(out[int(run.Pos):], out[idxDecode:idxDecode+int(run.Length)]) + } + return nvalues, nil +} diff --git a/go/parquet/internal/encoding/plain_encoder_types.gen.go.tmpl b/go/parquet/internal/encoding/plain_encoder_types.gen.go.tmpl new file mode 100644 index 00000000000..86e04e4e637 --- /dev/null +++ b/go/parquet/internal/encoding/plain_encoder_types.gen.go.tmpl @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" +) + +{{range .In}} +{{if and (ne .Name "Boolean") (ne .Name "ByteArray") (ne .Name "FixedLenByteArray")}} +// Plain{{.Name}}Encoder is an encoder for {{.name}} values using Plain Encoding +// which in general is just storing the values as raw bytes of the appropriate size +type Plain{{.Name}}Encoder struct { + encoder + + bitSetReader utils.SetBitRunReader +} + +// Put encodes a slice of values into the underlying buffer +func (enc *Plain{{.Name}}Encoder) Put(in []{{.name}}) { + enc.append({{.prefix}}.{{.Name}}Traits.CastToBytes(in)) +} + +// PutSpaced encodes a slice of values into the underlying buffer which are spaced out +// including null values defined by the validBits bitmap starting at a given bit offset. 
+// the values are first compressed by having the null slots removed before writing to the buffer +func (enc *Plain{{.Name}}Encoder) PutSpaced(in []{{.name}}, validBits []byte, validBitsOffset int64) { + nbytes := {{.prefix}}.{{.Name}}Traits.BytesRequired(len(in)) + enc.ReserveForWrite(nbytes) + + if enc.bitSetReader == nil { + enc.bitSetReader = utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in))) + } else { + enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in))) + } + + for { + run := enc.bitSetReader.NextRun() + if run.Length == 0 { + break + } + enc.Put(in[int(run.Pos):int(run.Pos+run.Length)]) + } +} + +// Type returns the underlying physical type this encoder is able to encode +func (Plain{{.Name}}Encoder) Type() parquet.Type { + return parquet.Types.{{if .physical}}{{.physical}}{{else}}{{.Name}}{{end}} +} + +// Plain{{.Name}}Decoder is a decoder specifically for decoding Plain Encoding data +// of {{.name}} type. +type Plain{{.Name}}Decoder struct { + decoder + + bitSetReader utils.SetBitRunReader +} + +// Type returns the physical type this decoder is able to decode for +func (Plain{{.Name}}Decoder) Type() parquet.Type { + return parquet.Types.{{if .physical}}{{.physical}}{{else}}{{.Name}}{{end}} +} + +// Decode populates the given slice with values from the data to be decoded, +// decoding the min(len(out), remaining values). +// It returns the number of values actually decoded and any error encountered. 
+func (dec *Plain{{.Name}}Decoder) Decode(out []{{.name}}) (int, error) { + max := utils.MinInt(len(out), dec.nvals) + nbytes := int64(max) * int64({{.prefix}}.{{.Name}}SizeBytes) + if nbytes > int64(len(dec.data)) || nbytes > math.MaxInt32 { + return 0, xerrors.Errorf("parquet: eof exception decode plain {{.Name}}, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) + } + + copy({{.prefix}}.{{.Name}}Traits.CastToBytes(out), dec.data[:nbytes]) + dec.data = dec.data[nbytes:] + dec.nvals -= max + return max, nil +} + +// DecodeSpaced is the same as decode, except it expands the data out to leave spaces for null values +// as defined by the bitmap provided. +func (dec *Plain{{.Name}}Decoder) DecodeSpaced(out []{{.name}}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + toread := len(out) - nullCount + values, err := dec.Decode(out[:toread]) + if err != nil { + return 0, err + } + if values != toread { + return 0, xerrors.New("parquet: number of values / definition levels read did not match") + } + + nvalues := len(out) + if nullCount == 0 { + return nvalues, nil + } + + idxDecode := nvalues - nullCount + if dec.bitSetReader == nil { + dec.bitSetReader = utils.NewReverseSetBitRunReader(validBits, validBitsOffset, int64(nvalues)) + } else { + dec.bitSetReader.Reset(validBits, validBitsOffset, int64(nvalues)) + } + + for { + run := dec.bitSetReader.NextRun() + if run.Length == 0 { + break + } + + idxDecode -= int(run.Length) + copy(out[int(run.Pos):], out[idxDecode:idxDecode+int(run.Length)]) + } + return nvalues, nil +} +{{end}} +{{end}} diff --git a/go/parquet/internal/encoding/typed_encoder.gen.go b/go/parquet/internal/encoding/typed_encoder.gen.go new file mode 100644 index 00000000000..192286f987c --- /dev/null +++ b/go/parquet/internal/encoding/typed_encoder.gen.go @@ -0,0 +1,1467 @@ +// Code generated by typed_encoder.gen.go.tmpl. DO NOT EDIT. 
+ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "unsafe" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" + "github.com/apache/arrow/go/parquet/schema" + "golang.org/x/xerrors" +) + +// fully typed encoder interfaces to enable writing against encoder/decoders +// without having to care about what encoding type is actually being used. 
+ +var ( + Int32EncoderTraits int32EncoderTraits + Int32DecoderTraits int32DecoderTraits + Int64EncoderTraits int64EncoderTraits + Int64DecoderTraits int64DecoderTraits + Int96EncoderTraits int96EncoderTraits + Int96DecoderTraits int96DecoderTraits + Float32EncoderTraits float32EncoderTraits + Float32DecoderTraits float32DecoderTraits + Float64EncoderTraits float64EncoderTraits + Float64DecoderTraits float64DecoderTraits + BooleanEncoderTraits boolEncoderTraits + BooleanDecoderTraits boolDecoderTraits + ByteArrayEncoderTraits byteArrayEncoderTraits + ByteArrayDecoderTraits byteArrayDecoderTraits + FixedLenByteArrayEncoderTraits fixedLenByteArrayEncoderTraits + FixedLenByteArrayDecoderTraits fixedLenByteArrayDecoderTraits +) + +// Int32Encoder is the interface for all encoding types that implement encoding +// int32 values. +type Int32Encoder interface { + TypedEncoder + Put([]int32) + PutSpaced([]int32, []byte, int64) +} + +// Int32Decoder is the interface for all encoding types that implement decoding +// int32 values. +type Int32Decoder interface { + TypedDecoder + Decode([]int32) (int, error) + DecodeSpaced([]int32, int, []byte, int64) (int, error) +} + +// the int32EncoderTraits struct is used to make it easy to create encoders and decoders based on type +type int32EncoderTraits struct{} + +// Encoder returns an encoder for int32 type data, using the specified encoding type and whether or not +// it should be dictionary encoded. 
+func (int32EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { + if useDict { + return &DictInt32Encoder{newDictEncoderBase(descr, NewInt32Dictionary(), mem)} + } + + switch e { + case format.Encoding_PLAIN: + return &PlainInt32Encoder{encoder: newEncoderBase(e, descr, mem)} + case format.Encoding_DELTA_BINARY_PACKED: + return DeltaBitPackInt32Encoder{&deltaBitPackEncoder{ + encoder: newEncoderBase(e, descr, mem)}} + default: + panic("unimplemented encoding type") + } +} + +// int32DecoderTraits is a helper struct for providing information regardless of the type +// and used as a generic way to create a Decoder or Dictionary Decoder for int32 values +type int32DecoderTraits struct{} + +// BytesRequired returns the number of bytes required to store n int32 values. +func (int32DecoderTraits) BytesRequired(n int) int { + return arrow.Int32Traits.BytesRequired(n) +} + +// Decoder returns a decoder for int32 typed data of the requested encoding type if available +func (int32DecoderTraits) Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder { + if useDict { + return &DictInt32Decoder{dictDecoder{decoder: newDecoderBase(format.Encoding_RLE_DICTIONARY, descr), mem: mem}} + } + + switch e { + case parquet.Encodings.Plain: + return &PlainInt32Decoder{decoder: newDecoderBase(format.Encoding(e), descr)} + case parquet.Encodings.DeltaBinaryPacked: + if mem == nil { + mem = memory.DefaultAllocator + } + return &DeltaBitPackInt32Decoder{ + deltaBitPackDecoder: &deltaBitPackDecoder{ + decoder: newDecoderBase(format.Encoding(e), descr), + mem: mem, + }} + default: + panic("unimplemented encoding type") + } +} + +// DictInt32Encoder is an encoder for int32 data using dictionary encoding +type DictInt32Encoder struct { + dictEncoder +} + +// Type returns the underlying physical type that can be encoded with this encoder +func (enc *DictInt32Encoder) Type() parquet.Type { + 
return parquet.Types.Int32 +} + +// WriteDict populates the byte slice with the dictionary index +func (enc *DictInt32Encoder) WriteDict(out []byte) { + enc.memo.CopyValues(arrow.Int32Traits.CastFromBytes(out)) +} + +// Put encodes the values passed in, adding to the index as needed. +func (enc *DictInt32Encoder) Put(in []int32) { + for _, val := range in { + enc.dictEncoder.Put(val) + } +} + +// PutSpaced is the same as Put but for when the data being encoded has slots open for +// null values, using the bitmap provided to skip values as needed. +func (enc *DictInt32Encoder) PutSpaced(in []int32, validBits []byte, validBitsOffset int64) { + utils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error { + for i := int64(0); i < length; i++ { + enc.dictEncoder.Put(in[i+pos]) + } + return nil + }) +} + +// DictInt32Decoder is a decoder for decoding dictionary encoded data for int32 columns +type DictInt32Decoder struct { + dictDecoder +} + +// Type returns the underlying physical type that can be decoded with this decoder +func (DictInt32Decoder) Type() parquet.Type { + return parquet.Types.Int32 +} + +// Decode populates the passed in slice with min(len(out), remaining values) values, +// decoding using hte dictionary to get the actual values. Returns the number of values +// actually decoded and any error encountered. +func (d *DictInt32Decoder) Decode(out []int32) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decode(out[:vals]) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict eof exception") + } + d.nvals -= vals + return vals, nil +} + +// Decode spaced is like Decode but will space out the data leaving slots for null values +// based on the provided bitmap. 
+func (d *DictInt32Decoder) DecodeSpaced(out []int32, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict spaced eof exception") + } + d.nvals -= vals + return vals, nil +} + +// Int32DictConverter is a helper for dictionary handling which is used for converting +// run length encoded indexes into the actual values that are stored in the dictionary index page. +type Int32DictConverter struct { + valueDecoder Int32Decoder + dict []int32 + zeroVal int32 +} + +// ensure validates that we've decoded dictionary values up to the index +// provided so that we don't need to decode the entire dictionary at start. +func (dc *Int32DictConverter) ensure(idx utils.IndexType) error { + if len(dc.dict) <= int(idx) { + if cap(dc.dict) <= int(idx) { + val := make([]int32, int(idx+1)-len(dc.dict)) + n, err := dc.valueDecoder.Decode(val) + if err != nil { + return err + } + dc.dict = append(dc.dict, val[:n]...) + } else { + cur := len(dc.dict) + n, err := dc.valueDecoder.Decode(dc.dict[cur : idx+1]) + if err != nil { + return err + } + dc.dict = dc.dict[:cur+n] + } + } + return nil +} + +// IsValid verifies that the set of indexes passed in are all valid indexes +// in the dictionary and if necessary decodes dictionary indexes up to the index +// requested. 
+func (dc *Int32DictConverter) IsValid(idxes ...utils.IndexType) bool { + min, max := utils.GetMinMaxInt32(*(*[]int32)(unsafe.Pointer(&idxes))) + dc.ensure(utils.IndexType(max)) + + return min >= 0 && int(min) < len(dc.dict) && int(max) >= 0 && int(max) < len(dc.dict) +} + +// Fill populates the slice passed in entirely with the value at dictionary index indicated by val +func (dc *Int32DictConverter) Fill(out interface{}, val utils.IndexType) error { + o := out.([]int32) + if err := dc.ensure(val); err != nil { + return err + } + o[0] = dc.dict[val] + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } + return nil +} + +// FillZero populates the entire slice of out with the zero value for int32 +func (dc *Int32DictConverter) FillZero(out interface{}) { + o := out.([]int32) + o[0] = dc.zeroVal + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } +} + +// Copy populates the slice provided with the values in the dictionary at the indexes +// in the vals slice. +func (dc *Int32DictConverter) Copy(out interface{}, vals []utils.IndexType) error { + o := out.([]int32) + for idx, val := range vals { + o[idx] = dc.dict[val] + } + return nil +} + +// Int64Encoder is the interface for all encoding types that implement encoding +// int64 values. +type Int64Encoder interface { + TypedEncoder + Put([]int64) + PutSpaced([]int64, []byte, int64) +} + +// Int64Decoder is the interface for all encoding types that implement decoding +// int64 values. +type Int64Decoder interface { + TypedDecoder + Decode([]int64) (int, error) + DecodeSpaced([]int64, int, []byte, int64) (int, error) +} + +// the int64EncoderTraits struct is used to make it easy to create encoders and decoders based on type +type int64EncoderTraits struct{} + +// Encoder returns an encoder for int64 type data, using the specified encoding type and whether or not +// it should be dictionary encoded. 
+func (int64EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { + if useDict { + return &DictInt64Encoder{newDictEncoderBase(descr, NewInt64Dictionary(), mem)} + } + + switch e { + case format.Encoding_PLAIN: + return &PlainInt64Encoder{encoder: newEncoderBase(e, descr, mem)} + case format.Encoding_DELTA_BINARY_PACKED: + return DeltaBitPackInt64Encoder{&deltaBitPackEncoder{ + encoder: newEncoderBase(e, descr, mem)}} + default: + panic("unimplemented encoding type") + } +} + +// int64DecoderTraits is a helper struct for providing information regardless of the type +// and used as a generic way to create a Decoder or Dictionary Decoder for int64 values +type int64DecoderTraits struct{} + +// BytesRequired returns the number of bytes required to store n int64 values. +func (int64DecoderTraits) BytesRequired(n int) int { + return arrow.Int64Traits.BytesRequired(n) +} + +// Decoder returns a decoder for int64 typed data of the requested encoding type if available +func (int64DecoderTraits) Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder { + if useDict { + return &DictInt64Decoder{dictDecoder{decoder: newDecoderBase(format.Encoding_RLE_DICTIONARY, descr), mem: mem}} + } + + switch e { + case parquet.Encodings.Plain: + return &PlainInt64Decoder{decoder: newDecoderBase(format.Encoding(e), descr)} + case parquet.Encodings.DeltaBinaryPacked: + if mem == nil { + mem = memory.DefaultAllocator + } + return &DeltaBitPackInt64Decoder{ + deltaBitPackDecoder: &deltaBitPackDecoder{ + decoder: newDecoderBase(format.Encoding(e), descr), + mem: mem, + }} + default: + panic("unimplemented encoding type") + } +} + +// DictInt64Encoder is an encoder for int64 data using dictionary encoding +type DictInt64Encoder struct { + dictEncoder +} + +// Type returns the underlying physical type that can be encoded with this encoder +func (enc *DictInt64Encoder) Type() parquet.Type { + 
return parquet.Types.Int64 +} + +// WriteDict populates the byte slice with the dictionary index +func (enc *DictInt64Encoder) WriteDict(out []byte) { + enc.memo.CopyValues(arrow.Int64Traits.CastFromBytes(out)) +} + +// Put encodes the values passed in, adding to the index as needed. +func (enc *DictInt64Encoder) Put(in []int64) { + for _, val := range in { + enc.dictEncoder.Put(val) + } +} + +// PutSpaced is the same as Put but for when the data being encoded has slots open for +// null values, using the bitmap provided to skip values as needed. +func (enc *DictInt64Encoder) PutSpaced(in []int64, validBits []byte, validBitsOffset int64) { + utils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error { + for i := int64(0); i < length; i++ { + enc.dictEncoder.Put(in[i+pos]) + } + return nil + }) +} + +// DictInt64Decoder is a decoder for decoding dictionary encoded data for int64 columns +type DictInt64Decoder struct { + dictDecoder +} + +// Type returns the underlying physical type that can be decoded with this decoder +func (DictInt64Decoder) Type() parquet.Type { + return parquet.Types.Int64 +} + +// Decode populates the passed in slice with min(len(out), remaining values) values, +// decoding using hte dictionary to get the actual values. Returns the number of values +// actually decoded and any error encountered. +func (d *DictInt64Decoder) Decode(out []int64) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decode(out[:vals]) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict eof exception") + } + d.nvals -= vals + return vals, nil +} + +// Decode spaced is like Decode but will space out the data leaving slots for null values +// based on the provided bitmap. 
+func (d *DictInt64Decoder) DecodeSpaced(out []int64, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict spaced eof exception") + } + d.nvals -= vals + return vals, nil +} + +// Int64DictConverter is a helper for dictionary handling which is used for converting +// run length encoded indexes into the actual values that are stored in the dictionary index page. +type Int64DictConverter struct { + valueDecoder Int64Decoder + dict []int64 + zeroVal int64 +} + +// ensure validates that we've decoded dictionary values up to the index +// provided so that we don't need to decode the entire dictionary at start. +func (dc *Int64DictConverter) ensure(idx utils.IndexType) error { + if len(dc.dict) <= int(idx) { + if cap(dc.dict) <= int(idx) { + val := make([]int64, int(idx+1)-len(dc.dict)) + n, err := dc.valueDecoder.Decode(val) + if err != nil { + return err + } + dc.dict = append(dc.dict, val[:n]...) + } else { + cur := len(dc.dict) + n, err := dc.valueDecoder.Decode(dc.dict[cur : idx+1]) + if err != nil { + return err + } + dc.dict = dc.dict[:cur+n] + } + } + return nil +} + +// IsValid verifies that the set of indexes passed in are all valid indexes +// in the dictionary and if necessary decodes dictionary indexes up to the index +// requested. 
+func (dc *Int64DictConverter) IsValid(idxes ...utils.IndexType) bool { + min, max := utils.GetMinMaxInt32(*(*[]int32)(unsafe.Pointer(&idxes))) + dc.ensure(utils.IndexType(max)) + + return min >= 0 && int(min) < len(dc.dict) && int(max) >= 0 && int(max) < len(dc.dict) +} + +// Fill populates the slice passed in entirely with the value at dictionary index indicated by val +func (dc *Int64DictConverter) Fill(out interface{}, val utils.IndexType) error { + o := out.([]int64) + if err := dc.ensure(val); err != nil { + return err + } + o[0] = dc.dict[val] + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } + return nil +} + +// FillZero populates the entire slice of out with the zero value for int64 +func (dc *Int64DictConverter) FillZero(out interface{}) { + o := out.([]int64) + o[0] = dc.zeroVal + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } +} + +// Copy populates the slice provided with the values in the dictionary at the indexes +// in the vals slice. +func (dc *Int64DictConverter) Copy(out interface{}, vals []utils.IndexType) error { + o := out.([]int64) + for idx, val := range vals { + o[idx] = dc.dict[val] + } + return nil +} + +// Int96Encoder is the interface for all encoding types that implement encoding +// parquet.Int96 values. +type Int96Encoder interface { + TypedEncoder + Put([]parquet.Int96) + PutSpaced([]parquet.Int96, []byte, int64) +} + +// Int96Decoder is the interface for all encoding types that implement decoding +// parquet.Int96 values. +type Int96Decoder interface { + TypedDecoder + Decode([]parquet.Int96) (int, error) + DecodeSpaced([]parquet.Int96, int, []byte, int64) (int, error) +} + +// the int96EncoderTraits struct is used to make it easy to create encoders and decoders based on type +type int96EncoderTraits struct{} + +// Encoder returns an encoder for int96 type data, using the specified encoding type and whether or not +// it should be dictionary encoded. 
+// dictionary encoding does not exist for this type and Encoder will panic if useDict is true +func (int96EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { + if useDict { + panic("parquet: no parquet.Int96 dictionary encoding") + } + + switch e { + case format.Encoding_PLAIN: + return &PlainInt96Encoder{encoder: newEncoderBase(e, descr, mem)} + default: + panic("unimplemented encoding type") + } +} + +// int96DecoderTraits is a helper struct for providing information regardless of the type +// and used as a generic way to create a Decoder or Dictionary Decoder for int96 values +type int96DecoderTraits struct{} + +// BytesRequired returns the number of bytes required to store n int96 values. +func (int96DecoderTraits) BytesRequired(n int) int { + return parquet.Int96Traits.BytesRequired(n) +} + +// Decoder returns a decoder for int96 typed data of the requested encoding type if available +func (int96DecoderTraits) Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder { + if useDict { + panic("dictionary decoding unimplemented for int96") + } + + switch e { + case parquet.Encodings.Plain: + return &PlainInt96Decoder{decoder: newDecoderBase(format.Encoding(e), descr)} + default: + panic("unimplemented encoding type") + } +} + +// Float32Encoder is the interface for all encoding types that implement encoding +// float32 values. +type Float32Encoder interface { + TypedEncoder + Put([]float32) + PutSpaced([]float32, []byte, int64) +} + +// Float32Decoder is the interface for all encoding types that implement decoding +// float32 values. 
+type Float32Decoder interface { + TypedDecoder + Decode([]float32) (int, error) + DecodeSpaced([]float32, int, []byte, int64) (int, error) +} + +// the float32EncoderTraits struct is used to make it easy to create encoders and decoders based on type +type float32EncoderTraits struct{} + +// Encoder returns an encoder for float32 type data, using the specified encoding type and whether or not +// it should be dictionary encoded. +func (float32EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { + if useDict { + return &DictFloat32Encoder{newDictEncoderBase(descr, NewFloat32Dictionary(), mem)} + } + + switch e { + case format.Encoding_PLAIN: + return &PlainFloat32Encoder{encoder: newEncoderBase(e, descr, mem)} + default: + panic("unimplemented encoding type") + } +} + +// float32DecoderTraits is a helper struct for providing information regardless of the type +// and used as a generic way to create a Decoder or Dictionary Decoder for float32 values +type float32DecoderTraits struct{} + +// BytesRequired returns the number of bytes required to store n float32 values. 
+func (float32DecoderTraits) BytesRequired(n int) int { + return arrow.Float32Traits.BytesRequired(n) +} + +// Decoder returns a decoder for float32 typed data of the requested encoding type if available +func (float32DecoderTraits) Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder { + if useDict { + return &DictFloat32Decoder{dictDecoder{decoder: newDecoderBase(format.Encoding_RLE_DICTIONARY, descr), mem: mem}} + } + + switch e { + case parquet.Encodings.Plain: + return &PlainFloat32Decoder{decoder: newDecoderBase(format.Encoding(e), descr)} + default: + panic("unimplemented encoding type") + } +} + +// DictFloat32Encoder is an encoder for float32 data using dictionary encoding +type DictFloat32Encoder struct { + dictEncoder +} + +// Type returns the underlying physical type that can be encoded with this encoder +func (enc *DictFloat32Encoder) Type() parquet.Type { + return parquet.Types.Float +} + +// WriteDict populates the byte slice with the dictionary index +func (enc *DictFloat32Encoder) WriteDict(out []byte) { + enc.memo.CopyValues(arrow.Float32Traits.CastFromBytes(out)) +} + +// Put encodes the values passed in, adding to the index as needed. +func (enc *DictFloat32Encoder) Put(in []float32) { + for _, val := range in { + enc.dictEncoder.Put(val) + } +} + +// PutSpaced is the same as Put but for when the data being encoded has slots open for +// null values, using the bitmap provided to skip values as needed. 
+func (enc *DictFloat32Encoder) PutSpaced(in []float32, validBits []byte, validBitsOffset int64) { + utils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error { + for i := int64(0); i < length; i++ { + enc.dictEncoder.Put(in[i+pos]) + } + return nil + }) +} + +// DictFloat32Decoder is a decoder for decoding dictionary encoded data for float32 columns +type DictFloat32Decoder struct { + dictDecoder +} + +// Type returns the underlying physical type that can be decoded with this decoder +func (DictFloat32Decoder) Type() parquet.Type { + return parquet.Types.Float +} + +// Decode populates the passed in slice with min(len(out), remaining values) values, +// decoding using hte dictionary to get the actual values. Returns the number of values +// actually decoded and any error encountered. +func (d *DictFloat32Decoder) Decode(out []float32) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decode(out[:vals]) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict eof exception") + } + d.nvals -= vals + return vals, nil +} + +// Decode spaced is like Decode but will space out the data leaving slots for null values +// based on the provided bitmap. +func (d *DictFloat32Decoder) DecodeSpaced(out []float32, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict spaced eof exception") + } + d.nvals -= vals + return vals, nil +} + +// Float32DictConverter is a helper for dictionary handling which is used for converting +// run length encoded indexes into the actual values that are stored in the dictionary index page. 
+type Float32DictConverter struct { + valueDecoder Float32Decoder + dict []float32 + zeroVal float32 +} + +// ensure validates that we've decoded dictionary values up to the index +// provided so that we don't need to decode the entire dictionary at start. +func (dc *Float32DictConverter) ensure(idx utils.IndexType) error { + if len(dc.dict) <= int(idx) { + if cap(dc.dict) <= int(idx) { + val := make([]float32, int(idx+1)-len(dc.dict)) + n, err := dc.valueDecoder.Decode(val) + if err != nil { + return err + } + dc.dict = append(dc.dict, val[:n]...) + } else { + cur := len(dc.dict) + n, err := dc.valueDecoder.Decode(dc.dict[cur : idx+1]) + if err != nil { + return err + } + dc.dict = dc.dict[:cur+n] + } + } + return nil +} + +// IsValid verifies that the set of indexes passed in are all valid indexes +// in the dictionary and if necessary decodes dictionary indexes up to the index +// requested. +func (dc *Float32DictConverter) IsValid(idxes ...utils.IndexType) bool { + min, max := utils.GetMinMaxInt32(*(*[]int32)(unsafe.Pointer(&idxes))) + dc.ensure(utils.IndexType(max)) + + return min >= 0 && int(min) < len(dc.dict) && int(max) >= 0 && int(max) < len(dc.dict) +} + +// Fill populates the slice passed in entirely with the value at dictionary index indicated by val +func (dc *Float32DictConverter) Fill(out interface{}, val utils.IndexType) error { + o := out.([]float32) + if err := dc.ensure(val); err != nil { + return err + } + o[0] = dc.dict[val] + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } + return nil +} + +// FillZero populates the entire slice of out with the zero value for float32 +func (dc *Float32DictConverter) FillZero(out interface{}) { + o := out.([]float32) + o[0] = dc.zeroVal + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } +} + +// Copy populates the slice provided with the values in the dictionary at the indexes +// in the vals slice. 
+func (dc *Float32DictConverter) Copy(out interface{}, vals []utils.IndexType) error { + o := out.([]float32) + for idx, val := range vals { + o[idx] = dc.dict[val] + } + return nil +} + +// Float64Encoder is the interface for all encoding types that implement encoding +// float64 values. +type Float64Encoder interface { + TypedEncoder + Put([]float64) + PutSpaced([]float64, []byte, int64) +} + +// Float64Decoder is the interface for all encoding types that implement decoding +// float64 values. +type Float64Decoder interface { + TypedDecoder + Decode([]float64) (int, error) + DecodeSpaced([]float64, int, []byte, int64) (int, error) +} + +// the float64EncoderTraits struct is used to make it easy to create encoders and decoders based on type +type float64EncoderTraits struct{} + +// Encoder returns an encoder for float64 type data, using the specified encoding type and whether or not +// it should be dictionary encoded. +func (float64EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { + if useDict { + return &DictFloat64Encoder{newDictEncoderBase(descr, NewFloat64Dictionary(), mem)} + } + + switch e { + case format.Encoding_PLAIN: + return &PlainFloat64Encoder{encoder: newEncoderBase(e, descr, mem)} + default: + panic("unimplemented encoding type") + } +} + +// float64DecoderTraits is a helper struct for providing information regardless of the type +// and used as a generic way to create a Decoder or Dictionary Decoder for float64 values +type float64DecoderTraits struct{} + +// BytesRequired returns the number of bytes required to store n float64 values. 
+func (float64DecoderTraits) BytesRequired(n int) int { + return arrow.Float64Traits.BytesRequired(n) +} + +// Decoder returns a decoder for float64 typed data of the requested encoding type if available +func (float64DecoderTraits) Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder { + if useDict { + return &DictFloat64Decoder{dictDecoder{decoder: newDecoderBase(format.Encoding_RLE_DICTIONARY, descr), mem: mem}} + } + + switch e { + case parquet.Encodings.Plain: + return &PlainFloat64Decoder{decoder: newDecoderBase(format.Encoding(e), descr)} + default: + panic("unimplemented encoding type") + } +} + +// DictFloat64Encoder is an encoder for float64 data using dictionary encoding +type DictFloat64Encoder struct { + dictEncoder +} + +// Type returns the underlying physical type that can be encoded with this encoder +func (enc *DictFloat64Encoder) Type() parquet.Type { + return parquet.Types.Double +} + +// WriteDict populates the byte slice with the dictionary index +func (enc *DictFloat64Encoder) WriteDict(out []byte) { + enc.memo.CopyValues(arrow.Float64Traits.CastFromBytes(out)) +} + +// Put encodes the values passed in, adding to the index as needed. +func (enc *DictFloat64Encoder) Put(in []float64) { + for _, val := range in { + enc.dictEncoder.Put(val) + } +} + +// PutSpaced is the same as Put but for when the data being encoded has slots open for +// null values, using the bitmap provided to skip values as needed. 
+func (enc *DictFloat64Encoder) PutSpaced(in []float64, validBits []byte, validBitsOffset int64) { + utils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error { + for i := int64(0); i < length; i++ { + enc.dictEncoder.Put(in[i+pos]) + } + return nil + }) +} + +// DictFloat64Decoder is a decoder for decoding dictionary encoded data for float64 columns +type DictFloat64Decoder struct { + dictDecoder +} + +// Type returns the underlying physical type that can be decoded with this decoder +func (DictFloat64Decoder) Type() parquet.Type { + return parquet.Types.Double +} + +// Decode populates the passed in slice with min(len(out), remaining values) values, +// decoding using hte dictionary to get the actual values. Returns the number of values +// actually decoded and any error encountered. +func (d *DictFloat64Decoder) Decode(out []float64) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decode(out[:vals]) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict eof exception") + } + d.nvals -= vals + return vals, nil +} + +// Decode spaced is like Decode but will space out the data leaving slots for null values +// based on the provided bitmap. +func (d *DictFloat64Decoder) DecodeSpaced(out []float64, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict spaced eof exception") + } + d.nvals -= vals + return vals, nil +} + +// Float64DictConverter is a helper for dictionary handling which is used for converting +// run length encoded indexes into the actual values that are stored in the dictionary index page. 
+type Float64DictConverter struct { + valueDecoder Float64Decoder + dict []float64 + zeroVal float64 +} + +// ensure validates that we've decoded dictionary values up to the index +// provided so that we don't need to decode the entire dictionary at start. +func (dc *Float64DictConverter) ensure(idx utils.IndexType) error { + if len(dc.dict) <= int(idx) { + if cap(dc.dict) <= int(idx) { + val := make([]float64, int(idx+1)-len(dc.dict)) + n, err := dc.valueDecoder.Decode(val) + if err != nil { + return err + } + dc.dict = append(dc.dict, val[:n]...) + } else { + cur := len(dc.dict) + n, err := dc.valueDecoder.Decode(dc.dict[cur : idx+1]) + if err != nil { + return err + } + dc.dict = dc.dict[:cur+n] + } + } + return nil +} + +// IsValid verifies that the set of indexes passed in are all valid indexes +// in the dictionary and if necessary decodes dictionary indexes up to the index +// requested. +func (dc *Float64DictConverter) IsValid(idxes ...utils.IndexType) bool { + min, max := utils.GetMinMaxInt32(*(*[]int32)(unsafe.Pointer(&idxes))) + dc.ensure(utils.IndexType(max)) + + return min >= 0 && int(min) < len(dc.dict) && int(max) >= 0 && int(max) < len(dc.dict) +} + +// Fill populates the slice passed in entirely with the value at dictionary index indicated by val +func (dc *Float64DictConverter) Fill(out interface{}, val utils.IndexType) error { + o := out.([]float64) + if err := dc.ensure(val); err != nil { + return err + } + o[0] = dc.dict[val] + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } + return nil +} + +// FillZero populates the entire slice of out with the zero value for float64 +func (dc *Float64DictConverter) FillZero(out interface{}) { + o := out.([]float64) + o[0] = dc.zeroVal + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } +} + +// Copy populates the slice provided with the values in the dictionary at the indexes +// in the vals slice. 
+func (dc *Float64DictConverter) Copy(out interface{}, vals []utils.IndexType) error { + o := out.([]float64) + for idx, val := range vals { + o[idx] = dc.dict[val] + } + return nil +} + +// BooleanEncoder is the interface for all encoding types that implement encoding +// bool values. +type BooleanEncoder interface { + TypedEncoder + Put([]bool) + PutSpaced([]bool, []byte, int64) +} + +// BooleanDecoder is the interface for all encoding types that implement decoding +// bool values. +type BooleanDecoder interface { + TypedDecoder + Decode([]bool) (int, error) + DecodeSpaced([]bool, int, []byte, int64) (int, error) +} + +// the boolEncoderTraits struct is used to make it easy to create encoders and decoders based on type +type boolEncoderTraits struct{} + +// Encoder returns an encoder for bool type data, using the specified encoding type and whether or not +// it should be dictionary encoded. +// dictionary encoding does not exist for this type and Encoder will panic if useDict is true +func (boolEncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { + if useDict { + panic("parquet: no bool dictionary encoding") + } + + switch e { + case format.Encoding_PLAIN: + return &PlainBooleanEncoder{encoder: newEncoderBase(e, descr, mem)} + default: + panic("unimplemented encoding type") + } +} + +// boolDecoderTraits is a helper struct for providing information regardless of the type +// and used as a generic way to create a Decoder or Dictionary Decoder for bool values +type boolDecoderTraits struct{} + +// BytesRequired returns the number of bytes required to store n bool values. 
+func (boolDecoderTraits) BytesRequired(n int) int { + return arrow.BooleanTraits.BytesRequired(n) +} + +// Decoder returns a decoder for bool typed data of the requested encoding type if available +func (boolDecoderTraits) Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder { + if useDict { + panic("dictionary decoding unimplemented for bool") + } + + switch e { + case parquet.Encodings.Plain: + return &PlainBooleanDecoder{decoder: newDecoderBase(format.Encoding(e), descr)} + default: + panic("unimplemented encoding type") + } +} + +// ByteArrayEncoder is the interface for all encoding types that implement encoding +// parquet.ByteArray values. +type ByteArrayEncoder interface { + TypedEncoder + Put([]parquet.ByteArray) + PutSpaced([]parquet.ByteArray, []byte, int64) +} + +// ByteArrayDecoder is the interface for all encoding types that implement decoding +// parquet.ByteArray values. +type ByteArrayDecoder interface { + TypedDecoder + Decode([]parquet.ByteArray) (int, error) + DecodeSpaced([]parquet.ByteArray, int, []byte, int64) (int, error) +} + +// the byteArrayEncoderTraits struct is used to make it easy to create encoders and decoders based on type +type byteArrayEncoderTraits struct{} + +// Encoder returns an encoder for byteArray type data, using the specified encoding type and whether or not +// it should be dictionary encoded. 
+func (byteArrayEncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { + if useDict { + return &DictByteArrayEncoder{newDictEncoderBase(descr, NewBinaryDictionary(mem), mem)} + } + + switch e { + case format.Encoding_PLAIN: + return &PlainByteArrayEncoder{encoder: newEncoderBase(e, descr, mem)} + case format.Encoding_DELTA_LENGTH_BYTE_ARRAY: + return &DeltaLengthByteArrayEncoder{ + encoder: newEncoderBase(e, descr, mem), + lengthEncoder: &DeltaBitPackInt32Encoder{ + &deltaBitPackEncoder{encoder: newEncoderBase(e, descr, mem)}}, + } + case format.Encoding_DELTA_BYTE_ARRAY: + return &DeltaByteArrayEncoder{ + encoder: newEncoderBase(e, descr, mem), + } + default: + panic("unimplemented encoding type") + } +} + +// byteArrayDecoderTraits is a helper struct for providing information regardless of the type +// and used as a generic way to create a Decoder or Dictionary Decoder for byteArray values +type byteArrayDecoderTraits struct{} + +// BytesRequired returns the number of bytes required to store n byteArray values. 
+func (byteArrayDecoderTraits) BytesRequired(n int) int { + return parquet.ByteArrayTraits.BytesRequired(n) +} + +// Decoder returns a decoder for byteArray typed data of the requested encoding type if available +func (byteArrayDecoderTraits) Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder { + if useDict { + return &DictByteArrayDecoder{dictDecoder{decoder: newDecoderBase(format.Encoding_RLE_DICTIONARY, descr), mem: mem}} + } + + switch e { + case parquet.Encodings.Plain: + return &PlainByteArrayDecoder{decoder: newDecoderBase(format.Encoding(e), descr)} + case parquet.Encodings.DeltaLengthByteArray: + if mem == nil { + mem = memory.DefaultAllocator + } + return &DeltaLengthByteArrayDecoder{ + decoder: newDecoderBase(format.Encoding(e), descr), + mem: mem, + } + case parquet.Encodings.DeltaByteArray: + if mem == nil { + mem = memory.DefaultAllocator + } + return &DeltaByteArrayDecoder{ + DeltaLengthByteArrayDecoder: &DeltaLengthByteArrayDecoder{ + decoder: newDecoderBase(format.Encoding(e), descr), + mem: mem, + }} + default: + panic("unimplemented encoding type") + } +} + +// DictByteArrayEncoder is an encoder for parquet.ByteArray data using dictionary encoding +type DictByteArrayEncoder struct { + dictEncoder +} + +// Type returns the underlying physical type that can be encoded with this encoder +func (enc *DictByteArrayEncoder) Type() parquet.Type { + return parquet.Types.ByteArray +} + +// DictByteArrayDecoder is a decoder for decoding dictionary encoded data for parquet.ByteArray columns +type DictByteArrayDecoder struct { + dictDecoder +} + +// Type returns the underlying physical type that can be decoded with this decoder +func (DictByteArrayDecoder) Type() parquet.Type { + return parquet.Types.ByteArray +} + +// Decode populates the passed in slice with min(len(out), remaining values) values, +// decoding using the dictionary to get the actual values.
Returns the number of values +// actually decoded and any error encountered. +func (d *DictByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decode(out[:vals]) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict eof exception") + } + d.nvals -= vals + return vals, nil +} + +// DecodeSpaced is like Decode but will space out the data leaving slots for null values +// based on the provided bitmap. +func (d *DictByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict spaced eof exception") + } + d.nvals -= vals + return vals, nil +} + +// ByteArrayDictConverter is a helper for dictionary handling which is used for converting +// run length encoded indexes into the actual values that are stored in the dictionary index page. +type ByteArrayDictConverter struct { + valueDecoder ByteArrayDecoder + dict []parquet.ByteArray + zeroVal parquet.ByteArray +} + +// ensure validates that we've decoded dictionary values up to the index +// provided so that we don't need to decode the entire dictionary at start. +func (dc *ByteArrayDictConverter) ensure(idx utils.IndexType) error { + if len(dc.dict) <= int(idx) { + if cap(dc.dict) <= int(idx) { + val := make([]parquet.ByteArray, int(idx+1)-len(dc.dict)) + n, err := dc.valueDecoder.Decode(val) + if err != nil { + return err + } + dc.dict = append(dc.dict, val[:n]...)
+ } else { + cur := len(dc.dict) + n, err := dc.valueDecoder.Decode(dc.dict[cur : idx+1]) + if err != nil { + return err + } + dc.dict = dc.dict[:cur+n] + } + } + return nil +} + +// IsValid verifies that the set of indexes passed in are all valid indexes +// in the dictionary and if necessary decodes dictionary indexes up to the index +// requested. +func (dc *ByteArrayDictConverter) IsValid(idxes ...utils.IndexType) bool { + min, max := utils.GetMinMaxInt32(*(*[]int32)(unsafe.Pointer(&idxes))) + dc.ensure(utils.IndexType(max)) + + return min >= 0 && int(min) < len(dc.dict) && int(max) >= 0 && int(max) < len(dc.dict) +} + +// Fill populates the slice passed in entirely with the value at dictionary index indicated by val +func (dc *ByteArrayDictConverter) Fill(out interface{}, val utils.IndexType) error { + o := out.([]parquet.ByteArray) + if err := dc.ensure(val); err != nil { + return err + } + o[0] = dc.dict[val] + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } + return nil +} + +// FillZero populates the entire slice of out with the zero value for parquet.ByteArray +func (dc *ByteArrayDictConverter) FillZero(out interface{}) { + o := out.([]parquet.ByteArray) + o[0] = dc.zeroVal + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } +} + +// Copy populates the slice provided with the values in the dictionary at the indexes +// in the vals slice. +func (dc *ByteArrayDictConverter) Copy(out interface{}, vals []utils.IndexType) error { + o := out.([]parquet.ByteArray) + for idx, val := range vals { + o[idx] = dc.dict[val] + } + return nil +} + +// FixedLenByteArrayEncoder is the interface for all encoding types that implement encoding +// parquet.FixedLenByteArray values. 
+type FixedLenByteArrayEncoder interface { + TypedEncoder + Put([]parquet.FixedLenByteArray) + PutSpaced([]parquet.FixedLenByteArray, []byte, int64) +} + +// FixedLenByteArrayDecoder is the interface for all encoding types that implement decoding +// parquet.FixedLenByteArray values. +type FixedLenByteArrayDecoder interface { + TypedDecoder + Decode([]parquet.FixedLenByteArray) (int, error) + DecodeSpaced([]parquet.FixedLenByteArray, int, []byte, int64) (int, error) +} + +// the fixedLenByteArrayEncoderTraits struct is used to make it easy to create encoders and decoders based on type +type fixedLenByteArrayEncoderTraits struct{} + +// Encoder returns an encoder for fixedLenByteArray type data, using the specified encoding type and whether or not +// it should be dictionary encoded. +func (fixedLenByteArrayEncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { + if useDict { + return &DictFixedLenByteArrayEncoder{newDictEncoderBase(descr, NewBinaryDictionary(mem), mem)} + } + + switch e { + case format.Encoding_PLAIN: + return &PlainFixedLenByteArrayEncoder{encoder: newEncoderBase(e, descr, mem)} + default: + panic("unimplemented encoding type") + } +} + +// fixedLenByteArrayDecoderTraits is a helper struct for providing information regardless of the type +// and used as a generic way to create a Decoder or Dictionary Decoder for fixedLenByteArray values +type fixedLenByteArrayDecoderTraits struct{} + +// BytesRequired returns the number of bytes required to store n fixedLenByteArray values. 
+func (fixedLenByteArrayDecoderTraits) BytesRequired(n int) int { + return parquet.FixedLenByteArrayTraits.BytesRequired(n) +} + +// Decoder returns a decoder for fixedLenByteArray typed data of the requested encoding type if available +func (fixedLenByteArrayDecoderTraits) Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder { + if useDict { + return &DictFixedLenByteArrayDecoder{dictDecoder{decoder: newDecoderBase(format.Encoding_RLE_DICTIONARY, descr), mem: mem}} + } + + switch e { + case parquet.Encodings.Plain: + return &PlainFixedLenByteArrayDecoder{decoder: newDecoderBase(format.Encoding(e), descr)} + default: + panic("unimplemented encoding type") + } +} + +// DictFixedLenByteArrayEncoder is an encoder for parquet.FixedLenByteArray data using dictionary encoding +type DictFixedLenByteArrayEncoder struct { + dictEncoder +} + +// Type returns the underlying physical type that can be encoded with this encoder +func (enc *DictFixedLenByteArrayEncoder) Type() parquet.Type { + return parquet.Types.FixedLenByteArray +} + +// DictFixedLenByteArrayDecoder is a decoder for decoding dictionary encoded data for parquet.FixedLenByteArray columns +type DictFixedLenByteArrayDecoder struct { + dictDecoder +} + +// Type returns the underlying physical type that can be decoded with this decoder +func (DictFixedLenByteArrayDecoder) Type() parquet.Type { + return parquet.Types.FixedLenByteArray +} + +// Decode populates the passed in slice with min(len(out), remaining values) values, +// decoding using the dictionary to get the actual values. Returns the number of values +// actually decoded and any error encountered.
+func (d *DictFixedLenByteArrayDecoder) Decode(out []parquet.FixedLenByteArray) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decode(out[:vals]) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict eof exception") + } + d.nvals -= vals + return vals, nil +} + +// DecodeSpaced is like Decode but will space out the data leaving slots for null values +// based on the provided bitmap. +func (d *DictFixedLenByteArrayDecoder) DecodeSpaced(out []parquet.FixedLenByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict spaced eof exception") + } + d.nvals -= vals + return vals, nil +} + +// FixedLenByteArrayDictConverter is a helper for dictionary handling which is used for converting +// run length encoded indexes into the actual values that are stored in the dictionary index page. +type FixedLenByteArrayDictConverter struct { + valueDecoder FixedLenByteArrayDecoder + dict []parquet.FixedLenByteArray + zeroVal parquet.FixedLenByteArray +} + +// ensure validates that we've decoded dictionary values up to the index +// provided so that we don't need to decode the entire dictionary at start. +func (dc *FixedLenByteArrayDictConverter) ensure(idx utils.IndexType) error { + if len(dc.dict) <= int(idx) { + if cap(dc.dict) <= int(idx) { + val := make([]parquet.FixedLenByteArray, int(idx+1)-len(dc.dict)) + n, err := dc.valueDecoder.Decode(val) + if err != nil { + return err + } + dc.dict = append(dc.dict, val[:n]...)
+ } else { + cur := len(dc.dict) + n, err := dc.valueDecoder.Decode(dc.dict[cur : idx+1]) + if err != nil { + return err + } + dc.dict = dc.dict[:cur+n] + } + } + return nil +} + +// IsValid verifies that the set of indexes passed in are all valid indexes +// in the dictionary and if necessary decodes dictionary indexes up to the index +// requested. +func (dc *FixedLenByteArrayDictConverter) IsValid(idxes ...utils.IndexType) bool { + min, max := utils.GetMinMaxInt32(*(*[]int32)(unsafe.Pointer(&idxes))) + dc.ensure(utils.IndexType(max)) + + return min >= 0 && int(min) < len(dc.dict) && int(max) >= 0 && int(max) < len(dc.dict) +} + +// Fill populates the slice passed in entirely with the value at dictionary index indicated by val +func (dc *FixedLenByteArrayDictConverter) Fill(out interface{}, val utils.IndexType) error { + o := out.([]parquet.FixedLenByteArray) + if err := dc.ensure(val); err != nil { + return err + } + o[0] = dc.dict[val] + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } + return nil +} + +// FillZero populates the entire slice of out with the zero value for parquet.FixedLenByteArray +func (dc *FixedLenByteArrayDictConverter) FillZero(out interface{}) { + o := out.([]parquet.FixedLenByteArray) + o[0] = dc.zeroVal + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } +} + +// Copy populates the slice provided with the values in the dictionary at the indexes +// in the vals slice. +func (dc *FixedLenByteArrayDictConverter) Copy(out interface{}, vals []utils.IndexType) error { + o := out.([]parquet.FixedLenByteArray) + for idx, val := range vals { + o[idx] = dc.dict[val] + } + return nil +} + +// NewDictConverter creates a dict converter of the appropriate type, using the passed in +// decoder as the decoder to decode the dictionary index. 
+func NewDictConverter(dict TypedDecoder) utils.DictionaryConverter { + switch dict.Type() { + case parquet.Types.Int32: + return &Int32DictConverter{valueDecoder: dict.(Int32Decoder), dict: make([]int32, 0, dict.ValuesLeft())} + case parquet.Types.Int64: + return &Int64DictConverter{valueDecoder: dict.(Int64Decoder), dict: make([]int64, 0, dict.ValuesLeft())} + case parquet.Types.Float: + return &Float32DictConverter{valueDecoder: dict.(Float32Decoder), dict: make([]float32, 0, dict.ValuesLeft())} + case parquet.Types.Double: + return &Float64DictConverter{valueDecoder: dict.(Float64Decoder), dict: make([]float64, 0, dict.ValuesLeft())} + case parquet.Types.ByteArray: + return &ByteArrayDictConverter{valueDecoder: dict.(ByteArrayDecoder), dict: make([]parquet.ByteArray, 0, dict.ValuesLeft())} + case parquet.Types.FixedLenByteArray: + return &FixedLenByteArrayDictConverter{valueDecoder: dict.(FixedLenByteArrayDecoder), dict: make([]parquet.FixedLenByteArray, 0, dict.ValuesLeft())} + default: + return nil + } +} + +// helper function to get encoding traits object for the physical type indicated +func getEncodingTraits(t parquet.Type) EncoderTraits { + switch t { + case parquet.Types.Int32: + return Int32EncoderTraits + case parquet.Types.Int64: + return Int64EncoderTraits + case parquet.Types.Int96: + return Int96EncoderTraits + case parquet.Types.Float: + return Float32EncoderTraits + case parquet.Types.Double: + return Float64EncoderTraits + case parquet.Types.Boolean: + return BooleanEncoderTraits + case parquet.Types.ByteArray: + return ByteArrayEncoderTraits + case parquet.Types.FixedLenByteArray: + return FixedLenByteArrayEncoderTraits + default: + return nil + } +} + +// helper function to get decoding traits object for the physical type indicated +func getDecodingTraits(t parquet.Type) DecoderTraits { + switch t { + case parquet.Types.Int32: + return Int32DecoderTraits + case parquet.Types.Int64: + return Int64DecoderTraits + case parquet.Types.Int96: + 
return Int96DecoderTraits + case parquet.Types.Float: + return Float32DecoderTraits + case parquet.Types.Double: + return Float64DecoderTraits + case parquet.Types.Boolean: + return BooleanDecoderTraits + case parquet.Types.ByteArray: + return ByteArrayDecoderTraits + case parquet.Types.FixedLenByteArray: + return FixedLenByteArrayDecoderTraits + default: + return nil + } +} diff --git a/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl b/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl new file mode 100644 index 00000000000..0667143ac07 --- /dev/null +++ b/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl @@ -0,0 +1,341 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/schema" + format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/parquet/internal/utils" +) + +// fully typed encoder interfaces to enable writing against encoder/decoders +// without having to care about what encoding type is actually being used. 
+ +var ( +{{range .In}} + {{.Name}}EncoderTraits {{.lower}}EncoderTraits + {{.Name}}DecoderTraits {{.lower}}DecoderTraits +{{- end}} +) + +{{range .In}} +// {{.Name}}Encoder is the interface for all encoding types that implement encoding +// {{.name}} values. +type {{.Name}}Encoder interface { + TypedEncoder + Put([]{{.name}}) + PutSpaced([]{{.name}}, []byte, int64) +} + +// {{.Name}}Decoder is the interface for all encoding types that implement decoding +// {{.name}} values. +type {{.Name}}Decoder interface { + TypedDecoder + Decode([]{{.name}}) (int, error) + DecodeSpaced([]{{.name}}, int, []byte, int64) (int, error) +} + +// the {{.lower}}EncoderTraits struct is used to make it easy to create encoders and decoders based on type +type {{.lower}}EncoderTraits struct{} + +// Encoder returns an encoder for {{.lower}} type data, using the specified encoding type and whether or not +// it should be dictionary encoded. +{{- if or (eq .Name "Boolean") (eq .Name "Int96")}} +// dictionary encoding does not exist for this type and Encoder will panic if useDict is true +{{- end }} +func ({{.lower}}EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { + if useDict { +{{- if or (eq .Name "Boolean") (eq .Name "Int96")}} + panic("parquet: no {{.name}} dictionary encoding") +{{- else}} + return &Dict{{.Name}}Encoder{newDictEncoderBase(descr, New{{if and (ne .Name "ByteArray") (ne .Name "FixedLenByteArray")}}{{.Name}}Dictionary(){{else}}BinaryDictionary(mem){{end}}, mem)} +{{- end}} + } + + switch e { + case format.Encoding_PLAIN: + return &Plain{{.Name}}Encoder{encoder: newEncoderBase(e, descr, mem)} +{{- if or (eq .Name "Int32") (eq .Name "Int64")}} + case format.Encoding_DELTA_BINARY_PACKED: + return DeltaBitPack{{.Name}}Encoder{&deltaBitPackEncoder{ + encoder: newEncoderBase(e, descr, mem)}} +{{- end}} +{{- if eq .Name "ByteArray"}} + case format.Encoding_DELTA_LENGTH_BYTE_ARRAY: + return 
&DeltaLengthByteArrayEncoder{ + encoder: newEncoderBase(e, descr, mem), + lengthEncoder: &DeltaBitPackInt32Encoder{ + &deltaBitPackEncoder{encoder: newEncoderBase(e, descr, mem)}}, + } + case format.Encoding_DELTA_BYTE_ARRAY: + return &DeltaByteArrayEncoder{ + encoder: newEncoderBase(e, descr, mem), + } +{{- end}} + default: + panic("unimplemented encoding type") + } +} + +// {{.lower}}DecoderTraits is a helper struct for providing information regardless of the type +// and used as a generic way to create a Decoder or Dictionary Decoder for {{.lower}} values +type {{.lower}}DecoderTraits struct{} + +// BytesRequired returns the number of bytes required to store n {{.lower}} values. +func ({{.lower}}DecoderTraits) BytesRequired(n int) int { + return {{.prefix}}.{{.Name}}Traits.BytesRequired(n) +} + +// Decoder returns a decoder for {{.lower}} typed data of the requested encoding type if available +func ({{.lower}}DecoderTraits) Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder { + if useDict { +{{- if and (ne .Name "Boolean") (ne .Name "Int96")}} + return &Dict{{.Name}}Decoder{dictDecoder{decoder: newDecoderBase(format.Encoding_RLE_DICTIONARY, descr), mem: mem}} +{{- else}} + panic("dictionary decoding unimplemented for {{.lower}}") +{{- end}} + } + + switch e { + case parquet.Encodings.Plain: + return &Plain{{.Name}}Decoder{decoder: newDecoderBase(format.Encoding(e), descr)} +{{- if or (eq .Name "Int32") (eq .Name "Int64")}} + case parquet.Encodings.DeltaBinaryPacked: + if mem == nil { + mem = memory.DefaultAllocator + } + return &DeltaBitPack{{.Name}}Decoder{ + deltaBitPackDecoder: &deltaBitPackDecoder{ + decoder: newDecoderBase(format.Encoding(e), descr), + mem: mem, + }} +{{- end}} +{{- if eq .Name "ByteArray"}} + case parquet.Encodings.DeltaLengthByteArray: + if mem == nil { + mem = memory.DefaultAllocator + } + return &DeltaLengthByteArrayDecoder{ + decoder: newDecoderBase(format.Encoding(e), descr), + mem: mem, 
+ } + case parquet.Encodings.DeltaByteArray: + if mem == nil { + mem = memory.DefaultAllocator + } + return &DeltaByteArrayDecoder{ + DeltaLengthByteArrayDecoder: &DeltaLengthByteArrayDecoder{ + decoder: newDecoderBase(format.Encoding(e), descr), + mem: mem, + }} +{{- end}} + default: + panic("unimplemented encoding type") + } +} + +{{if and (ne .Name "Boolean") (ne .Name "Int96")}} +// Dict{{.Name}}Encoder is an encoder for {{.name}} data using dictionary encoding +type Dict{{.Name}}Encoder struct { + dictEncoder +} + +// Type returns the underlying physical type that can be encoded with this encoder +func (enc *Dict{{.Name}}Encoder) Type() parquet.Type { + return parquet.Types.{{if .physical}}{{.physical}}{{else}}{{.Name}}{{end}} +} + +{{if and (ne .Name "ByteArray") (ne .Name "FixedLenByteArray")}} +// WriteDict populates the byte slice with the dictionary index +func (enc *Dict{{.Name}}Encoder) WriteDict(out []byte) { + enc.memo.CopyValues({{.prefix}}.{{.Name}}Traits.CastFromBytes(out)) +} + +// Put encodes the values passed in, adding to the index as needed. +func (enc *Dict{{.Name}}Encoder) Put(in []{{.name}}) { + for _, val := range in { + enc.dictEncoder.Put(val) + } +} + +// PutSpaced is the same as Put but for when the data being encoded has slots open for +// null values, using the bitmap provided to skip values as needed. 
+func (enc *Dict{{.Name}}Encoder) PutSpaced(in []{{.name}}, validBits []byte, validBitsOffset int64) { + utils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error { + for i := int64(0); i < length; i++ { + enc.dictEncoder.Put(in[i+pos]) + } + return nil + }) +} +{{end}} + +// Dict{{.Name}}Decoder is a decoder for decoding dictionary encoded data for {{.name}} columns +type Dict{{.Name}}Decoder struct { + dictDecoder +} + +// Type returns the underlying physical type that can be decoded with this decoder +func (Dict{{.Name}}Decoder) Type() parquet.Type { + return parquet.Types.{{if .physical}}{{.physical}}{{else}}{{.Name}}{{end}} +} + +// Decode populates the passed in slice with min(len(out), remaining values) values, +// decoding using the dictionary to get the actual values. Returns the number of values +// actually decoded and any error encountered. +func (d *Dict{{.Name}}Decoder) Decode(out []{{.name}}) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decode(out[:vals]) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict eof exception") + } + d.nvals -= vals + return vals, nil +} + +// DecodeSpaced is like Decode but will space out the data leaving slots for null values +// based on the provided bitmap. +func (d *Dict{{.Name}}Decoder) DecodeSpaced(out []{{.name}}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { + vals := utils.MinInt(len(out), d.nvals) + decoded, err := d.decodeSpaced(out[:vals], nullCount, validBits, validBitsOffset) + if err != nil { + return decoded, err + } + if vals != decoded { + return decoded, xerrors.New("parquet: dict spaced eof exception") + } + d.nvals -= vals + return vals, nil +} + +// {{.Name}}DictConverter is a helper for dictionary handling which is used for converting +// run length encoded indexes into the actual values that are stored in the dictionary index page.
+type {{.Name}}DictConverter struct { + valueDecoder {{.Name}}Decoder + dict []{{.name}} + zeroVal {{.name}} +} + +// ensure validates that we've decoded dictionary values up to the index +// provided so that we don't need to decode the entire dictionary at start. +func (dc *{{.Name}}DictConverter) ensure(idx utils.IndexType) error { + if len(dc.dict) <= int(idx) { + if cap(dc.dict) <= int(idx) { + val := make([]{{.name}}, int(idx+1)-len(dc.dict)) + n, err := dc.valueDecoder.Decode(val) + if err != nil { + return err + } + dc.dict = append(dc.dict, val[:n]...) + } else { + cur := len(dc.dict) + n, err := dc.valueDecoder.Decode(dc.dict[cur : idx+1]) + if err != nil { + return err + } + dc.dict = dc.dict[:cur+n] + } + } + return nil +} + +// IsValid verifies that the set of indexes passed in are all valid indexes +// in the dictionary and if necessary decodes dictionary indexes up to the index +// requested. +func (dc *{{.Name}}DictConverter) IsValid(idxes ...utils.IndexType) bool { + min, max := utils.GetMinMaxInt32(*(*[]int32)(unsafe.Pointer(&idxes))) + dc.ensure(utils.IndexType(max)) + + return min >= 0 && int(min) < len(dc.dict) && int(max) >= 0 && int(max) < len(dc.dict) +} + +// Fill populates the slice passed in entirely with the value at dictionary index indicated by val +func (dc *{{.Name}}DictConverter) Fill(out interface{}, val utils.IndexType) error { + o := out.([]{{.name}}) + if err := dc.ensure(val); err != nil { + return err + } + o[0] = dc.dict[val] + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } + return nil +} + +// FillZero populates the entire slice of out with the zero value for {{.name}} +func (dc *{{.Name}}DictConverter) FillZero(out interface{}) { + o := out.([]{{.name}}) + o[0] = dc.zeroVal + for i := 1; i < len(o); i *= 2 { + copy(o[i:], o[:i]) + } +} + +// Copy populates the slice provided with the values in the dictionary at the indexes +// in the vals slice. 
+func (dc *{{.Name}}DictConverter) Copy(out interface{}, vals []utils.IndexType) error { + o := out.([]{{.name}}) + for idx, val := range vals { + o[idx] = dc.dict[val] + } + return nil +} +{{end}} + +{{end}} + +// NewDictConverter creates a dict converter of the appropriate type, using the passed in +// decoder as the decoder to decode the dictionary index. +func NewDictConverter(dict TypedDecoder) utils.DictionaryConverter { + switch dict.Type() { + {{ range .In }}{{ if and (ne .Name "Boolean") (ne .Name "Int96") -}} + case parquet.Types.{{if .physical }}{{.physical}}{{else}}{{.Name}}{{end}}: + return &{{.Name}}DictConverter{valueDecoder: dict.({{.Name}}Decoder), dict: make([]{{.name}}, 0, dict.ValuesLeft())} + {{ end }}{{ end -}} + default: + return nil + } +} + +// helper function to get encoding traits object for the physical type indicated +func getEncodingTraits(t parquet.Type) EncoderTraits { + switch t { + {{ range .In -}} + case parquet.Types.{{if .physical}}{{.physical}}{{else}}{{.Name}}{{end}}: + return {{.Name}}EncoderTraits + {{ end -}} + default: + return nil + } +} + +// helper function to get decoding traits object for the physical type indicated +func getDecodingTraits(t parquet.Type) DecoderTraits { + switch t { + {{ range .In -}} + case parquet.Types.{{if .physical}}{{.physical}}{{else}}{{.Name}}{{end}}: + return {{.Name}}DecoderTraits + {{ end -}} + default: + return nil + } +} diff --git a/go/parquet/internal/encoding/types.go b/go/parquet/internal/encoding/types.go new file mode 100644 index 00000000000..21988057226 --- /dev/null +++ b/go/parquet/internal/encoding/types.go @@ -0,0 +1,437 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package encoding + +import ( + "io" + "sync" + + "github.com/apache/arrow/go/arrow/bitutil" + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/utils" + "golang.org/x/xerrors" +) + +// TypedDecoder is the general interface for all decoder types which can +// then be type asserted to a specific Type Decoder +type TypedDecoder interface { + // SetData updates the data in the decoder with the passed in byte slice and the + // stated number of values as expected to be decoded. + SetData(buffered int, buf []byte) + // Encoding returns the encoding type that this decoder decodes data of + Encoding() parquet.Encoding + // ValuesLeft returns the number of remaining values to be decoded + ValuesLeft() int + // Type returns the physical type this can decode. 
+ Type() parquet.Type +} + +// DictDecoder is a special TypedDecoder which implements dictionary decoding +type DictDecoder interface { + TypedDecoder + // SetDict takes in a decoder which can decode the dictionary index to be used + SetDict(TypedDecoder) +} + +// TypedEncoder is the general interface for all encoding types which +// can then be type asserted to a specific Type Encoder +type TypedEncoder interface { + // Bytes returns the current slice of bytes that have been encoded but does not pass ownership + Bytes() []byte + // Reset resets the encoder and dumps all the data to let it be reused. + Reset() + // ReserveForWrite reserves n bytes in the buffer so that the next n bytes written will not + // cause a memory allocation. + ReserveForWrite(n int) + // EstimatedDataEncodedSize returns the estimated number of bytes in the buffer + // so far. + EstimatedDataEncodedSize() int64 + // FlushValues finishes up any unwritten data and returns the buffer of data passing + // ownership to the caller, Release needs to be called on the Buffer to free the memory + FlushValues() Buffer + // Encoding returns the type of encoding that this encoder operates with + Encoding() parquet.Encoding + // Allocator returns the allocator that was used when creating this encoder + Allocator() memory.Allocator + // Type returns the underlying physical type this encodes. + Type() parquet.Type +} + +// DictEncoder is a special kind of TypedEncoder which implements Dictionary +// encoding. +type DictEncoder interface { + TypedEncoder + // WriteIndices populates the byte slice with the final indexes of data and returns + // the number of bytes written + WriteIndices(out []byte) int + // DictEncodedSize returns the current size of the encoded dictionary index. + DictEncodedSize() int + // BitWidth returns the bitwidth needed to encode all of the index values based + // on the number of values in the dictionary index. 
+ BitWidth() int + // WriteDict populates out with the dictionary index values, out should be sized to at least + // as many bytes as DictEncodedSize + WriteDict(out []byte) + // NumEntries returns the number of values currently in the dictionary index. + NumEntries() int +} + +var bufferPool = sync.Pool{ + New: func() interface{} { + return memory.NewResizableBuffer(memory.DefaultAllocator) + }, +} + +// Buffer is an interface used as a general interface for handling buffers +// regardless of the underlying implementation. +type Buffer interface { + Len() int + Buf() []byte + Bytes() []byte + Resize(int) + Release() +} + +// poolBuffer is a buffer that will release the allocated buffer to a pool +// of buffers when release is called in order to allow it to be reused to +// cut down on the number of allocations. +type poolBuffer struct { + buf *memory.Buffer +} + +func (p poolBuffer) Resize(n int) { p.buf.ResizeNoShrink(n) } + +func (p poolBuffer) Len() int { return p.buf.Len() } + +func (p poolBuffer) Bytes() []byte { return p.buf.Bytes() } + +func (p poolBuffer) Buf() []byte { return p.buf.Buf() } + +func (p poolBuffer) Release() { + if p.buf.Mutable() { + memory.Set(p.buf.Buf(), 0) + p.buf.ResizeNoShrink(0) + bufferPool.Put(p.buf) + return + } + + p.buf.Release() +} + +// PooledBufferWriter uses buffers from the buffer pool to back it while +// implementing io.Writer and io.WriterAt interfaces +type PooledBufferWriter struct { + buf *memory.Buffer + pos int + offset int +} + +// NewPooledBufferWriter returns a new buffer with 'initial' bytes reserved +// and pre-allocated to guarantee that writing that many more bytes will not +// require another allocation. 
+func NewPooledBufferWriter(initial int) *PooledBufferWriter { + ret := &PooledBufferWriter{} + ret.Reserve(initial) + return ret +} + +// SetOffset sets an offset in the buffer which will ensure that all references +// to offsets and sizes in the buffer will be offset by this many bytes, allowing +// the writer to reserve space in the buffer. +func (b *PooledBufferWriter) SetOffset(offset int) { + b.pos -= b.offset + b.offset = offset + b.pos += offset +} + +// Reserve pre-allocates nbytes to ensure that the next write of that many bytes +// will not require another allocation. +func (b *PooledBufferWriter) Reserve(nbytes int) { + if b.buf == nil { + b.buf = bufferPool.Get().(*memory.Buffer) + } + + newCap := utils.MaxInt(b.buf.Cap()+b.offset, 256) + for newCap < b.pos+nbytes { + newCap = bitutil.NextPowerOf2(newCap) + } + b.buf.Reserve(newCap) +} + +// Reset will release any current memory and initialize it with the new +// allocated bytes. +func (b *PooledBufferWriter) Reset(initial int) { + if b.buf != nil { + memory.Set(b.buf.Buf(), 0) + b.buf.ResizeNoShrink(0) + bufferPool.Put(b.buf) + b.buf = nil + } + + b.pos = 0 + b.offset = 0 + b.Reserve(initial) +} + +// Finish returns the current buffer, with the responsibility for releasing +// the memory on the caller, resetting this writer to be re-used +func (b *PooledBufferWriter) Finish() Buffer { + if b.buf.Len() < b.pos { + b.buf.ResizeNoShrink(b.pos) + } + buf := poolBuffer{b.buf} + + b.buf = nil + b.Reset(0) + return buf +} + +// WriteAt writes the bytes from p into this buffer starting at offset. +// +// Does not affect the internal position of the writer. 
+func (b *PooledBufferWriter) WriteAt(p []byte, offset int64) (n int, err error) { + if len(p) == 0 { + return 0, nil + } + offset += int64(b.offset) + need := int(offset) + len(p) + + if need >= b.buf.Cap() { + b.Reserve(need - b.pos) + } + n = copy(b.buf.Buf()[offset:], p) + + if need > b.buf.Len() { + b.buf.ResizeNoShrink(need) + } + return +} + +func (b *PooledBufferWriter) Write(buf []byte) (int, error) { + if len(buf) == 0 { + return 0, nil + } + b.Reserve(len(buf)) + return b.UnsafeWrite(buf) +} + +func (b *PooledBufferWriter) UnsafeWriteCopy(ncopies int, pattern []byte) (int, error) { + nbytes := len(pattern) * ncopies + slc := b.buf.Buf()[b.pos : b.pos+nbytes] + copy(slc, pattern) + for j := len(pattern); j < len(slc); j *= 2 { + copy(slc[j:], slc[:j]) + } + b.pos += nbytes + return nbytes, nil +} + +// UnsafeWrite does not check the capacity / length before writing. +func (b *PooledBufferWriter) UnsafeWrite(buf []byte) (n int, err error) { + n = copy(b.buf.Buf()[b.pos:], buf) + b.pos += n + return +} + +func (b *PooledBufferWriter) Tell() int64 { + return int64(b.pos) +} + +// Bytes returns the current bytes slice of slice Len +func (b *PooledBufferWriter) Bytes() []byte { + if b.buf.Len() < b.pos { + b.buf.ResizeNoShrink(b.pos) + } + return b.buf.Bytes()[b.offset:] +} + +// Len provides the current Length of the byte slice +func (b *PooledBufferWriter) Len() int { + if b.buf.Len() < b.pos { + b.buf.ResizeNoShrink(b.pos) + } + return b.buf.Len() - b.offset +} + +// BufferWriter is a utility class for building and writing to a memory.Buffer +// with a given allocator that fulfills the interfaces io.Write, io.WriteAt +// and io.Seeker, while providing the ability to pre-allocate memory. +type BufferWriter struct { + buffer *memory.Buffer + pos int + mem memory.Allocator + + offset int +} + +// NewBufferWriterFromBuffer wraps the provided buffer to allow it to fulfill these +// interfaces. 
+func NewBufferWriterFromBuffer(b *memory.Buffer, mem memory.Allocator) *BufferWriter { + return &BufferWriter{b, 0, mem, 0} +} + +// NewBufferWriter constructs a buffer with initially reserved/allocated memory. +func NewBufferWriter(initial int, mem memory.Allocator) *BufferWriter { + buf := memory.NewResizableBuffer(mem) + buf.Reserve(initial) + return &BufferWriter{buffer: buf, mem: mem} +} + +func (b *BufferWriter) SetOffset(offset int) { + b.offset = offset +} + +// Bytes returns the current bytes slice of slice Len +func (b *BufferWriter) Bytes() []byte { + return b.buffer.Bytes()[b.offset:] +} + +// Len provides the current Length of the byte slice +func (b *BufferWriter) Len() int { + return b.buffer.Len() - b.offset +} + +// Cap returns the current capacity of the underlying buffer +func (b *BufferWriter) Cap() int { + return b.buffer.Cap() - b.offset +} + +// Finish returns the current buffer, with the responsibility for releasing +// the memory on the caller, resetting this writer to be re-used +func (b *BufferWriter) Finish() *memory.Buffer { + buf := b.buffer + b.buffer = nil + b.Reset(0) + return buf +} + +func (b *BufferWriter) Truncate() { + b.pos = 0 + b.offset = 0 + + if b.buffer == nil { + b.Reserve(1024) + } else { + b.buffer.ResizeNoShrink(0) + } +} + +// Reset will release any current memory and initialize it with the new +// allocated bytes. 
+func (b *BufferWriter) Reset(initial int) { + if b.buffer != nil { + b.buffer.Release() + } + + b.pos = 0 + b.offset = 0 + b.Reserve(initial) +} + +// Reserve ensures that there is at least enough capacity to write nbytes +// without another allocation, may allocate more than that in order to +// efficiently reduce allocations +func (b *BufferWriter) Reserve(nbytes int) { + if b.buffer == nil { + b.buffer = memory.NewResizableBuffer(b.mem) + } + newCap := utils.MaxInt(b.buffer.Cap()+b.offset, 256) + for newCap < b.pos+nbytes+b.offset { + newCap = bitutil.NextPowerOf2(newCap) + } + b.buffer.Reserve(newCap) +} + +// WriteAt writes the bytes from p into this buffer starting at offset. +// +// Does not affect the internal position of the writer. +func (b *BufferWriter) WriteAt(p []byte, offset int64) (n int, err error) { + if len(p) == 0 { + return 0, nil + } + offset += int64(b.offset) + need := int(offset) + len(p) + + if need >= b.buffer.Cap() { + b.Reserve(need - b.pos) + } + copy(b.buffer.Buf()[offset:], p) + + if need > b.buffer.Len() { + b.buffer.ResizeNoShrink(need) + } + return len(p), nil +} + +func (b *BufferWriter) Write(buf []byte) (int, error) { + if len(buf) == 0 { + return 0, nil + } + if b.buffer == nil { + b.Reserve(len(buf)) + } + + if b.pos+b.offset+len(buf) >= b.buffer.Cap() { + b.Reserve(len(buf)) + } + return b.UnsafeWrite(buf) +} + +func (b *BufferWriter) UnsafeWriteCopy(ncopies int, pattern []byte) (int, error) { + nbytes := len(pattern) * ncopies + slc := b.buffer.Buf()[b.pos : b.pos+nbytes] + copy(slc, pattern) + for j := len(pattern); j < len(slc); j *= 2 { + copy(slc[j:], slc[:j]) + } + b.pos += nbytes + b.buffer.ResizeNoShrink(b.pos) + return nbytes, nil +} + +// UnsafeWrite does not check the capacity / length before writing. 
+func (b *BufferWriter) UnsafeWrite(buf []byte) (int, error) { + copy(b.buffer.Buf()[b.pos+b.offset:], buf) + b.pos += len(buf) + b.buffer.ResizeNoShrink(b.pos) + return len(buf), nil +} + +// Seek fulfills the io.Seeker interface returning it's new position +// whence must be io.SeekStart, io.SeekCurrent or io.SeekEnd or it will be ignored. +func (b *BufferWriter) Seek(offset int64, whence int) (int64, error) { + newPos, offs := 0, int(offset) + offs += b.offset + switch whence { + case io.SeekStart: + newPos = offs + case io.SeekCurrent: + newPos = b.pos + offs + case io.SeekEnd: + newPos = b.buffer.Len() + offs + } + if newPos < 0 { + return 0, xerrors.New("negative result pos") + } + b.pos = newPos + return int64(newPos), nil +} + +func (b *BufferWriter) Tell() int64 { + return int64(b.pos) +} diff --git a/go/parquet/internal/hashing/hashing_test.go b/go/parquet/internal/hashing/hashing_test.go new file mode 100644 index 00000000000..875424a9d49 --- /dev/null +++ b/go/parquet/internal/hashing/hashing_test.go @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package hashing + +import ( + "math/rand" + "testing" + + "github.com/stretchr/testify/assert" +) + +func MakeDistinctIntegers(nvals int) map[int]bool { + r := rand.New(rand.NewSource(42)) + values := make(map[int]bool) + for len(values) < nvals { + values[r.Int()] = true + } + return values +} + +func MakeSequentialIntegers(nvals int) map[int]bool { + values := make(map[int]bool) + for i := 0; i < nvals; i++ { + values[i] = true + } + return values +} + +func MakeDistinctStrings(nvals int) map[string]bool { + values := make(map[string]bool) + + r := rand.New(rand.NewSource(42)) + + max := 'z' + min := '0' + for len(values) < nvals { + data := make([]byte, r.Intn(24)) + for idx := range data { + data[idx] = byte(r.Intn(int(max-min+1)) + int(min)) + } + values[string(data)] = true + } + return values +} + +func TestHashingQualityInt(t *testing.T) { + const nvalues = 10000 + + tests := []struct { + name string + values map[int]bool + quality float64 + }{ + {"distinct", MakeDistinctIntegers(nvalues), 0.96}, + {"sequential", MakeSequentialIntegers(nvalues), 0.96}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + hashes := make(map[uint64]bool) + for k := range tt.values { + hashes[hashInt(uint64(k), 0)] = true + hashes[hashInt(uint64(k), 1)] = true + } + assert.GreaterOrEqual(t, float64(len(hashes)), tt.quality*float64(2*len(tt.values))) + }) + } +} + +func TestHashingBoundsStrings(t *testing.T) { + sizes := []int{1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, 18, 19, 20, 21} + for _, s := range sizes { + str := make([]byte, s) + for idx := range str { + str[idx] = uint8(idx) + } + + h := hash(str, 1) + diff := 0 + for i := 0; i < 120; i++ { + str[len(str)-1] = uint8(i) + if hash(str, 1) != h { + diff++ + } + } + assert.GreaterOrEqual(t, diff, 118) + } +} + +func TestHashingQualityString(t *testing.T) { + const nvalues = 10000 + values := MakeDistinctStrings(nvalues) + + hashes := make(map[uint64]bool) + for k := range values { + hashes[hashString(k, 
0)] = true + hashes[hashString(k, 1)] = true + } + assert.GreaterOrEqual(t, float64(len(hashes)), 0.96*float64(2*len(values))) +} diff --git a/go/parquet/internal/hashing/types.tmpldata b/go/parquet/internal/hashing/types.tmpldata new file mode 100644 index 00000000000..2e97e9814e0 --- /dev/null +++ b/go/parquet/internal/hashing/types.tmpldata @@ -0,0 +1,18 @@ +[ + { + "Name": "Int32", + "name": "int32" + }, + { + "Name": "Int64", + "name": "int64" + }, + { + "Name": "Float32", + "name": "float32" + }, + { + "Name": "Float64", + "name": "float64" + } +] diff --git a/go/parquet/internal/hashing/xxh3_memo_table.gen.go b/go/parquet/internal/hashing/xxh3_memo_table.gen.go new file mode 100644 index 00000000000..1f37d180803 --- /dev/null +++ b/go/parquet/internal/hashing/xxh3_memo_table.gen.go @@ -0,0 +1,1009 @@ +// Code generated by xxh3_memo_table.gen.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package hashing + +import ( + "math" + + "github.com/apache/arrow/go/arrow/bitutil" +) + +type payloadInt32 struct { + val int32 + memoIdx int32 +} + +type entryInt32 struct { + h uint64 + payload payloadInt32 +} + +func (e entryInt32) Valid() bool { return e.h != sentinel } + +// Int32HashTable is a hashtable specifically for int32 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Int32HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryInt32 +} + +// NewInt32HashTable returns a new hash table for int32 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewInt32HashTable(cap uint64) *Int32HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Int32HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryInt32, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. +func (h *Int32HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryInt32, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Int32HashTable) CopyValues(out []int32) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. 
+func (h *Int32HashTable) CopyValuesSubset(start int, out []int32) { + h.VisitEntries(func(e *entryInt32) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Int32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Int32HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. +func (h *Int32HashTable) Lookup(v uint64, cmp func(int32) bool) (*entryInt32, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Int32HashTable) lookup(v uint64, szMask uint64, cmp func(int32) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryInt32 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Int32HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryInt32, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(int32) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo 
index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Int32HashTable) Insert(e *entryInt32, v uint64, val int32, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. +func (h *Int32HashTable) VisitEntries(visit func(*entryInt32)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Int32MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Int32MemoTable struct { + tbl *Int32HashTable + nullIdx int32 +} + +// NewInt32MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewInt32MemoTable(num int64) *Int32MemoTable { + return &Int32MemoTable{tbl: NewInt32HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Int32MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Int32MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Int32MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. 
The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Int32MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *Int32MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Int32MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]int32)) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Int32MemoTable) Get(val interface{}) (int, bool) { + + h := hashInt(uint64(val.(int32)), 0) + if e, ok := s.tbl.Lookup(h, func(v int32) bool { return val.(int32) == v }); ok { + + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). 
+func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + h := hashInt(uint64(val.(int32)), 0) + e, ok := s.tbl.Lookup(h, func(v int32) bool { + return val.(int32) == v + }) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(int32), int32(idx)) + } + return +} + +type payloadInt64 struct { + val int64 + memoIdx int32 +} + +type entryInt64 struct { + h uint64 + payload payloadInt64 +} + +func (e entryInt64) Valid() bool { return e.h != sentinel } + +// Int64HashTable is a hashtable specifically for int64 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Int64HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryInt64 +} + +// NewInt64HashTable returns a new hash table for int64 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewInt64HashTable(cap uint64) *Int64HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Int64HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryInt64, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. +func (h *Int64HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryInt64, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Int64HashTable) CopyValues(out []int64) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. 
+func (h *Int64HashTable) CopyValuesSubset(start int, out []int64) { + h.VisitEntries(func(e *entryInt64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Int64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Int64HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. +func (h *Int64HashTable) Lookup(v uint64, cmp func(int64) bool) (*entryInt64, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Int64HashTable) lookup(v uint64, szMask uint64, cmp func(int64) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryInt64 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Int64HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryInt64, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(int64) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo 
index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Int64HashTable) Insert(e *entryInt64, v uint64, val int64, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. +func (h *Int64HashTable) VisitEntries(visit func(*entryInt64)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Int64MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Int64MemoTable struct { + tbl *Int64HashTable + nullIdx int32 +} + +// NewInt64MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewInt64MemoTable(num int64) *Int64MemoTable { + return &Int64MemoTable{tbl: NewInt64HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Int64MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Int64MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Int64MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. 
The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Int64MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *Int64MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Int64MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]int64)) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Int64MemoTable) Get(val interface{}) (int, bool) { + + h := hashInt(uint64(val.(int64)), 0) + if e, ok := s.tbl.Lookup(h, func(v int64) bool { return val.(int64) == v }); ok { + + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). 
+func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + h := hashInt(uint64(val.(int64)), 0) + e, ok := s.tbl.Lookup(h, func(v int64) bool { + return val.(int64) == v + }) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(int64), int32(idx)) + } + return +} + +type payloadFloat32 struct { + val float32 + memoIdx int32 +} + +type entryFloat32 struct { + h uint64 + payload payloadFloat32 +} + +func (e entryFloat32) Valid() bool { return e.h != sentinel } + +// Float32HashTable is a hashtable specifically for float32 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Float32HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryFloat32 +} + +// NewFloat32HashTable returns a new hash table for float32 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewFloat32HashTable(cap uint64) *Float32HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Float32HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryFloat32, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
+func (h *Float32HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryFloat32, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Float32HashTable) CopyValues(out []float32) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Float32HashTable) CopyValuesSubset(start int, out []float32) { + h.VisitEntries(func(e *entryFloat32) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Float32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Float32HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
+func (h *Float32HashTable) Lookup(v uint64, cmp func(float32) bool) (*entryFloat32, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Float32HashTable) lookup(v uint64, szMask uint64, cmp func(float32) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryFloat32 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Float32HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryFloat32, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(float32) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Float32HashTable) Insert(e *entryFloat32, v uint64, val float32, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
+func (h *Float32HashTable) VisitEntries(visit func(*entryFloat32)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Float32MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Float32MemoTable struct { + tbl *Float32HashTable + nullIdx int32 +} + +// NewFloat32MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewFloat32MemoTable(num int64) *Float32MemoTable { + return &Float32MemoTable{tbl: NewFloat32HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Float32MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Float32MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Float32MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Float32MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. 
+func (s *Float32MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Float32MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]float32)) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Float32MemoTable) Get(val interface{}) (int, bool) { + + h := hashFloat32(val.(float32), 0) + var cmp func(float32) bool + + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + + } else { + cmp = func(v float32) bool { return val.(float32) == v } + } + if e, ok := s.tbl.Lookup(h, cmp); ok { + + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). 
+func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + h := hashFloat32(val.(float32), 0) + var cmp func(float32) bool + + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + + } else { + cmp = func(v float32) bool { return val.(float32) == v } + } + e, ok := s.tbl.Lookup(h, cmp) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(float32), int32(idx)) + } + return +} + +type payloadFloat64 struct { + val float64 + memoIdx int32 +} + +type entryFloat64 struct { + h uint64 + payload payloadFloat64 +} + +func (e entryFloat64) Valid() bool { return e.h != sentinel } + +// Float64HashTable is a hashtable specifically for float64 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Float64HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryFloat64 +} + +// NewFloat64HashTable returns a new hash table for float64 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewFloat64HashTable(cap uint64) *Float64HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Float64HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryFloat64, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
+func (h *Float64HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryFloat64, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Float64HashTable) CopyValues(out []float64) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Float64HashTable) CopyValuesSubset(start int, out []float64) { + h.VisitEntries(func(e *entryFloat64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Float64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Float64HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
+func (h *Float64HashTable) Lookup(v uint64, cmp func(float64) bool) (*entryFloat64, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Float64HashTable) lookup(v uint64, szMask uint64, cmp func(float64) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryFloat64 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Float64HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryFloat64, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(float64) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Float64HashTable) Insert(e *entryFloat64, v uint64, val float64, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
+func (h *Float64HashTable) VisitEntries(visit func(*entryFloat64)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Float64MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Float64MemoTable struct { + tbl *Float64HashTable + nullIdx int32 +} + +// NewFloat64MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewFloat64MemoTable(num int64) *Float64MemoTable { + return &Float64MemoTable{tbl: NewFloat64HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Float64MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Float64MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Float64MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Float64MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. 
+func (s *Float64MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Float64MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]float64)) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Float64MemoTable) Get(val interface{}) (int, bool) { + + h := hashFloat64(val.(float64), 0) + var cmp func(float64) bool + + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + + } else { + cmp = func(v float64) bool { return val.(float64) == v } + } + if e, ok := s.tbl.Lookup(h, cmp); ok { + + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). +func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + h := hashFloat64(val.(float64), 0) + var cmp func(float64) bool + + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + + } else { + cmp = func(v float64) bool { return val.(float64) == v } + } + e, ok := s.tbl.Lookup(h, cmp) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(float64), int32(idx)) + } + return +} diff --git a/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl b/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl new file mode 100644 index 00000000000..f8ec55586ae --- /dev/null +++ b/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl @@ -0,0 +1,290 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hashing + +import ( + "github.com/apache/arrow/go/arrow/bitutil" +) + +{{range .In}} +type payload{{.Name}} struct { + val {{.name}} + memoIdx int32 +} + +type entry{{.Name}} struct { + h uint64 + payload payload{{.Name}} +} + +func (e entry{{.Name}}) Valid() bool { return e.h != sentinel } + +// {{.Name}}HashTable is a hashtable specifically for {{.name}} that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type {{.Name}}HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entry{{.Name}} +} + +// New{{.Name}}HashTable returns a new hash table for {{.name}} values +// initialized with the passed in capacity or 32 whichever is larger. +func New{{.Name}}HashTable(cap uint64) *{{.Name}}HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &{{.Name}}HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entry{{.Name}}, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
func (h *{{.Name}}HashTable) Reset(cap uint64) {
	// capacity is always a power of two (minimum 32) so that capMask can be
	// used in place of a modulo when probing.
	newCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
	h.cap, h.capMask, h.size = newCap, newCap-1, 0
	h.entries = make([]entry{{.Name}}, newCap)
}

// CopyValues writes every stored value into out, placing each value at the
// position given by its memo index (i.e. insertion order).
func (h *{{.Name}}HashTable) CopyValues(out []{{.name}}) {
	h.CopyValuesSubset(0, out)
}

// CopyValuesSubset writes the stored values whose memo index is at least start
// into out, so that the value with memo index start lands in out[0].
func (h *{{.Name}}HashTable) CopyValuesSubset(start int, out []{{.name}}) {
	first := int32(start)
	copyOne := func(e *entry{{.Name}}) {
		pos := e.payload.memoIdx - first
		if pos < 0 {
			// inserted before the requested starting point
			return
		}
		out[pos] = e.payload.val
	}
	h.VisitEntries(copyOne)
}

// needUpsize reports whether the table has reached 1/loadFactor occupancy.
func (h *{{.Name}}HashTable) needUpsize() bool {
	return h.size*uint64(loadFactor) >= h.cap
}

// fixHash remaps a hash equal to the sentinel, which marks an empty slot,
// to an arbitrary non-sentinel value.
func ({{.Name}}HashTable) fixHash(v uint64) uint64 {
	if v != sentinel {
		return v
	}
	return 42
}

// Lookup retrieves the entry for a given hash value assuming its payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *{{.Name}}HashTable) Lookup(v uint64, cmp func({{.name}}) bool) (*entry{{.Name}}, bool) {
	idx, ok := h.lookup(v, h.capMask, cmp)
	return &h.entries[idx], ok
}

// lookup probes the table for hash v using szMask as the index mask. It returns the
// index of the first entry whose hash equals v and whose value satisfies cmp (true),
// or the index of the first empty slot reached by the probe sequence (false).
func (h *{{.Name}}HashTable) lookup(v uint64, szMask uint64, cmp func({{.name}}) bool) (uint64, bool) {
	const perturbShift uint8 = 5

	var (
		idx     uint64
		perturb uint64
		e       *entry{{.Name}}
	)

	v = h.fixHash(v)
	idx = v & szMask
	perturb = (v >> uint64(perturbShift)) + 1

	for {
		e = &h.entries[idx]
		if e.h == v && cmp(e.payload.val) {
			return idx, true
		}

		if e.h == sentinel {
			return idx, false
		}

		// perturbation logic inspired from CPython's set/dict object
		// the goal is that all 64 bits of unmasked hash value eventually
		// participate in the probing sequence, to minimize clustering
		idx = (idx + perturb) & szMask
		perturb = (perturb >> uint64(perturbShift)) + 1
	}
}

// upsize replaces the entries slice with one of newcap slots and re-inserts every
// valid entry using its stored hash. Pointers previously returned by Lookup refer to
// the old slice afterwards and must not be used.
func (h *{{.Name}}HashTable) upsize(newcap uint64) error {
	newMask := newcap - 1

	oldEntries := h.entries
	h.entries = make([]entry{{.Name}}, newcap)
	for _, e := range oldEntries {
		if e.Valid() {
			// the comparison always fails so that lookup yields the first free slot
			idx, _ := h.lookup(e.h, newMask, func({{.name}}) bool { return false })
			h.entries[idx] = e
		}
	}
	h.cap = newcap
	h.capMask = newMask
	return nil
}

// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
// If the insertion triggers a resize, any error from resizing is returned and e no longer
// points into the table.
func (h *{{.Name}}HashTable) Insert(e *entry{{.Name}}, v uint64, val {{.name}}, memoIdx int32) error {
	e.h = h.fixHash(v)
	e.payload.val = val
	e.payload.memoIdx = memoIdx
	h.size++

	if h.needUpsize() {
		// propagate the result instead of silently dropping it
		return h.upsize(h.cap * uint64(loadFactor) * 2)
	}
	return nil
}

// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *{{.Name}}HashTable) VisitEntries(visit func(*entry{{.Name}})) {
	// index into the slice so the callback receives a pointer to the stored entry
	// itself rather than to a per-iteration copy of it
	for i := range h.entries {
		if e := &h.entries[i]; e.Valid() {
			visit(e)
		}
	}
}

// {{.Name}}MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type {{.Name}}MemoTable struct {
	tbl *{{.Name}}HashTable
	// nullIdx is the memo index assigned to the null value, or KeyNotFound
	// if no null has been inserted.
	nullIdx int32
}

// New{{.Name}}MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func New{{.Name}}MemoTable(num int64) *{{.Name}}MemoTable {
	return &{{.Name}}MemoTable{tbl: New{{.Name}}HashTable(uint64(num)), nullIdx: KeyNotFound}
}

// Reset allows this table to be re-used by dumping all the data currently in the table.
// The underlying hash table is re-initialized with a capacity of 32.
func (s *{{.Name}}MemoTable) Reset() {
	s.tbl.Reset(32)
	s.nullIdx = KeyNotFound
}

// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *{{.Name}}MemoTable) Size() int {
	sz := int(s.tbl.size)
	if _, ok := s.GetNull(); ok {
		// the null is tracked outside of the hash table, so count it separately
		sz++
	}
	return sz
}

// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *{{.Name}}MemoTable) GetNull() (int, bool) {
	return int(s.nullIdx), s.nullIdx != KeyNotFound
}

// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *{{.Name}}MemoTable) GetOrInsertNull() (idx int, found bool) {
	idx, found = s.GetNull()
	if !found {
		// the null takes the next free memo index
		idx = s.Size()
		s.nullIdx = int32(idx)
	}
	return
}

// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *{{.Name}}MemoTable) CopyValues(out interface{}) {
	s.CopyValuesSubset(0, out)
}

// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index. out must be a []{{.name}}; any other type panics
// on the type assertion.
func (s *{{.Name}}MemoTable) CopyValuesSubset(start int, out interface{}) {
	s.tbl.CopyValuesSubset(start, out.([]{{.name}}))
}

// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not. val must be a {{.name}};
// any other type panics on the type assertion.
func (s *{{.Name}}MemoTable) Get(val interface{}) (int, bool) {
{{if or (eq .Name "Int32") (eq .Name "Int64") }}
	h := hashInt(uint64(val.({{.name}})), 0)
	if e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { return val.({{.name}}) == v }); ok {
{{ else }}
	h := hash{{.Name}}(val.({{.name}}), 0)
	// NaN never compares equal with ==, so a NaN key is matched with an
	// is-NaN predicate instead of an equality check
	var cmp func({{.name}}) bool
	{{if eq .Name "Float32"}}
	if math.IsNaN(float64(val.(float32))) {
		cmp = isNan32Cmp
	{{ else }}
	if math.IsNaN(val.(float64)) {
		cmp = math.IsNaN
	{{end}}
	} else {
		cmp = func(v {{.name}}) bool { return val.({{.name}}) == v }
	}
	if e, ok := s.tbl.Lookup(h, cmp); ok {
{{ end }}
		return int(e.payload.memoIdx), ok
	}
	return KeyNotFound, false
}

// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
+func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + {{if or (eq .Name "Int32") (eq .Name "Int64") }} + h := hashInt(uint64(val.({{.name}})), 0) + e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { + return val.({{.name}}) == v + }) +{{ else }} + h := hash{{.Name}}(val.({{.name}}), 0) + var cmp func({{.name}}) bool + {{if eq .Name "Float32"}} + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + {{ else }} + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + {{end}} + } else { + cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } + } + e, ok := s.tbl.Lookup(h, cmp) +{{ end }} + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.({{.name}}), int32(idx)) + } + return +} +{{end}} diff --git a/go/parquet/internal/hashing/xxh3_memo_table.go b/go/parquet/internal/hashing/xxh3_memo_table.go new file mode 100644 index 00000000000..dd1ee6cf58f --- /dev/null +++ b/go/parquet/internal/hashing/xxh3_memo_table.go @@ -0,0 +1,386 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package hashing provides utilities for and an implementation of a hash +// table which is more performant than the default go map implementation +// by leveraging xxh3 and some custom hash functions. +package hashing + +import ( + "bytes" + "math" + "math/bits" + "reflect" + "unsafe" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/parquet" + + "github.com/zeebo/xxh3" +) + +//go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl + +func hashInt(val uint64, alg uint64) uint64 { + // Two of xxhash's prime multipliers (which are chosen for their + // bit dispersion properties) + var multipliers = [2]uint64{11400714785074694791, 14029467366897019727} + // Multiplying by the prime number mixes the low bits into the high bits, + // then byte-swapping (which is a single CPU instruction) allows the + // combined high and low bits to participate in the initial hash table index. 
+ return bits.ReverseBytes64(multipliers[alg] * val) +} + +func hashFloat32(val float32, alg uint64) uint64 { + // grab the raw byte pattern of the + bt := *(*[4]byte)(unsafe.Pointer(&val)) + x := uint64(*(*uint32)(unsafe.Pointer(&bt[0]))) + hx := hashInt(x, alg) + hy := hashInt(x, alg^1) + return 4 ^ hx ^ hy +} + +func hashFloat64(val float64, alg uint64) uint64 { + bt := *(*[8]byte)(unsafe.Pointer(&val)) + hx := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[4]))), alg) + hy := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[0]))), alg^1) + return 8 ^ hx ^ hy +} + +func hashString(val string, alg uint64) uint64 { + buf := *(*[]byte)(unsafe.Pointer(&val)) + (*reflect.SliceHeader)(unsafe.Pointer(&buf)).Cap = len(val) + return hash(buf, alg) +} + +// prime constants used for slightly increasing the hash quality further +var exprimes = [2]uint64{1609587929392839161, 9650029242287828579} + +// for smaller amounts of bytes this is faster than even calling into +// xxh3 to do the hash, so we specialize in order to get the benefits +// of that performance. 
func hash(b []byte, alg uint64) uint64 {
	// NOTE: the length is truncated to 32 bits before the size class is chosen
	n := uint32(len(b))
	if n <= 16 {
		switch {
		case n > 8:
			// 8 < length <= 16
			// same principle as the 4..8 byte case below, but reading the
			// data as two (possibly overlapping) 64-bit ints
			x := *(*uint64)(unsafe.Pointer(&b[n-8]))
			y := *(*uint64)(unsafe.Pointer(&b[0]))
			hx := hashInt(x, alg)
			hy := hashInt(y, alg^1)
			return uint64(n) ^ hx ^ hy
		case n >= 4:
			// 4 <= length <= 8
			// we can read the bytes as two overlapping 32-bit ints, apply different
			// hash functions to each in parallel
			// then xor the results
			x := *(*uint32)(unsafe.Pointer(&b[n-4]))
			y := *(*uint32)(unsafe.Pointer(&b[0]))
			hx := hashInt(uint64(x), alg)
			hy := hashInt(uint64(y), alg^1)
			return uint64(n) ^ hx ^ hy
		case n > 0:
			// 1 <= length <= 3
			// pack the length with the first, middle and last byte into one word
			x := uint32((n << 24) ^ (uint32(b[0]) << 16) ^ (uint32(b[n/2]) << 8) ^ uint32(b[n-1]))
			return hashInt(uint64(x), alg)
		case n == 0:
			// empty input hashes to a fixed value distinct from the empty-slot sentinel
			return 1
		}
	}

	// increase differentiation enough to improve hash quality
	return xxh3.Hash(b) + exprimes[alg]
}

const (
	// sentinel is the stored hash value that marks a hash table slot as empty
	sentinel uint64 = 0
	// loadFactor is the multiplier applied to the entry count when deciding
	// whether the table needs to grow (size*loadFactor >= cap)
	loadFactor int64 = 2
)

// max returns the larger of a and b.
func max(a, b uint64) uint64 {
	if a > b {
		return a
	}
	return b
}

// isNan32Cmp is the comparison func used to match a NaN key in the float32 table.
var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) }

// KeyNotFound is the constant returned by memo table functions when a key isn't found in the table
const KeyNotFound = -1

// BinaryMemoTable is our hashtable for binary data using the BinaryBuilder
// to construct the actual data in an easy to pass around way with minimal copies
// while using a hash table to keep track of the indexes into the dictionary that
// is created as we go.
type BinaryMemoTable struct {
	// tbl stores, as its value, the index into builder of each inserted byte string
	tbl     *Int32HashTable
	builder *array.BinaryBuilder
	// nullIdx is the index of the null value, or KeyNotFound if none was inserted
	nullIdx int
}

// NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will
// be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used.
// initial and valuesize can be used to pre-allocate the table to reduce allocations. With
// initial being the initial number of entries to allocate for and valuesize being the starting
// amount of space allocated for writing the actual binary data.
func NewBinaryMemoTable(mem memory.Allocator, initial, valuesize int) *BinaryMemoTable {
	if mem == nil {
		mem = memory.DefaultAllocator
	}
	bldr := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary)
	bldr.Reserve(int(initial))
	datasize := valuesize
	if datasize <= 0 {
		// no usable size hint: budget 4 bytes per expected entry
		datasize = initial * 4
	}
	bldr.ReserveData(datasize)
	return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound}
}

// Reset dumps all of the data in the table allowing it to be reutilized.
func (s *BinaryMemoTable) Reset() {
	s.tbl.Reset(32)
	// presumably NewArray hands off and clears the builder's contents; the
	// resulting array is released right away since it is not needed
	s.builder.NewArray().Release()
	s.builder.Reserve(int(32))
	s.builder.ReserveData(int(32) * 4)
	s.nullIdx = KeyNotFound
}

// GetNull returns the index of a null that has been inserted into the table or
// KeyNotFound. The bool returned will be true if there was a null inserted into
// the table, and false otherwise.
func (s *BinaryMemoTable) GetNull() (int, bool) {
	return int(s.nullIdx), s.nullIdx != KeyNotFound
}

// Size returns the current size of the memo table including the null value
// if one has been inserted.
func (s *BinaryMemoTable) Size() int {
	sz := int(s.tbl.size)
	if _, ok := s.GetNull(); ok {
		// the null is not stored in the hash table, so count it separately
		sz++
	}
	return sz
}

// helper function to easily return a byte slice for any given value
// regardless of the type if it's a []byte, parquet.ByteArray,
// parquet.FixedLenByteArray or string.
+func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte { + switch v := val.(type) { + case []byte: + return v + case parquet.ByteArray: + return *(*[]byte)(unsafe.Pointer(&v)) + case parquet.FixedLenByteArray: + return *(*[]byte)(unsafe.Pointer(&v)) + case string: + return (*(*[]byte)(unsafe.Pointer(&v)))[:len(v):len(v)] + default: + panic("invalid type for binarymemotable") + } +} + +// helper function to get the hash value regardless of the underlying binary type +func (BinaryMemoTable) getHash(val interface{}) uint64 { + switch v := val.(type) { + case string: + return hashString(v, 0) + case []byte: + return hash(v, 0) + case parquet.ByteArray: + return hash(*(*[]byte)(unsafe.Pointer(&v)), 0) + case parquet.FixedLenByteArray: + return hash(*(*[]byte)(unsafe.Pointer(&v)), 0) + default: + panic("invalid type for binarymemotable") + } +} + +// helper function to append the given value to the builder regardless +// of the underlying binary type. +func (b *BinaryMemoTable) appendVal(val interface{}) { + switch v := val.(type) { + case string: + b.builder.AppendString(v) + case []byte: + b.builder.Append(v) + case parquet.ByteArray: + b.builder.Append(*(*[]byte)(unsafe.Pointer(&v))) + case parquet.FixedLenByteArray: + b.builder.Append(*(*[]byte)(unsafe.Pointer(&v))) + } +} + +func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) { + return b.tbl.Lookup(h, func(i int32) bool { + return bytes.Equal(val, b.builder.Value(int(i))) + }) +} + +// Get returns the index of the specified value in the table or KeyNotFound, +// and a boolean indicating whether it was found in the table. +func (b *BinaryMemoTable) Get(val interface{}) (int, bool) { + if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok { + return int(p.payload.val), ok + } + return KeyNotFound, false +} + +// GetOrInsert returns the index of the given value in the table, if not found +// it is inserted into the table. 
The return value 'found' indicates whether the value +// was found in the table (true) or inserted (false) along with any possible error. +func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + h := b.getHash(val) + p, found := b.lookup(h, b.valAsByteSlice(val)) + if found { + idx = int(p.payload.val) + } else { + idx = b.Size() + b.appendVal(val) + b.tbl.Insert(p, h, int32(idx), -1) + } + return +} + +// GetOrInsertNull retrieves the index of a null in the table or inserts +// null into the table, returning the index and a boolean indicating if it was +// found in the table (true) or was inserted (false). +func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = b.GetNull() + if !found { + idx = b.Size() + b.nullIdx = idx + b.builder.AppendNull() + } + return +} + +// helper function to get the offset into the builder data for a given +// index value. +func (b *BinaryMemoTable) findOffset(idx int) uintptr { + val := b.builder.Value(idx) + for len(val) == 0 { + idx++ + if idx >= b.builder.Len() { + break + } + val = b.builder.Value(idx) + } + if len(val) != 0 { + return uintptr(unsafe.Pointer(&val[0])) + } + return uintptr(b.builder.DataLen()) + b.findOffset(0) +} + +// CopyOffsets copies the list of offsets into the passed in slice, the offsets +// being the start and end values of the underlying allocated bytes in the builder +// for the individual values of the table. out should be at least sized to Size()+1 +func (b *BinaryMemoTable) CopyOffsets(out []int8) { + b.CopyOffsetsSubset(0, out) +} + +// CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, +// it gets a subset of the offsets in the table starting at the index provided by "start". 
+func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int8) { + if b.builder.Len() <= start { + return + } + + first := b.findOffset(0) + delta := b.findOffset(start) + for i := start; i < b.Size(); i++ { + offset := int8(b.findOffset(i) - delta) + out[i-start] = offset + } + + out[b.Size()-start] = int8(b.builder.DataLen() - int(delta) - int(first)) +} + +// CopyValues copies the raw binary data bytes out, out should be a []byte +// with at least ValuesSize bytes allocated to copy into. +func (b *BinaryMemoTable) CopyValues(out interface{}) { + b.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies the raw binary data bytes out starting with the value +// at the index start, out should be a []byte with at least ValuesSize bytes allocated +func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) { + var ( + first = b.findOffset(0) + offset = b.findOffset(int(start)) + length = b.builder.DataLen() - int(offset-first) + ) + + outval := out.([]byte) + copy(outval, b.builder.Value(start)[0:length]) +} + +// CopyFixedWidthValues exists to cope with the fact that the table doesn't keep +// track of the fixed width when inserting the null value the databuffer holds a +// zero length byte slice for the null value (if found) +func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) { + if start >= b.Size() { + return + } + + null, exists := b.GetNull() + if !exists || null < start { + // nothing to skip, proceed as usual + b.CopyValuesSubset(start, out) + return + } + + var ( + leftOffset = b.findOffset(start) + nullOffset = b.findOffset(null) + leftSize = nullOffset - leftOffset + ) + + if leftSize > 0 { + copy(out, b.builder.Value(start)[0:leftSize]) + } + + rightSize := b.ValuesSize() - int(nullOffset) + if rightSize > 0 { + // skip the null fixed size value + copy(out[int(leftSize)+width:], b.builder.Value(int(nullOffset))[0:rightSize]) + } +} + +// VisitValues exists to run the visitFn on each value currently in the hash 
table. +func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) { + for i := int(start); i < b.Size(); i++ { + visitFn(b.builder.Value(i)) + } +} + +// Release is used to tell the underlying builder that it can release the memory allocated +// when the reference count reaches 0, this is safe to be called from multiple goroutines +// simultaneously +func (b *BinaryMemoTable) Release() { b.builder.Release() } + +// Retain increases the ref count, it is safe to call it from multiple goroutines +// simultaneously. +func (b *BinaryMemoTable) Retain() { b.builder.Retain() } + +// ValuesSize returns the current total size of all the raw bytes that have been inserted +// into the memotable so far. +func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() } diff --git a/go/parquet/internal/testutils/utils.go b/go/parquet/internal/testutils/utils.go new file mode 100644 index 00000000000..503c60044ab --- /dev/null +++ b/go/parquet/internal/testutils/utils.go @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package testutils + +import ( + "reflect" + + "github.com/apache/arrow/go/parquet" +) + +var typeToParquetTypeMap = map[reflect.Type]parquet.Type{ + reflect.TypeOf(true): parquet.Types.Boolean, + reflect.TypeOf(int32(0)): parquet.Types.Int32, + reflect.TypeOf(int64(0)): parquet.Types.Int64, + reflect.TypeOf(float32(0)): parquet.Types.Float, + reflect.TypeOf(float64(0)): parquet.Types.Double, + reflect.TypeOf(parquet.ByteArray{}): parquet.Types.ByteArray, + reflect.TypeOf(parquet.Int96{}): parquet.Types.Int96, + reflect.TypeOf(parquet.FixedLenByteArray{}): parquet.Types.FixedLenByteArray, +} + +func TypeToParquetType(typ reflect.Type) parquet.Type { + ret, ok := typeToParquetTypeMap[typ] + if !ok { + panic("invalid type for parquet type") + } + return ret +} From 4f6318c6f84256534283864f4f839bd070469492 Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Wed, 2 Jun 2021 17:54:39 -0400 Subject: [PATCH 03/17] updates based on feedback to boolean encoding and decoding --- .../internal/encoding/boolean_decoder.go | 23 ++++--- .../internal/encoding/boolean_encoder.go | 62 +++++-------------- go/parquet/internal/utils/bitmap_writer.go | 33 +++++++++- 3 files changed, 62 insertions(+), 56 deletions(-) diff --git a/go/parquet/internal/encoding/boolean_decoder.go b/go/parquet/internal/encoding/boolean_decoder.go index 48c320fc345..a33b21a3181 100644 --- a/go/parquet/internal/encoding/boolean_decoder.go +++ b/go/parquet/internal/encoding/boolean_decoder.go @@ -20,6 +20,7 @@ import ( "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/parquet" "github.com/apache/arrow/go/parquet/internal/utils" + "golang.org/x/xerrors" ) // PlainBooleanDecoder is for the Plain Encoding type, there is no @@ -42,15 +43,19 @@ func (PlainBooleanDecoder) Type() parquet.Type { func (dec *PlainBooleanDecoder) Decode(out []bool) (int, error) { max := utils.MinInt(len(out), dec.nvals) + unalignedExtract := func(start, end, curBitOffset int) int { + i := start + for ; curBitOffset < 
end; i, curBitOffset = i+1, curBitOffset+1 { + out[i] = (dec.data[0] & byte(1< 0 { - bitsToWrite := utils.MinInt(enc.nbits, len(in)) - beg := (boolBufSize * 8) - enc.nbits - for i, val := range in[:bitsToWrite] { - bitmask := uint8(1 << uint((beg+i)%8)) - if val { - enc.bitsBuffer[(beg+i)/8] |= bitmask - } else { - enc.bitsBuffer[(beg+i)/8] &= bitmask ^ 0xFF - } - } - enc.nbits -= bitsToWrite - bitOffset = bitsToWrite - if enc.nbits == 0 { - enc.append(enc.bitsBuffer) - } + if enc.wr == nil { + enc.wr = utils.NewBitmapWriter(enc.bitsBuffer, 0, boolsInBuf) } - // now that we're aligned, write the rest of our bits - bitsRemain := len(in) - bitOffset - for bitOffset < len(in) { - enc.nbits = boolBufSize * 8 - bitsToWrite := utils.MinInt(bitsRemain, enc.nbits) - for i, val := range in[bitOffset : bitOffset+bitsToWrite] { - bitmask := uint8(1 << uint(i%8)) - if val { - enc.bitsBuffer[i/8] |= bitmask - } else { - enc.bitsBuffer[i/8] &= bitmask ^ 0xFF - } - } - bitOffset += bitsToWrite - enc.nbits -= bitsToWrite - bitsRemain -= bitsToWrite - if enc.nbits == 0 { - enc.append(enc.bitsBuffer) - } + n := enc.wr.AppendBools(in) + for n < len(in) { + enc.wr.Finish() + enc.append(enc.bitsBuffer) + enc.wr.Reset(0, boolsInBuf) + in = in[n:] + n = enc.wr.AppendBools(in) } } @@ -96,16 +69,15 @@ func (enc *PlainBooleanEncoder) PutSpaced(in []bool, validBits []byte, validBits // EstimatedDataEncodedSize returns the current number of bytes that have // been buffered so far func (enc *PlainBooleanEncoder) EstimatedDataEncodedSize() int64 { - return int64(enc.sink.Len() + (boolBufSize * 8) - enc.nbits) + return int64(enc.sink.Len() + int(bitutil.BytesForBits(enc.wr.Pos()))) } // FlushValues returns the buffered data, the responsibility is on the caller // to release the buffer memory func (enc *PlainBooleanEncoder) FlushValues() Buffer { - if enc.nbits > 0 { - toFlush := (boolBufSize * 8) - enc.nbits + if enc.wr.Pos() > 0 { + toFlush := int(enc.wr.Pos()) 
enc.append(enc.bitsBuffer[:bitutil.BytesForBits(int64(toFlush))]) - enc.nbits = boolBufSize * 8 } return enc.sink.Finish() diff --git a/go/parquet/internal/utils/bitmap_writer.go b/go/parquet/internal/utils/bitmap_writer.go index eed9f867554..f7c1f7a57cd 100644 --- a/go/parquet/internal/utils/bitmap_writer.go +++ b/go/parquet/internal/utils/bitmap_writer.go @@ -96,6 +96,9 @@ type BitmapWriter interface { Finish() // AppendWord takes nbits from word which should be an LSB bitmap and appends them to the bitmap. AppendWord(word uint64, nbits int64) + // AppendBools appends the bit representation of the bools slice, returning the number + // of bools that were able to fit in the remaining length of the bitmapwriter. + AppendBools(in []bool) int // Pos is the current position that will be written next Pos() int64 // Reset allows reusing the bitmapwriter by resetting Pos to start with length as @@ -140,7 +143,7 @@ func (b *bitmapWriter) Reset(start, length int64) { func (b *bitmapWriter) Pos() int64 { return b.pos } func (b *bitmapWriter) Set() { b.curByte |= b.bitMask } -func (b *bitmapWriter) Clear() { b.curByte &= b.bitMask ^ 0xFF } +func (b *bitmapWriter) Clear() { b.curByte &= ^b.bitMask } func (b *bitmapWriter) Next() { b.bitMask = b.bitMask << 1 @@ -155,6 +158,30 @@ func (b *bitmapWriter) Next() { } } +func (b *bitmapWriter) AppendBools(in []bool) int { + space := Min(bitutil.BytesForBits(b.length-b.pos), int64(len(in))) + + // location that the first byte needs to be written to for appending + appslice := b.buf[int(b.byteOffset):] + // update everything but curByte + bitOffset := bits.TrailingZeros32(uint32(b.bitMask)) + appslice[0] = b.curByte + for i, b := range in[:space] { + if b { + bitutil.SetBit(appslice, i) + } else { + bitutil.ClearBit(appslice, i) + } + } + + b.pos += space + b.bitMask = bitutil.BitMask[(int64(bitOffset)+space)%8] + b.byteOffset += (int64(bitOffset) + space) / 8 + b.curByte = appslice[len(appslice)-1] + + return int(space) +} + func (b 
*bitmapWriter) Finish() { if b.length > 0 && (b.bitMask != 0x01 || b.pos < b.length) { b.buf[int(b.byteOffset)] = b.curByte @@ -267,6 +294,10 @@ func (bw *firstTimeBitmapWriter) Next() { } } +func (b *firstTimeBitmapWriter) AppendBools(in []bool) int { + panic("Append Bools not yet implemented for firstTimeBitmapWriter") +} + func (bw *firstTimeBitmapWriter) Finish() { // store curByte into the bitmap if bw.length > 0 && bw.bitMask != 0x01 || bw.pos < bw.length { From dc8537f27f55b1a40d4556f2d89d7f9ae3832bd7 Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Wed, 2 Jun 2021 17:54:51 -0400 Subject: [PATCH 04/17] slight changes to uncomment tests so they can get used properly --- go/parquet/internal/encoding/encoding_test.go | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/go/parquet/internal/encoding/encoding_test.go b/go/parquet/internal/encoding/encoding_test.go index d4aa12b5b94..380f46e5334 100644 --- a/go/parquet/internal/encoding/encoding_test.go +++ b/go/parquet/internal/encoding/encoding_test.go @@ -134,8 +134,8 @@ func initdata(t reflect.Type, drawbuf, decodebuf []byte, nvals, repeats int, hea return draws[:nvals*repeats], decode[:nvals*repeats] case reflect.TypeOf(parquet.ByteArray{}): - draws := parquet.ByteArrayTraits.CastFromBytes(drawbuf) - decode := parquet.ByteArrayTraits.CastFromBytes(decodebuf) + draws := make([]parquet.ByteArray, nvals*repeats) + decode := make([]parquet.ByteArray, nvals*repeats) testutils.InitValues(draws[:nvals], heap) for j := 1; j < repeats; j++ { @@ -146,8 +146,8 @@ func initdata(t reflect.Type, drawbuf, decodebuf []byte, nvals, repeats int, hea return draws[:nvals*repeats], decode[:nvals*repeats] case reflect.TypeOf(parquet.FixedLenByteArray{}): - draws := parquet.FixedLenByteArrayTraits.CastFromBytes(drawbuf) - decode := parquet.FixedLenByteArrayTraits.CastFromBytes(decodebuf) + draws := make([]parquet.FixedLenByteArray, nvals*repeats) + decode := make([]parquet.FixedLenByteArray, 
nvals*repeats) testutils.InitValues(draws[:nvals], heap) for j := 1; j < repeats; j++ { @@ -358,16 +358,16 @@ func (b *BaseEncodingTestSuite) TestBasicRoundTrip() { b.checkRoundTrip(parquet.Encodings.Plain) } -// func (b *BaseEncodingTestSuite) TestDeltaEncodingRoundTrip() { -// b.initData(10000, 1) +func (b *BaseEncodingTestSuite) TestDeltaEncodingRoundTrip() { + b.initData(10000, 1) -// switch b.typ { -// case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)): -// b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) -// default: -// b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) }) -// } -// } + switch b.typ { + case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)): + b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) + default: + b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) }) + } +} func (b *BaseEncodingTestSuite) TestDeltaLengthByteArrayRoundTrip() { b.initData(10000, 1) @@ -380,16 +380,16 @@ func (b *BaseEncodingTestSuite) TestDeltaLengthByteArrayRoundTrip() { } } -// func (b *BaseEncodingTestSuite) TestDeltaByteArrayRoundTrip() { -// b.initData(10000, 1) +func (b *BaseEncodingTestSuite) TestDeltaByteArrayRoundTrip() { + b.initData(10000, 1) -// switch b.typ { -// case reflect.TypeOf(parquet.ByteArray{}): -// b.checkRoundTrip(parquet.Encodings.DeltaByteArray) -// default: -// b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) }) -// } -// } + switch b.typ { + case reflect.TypeOf(parquet.ByteArray{}): + b.checkRoundTrip(parquet.Encodings.DeltaByteArray) + default: + b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) }) + } +} func (b *BaseEncodingTestSuite) TestSpacedRoundTrip() { exec := func(vals, repeats int, validBitsOffset int64, nullProb float64) { @@ -407,7 +407,7 @@ func (b *BaseEncodingTestSuite) TestSpacedRoundTrip() { b.checkRoundTripSpaced(parquet.Encodings.DeltaBinaryPacked, validBits, validBitsOffset) case 
reflect.TypeOf(parquet.ByteArray{}): b.checkRoundTripSpaced(parquet.Encodings.DeltaLengthByteArray, validBits, validBitsOffset) - // b.checkRoundTripSpaced(parquet.Encodings.DeltaByteArray, validBits, validBitsOffset) + b.checkRoundTripSpaced(parquet.Encodings.DeltaByteArray, validBits, validBitsOffset) } } }) From 163fc088f4de77d90b2e8f16fef8126288a9cd96 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 3 Jun 2021 12:46:35 -0400 Subject: [PATCH 05/17] Update go/parquet/internal/encoding/byte_array_decoder.go Co-authored-by: emkornfield --- go/parquet/internal/encoding/byte_array_decoder.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/parquet/internal/encoding/byte_array_decoder.go b/go/parquet/internal/encoding/byte_array_decoder.go index 09eacf97551..3711687bd70 100644 --- a/go/parquet/internal/encoding/byte_array_decoder.go +++ b/go/parquet/internal/encoding/byte_array_decoder.go @@ -37,7 +37,7 @@ func (PlainByteArrayDecoder) Type() parquet.Type { } // Decode will populate the slice of bytearrays in full or until the number -// of values is emptied. +// of values is consumed. // // Returns the number of values that were decoded. func (pbad *PlainByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { From 902495cc723b3e2905bfbf39b47b364923d6662e Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Thu, 3 Jun 2021 15:47:29 -0400 Subject: [PATCH 06/17] updates based on feedback --- .../internal/encoding/byte_array_decoder.go | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/go/parquet/internal/encoding/byte_array_decoder.go b/go/parquet/internal/encoding/byte_array_decoder.go index 3711687bd70..04eed8b3375 100644 --- a/go/parquet/internal/encoding/byte_array_decoder.go +++ b/go/parquet/internal/encoding/byte_array_decoder.go @@ -27,6 +27,10 @@ import ( // PlainByteArrayDecoder decodes a data chunk for bytearrays according to // the plain encoding. 
The byte arrays will use slices to reference the // data rather than copying it. +// +// The parquet spec defines Plain encoding for ByteArrays as a 4 byte little +// endian integer containing the length of the bytearray followed by that many +// bytes being the raw data of the byte array. type PlainByteArrayDecoder struct { decoder } @@ -50,18 +54,18 @@ func (pbad *PlainByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) return i, xerrors.New("parquet: eof reading bytearray") } - // the first 4 bytes are a little endian uint32 length - nbytes := int32(binary.LittleEndian.Uint32(pbad.data[:4])) - if nbytes < 0 { + // the first 4 bytes are a little endian int32 length + byteLen := int32(binary.LittleEndian.Uint32(pbad.data[:4])) + if byteLen < 0 { return i, xerrors.New("parquet: invalid BYTE_ARRAY value") } - if int64(len(pbad.data)) < int64(nbytes)+4 { + if int64(len(pbad.data)) < int64(byteLen)+4 { return i, xerrors.New("parquet: eof reading bytearray") } - out[i] = pbad.data[4 : nbytes+4] - pbad.data = pbad.data[nbytes+4:] + out[i] = pbad.data[4 : byteLen+4] + pbad.data = pbad.data[byteLen+4:] } pbad.nvals -= max From 5f607a75ccc935e2ce98df2e1690be96095c3525 Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Thu, 3 Jun 2021 16:18:55 -0400 Subject: [PATCH 07/17] set capacity of resulting bytearrays when decoding --- go/parquet/internal/encoding/byte_array_decoder.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/parquet/internal/encoding/byte_array_decoder.go b/go/parquet/internal/encoding/byte_array_decoder.go index 04eed8b3375..fa8033b78fa 100644 --- a/go/parquet/internal/encoding/byte_array_decoder.go +++ b/go/parquet/internal/encoding/byte_array_decoder.go @@ -64,7 +64,7 @@ func (pbad *PlainByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) return i, xerrors.New("parquet: eof reading bytearray") } - out[i] = pbad.data[4 : byteLen+4] + out[i] = pbad.data[4 : byteLen+4 : byteLen+4] pbad.data = pbad.data[byteLen+4:] } 
From db9d93bfd999b48c09e50495d793fdc66765c32a Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Fri, 4 Jun 2021 17:16:36 -0400 Subject: [PATCH 08/17] handle big-endian platforms by ensuring we Reverse the bytes if necessary to write a little endian value. --- go/parquet/internal/encoding/byte_array_encoder.go | 4 ++-- go/parquet/internal/encoding/encoder.go | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/go/parquet/internal/encoding/byte_array_encoder.go b/go/parquet/internal/encoding/byte_array_encoder.go index 1fe1bfcccd5..dc1ae560625 100644 --- a/go/parquet/internal/encoding/byte_array_encoder.go +++ b/go/parquet/internal/encoding/byte_array_encoder.go @@ -35,12 +35,12 @@ type PlainByteArrayEncoder struct { func (enc *PlainByteArrayEncoder) PutByteArray(val parquet.ByteArray) { inc := val.Len() + arrow.Uint32SizeBytes enc.sink.Reserve(inc) - vlen := uint32(val.Len()) + vlen := toLEFunc(uint32(val.Len())) enc.sink.UnsafeWrite((*(*[4]byte)(unsafe.Pointer(&vlen)))[:]) enc.sink.UnsafeWrite(val) } -// Put writes out all of the values in this slice to the buffer +// Put writes out all of the values in this slice to the encoding sink func (enc *PlainByteArrayEncoder) Put(in []parquet.ByteArray) { for _, val := range in { enc.PutByteArray(val) diff --git a/go/parquet/internal/encoding/encoder.go b/go/parquet/internal/encoding/encoder.go index 81d1a15f49c..49072c8e151 100644 --- a/go/parquet/internal/encoding/encoder.go +++ b/go/parquet/internal/encoding/encoder.go @@ -22,6 +22,7 @@ import ( "github.com/apache/arrow/go/arrow" "github.com/apache/arrow/go/arrow/bitutil" + "github.com/apache/arrow/go/arrow/endian" "github.com/apache/arrow/go/arrow/memory" "github.com/apache/arrow/go/parquet" format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" @@ -29,6 +30,16 @@ import ( "github.com/apache/arrow/go/parquet/schema" ) +var toLEFunc func(uint32) uint32 + +func init() { + if endian.IsBigEndian { + toLEFunc = bits.ReverseBytes32 + 
} else { + toLEFunc = func(in uint32) uint32 { return in } + } +} + //go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata plain_encoder_types.gen.go.tmpl typed_encoder.gen.go.tmpl // EncoderTraits is an interface for the different types to make it more From 041295b38886bbc2cd85e115d54e21b0fa9e061a Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 4 Jun 2021 16:58:09 -0400 Subject: [PATCH 09/17] Update go/parquet/internal/encoding/byte_array_encoder.go Co-authored-by: emkornfield --- go/parquet/internal/encoding/byte_array_encoder.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/parquet/internal/encoding/byte_array_encoder.go b/go/parquet/internal/encoding/byte_array_encoder.go index dc1ae560625..41762159507 100644 --- a/go/parquet/internal/encoding/byte_array_encoder.go +++ b/go/parquet/internal/encoding/byte_array_encoder.go @@ -26,7 +26,7 @@ import ( ) // PlainByteArrayEncoder encodes byte arrays according to the spec for Plain encoding -// by encoding the length as a uint32 followed by the bytes of the value. +// by encoding the length as a int32 followed by the bytes of the value. 
type PlainByteArrayEncoder struct { encoder } From e6c239cf7e43cbcd0bece30f33727cc53076e6c8 Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Wed, 9 Jun 2021 10:51:26 -0400 Subject: [PATCH 10/17] updates from feedback --- go/parquet/internal/encoding/decoder.go | 4 +++- go/parquet/internal/encoding/delta_bit_packing.go | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/go/parquet/internal/encoding/decoder.go b/go/parquet/internal/encoding/decoder.go index abfa3867e0b..c18aa9a493b 100644 --- a/go/parquet/internal/encoding/decoder.go +++ b/go/parquet/internal/encoding/decoder.go @@ -22,6 +22,7 @@ import ( "github.com/apache/arrow/go/arrow/memory" "github.com/apache/arrow/go/parquet" + "github.com/apache/arrow/go/parquet/internal/debug" format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" "github.com/apache/arrow/go/parquet/internal/utils" "github.com/apache/arrow/go/parquet/schema" @@ -171,7 +172,8 @@ func spacedExpand(buffer interface{}, nullCount int, validBits []byte, validBits // overwrite any existing data with the correctly spaced data. Any data that happens to be left in the null // slots is fine since it shouldn't matter and saves us work. 
idxDecode -= int32(run.Length) - reflect.Copy(bufferRef.Slice(int(run.Pos), bufferRef.Len()), bufferRef.Slice(int(idxDecode), int(int64(idxDecode)+run.Length))) + n := reflect.Copy(bufferRef.Slice(int(run.Pos), bufferRef.Len()), bufferRef.Slice(int(idxDecode), int(int64(idxDecode)+run.Length))) + debug.Assert(n == int(run.Length), "reflect.Copy copied incorrect number of elements in spacedExpand") } return numValues diff --git a/go/parquet/internal/encoding/delta_bit_packing.go b/go/parquet/internal/encoding/delta_bit_packing.go index 986d862f592..92db5a15dd3 100644 --- a/go/parquet/internal/encoding/delta_bit_packing.go +++ b/go/parquet/internal/encoding/delta_bit_packing.go @@ -59,7 +59,8 @@ func (d *deltaBitPackDecoder) bytesRead() int64 { func (d *deltaBitPackDecoder) Allocator() memory.Allocator { return d.mem } -// SetData sets in the data to be decoded and the expected number of values to decode +// SetData sets the bytes and the expected number of values to decode +// into the decoder, updating the decoder and allowing it to be reused. 
func (d *deltaBitPackDecoder) SetData(nvalues int, data []byte) { // set our data into the underlying decoder for the type d.decoder.SetData(nvalues, data) From 99ef6bbcd8c7d8807a4f40908c5744ca7cae0fb6 Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Wed, 9 Jun 2021 10:55:11 -0400 Subject: [PATCH 11/17] use int64 to ensure no int32 overflow --- go/parquet/internal/encoding/decoder.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/parquet/internal/encoding/decoder.go b/go/parquet/internal/encoding/decoder.go index c18aa9a493b..1b29a5fd5e1 100644 --- a/go/parquet/internal/encoding/decoder.go +++ b/go/parquet/internal/encoding/decoder.go @@ -153,7 +153,7 @@ func spacedExpand(buffer interface{}, nullCount int, validBits []byte, validBits numValues int = bufferRef.Len() ) - idxDecode := int32(numValues - nullCount) + idxDecode := int64(numValues - nullCount) if idxDecode == 0 { // if there's nothing to decode there's nothing to do. return numValues } @@ -171,7 +171,7 @@ func spacedExpand(buffer interface{}, nullCount int, validBits []byte, validBits // up after ourselves because we're doing this in reverse to guarantee that we'll always simply // overwrite any existing data with the correctly spaced data. Any data that happens to be left in the null // slots is fine since it shouldn't matter and saves us work. 
- idxDecode -= int32(run.Length) + idxDecode -= run.Length n := reflect.Copy(bufferRef.Slice(int(run.Pos), bufferRef.Len()), bufferRef.Slice(int(idxDecode), int(int64(idxDecode)+run.Length))) debug.Assert(n == int(run.Length), "reflect.Copy copied incorrect number of elements in spacedExpand") } From 1808793cd6e2c5f70a462bd10fb37c4667db9404 Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Thu, 10 Jun 2021 15:45:48 -0400 Subject: [PATCH 12/17] performance improvement for putspaced with bytearrays by avoiding an allocation and a copy per call --- .../internal/encoding/byte_array_encoder.go | 18 +++++++++++++++--- .../encoding/fixed_len_byte_array_encoder.go | 18 +++++++++++++++--- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/go/parquet/internal/encoding/byte_array_encoder.go b/go/parquet/internal/encoding/byte_array_encoder.go index 41762159507..8d46c6f5a9b 100644 --- a/go/parquet/internal/encoding/byte_array_encoder.go +++ b/go/parquet/internal/encoding/byte_array_encoder.go @@ -29,6 +29,8 @@ import ( // by encoding the length as a int32 followed by the bytes of the value. 
type PlainByteArrayEncoder struct { encoder + + bitSetReader utils.SetBitRunReader } // PutByteArray writes out the 4 bytes for the length followed by the data @@ -53,9 +55,19 @@ func (enc *PlainByteArrayEncoder) Put(in []parquet.ByteArray) { // If validBits is nil, this is equivalent to calling Put func (enc *PlainByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { if validBits != nil { - data := make([]parquet.ByteArray, len(in)) - nvalid := spacedCompress(in, data, validBits, validBitsOffset) - enc.Put(data[:nvalid]) + if enc.bitSetReader == nil { + enc.bitSetReader = utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in))) + } else { + enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in))) + } + + for { + run := enc.bitSetReader.NextRun() + if run.Length == 0 { + break + } + enc.Put(in[int(run.Pos):int(run.Pos+run.Length)]) + } } else { enc.Put(in) } diff --git a/go/parquet/internal/encoding/fixed_len_byte_array_encoder.go b/go/parquet/internal/encoding/fixed_len_byte_array_encoder.go index 519efc8f029..7eda0d38b0b 100644 --- a/go/parquet/internal/encoding/fixed_len_byte_array_encoder.go +++ b/go/parquet/internal/encoding/fixed_len_byte_array_encoder.go @@ -25,6 +25,8 @@ import ( // always writing typeLength bytes for each value. 
type PlainFixedLenByteArrayEncoder struct { encoder + + bitSetReader utils.SetBitRunReader } // Put writes the provided values to the encoder @@ -47,9 +49,19 @@ func (enc *PlainFixedLenByteArrayEncoder) Put(in []parquet.FixedLenByteArray) { // PutSpaced is like Put but works with data that is spaced out according to the passed in bitmap func (enc *PlainFixedLenByteArrayEncoder) PutSpaced(in []parquet.FixedLenByteArray, validBits []byte, validBitsOffset int64) { if validBits != nil { - data := make([]parquet.FixedLenByteArray, len(in)) - nvalid := spacedCompress(in, data, validBits, validBitsOffset) - enc.Put(data[:nvalid]) + if enc.bitSetReader == nil { + enc.bitSetReader = utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in))) + } else { + enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in))) + } + + for { + run := enc.bitSetReader.NextRun() + if run.Length == 0 { + break + } + enc.Put(in[int(run.Pos):int(run.Pos+run.Length)]) + } } else { enc.Put(in) } From 03224b8206001d297c578d1c92b6dfb939a22b04 Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Thu, 17 Jun 2021 18:31:21 -0400 Subject: [PATCH 13/17] changes based on feedback --- go/parquet/internal/encoding/decoder.go | 22 +++++++---- .../internal/encoding/delta_bit_packing.go | 37 +++++++++++-------- .../internal/encoding/delta_byte_array.go | 19 ++++++---- .../encoding/delta_length_byte_array.go | 8 ++-- go/parquet/internal/encoding/types.go | 2 +- 5 files changed, 52 insertions(+), 36 deletions(-) diff --git a/go/parquet/internal/encoding/decoder.go b/go/parquet/internal/encoding/decoder.go index 1b29a5fd5e1..6de61574ec5 100644 --- a/go/parquet/internal/encoding/decoder.go +++ b/go/parquet/internal/encoding/decoder.go @@ -26,12 +26,13 @@ import ( format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" "github.com/apache/arrow/go/parquet/internal/utils" "github.com/apache/arrow/go/parquet/schema" + "golang.org/x/xerrors" ) // DecoderTraits provides an interface for more 
easily interacting with types // to generate decoders for specific types. type DecoderTraits interface { - Decoder(parquet.Encoding, *schema.Column, bool, memory.Allocator) TypedDecoder + Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder BytesRequired(int) int } @@ -42,7 +43,7 @@ func NewDecoder(t parquet.Type, e parquet.Encoding, descr *schema.Column, mem me return nil } - return traits.Decoder(e, descr, false, mem) + return traits.Decoder(e, descr, false /* use dictionary */, mem) } // NewDictDecoder is like NewDecoder but for dictionary encodings, panics if type is bool. @@ -58,7 +59,7 @@ func NewDictDecoder(t parquet.Type, descr *schema.Column, mem memory.Allocator) mem = memory.DefaultAllocator } - return traits.Decoder(parquet.Encodings.RLEDict, descr, true, mem).(DictDecoder) + return traits.Decoder(parquet.Encodings.RLEDict, descr, true /* use dictionary */, mem).(DictDecoder) } type decoder struct { @@ -86,9 +87,10 @@ func newDecoderBase(e format.Encoding, descr *schema.Column) decoder { // SetData sets the data for decoding into the decoder to update the available // data bytes and number of values available. -func (d *decoder) SetData(nvals int, data []byte) { +func (d *decoder) SetData(nvals int, data []byte) error { d.data = data d.nvals = nvals + return nil } // ValuesLeft returns the number of remaining values that can be decoded @@ -115,19 +117,23 @@ func (d *dictDecoder) SetDict(dict TypedDecoder) { } // SetData sets the index value data into the decoder. 
-func (d *dictDecoder) SetData(nvals int, data []byte) { +func (d *dictDecoder) SetData(nvals int, data []byte) error { d.nvals = nvals if len(data) == 0 { - d.idxDecoder = utils.NewRleDecoder(bytes.NewReader(data), 1) - return + // no data, bitwidth can safely be 0 + d.idxDecoder = utils.NewRleDecoder(bytes.NewReader(data), 0 /* bitwidth */) + return nil } + // grab the bit width from the first byte width := uint8(data[0]) if width >= 64 { - panic("parquet: invalid or corrupted bit width") + return xerrors.New("parquet: invalid or corrupted bit width") } + // pass the rest of the data, minus that first byte, to the decoder d.idxDecoder = utils.NewRleDecoder(bytes.NewReader(data[1:]), int(width)) + return nil } func (d *dictDecoder) decode(out interface{}) (int, error) { diff --git a/go/parquet/internal/encoding/delta_bit_packing.go b/go/parquet/internal/encoding/delta_bit_packing.go index 92db5a15dd3..babd0b1fa97 100644 --- a/go/parquet/internal/encoding/delta_bit_packing.go +++ b/go/parquet/internal/encoding/delta_bit_packing.go @@ -39,10 +39,10 @@ type deltaBitPackDecoder struct { usedFirst bool bitdecoder *utils.BitReader blockSize uint64 - currentBlockVals uint64 + currentBlockVals uint32 miniBlocks uint64 - valsPerMini uint64 - currentMiniBlockVals uint64 + valsPerMini uint32 + currentMiniBlockVals uint32 minDelta int64 miniBlockIdx uint64 @@ -61,9 +61,11 @@ func (d *deltaBitPackDecoder) Allocator() memory.Allocator { return d.mem } // SetData sets the bytes and the expected number of values to decode // into the decoder, updating the decoder and allowing it to be reused. 
-func (d *deltaBitPackDecoder) SetData(nvalues int, data []byte) { +func (d *deltaBitPackDecoder) SetData(nvalues int, data []byte) error { // set our data into the underlying decoder for the type - d.decoder.SetData(nvalues, data) + if err := d.decoder.SetData(nvalues, data); err != nil { + return err + } // create a bit reader for our decoder's values d.bitdecoder = utils.NewBitReader(bytes.NewReader(d.data)) d.currentBlockVals = 0 @@ -75,27 +77,30 @@ func (d *deltaBitPackDecoder) SetData(nvalues int, data []byte) { var ok bool d.blockSize, ok = d.bitdecoder.GetVlqInt() if !ok { - panic("parquet: eof exception") + return xerrors.New("parquet: eof exception") } if d.miniBlocks, ok = d.bitdecoder.GetVlqInt(); !ok { - panic("parquet: eof exception") + return xerrors.New("parquet: eof exception") } var totalValues uint64 if totalValues, ok = d.bitdecoder.GetVlqInt(); !ok { - panic("parquet: eof exception") + return xerrors.New("parquet: eof exception") } if int(totalValues) != d.nvals { - panic("parquet: mismatch between number of values and count in data header") + return xerrors.New("parquet: mismatch between number of values and count in data header") } if d.lastVal, ok = d.bitdecoder.GetZigZagVlqInt(); !ok { - panic("parquet: eof exception") + return xerrors.New("parquet: eof exception") } - d.valsPerMini = uint64(d.blockSize / d.miniBlocks) + if d.miniBlocks != 0 { + d.valsPerMini = uint32(d.blockSize / d.miniBlocks) + } + return nil } // initialize a block to decode @@ -118,7 +123,7 @@ func (d *deltaBitPackDecoder) initBlock() error { d.miniBlockIdx = 0 d.deltaBitWidth = d.deltaBitWidths.Bytes()[0] - d.currentBlockVals = d.blockSize + d.currentBlockVals = uint32(d.blockSize) return nil } @@ -185,8 +190,8 @@ func (d *DeltaBitPackInt32Decoder) Decode(out []int32) (int, error) { numCopied := end - start out = out[numCopied:] - d.currentBlockVals -= uint64(numCopied) - d.currentMiniBlockVals -= uint64(numCopied) + d.currentBlockVals -= uint32(numCopied) + 
d.currentMiniBlockVals -= uint32(numCopied) } return max, nil } @@ -274,8 +279,8 @@ func (d *DeltaBitPackInt64Decoder) Decode(out []int64) (int, error) { numCopied := end - start out = out[numCopied:] - d.currentBlockVals -= uint64(numCopied) - d.currentMiniBlockVals -= uint64(numCopied) + d.currentBlockVals -= uint32(numCopied) + d.currentMiniBlockVals -= uint32(numCopied) } return max, nil } diff --git a/go/parquet/internal/encoding/delta_byte_array.go b/go/parquet/internal/encoding/delta_byte_array.go index bc2cc638e70..6c4833c58b7 100644 --- a/go/parquet/internal/encoding/delta_byte_array.go +++ b/go/parquet/internal/encoding/delta_byte_array.go @@ -57,13 +57,13 @@ func (enc *DeltaByteArrayEncoder) Put(in []parquet.ByteArray) { return } - var suf [1]parquet.ByteArray + var suf parquet.ByteArray if enc.prefixEncoder == nil { // initialize our encoders if we haven't yet enc.initEncoders() enc.prefixEncoder.Put([]int32{0}) - suf[0] = in[0] + suf = in[0] enc.lastVal = append([]byte(nil), in[0]...) - enc.suffixEncoder.Put(suf[:]) + enc.suffixEncoder.Put([]parquet.ByteArray{suf}) in = in[1:] } @@ -80,8 +80,8 @@ func (enc *DeltaByteArrayEncoder) Put(in []parquet.ByteArray) { j++ } enc.prefixEncoder.Put([]int32{int32(j)}) - suf[0] = val[j:] - enc.suffixEncoder.Put(suf[:]) + suf = val[j:] + enc.suffixEncoder.Put([]parquet.ByteArray{suf}) enc.lastVal = append([]byte(nil), val...) } } @@ -133,13 +133,16 @@ func (d *DeltaByteArrayDecoder) Allocator() memory.Allocator { return d.mem } // SetData expects the data passed in to be the prefix lengths, followed by the // blocks of suffix data in order to initialize the decoder. 
-func (d *DeltaByteArrayDecoder) SetData(nvalues int, data []byte) { +func (d *DeltaByteArrayDecoder) SetData(nvalues int, data []byte) error { prefixLenDec := DeltaBitPackInt32Decoder{ deltaBitPackDecoder: &deltaBitPackDecoder{ decoder: newDecoderBase(d.encoding, d.descr), mem: d.mem}} - prefixLenDec.SetData(nvalues, data) + if err := prefixLenDec.SetData(nvalues, data); err != nil { + return err + } + d.prefixLengths = make([]int32, nvalues) // decode all the prefix lengths first so we know how many bytes it took to get the // prefix lengths for nvalues @@ -147,7 +150,7 @@ func (d *DeltaByteArrayDecoder) SetData(nvalues int, data []byte) { // now that we know how many bytes we needed for the prefix lengths, the rest are the // delta length byte array encoding. - d.DeltaLengthByteArrayDecoder.SetData(nvalues, data[int(prefixLenDec.bytesRead()):]) + return d.DeltaLengthByteArrayDecoder.SetData(nvalues, data[int(prefixLenDec.bytesRead()):]) } // Decode decodes byte arrays into the slice provided and returns the number of values actually decoded diff --git a/go/parquet/internal/encoding/delta_length_byte_array.go b/go/parquet/internal/encoding/delta_length_byte_array.go index 61309d97c85..65083919093 100644 --- a/go/parquet/internal/encoding/delta_length_byte_array.go +++ b/go/parquet/internal/encoding/delta_length_byte_array.go @@ -104,17 +104,19 @@ func (d *DeltaLengthByteArrayDecoder) Allocator() memory.Allocator { return d.me // SetData sets in the expected data to the decoder which should be nvalues delta packed lengths // followed by the rest of the byte array data immediately after. 
-func (d *DeltaLengthByteArrayDecoder) SetData(nvalues int, data []byte) { +func (d *DeltaLengthByteArrayDecoder) SetData(nvalues int, data []byte) error { dec := DeltaBitPackInt32Decoder{ deltaBitPackDecoder: &deltaBitPackDecoder{ decoder: newDecoderBase(d.encoding, d.descr), mem: d.mem}} - dec.SetData(nvalues, data) + if err := dec.SetData(nvalues, data); err != nil { + return err + } d.lengths = make([]int32, nvalues) dec.Decode(d.lengths) - d.decoder.SetData(nvalues, data[int(dec.bytesRead()):]) + return d.decoder.SetData(nvalues, data[int(dec.bytesRead()):]) } // Decode populates the passed in slice with data decoded until it hits the length of out diff --git a/go/parquet/internal/encoding/types.go b/go/parquet/internal/encoding/types.go index 21988057226..7bea96a87d8 100644 --- a/go/parquet/internal/encoding/types.go +++ b/go/parquet/internal/encoding/types.go @@ -32,7 +32,7 @@ import ( type TypedDecoder interface { // SetData updates the data in the decoder with the passed in byte slice and the // stated number of values as expected to be decoded. 
- SetData(buffered int, buf []byte) + SetData(buffered int, buf []byte) error // Encoding returns the encoding type that this decoder decodes data of Encoding() parquet.Encoding // ValuesLeft returns the number of remaining values to be decoded From a2dffb1fcacb90dd7541e3286a8f0f995937a3da Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Thu, 17 Jun 2021 19:32:12 -0400 Subject: [PATCH 14/17] shrink PR --- .../encoding/encoding_benchmarks_test.go | 461 -------- go/parquet/internal/encoding/encoding_test.go | 684 ----------- go/parquet/internal/encoding/levels.go | 284 ----- go/parquet/internal/encoding/levels_test.go | 288 ----- go/parquet/internal/encoding/memo_table.go | 380 ------- .../internal/encoding/memo_table_test.go | 284 ----- .../internal/encoding/memo_table_types.gen.go | 366 ------ .../encoding/memo_table_types.gen.go.tmpl | 115 -- .../internal/encoding/typed_encoder.gen.go | 24 - .../encoding/typed_encoder.gen.go.tmpl | 4 +- go/parquet/internal/encoding/types.go | 60 + go/parquet/internal/hashing/hashing_test.go | 114 -- go/parquet/internal/hashing/types.tmpldata | 18 - .../internal/hashing/xxh3_memo_table.gen.go | 1009 ----------------- .../hashing/xxh3_memo_table.gen.go.tmpl | 290 ----- .../internal/hashing/xxh3_memo_table.go | 386 ------- 16 files changed, 62 insertions(+), 4705 deletions(-) delete mode 100644 go/parquet/internal/encoding/encoding_benchmarks_test.go delete mode 100644 go/parquet/internal/encoding/encoding_test.go delete mode 100644 go/parquet/internal/encoding/levels.go delete mode 100644 go/parquet/internal/encoding/levels_test.go delete mode 100644 go/parquet/internal/encoding/memo_table.go delete mode 100644 go/parquet/internal/encoding/memo_table_test.go delete mode 100644 go/parquet/internal/encoding/memo_table_types.gen.go delete mode 100644 go/parquet/internal/encoding/memo_table_types.gen.go.tmpl delete mode 100644 go/parquet/internal/hashing/hashing_test.go delete mode 100644 go/parquet/internal/hashing/types.tmpldata 
delete mode 100644 go/parquet/internal/hashing/xxh3_memo_table.gen.go delete mode 100644 go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl delete mode 100644 go/parquet/internal/hashing/xxh3_memo_table.go diff --git a/go/parquet/internal/encoding/encoding_benchmarks_test.go b/go/parquet/internal/encoding/encoding_benchmarks_test.go deleted file mode 100644 index 13d8b0dd9bc..00000000000 --- a/go/parquet/internal/encoding/encoding_benchmarks_test.go +++ /dev/null @@ -1,461 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package encoding_test - -import ( - "fmt" - "math" - "testing" - - "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/array" - "github.com/apache/arrow/go/arrow/memory" - "github.com/apache/arrow/go/parquet" - "github.com/apache/arrow/go/parquet/internal/encoding" - "github.com/apache/arrow/go/parquet/internal/hashing" - "github.com/apache/arrow/go/parquet/internal/testutils" - "github.com/apache/arrow/go/parquet/schema" -) - -const ( - MINSIZE = 1024 - MAXSIZE = 65536 -) - -func BenchmarkPlainEncodingBoolean(b *testing.B) { - for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { - b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { - values := make([]bool, sz) - for idx := range values { - values[idx] = true - } - encoder := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain, - false, nil, memory.DefaultAllocator).(encoding.BooleanEncoder) - b.ResetTimer() - b.SetBytes(int64(len(values))) - for n := 0; n < b.N; n++ { - encoder.Put(values) - encoder.FlushValues().Release() - } - }) - } -} - -func BenchmarkPlainEncodingInt32(b *testing.B) { - for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { - b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { - values := make([]int32, sz) - for idx := range values { - values[idx] = 64 - } - encoder := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain, - false, nil, memory.DefaultAllocator).(encoding.Int32Encoder) - b.ResetTimer() - b.SetBytes(int64(len(values) * arrow.Int32SizeBytes)) - for n := 0; n < b.N; n++ { - encoder.Put(values) - encoder.FlushValues().Release() - } - }) - } -} - -func BenchmarkPlainEncodingInt64(b *testing.B) { - for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { - b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { - values := make([]int64, sz) - for idx := range values { - values[idx] = 64 - } - encoder := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.Plain, - false, nil, memory.DefaultAllocator).(encoding.Int64Encoder) - b.ResetTimer() - 
b.SetBytes(int64(len(values) * arrow.Int64SizeBytes)) - for n := 0; n < b.N; n++ { - encoder.Put(values) - encoder.FlushValues().Release() - } - }) - } -} - -func BenchmarkPlainEncodingFloat32(b *testing.B) { - for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { - b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { - values := make([]float32, sz) - for idx := range values { - values[idx] = 64.0 - } - encoder := encoding.NewEncoder(parquet.Types.Float, parquet.Encodings.Plain, - false, nil, memory.DefaultAllocator).(encoding.Float32Encoder) - b.ResetTimer() - b.SetBytes(int64(len(values) * arrow.Float32SizeBytes)) - for n := 0; n < b.N; n++ { - encoder.Put(values) - encoder.FlushValues().Release() - } - }) - } -} - -func BenchmarkPlainEncodingFloat64(b *testing.B) { - for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { - b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { - values := make([]float64, sz) - for idx := range values { - values[idx] = 64 - } - encoder := encoding.NewEncoder(parquet.Types.Double, parquet.Encodings.Plain, - false, nil, memory.DefaultAllocator).(encoding.Float64Encoder) - b.ResetTimer() - b.SetBytes(int64(len(values) * arrow.Float64SizeBytes)) - for n := 0; n < b.N; n++ { - encoder.Put(values) - encoder.FlushValues().Release() - } - }) - } -} - -func BenchmarkPlainDecodingBoolean(b *testing.B) { - for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { - b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { - output := make([]bool, sz) - values := make([]bool, sz) - for idx := range values { - values[idx] = true - } - encoder := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain, - false, nil, memory.DefaultAllocator).(encoding.BooleanEncoder) - encoder.Put(values) - buf := encoder.FlushValues() - defer buf.Release() - - decoder := encoding.NewDecoder(parquet.Types.Boolean, parquet.Encodings.Plain, nil, memory.DefaultAllocator) - b.ResetTimer() - b.SetBytes(int64(len(values))) - for n := 0; n < b.N; n++ { - decoder.SetData(sz, buf.Bytes()) - 
decoder.(encoding.BooleanDecoder).Decode(output) - } - }) - } -} - -func BenchmarkPlainDecodingInt32(b *testing.B) { - for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { - b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { - output := make([]int32, sz) - values := make([]int32, sz) - for idx := range values { - values[idx] = 64 - } - encoder := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain, - false, nil, memory.DefaultAllocator).(encoding.Int32Encoder) - encoder.Put(values) - buf := encoder.FlushValues() - defer buf.Release() - - decoder := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.Plain, nil, memory.DefaultAllocator) - b.ResetTimer() - b.SetBytes(int64(len(values))) - for n := 0; n < b.N; n++ { - decoder.SetData(sz, buf.Bytes()) - decoder.(encoding.Int32Decoder).Decode(output) - } - }) - } -} - -func BenchmarkMemoTableFloat64(b *testing.B) { - tests := []struct { - nunique int32 - nvalues int64 - }{ - {100, 65535}, - {1000, 65535}, - {5000, 65535}, - } - - for _, tt := range tests { - b.Run(fmt.Sprintf("%d unique n %d", tt.nunique, tt.nvalues), func(b *testing.B) { - rag := testutils.NewRandomArrayGenerator(0) - dict := rag.Float64(int64(tt.nunique), 0) - indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0) - - values := make([]float64, tt.nvalues) - for idx := range values { - values[idx] = dict.Value(int(indices.Value(idx))) - } - - b.ResetTimer() - b.Run("go map", func(b *testing.B) { - for i := 0; i < b.N; i++ { - tbl := encoding.NewFloat64MemoTable(memory.DefaultAllocator) - for _, v := range values { - tbl.GetOrInsert(v) - } - if tbl.Size() != int(tt.nunique) { - b.Fatal(tbl.Size(), tt.nunique) - } - } - }) - b.ResetTimer() - b.Run("xxh3", func(b *testing.B) { - for i := 0; i < b.N; i++ { - tbl := hashing.NewFloat64MemoTable(0) - for _, v := range values { - tbl.GetOrInsert(v) - } - if tbl.Size() != int(tt.nunique) { - b.Fatal(tbl.Size(), tt.nunique) - } - } - }) - }) - } -} - -func BenchmarkMemoTableInt32(b 
*testing.B) { - tests := []struct { - nunique int32 - nvalues int64 - }{ - {100, 65535}, - {1000, 65535}, - {5000, 65535}, - } - - for _, tt := range tests { - b.Run(fmt.Sprintf("%d unique n %d", tt.nunique, tt.nvalues), func(b *testing.B) { - rag := testutils.NewRandomArrayGenerator(0) - dict := rag.Int32(int64(tt.nunique), 0, math.MaxInt32-1, 0) - indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0) - - values := make([]int32, tt.nvalues) - for idx := range values { - values[idx] = dict.Value(int(indices.Value(idx))) - } - b.ResetTimer() - b.Run("xxh3", func(b *testing.B) { - for i := 0; i < b.N; i++ { - tbl := hashing.NewInt32MemoTable(0) - for _, v := range values { - tbl.GetOrInsert(v) - } - if tbl.Size() != int(tt.nunique) { - b.Fatal(tbl.Size(), tt.nunique) - } - } - }) - - b.Run("go map", func(b *testing.B) { - for i := 0; i < b.N; i++ { - tbl := encoding.NewInt32MemoTable(memory.DefaultAllocator) - for _, v := range values { - tbl.GetOrInsert(v) - } - if tbl.Size() != int(tt.nunique) { - b.Fatal(tbl.Size(), tt.nunique) - } - } - }) - }) - } -} - -func BenchmarkMemoTable(b *testing.B) { - tests := []struct { - nunique int32 - minLen int32 - maxLen int32 - nvalues int64 - }{ - {100, 32, 32, 65535}, - {100, 8, 32, 65535}, - {1000, 32, 32, 65535}, - {1000, 8, 32, 65535}, - {5000, 32, 32, 65535}, - {5000, 8, 32, 65535}, - } - - for _, tt := range tests { - b.Run(fmt.Sprintf("%d unique len %d-%d n %d", tt.nunique, tt.minLen, tt.maxLen, tt.nvalues), func(b *testing.B) { - - rag := testutils.NewRandomArrayGenerator(0) - dict := rag.ByteArray(int64(tt.nunique), tt.minLen, tt.maxLen, 0).(*array.String) - indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0) - - values := make([]parquet.ByteArray, tt.nvalues) - for idx := range values { - values[idx] = []byte(dict.Value(int(indices.Value(idx)))) - } - - b.ResetTimer() - - b.Run("xxh3", func(b *testing.B) { - for i := 0; i < b.N; i++ { - tbl := hashing.NewBinaryMemoTable(memory.DefaultAllocator, 0, -1) - 
for _, v := range values { - tbl.GetOrInsert(v) - } - if tbl.Size() != int(tt.nunique) { - b.Fatal(tbl.Size(), tt.nunique) - } - tbl.Release() - } - }) - b.ResetTimer() - b.Run("go map", func(b *testing.B) { - for i := 0; i < b.N; i++ { - tbl := encoding.NewBinaryMemoTable(memory.DefaultAllocator) - for _, v := range values { - tbl.GetOrInsert(v) - } - if tbl.Size() != int(tt.nunique) { - b.Fatal(tbl.Size(), tt.nunique) - } - tbl.Release() - } - }) - }) - } -} - -func BenchmarkMemoTableAllUnique(b *testing.B) { - tests := []struct { - minLen int32 - maxLen int32 - nvalues int64 - }{ - {32, 32, 1024}, - {8, 32, 1024}, - {32, 32, 32767}, - {8, 32, 32767}, - {32, 32, 65535}, - {8, 32, 65535}, - } - for _, tt := range tests { - b.Run(fmt.Sprintf("values %d len %d-%d", tt.nvalues, tt.minLen, tt.maxLen), func(b *testing.B) { - - rag := testutils.NewRandomArrayGenerator(0) - dict := rag.ByteArray(tt.nvalues, tt.minLen, tt.maxLen, 0).(*array.String) - - values := make([]parquet.ByteArray, tt.nvalues) - for idx := range values { - values[idx] = []byte(dict.Value(idx)) - } - - b.ResetTimer() - b.Run("go map", func(b *testing.B) { - for i := 0; i < b.N; i++ { - tbl := encoding.NewBinaryMemoTable(memory.DefaultAllocator) - for _, v := range values { - tbl.GetOrInsert(v) - } - if tbl.Size() != int(tt.nvalues) { - b.Fatal(tbl.Size(), tt.nvalues) - } - tbl.Release() - } - }) - - b.Run("xxh3", func(b *testing.B) { - for i := 0; i < b.N; i++ { - tbl := hashing.NewBinaryMemoTable(memory.DefaultAllocator, 0, -1) - for _, v := range values { - tbl.GetOrInsert(v) - } - if tbl.Size() != int(tt.nvalues) { - b.Fatal(tbl.Size(), tt.nvalues) - } - tbl.Release() - } - }) - }) - } - -} - -func BenchmarkEncodeDictByteArray(b *testing.B) { - const ( - nunique = 100 - minLen = 8 - maxLen = 32 - nvalues = 65535 - ) - - rag := testutils.NewRandomArrayGenerator(0) - dict := rag.ByteArray(nunique, minLen, maxLen, 0).(*array.String) - indices := rag.Int32(nvalues, 0, nunique-1, 0) - - values := 
make([]parquet.ByteArray, nvalues) - for idx := range values { - values[idx] = []byte(dict.Value(int(indices.Value(idx)))) - } - col := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0) - - out := make([]byte, nunique*(maxLen+arrow.Uint32SizeBytes)) - b.ResetTimer() - for i := 0; i < b.N; i++ { - enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.PlainDict, true, col, memory.DefaultAllocator).(*encoding.DictByteArrayEncoder) - enc.Put(values) - enc.WriteDict(out) - } -} - -func BenchmarkDecodeDictByteArray(b *testing.B) { - const ( - nunique = 100 - minLen = 32 - maxLen = 32 - nvalues = 65535 - ) - - rag := testutils.NewRandomArrayGenerator(0) - dict := rag.ByteArray(nunique, minLen, maxLen, 0).(*array.String) - indices := rag.Int32(nvalues, 0, nunique-1, 0) - - values := make([]parquet.ByteArray, nvalues) - for idx := range values { - values[idx] = []byte(dict.Value(int(indices.Value(idx)))) - } - - col := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0) - enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.PlainDict, true, col, memory.DefaultAllocator).(*encoding.DictByteArrayEncoder) - enc.Put(values) - - dictBuf := make([]byte, enc.DictEncodedSize()) - enc.WriteDict(dictBuf) - - idxBuf := make([]byte, enc.EstimatedDataEncodedSize()) - enc.WriteIndices(idxBuf) - - out := make([]parquet.ByteArray, nvalues) - - b.ResetTimer() - - for i := 0; i < b.N; i++ { - dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.Plain, col, memory.DefaultAllocator) - dec.SetData(nunique, dictBuf) - dictDec := encoding.NewDictDecoder(parquet.Types.ByteArray, col, memory.DefaultAllocator).(*encoding.DictByteArrayDecoder) - dictDec.SetDict(dec) - dictDec.SetData(nvalues, idxBuf) - - dictDec.Decode(out) - } -} diff --git a/go/parquet/internal/encoding/encoding_test.go b/go/parquet/internal/encoding/encoding_test.go deleted file mode 100644 
index 380f46e5334..00000000000 --- a/go/parquet/internal/encoding/encoding_test.go +++ /dev/null @@ -1,684 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package encoding_test - -import ( - "fmt" - "reflect" - "testing" - "unsafe" - - "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/bitutil" - "github.com/apache/arrow/go/arrow/memory" - "github.com/apache/arrow/go/parquet" - "github.com/apache/arrow/go/parquet/internal/encoding" - "github.com/apache/arrow/go/parquet/internal/testutils" - "github.com/apache/arrow/go/parquet/schema" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/suite" -) - -type nodeFactory func(string, parquet.Repetition, int32) *schema.PrimitiveNode - -func createNodeFactory(t reflect.Type) nodeFactory { - switch t { - case reflect.TypeOf(true): - return schema.NewBooleanNode - case reflect.TypeOf(int32(0)): - return schema.NewInt32Node - case reflect.TypeOf(int64(0)): - return schema.NewInt64Node - case reflect.TypeOf(parquet.Int96{}): - return schema.NewInt96Node - case reflect.TypeOf(float32(0)): - return schema.NewFloat32Node - case reflect.TypeOf(float64(0)): - return schema.NewFloat64Node - case reflect.TypeOf(parquet.ByteArray{}): - return 
schema.NewByteArrayNode - case reflect.TypeOf(parquet.FixedLenByteArray{}): - return func(name string, rep parquet.Repetition, field int32) *schema.PrimitiveNode { - return schema.NewFixedLenByteArrayNode(name, rep, 12, field) - } - } - return nil -} - -func initdata(t reflect.Type, drawbuf, decodebuf []byte, nvals, repeats int, heap *memory.Buffer) (interface{}, interface{}) { - switch t { - case reflect.TypeOf(true): - draws := *(*[]bool)(unsafe.Pointer(&drawbuf)) - decode := *(*[]bool)(unsafe.Pointer(&decodebuf)) - testutils.InitValues(draws[:nvals], heap) - - for j := 1; j < repeats; j++ { - for k := 0; k < nvals; k++ { - draws[nvals*j+k] = draws[k] - } - } - - return draws[:nvals*repeats], decode[:nvals*repeats] - case reflect.TypeOf(int32(0)): - draws := arrow.Int32Traits.CastFromBytes(drawbuf) - decode := arrow.Int32Traits.CastFromBytes(decodebuf) - testutils.InitValues(draws[:nvals], heap) - - for j := 1; j < repeats; j++ { - for k := 0; k < nvals; k++ { - draws[nvals*j+k] = draws[k] - } - } - - return draws[:nvals*repeats], decode[:nvals*repeats] - case reflect.TypeOf(int64(0)): - draws := arrow.Int64Traits.CastFromBytes(drawbuf) - decode := arrow.Int64Traits.CastFromBytes(decodebuf) - testutils.InitValues(draws[:nvals], heap) - - for j := 1; j < repeats; j++ { - for k := 0; k < nvals; k++ { - draws[nvals*j+k] = draws[k] - } - } - - return draws[:nvals*repeats], decode[:nvals*repeats] - case reflect.TypeOf(parquet.Int96{}): - draws := parquet.Int96Traits.CastFromBytes(drawbuf) - decode := parquet.Int96Traits.CastFromBytes(decodebuf) - testutils.InitValues(draws[:nvals], heap) - - for j := 1; j < repeats; j++ { - for k := 0; k < nvals; k++ { - draws[nvals*j+k] = draws[k] - } - } - - return draws[:nvals*repeats], decode[:nvals*repeats] - case reflect.TypeOf(float32(0)): - draws := arrow.Float32Traits.CastFromBytes(drawbuf) - decode := arrow.Float32Traits.CastFromBytes(decodebuf) - testutils.InitValues(draws[:nvals], heap) - - for j := 1; j < repeats; j++ { - 
for k := 0; k < nvals; k++ { - draws[nvals*j+k] = draws[k] - } - } - - return draws[:nvals*repeats], decode[:nvals*repeats] - case reflect.TypeOf(float64(0)): - draws := arrow.Float64Traits.CastFromBytes(drawbuf) - decode := arrow.Float64Traits.CastFromBytes(decodebuf) - testutils.InitValues(draws[:nvals], heap) - - for j := 1; j < repeats; j++ { - for k := 0; k < nvals; k++ { - draws[nvals*j+k] = draws[k] - } - } - - return draws[:nvals*repeats], decode[:nvals*repeats] - case reflect.TypeOf(parquet.ByteArray{}): - draws := make([]parquet.ByteArray, nvals*repeats) - decode := make([]parquet.ByteArray, nvals*repeats) - testutils.InitValues(draws[:nvals], heap) - - for j := 1; j < repeats; j++ { - for k := 0; k < nvals; k++ { - draws[nvals*j+k] = draws[k] - } - } - - return draws[:nvals*repeats], decode[:nvals*repeats] - case reflect.TypeOf(parquet.FixedLenByteArray{}): - draws := make([]parquet.FixedLenByteArray, nvals*repeats) - decode := make([]parquet.FixedLenByteArray, nvals*repeats) - testutils.InitValues(draws[:nvals], heap) - - for j := 1; j < repeats; j++ { - for k := 0; k < nvals; k++ { - draws[nvals*j+k] = draws[k] - } - } - - return draws[:nvals*repeats], decode[:nvals*repeats] - } - return nil, nil -} - -func encode(enc encoding.TypedEncoder, vals interface{}) { - switch v := vals.(type) { - case []bool: - enc.(encoding.BooleanEncoder).Put(v) - case []int32: - enc.(encoding.Int32Encoder).Put(v) - case []int64: - enc.(encoding.Int64Encoder).Put(v) - case []parquet.Int96: - enc.(encoding.Int96Encoder).Put(v) - case []float32: - enc.(encoding.Float32Encoder).Put(v) - case []float64: - enc.(encoding.Float64Encoder).Put(v) - case []parquet.ByteArray: - enc.(encoding.ByteArrayEncoder).Put(v) - case []parquet.FixedLenByteArray: - enc.(encoding.FixedLenByteArrayEncoder).Put(v) - } -} - -func encodeSpaced(enc encoding.TypedEncoder, vals interface{}, validBits []byte, validBitsOffset int64) { - switch v := vals.(type) { - case []bool: - 
enc.(encoding.BooleanEncoder).PutSpaced(v, validBits, validBitsOffset) - case []int32: - enc.(encoding.Int32Encoder).PutSpaced(v, validBits, validBitsOffset) - case []int64: - enc.(encoding.Int64Encoder).PutSpaced(v, validBits, validBitsOffset) - case []parquet.Int96: - enc.(encoding.Int96Encoder).PutSpaced(v, validBits, validBitsOffset) - case []float32: - enc.(encoding.Float32Encoder).PutSpaced(v, validBits, validBitsOffset) - case []float64: - enc.(encoding.Float64Encoder).PutSpaced(v, validBits, validBitsOffset) - case []parquet.ByteArray: - enc.(encoding.ByteArrayEncoder).PutSpaced(v, validBits, validBitsOffset) - case []parquet.FixedLenByteArray: - enc.(encoding.FixedLenByteArrayEncoder).PutSpaced(v, validBits, validBitsOffset) - } -} - -func decode(dec encoding.TypedDecoder, out interface{}) (int, error) { - switch v := out.(type) { - case []bool: - return dec.(encoding.BooleanDecoder).Decode(v) - case []int32: - return dec.(encoding.Int32Decoder).Decode(v) - case []int64: - return dec.(encoding.Int64Decoder).Decode(v) - case []parquet.Int96: - return dec.(encoding.Int96Decoder).Decode(v) - case []float32: - return dec.(encoding.Float32Decoder).Decode(v) - case []float64: - return dec.(encoding.Float64Decoder).Decode(v) - case []parquet.ByteArray: - return dec.(encoding.ByteArrayDecoder).Decode(v) - case []parquet.FixedLenByteArray: - return dec.(encoding.FixedLenByteArrayDecoder).Decode(v) - } - return 0, nil -} - -func decodeSpaced(dec encoding.TypedDecoder, out interface{}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { - switch v := out.(type) { - case []bool: - return dec.(encoding.BooleanDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) - case []int32: - return dec.(encoding.Int32Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) - case []int64: - return dec.(encoding.Int64Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) - case []parquet.Int96: - return 
dec.(encoding.Int96Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) - case []float32: - return dec.(encoding.Float32Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) - case []float64: - return dec.(encoding.Float64Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) - case []parquet.ByteArray: - return dec.(encoding.ByteArrayDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) - case []parquet.FixedLenByteArray: - return dec.(encoding.FixedLenByteArrayDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) - } - return 0, nil -} - -type BaseEncodingTestSuite struct { - suite.Suite - - descr *schema.Column - typeLen int - mem memory.Allocator - typ reflect.Type - - nvalues int - heap *memory.Buffer - inputBytes *memory.Buffer - outputBytes *memory.Buffer - nodeFactory nodeFactory - - draws interface{} - decodeBuf interface{} -} - -func (b *BaseEncodingTestSuite) SetupSuite() { - b.mem = memory.DefaultAllocator - b.inputBytes = memory.NewResizableBuffer(b.mem) - b.outputBytes = memory.NewResizableBuffer(b.mem) - b.heap = memory.NewResizableBuffer(b.mem) - b.nodeFactory = createNodeFactory(b.typ) -} - -func (b *BaseEncodingTestSuite) TearDownSuite() { - b.inputBytes.Release() - b.outputBytes.Release() - b.heap.Release() -} - -func (b *BaseEncodingTestSuite) SetupTest() { - b.descr = schema.NewColumn(b.nodeFactory("name", parquet.Repetitions.Optional, -1), 0, 0) - b.typeLen = int(b.descr.TypeLength()) -} - -func (b *BaseEncodingTestSuite) initData(nvalues, repeats int) { - b.nvalues = nvalues * repeats - b.inputBytes.ResizeNoShrink(b.nvalues * int(b.typ.Size())) - b.outputBytes.ResizeNoShrink(b.nvalues * int(b.typ.Size())) - memory.Set(b.inputBytes.Buf(), 0) - memory.Set(b.outputBytes.Buf(), 0) - - b.draws, b.decodeBuf = initdata(b.typ, b.inputBytes.Buf(), b.outputBytes.Buf(), nvalues, repeats, b.heap) -} - -func (b *BaseEncodingTestSuite) encodeTestData(e parquet.Encoding) encoding.Buffer { - enc := 
encoding.NewEncoder(testutils.TypeToParquetType(b.typ), e, false, b.descr, memory.DefaultAllocator) - b.Equal(e, enc.Encoding()) - b.Equal(b.descr.PhysicalType(), enc.Type()) - encode(enc, reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface()) - return enc.FlushValues() -} - -func (b *BaseEncodingTestSuite) decodeTestData(e parquet.Encoding, buf []byte) { - dec := encoding.NewDecoder(testutils.TypeToParquetType(b.typ), e, b.descr, b.mem) - b.Equal(e, dec.Encoding()) - b.Equal(b.descr.PhysicalType(), dec.Type()) - - dec.SetData(b.nvalues, buf) - decoded, _ := decode(dec, b.decodeBuf) - b.Equal(b.nvalues, decoded) - b.Equal(reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface(), reflect.ValueOf(b.decodeBuf).Slice(0, b.nvalues).Interface()) -} - -func (b *BaseEncodingTestSuite) encodeTestDataSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) encoding.Buffer { - enc := encoding.NewEncoder(testutils.TypeToParquetType(b.typ), e, false, b.descr, memory.DefaultAllocator) - encodeSpaced(enc, reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface(), validBits, validBitsOffset) - return enc.FlushValues() -} - -func (b *BaseEncodingTestSuite) decodeTestDataSpaced(e parquet.Encoding, nullCount int, buf []byte, validBits []byte, validBitsOffset int64) { - dec := encoding.NewDecoder(testutils.TypeToParquetType(b.typ), e, b.descr, b.mem) - dec.SetData(b.nvalues-nullCount, buf) - decoded, _ := decodeSpaced(dec, b.decodeBuf, nullCount, validBits, validBitsOffset) - b.Equal(b.nvalues, decoded) - - drawval := reflect.ValueOf(b.draws) - decodeval := reflect.ValueOf(b.decodeBuf) - for j := 0; j < b.nvalues; j++ { - if bitutil.BitIsSet(validBits, int(validBitsOffset)+j) { - b.Equal(drawval.Index(j).Interface(), decodeval.Index(j).Interface()) - } - } -} - -func (b *BaseEncodingTestSuite) checkRoundTrip(e parquet.Encoding) { - buf := b.encodeTestData(e) - defer buf.Release() - b.decodeTestData(e, buf.Bytes()) -} - -func (b *BaseEncodingTestSuite) 
checkRoundTripSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) { - buf := b.encodeTestDataSpaced(e, validBits, validBitsOffset) - defer buf.Release() - - nullCount := 0 - for i := 0; i < b.nvalues; i++ { - if bitutil.BitIsNotSet(validBits, int(validBitsOffset)+i) { - nullCount++ - } - } - b.decodeTestDataSpaced(e, nullCount, buf.Bytes(), validBits, validBitsOffset) -} - -func (b *BaseEncodingTestSuite) TestBasicRoundTrip() { - b.initData(10000, 1) - b.checkRoundTrip(parquet.Encodings.Plain) -} - -func (b *BaseEncodingTestSuite) TestDeltaEncodingRoundTrip() { - b.initData(10000, 1) - - switch b.typ { - case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)): - b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) - default: - b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) }) - } -} - -func (b *BaseEncodingTestSuite) TestDeltaLengthByteArrayRoundTrip() { - b.initData(10000, 1) - - switch b.typ { - case reflect.TypeOf(parquet.ByteArray{}): - b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) - default: - b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) }) - } -} - -func (b *BaseEncodingTestSuite) TestDeltaByteArrayRoundTrip() { - b.initData(10000, 1) - - switch b.typ { - case reflect.TypeOf(parquet.ByteArray{}): - b.checkRoundTrip(parquet.Encodings.DeltaByteArray) - default: - b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) }) - } -} - -func (b *BaseEncodingTestSuite) TestSpacedRoundTrip() { - exec := func(vals, repeats int, validBitsOffset int64, nullProb float64) { - b.Run(fmt.Sprintf("%d vals %d repeats %d offset %0.3f null", vals, repeats, validBitsOffset, 1-nullProb), func() { - b.initData(vals, repeats) - - size := int64(b.nvalues) + validBitsOffset - r := testutils.NewRandomArrayGenerator(1923) - arr := r.Uint8(size, 0, 100, 1-nullProb) - validBits := arr.NullBitmapBytes() - if validBits != nil { - b.checkRoundTripSpaced(parquet.Encodings.Plain, validBits, 
validBitsOffset) - switch b.typ { - case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)): - b.checkRoundTripSpaced(parquet.Encodings.DeltaBinaryPacked, validBits, validBitsOffset) - case reflect.TypeOf(parquet.ByteArray{}): - b.checkRoundTripSpaced(parquet.Encodings.DeltaLengthByteArray, validBits, validBitsOffset) - b.checkRoundTripSpaced(parquet.Encodings.DeltaByteArray, validBits, validBitsOffset) - } - } - }) - } - - const ( - avx512Size = 64 - simdSize = avx512Size - multiSimdSize = simdSize * 33 - ) - - for _, nullProb := range []float64{0.001, 0.1, 0.5, 0.9, 0.999} { - // Test with both size and offset up to 3 simd block - for i := 1; i < simdSize*3; i++ { - exec(i, 1, 0, nullProb) - exec(i, 1, int64(i+1), nullProb) - } - // large block and offset - exec(multiSimdSize, 1, 0, nullProb) - exec(multiSimdSize+33, 1, 0, nullProb) - exec(multiSimdSize, 1, 33, nullProb) - exec(multiSimdSize+33, 1, 33, nullProb) - } -} - -func TestEncoding(t *testing.T) { - tests := []struct { - name string - typ reflect.Type - }{ - {"Bool", reflect.TypeOf(true)}, - {"Int32", reflect.TypeOf(int32(0))}, - {"Int64", reflect.TypeOf(int64(0))}, - {"Float32", reflect.TypeOf(float32(0))}, - {"Float64", reflect.TypeOf(float64(0))}, - {"Int96", reflect.TypeOf(parquet.Int96{})}, - {"ByteArray", reflect.TypeOf(parquet.ByteArray{})}, - {"FixedLenByteArray", reflect.TypeOf(parquet.FixedLenByteArray{})}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - suite.Run(t, &BaseEncodingTestSuite{typ: tt.typ}) - }) - } -} - -type DictionaryEncodingTestSuite struct { - BaseEncodingTestSuite -} - -func (d *DictionaryEncodingTestSuite) encodeTestDataDict(e parquet.Encoding) (dictBuffer, indices encoding.Buffer, numEntries int) { - enc := encoding.NewEncoder(testutils.TypeToParquetType(d.typ), e, true, d.descr, memory.DefaultAllocator).(encoding.DictEncoder) - - d.Equal(parquet.Encodings.PlainDict, enc.Encoding()) - d.Equal(d.descr.PhysicalType(), enc.Type()) - encode(enc, 
reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface()) - dictBuffer = memory.NewResizableBuffer(d.mem) - dictBuffer.Resize(enc.DictEncodedSize()) - enc.WriteDict(dictBuffer.Bytes()) - indices = enc.FlushValues() - numEntries = enc.NumEntries() - return -} - -func (d *DictionaryEncodingTestSuite) encodeTestDataDictSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) (dictBuffer, indices encoding.Buffer, numEntries int) { - enc := encoding.NewEncoder(testutils.TypeToParquetType(d.typ), e, true, d.descr, memory.DefaultAllocator).(encoding.DictEncoder) - d.Equal(d.descr.PhysicalType(), enc.Type()) - - encodeSpaced(enc, reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), validBits, validBitsOffset) - dictBuffer = memory.NewResizableBuffer(d.mem) - dictBuffer.Resize(enc.DictEncodedSize()) - enc.WriteDict(dictBuffer.Bytes()) - indices = enc.FlushValues() - numEntries = enc.NumEntries() - return -} - -func (d *DictionaryEncodingTestSuite) checkRoundTrip() { - dictBuffer, indices, numEntries := d.encodeTestDataDict(parquet.Encodings.Plain) - defer dictBuffer.Release() - defer indices.Release() - validBits := make([]byte, int(bitutil.BytesForBits(int64(d.nvalues)))+1) - memory.Set(validBits, 255) - - spacedBuffer, indicesSpaced, _ := d.encodeTestDataDictSpaced(parquet.Encodings.Plain, validBits, 0) - defer spacedBuffer.Release() - defer indicesSpaced.Release() - d.Equal(indices.Bytes(), indicesSpaced.Bytes()) - - dictDecoder := encoding.NewDecoder(testutils.TypeToParquetType(d.typ), parquet.Encodings.Plain, d.descr, d.mem) - d.Equal(d.descr.PhysicalType(), dictDecoder.Type()) - dictDecoder.SetData(numEntries, dictBuffer.Bytes()) - decoder := encoding.NewDictDecoder(testutils.TypeToParquetType(d.typ), d.descr, d.mem) - decoder.SetDict(dictDecoder) - decoder.SetData(d.nvalues, indices.Bytes()) - - decoded, _ := decode(decoder, d.decodeBuf) - d.Equal(d.nvalues, decoded) - d.Equal(reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), 
reflect.ValueOf(d.decodeBuf).Slice(0, d.nvalues).Interface()) - - decoder.SetData(d.nvalues, indices.Bytes()) - decoded, _ = decodeSpaced(decoder, d.decodeBuf, 0, validBits, 0) - d.Equal(d.nvalues, decoded) - d.Equal(reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), reflect.ValueOf(d.decodeBuf).Slice(0, d.nvalues).Interface()) -} - -func (d *DictionaryEncodingTestSuite) TestBasicRoundTrip() { - d.initData(2500, 2) - d.checkRoundTrip() -} - -func TestDictEncoding(t *testing.T) { - tests := []struct { - name string - typ reflect.Type - }{ - {"Int32", reflect.TypeOf(int32(0))}, - {"Int64", reflect.TypeOf(int64(0))}, - {"Float32", reflect.TypeOf(float32(0))}, - {"Float64", reflect.TypeOf(float64(0))}, - {"ByteArray", reflect.TypeOf(parquet.ByteArray{})}, - {"FixedLenByteArray", reflect.TypeOf(parquet.FixedLenByteArray{})}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - suite.Run(t, &DictionaryEncodingTestSuite{BaseEncodingTestSuite{typ: tt.typ}}) - }) - } -} - -func TestWriteDeltaBitPackedInt32(t *testing.T) { - column := schema.NewColumn(schema.NewInt32Node("int32", parquet.Repetitions.Required, -1), 0, 0) - - tests := []struct { - name string - toencode []int32 - expected []byte - }{ - {"simple 12345", []int32{1, 2, 3, 4, 5}, []byte{128, 1, 4, 5, 2, 2, 0, 0, 0, 0}}, - {"odd vals", []int32{7, 5, 3, 1, 2, 3, 4, 5}, []byte{128, 1, 4, 8, 14, 3, 2, 0, 0, 0, 192, 63, 0, 0, 0, 0, 0, 0}}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - enc := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) - - enc.(encoding.Int32Encoder).Put(tt.toencode) - buf := enc.FlushValues() - defer buf.Release() - - assert.Equal(t, tt.expected, buf.Bytes()) - - dec := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) - - dec.(encoding.Int32Decoder).SetData(len(tt.toencode), tt.expected) - out := make([]int32, 
len(tt.toencode)) - dec.(encoding.Int32Decoder).Decode(out) - assert.Equal(t, tt.toencode, out) - }) - } - - t.Run("test progressive decoding", func(t *testing.T) { - values := make([]int32, 1000) - testutils.FillRandomInt32(0, values) - - enc := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) - enc.(encoding.Int32Encoder).Put(values) - buf := enc.FlushValues() - defer buf.Release() - - dec := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) - dec.(encoding.Int32Decoder).SetData(len(values), buf.Bytes()) - - valueBuf := make([]int32, 100) - for i, j := 0, len(valueBuf); j <= len(values); i, j = i+len(valueBuf), j+len(valueBuf) { - dec.(encoding.Int32Decoder).Decode(valueBuf) - assert.Equalf(t, values[i:j], valueBuf, "indexes %d:%d", i, j) - } - }) -} - -func TestWriteDeltaBitPackedInt64(t *testing.T) { - column := schema.NewColumn(schema.NewInt64Node("int64", parquet.Repetitions.Required, -1), 0, 0) - - tests := []struct { - name string - toencode []int64 - expected []byte - }{ - {"simple 12345", []int64{1, 2, 3, 4, 5}, []byte{128, 1, 4, 5, 2, 2, 0, 0, 0, 0}}, - {"odd vals", []int64{7, 5, 3, 1, 2, 3, 4, 5}, []byte{128, 1, 4, 8, 14, 3, 2, 0, 0, 0, 192, 63, 0, 0, 0, 0, 0, 0}}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) - - enc.(encoding.Int64Encoder).Put(tt.toencode) - buf := enc.FlushValues() - defer buf.Release() - - assert.Equal(t, tt.expected, buf.Bytes()) - - dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) - - dec.(encoding.Int64Decoder).SetData(len(tt.toencode), tt.expected) - out := make([]int64, len(tt.toencode)) - dec.(encoding.Int64Decoder).Decode(out) - assert.Equal(t, tt.toencode, out) - }) - } - - 
t.Run("test progressive decoding", func(t *testing.T) { - values := make([]int64, 1000) - testutils.FillRandomInt64(0, values) - - enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) - enc.(encoding.Int64Encoder).Put(values) - buf := enc.FlushValues() - defer buf.Release() - - dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) - dec.(encoding.Int64Decoder).SetData(len(values), buf.Bytes()) - - valueBuf := make([]int64, 100) - for i, j := 0, len(valueBuf); j <= len(values); i, j = i+len(valueBuf), j+len(valueBuf) { - decoded, _ := dec.(encoding.Int64Decoder).Decode(valueBuf) - assert.Equal(t, len(valueBuf), decoded) - assert.Equalf(t, values[i:j], valueBuf, "indexes %d:%d", i, j) - } - }) -} - -func TestDeltaLengthByteArrayEncoding(t *testing.T) { - column := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0) - - test := []parquet.ByteArray{[]byte("Hello"), []byte("World"), []byte("Foobar"), []byte("ABCDEF")} - expected := []byte{128, 1, 4, 4, 10, 0, 1, 0, 0, 0, 2, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, 70, 111, 111, 98, 97, 114, 65, 66, 67, 68, 69, 70} - - enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, false, column, memory.DefaultAllocator) - enc.(encoding.ByteArrayEncoder).Put(test) - buf := enc.FlushValues() - defer buf.Release() - - assert.Equal(t, expected, buf.Bytes()) - - dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, column, nil) - dec.SetData(len(test), expected) - out := make([]parquet.ByteArray, len(test)) - decoded, _ := dec.(encoding.ByteArrayDecoder).Decode(out) - assert.Equal(t, len(test), decoded) - assert.Equal(t, test, out) -} - -func TestDeltaByteArrayEncoding(t *testing.T) { - test := []parquet.ByteArray{[]byte("Hello"), []byte("World"), []byte("Foobar"), 
[]byte("ABCDEF")} - expected := []byte{128, 1, 4, 4, 0, 0, 0, 0, 0, 0, 128, 1, 4, 4, 10, 0, 1, 0, 0, 0, 2, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, 70, 111, 111, 98, 97, 114, 65, 66, 67, 68, 69, 70} - - enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.DeltaByteArray, false, nil, nil) - enc.(encoding.ByteArrayEncoder).Put(test) - buf := enc.FlushValues() - defer buf.Release() - - assert.Equal(t, expected, buf.Bytes()) - - dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaByteArray, nil, nil) - dec.SetData(len(test), expected) - out := make([]parquet.ByteArray, len(test)) - decoded, _ := dec.(encoding.ByteArrayDecoder).Decode(out) - assert.Equal(t, len(test), decoded) - assert.Equal(t, test, out) -} diff --git a/go/parquet/internal/encoding/levels.go b/go/parquet/internal/encoding/levels.go deleted file mode 100644 index c45858d4653..00000000000 --- a/go/parquet/internal/encoding/levels.go +++ /dev/null @@ -1,284 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package encoding - -import ( - "bytes" - "encoding/binary" - "io" - "math/bits" - - "github.com/JohnCGriffin/overflow" - "github.com/apache/arrow/go/arrow/bitutil" - "github.com/apache/arrow/go/parquet" - format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet" - "github.com/apache/arrow/go/parquet/internal/utils" -) - -// LevelEncoder is for handling the encoding of Definition and Repetition levels -// to parquet files. -type LevelEncoder struct { - bitWidth int - rleLen int - encoding format.Encoding - rle *utils.RleEncoder - bit *utils.BitWriter -} - -// LevelEncodingMaxBufferSize estimates the max number of bytes needed to encode data with the -// specified encoding given the max level and number of buffered values provided. -func LevelEncodingMaxBufferSize(encoding parquet.Encoding, maxLvl int16, nbuffered int) int { - bitWidth := bits.Len64(uint64(maxLvl)) - nbytes := 0 - switch encoding { - case parquet.Encodings.RLE: - nbytes = utils.MaxBufferSize(bitWidth, nbuffered) + utils.MinBufferSize(bitWidth) - case parquet.Encodings.BitPacked: - nbytes = int(bitutil.BytesForBits(int64(nbuffered * bitWidth))) - default: - panic("parquet: unknown encoding type for levels") - } - return nbytes -} - -// Reset resets the encoder allowing it to be reused and updating the maxlevel to the new -// specified value. -func (l *LevelEncoder) Reset(maxLvl int16) { - l.bitWidth = bits.Len64(uint64(maxLvl)) - switch l.encoding { - case format.Encoding_RLE: - l.rle.Clear() - l.rle.BitWidth = l.bitWidth - case format.Encoding_BIT_PACKED: - l.bit.Clear() - default: - panic("parquet: unknown encoding type") - } -} - -// Init is called to set up the desired encoding type, max level and underlying writer for a -// level encoder to control where the resulting encoded buffer will end up. 
-func (l *LevelEncoder) Init(encoding parquet.Encoding, maxLvl int16, w io.WriterAt) { - l.bitWidth = bits.Len64(uint64(maxLvl)) - l.encoding = format.Encoding(encoding) - switch l.encoding { - case format.Encoding_RLE: - l.rle = utils.NewRleEncoder(w, l.bitWidth) - case format.Encoding_BIT_PACKED: - l.bit = utils.NewBitWriter(w) - default: - panic("parquet: unknown encoding type for levels") - } -} - -// EncodeNoFlush encodes the provided levels in the encoder, but doesn't flush -// the buffer and return it yet, appending these encoded values. Returns the number -// of values encoded. -func (l *LevelEncoder) EncodeNoFlush(lvls []int16) int { - nencoded := 0 - if l.rle == nil && l.bit == nil { - panic("parquet: level encoders are not initialized") - } - - switch l.encoding { - case format.Encoding_RLE: - for _, level := range lvls { - if !l.rle.Put(uint64(level)) { - break - } - nencoded++ - } - default: - for _, level := range lvls { - if l.bit.WriteValue(uint64(level), uint(l.bitWidth)) != nil { - break - } - nencoded++ - } - } - return nencoded -} - -// Flush flushes out any encoded data to the underlying writer. -func (l *LevelEncoder) Flush() { - if l.rle == nil && l.bit == nil { - panic("parquet: level encoders are not initialized") - } - - switch l.encoding { - case format.Encoding_RLE: - l.rleLen = l.rle.Flush() - default: - l.bit.Flush(false) - } -} - -// Encode encodes the slice of definition or repetition levels based on -// the currently configured encoding type and returns the number of -// values that were encoded. 
-func (l *LevelEncoder) Encode(lvls []int16) int { - nencoded := 0 - if l.rle == nil && l.bit == nil { - panic("parquet: level encoders are not initialized") - } - - switch l.encoding { - case format.Encoding_RLE: - for _, level := range lvls { - if !l.rle.Put(uint64(level)) { - break - } - nencoded++ - } - l.rleLen = l.rle.Flush() - default: - for _, level := range lvls { - if l.bit.WriteValue(uint64(level), uint(l.bitWidth)) != nil { - break - } - nencoded++ - } - l.bit.Flush(false) - } - return nencoded -} - -// Len returns the number of bytes that were written as Run Length encoded -// levels, this is only valid for run length encoding and will panic if using -// deprecated bit packed encoding. -func (l *LevelEncoder) Len() int { - if l.encoding != format.Encoding_RLE { - panic("parquet: level encoder, only implemented for RLE") - } - return l.rleLen -} - -// LevelDecoder handles the decoding of repetition and definition levels from a -// parquet file supporting bit packed and run length encoded values. -type LevelDecoder struct { - bitWidth int - remaining int - maxLvl int16 - encoding format.Encoding - rle *utils.RleDecoder - bit *utils.BitReader -} - -// SetData sets in the data to be decoded by subsequent calls by specifying the encoding type -// the maximum level (which is what determines the bit width), the number of values expected -// and the raw bytes to decode. Returns the number of bytes expected to be decoded. 
-func (l *LevelDecoder) SetData(encoding parquet.Encoding, maxLvl int16, nbuffered int, data []byte) int { - l.maxLvl = maxLvl - l.encoding = format.Encoding(encoding) - l.remaining = nbuffered - l.bitWidth = bits.Len64(uint64(maxLvl)) - - switch encoding { - case parquet.Encodings.RLE: - if len(data) < 4 { - panic("parquet: received invalid levels (corrupt data page?)") - } - - nbytes := int32(binary.LittleEndian.Uint32(data[:4])) - if nbytes < 0 || nbytes > int32(len(data)-4) { - panic("parquet: received invalid number of bytes (corrupt data page?)") - } - - buf := data[4:] - if l.rle == nil { - l.rle = utils.NewRleDecoder(bytes.NewReader(buf), l.bitWidth) - } else { - l.rle.Reset(bytes.NewReader(buf), l.bitWidth) - } - return int(nbytes) + 4 - case parquet.Encodings.BitPacked: - nbits, ok := overflow.Mul(nbuffered, l.bitWidth) - if !ok { - panic("parquet: number of buffered values too large (corrupt data page?)") - } - - nbytes := bitutil.BytesForBits(int64(nbits)) - if nbytes < 0 || nbytes > int64(len(data)) { - panic("parquet: recieved invalid number of bytes (corrupt data page?)") - } - if l.bit == nil { - l.bit = utils.NewBitReader(bytes.NewReader(data)) - } else { - l.bit.Reset(bytes.NewReader(data)) - } - return int(nbytes) - default: - panic("parquet: unknown encoding type for levels") - } -} - -// SetDataV2 is the same as SetData but only for DataPageV2 pages and only supports -// run length encoding. 
-func (l *LevelDecoder) SetDataV2(nbytes int32, maxLvl int16, nbuffered int, data []byte) { - if nbytes < 0 { - panic("parquet: invalid page header (corrupt data page?)") - } - - l.maxLvl = maxLvl - l.encoding = format.Encoding_RLE - l.remaining = nbuffered - l.bitWidth = bits.Len64(uint64(maxLvl)) - - if l.rle == nil { - l.rle = utils.NewRleDecoder(bytes.NewReader(data), l.bitWidth) - } else { - l.rle.Reset(bytes.NewReader(data), l.bitWidth) - } -} - -// Decode decodes the bytes that were set with SetData into the slice of levels -// returning the total number of levels that were decoded and the number of -// values which had a level equal to the max level, indicating how many physical -// values exist to be read. -func (l *LevelDecoder) Decode(levels []int16) (int, int64) { - var ( - buf [1024]uint64 - totaldecoded int - decoded int - valsToRead int64 - ) - - n := utils.Min(int64(l.remaining), int64(len(levels))) - for n > 0 { - batch := utils.Min(1024, n) - switch l.encoding { - case format.Encoding_RLE: - decoded = l.rle.GetBatch(buf[:batch]) - case format.Encoding_BIT_PACKED: - decoded, _ = l.bit.GetBatch(uint(l.bitWidth), buf[:batch]) - } - l.remaining -= decoded - totaldecoded += decoded - n -= batch - - for idx, val := range buf[:decoded] { - lvl := int16(val) - levels[idx] = lvl - if lvl == l.maxLvl { - valsToRead++ - } - } - levels = levels[decoded:] - } - - return totaldecoded, valsToRead -} diff --git a/go/parquet/internal/encoding/levels_test.go b/go/parquet/internal/encoding/levels_test.go deleted file mode 100644 index f39c1d558d8..00000000000 --- a/go/parquet/internal/encoding/levels_test.go +++ /dev/null @@ -1,288 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package encoding_test - -import ( - "encoding/binary" - "strconv" - "testing" - - "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/memory" - "github.com/apache/arrow/go/parquet" - "github.com/apache/arrow/go/parquet/internal/encoding" - "github.com/stretchr/testify/assert" -) - -func generateLevels(minRepeat, maxRepeat int, maxLevel int16) []int16 { - // for each repetition count up to max repeat - ret := make([]int16, 0) - for rep := minRepeat; rep <= maxRepeat; rep++ { - var ( - repCount = 1 << rep - val int16 = 0 - bwidth = 0 - ) - // generate levels for repetition count up to max level - for val <= maxLevel { - for i := 0; i < repCount; i++ { - ret = append(ret, val) - } - val = int16((2 << bwidth) - 1) - bwidth++ - } - } - return ret -} - -func encodeLevels(t *testing.T, enc parquet.Encoding, maxLvl int16, numLevels int, input []int16) []byte { - var ( - encoder encoding.LevelEncoder - lvlCount = 0 - buf = encoding.NewBufferWriter(2*numLevels, memory.DefaultAllocator) - ) - - if enc == parquet.Encodings.RLE { - buf.SetOffset(arrow.Int32SizeBytes) - // leave space to write the rle length value - encoder.Init(enc, maxLvl, buf) - lvlCount = encoder.Encode(input) - buf.SetOffset(0) - arrow.Int32Traits.CastFromBytes(buf.Bytes())[0] = int32(encoder.Len()) - } else { - encoder.Init(enc, maxLvl, buf) - lvlCount = encoder.Encode(input) - } - - assert.Equal(t, 
numLevels, lvlCount) - return buf.Bytes() -} - -func verifyDecodingLvls(t *testing.T, enc parquet.Encoding, maxLvl int16, input []int16, buf []byte) { - var ( - decoder encoding.LevelDecoder - lvlCount = 0 - numLevels = len(input) - output = make([]int16, numLevels) - decodeCount = 4 - numInnerLevels = numLevels / decodeCount - ) - - // decode levels and test with multiple decode calls - decoder.SetData(enc, maxLvl, numLevels, buf) - // try multiple decoding on a single setdata call - for ct := 0; ct < decodeCount; ct++ { - offset := ct * numInnerLevels - lvlCount, _ = decoder.Decode(output[:numInnerLevels]) - assert.Equal(t, numInnerLevels, lvlCount) - assert.Equal(t, input[offset:offset+numInnerLevels], output[:numInnerLevels]) - } - - // check the remaining levels - var ( - levelsCompleted = decodeCount * (numLevels / decodeCount) - remaining = numLevels - levelsCompleted - ) - - if remaining > 0 { - lvlCount, _ = decoder.Decode(output[:remaining]) - assert.Equal(t, remaining, lvlCount) - assert.Equal(t, input[levelsCompleted:], output[:remaining]) - } - // test decode zero values - lvlCount, _ = decoder.Decode(output[:1]) - assert.Zero(t, lvlCount) -} - -func verifyDecodingMultipleSetData(t *testing.T, enc parquet.Encoding, max int16, input []int16, buf [][]byte) { - var ( - decoder encoding.LevelDecoder - lvlCount = 0 - setdataCount = len(buf) - numLevels = len(input) / setdataCount - output = make([]int16, numLevels) - ) - - for ct := 0; ct < setdataCount; ct++ { - offset := ct * numLevels - assert.Len(t, output, numLevels) - decoder.SetData(enc, max, numLevels, buf[ct]) - lvlCount, _ = decoder.Decode(output) - assert.Equal(t, numLevels, lvlCount) - assert.Equal(t, input[offset:offset+numLevels], output) - } -} - -func TestLevelsDecodeMultipleBitWidth(t *testing.T) { - t.Parallel() - // Test levels with maximum bit-width from 1 to 8 - // increase the repetition count for each iteration by a factor of 2 - var ( - minRepeat = 0 - maxRepeat = 7 // 128 - 
maxBitWidth = 8 - input []int16 - buf []byte - encodings = [2]parquet.Encoding{parquet.Encodings.RLE, parquet.Encodings.BitPacked} - ) - - for _, enc := range encodings { - t.Run(enc.String(), func(t *testing.T) { - // bitpacked requires a sequence of at least 8 - if enc == parquet.Encodings.BitPacked { - minRepeat = 3 - } - // for each max bit width - for bitWidth := 1; bitWidth <= maxBitWidth; bitWidth++ { - t.Run(strconv.Itoa(bitWidth), func(t *testing.T) { - max := int16((1 << bitWidth) - 1) - // generate levels - input = generateLevels(minRepeat, maxRepeat, max) - assert.NotPanics(t, func() { - buf = encodeLevels(t, enc, max, len(input), input) - }) - assert.NotPanics(t, func() { - verifyDecodingLvls(t, enc, max, input, buf) - }) - }) - } - }) - } -} - -func TestLevelsDecodeMultipleSetData(t *testing.T) { - t.Parallel() - - var ( - minRepeat = 3 - maxRepeat = 7 - bitWidth = 8 - maxLevel = int16((1 << bitWidth) - 1) - encodings = [2]parquet.Encoding{parquet.Encodings.RLE, parquet.Encodings.BitPacked} - ) - - input := generateLevels(minRepeat, maxRepeat, maxLevel) - - var ( - numLevels = len(input) - setdataFactor = 8 - splitLevelSize = numLevels / setdataFactor - buf = make([][]byte, setdataFactor) - ) - - for _, enc := range encodings { - t.Run(enc.String(), func(t *testing.T) { - for rf := 0; rf < setdataFactor; rf++ { - offset := rf * splitLevelSize - assert.NotPanics(t, func() { - buf[rf] = encodeLevels(t, enc, maxLevel, splitLevelSize, input[offset:offset+splitLevelSize]) - }) - } - assert.NotPanics(t, func() { - verifyDecodingMultipleSetData(t, enc, maxLevel, input, buf) - }) - }) - } -} - -func TestMinimumBufferSize(t *testing.T) { - t.Parallel() - - const numToEncode = 1024 - levels := make([]int16, numToEncode) - - for idx := range levels { - if idx%9 == 0 { - levels[idx] = 0 - } else { - levels[idx] = 1 - } - } - - output := encoding.NewBufferWriter(0, memory.DefaultAllocator) - - var encoder encoding.LevelEncoder - encoder.Init(parquet.Encodings.RLE, 
1, output) - count := encoder.Encode(levels) - assert.Equal(t, numToEncode, count) -} - -func TestMinimumBufferSize2(t *testing.T) { - t.Parallel() - - // test the worst case for bit_width=2 consisting of - // LiteralRun(size=8) - // RepeatedRun(size=8) - // LiteralRun(size=8) - // ... - const numToEncode = 1024 - levels := make([]int16, numToEncode) - - for idx := range levels { - // This forces a literal run of 00000001 - // followed by eight 1s - if (idx % 16) < 7 { - levels[idx] = 0 - } else { - levels[idx] = 1 - } - } - - for bitWidth := int16(1); bitWidth <= 8; bitWidth++ { - output := encoding.NewBufferWriter(0, memory.DefaultAllocator) - - var encoder encoding.LevelEncoder - encoder.Init(parquet.Encodings.RLE, bitWidth, output) - count := encoder.Encode(levels) - assert.Equal(t, numToEncode, count) - } -} - -func TestEncodeDecodeLevels(t *testing.T) { - t.Parallel() - const numToEncode = 2048 - levels := make([]int16, numToEncode) - numones := 0 - for idx := range levels { - if (idx % 16) < 7 { - levels[idx] = 0 - } else { - levels[idx] = 1 - numones++ - } - } - - output := encoding.NewBufferWriter(0, memory.DefaultAllocator) - - var encoder encoding.LevelEncoder - encoder.Init(parquet.Encodings.RLE, 1, output) - count := encoder.Encode(levels) - assert.Equal(t, numToEncode, count) - encoder.Flush() - - buf := output.Bytes() - var prefix [4]byte - binary.LittleEndian.PutUint32(prefix[:], uint32(len(buf))) - - var decoder encoding.LevelDecoder - decoder.SetData(parquet.Encodings.RLE, 1, numToEncode, append(prefix[:], buf...)) - var levelOut [numToEncode]int16 - total, vals := decoder.Decode(levelOut[:]) - assert.EqualValues(t, numToEncode, total) - assert.EqualValues(t, numones, vals) - assert.Equal(t, levels, levelOut[:]) -} diff --git a/go/parquet/internal/encoding/memo_table.go b/go/parquet/internal/encoding/memo_table.go deleted file mode 100644 index 9a04e6e0d02..00000000000 --- a/go/parquet/internal/encoding/memo_table.go +++ /dev/null @@ -1,380 +0,0 
@@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package encoding - -import ( - "math" - "unsafe" - - "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/array" - "github.com/apache/arrow/go/arrow/memory" - "github.com/apache/arrow/go/parquet" - "github.com/apache/arrow/go/parquet/internal/hashing" -) - -//go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata memo_table_types.gen.go.tmpl - -// MemoTable interface that can be used to swap out implementations of the hash table -// used for handling dictionary encoding. Dictionary encoding is built against this interface -// to make it easy for code generation and changing implementations. -// -// Values should remember the order they are inserted to generate a valid dictionary index -type MemoTable interface { - // Reset drops everything in the table allowing it to be reused - Reset() - // Size returns the current number of unique values stored in the table - // including whether or not a null value has been passed in using GetOrInsertNull - Size() int - // CopyValues populates out with the values currently in the table, out must - // be a slice of the appropriate type for the table type. 
- CopyValues(out interface{}) - // CopyValuesSubset is like CopyValues but only copies a subset of values starting - // at the indicated index. - CopyValuesSubset(start int, out interface{}) - // Get returns the index of the table the specified value is, and a boolean indicating - // whether or not the value was found in the table. Will panic if val is not the appropriate - // type for the underlying table. - Get(val interface{}) (int, bool) - // GetOrInsert is the same as Get, except if the value is not currently in the table it will - // be inserted into the table. - GetOrInsert(val interface{}) (idx int, existed bool, err error) - // GetNull returns the index of the null value and whether or not it was found in the table - GetNull() (int, bool) - // GetOrInsertNull returns the index of the null value, if it didn't already exist in the table, - // it is inserted. - GetOrInsertNull() (idx int, existed bool) -} - -// BinaryMemoTable is an extension of the MemoTable interface adding extra methods -// for handling byte arrays/strings/fixed length byte arrays. -type BinaryMemoTable interface { - MemoTable - // ValuesSize returns the total number of bytes needed to copy all of the values - // from this table. - ValuesSize() int - // CopyOffsets populates out with the start and end offsets of each value in the - // table data. Out should be sized to Size()+1 to accomodate all of the offsets. - CopyOffsets(out []int8) - // CopyOffsetsSubset is like CopyOffsets but only gets a subset of the offsets - // starting at the specified index. - CopyOffsetsSubset(start int, out []int8) - // CopyFixedWidthValues exists to cope with the fact that the table doesn't track - // the fixed width when inserting the null value into the databuffer populating - // a zero length byte slice for the null value (if found). 
- CopyFixedWidthValues(start int, width int, out []byte) - // VisitValues calls visitFn on each value in the table starting with the index specified - VisitValues(start int, visitFn func([]byte)) - // Retain increases the reference count of the separately stored binary data that is - // kept alongside the table which contains all of the values in the table. This is - // safe to call simultaneously across multiple goroutines. - Retain() - // Release decreases the reference count by 1 of the separately stored binary data - // kept alongside the table containing the values. When the reference count goes to - // 0, the memory is freed. This is safe to call across multiple goroutines simultaneoulsy. - Release() -} - -// NewInt32Dictionary returns a memotable interface for use with Int32 values only -func NewInt32Dictionary() MemoTable { - return hashing.NewInt32MemoTable(0) -} - -// NewInt64Dictionary returns a memotable interface for use with Int64 values only -func NewInt64Dictionary() MemoTable { - return hashing.NewInt64MemoTable(0) -} - -// NewFloat32Dictionary returns a memotable interface for use with Float32 values only -func NewFloat32Dictionary() MemoTable { - return hashing.NewFloat32MemoTable(0) -} - -// NewFloat64Dictionary returns a memotable interface for use with Float64 values only -func NewFloat64Dictionary() MemoTable { - return hashing.NewFloat64MemoTable(0) -} - -// NewBinaryDictionary returns a memotable interface for use with strings, byte slices, -// parquet.ByteArray and parquet.FixedLengthByteArray only. -func NewBinaryDictionary(mem memory.Allocator) BinaryMemoTable { - return hashing.NewBinaryMemoTable(mem, 0, -1) -} - -const keyNotFound = hashing.KeyNotFound - -// standard map based implementation of a binary memotable which is only kept around -// currently to be used as a benchmark against the memotables in the internal/hashing -// module as a baseline comparison. 
- -func NewBinaryMemoTable(mem memory.Allocator) BinaryMemoTable { - return &binaryMemoTableImpl{ - table: make(map[string]int), - nullIndex: keyNotFound, - builder: array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary), - } -} - -type binaryMemoTableImpl struct { - table map[string]int - builder *array.BinaryBuilder - nullIndex int -} - -func (m *binaryMemoTableImpl) Reset() { - m.table = make(map[string]int) - m.nullIndex = keyNotFound - m.builder.NewArray().Release() -} - -func (m *binaryMemoTableImpl) CopyValues(out interface{}) { - m.CopyValuesSubset(0, out) -} - -func (m *binaryMemoTableImpl) GetNull() (int, bool) { - return m.nullIndex, m.nullIndex != keyNotFound -} - -func (m *binaryMemoTableImpl) ValuesSize() int { - return m.builder.DataLen() -} - -func (m *binaryMemoTableImpl) Size() int { - sz := len(m.table) - if _, ok := m.GetNull(); ok { - sz++ - } - return sz -} - -func (m *binaryMemoTableImpl) valAsString(val interface{}) string { - switch v := val.(type) { - case string: - return v - case []byte: - return *(*string)(unsafe.Pointer(&v)) - case parquet.ByteArray: - return *(*string)(unsafe.Pointer(&v)) - case parquet.FixedLenByteArray: - return *(*string)(unsafe.Pointer(&v)) - default: - panic("invalid type for value in binarymemotable") - } -} - -func (m *binaryMemoTableImpl) Get(val interface{}) (int, bool) { - key := m.valAsString(val) - if p, ok := m.table[key]; ok { - return p, true - } - return keyNotFound, false -} - -func (m *binaryMemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { - key := m.valAsString(val) - idx, found = m.table[key] - if !found { - idx = m.Size() - m.builder.AppendString(key) - m.table[key] = idx - } - return -} - -func (m *binaryMemoTableImpl) GetOrInsertNull() (idx int, found bool) { - idx, found = m.GetNull() - if !found { - idx = m.Size() - m.nullIndex = idx - m.builder.AppendNull() - } - return -} - -func (m *binaryMemoTableImpl) findOffset(idx int) uintptr { - val := m.builder.Value(idx) 
- for len(val) == 0 { - idx++ - if idx >= m.builder.Len() { - break - } - val = m.builder.Value(idx) - } - if len(val) != 0 { - return uintptr(unsafe.Pointer(&val[0])) - } - return uintptr(m.builder.DataLen()) + m.findOffset(0) -} - -func (m *binaryMemoTableImpl) CopyValuesSubset(start int, out interface{}) { - var ( - first = m.findOffset(0) - offset = m.findOffset(int(start)) - length = m.builder.DataLen() - int(offset-first) - ) - - outval := out.([]byte) - copy(outval, m.builder.Value(start)[0:length]) -} - -func (m *binaryMemoTableImpl) CopyFixedWidthValues(start, width int, out []byte) { - -} - -func (m *binaryMemoTableImpl) CopyOffsetsSubset(start int, out []int8) { - if m.builder.Len() <= start { - return - } - - first := m.findOffset(0) - delta := m.findOffset(start) - for i := start; i < m.Size(); i++ { - offset := int8(m.findOffset(i) - delta) - out[i-start] = offset - } - - out[m.Size()-start] = int8(m.builder.DataLen() - int(delta) - int(first)) -} - -func (m *binaryMemoTableImpl) CopyOffsets(out []int8) { - m.CopyOffsetsSubset(0, out) -} - -func (m *binaryMemoTableImpl) VisitValues(start int, visitFn func([]byte)) { - for i := int(start); i < m.Size(); i++ { - visitFn(m.builder.Value(i)) - } -} - -func (m *binaryMemoTableImpl) Release() { - m.builder.Release() -} - -func (m *binaryMemoTableImpl) Retain() { - m.builder.Retain() -} - -// standard map based implementation of a float64 memotable which is only kept around -// currently to be used as a benchmark against the memotables in the internal/hashing -// module as a baseline comparison. 
- -func NewFloat64MemoTable(memory.Allocator) MemoTable { - return &float64MemoTableImpl{ - table: make(map[float64]struct { - value float64 - memoIndex int - }), - nullIndex: keyNotFound, - nanIndex: keyNotFound, - } -} - -type float64MemoTableImpl struct { - table map[float64]struct { - value float64 - memoIndex int - } - nullIndex int - nanIndex int -} - -func (m *float64MemoTableImpl) Reset() { - m.table = make(map[float64]struct { - value float64 - memoIndex int - }) - m.nullIndex = keyNotFound - m.nanIndex = keyNotFound -} - -func (m *float64MemoTableImpl) GetNull() (int, bool) { - return m.nullIndex, m.nullIndex != keyNotFound -} - -func (m *float64MemoTableImpl) Size() int { - sz := len(m.table) - if _, ok := m.GetNull(); ok { - sz++ - } - if m.nanIndex != keyNotFound { - sz++ - } - return sz -} - -func (m *float64MemoTableImpl) GetOrInsertNull() (idx int, found bool) { - idx, found = m.GetNull() - if !found { - idx = m.Size() - m.nullIndex = idx - } - return -} - -func (m *float64MemoTableImpl) Get(val interface{}) (int, bool) { - v := val.(float64) - if p, ok := m.table[v]; ok { - return p.memoIndex, true - } - if math.IsNaN(v) && m.nanIndex != keyNotFound { - return m.nanIndex, true - } - return keyNotFound, false -} - -func (m *float64MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { - v := val.(float64) - if math.IsNaN(v) { - if m.nanIndex == keyNotFound { - idx = m.Size() - m.nanIndex = idx - } else { - idx = m.nanIndex - found = true - } - return - } - - p, ok := m.table[v] - if ok { - idx = p.memoIndex - } else { - idx = m.Size() - p.value = v - p.memoIndex = idx - m.table[v] = p - found = true - } - return -} - -func (m *float64MemoTableImpl) CopyValues(out interface{}) { - m.CopyValuesSubset(0, out) -} - -func (m *float64MemoTableImpl) CopyValuesSubset(start int, out interface{}) { - outval := out.([]float64) - for _, v := range m.table { - idx := v.memoIndex - start - if idx >= 0 { - outval[idx] = v.value - } - } - if 
m.nanIndex != keyNotFound { - outval[m.nanIndex] = math.NaN() - } -} diff --git a/go/parquet/internal/encoding/memo_table_test.go b/go/parquet/internal/encoding/memo_table_test.go deleted file mode 100644 index 96f1c22733b..00000000000 --- a/go/parquet/internal/encoding/memo_table_test.go +++ /dev/null @@ -1,284 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package encoding_test - -import ( - "math" - "testing" - - "github.com/apache/arrow/go/arrow/memory" - "github.com/apache/arrow/go/parquet/internal/encoding" - "github.com/apache/arrow/go/parquet/internal/hashing" - "github.com/stretchr/testify/suite" -) - -type MemoTableTestSuite struct { - suite.Suite -} - -func TestMemoTable(t *testing.T) { - suite.Run(t, new(MemoTableTestSuite)) -} - -func (m *MemoTableTestSuite) assertGetNotFound(table encoding.MemoTable, v interface{}) { - _, ok := table.Get(v) - m.False(ok) -} - -func (m *MemoTableTestSuite) assertGet(table encoding.MemoTable, v interface{}, expected int) { - idx, ok := table.Get(v) - m.Equal(expected, idx) - m.True(ok) -} - -func (m *MemoTableTestSuite) assertGetOrInsert(table encoding.MemoTable, v interface{}, expected int) { - idx, _, err := table.GetOrInsert(v) - m.NoError(err) - m.Equal(expected, idx) -} - -func (m *MemoTableTestSuite) assertGetNullNotFound(table encoding.MemoTable) { - _, ok := table.GetNull() - m.False(ok) -} - -func (m *MemoTableTestSuite) assertGetNull(table encoding.MemoTable, expected int) { - idx, ok := table.GetNull() - m.Equal(expected, idx) - m.True(ok) -} - -func (m *MemoTableTestSuite) assertGetOrInsertNull(table encoding.MemoTable, expected int) { - idx, _ := table.GetOrInsertNull() - m.Equal(expected, idx) -} - -func (m *MemoTableTestSuite) TestInt64() { - const ( - A int64 = 1234 - B int64 = 0 - C int64 = -98765321 - D int64 = 12345678901234 - E int64 = -1 - F int64 = 1 - G int64 = 9223372036854775807 - H int64 = -9223372036854775807 - 1 - ) - - // table := encoding.NewInt64MemoTable(nil) - table := hashing.NewInt64MemoTable(0) - m.Zero(table.Size()) - m.assertGetNotFound(table, A) - m.assertGetNullNotFound(table) - m.assertGetOrInsert(table, A, 0) - m.assertGetNotFound(table, B) - m.assertGetOrInsert(table, B, 1) - m.assertGetOrInsert(table, C, 2) - m.assertGetOrInsert(table, D, 3) - m.assertGetOrInsert(table, E, 4) - m.assertGetOrInsertNull(table, 5) - - 
m.assertGet(table, A, 0) - m.assertGetOrInsert(table, A, 0) - m.assertGet(table, E, 4) - m.assertGetOrInsert(table, E, 4) - - m.assertGetOrInsert(table, F, 6) - m.assertGetOrInsert(table, G, 7) - m.assertGetOrInsert(table, H, 8) - - m.assertGetOrInsert(table, G, 7) - m.assertGetOrInsert(table, F, 6) - m.assertGetOrInsertNull(table, 5) - m.assertGetOrInsert(table, E, 4) - m.assertGetOrInsert(table, D, 3) - m.assertGetOrInsert(table, C, 2) - m.assertGetOrInsert(table, B, 1) - m.assertGetOrInsert(table, A, 0) - - const sz int = 9 - m.Equal(sz, table.Size()) - m.Panics(func() { - values := make([]int32, sz) - table.CopyValues(values) - }, "should panic because wrong type") - m.Panics(func() { - values := make([]int64, sz-3) - table.CopyValues(values) - }, "should panic because out of bounds") - - { - values := make([]int64, sz) - table.CopyValues(values) - m.Equal([]int64{A, B, C, D, E, 0, F, G, H}, values) - } - { - const offset = 3 - values := make([]int64, sz-offset) - table.CopyValuesSubset(offset, values) - m.Equal([]int64{D, E, 0, F, G, H}, values) - } -} - -func (m *MemoTableTestSuite) TestFloat64() { - const ( - A float64 = 0.0 - B float64 = 1.5 - C float64 = -0.1 - ) - var ( - D = math.Inf(1) - E = -D - F = math.NaN() - ) - - // table := encoding.NewFloat64MemoTable(nil) - // table := &hashing.Float64MemoTable{hashing.NewScalarMemoTable(0)} - table := hashing.NewFloat64MemoTable(0) - m.Zero(table.Size()) - m.assertGetNotFound(table, A) - m.assertGetNullNotFound(table) - m.assertGetOrInsert(table, A, 0) - m.assertGetNotFound(table, B) - m.assertGetOrInsert(table, B, 1) - m.assertGetOrInsert(table, C, 2) - m.assertGetOrInsert(table, D, 3) - m.assertGetOrInsert(table, E, 4) - m.assertGetOrInsert(table, F, 5) - - m.assertGet(table, A, 0) - m.assertGetOrInsert(table, A, 0) - m.assertGetOrInsert(table, B, 1) - m.assertGetOrInsert(table, C, 2) - m.assertGetOrInsert(table, D, 3) - m.assertGet(table, E, 4) - m.assertGetOrInsert(table, E, 4) - m.assertGet(table, F, 5) - 
m.assertGetOrInsert(table, F, 5) - - m.Equal(6, table.Size()) - expected := []float64{A, B, C, D, E, F} - m.Panics(func() { - values := make([]int32, 6) - table.CopyValues(values) - }, "should panic because wrong type") - m.Panics(func() { - values := make([]float64, 3) - table.CopyValues(values) - }, "should panic because out of bounds") - - values := make([]float64, len(expected)) - table.CopyValues(values) - for idx, ex := range expected { - if math.IsNaN(ex) { - m.True(math.IsNaN(values[idx])) - } else { - m.Equal(ex, values[idx]) - } - } -} - -func (m *MemoTableTestSuite) TestBinaryBasics() { - const ( - A = "" - B = "a" - C = "foo" - D = "bar" - E = "\000" - F = "\000trailing" - ) - - table := hashing.NewBinaryMemoTable(memory.DefaultAllocator, 0, -1) - defer table.Release() - - m.Zero(table.Size()) - m.assertGetNotFound(table, A) - m.assertGetNullNotFound(table) - m.assertGetOrInsert(table, A, 0) - m.assertGetNotFound(table, B) - m.assertGetOrInsert(table, B, 1) - m.assertGetOrInsert(table, C, 2) - m.assertGetOrInsert(table, D, 3) - m.assertGetOrInsert(table, E, 4) - m.assertGetOrInsert(table, F, 5) - m.assertGetOrInsertNull(table, 6) - - m.assertGet(table, A, 0) - m.assertGetOrInsert(table, A, 0) - m.assertGet(table, B, 1) - m.assertGetOrInsert(table, B, 1) - m.assertGetOrInsert(table, C, 2) - m.assertGetOrInsert(table, D, 3) - m.assertGetOrInsert(table, E, 4) - m.assertGet(table, F, 5) - m.assertGetOrInsert(table, F, 5) - m.assertGetNull(table, 6) - m.assertGetOrInsertNull(table, 6) - - m.Equal(7, table.Size()) - m.Equal(17, table.ValuesSize()) - - size := table.Size() - { - offsets := make([]int8, size+1) - table.CopyOffsets(offsets) - m.Equal([]int8{0, 0, 1, 4, 7, 8, 17, 17}, offsets) - - expectedValues := "afoobar" - expectedValues += "\000" - expectedValues += "\000" - expectedValues += "trailing" - values := make([]byte, 17) - table.CopyValues(values) - m.Equal(expectedValues, string(values)) - } - - { - startOffset := 4 - offsets := make([]int8, 
size+1-int(startOffset)) - table.CopyOffsetsSubset(startOffset, offsets) - m.Equal([]int8{0, 1, 10, 10}, offsets) - - expectedValues := "" - expectedValues += "\000" - expectedValues += "\000" - expectedValues += "trailing" - - values := make([]byte, 10) - table.CopyValuesSubset(startOffset, values) - m.Equal(expectedValues, string(values)) - } - - { - startOffset := 1 - values := make([]string, 0) - table.VisitValues(startOffset, func(b []byte) { - values = append(values, string(b)) - }) - m.Equal([]string{B, C, D, E, F, ""}, values) - } -} - -func (m *MemoTableTestSuite) TestBinaryEmpty() { - table := encoding.NewBinaryMemoTable(memory.DefaultAllocator) - defer table.Release() - - m.Zero(table.Size()) - offsets := make([]int8, 1) - table.CopyOffsetsSubset(0, offsets) - m.Equal(int8(0), offsets[0]) -} diff --git a/go/parquet/internal/encoding/memo_table_types.gen.go b/go/parquet/internal/encoding/memo_table_types.gen.go deleted file mode 100644 index 5c4812cbbeb..00000000000 --- a/go/parquet/internal/encoding/memo_table_types.gen.go +++ /dev/null @@ -1,366 +0,0 @@ -// Code generated by memo_table_types.gen.go.tmpl. DO NOT EDIT. - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package encoding - -import ( - "github.com/apache/arrow/go/arrow/memory" - "github.com/apache/arrow/go/parquet" -) - -// standard map based implementation of memo tables which can be more efficient -// in some cases based on the uniqueness / amount / size of the data. -// these are left here for now for use in the benchmarks to compare against the -// custom hash table implementation in the internal/hashing package as a base -// benchmark comparison. - -func NewInt32MemoTable(memory.Allocator) MemoTable { - return &int32MemoTableImpl{ - table: make(map[int32]struct { - value int32 - memoIndex int - }), - nullIndex: keyNotFound, - } -} - -type int32MemoTableImpl struct { - table map[int32]struct { - value int32 - memoIndex int - } - nullIndex int -} - -func (m *int32MemoTableImpl) Reset() { - m.table = make(map[int32]struct { - value int32 - memoIndex int - }) - m.nullIndex = keyNotFound -} - -func (m *int32MemoTableImpl) GetNull() (int, bool) { - return m.nullIndex, m.nullIndex != keyNotFound -} - -func (m *int32MemoTableImpl) Size() int { - sz := len(m.table) - if _, ok := m.GetNull(); ok { - sz++ - } - return sz -} - -func (m *int32MemoTableImpl) GetOrInsertNull() (idx int, found bool) { - idx, found = m.GetNull() - if !found { - idx = m.Size() - m.nullIndex = idx - } - return -} - -func (m *int32MemoTableImpl) Get(val interface{}) (int, bool) { - v := val.(int32) - if p, ok := m.table[v]; ok { - return p.memoIndex, true - } - return keyNotFound, false -} - -func (m *int32MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { - v := val.(int32) - p, ok := m.table[v] - if ok { - idx = p.memoIndex - } else { - idx = m.Size() - p.value = v - p.memoIndex = idx - m.table[v] = p - found = true - } - return -} - -func (m *int32MemoTableImpl) CopyValues(out interface{}) { - m.CopyValuesSubset(0, out) -} - -func (m *int32MemoTableImpl) CopyValuesSubset(start int, out interface{}) { - outval := out.([]int32) - for _, v := range m.table { - idx := 
v.memoIndex - start - if idx >= 0 { - outval[idx] = v.value - } - } -} - -func NewInt64MemoTable(memory.Allocator) MemoTable { - return &int64MemoTableImpl{ - table: make(map[int64]struct { - value int64 - memoIndex int - }), - nullIndex: keyNotFound, - } -} - -type int64MemoTableImpl struct { - table map[int64]struct { - value int64 - memoIndex int - } - nullIndex int -} - -func (m *int64MemoTableImpl) Reset() { - m.table = make(map[int64]struct { - value int64 - memoIndex int - }) - m.nullIndex = keyNotFound -} - -func (m *int64MemoTableImpl) GetNull() (int, bool) { - return m.nullIndex, m.nullIndex != keyNotFound -} - -func (m *int64MemoTableImpl) Size() int { - sz := len(m.table) - if _, ok := m.GetNull(); ok { - sz++ - } - return sz -} - -func (m *int64MemoTableImpl) GetOrInsertNull() (idx int, found bool) { - idx, found = m.GetNull() - if !found { - idx = m.Size() - m.nullIndex = idx - } - return -} - -func (m *int64MemoTableImpl) Get(val interface{}) (int, bool) { - v := val.(int64) - if p, ok := m.table[v]; ok { - return p.memoIndex, true - } - return keyNotFound, false -} - -func (m *int64MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { - v := val.(int64) - p, ok := m.table[v] - if ok { - idx = p.memoIndex - } else { - idx = m.Size() - p.value = v - p.memoIndex = idx - m.table[v] = p - found = true - } - return -} - -func (m *int64MemoTableImpl) CopyValues(out interface{}) { - m.CopyValuesSubset(0, out) -} - -func (m *int64MemoTableImpl) CopyValuesSubset(start int, out interface{}) { - outval := out.([]int64) - for _, v := range m.table { - idx := v.memoIndex - start - if idx >= 0 { - outval[idx] = v.value - } - } -} - -func NewInt96MemoTable(memory.Allocator) MemoTable { - return &int96MemoTableImpl{ - table: make(map[parquet.Int96]struct { - value parquet.Int96 - memoIndex int - }), - nullIndex: keyNotFound, - } -} - -type int96MemoTableImpl struct { - table map[parquet.Int96]struct { - value parquet.Int96 - memoIndex int - 
} - nullIndex int -} - -func (m *int96MemoTableImpl) Reset() { - m.table = make(map[parquet.Int96]struct { - value parquet.Int96 - memoIndex int - }) - m.nullIndex = keyNotFound -} - -func (m *int96MemoTableImpl) GetNull() (int, bool) { - return m.nullIndex, m.nullIndex != keyNotFound -} - -func (m *int96MemoTableImpl) Size() int { - sz := len(m.table) - if _, ok := m.GetNull(); ok { - sz++ - } - return sz -} - -func (m *int96MemoTableImpl) GetOrInsertNull() (idx int, found bool) { - idx, found = m.GetNull() - if !found { - idx = m.Size() - m.nullIndex = idx - } - return -} - -func (m *int96MemoTableImpl) Get(val interface{}) (int, bool) { - v := val.(parquet.Int96) - if p, ok := m.table[v]; ok { - return p.memoIndex, true - } - return keyNotFound, false -} - -func (m *int96MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { - v := val.(parquet.Int96) - p, ok := m.table[v] - if ok { - idx = p.memoIndex - } else { - idx = m.Size() - p.value = v - p.memoIndex = idx - m.table[v] = p - found = true - } - return -} - -func (m *int96MemoTableImpl) CopyValues(out interface{}) { - m.CopyValuesSubset(0, out) -} - -func (m *int96MemoTableImpl) CopyValuesSubset(start int, out interface{}) { - outval := out.([]parquet.Int96) - for _, v := range m.table { - idx := v.memoIndex - start - if idx >= 0 { - outval[idx] = v.value - } - } -} - -func NewFloat32MemoTable(memory.Allocator) MemoTable { - return &float32MemoTableImpl{ - table: make(map[float32]struct { - value float32 - memoIndex int - }), - nullIndex: keyNotFound, - } -} - -type float32MemoTableImpl struct { - table map[float32]struct { - value float32 - memoIndex int - } - nullIndex int -} - -func (m *float32MemoTableImpl) Reset() { - m.table = make(map[float32]struct { - value float32 - memoIndex int - }) - m.nullIndex = keyNotFound -} - -func (m *float32MemoTableImpl) GetNull() (int, bool) { - return m.nullIndex, m.nullIndex != keyNotFound -} - -func (m *float32MemoTableImpl) Size() int { - sz 
:= len(m.table) - if _, ok := m.GetNull(); ok { - sz++ - } - return sz -} - -func (m *float32MemoTableImpl) GetOrInsertNull() (idx int, found bool) { - idx, found = m.GetNull() - if !found { - idx = m.Size() - m.nullIndex = idx - } - return -} - -func (m *float32MemoTableImpl) Get(val interface{}) (int, bool) { - v := val.(float32) - if p, ok := m.table[v]; ok { - return p.memoIndex, true - } - return keyNotFound, false -} - -func (m *float32MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { - v := val.(float32) - p, ok := m.table[v] - if ok { - idx = p.memoIndex - } else { - idx = m.Size() - p.value = v - p.memoIndex = idx - m.table[v] = p - found = true - } - return -} - -func (m *float32MemoTableImpl) CopyValues(out interface{}) { - m.CopyValuesSubset(0, out) -} - -func (m *float32MemoTableImpl) CopyValuesSubset(start int, out interface{}) { - outval := out.([]float32) - for _, v := range m.table { - idx := v.memoIndex - start - if idx >= 0 { - outval[idx] = v.value - } - } -} diff --git a/go/parquet/internal/encoding/memo_table_types.gen.go.tmpl b/go/parquet/internal/encoding/memo_table_types.gen.go.tmpl deleted file mode 100644 index 0a0a7af2920..00000000000 --- a/go/parquet/internal/encoding/memo_table_types.gen.go.tmpl +++ /dev/null @@ -1,115 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package encoding - -import ( - "github.com/apache/arrow/go/parquet" -) - -// standard map based implementation of memo tables which can be more efficient -// in some cases based on the uniqueness / amount / size of the data. -// these are left here for now for use in the benchmarks to compare against the -// custom hash table implementation in the internal/hashing package as a base -// benchmark comparison. - -{{range .In}} -{{if and (ne .Name "ByteArray") (ne .Name "FixedLenByteArray") (ne .Name "Float64") (ne .Name "Boolean")}} -func New{{.Name}}MemoTable(memory.Allocator) MemoTable { - return &{{.lower}}MemoTableImpl{ - table: make(map[{{.name}}]struct{ - value {{.name}} - memoIndex int - }), - nullIndex: keyNotFound, - } -} - -type {{.lower}}MemoTableImpl struct { - table map[{{.name}}]struct{ - value {{.name}} - memoIndex int - } - nullIndex int -} - -func (m *{{.lower}}MemoTableImpl) Reset() { - m.table = make(map[{{.name}}]struct{ - value {{.name}} - memoIndex int - }) - m.nullIndex = keyNotFound -} - -func (m *{{.lower}}MemoTableImpl) GetNull() (int, bool) { - return m.nullIndex, m.nullIndex != keyNotFound -} - -func (m *{{.lower}}MemoTableImpl) Size() int { - sz := len(m.table) - if _, ok := m.GetNull(); ok { - sz++ - } - return sz -} - -func (m *{{.lower}}MemoTableImpl) GetOrInsertNull() (idx int, found bool) { - idx, found = m.GetNull() - if !found { - idx = m.Size() - m.nullIndex = idx - } - return -} - -func (m *{{.lower}}MemoTableImpl) Get(val interface{}) (int, bool) { - v := val.({{.name}}) - if p, ok := m.table[v]; ok { - return 
p.memoIndex, true - } - return keyNotFound, false -} - -func (m *{{.lower}}MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) { - v := val.({{.name}}) - p, ok := m.table[v] - if ok { - idx = p.memoIndex - } else { - idx = m.Size() - p.value = v - p.memoIndex = idx - m.table[v] = p - found = true - } - return -} - -func (m *{{.lower}}MemoTableImpl) CopyValues(out interface{}) { - m.CopyValuesSubset(0, out) -} - -func (m *{{.lower}}MemoTableImpl) CopyValuesSubset(start int, out interface{}) { - outval := out.([]{{.name}}) - for _, v := range m.table { - idx := v.memoIndex - start - if idx >= 0 { - outval[idx] = v.value - } - } -} -{{end}} -{{end}} diff --git a/go/parquet/internal/encoding/typed_encoder.gen.go b/go/parquet/internal/encoding/typed_encoder.gen.go index 192286f987c..abcfd95142e 100644 --- a/go/parquet/internal/encoding/typed_encoder.gen.go +++ b/go/parquet/internal/encoding/typed_encoder.gen.go @@ -74,9 +74,6 @@ type int32EncoderTraits struct{} // Encoder returns an encoder for int32 type data, using the specified encoding type and whether or not // it should be dictionary encoded. func (int32EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { - if useDict { - return &DictInt32Encoder{newDictEncoderBase(descr, NewInt32Dictionary(), mem)} - } switch e { case format.Encoding_PLAIN: @@ -290,9 +287,6 @@ type int64EncoderTraits struct{} // Encoder returns an encoder for int64 type data, using the specified encoding type and whether or not // it should be dictionary encoded. func (int64EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { - if useDict { - return &DictInt64Encoder{newDictEncoderBase(descr, NewInt64Dictionary(), mem)} - } switch e { case format.Encoding_PLAIN: @@ -507,9 +501,6 @@ type int96EncoderTraits struct{} // it should be dictionary encoded. 
// dictionary encoding does not exist for this type and Encoder will panic if useDict is true func (int96EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { - if useDict { - panic("parquet: no parquet.Int96 dictionary encoding") - } switch e { case format.Encoding_PLAIN: @@ -564,9 +555,6 @@ type float32EncoderTraits struct{} // Encoder returns an encoder for float32 type data, using the specified encoding type and whether or not // it should be dictionary encoded. func (float32EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { - if useDict { - return &DictFloat32Encoder{newDictEncoderBase(descr, NewFloat32Dictionary(), mem)} - } switch e { case format.Encoding_PLAIN: @@ -768,9 +756,6 @@ type float64EncoderTraits struct{} // Encoder returns an encoder for float64 type data, using the specified encoding type and whether or not // it should be dictionary encoded. func (float64EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { - if useDict { - return &DictFloat64Encoder{newDictEncoderBase(descr, NewFloat64Dictionary(), mem)} - } switch e { case format.Encoding_PLAIN: @@ -973,9 +958,6 @@ type boolEncoderTraits struct{} // it should be dictionary encoded. // dictionary encoding does not exist for this type and Encoder will panic if useDict is true func (boolEncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { - if useDict { - panic("parquet: no bool dictionary encoding") - } switch e { case format.Encoding_PLAIN: @@ -1030,9 +1012,6 @@ type byteArrayEncoderTraits struct{} // Encoder returns an encoder for byteArray type data, using the specified encoding type and whether or not // it should be dictionary encoded. 
func (byteArrayEncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { - if useDict { - return &DictByteArrayEncoder{newDictEncoderBase(descr, NewBinaryDictionary(mem), mem)} - } switch e { case format.Encoding_PLAIN: @@ -1238,9 +1217,6 @@ type fixedLenByteArrayEncoderTraits struct{} // Encoder returns an encoder for fixedLenByteArray type data, using the specified encoding type and whether or not // it should be dictionary encoded. func (fixedLenByteArrayEncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { - if useDict { - return &DictFixedLenByteArrayEncoder{newDictEncoderBase(descr, NewBinaryDictionary(mem), mem)} - } switch e { case format.Encoding_PLAIN: diff --git a/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl b/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl index 0667143ac07..509266b6878 100644 --- a/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl +++ b/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl @@ -60,13 +60,13 @@ type {{.lower}}EncoderTraits struct{} // dictionary encoding does not exist for this type and Encoder will panic if useDict is true {{- end }} func ({{.lower}}EncoderTraits) Encoder(e format.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { - if useDict { + {{/* if useDict { {{- if or (eq .Name "Boolean") (eq .Name "Int96")}} panic("parquet: no {{.name}} dictionary encoding") {{- else}} return &Dict{{.Name}}Encoder{newDictEncoderBase(descr, New{{if and (ne .Name "ByteArray") (ne .Name "FixedLenByteArray")}}{{.Name}}Dictionary(){{else}}BinaryDictionary(mem){{end}}, mem)} {{- end}} - } + } */}} switch e { case format.Encoding_PLAIN: diff --git a/go/parquet/internal/encoding/types.go b/go/parquet/internal/encoding/types.go index 7bea96a87d8..fa3661e1119 100644 --- a/go/parquet/internal/encoding/types.go +++ b/go/parquet/internal/encoding/types.go @@ -435,3 
+435,63 @@ func (b *BufferWriter) Seek(offset int64, whence int) (int64, error) { func (b *BufferWriter) Tell() int64 { return int64(b.pos) } + +// MemoTable interface that can be used to swap out implementations of the hash table +// used for handling dictionary encoding. Dictionary encoding is built against this interface +// to make it easy for code generation and changing implementations. +// +// Values should remember the order they are inserted to generate a valid dictionary index +type MemoTable interface { + // Reset drops everything in the table allowing it to be reused + Reset() + // Size returns the current number of unique values stored in the table + // including whether or not a null value has been passed in using GetOrInsertNull + Size() int + // CopyValues populates out with the values currently in the table, out must + // be a slice of the appropriate type for the table type. + CopyValues(out interface{}) + // CopyValuesSubset is like CopyValues but only copies a subset of values starting + // at the indicated index. + CopyValuesSubset(start int, out interface{}) + // Get returns the index of the table the specified value is, and a boolean indicating + // whether or not the value was found in the table. Will panic if val is not the appropriate + // type for the underlying table. + Get(val interface{}) (int, bool) + // GetOrInsert is the same as Get, except if the value is not currently in the table it will + // be inserted into the table. + GetOrInsert(val interface{}) (idx int, existed bool, err error) + // GetNull returns the index of the null value and whether or not it was found in the table + GetNull() (int, bool) + // GetOrInsertNull returns the index of the null value, if it didn't already exist in the table, + // it is inserted. + GetOrInsertNull() (idx int, existed bool) +} + +// BinaryMemoTable is an extension of the MemoTable interface adding extra methods +// for handling byte arrays/strings/fixed length byte arrays. 
+type BinaryMemoTable interface { + MemoTable + // ValuesSize returns the total number of bytes needed to copy all of the values + // from this table. + ValuesSize() int + // CopyOffsets populates out with the start and end offsets of each value in the + // table data. Out should be sized to Size()+1 to accommodate all of the offsets. + CopyOffsets(out []int8) + // CopyOffsetsSubset is like CopyOffsets but only gets a subset of the offsets + // starting at the specified index. + CopyOffsetsSubset(start int, out []int8) + // CopyFixedWidthValues exists to cope with the fact that the table doesn't track + // the fixed width when inserting the null value into the databuffer populating + // a zero length byte slice for the null value (if found). + CopyFixedWidthValues(start int, width int, out []byte) + // VisitValues calls visitFn on each value in the table starting with the index specified + VisitValues(start int, visitFn func([]byte)) + // Retain increases the reference count of the separately stored binary data that is + // kept alongside the table which contains all of the values in the table. This is + // safe to call simultaneously across multiple goroutines. + Retain() + // Release decreases the reference count by 1 of the separately stored binary data + // kept alongside the table containing the values. When the reference count goes to + // 0, the memory is freed. This is safe to call across multiple goroutines simultaneously. + Release() +} diff --git a/go/parquet/internal/hashing/hashing_test.go b/go/parquet/internal/hashing/hashing_test.go deleted file mode 100644 index 875424a9d49..00000000000 --- a/go/parquet/internal/hashing/hashing_test.go +++ /dev/null @@ -1,114 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership.
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package hashing - -import ( - "math/rand" - "testing" - - "github.com/stretchr/testify/assert" -) - -func MakeDistinctIntegers(nvals int) map[int]bool { - r := rand.New(rand.NewSource(42)) - values := make(map[int]bool) - for len(values) < nvals { - values[r.Int()] = true - } - return values -} - -func MakeSequentialIntegers(nvals int) map[int]bool { - values := make(map[int]bool) - for i := 0; i < nvals; i++ { - values[i] = true - } - return values -} - -func MakeDistinctStrings(nvals int) map[string]bool { - values := make(map[string]bool) - - r := rand.New(rand.NewSource(42)) - - max := 'z' - min := '0' - for len(values) < nvals { - data := make([]byte, r.Intn(24)) - for idx := range data { - data[idx] = byte(r.Intn(int(max-min+1)) + int(min)) - } - values[string(data)] = true - } - return values -} - -func TestHashingQualityInt(t *testing.T) { - const nvalues = 10000 - - tests := []struct { - name string - values map[int]bool - quality float64 - }{ - {"distinct", MakeDistinctIntegers(nvalues), 0.96}, - {"sequential", MakeSequentialIntegers(nvalues), 0.96}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - hashes := make(map[uint64]bool) - for k := range tt.values { - hashes[hashInt(uint64(k), 0)] = true - hashes[hashInt(uint64(k), 1)] = true - } - assert.GreaterOrEqual(t, float64(len(hashes)), tt.quality*float64(2*len(tt.values))) - }) - } -} - -func 
TestHashingBoundsStrings(t *testing.T) { - sizes := []int{1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, 18, 19, 20, 21} - for _, s := range sizes { - str := make([]byte, s) - for idx := range str { - str[idx] = uint8(idx) - } - - h := hash(str, 1) - diff := 0 - for i := 0; i < 120; i++ { - str[len(str)-1] = uint8(i) - if hash(str, 1) != h { - diff++ - } - } - assert.GreaterOrEqual(t, diff, 118) - } -} - -func TestHashingQualityString(t *testing.T) { - const nvalues = 10000 - values := MakeDistinctStrings(nvalues) - - hashes := make(map[uint64]bool) - for k := range values { - hashes[hashString(k, 0)] = true - hashes[hashString(k, 1)] = true - } - assert.GreaterOrEqual(t, float64(len(hashes)), 0.96*float64(2*len(values))) -} diff --git a/go/parquet/internal/hashing/types.tmpldata b/go/parquet/internal/hashing/types.tmpldata deleted file mode 100644 index 2e97e9814e0..00000000000 --- a/go/parquet/internal/hashing/types.tmpldata +++ /dev/null @@ -1,18 +0,0 @@ -[ - { - "Name": "Int32", - "name": "int32" - }, - { - "Name": "Int64", - "name": "int64" - }, - { - "Name": "Float32", - "name": "float32" - }, - { - "Name": "Float64", - "name": "float64" - } -] diff --git a/go/parquet/internal/hashing/xxh3_memo_table.gen.go b/go/parquet/internal/hashing/xxh3_memo_table.gen.go deleted file mode 100644 index 1f37d180803..00000000000 --- a/go/parquet/internal/hashing/xxh3_memo_table.gen.go +++ /dev/null @@ -1,1009 +0,0 @@ -// Code generated by xxh3_memo_table.gen.go.tmpl. DO NOT EDIT. - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package hashing - -import ( - "math" - - "github.com/apache/arrow/go/arrow/bitutil" -) - -type payloadInt32 struct { - val int32 - memoIdx int32 -} - -type entryInt32 struct { - h uint64 - payload payloadInt32 -} - -func (e entryInt32) Valid() bool { return e.h != sentinel } - -// Int32HashTable is a hashtable specifically for int32 that -// is utilized with the MemoTable to generalize interactions for easier -// implementation of dictionaries without losing performance. -type Int32HashTable struct { - cap uint64 - capMask uint64 - size uint64 - - entries []entryInt32 -} - -// NewInt32HashTable returns a new hash table for int32 values -// initialized with the passed in capacity or 32 whichever is larger. -func NewInt32HashTable(cap uint64) *Int32HashTable { - initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) - ret := &Int32HashTable{cap: initCap, capMask: initCap - 1, size: 0} - ret.entries = make([]entryInt32, initCap) - return ret -} - -// Reset drops all of the values in this hash table and re-initializes it -// with the specified initial capacity as if by calling New, but without having -// to reallocate the object. 
-func (h *Int32HashTable) Reset(cap uint64) { - h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) - h.capMask = h.cap - 1 - h.size = 0 - h.entries = make([]entryInt32, h.cap) -} - -// CopyValues is used for copying the values out of the hash table into the -// passed in slice, in the order that they were first inserted -func (h *Int32HashTable) CopyValues(out []int32) { - h.CopyValuesSubset(0, out) -} - -// CopyValuesSubset copies a subset of the values in the hashtable out, starting -// with the value at start, in the order that they were inserted. -func (h *Int32HashTable) CopyValuesSubset(start int, out []int32) { - h.VisitEntries(func(e *entryInt32) { - idx := e.payload.memoIdx - int32(start) - if idx >= 0 { - out[idx] = e.payload.val - } - }) -} - -func (h *Int32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } - -func (Int32HashTable) fixHash(v uint64) uint64 { - if v == sentinel { - return 42 - } - return v -} - -// Lookup retrieves the entry for a given hash value assuming it's payload value returns -// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, -// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
-func (h *Int32HashTable) Lookup(v uint64, cmp func(int32) bool) (*entryInt32, bool) { - idx, ok := h.lookup(v, h.capMask, cmp) - return &h.entries[idx], ok -} - -func (h *Int32HashTable) lookup(v uint64, szMask uint64, cmp func(int32) bool) (uint64, bool) { - const perturbShift uint8 = 5 - - var ( - idx uint64 - perturb uint64 - e *entryInt32 - ) - - v = h.fixHash(v) - idx = v & szMask - perturb = (v >> uint64(perturbShift)) + 1 - - for { - e = &h.entries[idx] - if e.h == v && cmp(e.payload.val) { - return idx, true - } - - if e.h == sentinel { - return idx, false - } - - // perturbation logic inspired from CPython's set/dict object - // the goal is that all 64 bits of unmasked hash value eventually - // participate int he probing sequence, to minimize clustering - idx = (idx + perturb) & szMask - perturb = (perturb >> uint64(perturbShift)) + 1 - } -} - -func (h *Int32HashTable) upsize(newcap uint64) error { - newMask := newcap - 1 - - oldEntries := h.entries - h.entries = make([]entryInt32, newcap) - for _, e := range oldEntries { - if e.Valid() { - idx, _ := h.lookup(e.h, newMask, func(int32) bool { return false }) - h.entries[idx] = e - } - } - h.cap = newcap - h.capMask = newMask - return nil -} - -// Insert updates the given entry with the provided hash value, payload value and memo index. -// The entry pointer must have been retrieved via lookup in order to actually insert properly. -func (h *Int32HashTable) Insert(e *entryInt32, v uint64, val int32, memoIdx int32) error { - e.h = h.fixHash(v) - e.payload.val = val - e.payload.memoIdx = memoIdx - h.size++ - - if h.needUpsize() { - h.upsize(h.cap * uint64(loadFactor) * 2) - } - return nil -} - -// VisitEntries will call the passed in function on each *valid* entry in the hash table, -// a valid entry being one which has had a value inserted into it. 
-func (h *Int32HashTable) VisitEntries(visit func(*entryInt32)) { - for _, e := range h.entries { - if e.Valid() { - visit(&e) - } - } -} - -// Int32MemoTable is a wrapper over the appropriate hashtable to provide an interface -// conforming to the MemoTable interface defined in the encoding package for general interactions -// regarding dictionaries. -type Int32MemoTable struct { - tbl *Int32HashTable - nullIdx int32 -} - -// NewInt32MemoTable returns a new memotable with num entries pre-allocated to reduce further -// allocations when inserting. -func NewInt32MemoTable(num int64) *Int32MemoTable { - return &Int32MemoTable{tbl: NewInt32HashTable(uint64(num)), nullIdx: KeyNotFound} -} - -// Reset allows this table to be re-used by dumping all the data currently in the table. -func (s *Int32MemoTable) Reset() { - s.tbl.Reset(32) - s.nullIdx = KeyNotFound -} - -// Size returns the current number of inserted elements into the table including if a null -// has been inserted. -func (s *Int32MemoTable) Size() int { - sz := int(s.tbl.size) - if _, ok := s.GetNull(); ok { - sz++ - } - return sz -} - -// GetNull returns the index of an inserted null or KeyNotFound along with a bool -// that will be true if found and false if not. -func (s *Int32MemoTable) GetNull() (int, bool) { - return int(s.nullIdx), s.nullIdx != KeyNotFound -} - -// GetOrInsertNull will return the index of the null entry or insert a null entry -// if one currently doesn't exist. The found value will be true if there was already -// a null in the table, and false if it inserted one. -func (s *Int32MemoTable) GetOrInsertNull() (idx int, found bool) { - idx, found = s.GetNull() - if !found { - idx = s.Size() - s.nullIdx = int32(idx) - } - return -} - -// CopyValues will copy the values from the memo table out into the passed in slice -// which must be of the appropriate type. 
-func (s *Int32MemoTable) CopyValues(out interface{}) { - s.CopyValuesSubset(0, out) -} - -// CopyValuesSubset is like CopyValues but only copies a subset of values starting -// at the provided start index -func (s *Int32MemoTable) CopyValuesSubset(start int, out interface{}) { - s.tbl.CopyValuesSubset(start, out.([]int32)) -} - -// Get returns the index of the requested value in the hash table or KeyNotFound -// along with a boolean indicating if it was found or not. -func (s *Int32MemoTable) Get(val interface{}) (int, bool) { - - h := hashInt(uint64(val.(int32)), 0) - if e, ok := s.tbl.Lookup(h, func(v int32) bool { return val.(int32) == v }); ok { - - return int(e.payload.memoIdx), ok - } - return KeyNotFound, false -} - -// GetOrInsert will return the index of the specified value in the table, or insert the -// value into the table and return the new index. found indicates whether or not it already -// existed in the table (true) or was inserted by this call (false). -func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { - - h := hashInt(uint64(val.(int32)), 0) - e, ok := s.tbl.Lookup(h, func(v int32) bool { - return val.(int32) == v - }) - - if ok { - idx = int(e.payload.memoIdx) - found = true - } else { - idx = s.Size() - s.tbl.Insert(e, h, val.(int32), int32(idx)) - } - return -} - -type payloadInt64 struct { - val int64 - memoIdx int32 -} - -type entryInt64 struct { - h uint64 - payload payloadInt64 -} - -func (e entryInt64) Valid() bool { return e.h != sentinel } - -// Int64HashTable is a hashtable specifically for int64 that -// is utilized with the MemoTable to generalize interactions for easier -// implementation of dictionaries without losing performance. -type Int64HashTable struct { - cap uint64 - capMask uint64 - size uint64 - - entries []entryInt64 -} - -// NewInt64HashTable returns a new hash table for int64 values -// initialized with the passed in capacity or 32 whichever is larger. 
-func NewInt64HashTable(cap uint64) *Int64HashTable { - initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) - ret := &Int64HashTable{cap: initCap, capMask: initCap - 1, size: 0} - ret.entries = make([]entryInt64, initCap) - return ret -} - -// Reset drops all of the values in this hash table and re-initializes it -// with the specified initial capacity as if by calling New, but without having -// to reallocate the object. -func (h *Int64HashTable) Reset(cap uint64) { - h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) - h.capMask = h.cap - 1 - h.size = 0 - h.entries = make([]entryInt64, h.cap) -} - -// CopyValues is used for copying the values out of the hash table into the -// passed in slice, in the order that they were first inserted -func (h *Int64HashTable) CopyValues(out []int64) { - h.CopyValuesSubset(0, out) -} - -// CopyValuesSubset copies a subset of the values in the hashtable out, starting -// with the value at start, in the order that they were inserted. -func (h *Int64HashTable) CopyValuesSubset(start int, out []int64) { - h.VisitEntries(func(e *entryInt64) { - idx := e.payload.memoIdx - int32(start) - if idx >= 0 { - out[idx] = e.payload.val - } - }) -} - -func (h *Int64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } - -func (Int64HashTable) fixHash(v uint64) uint64 { - if v == sentinel { - return 42 - } - return v -} - -// Lookup retrieves the entry for a given hash value assuming it's payload value returns -// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, -// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
-func (h *Int64HashTable) Lookup(v uint64, cmp func(int64) bool) (*entryInt64, bool) { - idx, ok := h.lookup(v, h.capMask, cmp) - return &h.entries[idx], ok -} - -func (h *Int64HashTable) lookup(v uint64, szMask uint64, cmp func(int64) bool) (uint64, bool) { - const perturbShift uint8 = 5 - - var ( - idx uint64 - perturb uint64 - e *entryInt64 - ) - - v = h.fixHash(v) - idx = v & szMask - perturb = (v >> uint64(perturbShift)) + 1 - - for { - e = &h.entries[idx] - if e.h == v && cmp(e.payload.val) { - return idx, true - } - - if e.h == sentinel { - return idx, false - } - - // perturbation logic inspired from CPython's set/dict object - // the goal is that all 64 bits of unmasked hash value eventually - // participate int he probing sequence, to minimize clustering - idx = (idx + perturb) & szMask - perturb = (perturb >> uint64(perturbShift)) + 1 - } -} - -func (h *Int64HashTable) upsize(newcap uint64) error { - newMask := newcap - 1 - - oldEntries := h.entries - h.entries = make([]entryInt64, newcap) - for _, e := range oldEntries { - if e.Valid() { - idx, _ := h.lookup(e.h, newMask, func(int64) bool { return false }) - h.entries[idx] = e - } - } - h.cap = newcap - h.capMask = newMask - return nil -} - -// Insert updates the given entry with the provided hash value, payload value and memo index. -// The entry pointer must have been retrieved via lookup in order to actually insert properly. -func (h *Int64HashTable) Insert(e *entryInt64, v uint64, val int64, memoIdx int32) error { - e.h = h.fixHash(v) - e.payload.val = val - e.payload.memoIdx = memoIdx - h.size++ - - if h.needUpsize() { - h.upsize(h.cap * uint64(loadFactor) * 2) - } - return nil -} - -// VisitEntries will call the passed in function on each *valid* entry in the hash table, -// a valid entry being one which has had a value inserted into it. 
-func (h *Int64HashTable) VisitEntries(visit func(*entryInt64)) { - for _, e := range h.entries { - if e.Valid() { - visit(&e) - } - } -} - -// Int64MemoTable is a wrapper over the appropriate hashtable to provide an interface -// conforming to the MemoTable interface defined in the encoding package for general interactions -// regarding dictionaries. -type Int64MemoTable struct { - tbl *Int64HashTable - nullIdx int32 -} - -// NewInt64MemoTable returns a new memotable with num entries pre-allocated to reduce further -// allocations when inserting. -func NewInt64MemoTable(num int64) *Int64MemoTable { - return &Int64MemoTable{tbl: NewInt64HashTable(uint64(num)), nullIdx: KeyNotFound} -} - -// Reset allows this table to be re-used by dumping all the data currently in the table. -func (s *Int64MemoTable) Reset() { - s.tbl.Reset(32) - s.nullIdx = KeyNotFound -} - -// Size returns the current number of inserted elements into the table including if a null -// has been inserted. -func (s *Int64MemoTable) Size() int { - sz := int(s.tbl.size) - if _, ok := s.GetNull(); ok { - sz++ - } - return sz -} - -// GetNull returns the index of an inserted null or KeyNotFound along with a bool -// that will be true if found and false if not. -func (s *Int64MemoTable) GetNull() (int, bool) { - return int(s.nullIdx), s.nullIdx != KeyNotFound -} - -// GetOrInsertNull will return the index of the null entry or insert a null entry -// if one currently doesn't exist. The found value will be true if there was already -// a null in the table, and false if it inserted one. -func (s *Int64MemoTable) GetOrInsertNull() (idx int, found bool) { - idx, found = s.GetNull() - if !found { - idx = s.Size() - s.nullIdx = int32(idx) - } - return -} - -// CopyValues will copy the values from the memo table out into the passed in slice -// which must be of the appropriate type. 
-func (s *Int64MemoTable) CopyValues(out interface{}) { - s.CopyValuesSubset(0, out) -} - -// CopyValuesSubset is like CopyValues but only copies a subset of values starting -// at the provided start index -func (s *Int64MemoTable) CopyValuesSubset(start int, out interface{}) { - s.tbl.CopyValuesSubset(start, out.([]int64)) -} - -// Get returns the index of the requested value in the hash table or KeyNotFound -// along with a boolean indicating if it was found or not. -func (s *Int64MemoTable) Get(val interface{}) (int, bool) { - - h := hashInt(uint64(val.(int64)), 0) - if e, ok := s.tbl.Lookup(h, func(v int64) bool { return val.(int64) == v }); ok { - - return int(e.payload.memoIdx), ok - } - return KeyNotFound, false -} - -// GetOrInsert will return the index of the specified value in the table, or insert the -// value into the table and return the new index. found indicates whether or not it already -// existed in the table (true) or was inserted by this call (false). -func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { - - h := hashInt(uint64(val.(int64)), 0) - e, ok := s.tbl.Lookup(h, func(v int64) bool { - return val.(int64) == v - }) - - if ok { - idx = int(e.payload.memoIdx) - found = true - } else { - idx = s.Size() - s.tbl.Insert(e, h, val.(int64), int32(idx)) - } - return -} - -type payloadFloat32 struct { - val float32 - memoIdx int32 -} - -type entryFloat32 struct { - h uint64 - payload payloadFloat32 -} - -func (e entryFloat32) Valid() bool { return e.h != sentinel } - -// Float32HashTable is a hashtable specifically for float32 that -// is utilized with the MemoTable to generalize interactions for easier -// implementation of dictionaries without losing performance. -type Float32HashTable struct { - cap uint64 - capMask uint64 - size uint64 - - entries []entryFloat32 -} - -// NewFloat32HashTable returns a new hash table for float32 values -// initialized with the passed in capacity or 32 whichever is larger. 
-func NewFloat32HashTable(cap uint64) *Float32HashTable { - initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) - ret := &Float32HashTable{cap: initCap, capMask: initCap - 1, size: 0} - ret.entries = make([]entryFloat32, initCap) - return ret -} - -// Reset drops all of the values in this hash table and re-initializes it -// with the specified initial capacity as if by calling New, but without having -// to reallocate the object. -func (h *Float32HashTable) Reset(cap uint64) { - h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) - h.capMask = h.cap - 1 - h.size = 0 - h.entries = make([]entryFloat32, h.cap) -} - -// CopyValues is used for copying the values out of the hash table into the -// passed in slice, in the order that they were first inserted -func (h *Float32HashTable) CopyValues(out []float32) { - h.CopyValuesSubset(0, out) -} - -// CopyValuesSubset copies a subset of the values in the hashtable out, starting -// with the value at start, in the order that they were inserted. -func (h *Float32HashTable) CopyValuesSubset(start int, out []float32) { - h.VisitEntries(func(e *entryFloat32) { - idx := e.payload.memoIdx - int32(start) - if idx >= 0 { - out[idx] = e.payload.val - } - }) -} - -func (h *Float32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } - -func (Float32HashTable) fixHash(v uint64) uint64 { - if v == sentinel { - return 42 - } - return v -} - -// Lookup retrieves the entry for a given hash value assuming it's payload value returns -// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, -// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
-func (h *Float32HashTable) Lookup(v uint64, cmp func(float32) bool) (*entryFloat32, bool) { - idx, ok := h.lookup(v, h.capMask, cmp) - return &h.entries[idx], ok -} - -func (h *Float32HashTable) lookup(v uint64, szMask uint64, cmp func(float32) bool) (uint64, bool) { - const perturbShift uint8 = 5 - - var ( - idx uint64 - perturb uint64 - e *entryFloat32 - ) - - v = h.fixHash(v) - idx = v & szMask - perturb = (v >> uint64(perturbShift)) + 1 - - for { - e = &h.entries[idx] - if e.h == v && cmp(e.payload.val) { - return idx, true - } - - if e.h == sentinel { - return idx, false - } - - // perturbation logic inspired from CPython's set/dict object - // the goal is that all 64 bits of unmasked hash value eventually - // participate int he probing sequence, to minimize clustering - idx = (idx + perturb) & szMask - perturb = (perturb >> uint64(perturbShift)) + 1 - } -} - -func (h *Float32HashTable) upsize(newcap uint64) error { - newMask := newcap - 1 - - oldEntries := h.entries - h.entries = make([]entryFloat32, newcap) - for _, e := range oldEntries { - if e.Valid() { - idx, _ := h.lookup(e.h, newMask, func(float32) bool { return false }) - h.entries[idx] = e - } - } - h.cap = newcap - h.capMask = newMask - return nil -} - -// Insert updates the given entry with the provided hash value, payload value and memo index. -// The entry pointer must have been retrieved via lookup in order to actually insert properly. -func (h *Float32HashTable) Insert(e *entryFloat32, v uint64, val float32, memoIdx int32) error { - e.h = h.fixHash(v) - e.payload.val = val - e.payload.memoIdx = memoIdx - h.size++ - - if h.needUpsize() { - h.upsize(h.cap * uint64(loadFactor) * 2) - } - return nil -} - -// VisitEntries will call the passed in function on each *valid* entry in the hash table, -// a valid entry being one which has had a value inserted into it. 
-func (h *Float32HashTable) VisitEntries(visit func(*entryFloat32)) { - for _, e := range h.entries { - if e.Valid() { - visit(&e) - } - } -} - -// Float32MemoTable is a wrapper over the appropriate hashtable to provide an interface -// conforming to the MemoTable interface defined in the encoding package for general interactions -// regarding dictionaries. -type Float32MemoTable struct { - tbl *Float32HashTable - nullIdx int32 -} - -// NewFloat32MemoTable returns a new memotable with num entries pre-allocated to reduce further -// allocations when inserting. -func NewFloat32MemoTable(num int64) *Float32MemoTable { - return &Float32MemoTable{tbl: NewFloat32HashTable(uint64(num)), nullIdx: KeyNotFound} -} - -// Reset allows this table to be re-used by dumping all the data currently in the table. -func (s *Float32MemoTable) Reset() { - s.tbl.Reset(32) - s.nullIdx = KeyNotFound -} - -// Size returns the current number of inserted elements into the table including if a null -// has been inserted. -func (s *Float32MemoTable) Size() int { - sz := int(s.tbl.size) - if _, ok := s.GetNull(); ok { - sz++ - } - return sz -} - -// GetNull returns the index of an inserted null or KeyNotFound along with a bool -// that will be true if found and false if not. -func (s *Float32MemoTable) GetNull() (int, bool) { - return int(s.nullIdx), s.nullIdx != KeyNotFound -} - -// GetOrInsertNull will return the index of the null entry or insert a null entry -// if one currently doesn't exist. The found value will be true if there was already -// a null in the table, and false if it inserted one. -func (s *Float32MemoTable) GetOrInsertNull() (idx int, found bool) { - idx, found = s.GetNull() - if !found { - idx = s.Size() - s.nullIdx = int32(idx) - } - return -} - -// CopyValues will copy the values from the memo table out into the passed in slice -// which must be of the appropriate type. 
-func (s *Float32MemoTable) CopyValues(out interface{}) { - s.CopyValuesSubset(0, out) -} - -// CopyValuesSubset is like CopyValues but only copies a subset of values starting -// at the provided start index -func (s *Float32MemoTable) CopyValuesSubset(start int, out interface{}) { - s.tbl.CopyValuesSubset(start, out.([]float32)) -} - -// Get returns the index of the requested value in the hash table or KeyNotFound -// along with a boolean indicating if it was found or not. -func (s *Float32MemoTable) Get(val interface{}) (int, bool) { - - h := hashFloat32(val.(float32), 0) - var cmp func(float32) bool - - if math.IsNaN(float64(val.(float32))) { - cmp = isNan32Cmp - - } else { - cmp = func(v float32) bool { return val.(float32) == v } - } - if e, ok := s.tbl.Lookup(h, cmp); ok { - - return int(e.payload.memoIdx), ok - } - return KeyNotFound, false -} - -// GetOrInsert will return the index of the specified value in the table, or insert the -// value into the table and return the new index. found indicates whether or not it already -// existed in the table (true) or was inserted by this call (false). 
-func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { - - h := hashFloat32(val.(float32), 0) - var cmp func(float32) bool - - if math.IsNaN(float64(val.(float32))) { - cmp = isNan32Cmp - - } else { - cmp = func(v float32) bool { return val.(float32) == v } - } - e, ok := s.tbl.Lookup(h, cmp) - - if ok { - idx = int(e.payload.memoIdx) - found = true - } else { - idx = s.Size() - s.tbl.Insert(e, h, val.(float32), int32(idx)) - } - return -} - -type payloadFloat64 struct { - val float64 - memoIdx int32 -} - -type entryFloat64 struct { - h uint64 - payload payloadFloat64 -} - -func (e entryFloat64) Valid() bool { return e.h != sentinel } - -// Float64HashTable is a hashtable specifically for float64 that -// is utilized with the MemoTable to generalize interactions for easier -// implementation of dictionaries without losing performance. -type Float64HashTable struct { - cap uint64 - capMask uint64 - size uint64 - - entries []entryFloat64 -} - -// NewFloat64HashTable returns a new hash table for float64 values -// initialized with the passed in capacity or 32 whichever is larger. -func NewFloat64HashTable(cap uint64) *Float64HashTable { - initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) - ret := &Float64HashTable{cap: initCap, capMask: initCap - 1, size: 0} - ret.entries = make([]entryFloat64, initCap) - return ret -} - -// Reset drops all of the values in this hash table and re-initializes it -// with the specified initial capacity as if by calling New, but without having -// to reallocate the object. 
-func (h *Float64HashTable) Reset(cap uint64) { - h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) - h.capMask = h.cap - 1 - h.size = 0 - h.entries = make([]entryFloat64, h.cap) -} - -// CopyValues is used for copying the values out of the hash table into the -// passed in slice, in the order that they were first inserted -func (h *Float64HashTable) CopyValues(out []float64) { - h.CopyValuesSubset(0, out) -} - -// CopyValuesSubset copies a subset of the values in the hashtable out, starting -// with the value at start, in the order that they were inserted. -func (h *Float64HashTable) CopyValuesSubset(start int, out []float64) { - h.VisitEntries(func(e *entryFloat64) { - idx := e.payload.memoIdx - int32(start) - if idx >= 0 { - out[idx] = e.payload.val - } - }) -} - -func (h *Float64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } - -func (Float64HashTable) fixHash(v uint64) uint64 { - if v == sentinel { - return 42 - } - return v -} - -// Lookup retrieves the entry for a given hash value assuming it's payload value returns -// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, -// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
-func (h *Float64HashTable) Lookup(v uint64, cmp func(float64) bool) (*entryFloat64, bool) { - idx, ok := h.lookup(v, h.capMask, cmp) - return &h.entries[idx], ok -} - -func (h *Float64HashTable) lookup(v uint64, szMask uint64, cmp func(float64) bool) (uint64, bool) { - const perturbShift uint8 = 5 - - var ( - idx uint64 - perturb uint64 - e *entryFloat64 - ) - - v = h.fixHash(v) - idx = v & szMask - perturb = (v >> uint64(perturbShift)) + 1 - - for { - e = &h.entries[idx] - if e.h == v && cmp(e.payload.val) { - return idx, true - } - - if e.h == sentinel { - return idx, false - } - - // perturbation logic inspired from CPython's set/dict object - // the goal is that all 64 bits of unmasked hash value eventually - // participate int he probing sequence, to minimize clustering - idx = (idx + perturb) & szMask - perturb = (perturb >> uint64(perturbShift)) + 1 - } -} - -func (h *Float64HashTable) upsize(newcap uint64) error { - newMask := newcap - 1 - - oldEntries := h.entries - h.entries = make([]entryFloat64, newcap) - for _, e := range oldEntries { - if e.Valid() { - idx, _ := h.lookup(e.h, newMask, func(float64) bool { return false }) - h.entries[idx] = e - } - } - h.cap = newcap - h.capMask = newMask - return nil -} - -// Insert updates the given entry with the provided hash value, payload value and memo index. -// The entry pointer must have been retrieved via lookup in order to actually insert properly. -func (h *Float64HashTable) Insert(e *entryFloat64, v uint64, val float64, memoIdx int32) error { - e.h = h.fixHash(v) - e.payload.val = val - e.payload.memoIdx = memoIdx - h.size++ - - if h.needUpsize() { - h.upsize(h.cap * uint64(loadFactor) * 2) - } - return nil -} - -// VisitEntries will call the passed in function on each *valid* entry in the hash table, -// a valid entry being one which has had a value inserted into it. 
-func (h *Float64HashTable) VisitEntries(visit func(*entryFloat64)) { - for _, e := range h.entries { - if e.Valid() { - visit(&e) - } - } -} - -// Float64MemoTable is a wrapper over the appropriate hashtable to provide an interface -// conforming to the MemoTable interface defined in the encoding package for general interactions -// regarding dictionaries. -type Float64MemoTable struct { - tbl *Float64HashTable - nullIdx int32 -} - -// NewFloat64MemoTable returns a new memotable with num entries pre-allocated to reduce further -// allocations when inserting. -func NewFloat64MemoTable(num int64) *Float64MemoTable { - return &Float64MemoTable{tbl: NewFloat64HashTable(uint64(num)), nullIdx: KeyNotFound} -} - -// Reset allows this table to be re-used by dumping all the data currently in the table. -func (s *Float64MemoTable) Reset() { - s.tbl.Reset(32) - s.nullIdx = KeyNotFound -} - -// Size returns the current number of inserted elements into the table including if a null -// has been inserted. -func (s *Float64MemoTable) Size() int { - sz := int(s.tbl.size) - if _, ok := s.GetNull(); ok { - sz++ - } - return sz -} - -// GetNull returns the index of an inserted null or KeyNotFound along with a bool -// that will be true if found and false if not. -func (s *Float64MemoTable) GetNull() (int, bool) { - return int(s.nullIdx), s.nullIdx != KeyNotFound -} - -// GetOrInsertNull will return the index of the null entry or insert a null entry -// if one currently doesn't exist. The found value will be true if there was already -// a null in the table, and false if it inserted one. -func (s *Float64MemoTable) GetOrInsertNull() (idx int, found bool) { - idx, found = s.GetNull() - if !found { - idx = s.Size() - s.nullIdx = int32(idx) - } - return -} - -// CopyValues will copy the values from the memo table out into the passed in slice -// which must be of the appropriate type. 
-func (s *Float64MemoTable) CopyValues(out interface{}) { - s.CopyValuesSubset(0, out) -} - -// CopyValuesSubset is like CopyValues but only copies a subset of values starting -// at the provided start index -func (s *Float64MemoTable) CopyValuesSubset(start int, out interface{}) { - s.tbl.CopyValuesSubset(start, out.([]float64)) -} - -// Get returns the index of the requested value in the hash table or KeyNotFound -// along with a boolean indicating if it was found or not. -func (s *Float64MemoTable) Get(val interface{}) (int, bool) { - - h := hashFloat64(val.(float64), 0) - var cmp func(float64) bool - - if math.IsNaN(val.(float64)) { - cmp = math.IsNaN - - } else { - cmp = func(v float64) bool { return val.(float64) == v } - } - if e, ok := s.tbl.Lookup(h, cmp); ok { - - return int(e.payload.memoIdx), ok - } - return KeyNotFound, false -} - -// GetOrInsert will return the index of the specified value in the table, or insert the -// value into the table and return the new index. found indicates whether or not it already -// existed in the table (true) or was inserted by this call (false). -func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { - - h := hashFloat64(val.(float64), 0) - var cmp func(float64) bool - - if math.IsNaN(val.(float64)) { - cmp = math.IsNaN - - } else { - cmp = func(v float64) bool { return val.(float64) == v } - } - e, ok := s.tbl.Lookup(h, cmp) - - if ok { - idx = int(e.payload.memoIdx) - found = true - } else { - idx = s.Size() - s.tbl.Insert(e, h, val.(float64), int32(idx)) - } - return -} diff --git a/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl b/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl deleted file mode 100644 index f8ec55586ae..00000000000 --- a/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl +++ /dev/null @@ -1,290 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package hashing - -import ( - "github.com/apache/arrow/go/arrow/bitutil" -) - -{{range .In}} -type payload{{.Name}} struct { - val {{.name}} - memoIdx int32 -} - -type entry{{.Name}} struct { - h uint64 - payload payload{{.Name}} -} - -func (e entry{{.Name}}) Valid() bool { return e.h != sentinel } - -// {{.Name}}HashTable is a hashtable specifically for {{.name}} that -// is utilized with the MemoTable to generalize interactions for easier -// implementation of dictionaries without losing performance. -type {{.Name}}HashTable struct { - cap uint64 - capMask uint64 - size uint64 - - entries []entry{{.Name}} -} - -// New{{.Name}}HashTable returns a new hash table for {{.name}} values -// initialized with the passed in capacity or 32 whichever is larger. -func New{{.Name}}HashTable(cap uint64) *{{.Name}}HashTable { - initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) - ret := &{{.Name}}HashTable{cap: initCap, capMask: initCap - 1, size: 0} - ret.entries = make([]entry{{.Name}}, initCap) - return ret -} - -// Reset drops all of the values in this hash table and re-initializes it -// with the specified initial capacity as if by calling New, but without having -// to reallocate the object. 
-func (h *{{.Name}}HashTable) Reset(cap uint64) { - h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) - h.capMask = h.cap - 1 - h.size = 0 - h.entries = make([]entry{{.Name}}, h.cap) -} - -// CopyValues is used for copying the values out of the hash table into the -// passed in slice, in the order that they were first inserted -func (h *{{.Name}}HashTable) CopyValues(out []{{.name}}) { - h.CopyValuesSubset(0, out) -} - -// CopyValuesSubset copies a subset of the values in the hashtable out, starting -// with the value at start, in the order that they were inserted. -func (h *{{.Name}}HashTable) CopyValuesSubset(start int, out []{{.name}}) { - h.VisitEntries(func(e *entry{{.Name}}) { - idx := e.payload.memoIdx - int32(start) - if idx >= 0 { - out[idx] = e.payload.val - } - }) -} - -func (h *{{.Name}}HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } - -func ({{.Name}}HashTable) fixHash(v uint64) uint64 { - if v == sentinel { - return 42 - } - return v -} - -// Lookup retrieves the entry for a given hash value assuming it's payload value returns -// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, -// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
-func (h *{{.Name}}HashTable) Lookup(v uint64, cmp func({{.name}}) bool) (*entry{{.Name}}, bool) { - idx, ok := h.lookup(v, h.capMask, cmp) - return &h.entries[idx], ok -} - -func (h *{{.Name}}HashTable) lookup(v uint64, szMask uint64, cmp func({{.name}}) bool) (uint64, bool) { - const perturbShift uint8 = 5 - - var ( - idx uint64 - perturb uint64 - e *entry{{.Name}} - ) - - v = h.fixHash(v) - idx = v & szMask - perturb = (v >> uint64(perturbShift)) + 1 - - for { - e = &h.entries[idx] - if e.h == v && cmp(e.payload.val) { - return idx, true - } - - if e.h == sentinel { - return idx, false - } - - // perturbation logic inspired from CPython's set/dict object - // the goal is that all 64 bits of unmasked hash value eventually - // participate int he probing sequence, to minimize clustering - idx = (idx + perturb) & szMask - perturb = (perturb >> uint64(perturbShift)) + 1 - } -} - -func (h *{{.Name}}HashTable) upsize(newcap uint64) error { - newMask := newcap - 1 - - oldEntries := h.entries - h.entries = make([]entry{{.Name}}, newcap) - for _, e := range oldEntries { - if e.Valid() { - idx, _ := h.lookup(e.h, newMask, func({{.name}}) bool { return false }) - h.entries[idx] = e - } - } - h.cap = newcap - h.capMask = newMask - return nil -} - -// Insert updates the given entry with the provided hash value, payload value and memo index. -// The entry pointer must have been retrieved via lookup in order to actually insert properly. -func (h *{{.Name}}HashTable) Insert(e *entry{{.Name}}, v uint64, val {{.name}}, memoIdx int32) error { - e.h = h.fixHash(v) - e.payload.val = val - e.payload.memoIdx = memoIdx - h.size++ - - if h.needUpsize() { - h.upsize(h.cap * uint64(loadFactor) * 2) - } - return nil -} - -// VisitEntries will call the passed in function on each *valid* entry in the hash table, -// a valid entry being one which has had a value inserted into it. 
-func (h *{{.Name}}HashTable) VisitEntries(visit func(*entry{{.Name}})) { - for _, e := range h.entries { - if e.Valid() { - visit(&e) - } - } -} - -// {{.Name}}MemoTable is a wrapper over the appropriate hashtable to provide an interface -// conforming to the MemoTable interface defined in the encoding package for general interactions -// regarding dictionaries. -type {{.Name}}MemoTable struct { - tbl *{{.Name}}HashTable - nullIdx int32 -} - -// New{{.Name}}MemoTable returns a new memotable with num entries pre-allocated to reduce further -// allocations when inserting. -func New{{.Name}}MemoTable(num int64) *{{.Name}}MemoTable { - return &{{.Name}}MemoTable{tbl: New{{.Name}}HashTable(uint64(num)), nullIdx: KeyNotFound} -} - -// Reset allows this table to be re-used by dumping all the data currently in the table. -func (s *{{.Name}}MemoTable) Reset() { - s.tbl.Reset(32) - s.nullIdx = KeyNotFound -} - -// Size returns the current number of inserted elements into the table including if a null -// has been inserted. -func (s *{{.Name}}MemoTable) Size() int { - sz := int(s.tbl.size) - if _, ok := s.GetNull(); ok { - sz++ - } - return sz -} - -// GetNull returns the index of an inserted null or KeyNotFound along with a bool -// that will be true if found and false if not. -func (s *{{.Name}}MemoTable) GetNull() (int, bool) { - return int(s.nullIdx), s.nullIdx != KeyNotFound -} - -// GetOrInsertNull will return the index of the null entry or insert a null entry -// if one currently doesn't exist. The found value will be true if there was already -// a null in the table, and false if it inserted one. -func (s *{{.Name}}MemoTable) GetOrInsertNull() (idx int, found bool) { - idx, found = s.GetNull() - if !found { - idx = s.Size() - s.nullIdx = int32(idx) - } - return -} - -// CopyValues will copy the values from the memo table out into the passed in slice -// which must be of the appropriate type. 
-func (s *{{.Name}}MemoTable) CopyValues(out interface{}) { - s.CopyValuesSubset(0, out) -} - -// CopyValuesSubset is like CopyValues but only copies a subset of values starting -// at the provided start index -func (s *{{.Name}}MemoTable) CopyValuesSubset(start int, out interface{}) { - s.tbl.CopyValuesSubset(start, out.([]{{.name}})) -} - -// Get returns the index of the requested value in the hash table or KeyNotFound -// along with a boolean indicating if it was found or not. -func (s *{{.Name}}MemoTable) Get(val interface{}) (int, bool) { -{{if or (eq .Name "Int32") (eq .Name "Int64") }} - h := hashInt(uint64(val.({{.name}})), 0) - if e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { return val.({{.name}}) == v }); ok { -{{ else }} - h := hash{{.Name}}(val.({{.name}}), 0) - var cmp func({{.name}}) bool - {{if eq .Name "Float32"}} - if math.IsNaN(float64(val.(float32))) { - cmp = isNan32Cmp - {{ else }} - if math.IsNaN(val.(float64)) { - cmp = math.IsNaN - {{end}} - } else { - cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } - } - if e, ok := s.tbl.Lookup(h, cmp); ok { -{{ end }} - return int(e.payload.memoIdx), ok - } - return KeyNotFound, false -} - -// GetOrInsert will return the index of the specified value in the table, or insert the -// value into the table and return the new index. found indicates whether or not it already -// existed in the table (true) or was inserted by this call (false). 
-func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { - {{if or (eq .Name "Int32") (eq .Name "Int64") }} - h := hashInt(uint64(val.({{.name}})), 0) - e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { - return val.({{.name}}) == v - }) -{{ else }} - h := hash{{.Name}}(val.({{.name}}), 0) - var cmp func({{.name}}) bool - {{if eq .Name "Float32"}} - if math.IsNaN(float64(val.(float32))) { - cmp = isNan32Cmp - {{ else }} - if math.IsNaN(val.(float64)) { - cmp = math.IsNaN - {{end}} - } else { - cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } - } - e, ok := s.tbl.Lookup(h, cmp) -{{ end }} - if ok { - idx = int(e.payload.memoIdx) - found = true - } else { - idx = s.Size() - s.tbl.Insert(e, h, val.({{.name}}), int32(idx)) - } - return -} -{{end}} diff --git a/go/parquet/internal/hashing/xxh3_memo_table.go b/go/parquet/internal/hashing/xxh3_memo_table.go deleted file mode 100644 index dd1ee6cf58f..00000000000 --- a/go/parquet/internal/hashing/xxh3_memo_table.go +++ /dev/null @@ -1,386 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// Package hashing provides utilities for and an implementation of a hash -// table which is more performant than the default go map implementation -// by leveraging xxh3 and some custom hash functions. -package hashing - -import ( - "bytes" - "math" - "math/bits" - "reflect" - "unsafe" - - "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/array" - "github.com/apache/arrow/go/arrow/memory" - "github.com/apache/arrow/go/parquet" - - "github.com/zeebo/xxh3" -) - -//go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl - -func hashInt(val uint64, alg uint64) uint64 { - // Two of xxhash's prime multipliers (which are chosen for their - // bit dispersion properties) - var multipliers = [2]uint64{11400714785074694791, 14029467366897019727} - // Multiplying by the prime number mixes the low bits into the high bits, - // then byte-swapping (which is a single CPU instruction) allows the - // combined high and low bits to participate in the initial hash table index. 
- return bits.ReverseBytes64(multipliers[alg] * val) -} - -func hashFloat32(val float32, alg uint64) uint64 { - // grab the raw byte pattern of the - bt := *(*[4]byte)(unsafe.Pointer(&val)) - x := uint64(*(*uint32)(unsafe.Pointer(&bt[0]))) - hx := hashInt(x, alg) - hy := hashInt(x, alg^1) - return 4 ^ hx ^ hy -} - -func hashFloat64(val float64, alg uint64) uint64 { - bt := *(*[8]byte)(unsafe.Pointer(&val)) - hx := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[4]))), alg) - hy := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[0]))), alg^1) - return 8 ^ hx ^ hy -} - -func hashString(val string, alg uint64) uint64 { - buf := *(*[]byte)(unsafe.Pointer(&val)) - (*reflect.SliceHeader)(unsafe.Pointer(&buf)).Cap = len(val) - return hash(buf, alg) -} - -// prime constants used for slightly increasing the hash quality further -var exprimes = [2]uint64{1609587929392839161, 9650029242287828579} - -// for smaller amounts of bytes this is faster than even calling into -// xxh3 to do the hash, so we specialize in order to get the benefits -// of that performance. 
-func hash(b []byte, alg uint64) uint64 { - n := uint32(len(b)) - if n <= 16 { - switch { - case n > 8: - // 8 < length <= 16 - // apply same principle as above, but as two 64-bit ints - x := *(*uint64)(unsafe.Pointer(&b[n-8])) - y := *(*uint64)(unsafe.Pointer(&b[0])) - hx := hashInt(x, alg) - hy := hashInt(y, alg^1) - return uint64(n) ^ hx ^ hy - case n >= 4: - // 4 < length <= 8 - // we can read the bytes as two overlapping 32-bit ints, apply different - // hash functions to each in parallel - // then xor the results - x := *(*uint32)(unsafe.Pointer(&b[n-4])) - y := *(*uint32)(unsafe.Pointer(&b[0])) - hx := hashInt(uint64(x), alg) - hy := hashInt(uint64(y), alg^1) - return uint64(n) ^ hx ^ hy - case n > 0: - x := uint32((n << 24) ^ (uint32(b[0]) << 16) ^ (uint32(b[n/2]) << 8) ^ uint32(b[n-1])) - return hashInt(uint64(x), alg) - case n == 0: - return 1 - } - } - - // increase differentiation enough to improve hash quality - return xxh3.Hash(b) + exprimes[alg] -} - -const ( - sentinel uint64 = 0 - loadFactor int64 = 2 -) - -func max(a, b uint64) uint64 { - if a > b { - return a - } - return b -} - -var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) } - -// KeyNotFound is the constant returned by memo table functions when a key isn't found in the table -const KeyNotFound = -1 - -// BinaryMemoTable is our hashtable for binary data using the BinaryBuilder -// to construct the actual data in an easy to pass around way with minimal copies -// while using a hash table to keep track of the indexes into the dictionary that -// is created as we go. -type BinaryMemoTable struct { - tbl *Int32HashTable - builder *array.BinaryBuilder - nullIdx int -} - -// NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will -// be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used. -// initial and valuesize can be used to pre-allocate the table to reduce allocations. 
With -// initial being the initial number of entries to allocate for and valuesize being the starting -// amount of space allocated for writing the actual binary data. -func NewBinaryMemoTable(mem memory.Allocator, initial, valuesize int) *BinaryMemoTable { - if mem == nil { - mem = memory.DefaultAllocator - } - bldr := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) - bldr.Reserve(int(initial)) - datasize := valuesize - if datasize <= 0 { - datasize = initial * 4 - } - bldr.ReserveData(datasize) - return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound} -} - -// Reset dumps all of the data in the table allowing it to be reutilized. -func (s *BinaryMemoTable) Reset() { - s.tbl.Reset(32) - s.builder.NewArray().Release() - s.builder.Reserve(int(32)) - s.builder.ReserveData(int(32) * 4) - s.nullIdx = KeyNotFound -} - -// GetNull returns the index of a null that has been inserted into the table or -// KeyNotFound. The bool returned will be true if there was a null inserted into -// the table, and false otherwise. -func (s *BinaryMemoTable) GetNull() (int, bool) { - return int(s.nullIdx), s.nullIdx != KeyNotFound -} - -// Size returns the current size of the memo table including the null value -// if one has been inserted. -func (s *BinaryMemoTable) Size() int { - sz := int(s.tbl.size) - if _, ok := s.GetNull(); ok { - sz++ - } - return sz -} - -// helper function to easily return a byte slice for any given value -// regardless of the type if it's a []byte, parquet.ByteArray, -// parquet.FixedLenByteArray or string. 
-func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte { - switch v := val.(type) { - case []byte: - return v - case parquet.ByteArray: - return *(*[]byte)(unsafe.Pointer(&v)) - case parquet.FixedLenByteArray: - return *(*[]byte)(unsafe.Pointer(&v)) - case string: - return (*(*[]byte)(unsafe.Pointer(&v)))[:len(v):len(v)] - default: - panic("invalid type for binarymemotable") - } -} - -// helper function to get the hash value regardless of the underlying binary type -func (BinaryMemoTable) getHash(val interface{}) uint64 { - switch v := val.(type) { - case string: - return hashString(v, 0) - case []byte: - return hash(v, 0) - case parquet.ByteArray: - return hash(*(*[]byte)(unsafe.Pointer(&v)), 0) - case parquet.FixedLenByteArray: - return hash(*(*[]byte)(unsafe.Pointer(&v)), 0) - default: - panic("invalid type for binarymemotable") - } -} - -// helper function to append the given value to the builder regardless -// of the underlying binary type. -func (b *BinaryMemoTable) appendVal(val interface{}) { - switch v := val.(type) { - case string: - b.builder.AppendString(v) - case []byte: - b.builder.Append(v) - case parquet.ByteArray: - b.builder.Append(*(*[]byte)(unsafe.Pointer(&v))) - case parquet.FixedLenByteArray: - b.builder.Append(*(*[]byte)(unsafe.Pointer(&v))) - } -} - -func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) { - return b.tbl.Lookup(h, func(i int32) bool { - return bytes.Equal(val, b.builder.Value(int(i))) - }) -} - -// Get returns the index of the specified value in the table or KeyNotFound, -// and a boolean indicating whether it was found in the table. -func (b *BinaryMemoTable) Get(val interface{}) (int, bool) { - if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok { - return int(p.payload.val), ok - } - return KeyNotFound, false -} - -// GetOrInsert returns the index of the given value in the table, if not found -// it is inserted into the table. 
The return value 'found' indicates whether the value -// was found in the table (true) or inserted (false) along with any possible error. -func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { - h := b.getHash(val) - p, found := b.lookup(h, b.valAsByteSlice(val)) - if found { - idx = int(p.payload.val) - } else { - idx = b.Size() - b.appendVal(val) - b.tbl.Insert(p, h, int32(idx), -1) - } - return -} - -// GetOrInsertNull retrieves the index of a null in the table or inserts -// null into the table, returning the index and a boolean indicating if it was -// found in the table (true) or was inserted (false). -func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) { - idx, found = b.GetNull() - if !found { - idx = b.Size() - b.nullIdx = idx - b.builder.AppendNull() - } - return -} - -// helper function to get the offset into the builder data for a given -// index value. -func (b *BinaryMemoTable) findOffset(idx int) uintptr { - val := b.builder.Value(idx) - for len(val) == 0 { - idx++ - if idx >= b.builder.Len() { - break - } - val = b.builder.Value(idx) - } - if len(val) != 0 { - return uintptr(unsafe.Pointer(&val[0])) - } - return uintptr(b.builder.DataLen()) + b.findOffset(0) -} - -// CopyOffsets copies the list of offsets into the passed in slice, the offsets -// being the start and end values of the underlying allocated bytes in the builder -// for the individual values of the table. out should be at least sized to Size()+1 -func (b *BinaryMemoTable) CopyOffsets(out []int8) { - b.CopyOffsetsSubset(0, out) -} - -// CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, -// it gets a subset of the offsets in the table starting at the index provided by "start". 
-func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int8) { - if b.builder.Len() <= start { - return - } - - first := b.findOffset(0) - delta := b.findOffset(start) - for i := start; i < b.Size(); i++ { - offset := int8(b.findOffset(i) - delta) - out[i-start] = offset - } - - out[b.Size()-start] = int8(b.builder.DataLen() - int(delta) - int(first)) -} - -// CopyValues copies the raw binary data bytes out, out should be a []byte -// with at least ValuesSize bytes allocated to copy into. -func (b *BinaryMemoTable) CopyValues(out interface{}) { - b.CopyValuesSubset(0, out) -} - -// CopyValuesSubset copies the raw binary data bytes out starting with the value -// at the index start, out should be a []byte with at least ValuesSize bytes allocated -func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) { - var ( - first = b.findOffset(0) - offset = b.findOffset(int(start)) - length = b.builder.DataLen() - int(offset-first) - ) - - outval := out.([]byte) - copy(outval, b.builder.Value(start)[0:length]) -} - -// CopyFixedWidthValues exists to cope with the fact that the table doesn't keep -// track of the fixed width when inserting the null value the databuffer holds a -// zero length byte slice for the null value (if found) -func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) { - if start >= b.Size() { - return - } - - null, exists := b.GetNull() - if !exists || null < start { - // nothing to skip, proceed as usual - b.CopyValuesSubset(start, out) - return - } - - var ( - leftOffset = b.findOffset(start) - nullOffset = b.findOffset(null) - leftSize = nullOffset - leftOffset - ) - - if leftSize > 0 { - copy(out, b.builder.Value(start)[0:leftSize]) - } - - rightSize := b.ValuesSize() - int(nullOffset) - if rightSize > 0 { - // skip the null fixed size value - copy(out[int(leftSize)+width:], b.builder.Value(int(nullOffset))[0:rightSize]) - } -} - -// VisitValues exists to run the visitFn on each value currently in the hash 
table. -func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) { - for i := int(start); i < b.Size(); i++ { - visitFn(b.builder.Value(i)) - } -} - -// Release is used to tell the underlying builder that it can release the memory allocated -// when the reference count reaches 0, this is safe to be called from multiple goroutines -// simultaneously -func (b *BinaryMemoTable) Release() { b.builder.Release() } - -// Retain increases the ref count, it is safe to call it from multiple goroutines -// simultaneously. -func (b *BinaryMemoTable) Retain() { b.builder.Retain() } - -// ValuesSize returns the current total size of all the raw bytes that have been inserted -// into the memotable so far. -func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() } From 5d54a304154ea5f85a74539f9dc73fac39b18b46 Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Tue, 6 Jul 2021 12:27:38 -0400 Subject: [PATCH 15/17] delta byte array feedback to delay memcpy --- .../internal/encoding/delta_byte_array.go | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/go/parquet/internal/encoding/delta_byte_array.go b/go/parquet/internal/encoding/delta_byte_array.go index 6c4833c58b7..d11413ea236 100644 --- a/go/parquet/internal/encoding/delta_byte_array.go +++ b/go/parquet/internal/encoding/delta_byte_array.go @@ -62,7 +62,7 @@ func (enc *DeltaByteArrayEncoder) Put(in []parquet.ByteArray) { enc.initEncoders() enc.prefixEncoder.Put([]int32{0}) suf = in[0] - enc.lastVal = append([]byte(nil), in[0]...) + enc.lastVal = in[0] enc.suffixEncoder.Put([]parquet.ByteArray{suf}) in = in[1:] } @@ -82,8 +82,14 @@ func (enc *DeltaByteArrayEncoder) Put(in []parquet.ByteArray) { enc.prefixEncoder.Put([]int32{int32(j)}) suf = val[j:] enc.suffixEncoder.Put([]parquet.ByteArray{suf}) - enc.lastVal = append([]byte(nil), val...) 
+ enc.lastVal = val } + + // do the memcpy after the loops to keep a copy of the lastVal + // we do a copy here so that we only copy and keep a reference + // to the suffix, and aren't forcing the *entire* value to stay + // in memory while we have this reference to just the suffix. + enc.lastVal = append([]byte{}, enc.lastVal...) } // PutSpaced is like Put, but assumes the data is already spaced for nulls and uses the bitmap provided and offset @@ -177,15 +183,19 @@ func (d *DeltaByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { for len(out) > 0 { prefixLen, d.prefixLengths = d.prefixLengths[0], d.prefixLengths[1:] - prefix := d.lastVal[:prefixLen] + prefix := d.lastVal[:prefixLen:prefixLen] _, err = d.DeltaLengthByteArrayDecoder.Decode(suffixHolder) if err != nil { return 0, err } - d.lastVal = make([]byte, 0, int(prefixLen)+len(suffixHolder[0])) - d.lastVal = append([]byte{}, prefix...) - d.lastVal = append(d.lastVal, suffixHolder[0]...) + if len(suffixHolder[0]) == 0 { + d.lastVal = prefix + } else { + d.lastVal = make([]byte, int(prefixLen)+len(suffixHolder[0])) + copy(d.lastVal, prefix) + copy(d.lastVal[prefixLen:], suffixHolder[0]) + } out[0], out = d.lastVal, out[1:] } return max, nil From da6a92be69e2ddd16cb4911d5b4928f82e4c907b Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Tue, 6 Jul 2021 12:28:03 -0400 Subject: [PATCH 16/17] add capacity to output bytearrays to prevent expanding the slices into other data. 
--- go/parquet/internal/encoding/delta_length_byte_array.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/parquet/internal/encoding/delta_length_byte_array.go b/go/parquet/internal/encoding/delta_length_byte_array.go index 65083919093..3563ccec461 100644 --- a/go/parquet/internal/encoding/delta_length_byte_array.go +++ b/go/parquet/internal/encoding/delta_length_byte_array.go @@ -124,7 +124,7 @@ func (d *DeltaLengthByteArrayDecoder) SetData(nvalues int, data []byte) error { func (d *DeltaLengthByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { max := utils.MinInt(len(out), d.nvals) for i := 0; i < max; i++ { - out[i] = d.data[:d.lengths[i]] + out[i] = d.data[:d.lengths[i]:d.lengths[i]] d.data = d.data[d.lengths[i]:] } d.nvals -= max From fba1f2398f1d3947efa44b705a883ca4a92c78ef Mon Sep 17 00:00:00 2001 From: Matthew Topol Date: Tue, 6 Jul 2021 12:28:23 -0400 Subject: [PATCH 17/17] properly handle bigendian in addition to little endian for plain encoding --- .../encoding/plain_encoder_types.gen.go | 106 ++++++++++++++++-- .../encoding/plain_encoder_types.gen.go.tmpl | 54 ++++++++- 2 files changed, 148 insertions(+), 12 deletions(-) diff --git a/go/parquet/internal/encoding/plain_encoder_types.gen.go b/go/parquet/internal/encoding/plain_encoder_types.gen.go index c48268dcca3..a3826339dfa 100644 --- a/go/parquet/internal/encoding/plain_encoder_types.gen.go +++ b/go/parquet/internal/encoding/plain_encoder_types.gen.go @@ -19,14 +19,100 @@ package encoding import ( + "bytes" + "encoding/binary" "math" "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/endian" "github.com/apache/arrow/go/parquet" "github.com/apache/arrow/go/parquet/internal/utils" "golang.org/x/xerrors" ) +var ( + writeInt32LE func(*encoder, []int32) + copyFromInt32LE func(dst []int32, src []byte) + writeInt64LE func(*encoder, []int64) + copyFromInt64LE func(dst []int64, src []byte) + writeInt96LE func(*encoder, []parquet.Int96) + copyFromInt96LE 
func(dst []parquet.Int96, src []byte) + writeFloat32LE func(*encoder, []float32) + copyFromFloat32LE func(dst []float32, src []byte) + writeFloat64LE func(*encoder, []float64) + copyFromFloat64LE func(dst []float64, src []byte) +) + +func init() { + // int96 is already internally represented as little endian data + // no need to have special behavior on big endian architectures + // for read/write, consumers will need to be aware of the fact + // that it is internally 12 bytes little endian when attempting + // to utilize it. + writeInt96LE = func(e *encoder, in []parquet.Int96) { + e.append(parquet.Int96Traits.CastToBytes(in)) + } + copyFromInt96LE = func(dst []parquet.Int96, src []byte) { + copy(parquet.Int96Traits.CastToBytes(dst), src) + } + + if endian.IsBigEndian { + writeInt32LE = func(e *encoder, in []int32) { + binary.Write(e.sink, binary.LittleEndian, in) + } + copyFromInt32LE = func(dst []int32, src []byte) { + r := bytes.NewReader(src) + binary.Read(r, binary.LittleEndian, &dst) + } + writeInt64LE = func(e *encoder, in []int64) { + binary.Write(e.sink, binary.LittleEndian, in) + } + copyFromInt64LE = func(dst []int64, src []byte) { + r := bytes.NewReader(src) + binary.Read(r, binary.LittleEndian, &dst) + } + writeFloat32LE = func(e *encoder, in []float32) { + binary.Write(e.sink, binary.LittleEndian, in) + } + copyFromFloat32LE = func(dst []float32, src []byte) { + r := bytes.NewReader(src) + binary.Read(r, binary.LittleEndian, &dst) + } + writeFloat64LE = func(e *encoder, in []float64) { + binary.Write(e.sink, binary.LittleEndian, in) + } + copyFromFloat64LE = func(dst []float64, src []byte) { + r := bytes.NewReader(src) + binary.Read(r, binary.LittleEndian, &dst) + } + } else { + writeInt32LE = func(e *encoder, in []int32) { + e.append(arrow.Int32Traits.CastToBytes(in)) + } + copyFromInt32LE = func(dst []int32, src []byte) { + copy(arrow.Int32Traits.CastToBytes(dst), src) + } + writeInt64LE = func(e *encoder, in []int64) { + 
e.append(arrow.Int64Traits.CastToBytes(in)) + } + copyFromInt64LE = func(dst []int64, src []byte) { + copy(arrow.Int64Traits.CastToBytes(dst), src) + } + writeFloat32LE = func(e *encoder, in []float32) { + e.append(arrow.Float32Traits.CastToBytes(in)) + } + copyFromFloat32LE = func(dst []float32, src []byte) { + copy(arrow.Float32Traits.CastToBytes(dst), src) + } + writeFloat64LE = func(e *encoder, in []float64) { + e.append(arrow.Float64Traits.CastToBytes(in)) + } + copyFromFloat64LE = func(dst []float64, src []byte) { + copy(arrow.Float64Traits.CastToBytes(dst), src) + } + } +} + // PlainInt32Encoder is an encoder for int32 values using Plain Encoding // which in general is just storing the values as raw bytes of the appropriate size type PlainInt32Encoder struct { @@ -37,7 +123,7 @@ type PlainInt32Encoder struct { // Put encodes a slice of values into the underlying buffer func (enc *PlainInt32Encoder) Put(in []int32) { - enc.append(arrow.Int32Traits.CastToBytes(in)) + writeInt32LE(&enc.encoder, in) } // PutSpaced encodes a slice of values into the underlying buffer which are spaced out @@ -90,7 +176,7 @@ func (dec *PlainInt32Decoder) Decode(out []int32) (int, error) { return 0, xerrors.Errorf("parquet: eof exception decode plain Int32, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) } - copy(arrow.Int32Traits.CastToBytes(out), dec.data[:nbytes]) + copyFromInt32LE(out, dec.data[:nbytes]) dec.data = dec.data[nbytes:] dec.nvals -= max return max, nil @@ -142,7 +228,7 @@ type PlainInt64Encoder struct { // Put encodes a slice of values into the underlying buffer func (enc *PlainInt64Encoder) Put(in []int64) { - enc.append(arrow.Int64Traits.CastToBytes(in)) + writeInt64LE(&enc.encoder, in) } // PutSpaced encodes a slice of values into the underlying buffer which are spaced out @@ -195,7 +281,7 @@ func (dec *PlainInt64Decoder) Decode(out []int64) (int, error) { return 0, xerrors.Errorf("parquet: eof exception decode plain Int64, nvals: %d, 
nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) } - copy(arrow.Int64Traits.CastToBytes(out), dec.data[:nbytes]) + copyFromInt64LE(out, dec.data[:nbytes]) dec.data = dec.data[nbytes:] dec.nvals -= max return max, nil @@ -247,7 +333,7 @@ type PlainInt96Encoder struct { // Put encodes a slice of values into the underlying buffer func (enc *PlainInt96Encoder) Put(in []parquet.Int96) { - enc.append(parquet.Int96Traits.CastToBytes(in)) + writeInt96LE(&enc.encoder, in) } // PutSpaced encodes a slice of values into the underlying buffer which are spaced out @@ -300,7 +386,7 @@ func (dec *PlainInt96Decoder) Decode(out []parquet.Int96) (int, error) { return 0, xerrors.Errorf("parquet: eof exception decode plain Int96, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) } - copy(parquet.Int96Traits.CastToBytes(out), dec.data[:nbytes]) + copyFromInt96LE(out, dec.data[:nbytes]) dec.data = dec.data[nbytes:] dec.nvals -= max return max, nil @@ -352,7 +438,7 @@ type PlainFloat32Encoder struct { // Put encodes a slice of values into the underlying buffer func (enc *PlainFloat32Encoder) Put(in []float32) { - enc.append(arrow.Float32Traits.CastToBytes(in)) + writeFloat32LE(&enc.encoder, in) } // PutSpaced encodes a slice of values into the underlying buffer which are spaced out @@ -405,7 +491,7 @@ func (dec *PlainFloat32Decoder) Decode(out []float32) (int, error) { return 0, xerrors.Errorf("parquet: eof exception decode plain Float32, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) } - copy(arrow.Float32Traits.CastToBytes(out), dec.data[:nbytes]) + copyFromFloat32LE(out, dec.data[:nbytes]) dec.data = dec.data[nbytes:] dec.nvals -= max return max, nil @@ -457,7 +543,7 @@ type PlainFloat64Encoder struct { // Put encodes a slice of values into the underlying buffer func (enc *PlainFloat64Encoder) Put(in []float64) { - enc.append(arrow.Float64Traits.CastToBytes(in)) + writeFloat64LE(&enc.encoder, in) } // PutSpaced encodes a slice of 
values into the underlying buffer which are spaced out @@ -510,7 +596,7 @@ func (dec *PlainFloat64Decoder) Decode(out []float64) (int, error) { return 0, xerrors.Errorf("parquet: eof exception decode plain Float64, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) } - copy(arrow.Float64Traits.CastToBytes(out), dec.data[:nbytes]) + copyFromFloat64LE(out, dec.data[:nbytes]) dec.data = dec.data[nbytes:] dec.nvals -= max return max, nil diff --git a/go/parquet/internal/encoding/plain_encoder_types.gen.go.tmpl b/go/parquet/internal/encoding/plain_encoder_types.gen.go.tmpl index 86e04e4e637..1b72497444c 100644 --- a/go/parquet/internal/encoding/plain_encoder_types.gen.go.tmpl +++ b/go/parquet/internal/encoding/plain_encoder_types.gen.go.tmpl @@ -17,11 +17,61 @@ package encoding import ( + "encoding/binary" + "github.com/apache/arrow/go/arrow" "github.com/apache/arrow/go/parquet" "github.com/apache/arrow/go/parquet/internal/utils" ) +var ( +{{range .In}} +{{if and (ne .Name "Boolean") (ne .Name "ByteArray") (ne .Name "FixedLenByteArray") -}} + write{{.Name}}LE func(*encoder, []{{.name}}) + copyFrom{{.Name}}LE func(dst []{{.name}}, src []byte) +{{- end}} +{{- end}} +) + +func init() { + // int96 is already internally represented as little endian data + // no need to have special behavior on big endian architectures + // for read/write, consumers will need to be aware of the fact + // that it is internally 12 bytes little endian when attempting + // to utilize it. 
+ writeInt96LE = func(e *encoder, in []parquet.Int96) { + e.append(parquet.Int96Traits.CastToBytes(in)) + } + copyFromInt96LE = func(dst []parquet.Int96, src []byte) { + copy(parquet.Int96Traits.CastToBytes(dst), src) + } + + if endian.IsBigEndian { +{{- range .In}} +{{- if and (ne .Name "Boolean") (ne .Name "ByteArray") (ne .Name "FixedLenByteArray") (ne .Name "Int96")}} + write{{.Name}}LE = func(e *encoder, in []{{.name}}) { + binary.Write(e.sink, binary.LittleEndian, in) + } + copyFrom{{.Name}}LE = func(dst []{{.name}}, src []byte) { + r := bytes.NewReader(src) + binary.Read(r, binary.LittleEndian, &dst) + } +{{- end -}} +{{- end}} + } else { +{{- range .In}} +{{- if and (ne .Name "Boolean") (ne .Name "ByteArray") (ne .Name "FixedLenByteArray") (ne .Name "Int96")}} + write{{.Name}}LE = func(e *encoder, in []{{.name}}) { + e.append({{.prefix}}.{{.Name}}Traits.CastToBytes(in)) + } + copyFrom{{.Name}}LE = func(dst []{{.name}}, src []byte) { + copy({{.prefix}}.{{.Name}}Traits.CastToBytes(dst), src) + } +{{- end -}} +{{- end}} + } +} + {{range .In}} {{if and (ne .Name "Boolean") (ne .Name "ByteArray") (ne .Name "FixedLenByteArray")}} // Plain{{.Name}}Encoder is an encoder for {{.name}} values using Plain Encoding @@ -34,7 +84,7 @@ type Plain{{.Name}}Encoder struct { // Put encodes a slice of values into the underlying buffer func (enc *Plain{{.Name}}Encoder) Put(in []{{.name}}) { - enc.append({{.prefix}}.{{.Name}}Traits.CastToBytes(in)) + write{{.Name}}LE(&enc.encoder, in) } // PutSpaced encodes a slice of values into the underlying buffer which are spaced out @@ -87,7 +137,7 @@ func (dec *Plain{{.Name}}Decoder) Decode(out []{{.name}}) (int, error) { return 0, xerrors.Errorf("parquet: eof exception decode plain {{.Name}}, nvals: %d, nbytes: %d, datalen: %d", dec.nvals, nbytes, len(dec.data)) } - copy({{.prefix}}.{{.Name}}Traits.CastToBytes(out), dec.data[:nbytes]) + copyFrom{{.Name}}LE(out, dec.data[:nbytes]) dec.data = dec.data[nbytes:] dec.nvals -= max return max, 
nil