diff --git a/ci/scripts/go_test.sh b/ci/scripts/go_test.sh index e31fa555642..54b05c3cc2b 100755 --- a/ci/scripts/go_test.sh +++ b/ci/scripts/go_test.sh @@ -61,7 +61,7 @@ pushd ${source_dir}/arrow TAGS="assert,test" if [[ -n "${ARROW_GO_TESTCGO}" ]]; then if [[ "${MSYSTEM}" = "MINGW64" ]]; then - export PATH=${MINGW_PREFIX}/bin:$PATH + export PATH=${MINGW_PREFIX}\\bin:${MINGW_PREFIX}\\lib:$PATH fi TAGS="${TAGS},ccalloc" fi diff --git a/go/arrow/bitutil/_lib/bitmap_ops.c b/go/arrow/bitutil/_lib/bitmap_ops.c index 96817b2f2b5..f48b4d4d821 100644 --- a/go/arrow/bitutil/_lib/bitmap_ops.c +++ b/go/arrow/bitutil/_lib/bitmap_ops.c @@ -31,4 +31,16 @@ void FULL_NAME(bitmap_aligned_or)(const uint8_t* left, const uint8_t* right, uin for (int64_t i = 0; i < nbytes; ++i) { out[i] = left[i] | right[i]; } -} \ No newline at end of file +} + +void FULL_NAME(bitmap_aligned_and_not)(const uint8_t* left, const uint8_t* right, uint8_t* out, const int64_t nbytes) { + for (int64_t i = 0; i < nbytes; ++i) { + out[i] = left[i] & ~right[i]; + } +} + +void FULL_NAME(bitmap_aligned_xor)(const uint8_t* left, const uint8_t* right, uint8_t* out, const int64_t nbytes) { + for (int64_t i = 0; i < nbytes; ++i) { + out[i] = left[i] ^ right[i]; + } +} diff --git a/go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s b/go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s index 69f69d29708..a4010dab55b 100644 --- a/go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s +++ b/go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s @@ -207,6 +207,204 @@ bitmap_aligned_or_avx2: # @bitmap_aligned_or_avx2 .Lfunc_end1: .size bitmap_aligned_or_avx2, .Lfunc_end1-bitmap_aligned_or_avx2 # -- End function + .globl bitmap_aligned_and_not_avx2 # -- Begin function bitmap_aligned_and_not_avx2 + .p2align 4, 0x90 + .type bitmap_aligned_and_not_avx2,@function +bitmap_aligned_and_not_avx2: # @bitmap_aligned_and_not_avx2 +# %bb.0: + push rbp + mov rbp, rsp + push rbx + and rsp, -8 + test rcx, rcx + jle .LBB2_12 +# %bb.1: + cmp rcx, 127 + ja .LBB2_7 +# %bb.2: + xor r8d, r8d + jmp .LBB2_3 +.LBB2_7: + lea r8, [rdx + rcx] + lea rax, [rdi + rcx] + cmp rax, rdx + seta r11b + lea rax, [rsi + rcx] + cmp r8, rdi + seta bl + cmp rax, rdx + seta r10b + cmp r8, rsi + seta r9b + xor r8d, r8d + test r11b, bl + jne .LBB2_3 +# %bb.8: + and r10b, r9b + jne .LBB2_3 +# %bb.9: + mov r8, rcx + and r8, -128 + xor eax, eax + .p2align 4, 0x90 +.LBB2_10: # =>This Inner Loop Header: Depth=1 + vmovups ymm0, ymmword ptr [rsi + rax] + vmovups ymm1, ymmword ptr [rsi + rax + 32] + vmovups ymm2, ymmword ptr [rsi + rax + 64] + vmovups ymm3, ymmword ptr [rsi + rax + 96] + vandnps ymm0, ymm0, ymmword ptr [rdi + rax] + vandnps ymm1, ymm1, ymmword ptr [rdi + rax + 32] + vandnps ymm2, ymm2, ymmword ptr [rdi + rax + 64] + vandnps ymm3, ymm3, ymmword ptr [rdi + rax + 96] + vmovups ymmword ptr [rdx + rax], ymm0 + vmovups ymmword ptr [rdx + rax + 32], ymm1 + vmovups ymmword ptr [rdx + rax + 64], ymm2 + vmovups ymmword ptr [rdx + rax + 96], ymm3 + sub rax, -128 + cmp r8, rax + jne .LBB2_10 +# %bb.11: + cmp r8, rcx + je .LBB2_12 +.LBB2_3: + mov r9, r8 + not r9 + test cl, 1 + je .LBB2_5 +# %bb.4: + mov al, byte ptr [rsi + r8] + not al + and al, byte ptr [rdi + r8] + mov byte ptr [rdx + r8], al + or r8, 1 +.LBB2_5: + add r9, rcx + je .LBB2_12 + .p2align 4, 0x90 +.LBB2_6: # =>This Inner Loop Header: Depth=1 + movzx eax, byte ptr [rsi + r8] + not al + and al, byte ptr [rdi + r8] + mov byte ptr [rdx + r8], al + movzx eax, byte ptr [rsi + r8 + 1] + not al + and al, byte ptr [rdi + r8 + 1] + mov byte ptr [rdx + r8 + 1], al + 
add r8, 2 + cmp rcx, r8 + jne .LBB2_6 +.LBB2_12: + lea rsp, [rbp - 8] + pop rbx + pop rbp + vzeroupper + ret +.Lfunc_end2: + .size bitmap_aligned_and_not_avx2, .Lfunc_end2-bitmap_aligned_and_not_avx2 + # -- End function + .globl bitmap_aligned_xor_avx2 # -- Begin function bitmap_aligned_xor_avx2 + .p2align 4, 0x90 + .type bitmap_aligned_xor_avx2,@function +bitmap_aligned_xor_avx2: # @bitmap_aligned_xor_avx2 +# %bb.0: + push rbp + mov rbp, rsp + push rbx + and rsp, -8 + test rcx, rcx + jle .LBB3_12 +# %bb.1: + cmp rcx, 127 + ja .LBB3_7 +# %bb.2: + xor r10d, r10d + jmp .LBB3_3 +.LBB3_7: + lea r9, [rdx + rcx] + lea rax, [rdi + rcx] + cmp rax, rdx + seta r11b + lea rax, [rsi + rcx] + cmp r9, rdi + seta bl + cmp rax, rdx + seta r8b + cmp r9, rsi + seta r9b + xor r10d, r10d + test r11b, bl + jne .LBB3_3 +# %bb.8: + and r8b, r9b + jne .LBB3_3 +# %bb.9: + mov r10, rcx + and r10, -128 + xor r8d, r8d + .p2align 4, 0x90 +.LBB3_10: # =>This Inner Loop Header: Depth=1 + vmovups ymm0, ymmword ptr [rsi + r8] + vmovups ymm1, ymmword ptr [rsi + r8 + 32] + vmovups ymm2, ymmword ptr [rsi + r8 + 64] + vmovups ymm3, ymmword ptr [rsi + r8 + 96] + vxorps ymm0, ymm0, ymmword ptr [rdi + r8] + vxorps ymm1, ymm1, ymmword ptr [rdi + r8 + 32] + vxorps ymm2, ymm2, ymmword ptr [rdi + r8 + 64] + vxorps ymm3, ymm3, ymmword ptr [rdi + r8 + 96] + vmovups ymmword ptr [rdx + r8], ymm0 + vmovups ymmword ptr [rdx + r8 + 32], ymm1 + vmovups ymmword ptr [rdx + r8 + 64], ymm2 + vmovups ymmword ptr [rdx + r8 + 96], ymm3 + sub r8, -128 + cmp r10, r8 + jne .LBB3_10 +# %bb.11: + cmp r10, rcx + je .LBB3_12 +.LBB3_3: + mov r8, r10 + not r8 + add r8, rcx + mov r9, rcx + and r9, 3 + je .LBB3_5 + .p2align 4, 0x90 +.LBB3_4: # =>This Inner Loop Header: Depth=1 + movzx eax, byte ptr [rsi + r10] + xor al, byte ptr [rdi + r10] + mov byte ptr [rdx + r10], al + add r10, 1 + add r9, -1 + jne .LBB3_4 +.LBB3_5: + cmp r8, 3 + jb .LBB3_12 + .p2align 4, 0x90 +.LBB3_6: # =>This Inner Loop Header: Depth=1 + movzx eax, byte ptr [rsi + r10] + xor al, byte ptr [rdi + r10] + mov byte ptr [rdx + r10], al + movzx eax, byte ptr [rsi + r10 + 1] + xor al, byte ptr [rdi + r10 + 1] + mov byte ptr [rdx + r10 + 1], al + movzx eax, byte ptr [rsi + r10 + 2] + xor al, byte ptr [rdi + r10 + 2] + mov byte ptr [rdx + r10 + 2], al + movzx eax, byte ptr [rsi + r10 + 3] + xor al, byte ptr [rdi + r10 + 3] + mov byte ptr [rdx + r10 + 3], al + add r10, 4 + cmp rcx, r10 + jne .LBB3_6 +.LBB3_12: + lea rsp, [rbp - 8] + pop rbx + pop rbp + vzeroupper + ret +.Lfunc_end3: + .size bitmap_aligned_xor_avx2, .Lfunc_end3-bitmap_aligned_xor_avx2 + # -- End function .ident "Ubuntu clang version 11.1.0-6" .section ".note.GNU-stack","",@progbits .addrsig diff --git a/go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s b/go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s index 9d028155b72..840c1a623bb 100644 --- a/go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s +++ b/go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s @@ -267,6 +267,264 @@ bitmap_aligned_or_sse4: # @bitmap_aligned_or_sse4 .Lfunc_end1: .size bitmap_aligned_or_sse4, .Lfunc_end1-bitmap_aligned_or_sse4 # -- End function + .globl bitmap_aligned_and_not_sse4 # -- Begin function bitmap_aligned_and_not_sse4 + .p2align 4, 0x90 + .type bitmap_aligned_and_not_sse4,@function +bitmap_aligned_and_not_sse4: # @bitmap_aligned_and_not_sse4 +# %bb.0: + push rbp + mov rbp, rsp + push rbx + and rsp, -8 + test rcx, rcx + jle .LBB2_16 +# %bb.1: + cmp rcx, 31 + ja .LBB2_7 +# %bb.2: + xor r11d, r11d +.LBB2_3: + mov r8, r11 + not r8 + test cl, 1 + je .LBB2_5 +# %bb.4: + 
mov al, byte ptr [rsi + r11] + not al + and al, byte ptr [rdi + r11] + mov byte ptr [rdx + r11], al + or r11, 1 +.LBB2_5: + add r8, rcx + je .LBB2_16 + .p2align 4, 0x90 +.LBB2_6: # =>This Inner Loop Header: Depth=1 + movzx eax, byte ptr [rsi + r11] + not al + and al, byte ptr [rdi + r11] + mov byte ptr [rdx + r11], al + movzx eax, byte ptr [rsi + r11 + 1] + not al + and al, byte ptr [rdi + r11 + 1] + mov byte ptr [rdx + r11 + 1], al + add r11, 2 + cmp rcx, r11 + jne .LBB2_6 + jmp .LBB2_16 +.LBB2_7: + lea r9, [rdx + rcx] + lea rax, [rdi + rcx] + cmp rax, rdx + seta r10b + lea rax, [rsi + rcx] + cmp r9, rdi + seta bl + cmp rax, rdx + seta r8b + cmp r9, rsi + seta r9b + xor r11d, r11d + test r10b, bl + jne .LBB2_3 +# %bb.8: + and r8b, r9b + jne .LBB2_3 +# %bb.9: + mov r11, rcx + and r11, -32 + lea rax, [r11 - 32] + mov r9, rax + shr r9, 5 + add r9, 1 + test rax, rax + je .LBB2_10 +# %bb.11: + mov r10, r9 + and r10, -2 + neg r10 + xor r8d, r8d + .p2align 4, 0x90 +.LBB2_12: # =>This Inner Loop Header: Depth=1 + movups xmm0, xmmword ptr [rdi + r8] + movups xmm1, xmmword ptr [rdi + r8 + 16] + movups xmm2, xmmword ptr [rsi + r8] + andnps xmm2, xmm0 + movups xmm0, xmmword ptr [rsi + r8 + 16] + andnps xmm0, xmm1 + movups xmmword ptr [rdx + r8], xmm2 + movups xmmword ptr [rdx + r8 + 16], xmm0 + movups xmm0, xmmword ptr [rdi + r8 + 32] + movups xmm1, xmmword ptr [rdi + r8 + 48] + movups xmm2, xmmword ptr [rsi + r8 + 32] + andnps xmm2, xmm0 + movups xmm0, xmmword ptr [rsi + r8 + 48] + andnps xmm0, xmm1 + movups xmmword ptr [rdx + r8 + 32], xmm2 + movups xmmword ptr [rdx + r8 + 48], xmm0 + add r8, 64 + add r10, 2 + jne .LBB2_12 +# %bb.13: + test r9b, 1 + je .LBB2_15 +.LBB2_14: + movups xmm0, xmmword ptr [rdi + r8] + movups xmm1, xmmword ptr [rdi + r8 + 16] + movups xmm2, xmmword ptr [rsi + r8] + andnps xmm2, xmm0 + movups xmm0, xmmword ptr [rsi + r8 + 16] + andnps xmm0, xmm1 + movups xmmword ptr [rdx + r8], xmm2 + movups xmmword ptr [rdx + r8 + 16], xmm0 +.LBB2_15: + cmp r11, rcx + jne .LBB2_3 +.LBB2_16: + lea rsp, [rbp - 8] + pop rbx + pop rbp + ret +.LBB2_10: + xor r8d, r8d + test r9b, 1 + jne .LBB2_14 + jmp .LBB2_15 +.Lfunc_end2: + .size bitmap_aligned_and_not_sse4, .Lfunc_end2-bitmap_aligned_and_not_sse4 + # -- End function + .globl bitmap_aligned_xor_sse4 # -- Begin function bitmap_aligned_xor_sse4 + .p2align 4, 0x90 + .type bitmap_aligned_xor_sse4,@function +bitmap_aligned_xor_sse4: # @bitmap_aligned_xor_sse4 +# %bb.0: + push rbp + mov rbp, rsp + push rbx + and rsp, -8 + test rcx, rcx + jle .LBB3_16 +# %bb.1: + cmp rcx, 31 + ja .LBB3_7 +# %bb.2: + xor r11d, r11d +.LBB3_3: + mov r8, r11 + not r8 + add r8, rcx + mov r9, rcx + and r9, 3 + je .LBB3_5 + .p2align 4, 0x90 +.LBB3_4: # =>This Inner Loop Header: Depth=1 + movzx eax, byte ptr [rsi + r11] + xor al, byte ptr [rdi + r11] + mov byte ptr [rdx + r11], al + add r11, 1 + add r9, -1 + jne .LBB3_4 +.LBB3_5: + cmp r8, 3 + jb .LBB3_16 + .p2align 4, 0x90 +.LBB3_6: # =>This Inner Loop Header: Depth=1 + movzx eax, byte ptr [rsi + r11] + xor al, byte ptr [rdi + r11] + mov byte ptr [rdx + r11], al + movzx eax, byte ptr [rsi + r11 + 1] + xor al, byte ptr [rdi + r11 + 1] + mov byte ptr [rdx + r11 + 1], al + movzx eax, byte ptr [rsi + r11 + 2] + xor al, byte ptr [rdi + r11 + 2] + mov byte ptr [rdx + r11 + 2], al + movzx eax, byte ptr [rsi + r11 + 3] + xor al, byte ptr [rdi + r11 + 3] + mov byte ptr [rdx + r11 + 3], al + add r11, 4 + cmp rcx, r11 + jne .LBB3_6 + jmp .LBB3_16 +.LBB3_7: + lea r9, [rdx + rcx] + lea rax, [rdi + rcx] + cmp rax, rdx + seta r10b + lea 
rax, [rsi + rcx] + cmp r9, rdi + seta bl + cmp rax, rdx + seta r8b + cmp r9, rsi + seta r9b + xor r11d, r11d + test r10b, bl + jne .LBB3_3 +# %bb.8: + and r8b, r9b + jne .LBB3_3 +# %bb.9: + mov r11, rcx + and r11, -32 + lea rax, [r11 - 32] + mov r9, rax + shr r9, 5 + add r9, 1 + test rax, rax + je .LBB3_10 +# %bb.11: + mov r10, r9 + and r10, -2 + neg r10 + xor r8d, r8d + .p2align 4, 0x90 +.LBB3_12: # =>This Inner Loop Header: Depth=1 + movups xmm0, xmmword ptr [rdi + r8] + movups xmm1, xmmword ptr [rdi + r8 + 16] + movups xmm2, xmmword ptr [rsi + r8] + xorps xmm2, xmm0 + movups xmm0, xmmword ptr [rsi + r8 + 16] + xorps xmm0, xmm1 + movups xmmword ptr [rdx + r8], xmm2 + movups xmmword ptr [rdx + r8 + 16], xmm0 + movups xmm0, xmmword ptr [rdi + r8 + 32] + movups xmm1, xmmword ptr [rdi + r8 + 48] + movups xmm2, xmmword ptr [rsi + r8 + 32] + xorps xmm2, xmm0 + movups xmm0, xmmword ptr [rsi + r8 + 48] + xorps xmm0, xmm1 + movups xmmword ptr [rdx + r8 + 32], xmm2 + movups xmmword ptr [rdx + r8 + 48], xmm0 + add r8, 64 + add r10, 2 + jne .LBB3_12 +# %bb.13: + test r9b, 1 + je .LBB3_15 +.LBB3_14: + movups xmm0, xmmword ptr [rdi + r8] + movups xmm1, xmmword ptr [rdi + r8 + 16] + movups xmm2, xmmword ptr [rsi + r8] + xorps xmm2, xmm0 + movups xmm0, xmmword ptr [rsi + r8 + 16] + xorps xmm0, xmm1 + movups xmmword ptr [rdx + r8], xmm2 + movups xmmword ptr [rdx + r8 + 16], xmm0 +.LBB3_15: + cmp r11, rcx + jne .LBB3_3 +.LBB3_16: + lea rsp, [rbp - 8] + pop rbx + pop rbp + ret +.LBB3_10: + xor r8d, r8d + test r9b, 1 + jne .LBB3_14 + jmp .LBB3_15 +.Lfunc_end3: + .size bitmap_aligned_xor_sse4, .Lfunc_end3-bitmap_aligned_xor_sse4 + # -- End function .ident "Ubuntu clang version 11.1.0-6" .section ".note.GNU-stack","",@progbits .addrsig diff --git a/go/arrow/bitutil/bitmap_ops.go b/go/arrow/bitutil/bitmap_ops.go index 62322b04b9d..7db750a6dd9 100644 --- a/go/arrow/bitutil/bitmap_ops.go +++ b/go/arrow/bitutil/bitmap_ops.go @@ -39,6 +39,29 @@ func alignedBitAndGo(left, right, out []byte) { } } +func alignedBitAndNotGo(left, right, out []byte) { + var ( + nbytes = len(out) + i = 0 + ) + if nbytes > uint64SizeBytes { + // case where we have enough bytes to operate on words + leftWords := bytesToUint64(left[i:]) + rightWords := bytesToUint64(right[i:]) + outWords := bytesToUint64(out[i:]) + + for w := range outWords { + outWords[w] = leftWords[w] &^ rightWords[w] + } + + i += len(outWords) * uint64SizeBytes + } + // grab any remaining bytes that were fewer than a word + for ; i < nbytes; i++ { + out[i] = left[i] &^ right[i] + } +} + func alignedBitOrGo(left, right, out []byte) { var ( nbytes = len(out) @@ -61,3 +84,26 @@ func alignedBitOrGo(left, right, out []byte) { out[i] = left[i] | right[i] } } + +func alignedBitXorGo(left, right, out []byte) { + var ( + nbytes = len(out) + i = 0 + ) + if nbytes > uint64SizeBytes { + // case where we have enough bytes to operate on words + leftWords := bytesToUint64(left[i:]) + rightWords := bytesToUint64(right[i:]) + outWords := bytesToUint64(out[i:]) + + for w := range outWords { + outWords[w] = leftWords[w] ^ rightWords[w] + } + + i += len(outWords) * uint64SizeBytes + } + // grab any remaining bytes that were fewer than a word + for ; i < nbytes; i++ { + out[i] = left[i] ^ right[i] + } +} diff --git a/go/arrow/bitutil/bitmap_ops_amd64.go b/go/arrow/bitutil/bitmap_ops_amd64.go index 9aa5a6dd56b..ad0fd674ab9 100644 --- a/go/arrow/bitutil/bitmap_ops_amd64.go +++ b/go/arrow/bitutil/bitmap_ops_amd64.go @@ -25,11 +25,17 @@ func init() { if cpu.X86.HasAVX2 { bitAndOp.opAligned = 
bitmapAlignedAndAVX2 bitOrOp.opAligned = bitmapAlignedOrAVX2 + bitAndNotOp.opAligned = bitmapAlignedAndNotAVX2 + bitXorOp.opAligned = bitmapAlignedXorAVX2 } else if cpu.X86.HasSSE42 { bitAndOp.opAligned = bitmapAlignedAndSSE4 bitOrOp.opAligned = bitmapAlignedOrSSE4 + bitAndNotOp.opAligned = bitmapAlignedAndNotSSE4 + bitXorOp.opAligned = bitmapAlignedXorSSE4 } else { bitAndOp.opAligned = alignedBitAndGo bitOrOp.opAligned = alignedBitOrGo + bitAndNotOp.opAligned = alignedBitAndNotGo + bitXorOp.opAligned = alignedBitXorGo } } diff --git a/go/arrow/bitutil/bitmap_ops_arm64.go b/go/arrow/bitutil/bitmap_ops_arm64.go index 86c47639a9e..28d95d84ade 100644 --- a/go/arrow/bitutil/bitmap_ops_arm64.go +++ b/go/arrow/bitutil/bitmap_ops_arm64.go @@ -22,4 +22,6 @@ package bitutil func init() { bitAndOp.opAligned = alignedBitAndGo bitOrOp.opAligned = alignedBitOrGo + bitAndNotOp.opAligned = alignedBitAndNotGo + bitXorOp.opAligned = alignedBitXorGo } diff --git a/go/arrow/bitutil/bitmap_ops_avx2_amd64.go b/go/arrow/bitutil/bitmap_ops_avx2_amd64.go index 731b9807b79..1c01bd0f380 100644 --- a/go/arrow/bitutil/bitmap_ops_avx2_amd64.go +++ b/go/arrow/bitutil/bitmap_ops_avx2_amd64.go @@ -36,3 +36,17 @@ func _bitmap_aligned_or_avx2(left, right, out unsafe.Pointer, length int64) func bitmapAlignedOrAVX2(left, right, out []byte) { _bitmap_aligned_or_avx2(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out))) } + +//go:noescape +func _bitmap_aligned_and_not_avx2(left, right, out unsafe.Pointer, length int64) + +func bitmapAlignedAndNotAVX2(left, right, out []byte) { + _bitmap_aligned_and_not_avx2(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out))) +} + +//go:noescape +func _bitmap_aligned_xor_avx2(left, right, out unsafe.Pointer, length int64) + +func bitmapAlignedXorAVX2(left, right, out []byte) { + _bitmap_aligned_xor_avx2(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out))) +} diff --git a/go/arrow/bitutil/bitmap_ops_avx2_amd64.s b/go/arrow/bitutil/bitmap_ops_avx2_amd64.s index 2e2ade89617..00172e86592 100644 --- a/go/arrow/bitutil/bitmap_ops_avx2_amd64.s +++ b/go/arrow/bitutil/bitmap_ops_avx2_amd64.s @@ -190,3 +190,184 @@ LBB1_6: LBB1_12: VZEROUPPER RET + +TEXT ·_bitmap_aligned_and_not_avx2(SB), $0-32 + + MOVQ left+0(FP), DI + MOVQ right+8(FP), SI + MOVQ out+16(FP), DX + MOVQ length+24(FP), CX + + WORD $0x8548; BYTE $0xc9 // test rcx, rcx + JLE LBB2_12 + LONG $0x7ff98348 // cmp rcx, 127 + JA LBB2_7 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + JMP LBB2_3 + +LBB2_7: + LONG $0x0a048d4c // lea r8, [rdx + rcx] + LONG $0x0f048d48 // lea rax, [rdi + rcx] + WORD $0x3948; BYTE $0xd0 // cmp rax, rdx + LONG $0xd3970f41 // seta r11b + LONG $0x0e048d48 // lea rax, [rsi + rcx] + WORD $0x3949; BYTE $0xf8 // cmp r8, rdi + WORD $0x970f; BYTE $0xd3 // seta bl + WORD $0x3948; BYTE $0xd0 // cmp rax, rdx + LONG $0xd2970f41 // seta r10b + WORD $0x3949; BYTE $0xf0 // cmp r8, rsi + LONG $0xd1970f41 // seta r9b + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + WORD $0x8441; BYTE $0xdb // test r11b, bl + JNE LBB2_3 + WORD $0x2045; BYTE $0xca // and r10b, r9b + JNE LBB2_3 + WORD $0x8949; BYTE $0xc8 // mov r8, rcx + LONG $0x80e08349 // and r8, -128 + WORD $0xc031 // xor eax, eax + +LBB2_10: + LONG $0x0410fcc5; BYTE $0x06 // vmovups ymm0, yword [rsi + rax] + LONG $0x4c10fcc5; WORD $0x2006 // vmovups ymm1, yword [rsi + rax + 32] + LONG $0x5410fcc5; WORD $0x4006 // vmovups ymm2, yword [rsi + rax + 64] + LONG $0x5c10fcc5; 
WORD $0x6006 // vmovups ymm3, yword [rsi + rax + 96] + LONG $0x0455fcc5; BYTE $0x07 // vandnps ymm0, ymm0, yword [rdi + rax] + LONG $0x4c55f4c5; WORD $0x2007 // vandnps ymm1, ymm1, yword [rdi + rax + 32] + LONG $0x5455ecc5; WORD $0x4007 // vandnps ymm2, ymm2, yword [rdi + rax + 64] + LONG $0x5c55e4c5; WORD $0x6007 // vandnps ymm3, ymm3, yword [rdi + rax + 96] + LONG $0x0411fcc5; BYTE $0x02 // vmovups yword [rdx + rax], ymm0 + LONG $0x4c11fcc5; WORD $0x2002 // vmovups yword [rdx + rax + 32], ymm1 + LONG $0x5411fcc5; WORD $0x4002 // vmovups yword [rdx + rax + 64], ymm2 + LONG $0x5c11fcc5; WORD $0x6002 // vmovups yword [rdx + rax + 96], ymm3 + LONG $0x80e88348 // sub rax, -128 + WORD $0x3949; BYTE $0xc0 // cmp r8, rax + JNE LBB2_10 + WORD $0x3949; BYTE $0xc8 // cmp r8, rcx + JE LBB2_12 + +LBB2_3: + WORD $0x894d; BYTE $0xc1 // mov r9, r8 + WORD $0xf749; BYTE $0xd1 // not r9 + WORD $0xc1f6; BYTE $0x01 // test cl, 1 + JE LBB2_5 + LONG $0x06048a42 // mov al, byte [rsi + r8] + WORD $0xd0f6 // not al + LONG $0x07042242 // and al, byte [rdi + r8] + LONG $0x02048842 // mov byte [rdx + r8], al + LONG $0x01c88349 // or r8, 1 + +LBB2_5: + WORD $0x0149; BYTE $0xc9 // add r9, rcx + JE LBB2_12 + +LBB2_6: + LONG $0x04b60f42; BYTE $0x06 // movzx eax, byte [rsi + r8] + WORD $0xd0f6 // not al + LONG $0x07042242 // and al, byte [rdi + r8] + LONG $0x02048842 // mov byte [rdx + r8], al + LONG $0x44b60f42; WORD $0x0106 // movzx eax, byte [rsi + r8 + 1] + WORD $0xd0f6 // not al + LONG $0x07442242; BYTE $0x01 // and al, byte [rdi + r8 + 1] + LONG $0x02448842; BYTE $0x01 // mov byte [rdx + r8 + 1], al + LONG $0x02c08349 // add r8, 2 + WORD $0x394c; BYTE $0xc1 // cmp rcx, r8 + JNE LBB2_6 + +LBB2_12: + VZEROUPPER + RET + +TEXT ·_bitmap_aligned_xor_avx2(SB), $0-32 + + MOVQ left+0(FP), DI + MOVQ right+8(FP), SI + MOVQ out+16(FP), DX + MOVQ length+24(FP), CX + + WORD $0x8548; BYTE $0xc9 // test rcx, rcx + JLE LBB3_12 + LONG $0x7ff98348 // cmp rcx, 127 + JA LBB3_7 + WORD $0x3145; BYTE $0xd2 // xor r10d, r10d + JMP LBB3_3 + +LBB3_7: + LONG $0x0a0c8d4c // lea r9, [rdx + rcx] + LONG $0x0f048d48 // lea rax, [rdi + rcx] + WORD $0x3948; BYTE $0xd0 // cmp rax, rdx + LONG $0xd3970f41 // seta r11b + LONG $0x0e048d48 // lea rax, [rsi + rcx] + WORD $0x3949; BYTE $0xf9 // cmp r9, rdi + WORD $0x970f; BYTE $0xd3 // seta bl + WORD $0x3948; BYTE $0xd0 // cmp rax, rdx + LONG $0xd0970f41 // seta r8b + WORD $0x3949; BYTE $0xf1 // cmp r9, rsi + LONG $0xd1970f41 // seta r9b + WORD $0x3145; BYTE $0xd2 // xor r10d, r10d + WORD $0x8441; BYTE $0xdb // test r11b, bl + JNE LBB3_3 + WORD $0x2045; BYTE $0xc8 // and r8b, r9b + JNE LBB3_3 + WORD $0x8949; BYTE $0xca // mov r10, rcx + LONG $0x80e28349 // and r10, -128 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB3_10: + LONG $0x107ca1c4; WORD $0x0604 // vmovups ymm0, yword [rsi + r8] + LONG $0x107ca1c4; WORD $0x064c; BYTE $0x20 // vmovups ymm1, yword [rsi + r8 + 32] + LONG $0x107ca1c4; WORD $0x0654; BYTE $0x40 // vmovups ymm2, yword [rsi + r8 + 64] + LONG $0x107ca1c4; WORD $0x065c; BYTE $0x60 // vmovups ymm3, yword [rsi + r8 + 96] + LONG $0x577ca1c4; WORD $0x0704 // vxorps ymm0, ymm0, yword [rdi + r8] + LONG $0x5774a1c4; WORD $0x074c; BYTE $0x20 // vxorps ymm1, ymm1, yword [rdi + r8 + 32] + LONG $0x576ca1c4; WORD $0x0754; BYTE $0x40 // vxorps ymm2, ymm2, yword [rdi + r8 + 64] + LONG $0x5764a1c4; WORD $0x075c; BYTE $0x60 // vxorps ymm3, ymm3, yword [rdi + r8 + 96] + LONG $0x117ca1c4; WORD $0x0204 // vmovups yword [rdx + r8], ymm0 + LONG $0x117ca1c4; WORD $0x024c; BYTE $0x20 // vmovups yword [rdx + r8 + 
32], ymm1 + LONG $0x117ca1c4; WORD $0x0254; BYTE $0x40 // vmovups yword [rdx + r8 + 64], ymm2 + LONG $0x117ca1c4; WORD $0x025c; BYTE $0x60 // vmovups yword [rdx + r8 + 96], ymm3 + LONG $0x80e88349 // sub r8, -128 + WORD $0x394d; BYTE $0xc2 // cmp r10, r8 + JNE LBB3_10 + WORD $0x3949; BYTE $0xca // cmp r10, rcx + JE LBB3_12 + +LBB3_3: + WORD $0x894d; BYTE $0xd0 // mov r8, r10 + WORD $0xf749; BYTE $0xd0 // not r8 + WORD $0x0149; BYTE $0xc8 // add r8, rcx + WORD $0x8949; BYTE $0xc9 // mov r9, rcx + LONG $0x03e18349 // and r9, 3 + JE LBB3_5 + +LBB3_4: + LONG $0x04b60f42; BYTE $0x16 // movzx eax, byte [rsi + r10] + LONG $0x17043242 // xor al, byte [rdi + r10] + LONG $0x12048842 // mov byte [rdx + r10], al + LONG $0x01c28349 // add r10, 1 + LONG $0xffc18349 // add r9, -1 + JNE LBB3_4 + +LBB3_5: + LONG $0x03f88349 // cmp r8, 3 + JB LBB3_12 + +LBB3_6: + LONG $0x04b60f42; BYTE $0x16 // movzx eax, byte [rsi + r10] + LONG $0x17043242 // xor al, byte [rdi + r10] + LONG $0x12048842 // mov byte [rdx + r10], al + LONG $0x44b60f42; WORD $0x0116 // movzx eax, byte [rsi + r10 + 1] + LONG $0x17443242; BYTE $0x01 // xor al, byte [rdi + r10 + 1] + LONG $0x12448842; BYTE $0x01 // mov byte [rdx + r10 + 1], al + LONG $0x44b60f42; WORD $0x0216 // movzx eax, byte [rsi + r10 + 2] + LONG $0x17443242; BYTE $0x02 // xor al, byte [rdi + r10 + 2] + LONG $0x12448842; BYTE $0x02 // mov byte [rdx + r10 + 2], al + LONG $0x44b60f42; WORD $0x0316 // movzx eax, byte [rsi + r10 + 3] + LONG $0x17443242; BYTE $0x03 // xor al, byte [rdi + r10 + 3] + LONG $0x12448842; BYTE $0x03 // mov byte [rdx + r10 + 3], al + LONG $0x04c28349 // add r10, 4 + WORD $0x394c; BYTE $0xd1 // cmp rcx, r10 + JNE LBB3_6 + +LBB3_12: + VZEROUPPER + RET diff --git a/go/arrow/bitutil/bitmap_ops_noasm.go b/go/arrow/bitutil/bitmap_ops_noasm.go index 785531c1c23..e25347791fe 100644 --- a/go/arrow/bitutil/bitmap_ops_noasm.go +++ b/go/arrow/bitutil/bitmap_ops_noasm.go @@ -22,4 +22,6 @@ package bitutil func init() { bitAndOp.opAligned = alignedBitAndGo bitOrOp.opAligned = alignedBitOrGo + bitAndNotOp.opAligned = alignedBitAndNotGo + bitXorOp.opAligned = alignedBitXorGo } diff --git a/go/arrow/bitutil/bitmap_ops_ppc64le.go b/go/arrow/bitutil/bitmap_ops_ppc64le.go index 86c47639a9e..28d95d84ade 100644 --- a/go/arrow/bitutil/bitmap_ops_ppc64le.go +++ b/go/arrow/bitutil/bitmap_ops_ppc64le.go @@ -22,4 +22,6 @@ package bitutil func init() { bitAndOp.opAligned = alignedBitAndGo bitOrOp.opAligned = alignedBitOrGo + bitAndNotOp.opAligned = alignedBitAndNotGo + bitXorOp.opAligned = alignedBitXorGo } diff --git a/go/arrow/bitutil/bitmap_ops_s390x.go b/go/arrow/bitutil/bitmap_ops_s390x.go index 86c47639a9e..28d95d84ade 100644 --- a/go/arrow/bitutil/bitmap_ops_s390x.go +++ b/go/arrow/bitutil/bitmap_ops_s390x.go @@ -22,4 +22,6 @@ package bitutil func init() { bitAndOp.opAligned = alignedBitAndGo bitOrOp.opAligned = alignedBitOrGo + bitAndNotOp.opAligned = alignedBitAndNotGo + bitXorOp.opAligned = alignedBitXorGo } diff --git a/go/arrow/bitutil/bitmap_ops_sse4_amd64.go b/go/arrow/bitutil/bitmap_ops_sse4_amd64.go index 5d1fcf96829..f16bce12bbf 100644 --- a/go/arrow/bitutil/bitmap_ops_sse4_amd64.go +++ b/go/arrow/bitutil/bitmap_ops_sse4_amd64.go @@ -36,3 +36,17 @@ func _bitmap_aligned_or_sse4(left, right, out unsafe.Pointer, length int64) func bitmapAlignedOrSSE4(left, right, out []byte) { _bitmap_aligned_or_sse4(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out))) } + +//go:noescape +func _bitmap_aligned_and_not_sse4(left, right, out 
unsafe.Pointer, length int64) + +func bitmapAlignedAndNotSSE4(left, right, out []byte) { + _bitmap_aligned_and_not_sse4(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out))) +} + +//go:noescape +func _bitmap_aligned_xor_sse4(left, right, out unsafe.Pointer, length int64) + +func bitmapAlignedXorSSE4(left, right, out []byte) { + _bitmap_aligned_xor_sse4(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out))) +} diff --git a/go/arrow/bitutil/bitmap_ops_sse4_amd64.s b/go/arrow/bitutil/bitmap_ops_sse4_amd64.s index ad81cf63720..c15e186253a 100644 --- a/go/arrow/bitutil/bitmap_ops_sse4_amd64.s +++ b/go/arrow/bitutil/bitmap_ops_sse4_amd64.s @@ -254,3 +254,248 @@ LBB1_10: LONG $0x01c1f641 // test r9b, 1 JNE LBB1_14 JMP LBB1_15 + +TEXT ·_bitmap_aligned_and_not_sse4(SB), $0-32 + + MOVQ left+0(FP), DI + MOVQ right+8(FP), SI + MOVQ out+16(FP), DX + MOVQ length+24(FP), CX + + WORD $0x8548; BYTE $0xc9 // test rcx, rcx + JLE LBB2_16 + LONG $0x1ff98348 // cmp rcx, 31 + JA LBB2_7 + WORD $0x3145; BYTE $0xdb // xor r11d, r11d + +LBB2_3: + WORD $0x894d; BYTE $0xd8 // mov r8, r11 + WORD $0xf749; BYTE $0xd0 // not r8 + WORD $0xc1f6; BYTE $0x01 // test cl, 1 + JE LBB2_5 + LONG $0x1e048a42 // mov al, byte [rsi + r11] + WORD $0xd0f6 // not al + LONG $0x1f042242 // and al, byte [rdi + r11] + LONG $0x1a048842 // mov byte [rdx + r11], al + LONG $0x01cb8349 // or r11, 1 + +LBB2_5: + WORD $0x0149; BYTE $0xc8 // add r8, rcx + JE LBB2_16 + +LBB2_6: + LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11] + WORD $0xd0f6 // not al + LONG $0x1f042242 // and al, byte [rdi + r11] + LONG $0x1a048842 // mov byte [rdx + r11], al + LONG $0x44b60f42; WORD $0x011e // movzx eax, byte [rsi + r11 + 1] + WORD $0xd0f6 // not al + LONG $0x1f442242; BYTE $0x01 // and al, byte [rdi + r11 + 1] + LONG $0x1a448842; BYTE $0x01 // mov byte [rdx + r11 + 1], al + LONG $0x02c38349 // add r11, 2 + WORD $0x394c; BYTE $0xd9 // cmp rcx, r11 + JNE LBB2_6 + JMP LBB2_16 + +LBB2_7: + LONG $0x0a0c8d4c // lea r9, [rdx + rcx] + LONG $0x0f048d48 // lea rax, [rdi + rcx] + WORD $0x3948; BYTE $0xd0 // cmp rax, rdx + LONG $0xd2970f41 // seta r10b + LONG $0x0e048d48 // lea rax, [rsi + rcx] + WORD $0x3949; BYTE $0xf9 // cmp r9, rdi + WORD $0x970f; BYTE $0xd3 // seta bl + WORD $0x3948; BYTE $0xd0 // cmp rax, rdx + LONG $0xd0970f41 // seta r8b + WORD $0x3949; BYTE $0xf1 // cmp r9, rsi + LONG $0xd1970f41 // seta r9b + WORD $0x3145; BYTE $0xdb // xor r11d, r11d + WORD $0x8441; BYTE $0xda // test r10b, bl + JNE LBB2_3 + WORD $0x2045; BYTE $0xc8 // and r8b, r9b + JNE LBB2_3 + WORD $0x8949; BYTE $0xcb // mov r11, rcx + LONG $0xe0e38349 // and r11, -32 + LONG $0xe0438d49 // lea rax, [r11 - 32] + WORD $0x8949; BYTE $0xc1 // mov r9, rax + LONG $0x05e9c149 // shr r9, 5 + LONG $0x01c18349 // add r9, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB2_10 + WORD $0x894d; BYTE $0xca // mov r10, r9 + LONG $0xfee28349 // and r10, -2 + WORD $0xf749; BYTE $0xda // neg r10 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB2_12: + LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8] + LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16] + LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8] + WORD $0x550f; BYTE $0xd0 // andnps xmm2, xmm0 + LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16] + WORD $0x550f; BYTE $0xc1 // andnps xmm0, xmm1 + LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2 + LONG $0x44110f42; WORD $0x1002 // movups oword 
[rdx + r8 + 16], xmm0 + LONG $0x44100f42; WORD $0x2007 // movups xmm0, oword [rdi + r8 + 32] + LONG $0x4c100f42; WORD $0x3007 // movups xmm1, oword [rdi + r8 + 48] + LONG $0x54100f42; WORD $0x2006 // movups xmm2, oword [rsi + r8 + 32] + WORD $0x550f; BYTE $0xd0 // andnps xmm2, xmm0 + LONG $0x44100f42; WORD $0x3006 // movups xmm0, oword [rsi + r8 + 48] + WORD $0x550f; BYTE $0xc1 // andnps xmm0, xmm1 + LONG $0x54110f42; WORD $0x2002 // movups oword [rdx + r8 + 32], xmm2 + LONG $0x44110f42; WORD $0x3002 // movups oword [rdx + r8 + 48], xmm0 + LONG $0x40c08349 // add r8, 64 + LONG $0x02c28349 // add r10, 2 + JNE LBB2_12 + LONG $0x01c1f641 // test r9b, 1 + JE LBB2_15 + +LBB2_14: + LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8] + LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16] + LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8] + WORD $0x550f; BYTE $0xd0 // andnps xmm2, xmm0 + LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16] + WORD $0x550f; BYTE $0xc1 // andnps xmm0, xmm1 + LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2 + LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0 + +LBB2_15: + WORD $0x3949; BYTE $0xcb // cmp r11, rcx + JNE LBB2_3 + +LBB2_16: + RET + +LBB2_10: + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + LONG $0x01c1f641 // test r9b, 1 + JNE LBB2_14 + JMP LBB2_15 + +TEXT ·_bitmap_aligned_xor_sse4(SB), $0-32 + + MOVQ left+0(FP), DI + MOVQ right+8(FP), SI + MOVQ out+16(FP), DX + MOVQ length+24(FP), CX + + WORD $0x8548; BYTE $0xc9 // test rcx, rcx + JLE LBB3_16 + LONG $0x1ff98348 // cmp rcx, 31 + JA LBB3_7 + WORD $0x3145; BYTE $0xdb // xor r11d, r11d + +LBB3_3: + WORD $0x894d; BYTE $0xd8 // mov r8, r11 + WORD $0xf749; BYTE $0xd0 // not r8 + WORD $0x0149; BYTE $0xc8 // add r8, rcx + WORD $0x8949; BYTE $0xc9 // mov r9, rcx + LONG $0x03e18349 // and r9, 3 + JE LBB3_5 + +LBB3_4: + LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11] + LONG $0x1f043242 // xor al, byte [rdi + r11] + LONG $0x1a048842 // mov byte [rdx + r11], al + LONG $0x01c38349 // add r11, 1 + LONG $0xffc18349 // add r9, -1 + JNE LBB3_4 + +LBB3_5: + LONG $0x03f88349 // cmp r8, 3 + JB LBB3_16 + +LBB3_6: + LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11] + LONG $0x1f043242 // xor al, byte [rdi + r11] + LONG $0x1a048842 // mov byte [rdx + r11], al + LONG $0x44b60f42; WORD $0x011e // movzx eax, byte [rsi + r11 + 1] + LONG $0x1f443242; BYTE $0x01 // xor al, byte [rdi + r11 + 1] + LONG $0x1a448842; BYTE $0x01 // mov byte [rdx + r11 + 1], al + LONG $0x44b60f42; WORD $0x021e // movzx eax, byte [rsi + r11 + 2] + LONG $0x1f443242; BYTE $0x02 // xor al, byte [rdi + r11 + 2] + LONG $0x1a448842; BYTE $0x02 // mov byte [rdx + r11 + 2], al + LONG $0x44b60f42; WORD $0x031e // movzx eax, byte [rsi + r11 + 3] + LONG $0x1f443242; BYTE $0x03 // xor al, byte [rdi + r11 + 3] + LONG $0x1a448842; BYTE $0x03 // mov byte [rdx + r11 + 3], al + LONG $0x04c38349 // add r11, 4 + WORD $0x394c; BYTE $0xd9 // cmp rcx, r11 + JNE LBB3_6 + JMP LBB3_16 + +LBB3_7: + LONG $0x0a0c8d4c // lea r9, [rdx + rcx] + LONG $0x0f048d48 // lea rax, [rdi + rcx] + WORD $0x3948; BYTE $0xd0 // cmp rax, rdx + LONG $0xd2970f41 // seta r10b + LONG $0x0e048d48 // lea rax, [rsi + rcx] + WORD $0x3949; BYTE $0xf9 // cmp r9, rdi + WORD $0x970f; BYTE $0xd3 // seta bl + WORD $0x3948; BYTE $0xd0 // cmp rax, rdx + LONG $0xd0970f41 // seta r8b + WORD $0x3949; BYTE $0xf1 // cmp r9, rsi + LONG $0xd1970f41 // seta r9b + WORD $0x3145; BYTE $0xdb // xor r11d, r11d + WORD $0x8441; 
BYTE $0xda // test r10b, bl + JNE LBB3_3 + WORD $0x2045; BYTE $0xc8 // and r8b, r9b + JNE LBB3_3 + WORD $0x8949; BYTE $0xcb // mov r11, rcx + LONG $0xe0e38349 // and r11, -32 + LONG $0xe0438d49 // lea rax, [r11 - 32] + WORD $0x8949; BYTE $0xc1 // mov r9, rax + LONG $0x05e9c149 // shr r9, 5 + LONG $0x01c18349 // add r9, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB3_10 + WORD $0x894d; BYTE $0xca // mov r10, r9 + LONG $0xfee28349 // and r10, -2 + WORD $0xf749; BYTE $0xda // neg r10 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB3_12: + LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8] + LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16] + LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8] + WORD $0x570f; BYTE $0xd0 // xorps xmm2, xmm0 + LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16] + WORD $0x570f; BYTE $0xc1 // xorps xmm0, xmm1 + LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2 + LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0 + LONG $0x44100f42; WORD $0x2007 // movups xmm0, oword [rdi + r8 + 32] + LONG $0x4c100f42; WORD $0x3007 // movups xmm1, oword [rdi + r8 + 48] + LONG $0x54100f42; WORD $0x2006 // movups xmm2, oword [rsi + r8 + 32] + WORD $0x570f; BYTE $0xd0 // xorps xmm2, xmm0 + LONG $0x44100f42; WORD $0x3006 // movups xmm0, oword [rsi + r8 + 48] + WORD $0x570f; BYTE $0xc1 // xorps xmm0, xmm1 + LONG $0x54110f42; WORD $0x2002 // movups oword [rdx + r8 + 32], xmm2 + LONG $0x44110f42; WORD $0x3002 // movups oword [rdx + r8 + 48], xmm0 + LONG $0x40c08349 // add r8, 64 + LONG $0x02c28349 // add r10, 2 + JNE LBB3_12 + LONG $0x01c1f641 // test r9b, 1 + JE LBB3_15 + +LBB3_14: + LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8] + LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16] + LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8] + WORD $0x570f; BYTE $0xd0 // xorps xmm2, xmm0 + LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16] + WORD $0x570f; BYTE $0xc1 // xorps xmm0, xmm1 + LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2 + LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0 + +LBB3_15: + WORD $0x3949; BYTE $0xcb // cmp r11, rcx + JNE LBB3_3 + +LBB3_16: + RET + +LBB3_10: + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + LONG $0x01c1f641 // test r9b, 1 + JNE LBB3_14 + JMP LBB3_15 diff --git a/go/arrow/bitutil/bitmaps.go b/go/arrow/bitutil/bitmaps.go index abd1b188a74..c23a1232921 100644 --- a/go/arrow/bitutil/bitmaps.go +++ b/go/arrow/bitutil/bitmaps.go @@ -18,6 +18,7 @@ package bitutil import ( "bytes" + "errors" "math/bits" "unsafe" @@ -374,9 +375,14 @@ func (bm *BitmapWordWriter) PutNextTrailingByte(b byte, validBits int) { } } -// CopyBitmap copies the bitmap indicated by src, starting at bit offset srcOffset, -// and copying length bits into dst, starting at bit offset dstOffset. -func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) { +type transferMode int8 + +const ( + transferCopy transferMode = iota + transferInvert +) + +func transferBitmap(mode transferMode, src []byte, srcOffset, length int, dst []byte, dstOffset int) { if length == 0 { // if there's nothing to write, end early. 
return @@ -393,12 +399,19 @@ func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) { nwords := rdr.Words() for nwords > 0 { nwords-- - wr.PutNextWord(rdr.NextWord()) + if mode == transferInvert { + wr.PutNextWord(^rdr.NextWord()) + } else { + wr.PutNextWord(rdr.NextWord()) + } } nbytes := rdr.TrailingBytes() for nbytes > 0 { nbytes-- bt, validBits := rdr.NextTrailingByte() + if mode == transferInvert { + bt = ^bt + } wr.PutNextTrailingByte(bt, validBits) } return @@ -417,14 +430,33 @@ func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) { // - high 5 bits: old bits from last byte of dest buffer trailingBits := nbytes*8 - length trailMask := byte(uint(1)<<(8-trailingBits)) - 1 - - copy(dst, src[:nbytes-1]) - lastData := src[nbytes-1] + var lastData byte + if mode == transferInvert { + for i, b := range src[:nbytes-1] { + dst[i] = ^b + } + lastData = ^src[nbytes-1] + } else { + copy(dst, src[:nbytes-1]) + lastData = src[nbytes-1] + } dst[nbytes-1] &= ^trailMask dst[nbytes-1] |= lastData & trailMask } +// CopyBitmap copies the bitmap indicated by src, starting at bit offset srcOffset, +// and copying length bits into dst, starting at bit offset dstOffset. +func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) { + transferBitmap(transferCopy, src, srcOffset, length, dst, dstOffset) +} + +// InvertBitmap copies a bit range of a bitmap, inverting it as it copies +// over into the destination. +func InvertBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) { + transferBitmap(transferInvert, src, srcOffset, length, dst, dstOffset) +} + type bitOp struct { opWord func(uint64, uint64) uint64 opByte func(byte, byte) byte @@ -440,6 +472,14 @@ var ( opWord: func(l, r uint64) uint64 { return l | r }, opByte: func(l, r byte) byte { return l | r }, } + bitAndNotOp = bitOp{ + opWord: func(l, r uint64) uint64 { return l &^ r }, + opByte: func(l, r byte) byte { return l &^ r }, + } + bitXorOp = bitOp{ + opWord: func(l, r uint64) uint64 { return l ^ r }, + opByte: func(l, r byte) byte { return l ^ r }, + } ) func alignedBitmapOp(op bitOp, left, right []byte, lOffset, rOffset int64, out []byte, outOffset int64, length int64) { @@ -532,6 +572,22 @@ func BitmapOrAlloc(mem memory.Allocator, left, right []byte, lOffset, rOffset in return BitmapOpAlloc(mem, bitOrOp, left, right, lOffset, rOffset, length, outOffset) } +func BitmapAndNot(left, right []byte, lOffset, rOffset int64, out []byte, outOffset int64, length int64) { + BitmapOp(bitAndNotOp, left, right, lOffset, rOffset, out, outOffset, length) +} + +func BitmapAndNotAlloc(mem memory.Allocator, left, right []byte, lOffset, rOffset int64, length, outOffset int64) *memory.Buffer { + return BitmapOpAlloc(mem, bitAndNotOp, left, right, lOffset, rOffset, length, outOffset) +} + +func BitmapXor(left, right []byte, lOffset, rOffset int64, out []byte, outOffset int64, length int64) { + BitmapOp(bitXorOp, left, right, lOffset, rOffset, out, outOffset, length) +} + +func BitmapXorAlloc(mem memory.Allocator, left, right []byte, lOffset, rOffset int64, length, outOffset int64) *memory.Buffer { + return BitmapOpAlloc(mem, bitXorOp, left, right, lOffset, rOffset, length, outOffset) +} + func BitmapEquals(left, right []byte, lOffset, rOffset int64, length int64) bool { if lOffset%8 == 0 && rOffset%8 == 0 { // byte aligned, fast path, can use bytes.Equal (memcmp) @@ -584,3 +640,108 @@ type OptionalBitIndexer struct { func (b *OptionalBitIndexer) GetBit(i int) bool { return b.Bitmap == 
nil || BitIsSet(b.Bitmap, b.Offset+i)
+}
+
+type Bitmap struct {
+    Data        []byte
+    Offset, Len int64
+}
+
+func bitLength(bitmaps []Bitmap) (int64, error) {
+    for _, b := range bitmaps[1:] {
+        if b.Len != bitmaps[0].Len {
+            return -1, errors.New("bitmaps must be same length")
+        }
+    }
+    return bitmaps[0].Len, nil
+}
+
+func runVisitWordsAndWriteLoop(bitLen int64, rdrs []*BitmapWordReader, wrs []*BitmapWordWriter, visitor func(in, out []uint64)) {
+    const bitWidth int64 = int64(uint64SizeBits)
+
+    visited := make([]uint64, len(rdrs))
+    output := make([]uint64, len(wrs))
+
+    // every reader will have the same number of words, since they all have
+    // the same length. This will be inefficient in some cases: when the
+    // offsets push past a word boundary, every word has to be assembled
+    // from 2 adjoining words.
+    nwords := int64(rdrs[0].Words())
+    bitLen -= nwords * bitWidth
+    for nwords > 0 {
+        nwords--
+        for i := range visited {
+            visited[i] = rdrs[i].NextWord()
+        }
+        visitor(visited, output)
+        for i := range output {
+            wrs[i].PutNextWord(output[i])
+        }
+    }
+
+    // every reader will have the same number of trailing bytes, because
+    // we already confirmed they have the same length. Because offsets
+    // beyond the word boundary can span adjoining words, the trailing
+    // portion could be more than one remaining full/partial word to write.
+    if bitLen == 0 {
+        return
+    }
+
+    // convert the word visitor to a byte visitor
+    byteVisitor := func(in, out []byte) {
+        for i, w := range in {
+            visited[i] = uint64(w)
+        }
+        visitor(visited, output)
+        for i, w := range output {
+            out[i] = byte(w)
+        }
+    }
+
+    visitedBytes := make([]byte, len(rdrs))
+    outputBytes := make([]byte, len(wrs))
+    nbytes := rdrs[0].trailingBytes
+    for nbytes > 0 {
+        nbytes--
+        memory.Set(visitedBytes, 0)
+        memory.Set(outputBytes, 0)
+
+        var validBits int
+        for i := range rdrs {
+            visitedBytes[i], validBits = rdrs[i].NextTrailingByte()
+        }
+        byteVisitor(visitedBytes, outputBytes)
+        for i, w := range outputBytes {
+            wrs[i].PutNextTrailingByte(w, validBits)
+        }
+    }
+}
+
+// VisitWordsAndWrite visits words of bits from each input bitmap and
+// writes the results to a slice of output Bitmaps.
+//
+// All bitmaps must have identical lengths. The first bit in a visited
+// bitmap may be offset within the first visited word, but words will
+// otherwise contain densely packed bits loaded from the bitmap.
+//
+// NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+// It also has a large prolog/epilog overhead and should be used
+// carefully in other cases. For 2 or fewer bitmaps, and/or smaller
+// bitmaps, try BitmapReader or the other utilities.
+func VisitWordsAndWrite(args []Bitmap, out []Bitmap, visitor func(in, out []uint64)) error { + bitLen, err := bitLength(args) + if err != nil { + return err + } + + rdrs, wrs := make([]*BitmapWordReader, len(args)), make([]*BitmapWordWriter, len(out)) + for i, in := range args { + rdrs[i] = NewBitmapWordReader(in.Data, int(in.Offset), int(in.Len)) + } + for i, o := range out { + wrs[i] = NewBitmapWordWriter(o.Data, int(o.Offset), int(o.Len)) + } + runVisitWordsAndWriteLoop(bitLen, rdrs, wrs, visitor) + return nil +} diff --git a/go/arrow/compute/internal/kernels/scalar_boolean.go b/go/arrow/compute/internal/kernels/scalar_boolean.go new file mode 100644 index 00000000000..a458306451b --- /dev/null +++ b/go/arrow/compute/internal/kernels/scalar_boolean.go @@ -0,0 +1,332 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernels + +import ( + "github.com/apache/arrow/go/v10/arrow/bitutil" + "github.com/apache/arrow/go/v10/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v10/arrow/scalar" +) + +type computeWordFN func(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64) + +func computeKleene(computeWord computeWordFN, ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error { + var ( + inBMs = [4]bitutil.Bitmap{ + {Data: left.Buffers[0].Buf, Offset: left.Offset, Len: left.Len}, + {Data: left.Buffers[1].Buf, Offset: left.Offset, Len: left.Len}, + {Data: right.Buffers[1].Buf, Offset: right.Offset, Len: right.Len}, + {Data: right.Buffers[0].Buf, Offset: right.Offset, Len: right.Len}, + } + outBMs = [2]bitutil.Bitmap{ + {Data: out.Buffers[0].Buf, Offset: out.Offset, Len: out.Len}, + {Data: out.Buffers[1].Buf, Offset: out.Offset, Len: out.Len}, + } + apply = func(leftValid, leftData uint64, rightValid, rightData uint64) (outValidity, outData uint64) { + leftTrue, leftFalse := leftValid&leftData, leftValid&^leftData + rightTrue, rightFalse := rightValid&rightData, rightValid&^rightData + return computeWord(leftTrue, leftFalse, rightTrue, rightFalse) + } + ) + + switch { + case right.UpdateNullCount() == 0: + return bitutil.VisitWordsAndWrite(inBMs[:3], outBMs[:], + func(in, out []uint64) { + out[0], out[1] = apply(in[0], in[1], ^uint64(0), in[2]) + }) + case left.UpdateNullCount() == 0: + return bitutil.VisitWordsAndWrite(inBMs[1:], outBMs[:], + func(in, out []uint64) { + out[0], out[1] = apply(^uint64(0), in[0], in[2], in[1]) + }) + default: + return bitutil.VisitWordsAndWrite(inBMs[:], outBMs[:], + func(in, out []uint64) { + out[0], out[1] = apply(in[0], in[1], in[3], in[2]) + }) + } +} + +type AndOpKernel struct { + commutativeBinaryKernel[AndOpKernel] +} + +func (AndOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error { + 
bitutil.BitmapAnd(left.Buffers[1].Buf, right.Buffers[1].Buf, + left.Offset, right.Offset, out.Buffers[1].Buf, out.Offset, left.Len) + return nil +} + +func (AndOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error { + if !left.IsValid() { + return nil + } + + outBM := out.Buffers[1].Buf + if left.(*scalar.Boolean).Value { + bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), + int(right.Len), outBM, int(out.Offset)) + } else { + bitutil.SetBitsTo(outBM, out.Offset, out.Len, false) + } + return nil +} + +type KleeneAndOpKernel struct { + commutativeBinaryKernel[KleeneAndOpKernel] +} + +func (KleeneAndOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error { + if left.UpdateNullCount() == 0 && right.UpdateNullCount() == 0 { + bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true) + out.Nulls = 0 + return (AndOpKernel{}).Call(ctx, left, right, out) + } + + computeWord := func(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64) { + return leftFalse | rightFalse | (leftTrue & rightTrue), leftTrue & rightTrue + } + return computeKleene(computeWord, ctx, left, right, out) +} + +func (KleeneAndOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error { + var ( + leftTrue = left.IsValid() && left.(*scalar.Boolean).Value + leftFalse = left.IsValid() && !left.(*scalar.Boolean).Value + ) + + switch { + case leftFalse: + bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true) + out.Nulls = 0 + bitutil.SetBitsTo(out.Buffers[1].Buf, out.Offset, out.Len, false) + case leftTrue: + if right.UpdateNullCount() == 0 { + bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true) + out.Nulls = 0 + } else { + bitutil.CopyBitmap(right.Buffers[0].Buf, int(right.Offset), int(right.Len), + out.Buffers[0].Buf, int(out.Offset)) + } + bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len), + out.Buffers[1].Buf, int(out.Offset)) + default: // scalar was null: out[i] is valid iff right[i] was false + if right.UpdateNullCount() == 0 { + bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len), + out.Buffers[0].Buf, int(out.Offset)) + } else { + bitutil.BitmapAndNot(right.Buffers[0].Buf, right.Buffers[1].Buf, right.Offset, + right.Offset, out.Buffers[0].Buf, out.Offset, right.Len) + } + bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len), + out.Buffers[1].Buf, int(out.Offset)) + } + return nil +} + +type OrOpKernel struct { + commutativeBinaryKernel[OrOpKernel] +} + +func (OrOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error { + bitutil.BitmapOr(left.Buffers[1].Buf, right.Buffers[1].Buf, + left.Offset, right.Offset, out.Buffers[1].Buf, out.Offset, left.Len) + return nil +} + +func (OrOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error { + if !left.IsValid() { + return nil + } + + outBM := out.Buffers[1].Buf + if left.(*scalar.Boolean).Value { + bitutil.SetBitsTo(outBM, out.Offset, out.Len, true) + } else { + bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), + int(right.Len), outBM, int(out.Offset)) + } + return nil +} + +type KleeneOrOpKernel struct { + commutativeBinaryKernel[KleeneOrOpKernel] +} + +func (KleeneOrOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error { + if 
left.UpdateNullCount() == 0 && right.UpdateNullCount() == 0 { + bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true) + out.Nulls = 0 + return (OrOpKernel{}).Call(ctx, left, right, out) + } + + computeWord := func(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64) { + return leftTrue | rightTrue | (leftFalse & rightFalse), leftTrue | rightTrue + } + return computeKleene(computeWord, ctx, left, right, out) +} + +func (KleeneOrOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error { + var ( + leftTrue = left.IsValid() && left.(*scalar.Boolean).Value + leftFalse = left.IsValid() && !left.(*scalar.Boolean).Value + ) + + switch { + case leftTrue: + bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true) + out.Nulls = 0 + bitutil.SetBitsTo(out.Buffers[1].Buf, out.Offset, out.Len, true) // all true case + case leftFalse: + if right.UpdateNullCount() == 0 { + bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true) + out.Nulls = 0 + } else { + bitutil.CopyBitmap(right.Buffers[0].Buf, int(right.Offset), int(right.Len), + out.Buffers[0].Buf, int(out.Offset)) + } + bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len), + out.Buffers[1].Buf, int(out.Offset)) + default: // scalar was null: out[i] is valid iff right[i] was true + if right.UpdateNullCount() == 0 { + bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len), + out.Buffers[0].Buf, int(out.Offset)) + } else { + bitutil.BitmapAnd(right.Buffers[0].Buf, right.Buffers[1].Buf, right.Offset, + right.Offset, out.Buffers[0].Buf, out.Offset, right.Len) + } + bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len), + out.Buffers[1].Buf, int(out.Offset)) + } + return nil +} + +type XorOpKernel struct { + commutativeBinaryKernel[XorOpKernel] +} + +func (XorOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error { + bitutil.BitmapXor(left.Buffers[1].Buf, right.Buffers[1].Buf, + left.Offset, right.Offset, out.Buffers[1].Buf, out.Offset, out.Len) + return nil +} + +func (XorOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error { + if !left.IsValid() { + return nil + } + + outBM := out.Buffers[1].Buf + if left.(*scalar.Boolean).Value { + bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len), + outBM, int(out.Offset)) + } else { + bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len), + outBM, int(out.Offset)) + } + return nil +} + +func invertScalar(in scalar.Scalar) *scalar.Boolean { + if in.IsValid() { + return scalar.NewBooleanScalar(!in.(*scalar.Boolean).Value) + } + return in.(*scalar.Boolean) +} + +type AndNotOpKernel struct{} + +func (AndNotOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error { + bitutil.BitmapAndNot(left.Buffers[1].Buf, right.Buffers[1].Buf, left.Offset, right.Offset, + out.Buffers[1].Buf, out.Offset, right.Len) + return nil +} + +func (AndNotOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error { + if !left.IsValid() { + return nil + } + + outBM := out.Buffers[1].Buf + if left.(*scalar.Boolean).Value { + bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len), + outBM, int(out.Offset)) + } else { + bitutil.SetBitsTo(outBM, out.Offset, out.Len, false) + } + return nil +} + +func (AndNotOpKernel) 
CallScalarRight(ctx *exec.KernelCtx, left *exec.ArraySpan, right scalar.Scalar, out *exec.ExecResult) error { + return (AndOpKernel{}).CallScalarRight(ctx, left, invertScalar(right), out) +} + +type KleeneAndNotOpKernel struct{} + +func (KleeneAndNotOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error { + if left.UpdateNullCount() == 0 && right.UpdateNullCount() == 0 { + bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true) + out.Nulls = 0 + return (AndNotOpKernel{}).Call(ctx, left, right, out) + } + + computeWord := func(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64) { + return leftFalse | rightTrue | (leftTrue & rightFalse), leftTrue & rightFalse + } + + return computeKleene(computeWord, ctx, left, right, out) +} + +func (KleeneAndNotOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error { + var ( + leftTrue = left.IsValid() && left.(*scalar.Boolean).Value + leftFalse = left.IsValid() && !left.(*scalar.Boolean).Value + ) + + switch { + case leftFalse: + bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true) + out.Nulls = 0 + bitutil.SetBitsTo(out.Buffers[1].Buf, out.Offset, out.Len, false) + case leftTrue: + if right.UpdateNullCount() == 0 { + bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true) + out.Nulls = 0 + } else { + bitutil.CopyBitmap(right.Buffers[0].Buf, int(right.Offset), int(right.Len), + out.Buffers[0].Buf, int(out.Offset)) + } + bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len), + out.Buffers[1].Buf, int(out.Offset)) + default: // scalar was null: out[i] is valid iff right[i] was true + if right.UpdateNullCount() == 0 { + bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len), + out.Buffers[0].Buf, int(out.Offset)) + } else { + bitutil.BitmapAnd(right.Buffers[0].Buf, right.Buffers[1].Buf, right.Offset, right.Offset, + out.Buffers[0].Buf, out.Offset, right.Len) + } + bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len), + out.Buffers[1].Buf, int(out.Offset)) + } + return nil +} + +func (KleeneAndNotOpKernel) CallScalarRight(ctx *exec.KernelCtx, left *exec.ArraySpan, right scalar.Scalar, out *exec.ExecResult) error { + return (KleeneAndOpKernel{}).CallScalarRight(ctx, left, invertScalar(right), out) +} diff --git a/go/arrow/compute/internal/kernels/types.go b/go/arrow/compute/internal/kernels/types.go index eeae4b6c4e6..073e1c608c8 100644 --- a/go/arrow/compute/internal/kernels/types.go +++ b/go/arrow/compute/internal/kernels/types.go @@ -17,7 +17,12 @@ package kernels import ( + "fmt" + "github.com/apache/arrow/go/v10/arrow" + "github.com/apache/arrow/go/v10/arrow/compute/internal/exec" + "github.com/apache/arrow/go/v10/arrow/internal/debug" + "github.com/apache/arrow/go/v10/arrow/scalar" ) var ( @@ -62,3 +67,41 @@ const ( CmpLT CmpLE ) + +type simpleBinaryKernel interface { + Call(*exec.KernelCtx, *exec.ArraySpan, *exec.ArraySpan, *exec.ExecResult) error + CallScalarLeft(*exec.KernelCtx, scalar.Scalar, *exec.ArraySpan, *exec.ExecResult) error +} + +type commutativeBinaryKernel[T simpleBinaryKernel] struct{} + +func (commutativeBinaryKernel[T]) CallScalarRight(ctx *exec.KernelCtx, left *exec.ArraySpan, right scalar.Scalar, out *exec.ExecResult) error { + var t T + return t.CallScalarLeft(ctx, right, left, out) +} + +type SimpleBinaryKernel interface { + simpleBinaryKernel + CallScalarRight(*exec.KernelCtx, *exec.ArraySpan, scalar.Scalar, 
*exec.ExecResult) error +} + +func SimpleBinary[K SimpleBinaryKernel](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + if batch.Len == 0 { + return nil + } + + var k K + if batch.Values[0].IsArray() { + if batch.Values[1].IsArray() { + return k.Call(ctx, &batch.Values[0].Array, &batch.Values[1].Array, out) + } + return k.CallScalarRight(ctx, &batch.Values[0].Array, batch.Values[1].Scalar, out) + } + + if batch.Values[1].IsArray() { + return k.CallScalarLeft(ctx, batch.Values[0].Scalar, &batch.Values[1].Array, out) + } + + debug.Assert(false, "should be unreachable") + return fmt.Errorf("%w: should be unreachable", arrow.ErrInvalid) +} diff --git a/go/arrow/compute/registry.go b/go/arrow/compute/registry.go index d56605f407b..c28eea619a8 100644 --- a/go/arrow/compute/registry.go +++ b/go/arrow/compute/registry.go @@ -46,6 +46,7 @@ func GetFunctionRegistry() FunctionRegistry { registry = NewRegistry() RegisterScalarCast(registry) RegisterVectorSelection(registry) + RegisterScalarBoolean(registry) RegisterScalarArithmetic(registry) }) return registry diff --git a/go/arrow/compute/scalar_bool.go b/go/arrow/compute/scalar_bool.go new file mode 100644 index 00000000000..0a0f6afd191 --- /dev/null +++ b/go/arrow/compute/scalar_bool.go @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package compute
+
+import (
+    "fmt"
+
+    "github.com/apache/arrow/go/v10/arrow"
+    "github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+    "github.com/apache/arrow/go/v10/arrow/compute/internal/kernels"
+)
+
+var (
+    andDoc = FunctionDoc{
+        Summary:     "Logical 'and' boolean values",
+        Description: "When a null is encountered in either input, a null is output.\nFor a different null behavior, see function 'and_kleene'",
+        ArgNames:    []string{"x", "y"},
+    }
+    andNotDoc = FunctionDoc{
+        Summary:     "Logical 'and not' boolean values",
+        Description: "When a null is encountered in either input, a null is output.\nFor a different null behavior, see function 'and_not_kleene'",
+        ArgNames:    []string{"x", "y"},
+    }
+    orDoc = FunctionDoc{
+        Summary:     "Logical 'or' boolean values",
+        Description: "When a null is encountered in either input, a null is output.\nFor a different null behavior, see function 'or_kleene'",
+        ArgNames:    []string{"x", "y"},
+    }
+    xorDoc = FunctionDoc{
+        Summary:     "Logical 'xor' boolean values",
+        Description: "When a null is encountered in either input, a null is output.",
+        ArgNames:    []string{"x", "y"},
+    }
+    andKleeneDoc = FunctionDoc{
+        Summary: "Logical 'and' boolean values (Kleene logic)",
+        Description: `This function behaves as follows with nulls:
+
+    - true and null = null
+    - null and true = null
+    - false and null = false
+    - null and false = false
+    - null and null = null
+
+    In other words, in this context, a null value really means "unknown"
+    and an unknown value "and" false is always false.
+    For a different null behavior, see function "and".`,
+        ArgNames: []string{"x", "y"},
+    }
+    andNotKleeneDoc = FunctionDoc{
+        Summary: "Logical 'and_not' boolean values (Kleene logic)",
+        Description: `This function behaves as follows with nulls:
+
+    - true and not null = null
+    - null and not false = null
+    - false and not null = false
+    - null and not true = false
+    - null and not null = null
+
+    In other words, in this context, a null value really means "unknown"
+    and an unknown value "and not" true is always false, as is false
+    "and not" an unknown value.
+    For a different null behavior, see function "and_not".`,
+        ArgNames: []string{"x", "y"},
+    }
+    orKleeneDoc = FunctionDoc{
+        Summary: "Logical 'or' boolean values (Kleene logic)",
+        Description: `This function behaves as follows with nulls:
+
+    - true or null = true
+    - null or true = true
+    - false or null = null
+    - null or false = null
+    - null or null = null
+
+    In other words, in this context, a null value really means "unknown"
+    and an unknown value "or" true is always true.
+    For a different null behavior, see function "or".`,
+        ArgNames: []string{"x", "y"},
+    }
+)
+
+func makeFunction(reg FunctionRegistry, name string, arity int, ex exec.ArrayKernelExec, doc FunctionDoc, nulls exec.NullHandling) {
+    fn := NewScalarFunction(name, Arity{NArgs: arity}, doc)
+
+    inTypes := make([]exec.InputType, arity)
+    for i := range inTypes {
+        inTypes[i] = exec.NewExactInput(arrow.FixedWidthTypes.Boolean)
+    }
+
+    k := exec.NewScalarKernel(inTypes, exec.NewOutputType(arrow.FixedWidthTypes.Boolean), ex, nil)
+    k.NullHandling = nulls
+
+    if err := fn.AddKernel(k); err != nil {
+        panic(err)
+    }
+
+    if !reg.AddFunction(fn, false) {
+        panic(fmt.Errorf("function '%s' already exists", name))
+    }
+}
+
+func RegisterScalarBoolean(reg FunctionRegistry) {
+    makeFunction(reg, "and", 2, kernels.SimpleBinary[kernels.AndOpKernel],
+        andDoc, exec.NullIntersection)
+    makeFunction(reg, "and_not", 2, kernels.SimpleBinary[kernels.AndNotOpKernel],
+        andNotDoc, exec.NullIntersection)
+    makeFunction(reg, "or", 2, kernels.SimpleBinary[kernels.OrOpKernel],
+        orDoc, exec.NullIntersection)
+    makeFunction(reg, "xor", 2, kernels.SimpleBinary[kernels.XorOpKernel],
+        xorDoc, exec.NullIntersection)
+    makeFunction(reg, "and_kleene", 2, kernels.SimpleBinary[kernels.KleeneAndOpKernel],
+        andKleeneDoc, exec.NullComputedPrealloc)
+    makeFunction(reg, "and_not_kleene", 2, kernels.SimpleBinary[kernels.KleeneAndNotOpKernel],
+        andNotKleeneDoc, exec.NullComputedPrealloc)
+    makeFunction(reg, "or_kleene", 2, kernels.SimpleBinary[kernels.KleeneOrOpKernel],
+        orKleeneDoc, exec.NullComputedPrealloc)
+}
diff --git a/go/arrow/compute/scalar_bool_test.go b/go/arrow/compute/scalar_bool_test.go
new file mode 100644
index 00000000000..956118d2653
--- /dev/null
+++ b/go/arrow/compute/scalar_bool_test.go
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
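For orientation, here is a hedged usage sketch (illustrative, not part of the patch) of how the functions registered by RegisterScalarBoolean above are reached through the public compute API; it uses only calls that also appear in the tests below (CallFunction, WithAllocator, ArrayDatum, array.FromJSON):

package main

import (
	"context"
	"fmt"
	"strings"

	"github.com/apache/arrow/go/v10/arrow"
	"github.com/apache/arrow/go/v10/arrow/array"
	"github.com/apache/arrow/go/v10/arrow/compute"
	"github.com/apache/arrow/go/v10/arrow/memory"
)

func main() {
	mem := memory.DefaultAllocator
	ctx := compute.WithAllocator(context.Background(), mem)

	left, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
		strings.NewReader(`[true, false, null]`))
	defer left.Release()
	right, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
		strings.NewReader(`[null, null, null]`))
	defer right.Release()

	// Plain "and" would yield all nulls here; Kleene logic knows
	// that false AND null is definitely false.
	out, err := compute.CallFunction(ctx, "and_kleene", nil,
		&compute.ArrayDatum{Value: left.Data()},
		&compute.ArrayDatum{Value: right.Data()})
	if err != nil {
		panic(err)
	}
	defer out.Release()
	fmt.Println(out) // expected values per the Kleene truth table: [null, false, null]
}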
+
+package compute_test
+
+import (
+    "context"
+    "strings"
+    "testing"
+
+    "github.com/apache/arrow/go/v10/arrow"
+    "github.com/apache/arrow/go/v10/arrow/array"
+    "github.com/apache/arrow/go/v10/arrow/compute"
+    "github.com/apache/arrow/go/v10/arrow/memory"
+    "github.com/apache/arrow/go/v10/arrow/scalar"
+    "github.com/stretchr/testify/require"
+)
+
+func checkScalarBinary(t *testing.T, fn string, left, right, expected compute.Datum, opts compute.FunctionOptions) {
+    checkScalar(t, fn, []compute.Datum{left, right}, expected, opts)
+}
+
+func checkBooleanScalarArrayBinary(t *testing.T, ctx context.Context, funcName string, array compute.Datum) {
+    mem := compute.GetAllocator(ctx)
+    for _, sc := range []scalar.Scalar{scalar.MakeNullScalar(arrow.FixedWidthTypes.Boolean), scalar.NewBooleanScalar(true), scalar.NewBooleanScalar(false)} {
+        constantArr, err := scalar.MakeArrayFromScalar(sc, int(array.Len()), mem)
+        require.NoError(t, err)
+        defer constantArr.Release()
+
+        expected, err := compute.CallFunction(ctx, funcName, nil, &compute.ArrayDatum{Value: constantArr.Data()}, array)
+        require.NoError(t, err)
+        defer expected.Release()
+
+        checkScalar(t, funcName, []compute.Datum{compute.NewDatum(sc), array}, expected, nil)
+
+        expected, err = compute.CallFunction(ctx, funcName, nil, array, &compute.ArrayDatum{Value: constantArr.Data()})
+        require.NoError(t, err)
+        defer expected.Release()
+        checkScalar(t, funcName, []compute.Datum{array, compute.NewDatum(sc)}, expected, nil)
+    }
+}
+
+func TestBooleanKernels(t *testing.T) {
+    tests := []struct {
+        fn           string
+        expectedJSON string
+        commutative  bool
+    }{
+        {"and", `[true, false, null, false, null, null]`, true},
+        {"or", `[true, true, null, false, null, null]`, true},
+        {"xor", `[false, true, null, false, null, null]`, true},
+        {"and_not", `[false, true, null, false, false, null, null, null, null]`, false},
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.fn, func(t *testing.T) {
+            mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+            defer mem.AssertSize(t, 0)
+
+            var (
+                leftJSON  = `[true, true, true, false, false, null]`
+                rightJSON = `[true, false, null, false, null, null]`
+            )
+
+            if !tt.commutative {
+                leftJSON = `[true, true, true, false, false, false, null, null, null]`
+                rightJSON = `[true, false, null, true, false, null, true, false, null]`
+            }
+
+            left, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+                strings.NewReader(leftJSON))
+            defer left.Release()
+            right, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+                strings.NewReader(rightJSON))
+            defer right.Release()
+            exp, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean, strings.NewReader(tt.expectedJSON))
+            defer exp.Release()
+
+            checkScalarBinary(t, tt.fn, &compute.ArrayDatum{Value: left.Data()}, &compute.ArrayDatum{Value: right.Data()}, &compute.ArrayDatum{Value: exp.Data()}, nil)
+            ctx := compute.WithAllocator(context.Background(), mem)
+            checkBooleanScalarArrayBinary(t, ctx, tt.fn, &compute.ArrayDatum{Value: left.Data()})
+        })
+    }
+}
+
+func TestBooleanKleeneKernels(t *testing.T) {
+    tests := []struct {
+        fn           string
+        expectedJSON []string
+        commutative  bool
+    }{
+        {"and_kleene", []string{`[true, false, null, false, false, null]`, `[true, false, false, null, false]`, `[true, false, false, false]`}, true},
+        {"or_kleene", []string{`[true, true, true, false, null, null]`, `[true, true, false, true, null]`, `[true, true, false, true]`}, true},
+        {"and_not_kleene", []string{`[false, true, null, false, false, false, false, null, null]`, `[false, true, false, false]`}, false},
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.fn, func(t *testing.T) {
+            var (
+                leftJSON  = make([]string, len(tt.expectedJSON))
+                rightJSON = make([]string, len(tt.expectedJSON))
+            )
+
+            if tt.commutative {
+                leftJSON[0] = `[true, true, true, false, false, null]`
+                rightJSON[0] = `[true, false, null, false, null, null]`
+                leftJSON[1] = `[true, true, false, null, null]`
+                rightJSON[1] = `[true, false, false, true, false]`
+                leftJSON[2] = `[true, true, false, true]`
+                rightJSON[2] = `[true, false, false, false]`
+            } else {
+                leftJSON[0] = `[true, true, true, false, false, false, null, null, null]`
+                rightJSON[0] = `[true, false, null, true, false, null, true, false, null]`
+                leftJSON[1] = `[true, true, false, false]`
+                rightJSON[1] = `[true, false, true, false]`
+            }
+
+            for i := range tt.expectedJSON {
+                func() {
+                    mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+                    defer mem.AssertSize(t, 0)
+
+                    left, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+                        strings.NewReader(leftJSON[i]))
+                    defer left.Release()
+                    right, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+                        strings.NewReader(rightJSON[i]))
+                    defer right.Release()
+                    exp, _, _ := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean, strings.NewReader(tt.expectedJSON[i]))
+                    defer exp.Release()
+
+                    checkScalarBinary(t, tt.fn, &compute.ArrayDatum{Value: left.Data()}, &compute.ArrayDatum{Value: right.Data()}, &compute.ArrayDatum{Value: exp.Data()}, nil)
+                    ctx := compute.WithAllocator(context.Background(), mem)
+                    checkBooleanScalarArrayBinary(t, ctx, tt.fn, &compute.ArrayDatum{Value: left.Data()})
+                }()
+            }
+        })
+    }
+}
diff --git a/go/go.sum b/go/go.sum
index 04695d55594..b247b659ccf 100644
--- a/go/go.sum
+++ b/go/go.sum
@@ -137,6 +137,7 @@ github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qq
 github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
 github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
 github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
+github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
 github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w=
 github.com/ruudk/golang-pdf417 v0.0.0-20201230142125-a7e3863a1245/go.mod h1:pQAZKsJ8yyVxGRWYNEm9oFB8ieLgKFnamEyDmSA0BRk=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
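A closing note on the Kleene bit arithmetic: KleeneAndNotOpKernel.Call computes validity and data a whole word at a time via outValid = leftFalse | rightTrue | (leftTrue & rightFalse) and outData = leftTrue & rightFalse, where xTrue/xFalse mean "valid and set"/"valid and clear". The standalone snippet below (illustrative, not part of the patch) replays the nine operand combinations through that formula and reproduces the and_not_kleene truth table from the docs above:

package main

import "fmt"

// Same word-level formula as KleeneAndNotOpKernel.Call in the patch.
func computeWord(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64) {
	return leftFalse | rightTrue | (leftTrue & rightFalse), leftTrue & rightFalse
}

func main() {
	names := []string{"true", "false", "null"}
	// One bit position per (left, right) combination: i = 3*left + right,
	// where 0 = true, 1 = false, 2 = null.
	var lt, lf, rt, rf uint64
	for i := uint64(0); i < 9; i++ {
		switch i / 3 {
		case 0:
			lt |= 1 << i
		case 1:
			lf |= 1 << i
		}
		switch i % 3 {
		case 0:
			rt |= 1 << i
		case 1:
			rf |= 1 << i
		}
	}
	valid, data := computeWord(lt, lf, rt, rf)
	for i := uint64(0); i < 9; i++ {
		result := "null"
		if valid&(1<<i) != 0 {
			result = fmt.Sprint(data&(1<<i) != 0)
		}
		// Prints e.g. "false and not null = false", matching the
		// truth table in andNotKleeneDoc.
		fmt.Printf("%s and not %s = %s\n", names[i/3], names[i%3], result)
	}
}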