From f76820b54528b9228d9ded1d3fc56f395f5f996c Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Wed, 27 Mar 2024 22:07:25 -0700 Subject: [PATCH] Cranelift x64 SIMD: some special-cases to make i64x2 packing faster. Sometimes, when in the course of silly optimizations to make the most of one's registers, one might want to pack two `i64`s into one `v128`, and one might want to do it without any loads or stores. In clang targeting Wasm at least, building an `i64x2` (with `wasm_i64x2_make(a, b)` from `<wasm_simd128.h>`) will generate (i) an `i64x2.splat` to create a new v128 with lane 0's value in both lanes, then (ii) an `i64x2.replace_lane` to put lane 1's value in place. Or, in the case that one of the lanes is zero, it will generate a `v128.const 0` then insert the other lane. Cranelift's lowerings for both of these patterns on x64 are slightly less optimal than they could be. - For the former (replace-lane of splat), the 64-bit value is moved over to the XMM register, then the rest of the `splat` semantics are implemented by a `pshufd` (shuffle), even though we're just about to overwrite the only other lane. We could omit that shuffle instead, and everything would work fine. This optimization is specific to `i64x2` (that is, only two lanes): we need to know that the only other lane that the `splat` is splatting into is overwritten. We could in theory match a chain of replace-lane operators for higher-lane-count types, but let's save that for the case that we actually need it later. - For the latter (replace-lane of constant zero), the load of a constant zero from the constant pool is the part that bothers me most. While I like zeroed memory as much as the next person, there is a vector XOR instruction *right there* under our noses, and we'd be silly not to use it. This applies to any `vconst 0`, not just ones that occur as a source to replace-lane. 
--- cranelift/codegen/src/isa/x64/lower.isle | 14 +++ .../isa/x64/simd-lane-access-compile.clif | 25 +++--- .../isa/x64/simd-make-vectors-avx.clif | 86 +++++++++++++++++++ .../filetests/isa/x64/simd-make-vectors.clif | 86 +++++++++++++++++++ .../filetests/runtests/simd-make-vectors.clif | 39 +++++++++ 5 files changed, 235 insertions(+), 15 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/simd-make-vectors-avx.clif create mode 100644 cranelift/filetests/filetests/isa/x64/simd-make-vectors.clif create mode 100644 cranelift/filetests/filetests/runtests/simd-make-vectors.clif diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 50798aa2bcd7..64f738053c00 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1577,6 +1577,15 @@ (rule (lower (insertlane vec @ (value_type $I64X2) val (u8_from_uimm8 1))) (x64_punpcklqdq vec (x64_movq_to_xmm val))) +;; (i64x2.replace_lane 1) with a splat as source for lane 0 -- we can elide +;; the splat and just do a move. This turns out to be a common pattern when +;; constructing an i64x2 out of two i64s. +(rule 3 (lower (insertlane (has_type $I64X2 (splat lane0)) + lane1 + (u8_from_uimm8 1))) + (if-let $true (use_sse41)) + (x64_pinsrq (bitcast_gpr_to_xmm $I64 lane0) lane1 1)) + (rule 1 (lower (insertlane vec @ (value_type $F32X4) (sinkable_load val) (u8_from_uimm8 idx))) (if-let $true (use_sse41)) (x64_insertps vec val (sse_insertps_lane_imm idx))) @@ -4258,6 +4267,11 @@ ;; TODO use Inst::gen_constant() instead. (x64_xmm_load_const ty (const_to_vconst const))) +;; Special case for a zero-vector: don't load, xor instead. 
+(rule 1 (lower (has_type ty (vconst (u128_from_constant 0)))) + (let ((dst Xmm (xmm_uninit_value))) + (x64_pxor dst dst))) + ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Special case for `pblendw` which takes an 8-bit immediate where each bit diff --git a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif index 5a84fa914190..e97c4c5e8504 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif @@ -15,11 +15,12 @@ block0: ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqu const(3), %xmm0 -; movdqu const(2), %xmm2 +; uninit %xmm0 +; pxor %xmm0, %xmm0, %xmm0 +; movdqu const(2), %xmm3 ; pshufb %xmm0, const(0), %xmm0 -; pshufb %xmm2, const(1), %xmm2 -; por %xmm0, %xmm2, %xmm0 +; pshufb %xmm3, const(1), %xmm3 +; por %xmm0, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -29,22 +30,16 @@ block0: ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x24(%rip), %xmm0 -; movdqu 0x2c(%rip), %xmm2 -; pshufb 0x33(%rip), %xmm0 -; pshufb 0x3a(%rip), %xmm2 -; por %xmm2, %xmm0 +; pxor %xmm0, %xmm0 +; movdqu 0x20(%rip), %xmm3 +; pshufb 0x27(%rip), %xmm0 +; pshufb 0x2e(%rip), %xmm3 +; por %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq ; addb %al, (%rax) ; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) ; addb %al, (%rcx) ; addb %al, (%rax) ; addb %al, (%rax) diff --git a/cranelift/filetests/filetests/isa/x64/simd-make-vectors-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-make-vectors-avx.clif new file mode 100644 index 000000000000..a4cf9d705a5a --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-make-vectors-avx.clif @@ -0,0 +1,86 @@ +test compile precise-output +target x86_64 sse42 has_avx + +function %i64x2_make0() -> i64x2 { +block0: + 
v0 = vconst.i64x2 [0 0] + return v0 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm0 +; vpxor %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpxor %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_make1(i64) -> i64x2 { +block0(v0: i64): + v1 = vconst.i64x2 [0 0] + v2 = insertlane.i64x2 v1, v0, 0 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm3 +; vpxor %xmm3, %xmm3, %xmm5 +; vpinsrq $0, %xmm5, %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpxor %xmm3, %xmm3, %xmm5 +; vpinsrq $0, %rdi, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_make2(i64, i64) -> i64x2 { +block0(v0: i64, v1: i64): + v2 = splat.i64x2 v0 + v3 = insertlane.i64x2 v2, v1, 1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovq %rdi, %xmm3 +; vpinsrq $1, %xmm3, %rsi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovq %rdi, %xmm3 +; vpinsrq $1, %rsi, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/simd-make-vectors.clif b/cranelift/filetests/filetests/isa/x64/simd-make-vectors.clif new file mode 100644 index 000000000000..209c8fc56d9d --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-make-vectors.clif @@ -0,0 +1,86 @@ +test compile precise-output +target x86_64 sse42 + +function %i64x2_make0() -> i64x2 { +block0: + v0 = vconst.i64x2 [0 0] + return v0 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm0 +; pxor %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; 
block1: ; offset 0x4 +; pxor %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_make1(i64) -> i64x2 { +block0(v0: i64): + v1 = vconst.i64x2 [0 0] + v2 = insertlane.i64x2 v1, v0, 0 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm0 +; pxor %xmm0, %xmm0, %xmm0 +; pinsrd.w $0, %xmm0, %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pxor %xmm0, %xmm0 +; pinsrq $0, %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_make2(i64, i64) -> i64x2 { +block0(v0: i64, v1: i64): + v2 = splat.i64x2 v0 + v3 = insertlane.i64x2 v2, v1, 1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %xmm0 +; pinsrd.w $1, %xmm0, %rsi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %xmm0 +; pinsrq $1, %rsi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/simd-make-vectors.clif b/cranelift/filetests/filetests/runtests/simd-make-vectors.clif new file mode 100644 index 000000000000..b90c823264ee --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-make-vectors.clif @@ -0,0 +1,39 @@ +test interpret +test run +target aarch64 +target s390x +target x86_64 +target x86_64 sse42 +target x86_64 sse42 has_avx +target riscv64 has_v +target riscv64 has_v has_c has_zcb + +function %i64x2_make0() -> i64x2 { +block0: + v0 = vconst.i64x2 [0 0] + return v0 +} + +; run: %i64x2_make0() == [0 0] + +function %i64x2_make1(i64) -> i64x2 { +block0(v0: i64): + v1 = vconst.i64x2 [0 0] + v2 = insertlane.i64x2 v1, v0, 0 + return v2 +} + +; run: %i64x2_make1(0) == [0 0] +; run: %i64x2_make1(0x123456789abcdef0) == [0x123456789abcdef0 0] + +function %i64x2_make2(i64, i64) -> i64x2 { +block0(v0: i64, v1: i64): + v2 = splat.i64x2 v0 + v3 = 
insertlane.i64x2 v2, v1, 1 + return v3 +} + +; run: %i64x2_make2(0, 0) == [0 0] +; run: %i64x2_make2(0x123456789abcdef0, 0) == [0x123456789abcdef0 0] +; run: %i64x2_make2(0, 0x123456789abcdef0) == [0 0x123456789abcdef0] +; run: %i64x2_make2(0x123456789abcdef0, 0x0fedcba987654321) == [0x123456789abcdef0 0x0fedcba987654321]