From f76820b54528b9228d9ded1d3fc56f395f5f996c Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Wed, 27 Mar 2024 22:07:25 -0700 Subject: [PATCH] Cranelift x64 SIMD: some special-cases to make i64x2 packing faster. Sometimes, when in the course of silly optimizations to make the most of one's registers, one might want to pack two `i64`s into one `v128`, and one might want to do it without any loads or stores. In clang targeting Wasm at least, building an `i64x2` (with `wasm_i64x2_make(a, b)` from `<wasm_simd128.h>`) will generate (i) an `i64x2.splat` to create a new v128 with lane 0's value in both lanes, then (ii) an `i64x2.replace_lane` to put lane 1's value in place. Or, in the case that one of the lanes is zero, it will generate a `v128.const 0` then insert the other lane. Cranelift's lowerings for both of these patterns on x64 are slightly less optimal than they could be. - For the former (replace-lane of splat), the 64-bit value is moved over to the XMM register, then the rest of the `splat` semantics are implemented by a `pshufd` (shuffle), even though we're just about to overwrite the only other lane. We could omit that shuffle instead, and everything would work fine. This optimization is specific to `i64x2` (that is, only two lanes): we need to know that the only other lane that the `splat` is splatting into is overwritten. We could in theory match a chain of replace-lane operators for higher-lane-count types, but let's save that for the case that we actually need it later. - For the latter (replace-lane of constant zero), the load of a constant zero from the constant pool is the part that bothers me most. While I like zeroed memory as much as the next person, there is a vector XOR instruction *right there* under our noses, and we'd be silly not to use it. This applies to any `vconst 0`, not just ones that occur as a source to replace-lane. 
--- cranelift/codegen/src/isa/x64/lower.isle | 14 +++ .../isa/x64/simd-lane-access-compile.clif | 25 +++--- .../isa/x64/simd-make-vectors-avx.clif | 86 +++++++++++++++++++ .../filetests/isa/x64/simd-make-vectors.clif | 86 +++++++++++++++++++ .../filetests/runtests/simd-make-vectors.clif | 39 +++++++++ 5 files changed, 235 insertions(+), 15 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/simd-make-vectors-avx.clif create mode 100644 cranelift/filetests/filetests/isa/x64/simd-make-vectors.clif create mode 100644 cranelift/filetests/filetests/runtests/simd-make-vectors.clif diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 50798aa2bcd7..64f738053c00 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1577,6 +1577,15 @@ (rule (lower (insertlane vec @ (value_type $I64X2) val (u8_from_uimm8 1))) (x64_punpcklqdq vec (x64_movq_to_xmm val))) +;; (i64x2.replace_lane 1) with a splat as source for lane 0 -- we can elide +;; the splat and just do a move. This turns out to be a common pattern when +;; constructing an i64x2 out of two i64s. +(rule 3 (lower (insertlane (has_type $I64X2 (splat lane0)) + lane1 + (u8_from_uimm8 1))) + (if-let $true (use_sse41)) + (x64_pinsrq (bitcast_gpr_to_xmm $I64 lane0) lane1 1)) + (rule 1 (lower (insertlane vec @ (value_type $F32X4) (sinkable_load val) (u8_from_uimm8 idx))) (if-let $true (use_sse41)) (x64_insertps vec val (sse_insertps_lane_imm idx))) @@ -4258,6 +4267,11 @@ ;; TODO use Inst::gen_constant() instead. (x64_xmm_load_const ty (const_to_vconst const))) +;; Special case for a zero-vector: don't load, xor instead. 
+(rule 1 (lower (has_type ty (vconst (u128_from_constant 0)))) + (let ((dst Xmm (xmm_uninit_value))) + (x64_pxor dst dst))) + ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Special case for `pblendw` which takes an 8-bit immediate where each bit diff --git a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif index 5a84fa914190..e97c4c5e8504 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif @@ -15,11 +15,12 @@ block0: ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqu const(3), %xmm0 -; movdqu const(2), %xmm2 +; uninit %xmm0 +; pxor %xmm0, %xmm0, %xmm0 +; movdqu const(2), %xmm3 ; pshufb %xmm0, const(0), %xmm0 -; pshufb %xmm2, const(1), %xmm2 -; por %xmm0, %xmm2, %xmm0 +; pshufb %xmm3, const(1), %xmm3 +; por %xmm0, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -29,22 +30,16 @@ block0: ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x24(%rip), %xmm0 -; movdqu 0x2c(%rip), %xmm2 -; pshufb 0x33(%rip), %xmm0 -; pshufb 0x3a(%rip), %xmm2 -; por %xmm2, %xmm0 +; pxor %xmm0, %xmm0 +; movdqu 0x20(%rip), %xmm3 +; pshufb 0x27(%rip), %xmm0 +; pshufb 0x2e(%rip), %xmm3 +; por %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq ; addb %al, (%rax) ; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) ; addb %al, (%rcx) ; addb %al, (%rax) ; addb %al, (%rax) diff --git a/cranelift/filetests/filetests/isa/x64/simd-make-vectors-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-make-vectors-avx.clif new file mode 100644 index 000000000000..a4cf9d705a5a --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-make-vectors-avx.clif @@ -0,0 +1,86 @@ +test compile precise-output +target x86_64 sse42 has_avx + +function %i64x2_make0() -> i64x2 { +block0: + 
v0 = vconst.i64x2 [0 0] + return v0 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm0 +; vpxor %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpxor %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_make1(i64) -> i64x2 { +block0(v0: i64): + v1 = vconst.i64x2 [0 0] + v2 = insertlane.i64x2 v1, v0, 0 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm3 +; vpxor %xmm3, %xmm3, %xmm5 +; vpinsrq $0, %xmm5, %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpxor %xmm3, %xmm3, %xmm5 +; vpinsrq $0, %rdi, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_make2(i64, i64) -> i64x2 { +block0(v0: i64, v1: i64): + v2 = splat.i64x2 v0 + v3 = insertlane.i64x2 v2, v1, 1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovq %rdi, %xmm3 +; vpinsrq $1, %xmm3, %rsi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovq %rdi, %xmm3 +; vpinsrq $1, %rsi, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/simd-make-vectors.clif b/cranelift/filetests/filetests/isa/x64/simd-make-vectors.clif new file mode 100644 index 000000000000..209c8fc56d9d --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-make-vectors.clif @@ -0,0 +1,86 @@ +test compile precise-output +target x86_64 sse42 + +function %i64x2_make0() -> i64x2 { +block0: + v0 = vconst.i64x2 [0 0] + return v0 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm0 +; pxor %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; 
block1: ; offset 0x4 +; pxor %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_make1(i64) -> i64x2 { +block0(v0: i64): + v1 = vconst.i64x2 [0 0] + v2 = insertlane.i64x2 v1, v0, 0 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm0 +; pxor %xmm0, %xmm0, %xmm0 +; pinsrd.w $0, %xmm0, %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pxor %xmm0, %xmm0 +; pinsrq $0, %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_make2(i64, i64) -> i64x2 { +block0(v0: i64, v1: i64): + v2 = splat.i64x2 v0 + v3 = insertlane.i64x2 v2, v1, 1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %xmm0 +; pinsrd.w $1, %xmm0, %rsi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %xmm0 +; pinsrq $1, %rsi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/simd-make-vectors.clif b/cranelift/filetests/filetests/runtests/simd-make-vectors.clif new file mode 100644 index 000000000000..b90c823264ee --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-make-vectors.clif @@ -0,0 +1,39 @@ +test interpret +test run +target aarch64 +target s390x +target x86_64 +target x86_64 sse42 +target x86_64 sse42 has_avx +target riscv64 has_v +target riscv64 has_v has_c has_zcb + +function %i64x2_make0() -> i64x2 { +block0: + v0 = vconst.i64x2 [0 0] + return v0 +} + +; run: %i64x2_make0() == [0 0] + +function %i64x2_make1(i64) -> i64x2 { +block0(v0: i64): + v1 = vconst.i64x2 [0 0] + v2 = insertlane.i64x2 v1, v0, 0 + return v2 +} + +; run: %i64x2_make1(0) == [0 0] +; run: %i64x2_make1(0x123456789abcdef0) == [0x123456789abcdef0 0] + +function %i64x2_make2(i64, i64) -> i64x2 { +block0(v0: i64, v1: i64): + v2 = splat.i64x2 v0 + v3 = 
insertlane.i64x2 v2, v1, 1 + return v3 +} + +; run: %i64x2_make2(0, 0) == [0 0] +; run: %i64x2_make2(0x123456789abcdef0, 0) == [0x123456789abcdef0 0] +; run: %i64x2_make2(0, 0x123456789abcdef0) == [0 0x123456789abcdef0] +; run: %i64x2_make2(0x123456789abcdef0, 0x0fedcba987654321) == [0x123456789abcdef0 0x0fedcba987654321]