From 12d3d798e4364606833a1e128984d4b06af667dc Mon Sep 17 00:00:00 2001 From: Yeting Kuo Date: Wed, 31 Jul 2024 02:20:48 -0700 Subject: [PATCH] [RISCV] Use experimental.vp.splat to splat specific vector length elements. Previously, it was hard in LLVM IR to create a scalable vector splat with a specific vector length, so we used riscv.vmv.v.x and riscv.vmv.v.f to do this work. But the two RVV intrinsics need strict type constraints, which do not support fixed vector types and illegal vector types. Using vp.splat preserves the old functionality and also generates more optimized code for fixed vector types and illegal vectors. This patch also fixes a crash caused by getEVT not supporting ptr types. --- llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 19 +---- .../RISCV/rvv/fixed-vectors-strided-vpload.ll | 8 +- llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll | 80 +++++++++++++++++-- 3 files changed, 79 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp index 0a66a38f6d5ab..be2e880ecd3a9 100644 --- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp +++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp @@ -187,25 +187,10 @@ bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) { auto *VTy = cast(II.getType()); IRBuilder<> Builder(&II); - - // Extend VL from i32 to XLen if needed. - if (ST->is64Bit()) - VL = Builder.CreateZExt(VL, Builder.getInt64Ty()); Type *STy = VTy->getElementType(); Value *Val = Builder.CreateLoad(STy, BasePtr); - const auto &TLI = *ST->getTargetLowering(); - Value *Res; - - // TODO: Also support fixed/illegal vector types to splat with evl = vl. - if (isa(VTy) && TLI.isTypeLegal(EVT::getEVT(VTy))) { - unsigned VMVOp = STy->isFloatingPointTy() ? 
Intrinsic::riscv_vfmv_v_f - : Intrinsic::riscv_vmv_v_x; - Res = Builder.CreateIntrinsic(VMVOp, {VTy, VL->getType()}, - {PoisonValue::get(VTy), Val, VL}); - } else { - Res = Builder.CreateVectorSplat(VTy->getElementCount(), Val); - } + Value *Res = Builder.CreateIntrinsic(Intrinsic::experimental_vp_splat, {VTy}, + {Val, II.getOperand(2), VL}); II.replaceAllUsesWith(Res); II.eraseFromParent(); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll index b8c7037580c46..849f98c26f459 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll @@ -638,14 +638,14 @@ declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64, define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) { ; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8: ; CHECK-OPT: # %bb.0: -; CHECK-OPT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-OPT-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-OPT-NEXT: vlse8.v v8, (a0), zero ; CHECK-OPT-NEXT: ret ; ; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8: ; CHECK-NO-OPT: # %bb.0: ; CHECK-NO-OPT-NEXT: lbu a0, 0(a0) -; CHECK-NO-OPT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NO-OPT-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NO-OPT-NEXT: vmv.v.x v8, a0 ; CHECK-NO-OPT-NEXT: ret %load = call <4 x i8> @llvm.experimental.vp.strided.load.4i8.p0.i8(ptr %ptr, i8 0, <4 x i1> splat (i1 true), i32 3) @@ -657,14 +657,14 @@ define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) { define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) { ; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4f16: ; CHECK-OPT: # %bb.0: -; CHECK-OPT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-OPT-NEXT: vsetivli zero, 3, e16, mf2, ta, ma ; CHECK-OPT-NEXT: vlse16.v v8, (a0), zero ; CHECK-OPT-NEXT: ret ; ; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4f16: ; 
CHECK-NO-OPT: # %bb.0: ; CHECK-NO-OPT-NEXT: flh fa5, 0(a0) -; CHECK-NO-OPT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NO-OPT-NEXT: vsetivli zero, 3, e16, mf2, ta, ma ; CHECK-NO-OPT-NEXT: vfmv.v.f v8, fa5 ; CHECK-NO-OPT-NEXT: ret %load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 3) diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll index 0010f64a93fd6..14976f21b7dbb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \ ; RUN: -verify-machineinstrs < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT,CHECK-OPT-RV32 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \ ; RUN: -verify-machineinstrs < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT,CHECK-OPT-RV64 ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \ ; RUN: -verify-machineinstrs < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT,CHECK-NO-OPT-RV32 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \ ; RUN: -verify-machineinstrs < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT,CHECK-NO-OPT-RV64 declare @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, , i32) @@ -823,15 +823,15 @@ define @zero_strided_unmasked_vpload_nxv1f16(ptr %ptr) { ret %load } -define @zero_strided_vadd.vx( %v, ptr %ptr) { -; CHECK-RV32-LABEL: 
zero_strided_vadd.vx: +define @zero_strided_vadd_nxv1i64( %v, ptr %ptr) { +; CHECK-RV32-LABEL: zero_strided_vadd_nxv1i64: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-RV32-NEXT: vlse64.v v9, (a0), zero ; CHECK-RV32-NEXT: vadd.vv v8, v8, v9 ; CHECK-RV32-NEXT: ret ; -; CHECK-RV64-LABEL: zero_strided_vadd.vx: +; CHECK-RV64-LABEL: zero_strided_vadd_nxv1i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: ld a0, 0(a0) ; CHECK-RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma @@ -842,3 +842,69 @@ define @zero_strided_vadd.vx( %v, ptr %ptr) %w = add %v, %load ret %w } + +define @zero_strided_vadd_nxv16i64( %v, ptr %ptr) { +; CHECK-RV32-LABEL: zero_strided_vadd_nxv16i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: csrr a1, vlenb +; CHECK-RV32-NEXT: srli a2, a1, 3 +; CHECK-RV32-NEXT: sub a3, a2, a1 +; CHECK-RV32-NEXT: sltu a4, a2, a3 +; CHECK-RV32-NEXT: addi a4, a4, -1 +; CHECK-RV32-NEXT: and a3, a4, a3 +; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK-RV32-NEXT: vlse64.v v24, (a0), zero +; CHECK-RV32-NEXT: bltu a2, a1, .LBB55_2 +; CHECK-RV32-NEXT: # %bb.1: +; CHECK-RV32-NEXT: mv a2, a1 +; CHECK-RV32-NEXT: .LBB55_2: +; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-RV32-NEXT: vlse64.v v0, (a0), zero +; CHECK-RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-RV32-NEXT: vadd.vv v16, v16, v24 +; CHECK-RV32-NEXT: vadd.vv v8, v8, v0 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: zero_strided_vadd_nxv16i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ld a0, 0(a0) +; CHECK-RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; CHECK-RV64-NEXT: vadd.vx v8, v8, a0 +; CHECK-RV64-NEXT: vadd.vx v16, v16, a0 +; CHECK-RV64-NEXT: ret + %vscale = call i32 @llvm.vscale() + %load = call @llvm.experimental.vp.strided.load.nxv16i64.p0.i32(ptr %ptr, i32 0, splat (i1 true), i32 %vscale) + %w = add %v, %load + ret %w +} + +define @zero_strided_vadd_nxv1p0( %v, ptr %ptr) { +; CHECK-OPT-RV32-LABEL: zero_strided_vadd_nxv1p0: +; CHECK-OPT-RV32: # 
%bb.0: +; CHECK-OPT-RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; CHECK-OPT-RV32-NEXT: vlse32.v v8, (a0), zero +; CHECK-OPT-RV32-NEXT: ret +; +; CHECK-OPT-RV64-LABEL: zero_strided_vadd_nxv1p0: +; CHECK-OPT-RV64: # %bb.0: +; CHECK-OPT-RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-OPT-RV64-NEXT: vlse64.v v8, (a0), zero +; CHECK-OPT-RV64-NEXT: ret +; +; CHECK-NO-OPT-RV32-LABEL: zero_strided_vadd_nxv1p0: +; CHECK-NO-OPT-RV32: # %bb.0: +; CHECK-NO-OPT-RV32-NEXT: lw a0, 0(a0) +; CHECK-NO-OPT-RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; CHECK-NO-OPT-RV32-NEXT: vmv.v.x v8, a0 +; CHECK-NO-OPT-RV32-NEXT: ret +; +; CHECK-NO-OPT-RV64-LABEL: zero_strided_vadd_nxv1p0: +; CHECK-NO-OPT-RV64: # %bb.0: +; CHECK-NO-OPT-RV64-NEXT: ld a0, 0(a0) +; CHECK-NO-OPT-RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NO-OPT-RV64-NEXT: vmv.v.x v8, a0 +; CHECK-NO-OPT-RV64-NEXT: ret + %vscale = call i32 @llvm.vscale() + %load = call @llvm.experimental.vp.strided.load.nxv1p0.p0.i32(ptr %ptr, i32 0, splat (i1 true), i32 %vscale) + ret %load +}