From 0683bf422398a9b2efcc1ec1e7d0f351cf3ed3aa Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Tue, 18 Jul 2023 15:07:13 -0700 Subject: [PATCH 1/2] Cranelift: Implement tail calls on riscv64 Co-Authored-By: Jamey Sharp --- cranelift/codegen/src/isa/riscv64/abi.rs | 39 + cranelift/codegen/src/isa/riscv64/inst.isle | 6 + .../codegen/src/isa/riscv64/inst/emit.rs | 172 ++++- cranelift/codegen/src/isa/riscv64/inst/mod.rs | 32 + cranelift/codegen/src/isa/riscv64/lower.isle | 9 + .../codegen/src/isa/riscv64/lower/isle.rs | 45 +- .../filetests/isa/riscv64/return-call.clif | 696 ++++++++++++++++++ .../runtests/return-call-indirect.clif | 2 +- .../filetests/runtests/return-call-loop.clif | 7 +- .../filetests/runtests/return-call.clif | 2 +- cranelift/fuzzgen/src/function_generator.rs | 2 +- 11 files changed, 998 insertions(+), 14 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/return-call.clif diff --git a/cranelift/codegen/src/isa/riscv64/abi.rs b/cranelift/codegen/src/isa/riscv64/abi.rs index 4fbaa4be0c48..e8aa95824009 100644 --- a/cranelift/codegen/src/isa/riscv64/abi.rs +++ b/cranelift/codegen/src/isa/riscv64/abi.rs @@ -686,6 +686,45 @@ impl ABIMachineSpec for Riscv64MachineDeps { } } +impl Riscv64ABICallSite { + pub fn emit_return_call(mut self, ctx: &mut Lower<Inst>, args: isle::ValueSlice) { + let (new_stack_arg_size, old_stack_arg_size) = + self.emit_temporary_tail_call_frame(ctx, args); + + let dest = self.dest().clone(); + let opcode = self.opcode(); + let uses = self.take_uses(); + let info = Box::new(ReturnCallInfo { + uses, + opcode, + old_stack_arg_size, + new_stack_arg_size, + }); + + match dest { + // TODO: Our riscv64 backend doesn't have relocs for direct calls, + // the callee is always put in a register and then the register is + // relocated, so we don't currently differentiate between + // `RelocDistance::Near` and `RelocDistance::Far`. We just always + // use indirect calls.
We should eventually add a non-indirect + // `return_call` instruction and path. + CallDest::ExtName(name, _) => { + let callee = ctx.alloc_tmp(ir::types::I64).only_reg().unwrap(); + ctx.emit(Inst::LoadExtName { + rd: callee, + name: Box::new(name), + offset: 0, + }); + ctx.emit(Inst::ReturnCallInd { + callee: callee.to_reg(), + info, + }); + } + CallDest::Reg(callee) => ctx.emit(Inst::ReturnCallInd { callee, info }), + } + } +} + const CALLEE_SAVE_X_REG: [bool; 32] = [ false, false, true, false, false, false, false, false, // 0-7 true, true, false, false, false, false, false, false, // 8-15 diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index 498086e7ea9f..233bc9a82734 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -98,6 +98,11 @@ (CallInd (info BoxCallIndInfo)) + ;; An indirect return-call macro instruction. + (ReturnCallInd + (callee Reg) + (info BoxReturnCallInfo)) + (TrapIf (test Reg) (trap_code TrapCode)) @@ -720,6 +725,7 @@ (type VecBranchTarget (primitive VecBranchTarget)) (type BoxCallInfo (primitive BoxCallInfo)) (type BoxCallIndInfo (primitive BoxCallIndInfo)) +(type BoxReturnCallInfo (primitive BoxReturnCallInfo)) (type IntegerCompare (primitive IntegerCompare)) (type AMode (primitive AMode)) (type OptionReg (primitive OptionReg)) diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs index feadde261b59..47bef3d1dd45 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/emit.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/emit.rs @@ -1,10 +1,8 @@ //! Riscv64 ISA: binary code emission. 
use crate::binemit::StackMap; -use crate::ir::RelSourceLoc; -use crate::ir::TrapCode; +use crate::ir::{self, RelSourceLoc, TrapCode}; use crate::isa::riscv64::inst::*; -use crate::isa::riscv64::inst::{zero_reg, AluOPRRR}; use crate::machinst::{AllocationConsumer, Reg, Writable}; use crate::trace; use cranelift_control::ControlPlane; @@ -426,6 +424,7 @@ impl Inst { | Inst::AdjustSp { .. } | Inst::Call { .. } | Inst::CallInd { .. } + | Inst::ReturnCallInd { .. } | Inst::TrapIf { .. } | Inst::Jal { .. } | Inst::CondBr { .. } @@ -885,6 +884,27 @@ impl MachInstEmit for Inst { ); } + &Inst::ReturnCallInd { callee, ref info } => { + let callee = allocs.next(callee); + + emit_return_call_common_sequence( + &mut allocs, + sink, + emit_info, + state, + info.new_stack_arg_size, + info.old_stack_arg_size, + &info.uses, + ); + + Inst::Jalr { + rd: writable_zero_reg(), + base: callee, + offset: Imm12::zero(), + } + .emit(&[], sink, emit_info, state); + } + &Inst::Jal { dest } => { let code: u32 = 0b1101111; match dest { @@ -3056,3 +3076,149 @@ fn alloc_value_regs(orgin: &ValueRegs<Reg>, alloc: &mut AllocationConsumer) -> V _ => unreachable!(), } } + +fn emit_return_call_common_sequence( + allocs: &mut AllocationConsumer<'_>, + sink: &mut MachBuffer<Inst>, + emit_info: &EmitInfo, + state: &mut EmitState, + new_stack_arg_size: u32, + old_stack_arg_size: u32, + uses: &CallArgList, +) { + for u in uses { + let _ = allocs.next(u.vreg); + } + + // We are emitting a dynamic number of instructions and might need an + // island. We emit four instructions regardless of how many stack arguments + // we have, and then two instructions per word of stack argument space.
+ let new_stack_words = new_stack_arg_size / 8; + let insts = 4 + 2 * new_stack_words; + let space_needed = insts * u32::try_from(Inst::INSTRUCTION_SIZE).unwrap(); + if sink.island_needed(space_needed) { + let jump_around_label = sink.get_label(); + Inst::Jal { + dest: BranchTarget::Label(jump_around_label), + } + .emit(&[], sink, emit_info, state); + sink.emit_island(space_needed + 4, &mut state.ctrl_plane); + sink.bind_label(jump_around_label, &mut state.ctrl_plane); + } + + // Copy the new frame on top of our current frame. + // + // The current stack layout is the following: + // + // | ... | + // +---------------------+ + // | ... | + // | stack arguments | + // | ... | + // current | return address | + // frame | old FP | <-- FP + // | ... | + // | old stack slots | + // | ... | + // +---------------------+ + // | ... | + // new | new stack arguments | + // frame | ... | <-- SP + // +---------------------+ + // + // We need to restore the old FP, restore the return address from the stack + // to the link register, copy the new stack arguments over the old stack + // arguments, adjust SP to point to the new stack arguments, and then jump + // to the callee (which will push the old FP and RA again). Note that the + // actual jump happens outside this helper function. + + assert_eq!( + new_stack_arg_size % 8, + 0, + "size of new stack arguments must be 8-byte aligned" + ); + + // The delta from our frame pointer to the (eventual) stack pointer value + // when we jump to the tail callee. This is the difference in size of stack + // arguments as well as accounting for the two words we pushed onto the + // stack upon entry to this function (the return address and old frame + // pointer). 
+ let fp_to_callee_sp = i64::from(old_stack_arg_size) - i64::from(new_stack_arg_size) + 16; + + let tmp1 = regs::writable_spilltmp_reg(); + let tmp2 = regs::writable_spilltmp_reg2(); + + // Restore the return address to the link register, and load the old FP into + // a temporary register. + // + // We can't put the old FP into the FP register until after we copy the + // stack arguments into place, since that uses address modes that are + // relative to our current FP. + // + // Note that the FP is saved in the function prologue for all non-leaf + // functions, even when `preserve_frame_pointers=false`. Note also that + // `return_call` instructions make it so that a function is considered + // non-leaf. Therefore we always have an FP to restore here. + + Inst::gen_load( + writable_link_reg(), + AMode::FPOffset(8, I64), + I64, + MemFlags::trusted(), + ) + .emit(&[], sink, emit_info, state); + Inst::gen_load(tmp1, AMode::FPOffset(0, I64), I64, MemFlags::trusted()).emit( + &[], + sink, + emit_info, + state, + ); + + // Copy the new stack arguments over the old stack arguments. + for i in (0..new_stack_arg_size / 8).rev() { + // Load the `i`th new stack argument word from the temporary stack + // space. + Inst::gen_load( + tmp2, + AMode::SPOffset(i64::from(i * 8), types::I64), + types::I64, + ir::MemFlags::trusted(), + ) + .emit(&[], sink, emit_info, state); + + // Store it to its final destination on the stack, overwriting our + // current frame. + Inst::gen_store( + AMode::FPOffset(fp_to_callee_sp + i64::from(i * 8), types::I64), + tmp2.to_reg(), + types::I64, + ir::MemFlags::trusted(), + ) + .emit(&[], sink, emit_info, state); + } + + // Initialize the SP for the tail callee, deallocating the temporary stack + // argument space and our current frame at the same time. 
+ Inst::AluRRImm12 { + alu_op: AluOPRRI::Addi, + rd: regs::writable_stack_reg(), + rs: regs::fp_reg(), + imm12: Imm12::maybe_from_u64(fp_to_callee_sp as u64).unwrap(), + } + .emit(&[], sink, emit_info, state); + + // Move the old FP value from the temporary into the FP register. + Inst::Mov { + ty: types::I64, + rd: regs::writable_fp_reg(), + rm: tmp1.to_reg(), + } + .emit(&[], sink, emit_info, state); + + state.virtual_sp_offset -= i64::from(new_stack_arg_size); + trace!( + "return_call[_ind] adjusts virtual sp offset by {} -> {}", + new_stack_arg_size, + state.virtual_sp_offset + ); +} diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 091d2128db17..9b96832299cd 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -62,6 +62,7 @@ use crate::isa::riscv64::lower::isle::generated_code::{MInst, VecAluOpRRImm5, Ve type BoxCallInfo = Box<CallInfo>; type BoxCallIndInfo = Box<CallIndInfo>; +type BoxReturnCallInfo = Box<ReturnCallInfo>; /// Additional information for (direct) Call instructions, left out of line to lower the size of /// the Inst enum. @@ -91,6 +92,16 @@ pub struct CallIndInfo { pub callee_pop_size: u32, } +/// Additional information for `return_call[_ind]` instructions, left out of +/// line to lower the size of the `Inst` enum. +#[derive(Clone, Debug)] +pub struct ReturnCallInfo { + pub uses: CallArgList, + pub opcode: Opcode, + pub old_stack_arg_size: u32, + pub new_stack_arg_size: u32, +} + /// A branch target. Either unresolved (basic-block index) or resolved (offset /// from end of current instruction). #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -448,6 +459,12 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan } collector.reg_clobbers(info.clobbers); } + &Inst::ReturnCallInd { ref info, callee } => { + collector.reg_use(callee); + for u in &info.uses { + collector.reg_fixed_use(u.vreg, u.preg); + } + } &Inst::TrapIf { test, ..
} => { collector.reg_use(test); } @@ -863,6 +880,7 @@ impl MachInst for Inst { &Inst::Jalr { .. } => MachTerminator::Uncond, &Inst::Ret { .. } => MachTerminator::Ret, &Inst::BrTable { .. } => MachTerminator::Indirect, + &Inst::ReturnCallInd { .. } => MachTerminator::RetCall, _ => MachTerminator::None, } } @@ -1049,6 +1067,7 @@ impl Inst { } } + let mut empty_allocs = AllocationConsumer::default(); match self { &Inst::Nop0 => { format!("##zero length nop") @@ -1583,6 +1602,19 @@ impl Inst { let rd = format_reg(info.rn, allocs); format!("callind {}", rd) } + &MInst::ReturnCallInd { callee, ref info } => { + let callee = format_reg(callee, allocs); + let mut s = format!( + "return_call_ind {callee} old_stack_arg_size:{} new_stack_arg_size:{}", + info.old_stack_arg_size, info.new_stack_arg_size + ); + for ret in &info.uses { + let preg = format_reg(ret.preg, &mut empty_allocs); + let vreg = format_reg(ret.vreg, allocs); + write!(&mut s, " {vreg}={preg}").unwrap(); + } + s + } &MInst::TrapIf { test, trap_code } => { format!("trap_if {},{}", format_reg(test, allocs), trap_code,) } diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 74adb15d8ec0..926d8609ab5e 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -1658,6 +1658,15 @@ (rule (lower (call_indirect sig_ref val inputs)) (gen_call_indirect sig_ref val inputs)) +;;;; Rules for `return_call` and `return_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (return_call (func_ref_data sig_ref extname dist) args)) + (gen_return_call sig_ref extname dist args)) + +(rule (lower (return_call_indirect sig_ref callee args)) + (gen_return_call_indirect sig_ref callee args)) + + ;;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (extractlane x @ (value_type ty) (u8_from_uimm8 idx))) diff --git a/cranelift/codegen/src/isa/riscv64/lower/isle.rs 
b/cranelift/codegen/src/isa/riscv64/lower/isle.rs index c030f532bdc0..e29168398dd5 100644 --- a/cranelift/codegen/src/isa/riscv64/lower/isle.rs +++ b/cranelift/codegen/src/isa/riscv64/lower/isle.rs @@ -24,7 +24,7 @@ use crate::{ isa::riscv64::inst::*, machinst::{ArgPair, InstOutput, Lower}, }; -use crate::{isle_common_prelude_methods, isle_lower_prelude_methods}; +use crate::{isa, isle_common_prelude_methods, isle_lower_prelude_methods}; use regalloc2::PReg; use std::boxed::Box; use std::convert::TryFrom; @@ -32,6 +32,7 @@ use std::vec::Vec; type BoxCallInfo = Box<CallInfo>; type BoxCallIndInfo = Box<CallIndInfo>; +type BoxReturnCallInfo = Box<ReturnCallInfo>; type BoxExternalName = Box<ExternalName>; type VecMachLabel = Vec<MachLabel>; type VecArgPair = Vec<ArgPair>; @@ -79,8 +80,24 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend> distance: RelocDistance, args: ValueSlice, ) -> InstOutput { - let _ = (callee_sig, callee, distance, args); - todo!() + let caller_conv = isa::CallConv::Tail; + debug_assert_eq!( + self.lower_ctx.abi().call_conv(self.lower_ctx.sigs()), + caller_conv, + "Can only do `return_call`s from within a `tail` calling convention function" + ); + + let call_site = Riscv64ABICallSite::from_func( + self.lower_ctx.sigs(), + callee_sig, + &callee, + distance, + caller_conv, + self.backend.flags().clone(), + ); + call_site.emit_return_call(self.lower_ctx, args); + + InstOutput::new() } fn gen_return_call_indirect( @@ -89,8 +106,26 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend> callee: Value, args: ValueSlice, ) -> InstOutput { - let _ = (callee_sig, callee, args); - todo!() + let caller_conv = isa::CallConv::Tail; + debug_assert_eq!( + self.lower_ctx.abi().call_conv(self.lower_ctx.sigs()), + caller_conv, + "Can only do `return_call`s from within a `tail` calling convention function" + ); + + let callee = self.put_in_reg(callee); + + let call_site = Riscv64ABICallSite::from_ptr( + self.lower_ctx.sigs(), + callee_sig, + callee, + Opcode::ReturnCallIndirect,
+ caller_conv, + self.backend.flags().clone(), + ); + call_site.emit_return_call(self.lower_ctx, args); + + InstOutput::new() } fn vreg_new(&mut self, r: Reg) -> VReg { diff --git a/cranelift/filetests/filetests/isa/riscv64/return-call.clif b/cranelift/filetests/filetests/isa/riscv64/return-call.clif new file mode 100644 index 000000000000..df5f8fb3481d --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/return-call.clif @@ -0,0 +1,696 @@ +test compile precise-output + +target riscv64 + +;;;; Test passing `i64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %callee_i64(i64) -> i64 tail { +block0(v0: i64): + v1 = iadd_imm.i64 v0, 10 + return v1 +} + +; VCode: +; block0: +; addi s1,s1,10 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi s1, s1, 0xa +; ret + +function %call_i64(i64) -> i64 tail { + fn0 = %callee_i64(i64) -> i64 tail + +block0(v0: i64): + return_call fn0(v0) +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; load_sym t2,%callee_i64+0 +; return_call_ind t2 old_stack_arg_size:0 new_stack_arg_size:0 s1=s1 +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; auipc t2, 0 +; ld t2, 0xc(t2) +; j 0xc +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_i64 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ld ra, 8(s0) +; ld t6, 0(s0) +; addi sp, s0, 0x10 +; ori s0, t6, 0 +; jr t2 + +;;;; Test colocated tail calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %colocated_i64(i64) -> i64 tail { + fn0 = colocated %callee_i64(i64) -> i64 tail + +block0(v0: i64): + return_call fn0(v0) +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; load_sym t2,%callee_i64+0 +; return_call_ind t2 old_stack_arg_size:0 new_stack_arg_size:0 s1=s1 +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; 
offset 0x10 +; auipc t2, 0 +; ld t2, 0xc(t2) +; j 0xc +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_i64 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ld ra, 8(s0) +; ld t6, 0(s0) +; addi sp, s0, 0x10 +; ori s0, t6, 0 +; jr t2 + +;;;; Test passing `f64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %callee_f64(f64) -> f64 tail { +block0(v0: f64): + v1 = f64const 0x10.0 + v2 = fadd.f64 v0, v1 + return v2 +} + +; VCode: +; block0: +; auipc a1,0; ld a1,12(a1); j 12; .8byte 0x4030000000000000 +; fmv.d.x ft4,a1 +; fadd.d ft0,ft0,ft4 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; auipc a1, 0 +; ld a1, 0xc(a1) +; j 0xc +; .byte 0x00, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x30, 0x40 +; fmv.d.x ft4, a1 +; fadd.d ft0, ft0, ft4 +; ret + +function %call_f64(f64) -> f64 tail { + fn0 = %callee_f64(f64) -> f64 tail + +block0(v0: f64): + return_call fn0(v0) +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; load_sym t2,%callee_f64+0 +; return_call_ind t2 old_stack_arg_size:0 new_stack_arg_size:0 ft0=ft0 +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; auipc t2, 0 +; ld t2, 0xc(t2) +; j 0xc +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_f64 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ld ra, 8(s0) +; ld t6, 0(s0) +; addi sp, s0, 0x10 +; ori s0, t6, 0 +; jr t2 + +;;;; Test passing `i8`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %callee_i8(i8) -> i8 tail { +block0(v0: i8): + v1 = iconst.i8 0 + v2 = icmp eq v0, v1 + return v2 +} + +; VCode: +; block0: +; li a2,0 +; andi a0,s1,255 +; andi a2,a2,255 +; eq s1,a0,a2##ty=i8 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; mv a2, zero +; andi a0, s1, 0xff +; andi a2, a2, 0xff +; bne a0, a2, 0xc +; addi s1, zero, 1 +; j 8 +; mv s1, zero +; ret + +function %call_i8(i8) -> i8 tail { + fn0 = %callee_i8(i8) -> i8 tail + +block0(v0: i8): + 
return_call fn0(v0) +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; load_sym t2,%callee_i8+0 +; return_call_ind t2 old_stack_arg_size:0 new_stack_arg_size:0 s1=s1 +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; auipc t2, 0 +; ld t2, 0xc(t2) +; j 0xc +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_i8 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ld ra, 8(s0) +; ld t6, 0(s0) +; addi sp, s0, 0x10 +; ori s0, t6, 0 +; jr t2 + +;;;; Test passing many arguments on stack ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64, v21: i64, v22: i64, v23: i64, v24: i64, v25: i64): + return v25 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; ld a6,16(fp) +; ld t3,24(fp) +; ld t0,32(fp) +; ld t2,40(fp) +; ld s1,48(fp) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; add sp, sp, #48 ; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; ld a6, 0x10(s0) +; ld t3, 0x18(s0) +; ld t0, 0x20(s0) +; ld t2, 0x28(s0) +; ld s1, 0x30(s0) +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; addi sp, sp, 0x30 +; ret + +function %tail_caller_stack_args() -> i64 tail { + fn0 = %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail + +block0: + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = 
iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + return_call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; add sp,-16 +; block0: +; li s1,10 +; sd s1,8(nominal_sp) +; li a0,15 +; sd a0,0(nominal_sp) +; li a1,20 +; li a2,25 +; li a3,30 +; li a4,35 +; li a5,40 +; li a6,45 +; li a7,50 +; li s2,55 +; li s3,60 +; li s4,65 +; li s5,70 +; li s6,75 +; li s7,80 +; li s8,85 +; li s9,90 +; li s10,95 +; li s11,100 +; li t3,105 +; li t4,110 +; li t0,115 +; li t1,120 +; li t2,125 +; li s1,130 +; li a0,135 +; add sp,-48 +; virtual_sp_offset_adj +48 +; sd t0,0(sp) +; sd t1,8(sp) +; sd t2,16(sp) +; sd s1,24(sp) +; sd a0,32(sp) +; load_sym t0,%tail_callee_stack_args+0 +; ld a0,0(nominal_sp) +; ld s1,8(nominal_sp) +; return_call_ind t0 old_stack_arg_size:0 new_stack_arg_size:48 s1=s1 a0=a0 a1=a1 a2=a2 a3=a3 a4=a4 a5=a5 a6=a6 a7=a7 s2=s2 s3=s3 s4=s4 s5=s5 s6=s6 s7=s7 s8=s8 s9=s9 s10=s10 s11=s11 t3=t3 t4=t4 +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; addi sp, sp, -0x10 +; block1: ; offset 0x14 +; addi s1, zero, 0xa +; sd s1, 8(sp) +; addi a0, zero, 0xf +; sd a0, 0(sp) +; addi a1, zero, 0x14 +; addi a2, zero, 0x19 +; addi a3, zero, 0x1e +; addi a4, zero, 0x23 +; addi a5, zero, 0x28 +; addi a6, zero, 0x2d +; addi a7, zero, 0x32 +; addi s2, zero, 0x37 +; addi s3, zero, 0x3c +; addi s4, zero, 0x41 +; addi s5, zero, 0x46 +; addi s6, zero, 0x4b +; addi s7, zero, 0x50 
+; addi s8, zero, 0x55 +; addi s9, zero, 0x5a +; addi s10, zero, 0x5f +; addi s11, zero, 0x64 +; addi t3, zero, 0x69 +; addi t4, zero, 0x6e +; addi t0, zero, 0x73 +; addi t1, zero, 0x78 +; addi t2, zero, 0x7d +; addi s1, zero, 0x82 +; addi a0, zero, 0x87 +; addi sp, sp, -0x30 +; sd t0, 0(sp) +; sd t1, 8(sp) +; sd t2, 0x10(sp) +; sd s1, 0x18(sp) +; sd a0, 0x20(sp) +; auipc t0, 0 +; ld t0, 0xc(t0) +; j 0xc +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %tail_callee_stack_args 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ld a0, 0x30(sp) +; ld s1, 0x38(sp) +; ld ra, 8(s0) +; ld t6, 0(s0) +; ld t5, 0x28(sp) +; sd t5, 8(s0) +; ld t5, 0x20(sp) +; sd t5, 0(s0) +; ld t5, 0x18(sp) +; sd t5, -8(s0) +; ld t5, 0x10(sp) +; sd t5, -0x10(s0) +; ld t5, 8(sp) +; sd t5, -0x18(s0) +; ld t5, 0(sp) +; sd t5, -0x20(s0) +; addi sp, s0, -0x20 +; ori s0, t6, 0 +; jr t0 + +;;;; Test diff blocks with diff return calls with diff # of stack args ;;;;;;;;; + +function %different_callee1(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64, v21: i64, v22: i64, v23: i64, v24: i64, v25: i64): + return v25 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; ld a6,16(fp) +; ld t3,24(fp) +; ld t0,32(fp) +; ld t2,40(fp) +; ld s1,48(fp) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; add sp, sp, #48 ; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; ld a6, 0x10(s0) +; ld t3, 0x18(s0) +; ld t0, 0x20(s0) +; ld t2, 0x28(s0) +; ld s1, 0x30(s0) +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; addi sp, sp, 0x30 +; ret + +function %different_callee2(i64, i64, i64, i64, i64, i64, i64, 
i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64, v21: i64, v22: i64, v23: i64, v24: i64, v25: i64, v26: i64): + return v26 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; ld a6,16(fp) +; ld t3,24(fp) +; ld t0,32(fp) +; ld t2,40(fp) +; ld a1,48(fp) +; ld s1,56(fp) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; add sp, sp, #48 ; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; ld a6, 0x10(s0) +; ld t3, 0x18(s0) +; ld t0, 0x20(s0) +; ld t2, 0x28(s0) +; ld a1, 0x30(s0) +; ld s1, 0x38(s0) +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; addi sp, sp, 0x30 +; ret + +function %caller_of_different_callees(i64) -> i64 tail { + fn0 = %different_callee1(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail + fn1 = %different_callee2(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail + +block0(v99: i64): + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + brif v99, block1, block2 
+ +block1: + return_call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) + +block2: + v26 = iconst.i64 140 + return_call fn1(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; add sp,-32 +; block0: +; li a1,10 +; sd a1,16(nominal_sp) +; li a0,15 +; sd a0,8(nominal_sp) +; li a1,20 +; sd a1,0(nominal_sp) +; li a2,25 +; li a3,30 +; li a4,35 +; li a5,40 +; li a6,45 +; li a7,50 +; li s2,55 +; li s3,60 +; li s4,65 +; li s5,70 +; li s6,75 +; li s7,80 +; li s8,85 +; li s9,90 +; li s10,95 +; li s11,100 +; li t3,105 +; li t4,110 +; li a1,115 +; li a0,120 +; li t2,125 +; li t1,130 +; li t0,135 +; bne s1,zero,taken(label2),not_taken(label1) +; block1: +; li s1,140 +; add sp,-48 +; virtual_sp_offset_adj +48 +; sd a1,0(sp) +; sd a0,8(sp) +; sd t2,16(sp) +; sd t1,24(sp) +; sd t0,32(sp) +; sd s1,40(sp) +; load_sym t0,%different_callee2+0 +; ld a1,0(nominal_sp) +; ld a0,8(nominal_sp) +; ld s1,16(nominal_sp) +; return_call_ind t0 old_stack_arg_size:0 new_stack_arg_size:48 s1=s1 a0=a0 a1=a1 a2=a2 a3=a3 a4=a4 a5=a5 a6=a6 a7=a7 s2=s2 s3=s3 s4=s4 s5=s5 s6=s6 s7=s7 s8=s8 s9=s9 s10=s10 s11=s11 t3=t3 t4=t4 +; block2: +; ld s1,16(nominal_sp) +; add sp,-48 +; virtual_sp_offset_adj +48 +; sd a1,0(sp) +; sd a0,8(sp) +; sd t2,16(sp) +; sd t1,24(sp) +; sd t0,32(sp) +; load_sym t0,%different_callee1+0 +; ld a1,0(nominal_sp) +; ld a0,8(nominal_sp) +; return_call_ind t0 old_stack_arg_size:0 new_stack_arg_size:48 s1=s1 a0=a0 a1=a1 a2=a2 a3=a3 a4=a4 a5=a5 a6=a6 a7=a7 s2=s2 s3=s3 s4=s4 s5=s5 s6=s6 s7=s7 s8=s8 s9=s9 s10=s10 s11=s11 t3=t3 t4=t4 +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; addi sp, sp, -0x20 +; block1: ; offset 0x14 +; addi a1, zero, 0xa +; sd a1, 0x10(sp) +; addi a0, zero, 0xf +; sd a0, 8(sp) +; 
addi a1, zero, 0x14 +; sd a1, 0(sp) +; addi a2, zero, 0x19 +; addi a3, zero, 0x1e +; addi a4, zero, 0x23 +; addi a5, zero, 0x28 +; addi a6, zero, 0x2d +; addi a7, zero, 0x32 +; addi s2, zero, 0x37 +; addi s3, zero, 0x3c +; addi s4, zero, 0x41 +; addi s5, zero, 0x46 +; addi s6, zero, 0x4b +; addi s7, zero, 0x50 +; addi s8, zero, 0x55 +; addi s9, zero, 0x5a +; addi s10, zero, 0x5f +; addi s11, zero, 0x64 +; addi t3, zero, 0x69 +; addi t4, zero, 0x6e +; addi a1, zero, 0x73 +; addi a0, zero, 0x78 +; addi t2, zero, 0x7d +; addi t1, zero, 0x82 +; addi t0, zero, 0x87 +; bnez s1, 0x88 +; block2: ; offset 0x8c +; addi s1, zero, 0x8c +; addi sp, sp, -0x30 +; sd a1, 0(sp) +; sd a0, 8(sp) +; sd t2, 0x10(sp) +; sd t1, 0x18(sp) +; sd t0, 0x20(sp) +; sd s1, 0x28(sp) +; auipc t0, 0 +; ld t0, 0xc(t0) +; j 0xc +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %different_callee2 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ld a1, 0x30(sp) +; ld a0, 0x38(sp) +; ld s1, 0x40(sp) +; ld ra, 8(s0) +; ld t6, 0(s0) +; ld t5, 0x28(sp) +; sd t5, 8(s0) +; ld t5, 0x20(sp) +; sd t5, 0(s0) +; ld t5, 0x18(sp) +; sd t5, -8(s0) +; ld t5, 0x10(sp) +; sd t5, -0x10(s0) +; ld t5, 8(sp) +; sd t5, -0x18(s0) +; ld t5, 0(sp) +; sd t5, -0x20(s0) +; addi sp, s0, -0x20 +; ori s0, t6, 0 +; jr t0 +; block3: ; offset 0x110 +; ld s1, 0x10(sp) +; addi sp, sp, -0x30 +; sd a1, 0(sp) +; sd a0, 8(sp) +; sd t2, 0x10(sp) +; sd t1, 0x18(sp) +; sd t0, 0x20(sp) +; auipc t0, 0 +; ld t0, 0xc(t0) +; j 0xc +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %different_callee1 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ld a1, 0x30(sp) +; ld a0, 0x38(sp) +; ld ra, 8(s0) +; ld t6, 0(s0) +; ld t5, 0x28(sp) +; sd t5, 8(s0) +; ld t5, 0x20(sp) +; sd t5, 0(s0) +; ld t5, 0x18(sp) +; sd t5, -8(s0) +; ld t5, 0x10(sp) +; sd t5, -0x10(s0) +; ld t5, 8(sp) +; sd t5, -0x18(s0) +; ld t5, 0(sp) +; sd t5, -0x20(s0) +; addi sp, s0, -0x20 +; ori s0, t6, 0 +; jr t0 diff --git a/cranelift/filetests/filetests/runtests/return-call-indirect.clif 
b/cranelift/filetests/filetests/runtests/return-call-indirect.clif index a2b1efd1e26b..85e9ae3f1212 100644 --- a/cranelift/filetests/filetests/runtests/return-call-indirect.clif +++ b/cranelift/filetests/filetests/runtests/return-call-indirect.clif @@ -7,7 +7,7 @@ target x86_64 target aarch64 target aarch64 sign_return_address target aarch64 has_pauth sign_return_address -;; target riscv64 +target riscv64 ;; target s390x ;;;; Test passing `i64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/filetests/filetests/runtests/return-call-loop.clif b/cranelift/filetests/filetests/runtests/return-call-loop.clif index 7778b61fa382..d55a7ea7970a 100644 --- a/cranelift/filetests/filetests/runtests/return-call-loop.clif +++ b/cranelift/filetests/filetests/runtests/return-call-loop.clif @@ -1,9 +1,10 @@ test run set preserve_frame_pointers=true target x86_64 -;; target aarch64 -;; target aarch64 sign_return_address -;; target aarch64 has_pauth sign_return_address +target aarch64 +target aarch64 sign_return_address +target aarch64 has_pauth sign_return_address +target riscv64 ;; target s390x ;;;; Tail-Recursive Loop ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/filetests/filetests/runtests/return-call.clif b/cranelift/filetests/filetests/runtests/return-call.clif index 1f4bb77df5c5..f35ada47a9bc 100644 --- a/cranelift/filetests/filetests/runtests/return-call.clif +++ b/cranelift/filetests/filetests/runtests/return-call.clif @@ -7,7 +7,7 @@ target x86_64 target aarch64 target aarch64 sign_return_address target aarch64 has_pauth sign_return_address -;; target riscv64 +target riscv64 ;; target s390x ;;;; Test passing `i64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/fuzzgen/src/function_generator.rs b/cranelift/fuzzgen/src/function_generator.rs index 54962317276a..a726f424089f 100644 --- a/cranelift/fuzzgen/src/function_generator.rs +++ b/cranelift/fuzzgen/src/function_generator.rs 
@@ -1912,9 +1912,9 @@ where let is_tail_caller = self.signature.call_conv == CallConv::Tail; let supports_tail_calls = match self.isa.triple().architecture { + Architecture::Aarch64(_) | Architecture::Riscv64(_) => true, // TODO: x64 currently requires frame pointers for tail calls. Architecture::X86_64 => self.isa.flags().preserve_frame_pointers(), - Architecture::Aarch64(target_lexicon::Aarch64Architecture::Aarch64) => true, // TODO: Other platforms do not support tail calls yet. _ => false, }; From fa7505aa0e65e6e52a17b1ba0ea3812a16b09247 Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Mon, 24 Jul 2023 11:08:52 -0700 Subject: [PATCH 2/2] Use existing variable rather than recomputing value Co-authored-by: Trevor Elliott --- cranelift/codegen/src/isa/riscv64/inst/emit.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs index 47bef3d1dd45..742cc9220eda 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/emit.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/emit.rs @@ -3175,7 +3175,7 @@ fn emit_return_call_common_sequence( ); // Copy the new stack arguments over the old stack arguments. - for i in (0..new_stack_arg_size / 8).rev() { + for i in (0..new_stack_words).rev() { // Load the `i`th new stack argument word from the temporary stack // space. Inst::gen_load(