From 1a1a1bc61ae4696d3355ab072164948041c2a5ac Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Thu, 29 Jun 2023 12:40:54 -0700 Subject: [PATCH 1/2] Cranelift: Get tail calls working on aarch64 Co-Authored-By: Jamey Sharp --- cranelift/codegen/src/isa/aarch64/abi.rs | 37 ++ cranelift/codegen/src/isa/aarch64/inst.isle | 11 + .../codegen/src/isa/aarch64/inst/emit.rs | 209 ++++++- cranelift/codegen/src/isa/aarch64/inst/mod.rs | 59 ++ cranelift/codegen/src/isa/aarch64/lower.isle | 8 + .../codegen/src/isa/aarch64/lower/isle.rs | 46 +- cranelift/codegen/src/isa/x64/abi.rs | 20 +- cranelift/codegen/src/machinst/abi.rs | 54 +- .../filetests/isa/aarch64/return-call.clif | 541 ++++++++++++++++++ .../filetests/isa/x64/return-call.clif | 236 ++++---- .../runtests/return-call-indirect.clif | 7 +- .../filetests/runtests/return-call.clif | 7 +- cranelift/fuzzgen/src/function_generator.rs | 14 +- 13 files changed, 1082 insertions(+), 167 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/aarch64/return-call.clif diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index 9959409a244b..961e8216e50f 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -1161,6 +1161,43 @@ impl ABIMachineSpec for AArch64MachineDeps { } } +impl AArch64CallSite { + pub fn emit_return_call(mut self, ctx: &mut Lower, args: isle::ValueSlice) { + let (new_stack_arg_size, old_stack_arg_size) = + self.emit_temporary_tail_call_frame(ctx, args); + + let dest = self.dest().clone(); + let opcode = self.opcode(); + let uses = self.take_uses(); + let info = Box::new(ReturnCallInfo { + uses, + opcode, + old_stack_arg_size, + new_stack_arg_size, + }); + + match dest { + CallDest::ExtName(callee, RelocDistance::Near) => { + let callee = Box::new(callee); + ctx.emit(Inst::ReturnCall { callee, info }); + } + CallDest::ExtName(name, RelocDistance::Far) => { + let callee = 
ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::LoadExtName { + rd: callee, + name: Box::new(name), + offset: 0, + }); + ctx.emit(Inst::ReturnCallInd { + callee: callee.to_reg(), + info, + }); + } + CallDest::Reg(callee) => ctx.emit(Inst::ReturnCallInd { callee, info }), + } + } +} + fn compute_arg_locs_tail<'a, I>( params: I, add_ret_area_ptr: bool, diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle index 9cdff39e340a..47a85cf59f4b 100644 --- a/cranelift/codegen/src/isa/aarch64/inst.isle +++ b/cranelift/codegen/src/isa/aarch64/inst.isle @@ -788,6 +788,16 @@ (CallInd (info BoxCallIndInfo)) + ;; A return-call macro instruction. + (ReturnCall + (callee BoxExternalName) + (info BoxReturnCallInfo)) + + ;; An indirect return-call macro instruction. + (ReturnCallInd + (callee Reg) + (info BoxReturnCallInfo)) + ;; A pseudo-instruction that captures register arguments in vregs. (Args (args VecArgPair)) @@ -1030,6 +1040,7 @@ (type BoxCallInfo (primitive BoxCallInfo)) (type BoxCallIndInfo (primitive BoxCallIndInfo)) +(type BoxReturnCallInfo (primitive BoxReturnCallInfo)) (type CondBrKind (primitive CondBrKind)) (type BranchTarget (primitive BranchTarget)) (type BoxJTSequenceInfo (primitive BoxJTSequenceInfo)) diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 9fbed6a87646..9b6be155fe2b 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -4,8 +4,7 @@ use cranelift_control::ControlPlane; use regalloc2::Allocation; use crate::binemit::{Reloc, StackMap}; -use crate::ir::{types::*, RelSourceLoc}; -use crate::ir::{LibCall, MemFlags, TrapCode}; +use crate::ir::{self, types::*, LibCall, MemFlags, RelSourceLoc, TrapCode}; use crate::isa::aarch64::inst::*; use crate::machinst::{ty_bits, Reg, RegClass, Writable}; use crate::trace; @@ -3205,6 +3204,57 @@ impl MachInstEmit for Inst { 
state.virtual_sp_offset ); } + &Inst::ReturnCall { + ref callee, + ref info, + } => { + emit_return_call_common_sequence( + &mut allocs, + sink, + emit_info, + state, + info.new_stack_arg_size, + info.old_stack_arg_size, + &info.uses, + ); + + // Note: this is not `Inst::Jump { .. }.emit(..)` because we + // have different metadata in this case: we don't have a label + // for the target, but rather a function relocation. + sink.add_reloc(Reloc::Arm64Call, callee, 0); + sink.put4(enc_jump26(0b000101, 0)); + sink.add_call_site(ir::Opcode::ReturnCall); + + // `emit_return_call_common_sequence` emits an island if + // necessary, so we can safely disable the worst-case-size check + // in this case. + start_off = sink.cur_offset(); + } + &Inst::ReturnCallInd { callee, ref info } => { + let callee = allocs.next(callee); + + emit_return_call_common_sequence( + &mut allocs, + sink, + emit_info, + state, + info.new_stack_arg_size, + info.old_stack_arg_size, + &info.uses, + ); + + Inst::IndirectBr { + rn: callee, + targets: vec![], + } + .emit(&[], sink, emit_info, state); + sink.add_call_site(ir::Opcode::ReturnCallIndirect); + + // `emit_return_call_common_sequence` emits an island if + // necessary, so we can safely disable the worst-case-size check + // in this case. + start_off = sink.cur_offset(); + } &Inst::CondBr { taken, not_taken, @@ -3712,3 +3762,158 @@ impl MachInstEmit for Inst { self.print_with_state(state, &mut allocs) } } + +fn emit_return_call_common_sequence( + allocs: &mut AllocationConsumer<'_>, + sink: &mut MachBuffer, + emit_info: &EmitInfo, + state: &mut EmitState, + new_stack_arg_size: u32, + old_stack_arg_size: u32, + uses: &CallArgList, +) { + for u in uses { + let _ = allocs.next(u.vreg); + } + + // We are emitting a dynamic number of instructions and might need an + // island. We emit four instructions regardless of how many stack arguments + // we have, and then two instructions per word of stack argument space. 
+ let new_stack_words = new_stack_arg_size / 8; + let insts = 4 + 2 * new_stack_words; + let size_of_inst = 4; + let space_needed = insts * size_of_inst; + if sink.island_needed(space_needed) { + let jump_around_label = sink.get_label(); + let jmp = Inst::Jump { + dest: BranchTarget::Label(jump_around_label), + }; + jmp.emit(&[], sink, emit_info, state); + sink.emit_island(space_needed + 4, &mut state.ctrl_plane); + sink.bind_label(jump_around_label, &mut state.ctrl_plane); + } + + // Copy the new frame on top of our current frame. + // + // The current stack layout is the following: + // + // | ... | + // +---------------------+ + // | ... | + // | stack arguments | + // | ... | + // current | return address | + // frame | old FP | <-- FP + // | ... | + // | old stack slots | + // | ... | + // +---------------------+ + // | ... | + // new | new stack arguments | + // frame | ... | <-- SP + // +---------------------+ + // + // We need to restore the old FP, restore the return address from the stack + // to the link register, copy the new stack arguments over the old stack + // arguments, adjust SP to point to the new stack arguments, and then jump + // to the callee (which will push the old FP and RA again). Note that the + // actual jump happens outside this helper function. + + assert_eq!( + new_stack_arg_size % 8, + 0, + "size of new stack arguments must be 8-byte aligned" + ); + + // The delta from our frame pointer to the (eventual) stack pointer value + // when we jump to the tail callee. This is the difference in size of stack + // arguments as well as accounting for the two words we pushed onto the + // stack upon entry to this function (the return address and old frame + // pointer). 
+ let fp_to_callee_sp = i64::from(old_stack_arg_size) - i64::from(new_stack_arg_size) + 16; + + let tmp1 = regs::writable_spilltmp_reg(); + let tmp2 = regs::writable_tmp2_reg(); + + // Restore the return address to the link register, and load the old FP into + // a temporary register. + // + // We can't put the old FP into the FP register until after we copy the + // stack arguments into place, since that uses address modes that are + // relative to our current FP. + // + // Note that the FP is saved in the function prologue for all non-leaf + // functions, even when `preserve_frame_pointers=false`. Note also that + // `return_call` instructions make it so that a function is considered + // non-leaf. Therefore we always have an FP to restore here. + Inst::LoadP64 { + rt: tmp1, + rt2: writable_link_reg(), + mem: PairAMode::SignedOffset( + regs::fp_reg(), + SImm7Scaled::maybe_from_i64(0, types::I64).unwrap(), + ), + flags: MemFlags::trusted(), + } + .emit(&[], sink, emit_info, state); + + // Copy the new stack arguments over the old stack arguments. + for i in (0..new_stack_arg_size / 8).rev() { + // Load the `i`th new stack argument word from the temporary stack + // space. + Inst::ULoad64 { + rd: tmp2, + mem: AMode::SPOffset { + off: i64::from(i * 8), + ty: types::I64, + }, + flags: ir::MemFlags::trusted(), + } + .emit(&[], sink, emit_info, state); + + // Store it to its final destination on the stack, overwriting our + // current frame. + Inst::Store64 { + rd: tmp2.to_reg(), + mem: AMode::FPOffset { + off: fp_to_callee_sp + i64::from(i * 8), + ty: types::I64, + }, + flags: ir::MemFlags::trusted(), + } + .emit(&[], sink, emit_info, state); + } + + // Initialize the SP for the tail callee, deallocating the temporary stack + // argument space and our current frame at the same time. 
+ let (off, alu_op) = if let Ok(off) = u64::try_from(fp_to_callee_sp) { + (off, ALUOp::Add) + } else { + let abs = fp_to_callee_sp.abs(); + let off = u64::try_from(abs).unwrap(); + (off, ALUOp::Sub) + }; + Inst::AluRRImm12 { + alu_op, + size: OperandSize::Size64, + rd: regs::writable_stack_reg(), + rn: regs::fp_reg(), + imm12: Imm12::maybe_from_u64(off).unwrap(), + } + .emit(&[], sink, emit_info, state); + + // Move the old FP value from the temporary into the FP register. + Inst::Mov { + size: OperandSize::Size64, + rd: regs::writable_fp_reg(), + rm: tmp1.to_reg(), + } + .emit(&[], sink, emit_info, state); + + state.virtual_sp_offset -= i64::from(new_stack_arg_size); + trace!( + "return_call[_ind] adjusts virtual sp offset by {} -> {}", + new_stack_arg_size, + state.virtual_sp_offset + ); +} diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 5a9f546f905d..bcbe1fb84c5c 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -122,6 +122,22 @@ pub struct CallIndInfo { pub callee_pop_size: u32, } +/// Additional information for `return_call[_ind]` instructions, left out of +/// line to lower the size of the `Inst` enum. +#[derive(Clone, Debug)] +pub struct ReturnCallInfo { + /// Arguments to the call instruction. + pub uses: CallArgList, + /// Instruction opcode. + pub opcode: Opcode, + /// The size of the current/old stack frame's stack arguments. + pub old_stack_arg_size: u32, + /// The size of the new stack frame's stack arguments. This is necessary + /// for copying the frame over our current frame. It must already be + /// allocated on the stack. + pub new_stack_arg_size: u32, +} + /// Additional information for JTSequence instructions, left out of line to lower the size of the Inst /// enum. 
#[derive(Clone, Debug)] @@ -873,6 +889,20 @@ fn aarch64_get_operands VReg>(inst: &Inst, collector: &mut Operan } collector.reg_clobbers(info.clobbers); } + &Inst::ReturnCall { + ref info, + callee: _, + } => { + for u in &info.uses { + collector.reg_fixed_use(u.vreg, u.preg); + } + } + &Inst::ReturnCallInd { ref info, callee } => { + collector.reg_use(callee); + for u in &info.uses { + collector.reg_fixed_use(u.vreg, u.preg); + } + } &Inst::CondBr { ref kind, .. } => match kind { CondBrKind::Zero(rt) | CondBrKind::NotZero(rt) => { collector.reg_use(*rt); @@ -1013,6 +1043,7 @@ impl MachInst for Inst { fn is_term(&self) -> MachTerminator { match self { &Inst::Ret { .. } | &Inst::AuthenticatedRet { .. } => MachTerminator::Ret, + &Inst::ReturnCall { .. } | &Inst::ReturnCallInd { .. } => MachTerminator::RetCall, &Inst::Jump { .. } => MachTerminator::Uncond, &Inst::CondBr { .. } => MachTerminator::Cond, &Inst::IndirectBr { .. } => MachTerminator::Indirect, @@ -2522,6 +2553,34 @@ impl Inst { let rn = pretty_print_reg(info.rn, allocs); format!("blr {}", rn) } + &Inst::ReturnCall { + ref callee, + ref info, + } => { + let mut s = format!( + "return_call {callee:?} old_stack_arg_size:{} new_stack_arg_size:{}", + info.old_stack_arg_size, info.new_stack_arg_size + ); + for ret in &info.uses { + let preg = pretty_print_reg(ret.preg, &mut empty_allocs); + let vreg = pretty_print_reg(ret.vreg, allocs); + write!(&mut s, " {vreg}={preg}").unwrap(); + } + s + } + &Inst::ReturnCallInd { callee, ref info } => { + let callee = pretty_print_reg(callee, allocs); + let mut s = format!( + "return_call_ind {callee} old_stack_arg_size:{} new_stack_arg_size:{}", + info.old_stack_arg_size, info.new_stack_arg_size + ); + for ret in &info.uses { + let preg = pretty_print_reg(ret.preg, &mut empty_allocs); + let vreg = pretty_print_reg(ret.vreg, allocs); + write!(&mut s, " {vreg}={preg}").unwrap(); + } + s + } &Inst::Args { ref args } => { let mut s = "args".to_string(); for arg in args { diff 
--git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index b9a5f836356e..b35ae2461f85 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -2268,6 +2268,14 @@ (rule (lower (return args)) (lower_return args)) +;;;; Rules for `return_call` and `return_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (return_call (func_ref_data sig_ref extname dist) args)) + (gen_return_call sig_ref extname dist args)) + +(rule (lower (return_call_indirect sig_ref callee args)) + (gen_return_call_indirect sig_ref callee args)) + ;;;; Rules for loads ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs index 1ad1de7abdb4..72f843372524 100644 --- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs +++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs @@ -15,7 +15,8 @@ use super::{ VectorSize, NZCV, }; use crate::ir::condcodes; -use crate::isa::aarch64::inst::{FPULeftShiftImm, FPURightShiftImm}; +use crate::isa; +use crate::isa::aarch64::inst::{FPULeftShiftImm, FPURightShiftImm, ReturnCallInfo}; use crate::isa::aarch64::lower::{lower_address, lower_pair_address}; use crate::isa::aarch64::AArch64Backend; use crate::machinst::valueregs; @@ -41,6 +42,7 @@ use std::vec::Vec; type BoxCallInfo = Box; type BoxCallIndInfo = Box; +type BoxReturnCallInfo = Box; type VecMachLabel = Vec; type BoxJTSequenceInfo = Box; type BoxExternalName = Box; @@ -93,8 +95,24 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> { distance: RelocDistance, args: ValueSlice, ) -> InstOutput { - let _ = (callee_sig, callee, distance, args); - todo!() + let caller_conv = isa::CallConv::Tail; + debug_assert_eq!( + self.lower_ctx.abi().call_conv(self.lower_ctx.sigs()), + caller_conv, + "Can only do `return_call`s from within a `tail` calling convention function" + ); + + let 
call_site = AArch64CallSite::from_func( + self.lower_ctx.sigs(), + callee_sig, + &callee, + distance, + caller_conv, + self.backend.flags().clone(), + ); + call_site.emit_return_call(self.lower_ctx, args); + + InstOutput::new() } fn gen_return_call_indirect( @@ -103,8 +121,26 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> { callee: Value, args: ValueSlice, ) -> InstOutput { - let _ = (callee_sig, callee, args); - todo!() + let caller_conv = isa::CallConv::Tail; + debug_assert_eq!( + self.lower_ctx.abi().call_conv(self.lower_ctx.sigs()), + caller_conv, + "Can only do `return_call`s from within a `tail` calling convention function" + ); + + let callee = self.put_in_reg(callee); + + let call_site = AArch64CallSite::from_ptr( + self.lower_ctx.sigs(), + callee_sig, + callee, + Opcode::ReturnCallIndirect, + caller_conv, + self.backend.flags().clone(), + ); + call_site.emit_return_call(self.lower_ctx, args); + + InstOutput::new() } fn sign_return_address_disabled(&mut self) -> Option<()> { diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs index 61cd28fbe0e0..feb3d447276c 100644 --- a/cranelift/codegen/src/isa/x64/abi.rs +++ b/cranelift/codegen/src/isa/x64/abi.rs @@ -842,11 +842,8 @@ impl ABIMachineSpec for X64ABIMachineSpec { impl X64CallSite { pub fn emit_return_call(mut self, ctx: &mut Lower, args: isle::ValueSlice) { - // Allocate additional stack space for the new stack frame. We will - // build it in the newly allocated space, but then copy it over our - // current frame at the last moment. - let new_stack_arg_size = self.emit_allocate_tail_call_frame(ctx); - let old_stack_arg_size = ctx.abi().stack_args_size(ctx.sigs()); + let (new_stack_arg_size, old_stack_arg_size) = + self.emit_temporary_tail_call_frame(ctx, args); // Make a copy of the frame pointer, since we use it when copying down // the new stack frame. 
@@ -875,19 +872,6 @@ impl X64CallSite { None }; - // Put all arguments in registers and stack slots (within that newly - // allocated stack space). - self.emit_args(ctx, args); - if let Some(i) = ctx.sigs()[self.sig()].stack_ret_arg() { - let ret_area_ptr = ctx.abi().ret_area_ptr().expect( - "if the tail callee has a return pointer, then the tail caller \ - must as well", - ); - for inst in self.gen_arg(ctx, i.into(), ValueRegs::one(ret_area_ptr.to_reg())) { - ctx.emit(inst); - } - } - // Finally, emit the macro instruction to copy the new stack frame over // our current one and do the actual tail call! diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs index 1697c700c6a4..bac65037b102 100644 --- a/cranelift/codegen/src/machinst/abi.rs +++ b/cranelift/codegen/src/machinst/abi.rs @@ -1254,11 +1254,6 @@ impl Callee { insts.extend(M::gen_add_imm(self.call_conv, scratch, stack_limit, stack_size).into_iter()); insts.extend(M::gen_stack_lower_bound_trap(scratch.to_reg())); } - - /// Get the register holding the return-area pointer, if any. - pub(crate) fn ret_area_ptr(&self) -> Option> { - self.ret_area_ptr - } } /// Generates the instructions necessary for the `gv` to be materialized into a @@ -2175,14 +2170,14 @@ impl CallSite { } } - pub(crate) fn sig(&self) -> Sig { - self.sig - } - pub(crate) fn dest(&self) -> &CallDest { &self.dest } + pub(crate) fn opcode(&self) -> ir::Opcode { + self.opcode + } + pub(crate) fn take_uses(self) -> CallArgList { self.uses } @@ -2480,6 +2475,47 @@ impl CallSite { } } + /// Emit the code to forward a stack-return pointer argument through a tail + /// call. 
+ pub fn emit_stack_ret_arg_for_tail_call(&mut self, ctx: &mut Lower) { + if let Some(i) = ctx.sigs()[self.sig].stack_ret_arg() { + let ret_area_ptr = ctx.abi().ret_area_ptr.expect( + "if the tail callee has a return pointer, then the tail caller \ + must as well", + ); + for inst in self.gen_arg(ctx, i.into(), ValueRegs::one(ret_area_ptr.to_reg())) { + ctx.emit(inst); + } + } + } + + /// Builds a new temporary callee frame for the tail call and puts arguments into + /// registers and stack slots (within the new temporary frame). + /// + /// It is the caller's responsibility to move the temporary callee frame on + /// top of the current caller frame before performing the actual tail call. + /// + /// Returns a pair of the new callee's stack argument size and the old + /// caller's stack argument size. + pub fn emit_temporary_tail_call_frame( + &mut self, + ctx: &mut Lower, + args: isle::ValueSlice, + ) -> (u32, u32) { + // Allocate additional stack space for the new stack frame. We will + // build it in the newly allocated space, but then copy it over our + // current frame at the last moment. + let new_stack_arg_size = self.emit_allocate_tail_call_frame(ctx); + let old_stack_arg_size = ctx.abi().stack_args_size(ctx.sigs()); + + // Put all arguments in registers and stack slots (within that newly + // allocated stack space). + self.emit_args(ctx, args); + self.emit_stack_ret_arg_for_tail_call(ctx); + + (new_stack_arg_size, old_stack_arg_size) + } + /// Define a return value after the call returns. 
pub fn gen_retval( &mut self, diff --git a/cranelift/filetests/filetests/isa/aarch64/return-call.clif b/cranelift/filetests/filetests/isa/aarch64/return-call.clif new file mode 100644 index 000000000000..b9dd597f828f --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/return-call.clif @@ -0,0 +1,541 @@ +test compile precise-output + +target aarch64 + +;;;; Test passing `i64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %callee_i64(i64) -> i64 tail { +block0(v0: i64): + v1 = iadd_imm.i64 v0, 10 + return v1 +} + +; VCode: +; block0: +; add x2, x2, #10 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; add x2, x2, #0xa +; ret + +function %call_i64(i64) -> i64 tail { + fn0 = %callee_i64(i64) -> i64 tail + +block0(v0: i64): + return_call fn0(v0) +} + +; VCode: +; stp fp, lr, [sp, #-16]! +; mov fp, sp +; block0: +; load_ext_name x3, TestCase(%callee_i64)+0 +; return_call_ind x3 old_stack_arg_size:0 new_stack_arg_size:0 x2=x2 +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! +; mov x29, sp +; block1: ; offset 0x8 +; ldr x3, #0x10 +; b #0x18 +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_i64 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ldp x16, x30, [x29] +; add sp, x29, #0x10 +; mov x29, x16 +; br x3 + +;;;; Test colocated tail calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %colocated_i64(i64) -> i64 tail { + fn0 = colocated %callee_i64(i64) -> i64 tail + +block0(v0: i64): + return_call fn0(v0) +} + +; VCode: +; stp fp, lr, [sp, #-16]! +; mov fp, sp +; block0: +; return_call TestCase(%callee_i64) old_stack_arg_size:0 new_stack_arg_size:0 x2=x2 +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! 
+; mov x29, sp +; block1: ; offset 0x8 +; ldp x16, x30, [x29] +; add sp, x29, #0x10 +; mov x29, x16 +; b #0x14 ; reloc_external Call %callee_i64 0 + +;;;; Test passing `f64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %callee_f64(f64) -> f64 tail { +block0(v0: f64): + v1 = f64const 0x10.0 + v2 = fadd.f64 v0, v1 + return v2 +} + +; VCode: +; block0: +; fmov d3, #16 +; fadd d0, d0, d3 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; fmov d3, #16.00000000 +; fadd d0, d0, d3 +; ret + +function %call_f64(f64) -> f64 tail { + fn0 = %callee_f64(f64) -> f64 tail + +block0(v0: f64): + return_call fn0(v0) +} + +; VCode: +; stp fp, lr, [sp, #-16]! +; mov fp, sp +; block0: +; load_ext_name x2, TestCase(%callee_f64)+0 +; return_call_ind x2 old_stack_arg_size:0 new_stack_arg_size:0 v0=v0 +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! +; mov x29, sp +; block1: ; offset 0x8 +; ldr x2, #0x10 +; b #0x18 +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_f64 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ldp x16, x30, [x29] +; add sp, x29, #0x10 +; mov x29, x16 +; br x2 + +;;;; Test passing `i8`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %callee_i8(i8) -> i8 tail { +block0(v0: i8): + v1 = iconst.i8 0 + v2 = icmp eq v0, v1 + return v2 +} + +; VCode: +; block0: +; uxtb w2, w2 +; subs wzr, w2, #0 +; cset x2, eq +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uxtb w2, w2 +; cmp w2, #0 +; cset x2, eq +; ret + +function %call_i8(i8) -> i8 tail { + fn0 = %callee_i8(i8) -> i8 tail + +block0(v0: i8): + return_call fn0(v0) +} + +; VCode: +; stp fp, lr, [sp, #-16]! +; mov fp, sp +; block0: +; load_ext_name x3, TestCase(%callee_i8)+0 +; return_call_ind x3 old_stack_arg_size:0 new_stack_arg_size:0 x2=x2 +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! 
+; mov x29, sp +; block1: ; offset 0x8 +; ldr x3, #0x10 +; b #0x18 +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_i8 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ldp x16, x30, [x29] +; add sp, x29, #0x10 +; mov x29, x16 +; br x3 + +;;;; Test passing many arguments on stack ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64, v21: i64, v22: i64, v23: i64, v24: i64, v25: i64): + return v25 +} + +; VCode: +; stp fp, lr, [sp, #-16]! +; mov fp, sp +; block0: +; ldr x9, [fp, #16] +; ldr x2, [fp, #24] +; ldp fp, lr, [sp], #16 +; add sp, sp, #16 ; ret +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! 
+; mov x29, sp +; block1: ; offset 0x8 +; ldur x9, [x29, #0x10] +; ldur x2, [x29, #0x18] +; ldp x29, x30, [sp], #0x10 +; add sp, sp, #0x10 +; ret + +function %tail_caller_stack_args() -> i64 tail { + fn0 = %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail + +block0: + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + return_call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) +} + +; VCode: +; stp fp, lr, [sp, #-16]! 
+; mov fp, sp +; block0: +; movz x2, #10 +; movz x3, #15 +; movz x4, #20 +; movz x5, #25 +; movz x6, #30 +; movz x7, #35 +; movz x8, #40 +; movz x9, #45 +; movz x10, #50 +; movz x11, #55 +; movz x12, #60 +; movz x13, #65 +; movz x14, #70 +; movz x15, #75 +; movz x19, #80 +; movz x20, #85 +; movz x21, #90 +; movz x22, #95 +; movz x23, #100 +; movz x24, #105 +; movz x25, #110 +; movz x26, #115 +; movz x27, #120 +; movz x28, #125 +; movz x0, #130 +; movz x1, #135 +; sub sp, sp, #16 +; virtual_sp_offset_adjust 16 +; str x0, [sp] +; str x1, [sp, #8] +; load_ext_name x0, TestCase(%tail_callee_stack_args)+0 +; return_call_ind x0 old_stack_arg_size:0 new_stack_arg_size:16 x2=x2 x3=x3 x4=x4 x5=x5 x6=x6 x7=x7 x8=x8 x9=x9 x10=x10 x11=x11 x12=x12 x13=x13 x14=x14 x15=x15 x19=x19 x20=x20 x21=x21 x22=x22 x23=x23 x24=x24 x25=x25 x26=x26 x27=x27 x28=x28 +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! +; mov x29, sp +; block1: ; offset 0x8 +; mov x2, #0xa +; mov x3, #0xf +; mov x4, #0x14 +; mov x5, #0x19 +; mov x6, #0x1e +; mov x7, #0x23 +; mov x8, #0x28 +; mov x9, #0x2d +; mov x10, #0x32 +; mov x11, #0x37 +; mov x12, #0x3c +; mov x13, #0x41 +; mov x14, #0x46 +; mov x15, #0x4b +; mov x19, #0x50 +; mov x20, #0x55 +; mov x21, #0x5a +; mov x22, #0x5f +; mov x23, #0x64 +; mov x24, #0x69 +; mov x25, #0x6e +; mov x26, #0x73 +; mov x27, #0x78 +; mov x28, #0x7d +; mov x0, #0x82 +; mov x1, #0x87 +; sub sp, sp, #0x10 +; stur x0, [sp] +; stur x1, [sp, #8] +; ldr x0, #0x84 +; b #0x8c +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %tail_callee_stack_args 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ldp x16, x30, [x29] +; ldur x17, [sp, #8] +; stur x17, [x29, #8] +; ldur x17, [sp] +; stur x17, [x29] +; mov sp, x29 +; mov x29, x16 +; br x0 + +;;;; Test diff blocks with diff return calls with diff # of stack args ;;;;;;;;; + +function %different_callee1(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, 
i64, i64) -> i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64, v21: i64, v22: i64, v23: i64, v24: i64, v25: i64): + return v25 +} + +; VCode: +; stp fp, lr, [sp, #-16]! +; mov fp, sp +; block0: +; ldr x9, [fp, #16] +; ldr x2, [fp, #24] +; ldp fp, lr, [sp], #16 +; add sp, sp, #16 ; ret +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! +; mov x29, sp +; block1: ; offset 0x8 +; ldur x9, [x29, #0x10] +; ldur x2, [x29, #0x18] +; ldp x29, x30, [sp], #0x10 +; add sp, sp, #0x10 +; ret + +function %different_callee2(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64, v21: i64, v22: i64, v23: i64, v24: i64, v25: i64, v26: i64): + return v26 +} + +; VCode: +; stp fp, lr, [sp, #-16]! +; mov fp, sp +; block0: +; ldr x9, [fp, #16] +; ldr x11, [fp, #24] +; ldr x2, [fp, #32] +; ldp fp, lr, [sp], #16 +; add sp, sp, #32 ; ret +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! 
+; mov x29, sp +; block1: ; offset 0x8 +; ldur x9, [x29, #0x10] +; ldur x11, [x29, #0x18] +; ldur x2, [x29, #0x20] +; ldp x29, x30, [sp], #0x10 +; add sp, sp, #0x20 +; ret + +function %caller_of_different_callees(i64) -> i64 tail { + fn0 = %different_callee1(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail + fn1 = %different_callee2(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail + +block0(v99: i64): + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + brif v99, block1, block2 + +block1: + return_call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) + +block2: + v26 = iconst.i64 140 + return_call fn1(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) +} + +; VCode: +; stp fp, lr, [sp, #-16]! 
+; mov fp, sp +; sub sp, sp, #16 +; block0: +; movz x14, #10 +; str x14, [sp] +; movz x3, #15 +; movz x4, #20 +; movz x5, #25 +; movz x6, #30 +; movz x7, #35 +; movz x8, #40 +; movz x9, #45 +; movz x10, #50 +; movz x11, #55 +; movz x12, #60 +; movz x13, #65 +; movz x14, #70 +; movz x15, #75 +; movz x19, #80 +; movz x20, #85 +; movz x21, #90 +; movz x22, #95 +; movz x23, #100 +; movz x24, #105 +; movz x25, #110 +; movz x26, #115 +; movz x27, #120 +; movz x28, #125 +; movz x1, #130 +; movz x0, #135 +; cbnz x2, label2 ; b label1 +; block1: +; movz x2, #140 +; sub sp, sp, #32 +; virtual_sp_offset_adjust 32 +; str x1, [sp] +; str x0, [sp, #8] +; str x2, [sp, #16] +; load_ext_name x0, TestCase(%different_callee2)+0 +; ldr x2, [sp, #32] +; return_call_ind x0 old_stack_arg_size:0 new_stack_arg_size:32 x2=x2 x3=x3 x4=x4 x5=x5 x6=x6 x7=x7 x8=x8 x9=x9 x10=x10 x11=x11 x12=x12 x13=x13 x14=x14 x15=x15 x19=x19 x20=x20 x21=x21 x22=x22 x23=x23 x24=x24 x25=x25 x26=x26 x27=x27 x28=x28 +; block2: +; ldr x2, [sp] +; sub sp, sp, #16 +; virtual_sp_offset_adjust 16 +; str x1, [sp] +; str x0, [sp, #8] +; load_ext_name x0, TestCase(%different_callee1)+0 +; return_call_ind x0 old_stack_arg_size:0 new_stack_arg_size:16 x2=x2 x3=x3 x4=x4 x5=x5 x6=x6 x7=x7 x8=x8 x9=x9 x10=x10 x11=x11 x12=x12 x13=x13 x14=x14 x15=x15 x19=x19 x20=x20 x21=x21 x22=x22 x23=x23 x24=x24 x25=x25 x26=x26 x27=x27 x28=x28 +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! 
+; mov x29, sp +; sub sp, sp, #0x10 +; block1: ; offset 0xc +; mov x14, #0xa +; stur x14, [sp] +; mov x3, #0xf +; mov x4, #0x14 +; mov x5, #0x19 +; mov x6, #0x1e +; mov x7, #0x23 +; mov x8, #0x28 +; mov x9, #0x2d +; mov x10, #0x32 +; mov x11, #0x37 +; mov x12, #0x3c +; mov x13, #0x41 +; mov x14, #0x46 +; mov x15, #0x4b +; mov x19, #0x50 +; mov x20, #0x55 +; mov x21, #0x5a +; mov x22, #0x5f +; mov x23, #0x64 +; mov x24, #0x69 +; mov x25, #0x6e +; mov x26, #0x73 +; mov x27, #0x78 +; mov x28, #0x7d +; mov x1, #0x82 +; mov x0, #0x87 +; cbnz x2, #0xd4 +; block2: ; offset 0x7c +; mov x2, #0x8c +; sub sp, sp, #0x20 +; stur x1, [sp] +; stur x0, [sp, #8] +; stur x2, [sp, #0x10] +; ldr x0, #0x98 +; b #0xa0 +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %different_callee2 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ldur x2, [sp, #0x20] +; ldp x16, x30, [x29] +; ldur x17, [sp, #0x18] +; stur x17, [x29, #8] +; ldur x17, [sp, #0x10] +; stur x17, [x29] +; ldur x17, [sp, #8] +; stur x17, [x29, #-8] +; ldur x17, [sp] +; stur x17, [x29, #-0x10] +; sub sp, x29, #0x10 +; mov x29, x16 +; br x0 +; block3: ; offset 0xd4 +; ldur x2, [sp] +; sub sp, sp, #0x10 +; stur x1, [sp] +; stur x0, [sp, #8] +; ldr x0, #0xec +; b #0xf4 +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %different_callee1 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ldp x16, x30, [x29] +; ldur x17, [sp, #8] +; stur x17, [x29, #8] +; ldur x17, [sp] +; stur x17, [x29] +; mov sp, x29 +; mov x29, x16 +; br x0 + diff --git a/cranelift/filetests/filetests/isa/x64/return-call.clif b/cranelift/filetests/filetests/isa/x64/return-call.clif index fc3f2c48e4b1..18b9dce2db26 100644 --- a/cranelift/filetests/filetests/isa/x64/return-call.clif +++ b/cranelift/filetests/filetests/isa/x64/return-call.clif @@ -298,66 +298,60 @@ block0: ; VCode: ; pushq %rbp ; movq %rsp, %rbp -; subq %rsp, $128, %rsp +; subq %rsp, $112, %rsp ; block0: ; movl $10, %eax -; movq %rax, rsp(112 + virtual offset) +; movq %rax, rsp(96 + virtual offset) ; movl $15, 
%ecx -; movq %rcx, rsp(104 + virtual offset) +; movq %rcx, rsp(88 + virtual offset) ; movl $20, %edx -; movq %rdx, rsp(96 + virtual offset) +; movq %rdx, rsp(80 + virtual offset) ; movl $25, %ebx -; movq %rbx, rsp(88 + virtual offset) +; movq %rbx, rsp(72 + virtual offset) ; movl $30, %esi -; movq %rsi, rsp(80 + virtual offset) +; movq %rsi, rsp(64 + virtual offset) ; movl $35, %edi -; movq %rdi, rsp(72 + virtual offset) +; movq %rdi, rsp(56 + virtual offset) ; movl $40, %r8d -; movq %r8, rsp(64 + virtual offset) +; movq %r8, rsp(48 + virtual offset) ; movl $45, %r9d -; movq %r9, rsp(56 + virtual offset) +; movq %r9, rsp(40 + virtual offset) ; movl $50, %r10d -; movq %r10, rsp(48 + virtual offset) +; movq %r10, rsp(32 + virtual offset) ; movl $55, %r11d -; movq %r11, rsp(40 + virtual offset) +; movq %r11, rsp(24 + virtual offset) ; movl $60, %r15d -; movl $65, %eax -; movq %rax, rsp(32 + virtual offset) +; movl $65, %r12d ; movl $70, %r13d -; movl $75, %edi -; movl $80, %eax +; movl $75, %r14d +; movl $80, %ecx +; movq %rcx, rsp(16 + virtual offset) ; movl $85, %ecx -; movq %rcx, rsp(24 + virtual offset) ; movl $90, %edx ; movl $95, %ebx ; movl $100, %esi -; movl $105, %ecx -; movq %rcx, rsp(16 + virtual offset) +; movl $105, %edi ; movl $110, %r8d ; movl $115, %r9d ; movl $120, %r10d ; movl $125, %r11d -; movl $130, %ecx -; movq %rcx, rsp(8 + virtual offset) -; movl $135, %ecx -; movq %rcx, rsp(0 + virtual offset) +; movl $130, %eax +; movq %rax, rsp(8 + virtual offset) +; movl $135, %eax +; movq %rax, rsp(0 + virtual offset) ; subq %rsp, $128, %rsp ; virtual_sp_offset_adjust 128 -; movq %rbp, %r14 -; movq 8(%r14), %r12 ; movq %r15, 0(%rsp) -; movq rsp(32 + virtual offset), %rcx -; movq %rcx, 8(%rsp) +; movq %r12, 8(%rsp) ; movq %r13, 16(%rsp) -; movq %rdi, 24(%rsp) +; movq %r14, 24(%rsp) +; movq rsp(16 + virtual offset), %rax ; movq %rax, 32(%rsp) -; movq rsp(24 + virtual offset), %rax -; movq %rax, 40(%rsp) +; movq %rcx, 40(%rsp) ; movq %rdx, 48(%rsp) ; movq 
%rbx, 56(%rsp) ; movq %rsi, 64(%rsp) -; movq rsp(16 + virtual offset), %rax -; movq %rax, 72(%rsp) +; movq %rdi, 72(%rsp) ; movq %r8, 80(%rsp) ; movq %r9, 88(%rsp) ; movq %r10, 96(%rsp) @@ -366,82 +360,78 @@ block0: ; movq %rax, 112(%rsp) ; movq rsp(0 + virtual offset), %rax ; movq %rax, 120(%rsp) -; load_ext_name %tail_callee_stack_args+0, %r15 -; movq rsp(40 + virtual offset), %r11 -; movq rsp(48 + virtual offset), %r10 -; movq rsp(56 + virtual offset), %r9 -; movq rsp(64 + virtual offset), %r8 -; movq rsp(72 + virtual offset), %rdi -; movq rsp(80 + virtual offset), %rsi -; movq rsp(88 + virtual offset), %rbx -; movq rsp(96 + virtual offset), %rdx -; movq rsp(104 + virtual offset), %rcx -; movq rsp(112 + virtual offset), %rax -; return_call_unknown %r15 new_stack_arg_size:128 old_stack_arg_size:0 ret_addr:Some("%v219") fp:%v218 tmp:%v220 %rax=%rax %rcx=%rcx %rdx=%rdx %rbx=%rbx %rsi=%rsi %rdi=%rdi %r8=%r8 %r9=%r9 %r10=%r10 %r11=%r11 +; movq %rbp, %r15 +; movq 8(%r15), %r13 +; load_ext_name %tail_callee_stack_args+0, %r12 +; movq rsp(24 + virtual offset), %r11 +; movq rsp(32 + virtual offset), %r10 +; movq rsp(40 + virtual offset), %r9 +; movq rsp(48 + virtual offset), %r8 +; movq rsp(56 + virtual offset), %rdi +; movq rsp(64 + virtual offset), %rsi +; movq rsp(72 + virtual offset), %rbx +; movq rsp(80 + virtual offset), %rdx +; movq rsp(88 + virtual offset), %rcx +; movq rsp(96 + virtual offset), %rax +; return_call_unknown %r12 new_stack_arg_size:128 old_stack_arg_size:0 ret_addr:Some("%v219") fp:%v218 tmp:%v220 %rax=%rax %rcx=%rcx %rdx=%rdx %rbx=%rbx %rsi=%rsi %rdi=%rdi %r8=%r8 %r9=%r9 %r10=%r10 %r11=%r11 ; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp ; movq %rsp, %rbp -; subq $0x80, %rsp -; block1: ; offset 0xb +; subq $0x70, %rsp +; block1: ; offset 0x8 ; movl $0xa, %eax -; movq %rax, 0x70(%rsp) +; movq %rax, 0x60(%rsp) ; movl $0xf, %ecx -; movq %rcx, 0x68(%rsp) +; movq %rcx, 0x58(%rsp) ; movl $0x14, %edx -; movq %rdx, 0x60(%rsp) +; movq %rdx, 
0x50(%rsp) ; movl $0x19, %ebx -; movq %rbx, 0x58(%rsp) +; movq %rbx, 0x48(%rsp) ; movl $0x1e, %esi -; movq %rsi, 0x50(%rsp) +; movq %rsi, 0x40(%rsp) ; movl $0x23, %edi -; movq %rdi, 0x48(%rsp) +; movq %rdi, 0x38(%rsp) ; movl $0x28, %r8d -; movq %r8, 0x40(%rsp) +; movq %r8, 0x30(%rsp) ; movl $0x2d, %r9d -; movq %r9, 0x38(%rsp) +; movq %r9, 0x28(%rsp) ; movl $0x32, %r10d -; movq %r10, 0x30(%rsp) +; movq %r10, 0x20(%rsp) ; movl $0x37, %r11d -; movq %r11, 0x28(%rsp) +; movq %r11, 0x18(%rsp) ; movl $0x3c, %r15d -; movl $0x41, %eax -; movq %rax, 0x20(%rsp) +; movl $0x41, %r12d ; movl $0x46, %r13d -; movl $0x4b, %edi -; movl $0x50, %eax +; movl $0x4b, %r14d +; movl $0x50, %ecx +; movq %rcx, 0x10(%rsp) ; movl $0x55, %ecx -; movq %rcx, 0x18(%rsp) ; movl $0x5a, %edx ; movl $0x5f, %ebx ; movl $0x64, %esi -; movl $0x69, %ecx -; movq %rcx, 0x10(%rsp) +; movl $0x69, %edi ; movl $0x6e, %r8d ; movl $0x73, %r9d ; movl $0x78, %r10d ; movl $0x7d, %r11d -; movl $0x82, %ecx -; movq %rcx, 8(%rsp) -; movl $0x87, %ecx -; movq %rcx, (%rsp) +; movl $0x82, %eax +; movq %rax, 8(%rsp) +; movl $0x87, %eax +; movq %rax, (%rsp) ; subq $0x80, %rsp -; movq %rbp, %r14 -; movq 8(%r14), %r12 ; movq %r15, (%rsp) -; movq 0xa0(%rsp), %rcx -; movq %rcx, 8(%rsp) +; movq %r12, 8(%rsp) ; movq %r13, 0x10(%rsp) -; movq %rdi, 0x18(%rsp) +; movq %r14, 0x18(%rsp) +; movq 0x90(%rsp), %rax ; movq %rax, 0x20(%rsp) -; movq 0x98(%rsp), %rax -; movq %rax, 0x28(%rsp) +; movq %rcx, 0x28(%rsp) ; movq %rdx, 0x30(%rsp) ; movq %rbx, 0x38(%rsp) ; movq %rsi, 0x40(%rsp) -; movq 0x90(%rsp), %rax -; movq %rax, 0x48(%rsp) +; movq %rdi, 0x48(%rsp) ; movq %r8, 0x50(%rsp) ; movq %r9, 0x58(%rsp) ; movq %r10, 0x60(%rsp) @@ -450,51 +440,53 @@ block0: ; movq %rax, 0x70(%rsp) ; movq 0x80(%rsp), %rax ; movq %rax, 0x78(%rsp) -; movabsq $0, %r15 ; reloc_external Abs8 %tail_callee_stack_args 0 -; movq 0xa8(%rsp), %r11 -; movq 0xb0(%rsp), %r10 -; movq 0xb8(%rsp), %r9 -; movq 0xc0(%rsp), %r8 -; movq 0xc8(%rsp), %rdi -; movq 0xd0(%rsp), %rsi -; 
movq 0xd8(%rsp), %rbx -; movq 0xe0(%rsp), %rdx -; movq 0xe8(%rsp), %rcx -; movq 0xf0(%rsp), %rax -; movq (%r14), %rbp -; movq 0x78(%rsp), %r13 -; movq %r13, 8(%r14) -; movq 0x70(%rsp), %r13 -; movq %r13, (%r14) -; movq 0x68(%rsp), %r13 -; movq %r13, -8(%r14) -; movq 0x60(%rsp), %r13 -; movq %r13, -0x10(%r14) -; movq 0x58(%rsp), %r13 -; movq %r13, -0x18(%r14) -; movq 0x50(%rsp), %r13 -; movq %r13, -0x20(%r14) -; movq 0x48(%rsp), %r13 -; movq %r13, -0x28(%r14) -; movq 0x40(%rsp), %r13 -; movq %r13, -0x30(%r14) -; movq 0x38(%rsp), %r13 -; movq %r13, -0x38(%r14) -; movq 0x30(%rsp), %r13 -; movq %r13, -0x40(%r14) -; movq 0x28(%rsp), %r13 -; movq %r13, -0x48(%r14) -; movq 0x20(%rsp), %r13 -; movq %r13, -0x50(%r14) -; movq 0x18(%rsp), %r13 -; movq %r13, -0x58(%r14) -; movq 0x10(%rsp), %r13 -; movq %r13, -0x60(%r14) -; movq 8(%rsp), %r13 -; movq %r13, -0x68(%r14) -; movq (%rsp), %r13 -; movq %r13, -0x70(%r14) -; leaq -0x78(%r14), %rsp -; movq %r12, (%rsp) -; jmpq *%r15 +; movq %rbp, %r15 +; movq 8(%r15), %r13 +; movabsq $0, %r12 ; reloc_external Abs8 %tail_callee_stack_args 0 +; movq 0x98(%rsp), %r11 +; movq 0xa0(%rsp), %r10 +; movq 0xa8(%rsp), %r9 +; movq 0xb0(%rsp), %r8 +; movq 0xb8(%rsp), %rdi +; movq 0xc0(%rsp), %rsi +; movq 0xc8(%rsp), %rbx +; movq 0xd0(%rsp), %rdx +; movq 0xd8(%rsp), %rcx +; movq 0xe0(%rsp), %rax +; movq (%r15), %rbp +; movq 0x78(%rsp), %r14 +; movq %r14, 8(%r15) +; movq 0x70(%rsp), %r14 +; movq %r14, (%r15) +; movq 0x68(%rsp), %r14 +; movq %r14, -8(%r15) +; movq 0x60(%rsp), %r14 +; movq %r14, -0x10(%r15) +; movq 0x58(%rsp), %r14 +; movq %r14, -0x18(%r15) +; movq 0x50(%rsp), %r14 +; movq %r14, -0x20(%r15) +; movq 0x48(%rsp), %r14 +; movq %r14, -0x28(%r15) +; movq 0x40(%rsp), %r14 +; movq %r14, -0x30(%r15) +; movq 0x38(%rsp), %r14 +; movq %r14, -0x38(%r15) +; movq 0x30(%rsp), %r14 +; movq %r14, -0x40(%r15) +; movq 0x28(%rsp), %r14 +; movq %r14, -0x48(%r15) +; movq 0x20(%rsp), %r14 +; movq %r14, -0x50(%r15) +; movq 0x18(%rsp), %r14 +; movq %r14, 
-0x58(%r15) +; movq 0x10(%rsp), %r14 +; movq %r14, -0x60(%r15) +; movq 8(%rsp), %r14 +; movq %r14, -0x68(%r15) +; movq (%rsp), %r14 +; movq %r14, -0x70(%r15) +; leaq -0x78(%r15), %rsp +; movq %r13, (%rsp) +; jmpq *%r12 diff --git a/cranelift/filetests/filetests/runtests/return-call-indirect.clif b/cranelift/filetests/filetests/runtests/return-call-indirect.clif index 698ead772d93..a2b1efd1e26b 100644 --- a/cranelift/filetests/filetests/runtests/return-call-indirect.clif +++ b/cranelift/filetests/filetests/runtests/return-call-indirect.clif @@ -4,9 +4,10 @@ test run set preserve_frame_pointers=true target x86_64 -;; target aarch64 -;; target aarch64 sign_return_address -;; target aarch64 has_pauth sign_return_address +target aarch64 +target aarch64 sign_return_address +target aarch64 has_pauth sign_return_address +;; target riscv64 ;; target s390x ;;;; Test passing `i64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/filetests/filetests/runtests/return-call.clif b/cranelift/filetests/filetests/runtests/return-call.clif index 3ecea5a929b8..1f4bb77df5c5 100644 --- a/cranelift/filetests/filetests/runtests/return-call.clif +++ b/cranelift/filetests/filetests/runtests/return-call.clif @@ -4,9 +4,10 @@ test run set preserve_frame_pointers=true target x86_64 -;; target aarch64 -;; target aarch64 sign_return_address -;; target aarch64 has_pauth sign_return_address +target aarch64 +target aarch64 sign_return_address +target aarch64 has_pauth sign_return_address +;; target riscv64 ;; target s390x ;;;; Test passing `i64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/fuzzgen/src/function_generator.rs b/cranelift/fuzzgen/src/function_generator.rs index dbc8d3d6788f..a08a62f1428e 100644 --- a/cranelift/fuzzgen/src/function_generator.rs +++ b/cranelift/fuzzgen/src/function_generator.rs @@ -2045,12 +2045,16 @@ where .next() .is_some(); let is_tail_caller = self.signature.call_conv == CallConv::Tail; - // TODO: This 
is currently only supported on x86 - let supports_tail_calls = self.isa.triple().architecture == Architecture::X86_64; - // TODO: We currently require frame pointers for tail calls - let has_frame_pointers = self.isa.flags().preserve_frame_pointers(); - if is_tail_caller && has_tail_callees && supports_tail_calls & has_frame_pointers { + let supports_tail_calls = match self.isa.triple().architecture { + // TODO: x64 currently requires frame pointers for tail calls. + Architecture::X86_64 => self.isa.flags().preserve_frame_pointers(), + Architecture::Aarch64(target_lexicon::Aarch64Architecture::Aarch64) => true, + // TODO: Other platforms do not support tail calls yet. + _ => false, + }; + + if is_tail_caller && has_tail_callees && supports_tail_calls { valid_terminators.extend([ BlockTerminatorKind::TailCall, BlockTerminatorKind::TailCallIndirect, From 887398e1f7739b7efdf3c25aedbc3fd62c5c802a Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Fri, 21 Jul 2023 11:12:59 -0700 Subject: [PATCH 2/2] Review suggestion: reuse already-computed number of new stack words Co-authored-by: Trevor Elliott --- cranelift/codegen/src/isa/aarch64/inst/emit.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 9b6be155fe2b..085daaf27153 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -3858,7 +3858,7 @@ fn emit_return_call_common_sequence( .emit(&[], sink, emit_info, state); // Copy the new stack arguments over the old stack arguments. - for i in (0..new_stack_arg_size / 8).rev() { + for i in (0..new_stack_words).rev() { // Load the `i`th new stack argument word from the temporary stack // space. Inst::ULoad64 {