Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 8 additions & 9 deletions cranelift/codegen/src/isa/x86/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -620,12 +620,8 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
// The calling convention described in
// https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention only requires
// preserving the low 128 bits of XMM6-XMM15.
//
// TODO: For now, add just an `F64` rather than `F64X2` because `F64X2` would require
// encoding a fstDisp8 with REX bits set, and we currently can't encode that. F64 causes a
// whole XMM register to be preserved anyway.
let csr_arg =
ir::AbiParam::special_reg(types::F64, ir::ArgumentPurpose::CalleeSaved, fp_csr);
ir::AbiParam::special_reg(types::F64X2, ir::ArgumentPurpose::CalleeSaved, fp_csr);
func.signature.params.push(csr_arg);
func.signature.returns.push(csr_arg);
}
Expand Down Expand Up @@ -910,7 +906,7 @@ fn insert_common_prologue(

for reg in csrs.iter(FPR) {
// Append param to entry Block
let csr_arg = pos.func.dfg.append_block_param(block, types::F64);
let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);

// Since regalloc has already run, we must assign a location.
pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
Expand Down Expand Up @@ -1035,9 +1031,12 @@ fn insert_common_epilogue(
let mut fpr_offset = 0;

for reg in csrs.iter(FPR) {
let value = pos
.ins()
.load(types::F64, ir::MemFlags::trusted(), stack_addr, fpr_offset);
let value = pos.ins().load(
types::F64X2,
ir::MemFlags::trusted(),
stack_addr,
fpr_offset,
);
fpr_offset += types::F64X2.bytes() as i32;

if let Some(ref mut cfa_state) = cfa_state.as_mut() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,19 @@ block0(v0: f64, v1: f64, v2: f64, v3: f64):
[-, %xmm7] v5 = fadd v0, v1
return
}
; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64 csr [%xmm6], f64 csr [%xmm7]) -> i64 fp [%rbp], f64 csr [%xmm6], f64 csr [%xmm7] windows_fastcall {
; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
; nextln: ss0 = explicit_slot 32, offset -80
; nextln: ss1 = incoming_arg 16, offset -48
; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64 [%xmm6], v9: f64 [%xmm7]):
; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
; nextln: x86_push v6
; nextln: copy_special %rsp -> %rbp
; nextln: adjust_sp_down_imm 64
; nextln: v7 = stack_addr.i64 ss0
; nextln: store notrap aligned v8, v7
; nextln: store notrap aligned v9, v7+16
; check: v10 = stack_addr.i64 ss0
; nextln: v11 = load.f64 notrap aligned v10
; nextln: v12 = load.f64 notrap aligned v10+16
; nextln: v11 = load.f64x2 notrap aligned v10
; nextln: v12 = load.f64x2 notrap aligned v10+16
; nextln: adjust_sp_up_imm 64
; nextln: v13 = x86_pop.i64
; nextln: v13, v11, v12
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,13 +153,13 @@ block0(v0: i64, v1: i64):
; sameln: UnwindInfo {
; nextln: version: 1,
; nextln: flags: 0,
; nextln: prologue_size: 26,
; nextln: prologue_size: 25,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The astute reader may ask, "why did prologue sizes change if this was supposed to be a no-op cleanup-style change?"

It turns out my F64 causes a whole XMM register to be preserved anyway. comment became a lie between when I wrote it and when #1216 landed. Or maybe I was wrong the whole time? It was generating movsd (for example, f2 41 0f 11 33), where with this change Cranelift generates movups (-> 41 0f 11 33, one byte shorter).

This is why prologues are shorter, but more importantly, movsd only works with the low 64 bits of its xmm argument! movups is the correct instruction (movaps would also be acceptable), which moves the entire 128 bits.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aha! I was about to ask that very question. (Don't know how astute of a reader I am, though.) Thanks for the explanation.

; nextln: unwind_code_count_raw: 7,
; nextln: frame_register: 5,
; nextln: frame_register_offset: 12,
; nextln: unwind_codes: [
; nextln: UnwindCode {
; nextln: offset: 26,
; nextln: offset: 25,
; nextln: op: SaveXmm128,
; nextln: info: 15,
; nextln: value: U16(
Expand Down Expand Up @@ -217,29 +217,29 @@ block0(v0: i64, v1: i64):
; sameln: UnwindInfo {
; nextln: version: 1,
; nextln: flags: 0,
; nextln: prologue_size: 44,
; nextln: prologue_size: 41,
; nextln: unwind_code_count_raw: 16,
; nextln: frame_register: 5,
; nextln: frame_register_offset: 10,
; nextln: unwind_codes: [
; nextln: UnwindCode {
; nextln: offset: 44,
; nextln: offset: 41,
; nextln: op: SaveXmm128,
; nextln: info: 8,
; nextln: value: U16(
; nextln: 2,
; nextln: ),
; nextln: },
; nextln: UnwindCode {
; nextln: offset: 38,
; nextln: offset: 36,
; nextln: op: SaveXmm128,
; nextln: info: 7,
; nextln: value: U16(
; nextln: 1,
; nextln: ),
; nextln: },
; nextln: UnwindCode {
; nextln: offset: 32,
; nextln: offset: 31,
; nextln: op: SaveXmm128,
; nextln: info: 6,
; nextln: value: U16(
Expand Down