stage2: Miscellaneous fixes to vector arithmetic and copy elision #13074
andrewrk merged 6 commits into ziglang:master from
Conversation
Could you rebase against the master branch, please?
bb319a1 to 5e4a560
All resolved now
5e4a560 to 6b53162
That Liveness change looks like it might deserve its own PR; I'd like to take a close look at it.
Aye aye, I'll spin one up.
6b53162 to ca5539d
Going to revisit this now that #13221 has landed.
90aae5a to 8ddda8d
Is there an ETA on this? I was surprised these fixes did not go into 0.10.
ee6e525 to acab0a2
I would like to suggest a different way of doing the LLVM changes that looks like this:
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index 8daffaefe..8f3af4c66 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -4555,7 +4555,7 @@ pub const FuncGen = struct {
.fptrunc => try self.airFptrunc(inst),
.fpext => try self.airFpext(inst),
.ptrtoint => try self.airPtrToInt(inst),
- .load => try self.airLoad(inst, body, i + 1),
+ .load => try self.airLoad(body[i..]),
.loop => try self.airLoop(inst),
.not => try self.airNot(inst),
.ret => try self.airRet(inst),
@@ -4614,7 +4614,7 @@ pub const FuncGen = struct {
.atomic_store_seq_cst => try self.airAtomicStore(inst, .SequentiallyConsistent),
.struct_field_ptr => try self.airStructFieldPtr(inst),
- .struct_field_val => try self.airStructFieldVal(inst),
+ .struct_field_val => try self.airStructFieldVal(body[i..]),
.struct_field_ptr_index_0 => try self.airStructFieldPtrIndex(inst, 0),
.struct_field_ptr_index_1 => try self.airStructFieldPtrIndex(inst, 1),
@@ -4623,7 +4623,7 @@ pub const FuncGen = struct {
.field_parent_ptr => try self.airFieldParentPtr(inst),
- .array_elem_val => try self.airArrayElemVal(inst),
+ .array_elem_val => try self.airArrayElemVal(body[i..]),
.slice_elem_val => try self.airSliceElemVal(inst),
.slice_elem_ptr => try self.airSliceElemPtr(inst),
.ptr_elem_val => try self.airPtrElemVal(inst),
@@ -5633,7 +5633,8 @@ pub const FuncGen = struct {
return self.builder.buildInBoundsGEP(llvm_elem_ty, base_ptr, &indices, indices.len, "");
}
- fn airArrayElemVal(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value {
+ fn airArrayElemVal(self: *FuncGen, body_tail: []const Air.Inst.Index) !?*llvm.Value {
+ const inst = body_tail[0];
if (self.liveness.isUnused(inst)) return null;
const bin_op = self.air.instructions.items(.data)[inst].bin_op;
@@ -5646,7 +5647,11 @@ pub const FuncGen = struct {
const elem_ptr = self.builder.buildInBoundsGEP(array_llvm_ty, array_llvm_val, &indices, indices.len, "");
const elem_ty = array_ty.childType();
if (isByRef(elem_ty)) {
- return elem_ptr;
+ if (canElideLoad(self, body_tail)) {
+ return elem_ptr;
+ } else {
+ return self.loadByRef(elem_ptr, elem_ty, 0, false);
+ }
} else {
const elem_llvm_ty = try self.dg.lowerType(elem_ty);
return self.builder.buildLoad(elem_llvm_ty, elem_ptr, "");
@@ -5723,7 +5728,8 @@ pub const FuncGen = struct {
return self.fieldPtr(inst, struct_ptr, struct_ptr_ty, field_index);
}
- fn airStructFieldVal(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value {
+ fn airStructFieldVal(self: *FuncGen, body_tail: []const Air.Inst.Index) !?*llvm.Value {
+ const inst = body_tail[0];
if (self.liveness.isUnused(inst)) return null;
const ty_pl = self.air.instructions.items(.data)[inst].ty_pl;
@@ -5782,7 +5788,15 @@ pub const FuncGen = struct {
const struct_llvm_ty = try self.dg.lowerType(struct_ty);
const field_ptr = self.builder.buildStructGEP(struct_llvm_ty, struct_llvm_val, llvm_field_index, "");
const field_ptr_ty = Type.initPayload(&ptr_ty_buf.base);
- return self.load(field_ptr, field_ptr_ty);
+ if (isByRef(field_ty)) {
+ if (canElideLoad(self, body_tail)) {
+ return field_ptr;
+ } else {
+ return self.loadByRef(field_ptr, field_ty, ptr_ty_buf.data.alignment(target), false);
+ }
+ } else {
+ return self.load(field_ptr, field_ptr_ty);
+ }
},
.Union => {
const union_llvm_ty = try self.dg.lowerType(struct_ty);
@@ -5792,7 +5806,11 @@ pub const FuncGen = struct {
const llvm_field_ty = try self.dg.lowerType(field_ty);
const field_ptr = self.builder.buildBitCast(union_field_ptr, llvm_field_ty.pointerType(0), "");
if (isByRef(field_ty)) {
- return field_ptr;
+ if (canElideLoad(self, body_tail)) {
+ return field_ptr;
+ } else {
+ return self.loadByRef(field_ptr, field_ty, layout.payload_align, false);
+ }
} else {
return self.builder.buildLoad(llvm_field_ty, field_ptr, "");
}
@@ -8030,35 +8048,35 @@ pub const FuncGen = struct {
return null;
}
- fn airLoad(
- self: *FuncGen,
- inst: Air.Inst.Index,
- body: []const Air.Inst.Index,
- body_i: usize,
- ) !?*llvm.Value {
- const ty_op = self.air.instructions.items(.data)[inst].ty_op;
- const ptr_ty = self.air.typeOf(ty_op.operand);
+ /// As an optimization, we want to avoid unnecessary copies of isByRef=true
+ /// types. Here, we scan forward in the current block, looking to see if
+ /// this load dies before any side effects occur. In such case, we can
+ /// safely return the operand without making a copy.
+ fn canElideLoad(fg: *FuncGen, body_tail: []const Air.Inst.Index) bool {
+ for (body_tail) |body_inst| {
+ switch (fg.liveness.categorizeOperand(fg.air, body_inst, body_tail[0])) {
+ .none => continue,
+ .write, .noret, .complex => return false,
+ .tomb => return true,
+ }
+ } else unreachable;
+ }
+
+ fn airLoad(fg: *FuncGen, body_tail: []const Air.Inst.Index) !?*llvm.Value {
+ const inst = body_tail[0];
+ const ty_op = fg.air.instructions.items(.data)[inst].ty_op;
+ const ptr_ty = fg.air.typeOf(ty_op.operand);
+ const ptr_info = ptr_ty.ptrInfo().data;
elide: {
- const ptr_info = ptr_ty.ptrInfo().data;
if (ptr_info.@"volatile") break :elide;
- if (self.liveness.isUnused(inst)) return null;
+ if (fg.liveness.isUnused(inst)) return null;
if (!isByRef(ptr_info.pointee_type)) break :elide;
-
- // It would be valid to fall back to the code below here that simply calls
- // load(). However, as an optimization, we want to avoid unnecessary copies
- // of isByRef=true types. Here, we scan forward in the current block,
- // looking to see if this load dies before any side effects occur.
- // In such case, we can safely return the operand without making a copy.
- for (body[body_i..]) |body_inst| {
- switch (self.liveness.categorizeOperand(self.air, body_inst, inst)) {
- .none => continue,
- .write, .noret, .complex => break :elide,
- .tomb => return try self.resolveInst(ty_op.operand),
- }
- } else unreachable;
+ if (!canElideLoad(fg, body_tail)) break :elide;
+ const ptr = try fg.resolveInst(ty_op.operand);
+ return ptr;
}
- const ptr = try self.resolveInst(ty_op.operand);
- return self.load(ptr, ptr_ty);
+ const ptr = try fg.resolveInst(ty_op.operand);
+ return fg.load(ptr, ptr_ty);
}
fn airBreakpoint(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value {
@@ -9522,6 +9540,32 @@ pub const FuncGen = struct {
return self.llvmModule().getIntrinsicDeclaration(id, types.ptr, types.len);
}
+ fn loadByRef(
+ fg: *FuncGen,
+ ptr: *llvm.Value,
+ pointee_type: Type,
+ ptr_alignment: u32,
+ is_volatile: bool,
+ ) !?*llvm.Value {
+ const pointee_llvm_ty = try fg.dg.lowerType(pointee_type);
+ const target = fg.dg.module.getTarget();
+ const result_align = pointee_type.abiAlignment(target);
+ const max_align = @max(result_align, ptr_alignment);
+ const result_ptr = fg.buildAlloca(pointee_llvm_ty, max_align);
+ const llvm_ptr_u8 = fg.context.intType(8).pointerType(0);
+ const llvm_usize = fg.context.intType(Type.usize.intInfo(target).bits);
+ const size_bytes = pointee_type.abiSize(target);
+ _ = fg.builder.buildMemCpy(
+ fg.builder.buildBitCast(result_ptr, llvm_ptr_u8, ""),
+ max_align,
+ fg.builder.buildBitCast(ptr, llvm_ptr_u8, ""),
+ max_align,
+ llvm_usize.constInt(size_bytes, .False),
+ is_volatile,
+ );
+ return result_ptr;
+ }
+
/// This function always performs a copy. For isByRef=true types, it creates a new
/// alloca and copies the value into it, then returns the alloca instruction.
/// For isByRef=false types, it creates a load instruction and returns it.
@@ -9533,24 +9577,10 @@ pub const FuncGen = struct {
const ptr_alignment = info.alignment(target);
const ptr_volatile = llvm.Bool.fromBool(ptr_ty.isVolatilePtr());
if (info.host_size == 0) {
- const elem_llvm_ty = try self.dg.lowerType(info.pointee_type);
if (isByRef(info.pointee_type)) {
- const result_align = info.pointee_type.abiAlignment(target);
- const max_align = @max(result_align, ptr_alignment);
- const result_ptr = self.buildAlloca(elem_llvm_ty, max_align);
- const llvm_ptr_u8 = self.context.intType(8).pointerType(0);
- const llvm_usize = self.context.intType(Type.usize.intInfo(target).bits);
- const size_bytes = info.pointee_type.abiSize(target);
- _ = self.builder.buildMemCpy(
- self.builder.buildBitCast(result_ptr, llvm_ptr_u8, ""),
- max_align,
- self.builder.buildBitCast(ptr, llvm_ptr_u8, ""),
- max_align,
- llvm_usize.constInt(size_bytes, .False),
- info.@"volatile",
- );
- return result_ptr;
+ return self.loadByRef(ptr, info.pointee_type, ptr_alignment, info.@"volatile");
}
+ const elem_llvm_ty = try self.dg.lowerType(info.pointee_type);
const llvm_inst = self.builder.buildLoad(elem_llvm_ty, ptr, "");
llvm_inst.setAlignment(ptr_alignment);
llvm_inst.setVolatile(ptr_volatile);

The end result of LLVM IR between your branch and mine looks like this:
var x: [10][10]u32 = undefined;
x[0][1] = 0;
const a = x[0];
x[0][1] = 15;
try expect(a[1] == 0);

; Function Attrs: nounwind
define internal fastcc i16 @test.doTheTest() unnamed_addr #0 {
Entry:
- %0 = alloca [10 x [10 x i32]], align 4
+ %0 = alloca [10 x i32], align 4
%1 = alloca [10 x [10 x i32]], align 4
call void @llvm.memset.p0.i64(ptr align 4 %1, i8 -86, i64 400, i1 false)
%2 = getelementptr inbounds [10 x [10 x i32]], ptr %1, i32 0, i64 0
%3 = getelementptr inbounds [10 x i32], ptr %2, i32 0, i64 1
store i32 0, ptr %3, align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %0, ptr align 4 %1, i64 400, i1 false)
- %4 = getelementptr inbounds [10 x [10 x i32]], ptr %0, i32 0, i64 0
+ %4 = getelementptr inbounds [10 x [10 x i32]], ptr %1, i32 0, i64 0
+ call void @llvm.memcpy.p0.p0.i64(ptr align 4 %0, ptr align 4 %4, i64 40, i1 false)
%5 = getelementptr inbounds [10 x [10 x i32]], ptr %1, i32 0, i64 0
%6 = getelementptr inbounds [10 x i32], ptr %5, i32 0, i64 1
store i32 15, ptr %6, align 4
- %7 = getelementptr inbounds [10 x i32], ptr %4, i32 0, i64 1
+ %7 = getelementptr inbounds [10 x i32], ptr %0, i32 0, i64 1
%8 = load i32, ptr %7, align 4
%9 = icmp eq i32 %8, 0
%10 = call fastcc i16 @testing.expect(i1 %9)

This LLVM IR passes the respective test while copying 40 bytes instead of 400 bytes.
In my opinion, these changes to llvm.zig are easier to maintain, since they do not add messy logic to the load instruction but instead operate on the observation that array_elem_val is itself a load instruction (as are slice_elem_val, ptr_elem_val, and struct_field_val; these will need to be updated with similar logic).
I like that approach! Seems like a nice improvement. My sis got hitched last weekend 🎉, so I haven't gotten to fix this one up, but I should have it ready for review in a day or two.
This is a follow-up to 9dc98fb, which made comptime initialization patterns for union/struct more robust, especially when storing to comptime-known pointers (and globals). Resolves ziglang#13063.
This change makes any of the `*_val` instructions check whether it's safe to elide copies for by-ref types rather than performing this elision blindly.

AIR instructions fixed:
- .array_elem_val
- .struct_field_val
- .unwrap_errunion_payload
- .try
- .optional_payload

These now all respect value semantics, as expected.

P.S. Thanks to Andrew for the new way to approach this. Many of the lines here are from his recommended change, which comes with the significant advantage that loads are now as small as the intervening memory access allows.

Co-authored-by: Andrew Kelley <andrew@ziglang.org>
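(For illustration: the nested-array case from the LLVM IR comparison above, wrapped as a self-contained Zig test. This is a sketch of the value semantics being restored, not a test lifted from the suite.)

const std = @import("std");
const expect = std.testing.expect;

test "array_elem_val has value semantics for by-ref element types" {
    var x: [10][10]u32 = undefined;
    x[0][1] = 0;
    const a = x[0]; // must copy the [10]u32 row, not alias it
    x[0][1] = 15;   // a later write to the source must not show through `a`
    try expect(a[1] == 0);
}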
Adds optimizations for by-ref types to:
- .struct_field_val
- .slice_elem_val
- .ptr_elem_val

I would have expected LLVM to be able to optimize away these temporaries since we don't leak pointers to them and they are fed straight from def to use, but empirically it does not.

Resolves ziglang#12713
Resolves ziglang#12638
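(Similarly, a hedged userland sketch of the by-ref cases listed above; the struct `S` and the test body are hypothetical, not taken from the referenced issues.)

const std = @import("std");
const expect = std.testing.expect;

test "struct_field_val and slice_elem_val copy by-ref values" {
    const S = struct { arr: [10]u32 };
    var s = S{ .arr = [_]u32{0} ** 10 };
    const a = s.arr; // loading the field must copy the whole [10]u32
    s.arr[1] = 15;
    try expect(a[1] == 0);

    var backing = [_][10]u32{[_]u32{0} ** 10} ** 3;
    const slice: [][10]u32 = &backing;
    const b = slice[0]; // by-ref element loaded through a slice: still a copy
    slice[0][1] = 15;
    try expect(b[1] == 0);
}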
This bug is already resolved, just want to make sure we don't lose the test case. Closes ziglang#12043
These functions have a very error-prone API. They are essentially `all(cmp(op, ...))` but that's not reflected in the name. This renames these functions to `compareAllAgainstZero...` etc. for clarity and fixes >20 locations where the predicate was incorrect. In the future, the scalar `compare` should probably be split off from the vector comparison. Rank-polymorphic programming is great, but a proper implementation in Zig would decouple comparison and reduction, which then needs a way to fuse ops at comptime.
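(As a userland analogy of the `all(cmp(op, ...))` shape described above; this uses Zig vector builtins rather than the compiler-internal `Value` helpers, so treat it as a sketch.)

const std = @import("std");

test "an all-of comparison is a reduction over an element-wise predicate" {
    const a: @Vector(4, i32) = .{ 1, 2, 3, 4 };
    const b: @Vector(4, i32) = .{ 1, 2, 0, 4 };
    const eq = a == b;                // element-wise predicate: { true, true, false, true }
    const all_eq = @reduce(.And, eq); // the "all" part, collapsed to a single bool
    try std.testing.expect(!all_eq);
    try std.testing.expect(eq[0] and !eq[2]);
}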
acab0a2 to 7b978bf
andrewrk left a comment
Looks great! I think we will have some happy users when this lands.
Thanks @topolarity!
stage2: Miscellaneous fixes to vector arithmetic and copy elision
- Renames `compare` (and family) to `compareAll` (etc.), and fixes lots of places where the wrong predicate for vectors was used
- [Liveness] Add a peephole optimization for safety checks to Liveness, for copy/load elision
- Fixes value semantics for `array_elem_val`, etc. which do not make copies

P.S. As @Vexu pointed out to me, we still need a way to ensure that if, e.g., `foo.arr[x][y].z` performs a copy, it only copies the target object rather than all of `foo`. This is (usually) handled correctly already when the result is a non-by-ref type, but it needs some tightening, esp. for by-ref types.

Resolves #13064, Resolves #13065, Resolves #13069 + others mentioned in the commits
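(Regarding the P.S. above: a minimal sketch of the remaining concern, with hypothetical types `Inner`/`Foo` and a helper `getZ` invented purely for illustration.)

const std = @import("std");

const Inner = struct { z: [64]u8, pad: [1024]u8 };
const Foo = struct { arr: [4][4]Inner };

fn getZ(foo: *const Foo, x: usize, y: usize) [64]u8 {
    // Ideally this lowers to a copy of the 64-byte `z` field only,
    // not a temporary copy of the whole `Inner` or of all of `foo`.
    return foo.arr[x][y].z;
}

test "only the target object should be copied" {
    var foo: Foo = undefined;
    foo.arr[1][2].z[0] = 42;
    try std.testing.expect(getZ(&foo, 1, 2)[0] == 42);
}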