-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Open
Labels
area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Milestone
Description
Consider two ways of writing the fabs() math function in C:
#include <inttypes.h>
/* Absolute value by pointer reinterpretation: reads the bits of v as a
 * uint64_t, masks them with 0x7FFFFFFFFFFFFFFF (every bit except bit 63),
 * and returns the masked bits reinterpreted as a double.
 * NOTE: reading a double object through a uint64_t lvalue is type punning
 * that C's effective-type (strict aliasing) rules do not permit; it is shown
 * here only for the codegen comparison. */
double fabs_pointer(double v)
{
uint64_t u = *(uint64_t*)&v & 0x7FFFFFFFFFFFFFFF;
return *(double*)&u;
}
/* Absolute value through a union: stores v in the double member, masks the
 * overlapping uint64_t member, and reads the double member back. */
double fabs_union(double v)
{
union { double f; uint64_t i; } u = { v };
/* -1ULL / 2 is ULLONG_MAX / 2, i.e. 0x7FFFFFFFFFFFFFFF when unsigned long long is 64 bits wide. */
u.i &= -1ULL / 2;
return u.f;
}
This code modifies the raw bits of the double input in place and returns the modified value.
Clang with -O2 -march=ivybridge gives identical codegen:
.LCPI0_0:
.quad 0x7fffffffffffffff # double NaN
.quad 0x7fffffffffffffff # double NaN
fabs_pointer: # @fabs_pointer
vandps xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
ret
.LCPI1_0:
.quad 0x7fffffffffffffff # double NaN
.quad 0x7fffffffffffffff # double NaN
fabs_union: # @fabs_union
vandps xmm0, xmm0, xmmword ptr [rip + .LCPI1_0]
ret
We can write the same code in C# in four different forms:
using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
public class C
{
// Form 1: BitConverter. Converts v to its 64-bit pattern, masks it with
// 0x7FFFFFFFFFFFFFFF (every bit except bit 63), and converts the bits back to a double.
public static double fabs(double v)
{
ulong u = BitConverter.DoubleToUInt64Bits(v);
return BitConverter.UInt64BitsToDouble(u & 0x7FFFFFFFFFFFFFFF);
}
// Form 2: unsafe pointer reinterpretation. Reads the storage of v as a ulong,
// applies the same mask, and returns the storage of the result read as a double.
public static unsafe double fabs_pointer(double v)
{
ulong u = *(ulong*)&v & 0x7FFFFFFFFFFFFFFF;
return *(double*)&u;
}
// Form 3: explicit SSE2 intrinsics. Puts v in the lowest element of a
// Vector128<double>, ANDs it with a mask vector built from 0x7FFFFFFFFFFFFFFF
// (reinterpreted as double), and returns the lowest element of the result.
public static double fabs_manual_vectorization(double v)
{
return Sse2.And(
// CreateScalarUnsafe sets only element 0; the result is read back with ToScalar(), which returns element 0.
Vector128.CreateScalarUnsafe(v),
Vector128.Create(0x7FFFFFFFFFFFFFFF).AsDouble()).ToScalar();
}
// Explicit-layout struct used as a union: both fields are placed at offset 0,
// so d and u occupy the same 8 bytes.
[StructLayout(LayoutKind.Explicit)]
public struct S
{
[FieldOffset(0)] public double d;
[FieldOffset(0)] public ulong u;
}
// Form 4: explicit-layout struct. Writes v through S.d, applies the mask
// through the overlapping S.u, and reads the result back through S.d.
public static double fabs_struct(double v)
{
var s = new S { d = v };
s.u &= 0x7FFFFFFFFFFFFFFF;
return s.d;
}
}
But RyuJIT's codegen varies a lot:
; Core CLR 6.0.21.52210 on amd64
C..ctor()
L0000: ret
C.fabs(Double)
L0000: vzeroupper
L0003: vmovq rax, xmm0
L0008: mov rdx, 0x7fffffffffffffff
L0012: and rax, rdx
L0015: vmovq xmm0, rax
L001a: ret
C.fabs_pointer(Double)
L0000: vzeroupper
L0003: vmovsd [rsp+8], xmm0
L0009: mov rax, 0x7fffffffffffffff
L0013: and rax, [rsp+8]
L0018: vmovq xmm0, rax
L001d: ret
C.fabs_manual_vectorization(Double)
L0000: vzeroupper
L0003: vandpd xmm0, xmm0, [0x7ffb39e304b0]
L000b: ret
C.fabs_struct(Double)
L0000: sub rsp, 0x18
L0004: vzeroupper
L0007: vmovsd [rsp+8], xmm0
L000d: mov rax, [rsp+8]
L0012: mov [rsp+0x10], rax
L0017: lea rax, [rsp+0x10]
L001c: mov rdx, 0x7fffffffffffffff
L0026: and [rax], rdx
L0029: vmovsd xmm0, [rsp+0x10]
L002f: add rsp, 0x18
L0033: retIn the ideal world, pointer indirection and explicit layout would give the same codegen as the manually vectorized variant.
category:cq
theme:vector-codegen
skill-level:expert
cost:small
impact:small
kasperk81 and BoyBaykillertanveerbadar
Metadata
Metadata
Assignees
Labels
area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI