Skip to content

Auto-vectorization of pointer indirection expr. and explicit layout #64026

@am11

Description

@am11

Consider two ways of writing the fabs() math function in C:

#include <inttypes.h>

/* Returns v with the top bit (bit 63) of its 64-bit representation cleared,
   using pointer casts to reinterpret the bits. */
double fabs_pointer(double v)
{
    /* Read the storage of v as a uint64_t and keep only the low 63 bits.
       NOTE(review): accessing a double object through a uint64_t lvalue is
       type punning by pointer cast, which ISO C's aliasing rules do not
       permit; it is kept here because this exact form is what is being
       compared. */
    uint64_t u = *(uint64_t*)&v & 0x7FFFFFFFFFFFFFFF;
    /* Reinterpret the masked integer bits as a double again. */
    return *(double*)&u;
}

/* Returns v with the top bit of its integer representation cleared, using a
   union to reinterpret the bits. */
double fabs_union(double v)
{
    /* f and i share the same storage; the initializer sets the first member (f) to v. */
    union { double f; uint64_t i; } u = { v };
    /* -1ULL is the all-ones unsigned long long value; dividing by 2 clears
       its top bit, giving 0x7FFFFFFFFFFFFFFF when unsigned long long is
       64 bits wide (assumed here). */
    u.i &= -1ULL / 2;
    /* Read the masked bits back through the double member. */
    return u.f;
}

Both functions clear the sign bit in the raw bit pattern of the double argument and return the modified value.

Clang with -O2 -march=ivybridge gives identical codegen:

.LCPI0_0:
        .quad   0x7fffffffffffffff              # double NaN
        .quad   0x7fffffffffffffff              # double NaN
fabs_pointer:                          # @fabs_pointer
        vandps  xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
        ret
.LCPI1_0:
        .quad   0x7fffffffffffffff              # double NaN
        .quad   0x7fffffffffffffff              # double NaN
fabs_union:                             # @fabs_union
        vandps  xmm0, xmm0, xmmword ptr [rip + .LCPI1_0]
        ret

We can write the same code in C# in four different forms:

using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

/// <summary>
/// Four ways of clearing bit 63 of a <see cref="double"/>'s 64-bit pattern
/// (mask <c>0x7FFFFFFFFFFFFFFF</c>), each using a different mechanism to
/// reinterpret the bits.
/// </summary>
public class C
{
    /// <summary>
    /// Clears bit 63 of <paramref name="v"/> using the
    /// <see cref="BitConverter"/> bit-reinterpretation helpers.
    /// </summary>
    /// <param name="v">The input value.</param>
    /// <returns>The double whose bit pattern is that of <paramref name="v"/> ANDed with <c>0x7FFFFFFFFFFFFFFF</c>.</returns>
    public static double fabs(double v)
    {
        // Reinterpret the double as its raw 64-bit pattern.
        ulong u = BitConverter.DoubleToUInt64Bits(v);
        // Mask off the top bit and reinterpret the result as a double.
        return BitConverter.UInt64BitsToDouble(u & 0x7FFFFFFFFFFFFFFF);
    }

    /// <summary>
    /// Clears bit 63 of <paramref name="v"/> by reinterpreting the argument's
    /// storage through unsafe pointer casts.
    /// </summary>
    /// <param name="v">The input value; its address is taken, so it is read from memory as a <see cref="ulong"/>.</param>
    /// <returns>The double whose bit pattern is that of <paramref name="v"/> ANDed with <c>0x7FFFFFFFFFFFFFFF</c>.</returns>
    public static unsafe double fabs_pointer(double v)
    {
        // Read the bits of v as a ulong via pointer indirection, then mask off the top bit.
        ulong u = *(ulong*)&v & 0x7FFFFFFFFFFFFFFF;
        // Reinterpret the masked local as a double, again via pointer indirection.
        return *(double*)&u;
    }

    /// <summary>
    /// Clears bit 63 of <paramref name="v"/> with an explicit 128-bit vector AND.
    /// </summary>
    /// <param name="v">The input value, placed in element 0 of a vector.</param>
    /// <returns>Element 0 of the vector AND of <paramref name="v"/> with the mask.</returns>
    /// <remarks>
    /// NOTE(review): there is no <c>Sse2.IsSupported</c> check here; this
    /// presumably is only meant to run where SSE2 is available - confirm.
    /// </remarks>
    public static double fabs_manual_vectorization(double v)
    {
        // Only element 0 matters: v goes into element 0, the mask is
        // broadcast to every element, and ToScalar() reads element 0 back.
        return Sse2.And(
            Vector128.CreateScalarUnsafe(v),
            Vector128.Create(0x7FFFFFFFFFFFFFFF).AsDouble()).ToScalar();
    }
    
    /// <summary>
    /// Explicit-layout struct whose two fields both start at offset 0, so
    /// <see cref="d"/> and <see cref="u"/> overlap the same 8 bytes.
    /// </summary>
    [StructLayout(LayoutKind.Explicit)]
    public struct S
    {
        // Floating-point view of the shared storage.
        [FieldOffset(0)] public double d;
        // Unsigned-integer view of the same storage.
        [FieldOffset(0)] public ulong u;
    }

    /// <summary>
    /// Clears bit 63 of <paramref name="v"/> by writing it to the
    /// <see cref="S.d"/> field and masking the overlapping <see cref="S.u"/> field.
    /// </summary>
    /// <param name="v">The input value.</param>
    /// <returns>The value of <see cref="S.d"/> after <see cref="S.u"/> has been ANDed with <c>0x7FFFFFFFFFFFFFFF</c>.</returns>
    public static double fabs_struct(double v)
    {
        // Store v through the double view of the overlapping fields.
        var s = new S { d = v };
        // Mask the same storage through the integer view.
        s.u &= 0x7FFFFFFFFFFFFFFF;
        // Read the masked bits back through the double view.
        return s.d;
    }
}

However, RyuJIT's codegen varies considerably between them:

; Core CLR 6.0.21.52210 on amd64

C..ctor()
    L0000: ret

C.fabs(Double)
    L0000: vzeroupper
    L0003: vmovq rax, xmm0
    L0008: mov rdx, 0x7fffffffffffffff
    L0012: and rax, rdx
    L0015: vmovq xmm0, rax
    L001a: ret

C.fabs_pointer(Double)
    L0000: vzeroupper
    L0003: vmovsd [rsp+8], xmm0
    L0009: mov rax, 0x7fffffffffffffff
    L0013: and rax, [rsp+8]
    L0018: vmovq xmm0, rax
    L001d: ret

C.fabs_manual_vectorization(Double)
    L0000: vzeroupper
    L0003: vandpd xmm0, xmm0, [0x7ffb39e304b0]
    L000b: ret

C.fabs_struct(Double)
    L0000: sub rsp, 0x18
    L0004: vzeroupper
    L0007: vmovsd [rsp+8], xmm0
    L000d: mov rax, [rsp+8]
    L0012: mov [rsp+0x10], rax
    L0017: lea rax, [rsp+0x10]
    L001c: mov rdx, 0x7fffffffffffffff
    L0026: and [rax], rdx
    L0029: vmovsd xmm0, [rsp+0x10]
    L002f: add rsp, 0x18
    L0033: ret

In an ideal world, the pointer-indirection and explicit-layout variants would produce the same codegen as the manually vectorized variant.

category:cq
theme:vector-codegen
skill-level:expert
cost:small
impact:small

Metadata

Metadata

Assignees

No one assigned

    Labels

    area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions