Skip to content

Constant vectors propagated incorrectly #93698

@saucecontrol

Description

@saucecontrol

Description

I'm having multiple issues with JIT propagating constant vectors, even when the constants are manually hoisted out of a hot loop. See #76781 (comment) for a recent example of perf regression.

In attempting to work around the regression, I discovered a correctness bug, wherein JIT propagates a constant vector with the incorrect value.

Take the following SIMD method, with constant vectors manually hoisted out of the loop:

/// <summary>
/// Collapses each group of four consecutive input floats into a single weighted sum,
/// processing 8 groups (32 input floats, 8 output floats) per loop iteration.
/// Presumably the groups are B, G, R, (unused) pixel components and the result is a grey value,
/// judging by the variable names and weights.
/// </summary>
/// <param name="pin">Source buffer; read 32 floats at a time with unaligned loads.</param>
/// <param name="pout">Destination buffer; receives 8 floats per 32 floats read.</param>
/// <param name="cnt">Number of floats to read from <paramref name="pin"/>.</param>
/// <remarks>
/// NOTE(review): there is no scalar tail, so <paramref name="cnt"/> must be a multiple of 32;
/// otherwise the last iteration reads and writes past the requested range.
/// Uses AVX, AVX2 and FMA intrinsics without an IsSupported check.
/// </remarks>
private static void convertGrey(float* pin, float* pout, nint cnt)
{
    // Loop-invariant constants, deliberately created once outside the loop.
    var vlumR = Vector256.Create(.299f);
    var vlumG = Vector256.Create(.587f);
    var vlumB = Vector256.Create(.114f);
    // Index vector that undoes the element interleaving left behind by the unpack steps below.
    var vperm = Vector256.Create(0, 4, 1, 5, 2, 6, 3, 7);

    float* ip = pin, ipe = pin + cnt, op = pout;
    while (ip < ipe)
    {
        // Four vectors of 8 floats = 8 groups of 4 floats.
        var v0 = Avx.LoadVector256(ip);
        var v1 = Avx.LoadVector256(ip + Vector256<float>.Count);
        var v2 = Avx.LoadVector256(ip + Vector256<float>.Count * 2);
        var v3 = Avx.LoadVector256(ip + Vector256<float>.Count * 3);
        ip += Vector256<float>.Count * 4;

        // First unpack stage: interleave 32-bit elements, then view pairs of them as 64-bit elements.
        var vl0 = Avx.UnpackLow(v0, v1).AsDouble();
        var vh0 = Avx.UnpackHigh(v0, v1).AsDouble();
        var vl1 = Avx.UnpackLow(v2, v3).AsDouble();
        var vh1 = Avx.UnpackHigh(v2, v3).AsDouble();

        // Second unpack stage: each result holds one component (index 0, 1 or 2 of every group)
        // for all 8 groups, in group order 0,2,4,6,1,3,5,7. The component at index 3 is never extracted.
        var vb = Avx.UnpackLow(vl0, vl1).AsSingle();
        var vg = Avx.UnpackHigh(vl0, vl1).AsSingle();
        var vr = Avx.UnpackLow(vh0, vh1).AsSingle();

        // vb = vb * vlumB + vg * vlumG + vr * vlumR
        vb = Avx.Multiply(vb, vlumB);
        vb = Fma.MultiplyAdd(vg, vlumG, vb);
        vb = Fma.MultiplyAdd(vr, vlumR, vb);

        // Restore sequential group order (0..7) before storing.
        vb = Avx2.PermuteVar8x32(vb, vperm);

        Avx.Store(op, vb);
        op += Vector256<float>.Count;
    }
}

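net6.0 codegen for this method is as expected: all four constant vectors are loaded into registers once, before the loop (IG02), and the loop body (IG03) uses only register operands for them.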

x64 asm
; Assembly listing for method GreyBench:convertGrey(long,long,long)
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; No PGO data

G_M21790_IG01:              ;; offset=0000H
       4883EC48             sub      rsp, 72
       C5F877               vzeroupper
       C5F829742430         vmovaps  qword ptr [rsp+30H], xmm6
       C5F8297C2420         vmovaps  qword ptr [rsp+20H], xmm7
       C57829442410         vmovaps  qword ptr [rsp+10H], xmm8
       C578290C24           vmovaps  qword ptr [rsp], xmm9
                                                ;; bbWeight=1    PerfScore 13.25
G_M21790_IG02:              ;; offset=001EH
       C5FD1005DA000000     vmovupd  ymm0, ymmword ptr[reloc @RWD00]
       C5FD100DF2000000     vmovupd  ymm1, ymmword ptr[reloc @RWD32]
       C5FD10150A010000     vmovupd  ymm2, ymmword ptr[reloc @RWD64]
       C5FD101D22010000     vmovupd  ymm3, ymmword ptr[reloc @RWD96]
       4A8D0481             lea      rax, [rcx+4*r8]
       483BC8               cmp      rcx, rax
       7371                 jae      SHORT G_M21790_IG04
                            align    [0 bytes]
                                                ;; bbWeight=1    PerfScore 13.75
G_M21790_IG03:              ;; offset=0047H
       C5FC1021             vmovups  ymm4, ymmword ptr[rcx]
       C5FC106920           vmovups  ymm5, ymmword ptr[rcx+32]
       C5FC107140           vmovups  ymm6, ymmword ptr[rcx+64]
       C5FC107960           vmovups  ymm7, ymmword ptr[rcx+96]
       4881C180000000       add      rcx, 128
       C55C14C5             vunpcklps ymm8, ymm4, ymm5
       C54C14CF             vunpcklps ymm9, ymm6, ymm7
       C5CC15F7             vunpckhps ymm6, ymm6, ymm7
       C4C13D14F9           vunpcklpd ymm7, ymm8, ymm9
       C4413D15C1           vunpckhpd ymm8, ymm8, ymm9
       C5C459FA             vmulps   ymm7, ymm7, ymm2
       C46275A8C7           vfmadd213ps ymm8, ymm1, ymm7
       C4C17C28F8           vmovaps  ymm7, ymm8
       C5DC15E5             vunpckhps ymm4, ymm4, ymm5
       C5DD14E6             vunpcklpd ymm4, ymm4, ymm6
       C4E27DA8E7           vfmadd213ps ymm4, ymm0, ymm7
       C5FC28FC             vmovaps  ymm7, ymm4
       C4E26516FF           vpermps  ymm7, ymm3, ymm7
       C5FC113A             vmovups  ymmword ptr[rdx], ymm7
       4883C220             add      rdx, 32
       483BC8               cmp      rcx, rax
       729F                 jb       SHORT G_M21790_IG03
                                                ;; bbWeight=4    PerfScore 157.00
G_M21790_IG04:              ;; offset=00A8H
       C5F828742430         vmovaps  xmm6, qword ptr [rsp+30H]
       C5F8287C2420         vmovaps  xmm7, qword ptr [rsp+20H]
       C57828442410         vmovaps  xmm8, qword ptr [rsp+10H]
       C578280C24           vmovaps  xmm9, qword ptr [rsp]
       C5F877               vzeroupper
       4883C448             add      rsp, 72
       C3                   ret
                                                ;; bbWeight=1    PerfScore 18.25
RWD00   dq      3E9916873E991687h, 3E9916873E991687h, 3E9916873E991687h, 3E9916873E991687h
RWD32   dq      3F1645A23F1645A2h, 3F1645A23F1645A2h, 3F1645A23F1645A2h, 3F1645A23F1645A2h
RWD64   dq      3DE978D53DE978D5h, 3DE978D53DE978D5h, 3DE978D53DE978D5h, 3DE978D53DE978D5h
RWD96   dq      0000000400000000h, 0000000500000001h, 0000000600000002h, 0000000700000003h


; Total bytes of code 199

net8.0 propagates the constants into the loop body, encoding the three multiplier constants as memory operands and reloading the permute-index vector from memory on every iteration

x64 asm
; Assembly listing for method GreyBench:convertGrey(ulong,ulong,long) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; FullOpts code
; optimized code
; rsp based frame
; fully interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       C5F877               vzeroupper

G_M000_IG02:                ;; offset=0x0003
       4A8D0481             lea      rax, [rcx+4*r8]
       483BC8               cmp      rcx, rax
       736C                 jae      SHORT G_M000_IG04
                            align    [0 bytes for IG03]

G_M000_IG03:                ;; offset=0x000C
       C5FC1001             vmovups  ymm0, ymmword ptr [rcx]
       C5FC104920           vmovups  ymm1, ymmword ptr [rcx+0x20]
       C5FC105140           vmovups  ymm2, ymmword ptr [rcx+0x40]
       C5FC105960           vmovups  ymm3, ymmword ptr [rcx+0x60]
       4881C180000000       add      rcx, 128
       C5FC14E1             vunpcklps ymm4, ymm0, ymm1
       C5EC14EB             vunpcklps ymm5, ymm2, ymm3
       C5FC15C1             vunpckhps ymm0, ymm0, ymm1
       C5EC15CB             vunpckhps ymm1, ymm2, ymm3
       C5FD14C1             vunpcklpd ymm0, ymm0, ymm1
       C5DD15CD             vunpckhpd ymm1, ymm4, ymm5
       C5DD14D5             vunpcklpd ymm2, ymm4, ymm5
       62F16C38591534000000 vmulps   ymm2, ymm2, dword ptr [reloc @RWD00] {1to8}
       C4E26D980D4B000000   vfmadd132ps ymm1, ymm2, ymmword ptr [reloc @RWD32]
       C4E275980562000000   vfmadd132ps ymm0, ymm1, ymmword ptr [reloc @RWD64]
       C5FC100D7A000000     vmovups  ymm1, ymmword ptr [reloc @RWD96]
       C4E27516C0           vpermps  ymm0, ymm1, ymm0
       C5FC1102             vmovups  ymmword ptr [rdx], ymm0
       4883C220             add      rdx, 32
       483BC8               cmp      rcx, rax
       7294                 jb       SHORT G_M000_IG03

G_M000_IG04:                ;; offset=0x0078
       C5F877               vzeroupper
       C3                   ret

RWD00   dd      3DE978D5h               ;     0.114
RWD04   dd      00000000h, 00000000h, 00000000h, 00000000h, 00000000h, 00000000h
        dd      00000000h
RWD32   dq      3F1645A23F1645A2h, 3F1645A23F1645A2h, 3F1645A23F1645A2h, 3F1645A23F1645A2h
RWD64   dq      3E9916873E991687h, 3E9916873E991687h, 3E9916873E991687h, 3E9916873E991687h
RWD96   dq      0000000400000000h, 0000000500000001h, 0000000600000002h, 0000000700000003h

; Total bytes of code 124

So I decide to try to outsmart the JIT, making it lose track of the fact the constants are actually constant. Here's the method updated with some "manipulation" of the constant values, which should result in the values being unchanged but maybe not recognized as constants:

/// <summary>
/// Same computation as convertGrey, but the three multiplier constants are routed through
/// an extra shift/OR step in an attempt to stop the JIT from treating them as constants.
/// Collapses each group of four consecutive input floats into a single weighted sum,
/// processing 8 groups (32 input floats, 8 output floats) per loop iteration.
/// </summary>
/// <param name="pin">Source buffer; read 32 floats at a time with unaligned loads.</param>
/// <param name="pout">Destination buffer; receives 8 floats per 32 floats read.</param>
/// <param name="cnt">Number of floats to read from <paramref name="pin"/>.</param>
/// <remarks>
/// NOTE(review): there is no scalar tail, so <paramref name="cnt"/> must be a multiple of 32.
/// Do not simplify the constant setup below: its exact shape is what triggers the reported
/// miscompilation.
/// </remarks>
private static void convertGreyUnconst(float* pin, float* pout, nint cnt)
{
    // Intended to be all zeros: every 32-bit element of an all-ones vector is logically
    // shifted right by its full width. OR-ing it into a value should leave that value unchanged.
    var vzero = Avx2.ShiftRightLogical(Vector256<int>.AllBitsSet, 32).AsSingle();
    // Multiplier constants, each OR-ed with vzero so the values stay the same while the
    // expressions are no longer plain Vector256.Create calls.
    var vlumR = Avx.Or(Vector256.Create(.299f), vzero);
    var vlumG = Avx.Or(Vector256.Create(.587f), vzero);
    var vlumB = Avx.Or(Vector256.Create(.114f), vzero);
    // Index vector that undoes the element interleaving left behind by the unpack steps below.
    var vperm = Vector256.Create(0, 4, 1, 5, 2, 6, 3, 7);

    float* ip = pin, ipe = pin + cnt, op = pout;
    while (ip < ipe)
    {
        // Four vectors of 8 floats = 8 groups of 4 floats.
        var v0 = Avx.LoadVector256(ip);
        var v1 = Avx.LoadVector256(ip + Vector256<float>.Count);
        var v2 = Avx.LoadVector256(ip + Vector256<float>.Count * 2);
        var v3 = Avx.LoadVector256(ip + Vector256<float>.Count * 3);
        ip += Vector256<float>.Count * 4;

        // First unpack stage: interleave 32-bit elements, then view pairs of them as 64-bit elements.
        var vl0 = Avx.UnpackLow(v0, v1).AsDouble();
        var vh0 = Avx.UnpackHigh(v0, v1).AsDouble();
        var vl1 = Avx.UnpackLow(v2, v3).AsDouble();
        var vh1 = Avx.UnpackHigh(v2, v3).AsDouble();

        // Second unpack stage: each result holds one component (index 0, 1 or 2 of every group)
        // for all 8 groups, in group order 0,2,4,6,1,3,5,7. The component at index 3 is never extracted.
        var vb = Avx.UnpackLow(vl0, vl1).AsSingle();
        var vg = Avx.UnpackHigh(vl0, vl1).AsSingle();
        var vr = Avx.UnpackLow(vh0, vh1).AsSingle();

        // vb = vb * vlumB + vg * vlumG + vr * vlumR
        vb = Avx.Multiply(vb, vlumB);
        vb = Fma.MultiplyAdd(vg, vlumG, vb);
        vb = Fma.MultiplyAdd(vr, vlumR, vb);

        // Restore sequential group order (0..7) before storing.
        vb = Avx2.PermuteVar8x32(vb, vperm);

        Avx.Store(op, vb);
        op += Vector256<float>.Count;
    }
}

Again, this has the expected codegen on net6.0, but on net8.0, not only does it still attempt to propagate the constants, it propagates them as AllBitsSet (or NaN in floating point), resulting in incorrect codegen.

net8.0 codegen:

; Assembly listing for method GreyBench:convertGreyUnconst(ulong,ulong,long) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; FullOpts code
; optimized code
; rsp based frame
; fully interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       C5F877               vzeroupper

G_M000_IG02:                ;; offset=0x0003
       4A8D0481             lea      rax, [rcx+4*r8]
       483BC8               cmp      rcx, rax
       736A                 jae      SHORT G_M000_IG04
                            align    [0 bytes for IG03]

G_M000_IG03:                ;; offset=0x000C
       C5FC1001             vmovups  ymm0, ymmword ptr [rcx]
       C5FC104920           vmovups  ymm1, ymmword ptr [rcx+0x20]
       C5FC105140           vmovups  ymm2, ymmword ptr [rcx+0x40]
       C5FC105960           vmovups  ymm3, ymmword ptr [rcx+0x60]
       4881C180000000       add      rcx, 128
       C5FC14E1             vunpcklps ymm4, ymm0, ymm1
       C5EC14EB             vunpcklps ymm5, ymm2, ymm3
       C5FC15C1             vunpckhps ymm0, ymm0, ymm1
       C5EC15CB             vunpckhps ymm1, ymm2, ymm3
       C5FD14C1             vunpcklpd ymm0, ymm0, ymm1
       C5F576C9             vpcmpeqd ymm1, ymm1, ymm1
       C5DD15D5             vunpckhpd ymm2, ymm4, ymm5
       C5E576DB             vpcmpeqd ymm3, ymm3, ymm3
       C5DD14E5             vunpcklpd ymm4, ymm4, ymm5
       C5D576ED             vpcmpeqd ymm5, ymm5, ymm5
       C5DC59E5             vmulps   ymm4, ymm4, ymm5
       C4E265A8D4           vfmadd213ps ymm2, ymm3, ymm4
       C4E275A8C2           vfmadd213ps ymm0, ymm1, ymm2
       C5FC100D1C000000     vmovups  ymm1, ymmword ptr [reloc @RWD00]
       C4E27516C0           vpermps  ymm0, ymm1, ymm0
       C5FC1102             vmovups  ymmword ptr [rdx], ymm0
       4883C220             add      rdx, 32
       483BC8               cmp      rcx, rax
       7296                 jb       SHORT G_M000_IG03

G_M000_IG04:                ;; offset=0x0076
       C5F877               vzeroupper
       C3                   ret

RWD00   dq      0000000400000000h, 0000000500000001h, 0000000600000002h, 0000000700000003h

; Total bytes of code 122

Reproduction Steps

Full sample program:

<!-- Console repro project, built for both frameworks so their output can be compared. -->
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <!-- Select one at run time with: dotnet run -f net6.0 / -f net8.0 -->
    <TargetFrameworks>net6.0;net8.0</TargetFrameworks>
    <Nullable>enable</Nullable>
    <!-- Required: the program is an unsafe class that works on raw float* buffers. -->
    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
  </PropertyGroup>

</Project>
using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

/// <summary>
/// Self-contained repro: runs the same weighted-sum kernel twice over identical input,
/// once with plain constant vectors and once with "disguised" constants, and prints the
/// first 8 results of each run so the two lines can be compared.
/// </summary>
public static unsafe class Program
{
    /// <summary>
    /// Allocates the buffers, fills the input with seeded pseudo-random floats,
    /// runs both kernels and prints the first output vector of each.
    /// </summary>
    public static void Main(string[] args)
    {
        const int length = 256;
        // Input size in bytes.
        const nuint cb = (uint)length * sizeof(float);
        var memin = (float*)NativeMemory.AlignedAlloc(cb, 64);
        // The kernels write 8 floats for every 32 floats read, hence a quarter of the input size.
        var memout = (float*)NativeMemory.AlignedAlloc(cb / 4, 64);

        // Fixed seed so every run (and every target framework) sees the same input.
        var rand = new Random(42);
        for (int i = 0; i < length; i++)
            memin[i] = rand.NextSingle();

        convertGrey(memin, memout, length);
        Console.WriteLine(Avx.LoadVector256(memout).ToString());

        // Expected to print the same values as the line above.
        convertGreyUnconst(memin, memout, length);
        Console.WriteLine(Avx.LoadVector256(memout).ToString());

        NativeMemory.AlignedFree(memin);
        NativeMemory.AlignedFree(memout);
    }

    /// <summary>
    /// Collapses each group of four consecutive input floats into a single weighted sum,
    /// processing 8 groups (32 input floats, 8 output floats) per loop iteration.
    /// </summary>
    /// <param name="pin">Source buffer; read 32 floats at a time with unaligned loads.</param>
    /// <param name="pout">Destination buffer; receives 8 floats per 32 floats read.</param>
    /// <param name="cnt">Number of floats to read; NOTE(review): must be a multiple of 32, there is no scalar tail.</param>
    private static void convertGrey(float* pin, float* pout, nint cnt)
    {
        // Loop-invariant constants, deliberately created once outside the loop.
        var vlumR = Vector256.Create(.299f);
        var vlumG = Vector256.Create(.587f);
        var vlumB = Vector256.Create(.114f);
        // Index vector that undoes the element interleaving left behind by the unpack steps below.
        var vperm = Vector256.Create(0, 4, 1, 5, 2, 6, 3, 7);

        float* ip = pin, ipe = pin + cnt, op = pout;
        while (ip < ipe)
        {
            // Four vectors of 8 floats = 8 groups of 4 floats.
            var v0 = Avx.LoadVector256(ip);
            var v1 = Avx.LoadVector256(ip + Vector256<float>.Count);
            var v2 = Avx.LoadVector256(ip + Vector256<float>.Count * 2);
            var v3 = Avx.LoadVector256(ip + Vector256<float>.Count * 3);
            ip += Vector256<float>.Count * 4;

            // First unpack stage: interleave 32-bit elements, then view pairs of them as 64-bit elements.
            var vl0 = Avx.UnpackLow(v0, v1).AsDouble();
            var vh0 = Avx.UnpackHigh(v0, v1).AsDouble();
            var vl1 = Avx.UnpackLow(v2, v3).AsDouble();
            var vh1 = Avx.UnpackHigh(v2, v3).AsDouble();

            // Second unpack stage: one component (index 0, 1 or 2 of every group) per vector,
            // in group order 0,2,4,6,1,3,5,7. The component at index 3 is never extracted.
            var vb = Avx.UnpackLow(vl0, vl1).AsSingle();
            var vg = Avx.UnpackHigh(vl0, vl1).AsSingle();
            var vr = Avx.UnpackLow(vh0, vh1).AsSingle();

            // vb = vb * vlumB + vg * vlumG + vr * vlumR
            vb = Avx.Multiply(vb, vlumB);
            vb = Fma.MultiplyAdd(vg, vlumG, vb);
            vb = Fma.MultiplyAdd(vr, vlumR, vb);

            // Restore sequential group order (0..7) before storing.
            vb = Avx2.PermuteVar8x32(vb, vperm);

            Avx.Store(op, vb);
            op += Vector256<float>.Count;
        }
    }

    /// <summary>
    /// Same computation as <c>convertGrey</c>, but the three multiplier constants are routed
    /// through an extra shift/OR step in an attempt to stop the JIT from treating them as constants.
    /// </summary>
    /// <param name="pin">Source buffer; read 32 floats at a time with unaligned loads.</param>
    /// <param name="pout">Destination buffer; receives 8 floats per 32 floats read.</param>
    /// <param name="cnt">Number of floats to read; NOTE(review): must be a multiple of 32, there is no scalar tail.</param>
    /// <remarks>
    /// Do not simplify the constant setup: its exact shape is what triggers the reported miscompilation.
    /// </remarks>
    private static void convertGreyUnconst(float* pin, float* pout, nint cnt)
    {
        // Intended to be all zeros: every 32-bit element of an all-ones vector is logically
        // shifted right by its full width. OR-ing it into a value should leave that value unchanged.
        var vzero = Avx2.ShiftRightLogical(Vector256<int>.AllBitsSet, 32).AsSingle();
        var vlumR = Avx.Or(Vector256.Create(.299f), vzero);
        var vlumG = Avx.Or(Vector256.Create(.587f), vzero);
        var vlumB = Avx.Or(Vector256.Create(.114f), vzero);
        // Index vector that undoes the element interleaving left behind by the unpack steps below.
        var vperm = Vector256.Create(0, 4, 1, 5, 2, 6, 3, 7);

        float* ip = pin, ipe = pin + cnt, op = pout;
        while (ip < ipe)
        {
            // Four vectors of 8 floats = 8 groups of 4 floats.
            var v0 = Avx.LoadVector256(ip);
            var v1 = Avx.LoadVector256(ip + Vector256<float>.Count);
            var v2 = Avx.LoadVector256(ip + Vector256<float>.Count * 2);
            var v3 = Avx.LoadVector256(ip + Vector256<float>.Count * 3);
            ip += Vector256<float>.Count * 4;

            // First unpack stage: interleave 32-bit elements, then view pairs of them as 64-bit elements.
            var vl0 = Avx.UnpackLow(v0, v1).AsDouble();
            var vh0 = Avx.UnpackHigh(v0, v1).AsDouble();
            var vl1 = Avx.UnpackLow(v2, v3).AsDouble();
            var vh1 = Avx.UnpackHigh(v2, v3).AsDouble();

            // Second unpack stage: one component (index 0, 1 or 2 of every group) per vector,
            // in group order 0,2,4,6,1,3,5,7. The component at index 3 is never extracted.
            var vb = Avx.UnpackLow(vl0, vl1).AsSingle();
            var vg = Avx.UnpackHigh(vl0, vl1).AsSingle();
            var vr = Avx.UnpackLow(vh0, vh1).AsSingle();

            // vb = vb * vlumB + vg * vlumG + vr * vlumR
            vb = Avx.Multiply(vb, vlumB);
            vb = Fma.MultiplyAdd(vg, vlumG, vb);
            vb = Fma.MultiplyAdd(vr, vlumR, vb);

            // Restore sequential group order (0..7) before storing.
            vb = Avx2.PermuteVar8x32(vb, vperm);

            Avx.Store(op, vb);
            op += Vector256<float>.Count;
        }
    }
}

Run with tiered compilation disabled (TC=0, e.g. `-p:TieredCompilation=false`) to see it fail.

Expected behavior

Works, as in net6.0

dotnet run -c Release -f net6.0
<0.19640669, 0.3899415, 0.5367923, 0.3595256, 0.32314867, 0.16159265, 0.42320746, 0.755876>
<0.19640669, 0.3899415, 0.5367923, 0.3595256, 0.32314867, 0.16159265, 0.42320746, 0.755876>
dotnet run -c Release -f net6.0 -p:TieredCompilation=false
<0.19640669, 0.3899415, 0.5367923, 0.3595256, 0.32314867, 0.16159265, 0.42320746, 0.755876>
<0.19640669, 0.3899415, 0.5367923, 0.3595256, 0.32314867, 0.16159265, 0.42320746, 0.755876>

Actual behavior

Works in tier0, but produces NaN output once the method is compiled with full optimizations.

dotnet run -c Release -f net8.0
<0.19640669, 0.3899415, 0.5367923, 0.3595256, 0.32314867, 0.16159265, 0.42320746, 0.755876>
<0.19640669, 0.3899415, 0.5367923, 0.3595256, 0.32314867, 0.16159265, 0.42320746, 0.755876>
dotnet run -c Release -f net8.0 -p:TieredCompilation=false
<0.19640669, 0.3899415, 0.5367923, 0.3595256, 0.32314867, 0.16159265, 0.42320746, 0.755876>
<NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN>

Regression?

Yep, works in 6.0 and 7.0, fails in 8.0rc2

Known Workarounds

No response

Configuration

net8.0rc2 Windows 10

Other information

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions