Skip to content

HW intrinsics: Simple SIMD vectors swap operation thru temp variable goes thru stack spill #11071

@voinokin

Description

@voinokin

Two issues here (see repro code and disasm below):

  • A sequence of register-to-register MOVxPS/PD or MOVDQx instructions was expected; instead, the temporary is spilled to the stack.
  • It is unclear why the stack slot for the inlined version is initialized twice (this can be clearly observed when the "in-place" version is commented out and the REP STOSD goes away).

The current workaround is to use this sequence:

Vector128<uint> aXORb = Sse2.Xor(a, b); a = Sse2.Xor(a, aXORb); b = Sse2.Xor(b, aXORb);
But it is slower than the solution with MOVxxx due to register dependencies.

Vector128<float> tmp = Sse.StaticCast<uint, float>(a); a = b; b = Sse.StaticCast<float, uint>(tmp);
This gives the expected sequence of MOVAPS instructions with pre-VEX codegen on .NET Core 2.1, but I'm not sure this will remain the case due to dotnet/coreclr#18519


Repro code:

/// <summary>
/// Swaps the contents of two <c>Vector128&lt;uint&gt;</c> values passed by reference,
/// using a temporary local. Marked <c>AggressiveInlining</c> so that the swap is
/// inlined into the caller (the "inlined version" in the disassembly).
/// </summary>
/// <param name="a">First vector; receives the original value of <paramref name="b"/>.</param>
/// <param name="b">Second vector; receives the original value of <paramref name="a"/>.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void SwapNonGeneric(ref Vector128<uint> a, ref Vector128<uint> b)
{
    // Plain three-assignment swap; the temporary is what ends up in a stack slot.
    Vector128<uint> tmp = a; a = b; b = tmp;
}

/// <summary>
/// Repro driver: builds two vectors from the scalars 0xA and 0xB, swaps them twice
/// (once with the swap written directly in this method, once through
/// <c>SwapNonGeneric</c>), and prints a UInt32 extracted from each vector.
/// Marked <c>NoInlining</c>; the disassembly below is for this method.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
static void Test()
{
    // Each vector is created from a scalar constant.
    Vector128<uint> a = Sse2.ConvertScalarToVector128UInt32(0xA);
    Vector128<uint> b = Sse2.ConvertScalarToVector128UInt32(0xB);

    // Two back-to-back swaps: after both, a and b hold their original values again.
    Vector128<uint> tmp = a; a = b; b = tmp;    // in-place version
    SwapNonGeneric(ref a, ref b);               // inlined version

    // Reading the values back keeps both vectors live up to this point.
    Console.WriteLine("A={0}, B={1}", Sse2.ConvertToUInt32(a), Sse2.ConvertToUInt32(b));
}

Disasm:

--- ...\Program2.cs ---
    Vector128<uint> a = Sse2.ConvertScalarToVector128UInt32(0xA);
000007FE72684910  push        rdi  
000007FE72684911  push        rsi  
000007FE72684912  sub         rsp,68h  
000007FE72684916  movaps      xmmword ptr [rsp+50h],xmm6  
000007FE7268491B  movaps      xmmword ptr [rsp+40h],xmm7  
000007FE72684920  lea         rdi,[rsp+20h]  
000007FE72684925  mov         ecx,8  <========== this initializes 2 stack slots
000007FE7268492A  xor         eax,eax  
000007FE7268492C  rep stos    dword ptr [rdi]  

    Vector128<uint> a = Sse2.ConvertScalarToVector128UInt32(0xA);
000007FE7268492E  mov         ecx,0Ah  
000007FE72684933  movd        xmm6,ecx  

    Vector128<uint> b = Sse2.ConvertScalarToVector128UInt32(0xB);
000007FE72684937  mov         ecx,0Bh  
000007FE7268493C  movd        xmm7,ecx  

    Vector128<uint> tmp = a; a = b; b = tmp;    // in-place version
000007FE72684940  movaps      xmmword ptr [rsp+30h],xmm6  
000007FE72684945  movaps      xmm6,xmm7  
000007FE72684948  movaps      xmm7,xmmword ptr [rsp+30h]  

    SwapNonGeneric(ref a, ref b);               // inlined version
000007FE7268494D  xor         ecx,ecx  <========= stack slot is re-initialized 2nd time
000007FE7268494F  mov         qword ptr [rsp+20h],rcx  
000007FE72684954  mov         qword ptr [rsp+28h],rcx  
000007FE72684959  movaps      xmmword ptr [rsp+20h],xmm6  
000007FE7268495E  movaps      xmm6,xmm7  
000007FE72684961  movaps      xmm7,xmmword ptr [rsp+20h]  

    Console.WriteLine("A={0}, B={1}", Sse2.ConvertToUInt32(a), Sse2.ConvertToUInt32(b));
000007FE72684966  mov         rcx,7FED12719E0h  
000007FE72684970  call        000007FED21722B0  
000007FE72684975  mov         rsi,rax  
000007FE72684978  movd        ecx,xmm6  
000007FE7268497C  mov         dword ptr [rsi+8],ecx  
000007FE7268497F  mov         rcx,7FED12719E0h  
000007FE72684989  call        000007FED21722B0  
000007FE7268498E  mov         r8,rax  
000007FE72684991  movd        edx,xmm7  
000007FE72684995  mov         dword ptr [r8+8],edx  
000007FE72684999  mov         rdx,rsi  
000007FE7268499C  mov         rcx,qword ptr [126230F8h]  
000007FE726849A4  call        000007FE72681F08  
000007FE726849A9  nop  
000007FE726849AA  movaps      xmm6,xmmword ptr [rsp+50h]  
000007FE726849AF  movaps      xmm7,xmmword ptr [rsp+40h]  
000007FE726849B4  add         rsp,68h  
000007FE726849B8  pop         rsi  
000007FE726849B9  pop         rdi  
000007FE726849BA  ret  

Metadata

Metadata

Assignees

Labels

area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
enhancement: Product code improvement that does NOT require public API changes/additions
optimization

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions