Skip to content

[RyuJIT][LSRA] Let variables within a loop use register first #8846

@303248153

Description

@303248153

Consider this code:

[MethodImpl(MethodImplOptions.NoInlining)]
static int GetRandom() {
	return 0;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static void Print(int x) {
	// Console.WriteLine(x);
}

static void SampleA() {
	var a = GetRandom();
	var b = GetRandom();
	var c = GetRandom();
	var d = GetRandom();
	var e = GetRandom();
	var f = GetRandom();
	var g = GetRandom();
	var h = GetRandom();
	for (var x = 0; x < 100; ++x) {
		var xa = GetRandom();
		var xb = GetRandom();
		var xc = GetRandom();
		var xd = GetRandom();
		var xe = GetRandom();
		var xf = GetRandom();
		Print(xa);
		Print(xb);
		Print(xc);
		Print(xd);
		Print(xe);
		Print(xf);
	}
	Print(a);
	Print(b);
	Print(c);
	Print(d);
	Print(e);
	Print(f);
	Print(g);
	Print(h);
}

it generated following assembly code on Windows x64 .Net Core 2.0

  push        r14  
00007FFC38150544 41 55                push        r13  
00007FFC38150546 41 54                push        r12  
00007FFC38150548 57                   push        rdi  
00007FFC38150549 56                   push        rsi  
00007FFC3815054A 55                   push        rbp  
00007FFC3815054B 53                   push        rbx  
00007FFC3815054C 48 83 EC 48          sub         rsp,48h  
00007FFC38150550 E8 63 FB FF FF       call        00007FFC381500B8  
00007FFC38150555 8B F0                mov         esi,eax  
   192: 			var b = GetRandom();
00007FFC38150557 E8 5C FB FF FF       call        00007FFC381500B8  
00007FFC3815055C 8B F8                mov         edi,eax  
   193: 			var c = GetRandom();
00007FFC3815055E E8 55 FB FF FF       call        00007FFC381500B8  
00007FFC38150563 8B D8                mov         ebx,eax  
   194: 			var d = GetRandom();
00007FFC38150565 E8 4E FB FF FF       call        00007FFC381500B8  
00007FFC3815056A 8B E8                mov         ebp,eax  
   195: 			var e = GetRandom();
00007FFC3815056C E8 47 FB FF FF       call        00007FFC381500B8  
   195: 			var e = GetRandom();
00007FFC38150571 44 8B F0             mov         r14d,eax  
   196: 			var f = GetRandom();
00007FFC38150574 E8 3F FB FF FF       call        00007FFC381500B8  
00007FFC38150579 44 8B F8             mov         r15d,eax  
   197: 			var g = GetRandom();
00007FFC3815057C E8 37 FB FF FF       call        00007FFC381500B8  
00007FFC38150581 44 8B E0             mov         r12d,eax  
   198: 			var h = GetRandom();
00007FFC38150584 E8 2F FB FF FF       call        00007FFC381500B8  
00007FFC38150589 44 8B E8             mov         r13d,eax  
   199: 			for (var x = 0; x < 100; ++x) {
00007FFC3815058C 33 C0                xor         eax,eax  
00007FFC3815058E 89 44 24 44          mov         dword ptr [rsp+44h],eax  
   200: 				var xa = GetRandom();
00007FFC38150592 E8 21 FB FF FF       call        00007FFC381500B8  
00007FFC38150597 89 44 24 40          mov         dword ptr [rsp+40h],eax  
   201: 				var xb = GetRandom();
00007FFC3815059B E8 18 FB FF FF       call        00007FFC381500B8  
00007FFC381505A0 89 44 24 3C          mov         dword ptr [rsp+3Ch],eax  
   202: 				var xc = GetRandom();
00007FFC381505A4 E8 0F FB FF FF       call        00007FFC381500B8  
00007FFC381505A9 89 44 24 38          mov         dword ptr [rsp+38h],eax  
   203: 				var xd = GetRandom();
00007FFC381505AD E8 06 FB FF FF       call        00007FFC381500B8  
00007FFC381505B2 89 44 24 34          mov         dword ptr [rsp+34h],eax  
   204: 				var xe = GetRandom();
00007FFC381505B6 E8 FD FA FF FF       call        00007FFC381500B8  
00007FFC381505BB 89 44 24 30          mov         dword ptr [rsp+30h],eax  
   205: 				var xf = GetRandom();
00007FFC381505BF E8 F4 FA FF FF       call        00007FFC381500B8  
00007FFC381505C4 89 44 24 2C          mov         dword ptr [rsp+2Ch],eax  
00007FFC381505C8 8B 4C 24 40          mov         ecx,dword ptr [rsp+40h]  
00007FFC381505CC E8 EF FA FF FF       call        00007FFC381500C0  
   207: 				Print(xb);
00007FFC381505D1 8B 4C 24 3C          mov         ecx,dword ptr [rsp+3Ch]  
00007FFC381505D5 E8 E6 FA FF FF       call        00007FFC381500C0  
   208: 				Print(xc);
00007FFC381505DA 8B 4C 24 38          mov         ecx,dword ptr [rsp+38h]  
00007FFC381505DE E8 DD FA FF FF       call        00007FFC381500C0  
   209: 				Print(xd);
00007FFC381505E3 8B 4C 24 34          mov         ecx,dword ptr [rsp+34h]  
00007FFC381505E7 E8 D4 FA FF FF       call        00007FFC381500C0  
   210: 				Print(xe);
00007FFC381505EC 8B 4C 24 30          mov         ecx,dword ptr [rsp+30h]  
   210: 				Print(xe);
00007FFC381505F0 E8 CB FA FF FF       call        00007FFC381500C0  
   211: 				Print(xf);
00007FFC381505F5 8B 4C 24 2C          mov         ecx,dword ptr [rsp+2Ch]  
00007FFC381505F9 E8 C2 FA FF FF       call        00007FFC381500C0  
   199: 			for (var x = 0; x < 100; ++x) {
00007FFC381505FE 8B 4C 24 44          mov         ecx,dword ptr [rsp+44h]  
00007FFC38150602 FF C1                inc         ecx  
00007FFC38150604 83 F9 64             cmp         ecx,64h  
00007FFC38150607 89 4C 24 44          mov         dword ptr [rsp+44h],ecx  
00007FFC3815060B 7C 85                jl          00007FFC38150592  
   212: 			}
   213: 			Print(a);
00007FFC3815060D 8B CE                mov         ecx,esi  
00007FFC3815060F E8 AC FA FF FF       call        00007FFC381500C0  
   214: 			Print(b);
00007FFC38150614 8B CF                mov         ecx,edi  
   214: 			Print(b);
00007FFC38150616 E8 A5 FA FF FF       call        00007FFC381500C0  
   215: 			Print(c);
00007FFC3815061B 8B CB                mov         ecx,ebx  
00007FFC3815061D E8 9E FA FF FF       call        00007FFC381500C0  
   216: 			Print(d);
00007FFC38150622 8B CD                mov         ecx,ebp  
00007FFC38150624 E8 97 FA FF FF       call        00007FFC381500C0  
   217: 			Print(e);
00007FFC38150629 41 8B CE             mov         ecx,r14d  
00007FFC3815062C E8 8F FA FF FF       call        00007FFC381500C0  
   218: 			Print(f);
00007FFC38150631 41 8B CF             mov         ecx,r15d  
00007FFC38150634 E8 87 FA FF FF       call        00007FFC381500C0  
   219: 			Print(g);
00007FFC38150639 41 8B CC             mov         ecx,r12d  
00007FFC3815063C E8 7F FA FF FF       call        00007FFC381500C0  
   220: 			Print(h);
00007FFC38150641 41 8B CD             mov         ecx,r13d  
00007FFC38150644 48 B8 C0 00 15 38 FC 7F 00 00 mov         rax,7FFC381500C0h  
00007FFC3815064E 48 83 C4 48          add         rsp,48h  
00007FFC38150652 5B                   pop         rbx  
00007FFC38150653 5D                   pop         rbp  
00007FFC38150654 5E                   pop         rsi  
00007FFC38150655 5F                   pop         rdi  
00007FFC38150656 41 5C                pop         r12  
00007FFC38150658 41 5D                pop         r13  
00007FFC3815065A 41 5E                pop         r14  
00007FFC3815065C 41 5F                pop         r15  
00007FFC3815065E 48 FF E0             jmp         rax 

As we can see a~h is holding register esi~r13 yet xa~xf always use stack.

Then I write another version:

static volatile int Dummy;

static void SampleB() {
	var xa = Dummy;
	var xb = Dummy;
	var xc = Dummy;
	var xd = Dummy;
	var xe = Dummy;
	var xf = Dummy;
	var a = GetRandom();
	var b = GetRandom();
	var c = GetRandom();
	var d = GetRandom();
	var e = GetRandom();
	var f = GetRandom();
	var g = GetRandom();
	var h = GetRandom();
	Dummy = xa;
	Dummy = xb;
	Dummy = xc;
	Dummy = xd;
	Dummy = xe;
	Dummy = xf;
	for (var x = 0; x < 100; ++x) {
		xa = GetRandom();
		xb = GetRandom();
		xc = GetRandom();
		xd = GetRandom();
		xe = GetRandom();
		xf = GetRandom();
		Print(xa);
		Print(xb);
		Print(xc);
		Print(xd);
		Print(xe);
		Print(xf);
	}
	Print(a);
	Print(b);
	Print(c);
	Print(d);
	Print(e);
	Print(f);
	Print(g);
	Print(h);
}

And run a benchmark, gives

1000000 loops SampleA: 2.2854432
1000000 loops SampleB: 1.6177438

SampleB is much faster because it let xa~xf use register, the following 100 cycles will also use these registers.

I dig a bit with lldb on Linux x64, here is what's happing during LSRA form SampleA:
Notice it's on Linux x64 so the registers doesn't match with the assembly code above.

  • tryAllocateFreeReg(interval of a, refposition of st.lclvar a) gives rbx
  • tryAllocateFreeReg(interval of b, refposition of st.lclvar b) gives r14
  • tryAllocateFreeReg(interval of c, refposition of st.lclvar c) gives r15
  • tryAllocateFreeReg(interval of d, refposition of st.lclvar d) gives r12
  • tryAllocateFreeReg(interval of e, refposition of st.lclvar e) gives r13
  • tryAllocateFreeReg(interval of f, refposition of st.lclvar f) gives rax
  • handle refposition of call, because it's isPhysRegRef, the interval of f holding rax will spill
  • tryAllocateFreeReg(interval of g, refposition of st.lclvar g) gives rax
  • handle refposition of call, because it's isPhysRegRef, the interval of g holding rax will spill
  • tryAllocateFreeReg(interval of h, refposition of st.lclvar h) gives rax
  • handle refposition of call, because it's isPhysRegRef, the interval of h holding rax will spill
  • tryAllocateFreeReg(interval of xa, refposition of st.lclvar xa) gives rax
  • handle refposition of call, because it's isPhysRegRef, the interval of xa holding rax will spill
  • and so on...

During the process, allocateBusyReg is not used at all,
I can see getWeight(refPos of st.lclvar a) is 200 and getWeight(refPos of st.lclvar xa) is 800, but xa has no chance to get the register.

On the contrary, gcc and clang can handle this situation very well:

godblot

category:cq
theme:register-allocator
skill-level:expert
cost:large

Metadata

Metadata

Assignees

No one assigned

    Labels

    area-CodeGen-coreclrCLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMIenhancementProduct code improvement that does NOT require public API changes/additionsoptimizationtenet-performancePerformance related issue

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions