-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Description
Consider this code:
[MethodImpl(MethodImplOptions.NoInlining)]
static int GetRandom() {
return 0;
}
[MethodImpl(MethodImplOptions.NoInlining)]
static void Print(int x) {
// Console.WriteLine(x);
}
static void SampleA() {
var a = GetRandom();
var b = GetRandom();
var c = GetRandom();
var d = GetRandom();
var e = GetRandom();
var f = GetRandom();
var g = GetRandom();
var h = GetRandom();
for (var x = 0; x < 100; ++x) {
var xa = GetRandom();
var xb = GetRandom();
var xc = GetRandom();
var xd = GetRandom();
var xe = GetRandom();
var xf = GetRandom();
Print(xa);
Print(xb);
Print(xc);
Print(xd);
Print(xe);
Print(xf);
}
Print(a);
Print(b);
Print(c);
Print(d);
Print(e);
Print(f);
Print(g);
Print(h);
}it generated following assembly code on Windows x64 .Net Core 2.0
push r14
00007FFC38150544 41 55 push r13
00007FFC38150546 41 54 push r12
00007FFC38150548 57 push rdi
00007FFC38150549 56 push rsi
00007FFC3815054A 55 push rbp
00007FFC3815054B 53 push rbx
00007FFC3815054C 48 83 EC 48 sub rsp,48h
00007FFC38150550 E8 63 FB FF FF call 00007FFC381500B8
00007FFC38150555 8B F0 mov esi,eax
192: var b = GetRandom();
00007FFC38150557 E8 5C FB FF FF call 00007FFC381500B8
00007FFC3815055C 8B F8 mov edi,eax
193: var c = GetRandom();
00007FFC3815055E E8 55 FB FF FF call 00007FFC381500B8
00007FFC38150563 8B D8 mov ebx,eax
194: var d = GetRandom();
00007FFC38150565 E8 4E FB FF FF call 00007FFC381500B8
00007FFC3815056A 8B E8 mov ebp,eax
195: var e = GetRandom();
00007FFC3815056C E8 47 FB FF FF call 00007FFC381500B8
195: var e = GetRandom();
00007FFC38150571 44 8B F0 mov r14d,eax
196: var f = GetRandom();
00007FFC38150574 E8 3F FB FF FF call 00007FFC381500B8
00007FFC38150579 44 8B F8 mov r15d,eax
197: var g = GetRandom();
00007FFC3815057C E8 37 FB FF FF call 00007FFC381500B8
00007FFC38150581 44 8B E0 mov r12d,eax
198: var h = GetRandom();
00007FFC38150584 E8 2F FB FF FF call 00007FFC381500B8
00007FFC38150589 44 8B E8 mov r13d,eax
199: for (var x = 0; x < 100; ++x) {
00007FFC3815058C 33 C0 xor eax,eax
00007FFC3815058E 89 44 24 44 mov dword ptr [rsp+44h],eax
200: var xa = GetRandom();
00007FFC38150592 E8 21 FB FF FF call 00007FFC381500B8
00007FFC38150597 89 44 24 40 mov dword ptr [rsp+40h],eax
201: var xb = GetRandom();
00007FFC3815059B E8 18 FB FF FF call 00007FFC381500B8
00007FFC381505A0 89 44 24 3C mov dword ptr [rsp+3Ch],eax
202: var xc = GetRandom();
00007FFC381505A4 E8 0F FB FF FF call 00007FFC381500B8
00007FFC381505A9 89 44 24 38 mov dword ptr [rsp+38h],eax
203: var xd = GetRandom();
00007FFC381505AD E8 06 FB FF FF call 00007FFC381500B8
00007FFC381505B2 89 44 24 34 mov dword ptr [rsp+34h],eax
204: var xe = GetRandom();
00007FFC381505B6 E8 FD FA FF FF call 00007FFC381500B8
00007FFC381505BB 89 44 24 30 mov dword ptr [rsp+30h],eax
205: var xf = GetRandom();
00007FFC381505BF E8 F4 FA FF FF call 00007FFC381500B8
00007FFC381505C4 89 44 24 2C mov dword ptr [rsp+2Ch],eax
00007FFC381505C8 8B 4C 24 40 mov ecx,dword ptr [rsp+40h]
00007FFC381505CC E8 EF FA FF FF call 00007FFC381500C0
207: Print(xb);
00007FFC381505D1 8B 4C 24 3C mov ecx,dword ptr [rsp+3Ch]
00007FFC381505D5 E8 E6 FA FF FF call 00007FFC381500C0
208: Print(xc);
00007FFC381505DA 8B 4C 24 38 mov ecx,dword ptr [rsp+38h]
00007FFC381505DE E8 DD FA FF FF call 00007FFC381500C0
209: Print(xd);
00007FFC381505E3 8B 4C 24 34 mov ecx,dword ptr [rsp+34h]
00007FFC381505E7 E8 D4 FA FF FF call 00007FFC381500C0
210: Print(xe);
00007FFC381505EC 8B 4C 24 30 mov ecx,dword ptr [rsp+30h]
210: Print(xe);
00007FFC381505F0 E8 CB FA FF FF call 00007FFC381500C0
211: Print(xf);
00007FFC381505F5 8B 4C 24 2C mov ecx,dword ptr [rsp+2Ch]
00007FFC381505F9 E8 C2 FA FF FF call 00007FFC381500C0
199: for (var x = 0; x < 100; ++x) {
00007FFC381505FE 8B 4C 24 44 mov ecx,dword ptr [rsp+44h]
00007FFC38150602 FF C1 inc ecx
00007FFC38150604 83 F9 64 cmp ecx,64h
00007FFC38150607 89 4C 24 44 mov dword ptr [rsp+44h],ecx
00007FFC3815060B 7C 85 jl 00007FFC38150592
212: }
213: Print(a);
00007FFC3815060D 8B CE mov ecx,esi
00007FFC3815060F E8 AC FA FF FF call 00007FFC381500C0
214: Print(b);
00007FFC38150614 8B CF mov ecx,edi
214: Print(b);
00007FFC38150616 E8 A5 FA FF FF call 00007FFC381500C0
215: Print(c);
00007FFC3815061B 8B CB mov ecx,ebx
00007FFC3815061D E8 9E FA FF FF call 00007FFC381500C0
216: Print(d);
00007FFC38150622 8B CD mov ecx,ebp
00007FFC38150624 E8 97 FA FF FF call 00007FFC381500C0
217: Print(e);
00007FFC38150629 41 8B CE mov ecx,r14d
00007FFC3815062C E8 8F FA FF FF call 00007FFC381500C0
218: Print(f);
00007FFC38150631 41 8B CF mov ecx,r15d
00007FFC38150634 E8 87 FA FF FF call 00007FFC381500C0
219: Print(g);
00007FFC38150639 41 8B CC mov ecx,r12d
00007FFC3815063C E8 7F FA FF FF call 00007FFC381500C0
220: Print(h);
00007FFC38150641 41 8B CD mov ecx,r13d
00007FFC38150644 48 B8 C0 00 15 38 FC 7F 00 00 mov rax,7FFC381500C0h
00007FFC3815064E 48 83 C4 48 add rsp,48h
00007FFC38150652 5B pop rbx
00007FFC38150653 5D pop rbp
00007FFC38150654 5E pop rsi
00007FFC38150655 5F pop rdi
00007FFC38150656 41 5C pop r12
00007FFC38150658 41 5D pop r13
00007FFC3815065A 41 5E pop r14
00007FFC3815065C 41 5F pop r15
00007FFC3815065E 48 FF E0 jmp rax As we can see a~h is holding register esi~r13 yet xa~xf always use stack.
Then I write another version:
static volatile int Dummy;
static void SampleB() {
var xa = Dummy;
var xb = Dummy;
var xc = Dummy;
var xd = Dummy;
var xe = Dummy;
var xf = Dummy;
var a = GetRandom();
var b = GetRandom();
var c = GetRandom();
var d = GetRandom();
var e = GetRandom();
var f = GetRandom();
var g = GetRandom();
var h = GetRandom();
Dummy = xa;
Dummy = xb;
Dummy = xc;
Dummy = xd;
Dummy = xe;
Dummy = xf;
for (var x = 0; x < 100; ++x) {
xa = GetRandom();
xb = GetRandom();
xc = GetRandom();
xd = GetRandom();
xe = GetRandom();
xf = GetRandom();
Print(xa);
Print(xb);
Print(xc);
Print(xd);
Print(xe);
Print(xf);
}
Print(a);
Print(b);
Print(c);
Print(d);
Print(e);
Print(f);
Print(g);
Print(h);
}And run a benchmark, gives
1000000 loops SampleA: 2.2854432
1000000 loops SampleB: 1.6177438
SampleB is much faster because it let xa~xf use register, the following 100 cycles will also use these registers.
I dig a bit with lldb on Linux x64, here is what's happing during LSRA form SampleA:
Notice it's on Linux x64 so the registers doesn't match with the assembly code above.
- tryAllocateFreeReg(interval of a, refposition of st.lclvar a) gives rbx
- tryAllocateFreeReg(interval of b, refposition of st.lclvar b) gives r14
- tryAllocateFreeReg(interval of c, refposition of st.lclvar c) gives r15
- tryAllocateFreeReg(interval of d, refposition of st.lclvar d) gives r12
- tryAllocateFreeReg(interval of e, refposition of st.lclvar e) gives r13
- tryAllocateFreeReg(interval of f, refposition of st.lclvar f) gives rax
- handle refposition of call, because it's
isPhysRegRef, the interval of f holding rax will spill - tryAllocateFreeReg(interval of g, refposition of st.lclvar g) gives rax
- handle refposition of call, because it's
isPhysRegRef, the interval of g holding rax will spill - tryAllocateFreeReg(interval of h, refposition of st.lclvar h) gives rax
- handle refposition of call, because it's
isPhysRegRef, the interval of h holding rax will spill - tryAllocateFreeReg(interval of xa, refposition of st.lclvar xa) gives rax
- handle refposition of call, because it's
isPhysRegRef, the interval of xa holding rax will spill - and so on...
During the process, allocateBusyReg is not used at all,
I can see getWeight(refPos of st.lclvar a) is 200 and getWeight(refPos of st.lclvar xa) is 800, but xa has no chance to get the register.
On the contrary, gcc and clang can handle this situation very well:
category:cq
theme:register-allocator
skill-level:expert
cost:large
