SuperCharged MemoryPoolIterator#1138
Conversation
|
Generates the following 91 bytes of asm (without the final inline) Details; Assembly listing for method MemoryPoolIterator:LocateFirstFoundByte(byref):int
; Emitting BLENDED_CODE for X64 CPU with SSE2
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 3, 3 ) byref -> rcx
; V01 loc0 [V01,T03] ( 2, 5 ) simd16 -> mm0 ld-addr-op
; V02 loc1 [V02,T00] ( 7, 22 ) int -> rax
; V03 loc2 [V03,T01] ( 4, 10 ) long -> rdx
; V04 loc3 [V04,T05] ( 2, 2 ) int -> rdx
; V05 tmp0 [V05,T04] ( 2, 4 ) simd16 -> mm0
; V06 OutArgs [V06 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
; V07 rat0 [V07 ] ( 1, 1 ) simd16 -> [rsp+0x28] do-not-enreg[XS] must-init addr-exposed
;
; Lcl frame size = 56
G_M61145_IG01:
4883EC38 sub rsp, 56
33C0 xor rax, rax
4889442428 mov qword ptr [rsp+28H], rax
4889442430 mov qword ptr [rsp+30H], rax
G_M61145_IG02:
0F1001 movups xmm0, xmmword ptr [rcx]
33C0 xor eax, eax
G_M61145_IG03:
83F802 cmp eax, 2
733B jae SHORT G_M61145_IG06
0F11442428 movups xmmword ptr [rsp+28H], xmm0
488B54C428 mov rdx, qword ptr [rsp+8*rax+28H]
4885D2 test rdx, rdx
7507 jne SHORT G_M61145_IG04
FFC0 inc eax
83F802 cmp eax, 2
7CE5 jl SHORT G_M61145_IG03
G_M61145_IG04:
488BCA mov rcx, rdx
48F7D9 neg rcx
4823CA and rcx, rdx
48BAE0C0A08060402000 mov rdx, 0x20406080A0C0E0
480FAFCA imul rcx, rdx
48C1E93D shr rcx, 61
8BD1 mov edx, ecx
8D04C2 lea eax, [rdx+8*rax]
G_M61145_IG05:
4883C438 add rsp, 56
C3 ret
G_M61145_IG06:
E8A6598D5F call CORINFO_HELP_RNGCHKFAIL
CC int3
; Total bytes of code 91, prolog size 16 for method MemoryPoolIterator:LocateFirstFoundByte(byref):intDetailsSuccessfully inlined Vector:AsVectorInt64(struct):struct (7 IL bytes) (depth 1) [below ALWAYS_INLINE size]
**************** Inline Tree
Inlines into 06000166 MemoryPoolIterator:FindFirstEqualByte(byref):int
[0 IL=0008 TR=000271 06000167] [FAILED: too many il bytes] MemoryPoolIterator:FindFirstEqualByteSlow(byref):int
[1 IL=0020 TR=000016 0600009D] [below ALWAYS_INLINE size] Vector:AsVectorInt64(struct):struct
[0 IL=0176 TR=000263 06002C13] [FAILED: unprofitable inline] InvalidOperationException:.ctor():this
Budget: initialTime=606, finalTime=606, initialBudget=6060, currentBudget=6060
Budget: initialSize=4280, finalSize=4280
; Assembly listing for method MemoryPoolIterator:FindFirstEqualByte(byref):int
; Emitting BLENDED_CODE for X64 CPU with SSE2
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 4, 3 ) byref -> rsi
; V01 loc0 [V01,T05] ( 2, 4.5) simd16 -> mm0 ld-addr-op
; V02 loc1 [V02,T00] ( 7, 21 ) int -> rcx
; V03 loc2 [V03,T01] ( 9, 11.5) long -> rax
; V04 tmp0 [V04,T07] ( 3, 1.5) int -> rcx
; V05 tmp1 [V05,T08] ( 3, 1.5) int -> rcx
; V06 tmp2 [V06,T09] ( 3, 1.5) int -> rcx
; V07 tmp3 [V07,T03] ( 9, 4.5) int -> rcx
; V08 tmp4 [V08,T04] ( 9, 4.5) int -> rdx
; V09 tmp5 [V09,T10] ( 3, 1.5) int -> rcx
; V10 tmp6 [V10,T11] ( 3, 1.5) int -> rcx
; V11 tmp7 [V11,T12] ( 3, 1.5) int -> rcx
; V12 tmp8 [V12,T13] ( 3, 1.5) int -> rcx
; V13 tmp9 [V13,T14] ( 3, 0 ) ref -> rsi
; V14 tmp10 [V14,T06] ( 2, 2 ) simd16 -> mm0
; V15 OutArgs [V15 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
; V16 rat0 [V16 ] ( 1, 1 ) simd16 -> [rsp+0x20] do-not-enreg[XS] must-init addr-exposed
;
; Lcl frame size = 48
G_M48472_IG01:
push rsi
sub rsp, 48
xor rax, rax
mov qword ptr [rsp+20H], rax
mov qword ptr [rsp+28H], rax
mov rsi, rcx
G_M48472_IG02:
mov rcx, 0xD1FFAB1E
mov edx, 3
call CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
movzx rcx, byte ptr [reloc classVar[0xd1ffab1e]]
test ecx, ecx
jne SHORT G_M48472_IG04
mov rcx, rsi
call MemoryPoolIterator:FindFirstEqualByteSlow(byref):int
nop
G_M48472_IG03:
add rsp, 48
pop rsi
ret
G_M48472_IG04:
movups xmm0, xmmword ptr [rsi]
xor ecx, ecx
G_M48472_IG05:
cmp ecx, 2
jae G_M48472_IG17
movups xmmword ptr [rsp+20H], xmm0
mov rax, qword ptr [rsp+8*rcx+20H]
test rax, rax
je G_M48472_IG15
shl ecx, 3
mov edx, 0xD1FFAB1E
and rdx, rax
test rdx, rdx
jg SHORT G_M48472_IG09
mov rdx, 0xD1FFAB1E
and rdx, rax
test rdx, rdx
jg SHORT G_M48472_IG07
mov rdx, 0xD1FFAB1E
and rax, rdx
test rax, rax
jg SHORT G_M48472_IG06
mov edx, 7
jmp SHORT G_M48472_IG13
G_M48472_IG06:
mov edx, 6
jmp SHORT G_M48472_IG13
G_M48472_IG07:
mov rdx, 0xD1FFAB1E
and rax, rdx
test rax, rax
jg SHORT G_M48472_IG08
mov edx, 5
jmp SHORT G_M48472_IG13
G_M48472_IG08:
mov edx, 4
jmp SHORT G_M48472_IG13
G_M48472_IG09:
mov rdx, rax
and rdx, 0xFFFF
test rdx, rdx
jg SHORT G_M48472_IG11
and rax, 0xD1FFAB1E
test rax, rax
jg SHORT G_M48472_IG10
mov edx, 3
jmp SHORT G_M48472_IG13
G_M48472_IG10:
mov edx, 2
jmp SHORT G_M48472_IG13
G_M48472_IG11:
and rax, 255
test rax, rax
jg SHORT G_M48472_IG12
mov edx, 1
jmp SHORT G_M48472_IG13
G_M48472_IG12:
xor edx, edx
G_M48472_IG13:
lea eax, [rcx+rdx]
G_M48472_IG14:
add rsp, 48
pop rsi
ret
G_M48472_IG15:
inc ecx
cmp ecx, 2
jl G_M48472_IG05
G_M48472_IG16:
mov rcx, 0xD1FFAB1E
call CORINFO_HELP_NEWSFAST
mov rsi, rax
mov rcx, rsi
call InvalidOperationException:.ctor():this
mov rcx, rsi
call CORINFO_HELP_THROW
int3
G_M48472_IG17:
call CORINFO_HELP_RNGCHKFAIL
int3
; Total bytes of code 318, prolog size 20 for method MemoryPoolIterator:FindFirstEqualByte(byref):int |
|
Benchmark code https://gist.github.com/benaadams/2dd3f99230757111e91915f638067a09
|
|
While the unrolled version is faster, it is also inlined into 16 places so will be duplicated 16 times so I've gone for the loop version. |
|
Wow. Where did you find this constant? How does this work? |
|
I couldn't believe there wasn't a number in the 2^64 space that did this. However, after exhaustive internet searching I couldn't find one. So I went to sleep, woke up in the middle of the night with a headache and idea of a method to try and actually derived it. Having derived it, I did some searching with the search engines and couldn't find any hits either in its decimal or hexadecimal from so called it Ben's Magic Number 😺 I will write a blog post on the method I used. |
|
@benaadams Dude... you need a blog post on how your brain works. Might help cure cancer or something. |
|
I think I can apply this to < |
|
Really cool. 👍 I look forward to reading the blog post. This doesn't seem to be doing exactly what FindFirstEqualByte or the debrujin sequence was doing, but it's probably all we need. I'm assuming/hoping that it's guaranteed that the return value of Here are some sample inputs that show the difference. The old FindFirstEqualByte would return 7 for all these inputs. But none of the functional tests failed, so that's a great sign 😄 |
|
@halter73 its specifically testing the least significant bit which is set for the common values of true |
| // | ||
| // This means when they are next used the have been pre-evaluated and the jit can | ||
| // either directly embed them or use them for branch elimiation. | ||
| internal static bool StaticReadonlysJitted; |
There was a problem hiding this comment.
Why is this required to turn the static readonlies into jitted const? Is this a bug in the jitter?
There was a problem hiding this comment.
The jit does the work when it first encounters the item. For readonly statics in a beforefieldinit class its when any static variable is first used which is documented.
That first function that is being jitted gets none of the benefits of the const like nature of the variables; though every function after that does. One of the features that doesn't happen is branch elimination - like on IsLittleEndian.
I don't know if its expected behaviour or a bug (might need to unwind and recomplile to resolve it). It came up before in #554 (comment)
There was a problem hiding this comment.
Ahh the original issue: https://github.com/dotnet/coreclr/issues/2526. Interesting that the CLR team doesn't want to run static ctors prior to jitting. That's too bad.
|
@halter73 been thinking about the input. I'm fairly confident its safe assuming true is every bit set from an inbuilt; else you couldn't binary not it to get false - you'd still have a form of true. |
|
@benaadams I think it's safe too. |
|
I tried to come up with a similar magic number that would allow to obtain the index after shifting its bits, and found a different one.
We then simply have to read the 1st byte of the resulting number to find the offset, ie: |
|
@nicodeslandes Congats! That is a lot more obvious and what is happening; and drops a shift. Already superseded; that's the power of OSS collaboration! |
002481f to
16ea886
Compare
16ea886 to
4bf0a57
Compare
|
@benaadams Sweet! Glad I could bring a modest contribution to this great piece of code. |
|
Funny. The 0x01020304050607 version is exactly what I explained to people in Redmond. I even wrote that constant on the whiteboard (I know @benaadams saw the tweet). I figured I might be missing something subtle, but that would make it easy for people to get the gist. Fortunately most of the people I was talking to had a vague understanding of the debrujin code, so they already understood multiplying by (v & (-v)) was a bit shift. The key insight from there is that the shifts are always 8 bit aligned. So you can make the mask/window up to 8 bits and not worry about any overlapping windows. 8 bits of course being more than enough to represent 0-7. This is why you don't need to use a debrujin sequence for array indexing. If there were overlapping windows, it would be a different story.
Is that the |
dfd85c0 to
4bf0a57
Compare
|
👏 |
Yes, removed it from this PR, will add back later with measurements.
Is to be used for data lengths shorter than Vector; so on a 32 byte wide vector currently the final 31 bytes of a GET request is processed per byte. Will cover other points in the PR (post merge clashes) |
|
@benaadams 👍 I guess that's why there's no |
|
Nice! 🎉 |
| private static readonly bool IsHardwareAccelerated = Vector.IsHardwareAccelerated; | ||
| private static readonly bool IsLittleEndian = BitConverter.IsLittleEndian; | ||
| private static readonly int _vectorSpan = Vector<byte>.Count; | ||
| private static readonly int _vectorUlongSpan = Vector<ulong>.Count; |
There was a problem hiding this comment.
Were you not seeing Vector.IsHardwareAccelerated and Vector<xxx>.Count being replaced by constants during jiting when used in-line in the method? The code you have now gets them replaced by constants during importing due to an optimization for ldsfld on read-only static fields, but impSIMDIntrinsic is supposed to kick in and replace their references with constants if you use them in-line. Doing the latter should have the benefit that the Jit could recognize that this is where the constants came from so that it could e.g. use that for an unrolling heuristic and unroll the loop in Seek to use constant rather than variable indexing (dotnet/coreclr#7422).
There was a problem hiding this comment.
@JosephTremoulet made change, seems to work. BitConverter.IsLittleEndian definately wasn't doing branch elimination though so have left it as an early eval readonly static when it does.
There was a problem hiding this comment.
@benaadams Did you run any benchmarks showing setting StaticReadonlysJitted speeds up Seek?
Obviously you would need to "warm up" Seek prior to the benchmark in order to avoid penalizing the non-StaticReadonlysJitted code for the time spent running the static constructors.
There was a problem hiding this comment.
@benaadams sounds good, thanks. Also makes me wonder if we should make BitConverter.IsLittleEndian a JIT intrinsic...
There was a problem hiding this comment.
@JosephTremoulet Its actually a const in coreclr; but it calls the corefx version which calculates it - might just need a forwarder?
There was a problem hiding this comment.
@halter73 without it, it includes both branches and calls the static ctor; then does a memory load from class var every execution in the main flow in the asm
mov rcx, 0xD1FFAB1E
mov edx, 3
call CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
movzx rcx, byte ptr [reloc classVar[0xd1ffab1e]]
test ecx, ecx
jne SHORT G_M48472_IG04
mov rcx, rsi
call MemoryPoolIterator:FindFirstEqualByteSlow(byref):int
nop So its an extra branch and long jump with branch per op and probably a volatile read? (just guessing on the cctor test)
There was a problem hiding this comment.
Is that measurable in a microbenchmark? If not, I would take the extra branch in the generated asm for cleaner code.
There was a problem hiding this comment.
I'll pull it from this PR and submit a new one and grumble about it there... 😉
| "emitEntryPoint": true, | ||
| "compile": { | ||
| "include": [ | ||
| "../shared/SocketInputExtensions.cs", |
There was a problem hiding this comment.
I just noticed you aren't using "../shared/**/*.cs" like the other test projects. Any reason for that?
There was a problem hiding this comment.
Some of them reference xunit so have to pull in a bunch of extra references that aren't used for the globbing
fee8441 to
2eba401
Compare
|
Rebased. What a journey..! I think its too complicated to squash else the "why" of git blame will be missing 😉 |
|
Though the real reason is I tried rearranging some things; but caused merge conflicts. Note if you can cause merge conflicts in a single PR it has got too big... |
|
Could squash the clean up ones? |
|
Added Multithreaded test in #1222 Results from 16 Core (32 HT) Server as follows: |
|
Any idea how much of TechEmPower numbers (both plaintext and Json ) have moved due to this change? |
|
This comment shows the performance difference in the plaintext benchmark prior to merging the PR. I haven't yet gotten the chance to run the TechEmpower benchmarks after merging. |
|
The saga is complete dotnet/coreclr#21073; end of an era! The age of CPU intrinsics in C# has begun! |
Is a branchless cache-oblivious evaluation of the set byte; rather than the current multiple ternary operator quasi tree. Around x 2.8 faster than current for Seek operations.
The De Bruijn alternative improvement that was evaluated needs a table lookup of 8 * 8 * bytes = 512 bytes in the data cache to be fast - which is 8 cache lines on x64. (It can be compressed to 24 bytes with some extra per lookup ops).
Whereas this uses a single 64bit const already in the instruction cache (which the De Bruijn also needs).
Also trims FindFirstEqualByte from 318 bytes of asm to 94 bytes of asm (before the final inline) so better for inlining which avoids the vector copy (Added inline due to issue dotnet/coreclr#7386)
Now also contains related Vectorization improvements; single core header parsing speeds (weak cpu) as follows
Using Multithreaded test from #1222, Results from 16 Core (32 HT) Server as follows:
Resolves #1129
Resolves #1141
Resolves #1213