Use dispatch helper for calling interface methods#123252
Use dispatch helper for calling interface methods#123252MichalStrehovsky wants to merge 18 commits intodotnet:mainfrom
Conversation
|
Test program: Details// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
class Program
{
static void Main()
{
for (int it = 0; it < 6; it++)
{
IFace c0 = new C0();
var sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed(c0, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
IFace[] faces = [
new C0(), new C1(), new C2(), new C3(),
new C4(), new C5(), new C6(), new C7(),
new C8(), new C9(), new CA(), new CB(),
new CC(), new CD(), new CE(), new CF(),
];
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCall2(faces, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCall4(faces, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCall8(faces, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCall16(faces, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
for (int i = 0; i < faces.Length; i++)
{
DoCallFixed(faces[i], 1, 2);
}
sw = Stopwatch.StartNew();
IFace cf = new CF();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed(cf, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
Console.WriteLine("---------------------------");
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
//[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void DoCallFixed(IFace i, int x, int y)
{
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
//[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void DoCall2(IFace[] i, int x, int y)
{
i[0].Call(x, y);
i[1].Call(x, y);
i[0].Call(x, y);
i[1].Call(x, y);
i[0].Call(x, y);
i[1].Call(x, y);
i[0].Call(x, y);
i[1].Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
//[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void DoCall4(IFace[] i, int x, int y)
{
i[0].Call(x, y);
i[1].Call(x, y);
i[2].Call(x, y);
i[3].Call(x, y);
i[0].Call(x, y);
i[1].Call(x, y);
i[2].Call(x, y);
i[3].Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
//[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void DoCall8(IFace[] i, int x, int y)
{
i[0].Call(x, y);
i[1].Call(x, y);
i[2].Call(x, y);
i[3].Call(x, y);
i[4].Call(x, y);
i[5].Call(x, y);
i[6].Call(x, y);
i[7].Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
//[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void DoCall16(IFace[] i, int x, int y)
{
i[0].Call(x, y);
i[1].Call(x, y);
i[2].Call(x, y);
i[3].Call(x, y);
i[4].Call(x, y);
i[5].Call(x, y);
i[6].Call(x, y);
i[7].Call(x, y);
i[8].Call(x, y);
i[9].Call(x, y);
i[10].Call(x, y);
i[11].Call(x, y);
i[12].Call(x, y);
i[13].Call(x, y);
i[14].Call(x, y);
i[15].Call(x, y);
}
}
interface IFace
{
int Call(int x, int y);
}
class C0 : IFace { public int Call(int x, int y) => x + y; }
class C1 : IFace { public int Call(int x, int y) => x + y; }
class C2 : IFace { public int Call(int x, int y) => x + y; }
class C3 : IFace { public int Call(int x, int y) => x + y; }
class C4 : IFace { public int Call(int x, int y) => x + y; }
class C5 : IFace { public int Call(int x, int y) => x + y; }
class C6 : IFace { public int Call(int x, int y) => x + y; }
class C7 : IFace { public int Call(int x, int y) => x + y; }
class C8 : IFace { public int Call(int x, int y) => x + y; }
class C9 : IFace { public int Call(int x, int y) => x + y; }
class CA : IFace { public int Call(int x, int y) => x + y; }
class CB : IFace { public int Call(int x, int y) => x + y; }
class CC : IFace { public int Call(int x, int y) => x + y; }
class CD : IFace { public int Call(int x, int y) => x + y; }
class CE : IFace { public int Call(int x, int y) => x + y; }
class CF : IFace { public int Call(int x, int y) => x + y; }Main: DetailsPR: DetailsAs expected, right now this is a regression when the level of polymorphism is small. We start to get wins when the old cache size grows beyond a dozen entries or so. |
It's probably not that we consider it clobbered, but LSRA has rather decided that |
|
As another experiment, I tried this shape: In this version RhpResolveInterfaceMethodFast will also tail call the target, but otherwise it's the same assembly (i.e. fast monomorphic check and then hashtable lookup). The numbers look like below: DetailsThe monomorphic case is as fast (maybe even a little bit faster) as what we have in main. Between 2-4 there's a bit of a regression, and starting from 8 the hashtable wins. The interesting part is that the numbers now also make more sense than the PR version above - lines 3, 4, 6 of the result are roughly the same, which makes sense because they represent the "we found this in the first entry of the hashtable bucket". The numbers on lines 3, 4, 6 is in the PR version above are not "roughly the same" and I wonder if the extra call is to blame. |
|
I did another few experiments, this time adding more linear caches (not just for the monomorphic case, but also tried with 2 entries and 4 entries). The results look like this (for the same above program). Resolver is the shape described in the top post. TailcallCache1 is the shape described in #123252 (comment). TailCallCache2/4 is the same as TailCallCache1, but the dispatch cell has not just space for one MethodTable-CodePointer pair, but for 2/4 of them.
The numbers in Baseline are known to be bad - we know we need to add extra checks/barriers to it to make it correct (#121632 (comment)). The scenarios with a resolver look pretty bad, so I didn't explore it further. I think the TailCallCache1 looks the most attractive. Because in this scheme the shape of the dispatch cell is fixed, RyuJIT would be able to inline the first cache test (leaving hashtable lookup for the helper). What we currently have in Main has a complicated scheme that would be difficult to inline. So we can make the monomorphic path not require a helper call at all. I was trying to find some more real world scenario to measure on but so far haven't found anything interesting. E.g. JSON serialization has a lot of interface calls, but they are all monomorphic. |
aba4515 to
4c067ef
Compare
|
I pushed out new code and update top-post. This is now based on TailCallCache1 approach above. The RyuJIT change is completely gross, I don't have a good idea for how it should be implemented. Do we want this to stay We'll need to implement and maintain the hashtable lookup in assembly for the other platforms :(. We can probably have a CFG flavor of this helper that will call through the validator. Eventually, we could have RyuJIT inline the monomorphic path. Eventually, we could use this for generic virtual method calls too (they currently all go to the hashtable directly and the hashtable lookup is in C#). |
What would trigger the change in the meaning? I do not think we want to tie this to NAOT ABI since we will want this to work on CoreCLR w/ R2R as well (for iOS). I think the two options are a new |
| xor r9d,ecx | ||
| movsxd r9,r9d | ||
| mov rax,9E3779B97F4A7C15h | ||
| imul r9,rax |
There was a problem hiding this comment.
Where did this hash function come from?
There was a problem hiding this comment.
From here:
Then I made changes to the basic block layout and register allocation. Not sure if this is the way we want to go, but it's an option I saw.
There was a problem hiding this comment.
This is solving the same problem as VSD polymorphic hashtable that has a lot simpler hash function: https://github.com/dotnet/runtime/blob/main/src/coreclr/vm/amd64/virtualcallstubcpu.hpp#L399-L443
We may want to make the design and tunning of this hashtable and the VSD hashtable to be same/similar.
|
Draft Pull Request was automatically closed for 30 days of inactivity. Please let us know if you'd like to reopen it. |
Add a new JIT flag (bit 10) to control whether interface dispatch uses a direct helper call instead of virtual stub dispatch. NativeAOT sets this flag; CoreCLR/ReadyToRun does not. In LowerVirtualStubCall, when the flag is set: - For CT_INDIRECT calls (shared generic dictionary lookup), remove the orphaned gtCallAddr tree from LIR before changing the call type - Convert to CT_USER_FUNC with gtDirectCallAddress pointing to the dispatch resolver helper (CORINFO_HELP_INTERFACELOOKUP_FOR_SLOT) - Clear GTF_CALL_VIRT_STUB and GTF_CALL_M_VIRTSTUB_REL_INDIRECT - The VirtualStubCell arg still passes the dispatch cell in r11 When the flag is not set, the original VSD lowering path is preserved. Re-enable two JIT asserts that were disabled as a hack: - 'Found an unmarked unused value' in CheckLIR (lir.cpp) - 'Expected empty defList at end of block' in buildIntervals (lsrabuild.cpp) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
LowerVirtualStubCall now clears GTF_CALL_VIRT_STUB when converting to a dispatch helper call, so the IsVirtualStub() block in LowerCFGCall is unreachable for NativeAOT. Remove it. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
The helper now dispatches to the target (tail-calls on x64) rather than just looking up the address. Rename to reflect the new semantics. Also remove the now-dead RBM_INTERFACEDISPATCH_FOR_SLOT_TRASH/RETURN register masks and their consumers in emit.cpp, codegencommon.cpp, codegenarmarch.cpp, and lsraarmarch.cpp. These were only used by the deleted CFG VSD code path that created CT_HELPER calls with this helper. The new dispatch helper path creates CT_USER_FUNC calls instead. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Add unoptimized implementations of RhpResolveInterfaceMethodFast for platforms that were missing it: Windows i386, Linux amd64, Linux arm64, Linux arm32, Linux loongarch64, and Linux riscv64. Each implementation performs a null check (triggering an AV that EHHelpers translates to NullReferenceException) then falls through to RhpCidResolve via RhpUniversalTransitionTailCall, matching the established RhpInterfaceDispatchSlow pattern for the platform. ARM32 omits the null check due to register pressure (no free scratch register), consistent with its existing RhpInitialDynamicInterfaceDispatch. The CMakeLists.txt entry is moved from a conditional block (Windows amd64/arm64 only) to the unconditional RUNTIME_SOURCES_ARCH_ASM list. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
4c067ef to
d5832a5
Compare
There was a problem hiding this comment.
Pull request overview
This PR prototypes switching interface dispatch to a resolve-helper-based call sequence (e.g., calling RhpResolveInterfaceMethodFast) and updates the dispatch cell/data layout to support a fast monomorphic cache with out-of-line metadata.
Changes:
- Introduces a new JIT flag (
USE_DISPATCH_HELPERS) and corresponding helper IDs/names for interface dispatch. - Refactors NativeAOT interface dispatch cells to a fixed 2-pointer shape (cached
MethodTable+ cached target), with interface+slot info stored in new ReadyToRun sections. - Removes legacy cached interface dispatch infrastructure in NativeAOT and updates runtime/JIT/tooling/asm stubs accordingly.
Reviewed changes
Copilot reviewed 71 out of 71 changed files in this pull request and generated 4 comments.
Show a summary per file
| File | Description |
|---|---|
| src/coreclr/vm/virtualcallstub.h | Removes now-unused resolve worker declaration for interface lookup slot. |
| src/coreclr/vm/virtualcallstub.cpp | Removes implementation of legacy interface resolve worker path. |
| src/coreclr/vm/jithelpers.cpp | Renames helper entry point declaration to JIT_InterfaceDispatchForSlot. |
| src/coreclr/vm/frames.h | Removes ResolveHelperFrame frame type support. |
| src/coreclr/vm/arm64/stubs.cpp | Removes argument-reg regdisplay helper tied to removed frame. |
| src/coreclr/vm/arm64/asmhelpers.asm | Replaces helper implementation with leaf stub dispatch jump. |
| src/coreclr/vm/amd64/cgenamd64.cpp | Removes ResolveHelperFrame regdisplay logic. |
| src/coreclr/vm/amd64/VirtualCallStubAMD64.asm | Replaces helper with leaf stub dispatch jump. |
| src/coreclr/vm/FrameTypes.h | Removes ResolveHelperFrame from frame type list. |
| src/coreclr/tools/superpmi/superpmi-shared/spmidumphelper.cpp | Adds dumping support for USE_DISPATCH_HELPERS flag. |
| src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp | Adds string name for new CORJIT flag. |
| src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs | Maps new helper to RhpResolveInterfaceMethodFast. |
| src/coreclr/tools/aot/ILCompiler.Compiler/ILCompiler.Compiler.csproj | Includes new InterfaceDispatchCellInfoSectionNode. |
| src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/MetadataManager.cs | Adds new ReadyToRun sections for dispatch cells and metadata. |
| src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/NodeFactory.cs | Removes always-rooted dispatch cell section from graph roots (now emitted via header). |
| src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/InterfaceDispatchCellSectionNode.cs | Changes dispatch cell region emission to BSS section sized by cell count. |
| src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/InterfaceDispatchCellNode.cs | Refactors dispatch cell node to sortable symbol with externally-initialized offsets and fixed size. |
| src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/InterfaceDispatchCellInfoSectionNode.cs | New RO-data section emitting interface+slot metadata and initializing cell offsets. |
| src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs | Adds CORJIT_FLAG_USE_DISPATCH_HELPERS. |
| src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs | Enables USE_DISPATCH_HELPERS flag for non-ReadyToRun builds. |
| src/coreclr/tools/Common/JitInterface/CorInfoHelpFunc.cs | Renames helper enum to INTERFACEDISPATCH_FOR_SLOT. |
| src/coreclr/tools/Common/Internal/Runtime/ModuleHeaders.cs | Assigns new ReadyToRun section IDs 209/210 for dispatch regions. |
| src/coreclr/tools/Common/Compiler/DependencyAnalysis/SortableDependencyNode.cs | Adds ordered emission slots for the new dispatch sections. |
| src/coreclr/runtime/CachedInterfaceDispatch.cpp | Adjusts cache allocation logic for VTableOffset case. |
| src/coreclr/nativeaot/System.Private.TypeLoader/src/Internal/Runtime/TypeLoader/GenericDictionaryCell.cs | Allocates new-style dispatch cell + inline metadata block for dynamic cells. |
| src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/RuntimeImports.cs | Removes RhNewInterfaceDispatchCell import; adds RhpRegisterDispatchCache. |
| src/coreclr/nativeaot/System.Private.CoreLib/src/Internal/Runtime/Augments/RuntimeAugments.cs | Removes managed NewInterfaceDispatchCell API. |
| src/coreclr/nativeaot/Runtime/startup.cpp | Removes legacy interface dispatch initialization. |
| src/coreclr/nativeaot/Runtime/riscv64/DispatchResolve.S | Adds RhpResolveInterfaceMethodFast entry for riscv64. |
| src/coreclr/nativeaot/Runtime/portable.cpp | Removes NYI cached interface dispatch helper stubs. |
| src/coreclr/nativeaot/Runtime/loongarch64/DispatchResolve.S | Adds RhpResolveInterfaceMethodFast entry for loongarch64. |
| src/coreclr/nativeaot/Runtime/inc/rhbinder.h | Removes legacy cached interface dispatch structs/types. |
| src/coreclr/nativeaot/Runtime/i386/DispatchResolve.asm | Adds RhpResolveInterfaceMethodFast entry for i386. |
| src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.asm | Removes “return-result” universal transition variant. |
| src/coreclr/nativeaot/Runtime/arm64/DispatchResolve.asm | Simplifies arm64 fast helper to universal-transition slow path. |
| src/coreclr/nativeaot/Runtime/arm64/DispatchResolve.S | Adds Unix arm64 RhpResolveInterfaceMethodFast. |
| src/coreclr/nativeaot/Runtime/arm64/CachedInterfaceDispatchAot.asm | Deletes old cached interface dispatch slow stub (Windows). |
| src/coreclr/nativeaot/Runtime/arm64/CachedInterfaceDispatchAot.S | Deletes old cached interface dispatch slow stub (Unix). |
| src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S | Adds ARM RhpResolveInterfaceMethodFast. |
| src/coreclr/nativeaot/Runtime/amd64/UniversalTransition.asm | Removes “return-result” universal transition variant. |
| src/coreclr/nativeaot/Runtime/amd64/DispatchResolve.asm | Reworks amd64 fast helper to use 2-pointer cell + global cache. |
| src/coreclr/nativeaot/Runtime/amd64/DispatchResolve.S | Adds Unix amd64 RhpResolveInterfaceMethodFast. |
| src/coreclr/nativeaot/Runtime/amd64/CachedInterfaceDispatchAot.asm | Deletes old cached interface dispatch slow stub (Windows). |
| src/coreclr/nativeaot/Runtime/amd64/CachedInterfaceDispatchAot.S | Deletes old cached interface dispatch slow stub (Unix). |
| src/coreclr/nativeaot/Runtime/allocheap.h | Deletes allocator used by removed cached interface dispatch code. |
| src/coreclr/nativeaot/Runtime/allocheap.cpp | Deletes allocator implementation used by removed code. |
| src/coreclr/nativeaot/Runtime/SyncClean.cpp | Removes GC-time reclaim hook for old dispatch caches. |
| src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp | Removes universal transition return-result thunk category support. |
| src/coreclr/nativeaot/Runtime/RuntimeInstance.cpp | Adds g_pDispatchCache and RhpRegisterDispatchCache internal call. |
| src/coreclr/nativeaot/Runtime/EHHelpers.cpp | Updates interface-dispatch AV-location list to new helper set. |
| src/coreclr/nativeaot/Runtime/Crst.h | Removes unused interface dispatch lock enum value. |
| src/coreclr/nativeaot/Runtime/CachedInterfaceDispatch_Aot.cpp | Deletes old cached interface dispatch PAL implementation. |
| src/coreclr/nativeaot/Runtime/CachedInterfaceDispatchPal.h | Deletes old cached interface dispatch PAL header. |
| src/coreclr/nativeaot/Runtime/CMakeLists.txt | Removes cached dispatch sources/defines; adds DispatchResolve to all arch asm lists. |
| src/coreclr/nativeaot/Runtime/AsmOffsetsVerify.cpp | Removes include dependency on deleted cached dispatch header. |
| src/coreclr/nativeaot/Runtime/AsmOffsets.h | Removes asm offsets/constants for deleted cached dispatch structs. |
| src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/InternalCalls.cs | Removes internal-call declarations for old cache APIs. |
| src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/CachedInterfaceDispatch.cs | Implements new dispatch cell metadata lookup, caching, and registration. |
| src/coreclr/nativeaot/CMakeLists.txt | Removes -mcx16 option previously used by old dispatch cache synchronization. |
| src/coreclr/jit/targetarm64.h | Removes register-kill metadata for old interface-lookup helper. |
| src/coreclr/jit/targetamd64.h | Removes register-kill metadata for old interface-lookup helper. |
| src/coreclr/jit/lsraarmarch.cpp | Removes special return-reg handling for old interface-lookup helper. |
| src/coreclr/jit/lower.cpp | Routes VSD calls to INTERFACEDISPATCH_FOR_SLOT helper under new flag/CFG, removing older CFG lowering path. |
| src/coreclr/jit/jitee.h | Adds JIT flag constant and mapping for USE_DISPATCH_HELPERS. |
| src/coreclr/jit/emit.cpp | Removes special saved-reg computation for old interface-lookup helper. |
| src/coreclr/jit/compiler.h | Adds ShouldUseDispatchHelpers() query for the new JIT flag. |
| src/coreclr/jit/codegencommon.cpp | Removes killset and pending-call-label handling tied to old helper. |
| src/coreclr/jit/codegenarmarch.cpp | Removes special return-reg selection for old interface-lookup helper. |
| src/coreclr/inc/jithelpers.h | Renames helper entry to CORINFO_HELP_INTERFACEDISPATCH_FOR_SLOT. |
| src/coreclr/inc/corjitflags.h | Adds CORJIT_FLAG_USE_DISPATCH_HELPERS to public header. |
| src/coreclr/inc/corinfo.h | Renames helper enum to CORINFO_HELP_INTERFACEDISPATCH_FOR_SLOT. |
Comments suppressed due to low confidence (1)
src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/InterfaceDispatchCellSectionNode.cs:12
- This file now has several unused
usingdirectives/aliases (e.g.,System.Collections.Generic,Internal.Runtime,Internal.TypeSystem, and theDebugalias). WithTreatWarningsAsErrorsenabled, unused usings (CS8019) can break the build. Remove the unusedusingstatements.
You can also share your feedback on Copilot code review. Take the survey.
| info = new DispatchCellInfo | ||
| { | ||
| CellType = DispatchCellType.InterfaceAndSlot, | ||
| InterfaceType = (MethodTable*)(dispatchCellInfo + (cellIndex * 2)), |
There was a problem hiding this comment.
In the non-relative pointer case, InterfaceType is set to the address of the entry inside the info array (dispatchCellInfo + ...) rather than the value stored there. This will produce an invalid MethodTable* and break dispatch resolution on targets that don't use relative pointers. Load/dereference the element value instead of pointer-arithmetic casting.
| InterfaceType = (MethodTable*)(dispatchCellInfo + (cellIndex * 2)), | |
| InterfaceType = (MethodTable*)(*(dispatchCellInfo + (cellIndex * 2))), |
| { | ||
| private readonly NodeFactory _factory; | ||
| private readonly CompilerComparer _comparer = CompilerComparer.Instance; | ||
|
|
||
| public DispatchCellComparer(NodeFactory factory) | ||
| { | ||
| _factory = factory; | ||
| } |
There was a problem hiding this comment.
DispatchCellComparer stores a _factory field that is never used. With TreatWarningsAsErrors enabled, this will fail the build (CS0414). Remove the unused field/constructor parameter, or use it in comparisons if intended.
|
|
||
| using Internal.Runtime; | ||
| using Internal.Text; | ||
| using Internal.TypeSystem; | ||
|
|
||
| using ILCompiler.DependencyAnalysisFramework; | ||
|
|
There was a problem hiding this comment.
using Internal.Runtime; is now unused in this file (it was previously needed for relocation/flag types). With TreatWarningsAsErrors enabled, this can break the build due to CS8019. Remove the unused using directive.
|
/azp run runtime-nativeaot-outerloop |
|
Azure Pipelines successfully started running 1 pipeline(s). |
When a direct call address is not reachable by BL on ARM32, the address is loaded into a register and the call type is switched to EC_INDIR_R. Clear params.addr to satisfy the emitter's invariant that EC_INDIR_R calls have addr == nullptr. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
|
/azp run runtime-nativeaot-outerloop |
|
Azure Pipelines successfully started running 1 pipeline(s). |
The helper dispatches to the target (tail-calls) rather than just resolving the address, so the name should reflect the dispatch semantics, consistent with CORINFO_HELP_INTERFACEDISPATCH_FOR_SLOT. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Introduce guarded variants of the interface dispatch helpers that validate the computed dispatch target using the Windows Control Flow Guard validator (__guard_check_icall_fptr) before tail-calling it. - Add ValidateTarget parameter to the UNIVERSAL_TRANSITION macro and instantiate RhpUniversalTransitionGuardedTailCall alongside the existing RhpUniversalTransitionTailCall (AMD64 and ARM64 .asm). - Refactor RhpInterfaceDispatch into an INTERFACE_DISPATCH macro and instantiate RhpInterfaceDispatchGuarded that routes through the guarded UniversalTransition (AMD64 and ARM64 .asm). - Map CORINFO_HELP_INTERFACEDISPATCH_FOR_SLOT to the guarded variant when ControlFlowGuardAnnotations is enabled in the NativeAOT compiler. - Register the guarded dispatch stub for AV-to-NullRef translation in EHHelpers and the guarded return address in StackFrameIterator, guarded by TARGET_WINDOWS && (TARGET_AMD64 || TARGET_ARM64). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
There was a problem hiding this comment.
Pull request overview
Copilot reviewed 71 out of 71 changed files in this pull request and generated 5 comments.
You can also share your feedback on Copilot code review. Take the survey.
src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/CachedInterfaceDispatch.cs
Show resolved
Hide resolved
...ools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/InterfaceDispatchCellSectionNode.cs
Show resolved
Hide resolved
|
/azp run jit-cfg, runtime-nativeaot-outerloop |
|
Azure Pipelines successfully started running 2 pipeline(s). |
|
|
||
| CORINFO_HELP_GVMLOOKUP_FOR_SLOT, // Resolve a generic virtual method target from this pointer and runtime method handle | ||
| CORINFO_HELP_INTERFACELOOKUP_FOR_SLOT, // Resolve a non-generic interface method from this pointer and dispatch cell | ||
| CORINFO_HELP_INTERFACEDISPATCH_FOR_SLOT, // Dispatch a non-generic interface method from this pointer and dispatch cell |
There was a problem hiding this comment.
Are we willing to give up better branch prediction from the non-shared indirect callsite?
BTW: I expect that we will need resolve variant for wasm. Dispatch variant won't work well for wasm since the helper would need to be target signature specific.
There was a problem hiding this comment.
Are we willing to give up better branch prediction from the non-shared indirect callsite?
The resolve helper is a lot slower than a dispatcher. I had resolve helper here in the past. The numbers are at #123252 (comment). Switching back to a dispatcher (#123252 (comment)) was an improvement everywhere. My guess is the branch predictor also uses past behavior or stack or something else, not just the IP.
There was a problem hiding this comment.
The numbers are at
I meant to link #123252 (comment). The resolver helper is in the Resolver column. What this branch is now is TailCallCache1 (but I temporarily deleted a bunch of assembly, so we're running a completely unoptimized version which will be slower).
I expect we'll end up better than TailCallCache1 if we make the hash function a tiny bit simpler. I'm looking at the VSD hashtable right now. We'll not be able to make it as good because we don't want separate stub per callsite, but it could still be a bit better than it was for those measurements.
Port the monomorphic MethodTable cache check from amd64/arm64 to the remaining architectures. i386 (Windows): Use push/pop ebx for a scratch register (matching StubDispatch.asm pattern). EDX is never touched. No barrier needed (x86 TSO guarantees store-store and load-load ordering). ARM32 (Unix): Use r1/r2 saved to red zone as scratch. DMB barrier between MethodTable and Code loads (ARMv7 has no load-acquire). GLOBAL_LABEL (no .thumb_func) for AV location matches EHHelpers.cpp faulting IP comparison. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
|
/azp run runtime-nativeaot-outerloop |
|
Azure Pipelines successfully started running 1 pipeline(s). |
Still very much WIP, don't look at implementation. This works just enough that I can do perf measurements.
This is changing interface dispatch from this shape:
To this shape:
What this is also changing is the contents of the dispatch cell. The dispatch cell is now two pointers: a cached this pointer and a cached target method address. The information about the dispatch cell (interface type and slot) is stored out-of-line (the dispatch cell now goes to the .bss segment and the metadata is readonly data).
On a first dispatch, we call the slow resolution helper that will decompose the dispatch cell to MethodTable+slot, compute the result of lookup and store it in the dispatch cell itself. This is the fastest, monomorphic case.
If we later see dispatches with different kind of
this, we cache them in a global hashtable. The key of the global hashtable is thethisMethodTable address and the dispatch cell address. We use this as the key instead of interface MethodTable+slot+this MethodTable because it's faster to hash/compare and touches less memory.Because the contents/shape of the dispatch cell is now fixed, we can inline the monomorphic case in the invoke sequence. This is not done yet.
Cc @dotnet/ilc-contrib