Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions src/vm/methodtablebuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1521,14 +1521,11 @@ MethodTableBuilder::BuildMethodTableThrowing(
#if defined(CROSSGEN_COMPILE)
#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
if ((!IsNgenPDBCompilationProcess()
&& GetAppDomain()->ToCompilationDomain()->GetTargetModule() != g_pObjectClass->GetModule())
|| (strcmp(className, "Sse") != 0 && strcmp(className, "Sse2") != 0))
&& GetAppDomain()->ToCompilationDomain()->GetTargetModule() != g_pObjectClass->GetModule()))
#endif // defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
{
// Disable AOT compiling for managed implementation of hardware intrinsics.
// We specially treat them here to ensure correct ISA features are set during compilation
// The only exception to this rule are SSE and SSE2 intrinsics in CoreLib - we can
// safely expand those because we require them to be always available.
COMPlusThrow(kTypeLoadException, IDS_EE_HWINTRINSIC_NGEN_DISALLOWED);
}
#endif // defined(CROSSGEN_COMPILE)
Expand Down
118 changes: 116 additions & 2 deletions src/zap/zapinfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,9 @@ void ZapInfo::CompileMethod()
// this they can add the hint and reduce the perf cost at runtime.
m_pImage->m_pPreloader->PrePrepareMethodIfNecessary(m_currentMethodHandle);

DWORD methodAttribs = getMethodAttribs(m_currentMethodHandle);
// Retrieve method attributes from EEJitInfo - the ZapInfo's version updates
// some of the flags related to hardware intrinsics but we don't want that.
DWORD methodAttribs = m_pEEJitInfo->getMethodAttribs(m_currentMethodHandle);
if (methodAttribs & CORINFO_FLG_AGGRESSIVE_OPT)
{
// Skip methods marked with MethodImplOptions.AggressiveOptimization, they will be jitted instead. In the future,
Expand All @@ -447,6 +449,27 @@ void ZapInfo::CompileMethod()
return;
}

#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
if (methodAttribs & CORINFO_FLG_JIT_INTRINSIC)
{
// Skip generating hardware intrinsic method bodies.
Comment thread
MichalStrehovsky marked this conversation as resolved.
//
// We don't know what the implementation should do (whether it can do the actual intrinsic thing, or whether
// it should throw a PlatformNotSupportedException).

const char* namespaceName;
getMethodNameFromMetadata(m_currentMethodHandle, nullptr, &namespaceName, nullptr);
if (strcmp(namespaceName, "System.Runtime.Intrinsics.X86") == 0
|| strcmp(namespaceName, "System.Runtime.Intrinsics.Arm.Arm64") == 0
|| strcmp(namespaceName, "System.Runtime.Intrinsics") == 0)
{
if (m_zapper->m_pOpt->m_verbose)
m_zapper->Info(W("Skipped due to being a hardware intrinsic\n"));
return;
}
}
#endif

m_jitFlags = ComputeJitFlags(m_currentMethodHandle);

#ifdef FEATURE_READYTORUN_COMPILER
Expand Down Expand Up @@ -2083,6 +2106,94 @@ void ZapInfo::GetProfilingHandle(BOOL *pbHookFunction,
*pbIndirectedHandles = TRUE;
}

//
// This strips the CORINFO_FLG_JIT_INTRINSIC flag from some of the hardware intrinsic methods.
//
DWORD FilterHardwareIntrinsicMethodAttribs(DWORD attribs, CORINFO_METHOD_HANDLE ftn, ICorDynamicInfo* pJitInfo)
{
if (attribs & CORINFO_FLG_JIT_INTRINSIC)
{
// Figure out which intrinsic we are dealing with.
const char* namespaceName;
const char* className;
const char* enclosingClassName;
const char* methodName = pJitInfo->getMethodNameFromMetadata(ftn, &className, &namespaceName, &enclosingClassName);

// Is this the get_IsSupported method that checks whether intrinsic is supported?
bool fIsGetIsSupportedMethod = strcmp(methodName, "get_IsSupported") == 0;

#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
bool fIsX86intrinsic = strcmp(namespaceName, "System.Runtime.Intrinsics.X86") == 0;

// If it's anything related to Sse/Sse2, we can expand unconditionally since this is a baseline
// requirement of CoreCLR.
if (fIsX86intrinsic
&& (
strcmp(className, "Sse") == 0 || strcmp(className, "Sse2") == 0
|| (
strcmp(className, "X64") == 0
&& (
strcmp(enclosingClassName, "Sse") == 0 || strcmp(enclosingClassName, "Sse2") == 0
)
)
)
)
{
return attribs;
}

// If it's an intrinsic that requires VEX encoding, do not report as intrinsic
// to force this to become a regular method call.
// We don't allow RyuJIT to use VEX encoding at AOT compilation time, so these
// cannot be pregenerated. Not reporting them as intrinsic will make sure
// it will do the right thing at runtime (the called method will be JITted).
// It will be slower, but correct.
if (fIsX86intrinsic
&& (
strcmp(className, "Avx") == 0 || strcmp(className, "Fma") == 0 || strcmp(className, "Avx2") == 0 || strcmp(className, "Bmi1") == 0 || strcmp(className, "Bmi2") == 0
|| (
strcmp(className, "X64") == 0
&& (
strcmp(enclosingClassName, "Bmi1") == 0 || strcmp(enclosingClassName, "Bmi2") == 0
)
)
)
)
{
// We do want the IsSupported for VEX instructions to be recognized as intrinsic so that the
// potentially worse quality code doesn't actually run until tiered JIT starts
// kicking in and recompiling methods. Reporting this as intrinsic makes RyuJIT expand it
// into `return false`.
if (fIsGetIsSupportedMethod)
return attribs;

// Treat other intrinsic methods as a regular method call (into a JITted method).
return (attribs & ~CORINFO_FLG_JIT_INTRINSIC) | CORINFO_FLG_DONT_INLINE;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this going to be correct? The calls are recursive and the mustExpand support is dependent on CORINFO_FLG_JIT_INTRINSIC being set.

So, if it is recursive and CORINFO_FLG_JIT_INTRINSIC isn't set, it will be AOT'd as if it was a regular recursive call...

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What we want is the caller to treat these as regular, non-inline method calls and for the JIT to not AOT the method bodies of the intrinsics themselves.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't that what CORINFO_FLG_DONT_INLINE will fix up?

The recursive call within the actual intrinsic method body will never go through here because we don't AOT compile them.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have a compiler check for "is compiling for R2R/crossgen/AOT", right?

It might be simpler to just update the logic in Compiler::impHWIntrinsic method (see https://github.com/dotnet/coreclr/blob/master/src/jit/hwintrinsicxarch.cpp#L61-L70) to do the following

    bool isIsaSupported = comp->compSupports(isa) && comp->compSupportsHWIntrinsic(isa);

    if (strcmp(methodName, "get_IsSupported") == 0)
    {
        return isIsaSupported ? NI_IsSupported_True : NI_IsSupported_False;
    }
    else if (!isIsaSupported)
    {
+       if (isCompilingForAOT)
+       {
+           return nullptr;
+       }
        return NI_Throw_PlatformNotSupportedException;
    }

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe that would allow us to treat the AVX/AVX2/FMA/BMI1/BMI2 code paths as normal (you don't need to mark them as DONT_INLINE or strip the JIT_INTRINSIC flag).

The existing logic for not expanding method bodies would kick-in here: 6b5b161#diff-04a590b55693db432b3a24d0d9a59cceR451

And the added check would ensure that not supported ISAs (which these won't be due to us not setting the compiler flags in the zapper here: 6b5b161#diff-efefdb418a1b8be4f78c7c9aedf91cb4L1212) will return nullptr (forcing them to be emitted as regular method calls).

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tannergooding is correct here - the intrinsics are recognized during importing, prior to the call being considered for inlining.

I think it makes some sense to unconditionally fail to expand (i.e. return nullptr) for these when pre-jitting (aka AOT compiling). I don't think the JIT can distinguish when it's compiling SPC.dll vs. something else, but I don't think it makes a difference in practice, since any intrinsics that are called under the check for IsSupported will fail to pre-jit anyway.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't believe so. Intrinsics are special and are handled before the inlining logic.

But this line strips the intrinsic bit from the method. It should be treated as a simple method call, no?

That's what we want for BMI and friends - keep IsSupported untouched and reported as an intrinsic so that it expands to false, but treat everything else as a regular non-inlined method call - the fact that the IL is recursive shouldn't matter because RyuJIT is not going to look at it due to the no inline flag, no?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the fact that the IL is recursive shouldn't matter because RyuJIT is not going to look at it due to the no inline flag, no?

The problem ends up coming in when we go to AOT those methods. You stripped the JIT_INTRINSIC flag so the logic for "skip generating hardware intrinsic method bodies" won't kick in. Which means we will AOT the BMI and friends method bodies as regular recursive calls.

Copy link
Copy Markdown
Member Author

@MichalStrehovsky MichalStrehovsky Jun 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You stripped the JIT_INTRINSIC flag so the logic for "skip generating hardware intrinsic method bodies" won't kick in

Attributes in that spot are retrieved from the standard JitInterface - we don't call the getMethodAttribs that messes with the intrinsic bit there.

Precompiling method bodies on e.g. the Bmi1 class would be wrong.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, ok. That definitely wasn't clear to me. A comment calling it out might be helpful.

}

#endif // defined(_TARGET_X86_) || defined(_TARGET_AMD64_)

// Do not report the get_IsSupported method as an intrinsic if it's an intrinsic on the architecture
// we are targeting. This will turn the call into a regular call.
// We also make sure none of the hardware intrinsic method bodies get pregenerated in crossgen
// (see ZapInfo::CompileMethod) but get JITted instead. The JITted method will have the correct
// answer for the CPU the code is running on.
if (fIsGetIsSupportedMethod && (
#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
fIsX86intrinsic ||
#elif _TARGET_ARM64_
strcmp(namespaceName, "System.Runtime.Intrinsics.Arm.Arm64") == 0 ||
#endif
strcmp(namespaceName, "System.Runtime.Intrinsics") == 0))
{
// Treat as a regular method call (into a JITted method).
return (attribs & ~CORINFO_FLG_JIT_INTRINSIC) | CORINFO_FLG_DONT_INLINE;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same concern as I called out above. Intrinsics are special and are handled in the importer before inlining checks and the recursive calls are dependent on mustExpand (which itself depends on JIT_INTRINSIC flag).

It might be better to just fixup the compiler::impIntrinsic method to have a if (isCompilingForAOT) path for the IsSupported checks, which returns nullptr rather than a gtNewIconNode (see https://github.com/dotnet/coreclr/blob/master/src/jit/importer.cpp#L3486-L3494, as we handle the IsSupported and PlatformNotSupportedException intrinsics a bit earlier).

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense to me, though note that the JIT doesn't know whether it's SPC.dll or not.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that actually a concern now that we have a mechanism to handle functions which consume AVX/AVX2/FMA/BMI1/BMI2 methods, so things work even if they are called without an IsSupported check?

I believe the "worst case" scenario is someone doesn't use the check and calls, for example Sse41.DotProduct on a processor that doesn't support it. We will have emitted the correct instruction encoding and that will trigger a hardware exception as the instruction isn't supported.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is also the chance for subtle bugs, if users don't provide software fallbacks, but I think it might be better to just have that be upfront in the initial release. If we ever want to make this more generally available, users will always have that scenario.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right - I don't think it is a concern that we don't specially handle SPC.dll here.

I believe the "worst case" scenario is someone doesn't use the check and calls, for example Sse41.DotProduct on a processor that doesn't support it.

But the code won't work in any event. I guess if they were actually relying on getting PNSE it might. I suppose we could trap the hardware exception and throw PNSE, but I'm not sure that complication is justified.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, I think it would be better to encourage people to have the appropriate IsSupported checks 😄

}
}

return attribs;
}

//return a callable stub that will do the virtual or interface call


Expand All @@ -2108,6 +2219,8 @@ void ZapInfo::getCallInfo(CORINFO_RESOLVED_TOKEN * pResolvedToken,
(CORINFO_CALLINFO_FLAGS)(flags | CORINFO_CALLINFO_KINDONLY),
pResult);

pResult->methodFlags = FilterHardwareIntrinsicMethodAttribs(pResult->methodFlags, pResult->hMethod, m_pEEJitInfo);

#ifdef FEATURE_READYTORUN_COMPILER
if (IsReadyToRunCompilation())
{
Expand Down Expand Up @@ -3678,7 +3791,8 @@ unsigned ZapInfo::getMethodHash(CORINFO_METHOD_HANDLE ftn)

DWORD ZapInfo::getMethodAttribs(CORINFO_METHOD_HANDLE ftn)
{
return m_pEEJitInfo->getMethodAttribs(ftn);
DWORD result = m_pEEJitInfo->getMethodAttribs(ftn);
return FilterHardwareIntrinsicMethodAttribs(result, ftn, m_pEEJitInfo);
}

void ZapInfo::setMethodAttribs(CORINFO_METHOD_HANDLE ftn, CorInfoMethodRuntimeFlags attribs)
Expand Down
24 changes: 22 additions & 2 deletions src/zap/zapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1189,11 +1189,31 @@ void Zapper::InitializeCompilerFlags(CORCOMPILE_VERSION_INFO * pVersionInfo)
#endif // _TARGET_X86_

#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_) || defined(_TARGET_ARM64_)
// If we're compiling CoreLib, allow RyuJIT to generate SIMD code so that we can expand some
// of the hardware intrinsics.
// If we're crossgenning CoreLib, allow generating non-VEX intrinsics. The generated code might
// not actually be supported by the processor at runtime so we compensate for it by
// not letting the get_IsSupported method to be intrinsically expanded in crossgen
// (see special handling around CORINFO_FLG_JIT_INTRINSIC in ZapInfo).
// That way the actual support checks will always be jitted.
// We only do this for CoreLib because forgetting to wrap intrinsics under IsSupported
// checks can lead to illegal instruction traps (instead of a nice managed exception).
Comment thread
MichalStrehovsky marked this conversation as resolved.
if (m_pEECompileInfo->GetAssemblyModule(m_hAssembly) == m_pEECompileInfo->GetLoaderModuleForMscorlib())
{
m_pOpt->m_compilerFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_FEATURE_SIMD);

#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could potentially cause problems. The JIT currently decides to emit the VEX or non-VEX encoding based on whether or not AVX is supported.

This isn't going to correctly handle that difference for an SSE exclusive vs an AVX checked code path, which will prevent the generated code from running on some downstream processors.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we keep the "do not crossgen" behavior for AVX/FMA/AVX2, would that fix the issue?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the safest thing to do for now would be to effectively treat the AOT codegen as if only SSE and SSE2 had been reported as supported by the CPUID checks the VM does and to treat everything else as "not supported" (we should likely also consider whether or not crossgen ignores the COMPlus_EnableISA=0 flags).

That should mitigate the startup time regressions and throughput should get fairly quickly resolved by rejitting the hot methods.

Anything else is going to have larger potential for a bug tail or may impact perf in interesting ways (hurting benchmark numbers, for example) and likely needs more investigating before we settle on a solution.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I confess that this hadn't occurred to me. I'd like to see us have the ability to include runtime-checked dual-version code, but I'm pretty sure the JIT can't generate both SSE and VEX encodings in the same method.
I'm not sure why we couldn't also handle the other Sse* intrinsics, with the assumption that such code will (as expected) be guarded with the appropriate IsSupported runtime call.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure why we couldn't also handle the other Sse* intrinsics, with the assumption that such code will (as expected) be guarded with the appropriate IsSupported runtime call.

I agree.

Long term, we should be able to support a static AOT target or one of a couple forms of dynamic dispatch. The two most common, that I am aware of, are cached CPUID checks (which result in a branch per ISA target) and changing the method target (which involves fixing up which target the method points to).

But, those are post 3.0 goals and likely require some fixups to support generating both VEX and non-VEX code paths, for example.

m_pOpt->m_compilerFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_USE_AES);
m_pOpt->m_compilerFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_USE_PCLMULQDQ);
m_pOpt->m_compilerFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_USE_SSE3);
m_pOpt->m_compilerFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_USE_SSSE3);
m_pOpt->m_compilerFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_USE_SSE41);
m_pOpt->m_compilerFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_USE_SSE42);
m_pOpt->m_compilerFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_USE_POPCNT);
// Leaving out CORJIT_FLAGS::CORJIT_FLAG_USE_AVX, CORJIT_FLAGS::CORJIT_FLAG_USE_FMA
// CORJIT_FLAGS::CORJIT_FLAG_USE_AVX2, CORJIT_FLAGS::CORJIT_FLAG_USE_BMI1,
// CORJIT_FLAGS::CORJIT_FLAG_USE_BMI2 on purpose - these require VEX encodings
// and the JIT doesn't support generating code for methods with mixed encodings.
m_pOpt->m_compilerFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_USE_LZCNT);
#endif // defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
}
#endif // defined(_TARGET_X86_) || defined(_TARGET_AMD64_) || defined(_TARGET_ARM64_)

Expand Down