-
Notifications
You must be signed in to change notification settings - Fork 5.3k
use YMM registers on x64 for BlkUnroll. #33665
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2989,13 +2989,19 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) | |
| // Fill as much as possible using SSE2 stores. | ||
| if (size >= XMM_REGSIZE_BYTES) | ||
| { | ||
| #ifdef FEATURE_SIMD | ||
| bool useYmm = (compiler->getSIMDVectorRegisterByteLength() == YMM_REGSIZE_BYTES) && (size >= YMM_REGSIZE_BYTES); | ||
| #else | ||
| bool useYmm = false; | ||
| #endif | ||
| regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT); | ||
|
|
||
| if (src->gtSkipReloadOrCopy()->IsIntegralConst(0)) | ||
| { | ||
| unsigned regSize = useYmm ? YMM_REGSIZE_BYTES : XMM_REGSIZE_BYTES; | ||
| // If the source is constant 0 then always use xorps, it's faster | ||
| // than copying the constant from a GPR to a XMM register. | ||
| emit->emitIns_R_R(INS_xorps, EA_16BYTE, srcXmmReg, srcXmmReg); | ||
| emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg); | ||
| } | ||
| else | ||
| { | ||
|
|
@@ -3005,20 +3011,36 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) | |
| // For x86, we need one more to convert it from 8 bytes to 16 bytes. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This
Likewise for V256 it can be a shuffle + insert (AVX) or broadcast (AVX2):
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We start with 2 bytes of data after
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How do we start with 2 bytes of data? this is
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh yes, we are transferring from 1 byte that |
||
| emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg); | ||
| #endif | ||
| if (useYmm) | ||
| { | ||
| emit->emitIns_R_R(INS_punpckldq, EA_32BYTE, srcXmmReg, srcXmmReg); | ||
| } | ||
| } | ||
|
|
||
| instruction simdMov = simdUnalignedMovIns(); | ||
| for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize) | ||
| { | ||
| if (dstLclNum != BAD_VAR_NUM) | ||
| { | ||
| emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset); | ||
| } | ||
| else | ||
|
|
||
| auto unrollUsingXMM = [&](unsigned regSize) { | ||
| for (; size >= regSize; size -= regSize, dstOffset += regSize) | ||
| { | ||
| emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg, | ||
| dstAddrIndexScale, dstOffset); | ||
| if (dstLclNum != BAD_VAR_NUM) | ||
| { | ||
| emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset); | ||
| } | ||
| else | ||
| { | ||
| emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg, | ||
| dstAddrIndexScale, dstOffset); | ||
| } | ||
| } | ||
| }; | ||
|
|
||
| if (useYmm) | ||
| { | ||
| unrollUsingXMM(YMM_REGSIZE_BYTES); | ||
| } | ||
| if (size >= XMM_REGSIZE_BYTES) | ||
| { | ||
| unrollUsingXMM(XMM_REGSIZE_BYTES); | ||
tannergooding marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| // TODO-CQ-XArch: On x86 we could initialize 8 byte at once by using MOVQ instead of two 4 byte MOV stores. | ||
|
|
@@ -3206,34 +3228,46 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) | |
| { | ||
| regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT); | ||
|
|
||
| instruction simdMov = simdUnalignedMovIns(); | ||
| for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; | ||
| size -= regSize, srcOffset += regSize, dstOffset += regSize) | ||
| { | ||
| if (srcLclNum != BAD_VAR_NUM) | ||
| { | ||
| emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); | ||
| } | ||
| else | ||
| auto unrollUsingXMM = [&](unsigned regSize, regNumber tempReg) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd prefer to make these actual methods rather than lambdas. To me the lambdas make it more difficult to see what's being modified, and it seems that sometimes the debugger(s) don't support them all that well. That said, doing that here would require some out (pointer) arguments. @BruceForstall do we have any guidance on this for the JIT?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I did not want to do a method because it would need 7+ arguments and ~5 of them were pointers or references.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right - I realize that there are tradeoffs, but I would probably make them in the other direction. I'd still like to hear from other JIT devs on this question, because I think it will continue to come up.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given that this is mainly just used to cover the regular + trailing case, and the trailing case should only ever be one iteration, might it be simpler to just copy the logic for the one trailing case needed?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For just this case I don't think it's worth debating over - but I think this is a good opportunity to discuss these tradeoffs relative to the JIT coding conventions. It may be that most others feel as I do, but it may also be that most JIT devs would prefer the slight obfuscation and possible debug challenges over the admittedly messy approach of passing in a bunch of pointer arguments. As I say, it's a tradeoff and it would be nice to get some consensus on where the balance should lie. |
||
| instruction simdMov = simdUnalignedMovIns(); | ||
| for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) | ||
| { | ||
| emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, | ||
| srcAddrIndexScale, srcOffset); | ||
| } | ||
| if (srcLclNum != BAD_VAR_NUM) | ||
| { | ||
| emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); | ||
| } | ||
| else | ||
| { | ||
| emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, | ||
| srcAddrIndexScale, srcOffset); | ||
| } | ||
|
|
||
| if (dstLclNum != BAD_VAR_NUM) | ||
| { | ||
| emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); | ||
| } | ||
| else | ||
| { | ||
| emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, | ||
| dstAddrIndexScale, dstOffset); | ||
| if (dstLclNum != BAD_VAR_NUM) | ||
| { | ||
| emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); | ||
| } | ||
| else | ||
| { | ||
| emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, | ||
| dstAddrIndexScale, dstOffset); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores. | ||
| // On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to | ||
| // allocate a GPR just for the remainder. | ||
| // TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores. | ||
| // On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to | ||
| // allocate a GPR just for the remainder. | ||
| }; | ||
|
|
||
| #ifdef FEATURE_SIMD | ||
| if ((compiler->getSIMDVectorRegisterByteLength() == YMM_REGSIZE_BYTES) && (size >= YMM_REGSIZE_BYTES)) | ||
| { | ||
| unrollUsingXMM(YMM_REGSIZE_BYTES, tempReg); | ||
| } | ||
| #endif | ||
| if (size >= XMM_REGSIZE_BYTES) | ||
| { | ||
| unrollUsingXMM(XMM_REGSIZE_BYTES, tempReg); | ||
| } | ||
| } | ||
|
|
||
| if (size > 0) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we want to tie this to the size of
`Vector<T>` rather than to the underlying ISAs that are available (based on the instructions used)? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you mean to replace
`compiler->getSIMDVectorRegisterByteLength() == YMM_REGSIZE_BYTES` with `compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported`? I do not have a preference here; what is your opinion? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, to replace it with a
`compSupports(InstructionSet_AVX2)` check so we aren't tied to `Vector<T>` when we aren't actually using `Vector<T>`. The appropriate check might change slightly with #33274, but I imagine this PR would get merged first. CC. @davidwrighton
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it, will do that before merge, thanks.