62 changes: 50 additions & 12 deletions src/coreclr/jit/codegenxarch.cpp
@@ -2752,26 +2752,47 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
src = src->AsUnOp()->gtGetOp1();
}

unsigned size = node->GetLayout()->GetSize();

// An SSE mov that accesses data larger than 8 bytes may be implemented using
// multiple memory accesses. Hence, the JIT must not use such stores when
// INITBLK zeroes a struct that contains GC pointers and can be observed by
// other threads (i.e. when dstAddr is not an address of a local).
// For example, this can happen when initializing a struct field of an object.
const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences();

#ifdef TARGET_AMD64
// On Amd64 the JIT will not use SIMD stores for such structs and instead
// will always allocate a GP register for src node.
const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
#else
// On X86 the JIT will use movq for structs that are larger than 16 bytes
// since it is more beneficial than using two mov-s from a GP register.
Member commented:
nit: This is 4 moves, since 32-bit can do 4-byte moves, not 8

@echesakov (Contributor, Author) commented on Jun 15, 2021:
I meant a slightly different thing by saying "movq is more beneficial than using two mov-s from a GP register": using movq [m64], xmm is more beneficial than mov [m32], r32 followed by mov [m32+4], r32.

const bool willUseSimdMov = (size >= 16);
#endif

if (!src->isContained())
{
srcIntReg = genConsumeReg(src);
}
else
{
// If src is contained then it must be 0 and the size must be a multiple
// of XMM_REGSIZE_BYTES so initialization can use only SSE2 instructions.
// If src is contained then it must be 0.
assert(src->IsIntegralConst(0));
assert((node->GetLayout()->GetSize() % XMM_REGSIZE_BYTES) == 0);
assert(willUseSimdMov);
#ifdef TARGET_AMD64
assert(size % 16 == 0);
#else
assert(size % 8 == 0);
#endif
}

emitter* emit = GetEmitter();
unsigned size = node->GetLayout()->GetSize();

assert(size <= INT32_MAX);
assert(dstOffset < (INT32_MAX - static_cast<int>(size)));

// Fill as much as possible using SSE2 stores.
if (size >= XMM_REGSIZE_BYTES)
if (willUseSimdMov)
{
regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);

@@ -2791,9 +2812,25 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
#endif
}

instruction simdMov = simdUnalignedMovIns();
for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize)
instruction simdMov = simdUnalignedMovIns();
unsigned regSize = XMM_REGSIZE_BYTES;
unsigned bytesWritten = 0;

while (bytesWritten < size)
{
#ifdef TARGET_X86
if (!canUse16BytesSimdMov || (bytesWritten + regSize > size))
{
simdMov = INS_movq;
regSize = 8;
}
#endif
if (bytesWritten + regSize > size)
{
assert(srcIntReg != REG_NA);
break;
}

if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset);
@@ -2803,11 +2840,12 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}

dstOffset += regSize;
bytesWritten += regSize;
}

// TODO-CQ-XArch: On x86 we could initialize 8 byte at once by using MOVQ instead of two 4 byte MOV stores.
// On x64 it may also be worth zero initializing a 4/8 byte remainder using MOVD/MOVQ, that avoids the need
// to allocate a GPR just for the remainder.
size -= bytesWritten;
}

// Fill the remainder using normal stores.
@@ -4604,7 +4642,7 @@ void CodeGen::genCodeForIndexAddr(GenTreeIndexAddr* node)
// The VM doesn't allow such large array elements but let's be sure.
noway_assert(scale <= INT32_MAX);
#else // !TARGET_64BIT
tmpReg = node->GetSingleTempReg();
tmpReg = node->GetSingleTempReg();
#endif // !TARGET_64BIT

GetEmitter()->emitIns_R_I(emitter::inst3opImulForReg(tmpReg), EA_PTRSIZE, indexReg,
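The comment block and review thread in the diff above describe how the unrolled zero-init picks store sizes: 16-byte SIMD stores where the block is not a GC-observable heap location, an 8-byte movq tail on x86, and plain GP-register stores for whatever remains. Below is a minimal, stand-alone C++ sketch of that selection logic only — it is not JIT code; PlanZeroInitStores and PlannedStore are invented names, and the power-of-two widths used for the GP-register remainder are an assumption about what the "fill the remainder using normal stores" path emits.

```cpp
// Minimal sketch, NOT JIT code: it only mimics the store-size selection in the
// updated genCodeForInitBlkUnroll so the amd64/x86 differences are easy to see.
#include <cstdio>
#include <initializer_list>
#include <string>
#include <vector>

struct PlannedStore
{
    unsigned    offset; // byte offset into the block
    unsigned    width;  // bytes written by this store
    std::string kind;   // "xmm" (16-byte SIMD), "movq" (8-byte SIMD) or "gpr"
};

std::vector<PlannedStore> PlanZeroInitStores(unsigned size, bool isAmd64, bool onHeapWithGcRefs)
{
    std::vector<PlannedStore> plan;
    const bool canUse16ByteSimdMov = !onHeapWithGcRefs;
    const bool willUseSimdMov      = isAmd64 ? (canUse16ByteSimdMov && (size >= 16)) : (size >= 16);

    unsigned offset = 0;
    if (willUseSimdMov)
    {
        unsigned regSize      = 16;
        unsigned bytesWritten = 0;
        while (bytesWritten < size)
        {
            // x86 drops to an 8-byte movq when 16-byte stores are not allowed
            // (heap block with GC refs) or would overrun the block.
            if (!isAmd64 && (!canUse16ByteSimdMov || (bytesWritten + regSize > size)))
            {
                regSize = 8;
            }
            if (bytesWritten + regSize > size)
            {
                break; // the tail is left to GP-register stores below
            }
            plan.push_back({offset, regSize, (regSize == 16) ? "xmm" : "movq"});
            offset += regSize;
            bytesWritten += regSize;
        }
        size -= bytesWritten;
    }

    // Remainder via ordinary GP-register stores, largest power of two first
    // (assumed shape of the "normal stores" path).
    for (unsigned width = isAmd64 ? 8u : 4u; size != 0; width /= 2)
    {
        while (size >= width)
        {
            plan.push_back({offset, width, "gpr"});
            offset += width;
            size -= width;
        }
    }
    return plan;
}

int main()
{
    for (bool isAmd64 : {true, false})
    {
        for (bool onHeapWithGcRefs : {false, true})
        {
            std::printf("%s, 24-byte block, heap+GC refs=%d:\n", isAmd64 ? "amd64" : "x86",
                        onHeapWithGcRefs ? 1 : 0);
            for (const PlannedStore& s : PlanZeroInitStores(24, isAmd64, onHeapWithGcRefs))
            {
                std::printf("  +%2u  %-4s  %u bytes\n", s.offset, s.kind.c_str(), s.width);
            }
        }
    }
    return 0;
}
```

For a 24-byte block this prints, for example, one 16-byte SIMD store plus an 8-byte GP store on amd64 (off-heap), but one 16-byte store plus one movq on x86, matching the review discussion above.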
7 changes: 7 additions & 0 deletions src/coreclr/jit/gentree.h
@@ -5686,6 +5686,13 @@ struct GenTreeBlk : public GenTreeIndir
bool gtBlkOpGcUnsafe;
#endif

#ifdef TARGET_XARCH
bool IsOnHeapAndContainsReferences()
{
return (m_layout != nullptr) && m_layout->HasGCPtr() && !Addr()->OperIsLocalAddr();
}
#endif

GenTreeBlk(genTreeOps oper, var_types type, GenTree* addr, ClassLayout* layout)
: GenTreeIndir(oper, type, addr, nullptr)
, m_layout(layout)
24 changes: 19 additions & 5 deletions src/coreclr/jit/importer.cpp
@@ -16286,11 +16286,25 @@ void Compiler::impImportBlockCode(BasicBlock* block)
"type operand incompatible with type of address");
}

size = info.compCompHnd->getClassSize(resolvedToken.hClass); // Size
op2 = gtNewIconNode(0); // Value
op1 = impPopStack().val; // Dest
op1 = gtNewBlockVal(op1, size);
op1 = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false);
op2 = gtNewIconNode(0); // Value
op1 = impPopStack().val; // Dest

if (eeIsValueClass(resolvedToken.hClass))
{
op1 = gtNewStructVal(resolvedToken.hClass, op1);
if (op1->OperIs(GT_OBJ))
{
gtSetObjGcInfo(op1->AsObj());
}
}
else
{
size = info.compCompHnd->getClassSize(resolvedToken.hClass);
assert(size == TARGET_POINTER_SIZE);
op1 = gtNewBlockVal(op1, size);
}

op1 = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false);
goto SPILL_APPEND;

case CEE_INITBLK:
15 changes: 11 additions & 4 deletions src/coreclr/jit/lowerxarch.cpp
@@ -216,11 +216,18 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)

if (fill == 0)
{
// If the size is multiple of XMM register size there's no need to load 0 in a GPR,
// codegen will use xorps to generate 0 directly in the temporary XMM register.
if ((size % XMM_REGSIZE_BYTES) == 0)
if (size >= XMM_REGSIZE_BYTES)
{
src->SetContained();
const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
#ifdef TARGET_AMD64
const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % 16 == 0);
#else
const bool willUseOnlySimdMov = (size % 8 == 0);
#endif
if (willUseOnlySimdMov)
{
src->SetContained();
}
}
}
#ifdef TARGET_AMD64
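For reference, a hedged paraphrase of the new containment rule in LowerBlockStore above, written as a stand-alone helper (invented name, not JIT code): the constant-zero src can stay contained — so no GP register is reserved for it — exactly when the unroll never needs GP-register stores for a tail.

```cpp
// Sketch of the lowering decision only; assumes the same store-selection
// behavior as the PlanZeroInitStores sketch earlier in this page.
bool CanContainZeroSrc(unsigned size, bool isAmd64, bool onHeapWithGcRefs)
{
    if (size < 16) // below XMM_REGSIZE_BYTES the unroll uses GP stores only
    {
        return false;
    }
    if (isAmd64)
    {
        // amd64 emits only 16-byte SIMD stores, and only when the block is not
        // a GC-observable heap location.
        return !onHeapWithGcRefs && (size % 16 == 0);
    }
    // x86 can always finish the tail with an 8-byte movq, so a multiple of 8
    // is enough regardless of GC refs.
    return (size % 8 == 0);
}
```

Under these assumptions, the predicate is true exactly when the PlanZeroInitStores sketch produces no "gpr" entries.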
14 changes: 11 additions & 3 deletions src/coreclr/jit/lsraxarch.cpp
@@ -1069,7 +1069,7 @@ int LinearScan::BuildCall(GenTreeCall* call)
// The return value will be on the X87 stack, and we will need to move it.
dstCandidates = allRegs(registerType);
#else // !TARGET_X86
dstCandidates = RBM_FLOATRET;
dstCandidates = RBM_FLOATRET;
#endif // !TARGET_X86
}
else if (registerType == TYP_LONG)
@@ -1297,7 +1297,14 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
switch (blkNode->gtBlkOpKind)
{
case GenTreeBlk::BlkOpKindUnroll:
if (size >= XMM_REGSIZE_BYTES)
{
#ifdef TARGET_AMD64
const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
const bool willUseSimdMov = canUse16BytesSimdMov && (size >= 16);
#else
const bool willUseSimdMov = (size >= 16);
#endif
if (willUseSimdMov)
{
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
SetContainsAVXFlags();
@@ -1310,7 +1317,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
srcRegMask = allByteRegs();
}
#endif
break;
}
break;

case GenTreeBlk::BlkOpKindRepInstr:
dstAddrRegMask = RBM_RDI;