diff --git a/src/coreclr/vm/arm/asmhelpers.S b/src/coreclr/vm/arm/asmhelpers.S index 58eb054f468c62..532da5f3bc648e 100644 --- a/src/coreclr/vm/arm/asmhelpers.S +++ b/src/coreclr/vm/arm/asmhelpers.S @@ -986,11 +986,22 @@ LEAF_ENTRY Load_R0_R1_4B EPILOG_BRANCH_REG r5 LEAF_END Load_R0_R1_4B +LEAF_ENTRY Load_R0_R1_R2_4B + ldr r0, [r7], #4 +ALTERNATE_ENTRY Load_R1_R2_4B + ldr r1, [r7], #4 + ldr r2, [r7], #4 + ldr r5, [r6], #4 + EPILOG_BRANCH_REG r5 +LEAF_END Load_R0_R1_R2_4B + LEAF_ENTRY Load_R0_R1_R2_R3_4B ldr r0, [r7], #4 +ALTERNATE_ENTRY Load_R1_R2_R3_4B ldr r1, [r7], #4 ALTERNATE_ENTRY Load_R2_R3_4B ldr r2, [r7], #4 +ALTERNATE_ENTRY Load_R3_4B ldr r3, [r7], #4 ldr r5, [r6], #4 EPILOG_BRANCH_REG r5 @@ -1039,11 +1050,22 @@ LEAF_ENTRY Store_R0_R1_4B EPILOG_BRANCH_REG r5 LEAF_END Store_R0_R1_4B +LEAF_ENTRY Store_R0_R1_R2_4B + str r0, [r7], #4 +ALTERNATE_ENTRY Store_R1_R2_4B + str r1, [r7], #4 + str r2, [r7], #4 + ldr r5, [r6], #4 + EPILOG_BRANCH_REG r5 +LEAF_END Store_R0_R1_R2_4B + LEAF_ENTRY Store_R0_R1_R2_R3_4B str r0, [r7], #4 +ALTERNATE_ENTRY Store_R1_R2_R3_4B str r1, [r7], #4 ALTERNATE_ENTRY Store_R2_R3_4B str r2, [r7], #4 +ALTERNATE_ENTRY Store_R3_4B str r3, [r7], #4 ldr r5, [r6], #4 EPILOG_BRANCH_REG r5 @@ -1081,34 +1103,6 @@ LOCAL_LABEL(CopyLoop_4B): EPILOG_BRANCH_REG r5 LEAF_END Load_Stack_4B -LEAF_ENTRY Store_Stack - ldr r9, [r6], #4 // SP offset - ldr r8, [r6], #4 // number of stack slots - add r9, sp, r9 - add r9, r9, #__PWTB_TransitionBlock + SIZEOF__TransitionBlock -LOCAL_LABEL(StoreCopyLoop): - ldr r5, [r9], #4 - str r5, [r7], #8 - subs r8, r8, #4 - bne LOCAL_LABEL(StoreCopyLoop) - ldr r5, [r6], #4 - EPILOG_BRANCH_REG r5 -LEAF_END Store_Stack - -LEAF_ENTRY Store_Stack_4B - ldr r9, [r6], #4 // SP offset - ldr r8, [r6], #4 // number of stack slots - add r9, sp, r9 - add r9, r9, #__PWTB_TransitionBlock + SIZEOF__TransitionBlock -LOCAL_LABEL(StoreCopyLoop_4B): - ldr r5, [r9], #4 - str r5, [r7], #4 - subs r8, r8, #4 - bne LOCAL_LABEL(StoreCopyLoop_4B) - ldr r5, [r6], #4 - EPILOG_BRANCH_REG r5 -LEAF_END Store_Stack_4B - NESTED_ENTRY CallJittedMethodRetVoid, _TEXT, NoHandler PROLOG_PUSH "{r4-r10,lr}" mov r4, sp @@ -1119,6 +1113,8 @@ NESTED_ENTRY CallJittedMethodRetVoid, _TEXT, NoHandler CHECK_STACK_ALIGNMENT blx r5 mov sp, r4 + ldr r4, [r4, 32] + str r2, [r4] EPILOG_POP "{r4-r10,pc}" NESTED_END CallJittedMethodRetVoid, _TEXT @@ -1134,6 +1130,8 @@ NESTED_ENTRY CallJittedMethodRetI4, _TEXT, NoHandler blx r5 mov sp, r4 str r0, [r10] + ldr r4, [r4, 32] + str r2, [r4] EPILOG_POP "{r4-r10,pc}" NESTED_END CallJittedMethodRetI4, _TEXT @@ -1150,6 +1148,8 @@ NESTED_ENTRY CallJittedMethodRetI8, _TEXT, NoHandler mov sp, r4 str r0, [r10] str r1, [r10, 4] + ldr r4, [r4, 32] + str r2, [r4] EPILOG_POP "{r4-r10,pc}" NESTED_END CallJittedMethodRetI8, _TEXT @@ -1172,6 +1172,8 @@ NESTED_ENTRY CallJittedMethodRetBuffR0, _TEXT, NoHandler CHECK_STACK_ALIGNMENT blx r5 mov sp, r4 + ldr r4, [r4, 32] + str r2, [r4] EPILOG_POP "{r4-r10,pc}" NESTED_END CallJittedMethodRetBuffR0, _TEXT @@ -1186,6 +1188,8 @@ NESTED_ENTRY CallJittedMethodRetBuffR1, _TEXT, NoHandler CHECK_STACK_ALIGNMENT blx r5 mov sp, r4 + ldr r4, [r4, 32] + str r2, [r4] EPILOG_POP "{r4-r10,pc}" NESTED_END CallJittedMethodRetBuffR1, _TEXT @@ -1202,6 +1206,8 @@ NESTED_ENTRY CallJittedMethodRetI1, _TEXT, NoHandler mov sp, r4 sxtb r0, r0 str r0, [r10] + ldr r4, [r4, 32] + str r2, [r4] EPILOG_POP "{r4-r10,pc}" NESTED_END CallJittedMethodRetI1, _TEXT @@ -1218,6 +1224,8 @@ NESTED_ENTRY CallJittedMethodRetI2, _TEXT, NoHandler mov sp, r4 sxth r0, r0 str r0, [r10] + ldr r4, [r4, 32] + str r2, [r4] EPILOG_POP "{r4-r10,pc}" NESTED_END CallJittedMethodRetI2, _TEXT @@ -1234,6 +1242,8 @@ NESTED_ENTRY CallJittedMethodRetU1, _TEXT, NoHandler mov sp, r4 uxtb r0, r0 str r0, [r10] + ldr r4, [r4, 32] + str r2, [r4] EPILOG_POP "{r4-r10,pc}" NESTED_END CallJittedMethodRetU1, _TEXT @@ -1250,6 +1260,8 @@ NESTED_ENTRY CallJittedMethodRetU2, _TEXT, NoHandler mov sp, r4 uxth r0, r0 str r0, [r10] + ldr r4, [r4, 32] + str r2, [r4] EPILOG_POP "{r4-r10,pc}" NESTED_END CallJittedMethodRetU2, _TEXT @@ -1283,13 +1295,16 @@ LOCAL_LABEL(HaveInterpThreadContext): cmp r7, #0 beq LOCAL_LABEL(NoManagedThreadOrCallStub) add r6, r7, #OFFSETOF__CallStubHeader__Routines - ldr r7, [r5, #OFFSETOF__InterpThreadContext__pStackPointer] // HERE + ldr r7, [r5, #OFFSETOF__InterpThreadContext__pStackPointer] ldr r5, [r6], 4 // InterpThreadContext EPILOG_POP "{r0-r3}" CHECK_STACK_ALIGNMENT blx r5 + // Fill in the ContinuationContext register + ldr r2, [sp, #__PWTB_ArgumentRegisters + 8] + EPILOG_WITH_TRANSITION_BLOCK_RETURN NESTED_END InterpreterStub, _TEXT @@ -1368,6 +1383,35 @@ NESTED_ENTRY InterpreterStubRetBuffR1, _TEXT, NoHandler EPILOG_POP {pc} NESTED_END InterpreterStubRetBuffR1, _TEXT +LEAF_ENTRY Store_Stack + ldr r9, [r6], #4 // SP offset + ldr r8, [r6], #4 // number of stack slots + add r9, sp, r9 + add r9, r9, #__PWTB_TransitionBlock + SIZEOF__TransitionBlock +LOCAL_LABEL(StoreCopyLoop): + ldr r5, [r9], #4 + str r5, [r7], #8 + subs r8, r8, #4 + bne LOCAL_LABEL(StoreCopyLoop) + ldr r5, [r6], #4 + EPILOG_BRANCH_REG r5 +LEAF_END Store_Stack + +LEAF_ENTRY Store_Stack_4B + ldr r9, [r6], #4 // SP offset + ldr r8, [r6], #4 // number of stack slots + add r9, sp, r9 + add r9, r9, #__PWTB_TransitionBlock + SIZEOF__TransitionBlock +LOCAL_LABEL(StoreCopyLoop_4B): + ldr r5, [r9], #4 + str r5, [r7], #4 + subs r8, r8, #4 + bne LOCAL_LABEL(StoreCopyLoop_4B) + ldr r5, [r6], #4 + EPILOG_BRANCH_REG r5 +LEAF_END Store_Stack_4B + + // ------------------------------------------------------------------ // Create a real TransitionBlock and call CallInterpreterFuncletWorker // to execute an interpreter funclet (catch/finally/filter handler). @@ -1396,9 +1440,9 @@ NESTED_ENTRY CallInterpreterFunclet, _TEXT, NoHandler str r12, [sp, #4] // Load isFilter from original stack location and store as 5th param (1st stack arg) - // After PROLOG_WITH_TRANSITION_BLOCK, original stack args are at __PWTB_TransitionBlock offset + // After PROLOG_WITH_TRANSITION_BLOCK, original stack args are past the ArgumentRegisters // The 5th param (isFilter) was pushed before our stack allocation - ldr r12, [sp, #8 + __PWTB_TransitionBlock + SIZEOF__ArgumentRegisters] + ldr r12, [sp, #8 + __PWTB_ArgumentRegisters + SIZEOF__ArgumentRegisters] str r12, [sp, #0] // r0-r3 remain unchanged diff --git a/src/coreclr/vm/arm/cgencpu.h b/src/coreclr/vm/arm/cgencpu.h index fea7e5a41d5354..f073cca8d32872 100644 --- a/src/coreclr/vm/arm/cgencpu.h +++ b/src/coreclr/vm/arm/cgencpu.h @@ -63,6 +63,7 @@ EXTERN_C void checkStack(void); #define COMMETHOD_PREPAD 12 // # extra bytes to allocate in addition to sizeof(ComCallMethodDesc) #define STACK_ALIGN_SIZE 4 +#define CALL_STACK_ALIGN_SIZE 8 #define JUMP_ALLOCATE_SIZE 8 // # bytes to allocate for a jump instruction #define BACK_TO_BACK_JUMP_ALLOCATE_SIZE 8 // # bytes to allocate for a back to back jump instruction diff --git a/src/coreclr/vm/callstubgenerator.cpp b/src/coreclr/vm/callstubgenerator.cpp index 941ca75e0f0236..fe8d37fb29bceb 100644 --- a/src/coreclr/vm/callstubgenerator.cpp +++ b/src/coreclr/vm/callstubgenerator.cpp @@ -605,12 +605,20 @@ extern "C" void Store_R2_R3(); extern "C" void Store_R3(); extern "C" void Load_R0_R1_4B(); +extern "C" void Load_R0_R1_R2_4B(); extern "C" void Load_R0_R1_R2_R3_4B(); +extern "C" void Load_R1_R2_4B(); +extern "C" void Load_R1_R2_R3_4B(); extern "C" void Load_R2_R3_4B(); +extern "C" void Load_R3_4B(); extern "C" void Load_Stack_4B(); extern "C" void Store_R0_R1_4B(); +extern "C" void Store_R0_R1_R2_4B(); extern "C" void Store_R0_R1_R2_R3_4B(); +extern "C" void Store_R1_R2_4B(); +extern "C" void Store_R1_R2_R3_4B(); extern "C" void Store_R2_R3_4B(); +extern "C" void Store_R3_4B(); extern "C" void Store_Stack_4B(); #endif // TARGET_ARM @@ -787,7 +795,7 @@ extern "C" void Store_FA7(); PCODE CallStubGenerator::GetStackRoutine() { - LOG2((LF2_INTERPRETER, LL_INFO10000, "Load_Stack\n")); + LOG2((LF2_INTERPRETER, LL_INFO10000, "GetStackRoutine\n")); return m_interpreterToNative ? (PCODE)Load_Stack : (PCODE)Store_Stack; } @@ -1119,20 +1127,22 @@ PCODE CallStubGenerator::GetRegRoutine_4B(int r1, int r2) LOG2((LF2_INTERPRETER, LL_INFO10000, "GetRegRoutine_4B\n")); #endif static const PCODE GPRegLoadRoutines_4B[] = { - (PCODE)0, (PCODE)Load_R0_R1_4B, (PCODE)0, (PCODE)Load_R0_R1_R2_R3_4B, - (PCODE)0, (PCODE)0, (PCODE)0, (PCODE)0, + (PCODE)0, (PCODE)Load_R0_R1_4B, (PCODE)Load_R0_R1_R2_4B, (PCODE)Load_R0_R1_R2_R3_4B, + (PCODE)0, (PCODE)0, (PCODE)Load_R1_R2_4B, (PCODE)Load_R1_R2_R3_4B, (PCODE)0, (PCODE)0, (PCODE)0, (PCODE)Load_R2_R3_4B, - (PCODE)0, (PCODE)0, (PCODE)0, (PCODE)0 + (PCODE)0, (PCODE)0, (PCODE)0, (PCODE)Load_R3_4B }; static const PCODE GPRegStoreRoutines_4B[] = { - (PCODE)0, (PCODE)Store_R0_R1_4B, (PCODE)0, (PCODE)Store_R0_R1_R2_R3_4B, - (PCODE)0, (PCODE)0, (PCODE)0, (PCODE)0, + (PCODE)0, (PCODE)Store_R0_R1_4B, (PCODE)Store_R0_R1_R2_4B, (PCODE)Store_R0_R1_R2_R3_4B, + (PCODE)0, (PCODE)0, (PCODE)Store_R1_R2_4B, (PCODE)Store_R1_R2_R3_4B, (PCODE)0, (PCODE)0, (PCODE)0, (PCODE)Store_R2_R3_4B, - (PCODE)0, (PCODE)0, (PCODE)0, (PCODE)0 + (PCODE)0, (PCODE)0, (PCODE)0, (PCODE)Store_R3_4B }; int index = r1 * NUM_ARGUMENT_REGISTERS + r2; - return m_interpreterToNative ? GPRegLoadRoutines_4B[index] : GPRegStoreRoutines_4B[index]; + PCODE routine = m_interpreterToNative ? GPRegLoadRoutines_4B[index] : GPRegStoreRoutines_4B[index]; + _ASSERTE(routine != 0); + return routine; } PCODE CallStubGenerator::GetStackRoutine_4B() @@ -1616,7 +1626,12 @@ CallStubHeader *CallStubGenerator::GenerateCallStub(MethodDesc *pMD, AllocMemTra #endif int targetSlotIndex = m_interpreterToNative ? m_targetSlotIndex : (m_routineIndex - 1); +#ifdef TARGET_ARM + // AAPCS compliant stack alignment for function calls + CallStubHeader *pHeader = new (pHeaderStorage) CallStubHeader(m_routineIndex, targetSlotIndex, pRoutines, ALIGN_UP(m_totalStackSize, CALL_STACK_ALIGN_SIZE), sig.IsAsyncCall(), hasSwiftError, hasSwiftReturnLowering, m_pInvokeFunction); +#else CallStubHeader *pHeader = new (pHeaderStorage) CallStubHeader(m_routineIndex, targetSlotIndex, pRoutines, ALIGN_UP(m_totalStackSize, STACK_ALIGN_SIZE), sig.IsAsyncCall(), hasSwiftError, hasSwiftReturnLowering, m_pInvokeFunction); +#endif // TARGET_ARM return pHeader; } @@ -1713,12 +1728,19 @@ CallStubHeader *CallStubGenerator::GenerateCallStubForSig(MetaSig &sig, MethodDe ComputeCallStub(sig, pRoutines, pContextMD); + int totalStackSize = m_totalStackSize; +#ifdef TARGET_ARM + // AAPCS compliant stack alignment for function calls + totalStackSize = ALIGN_UP(totalStackSize, CALL_STACK_ALIGN_SIZE); +#endif // TARGET_ARM + xxHash hashState; for (int i = 0; i < m_routineIndex; i++) { hashState.AddPointer((void*)pRoutines[i]); } - hashState.Add(m_totalStackSize); + + hashState.Add(totalStackSize); hashState.AddPointer((void*)m_pInvokeFunction); hashState.Add(sig.IsAsyncCall() ? 1 : 0); hashState.Add(m_targetSlotIndex); @@ -1731,7 +1753,7 @@ CallStubHeader *CallStubGenerator::GenerateCallStubForSig(MetaSig &sig, MethodDe m_routineIndex, m_targetSlotIndex, pRoutines, - ALIGN_UP(m_totalStackSize, STACK_ALIGN_SIZE), + ALIGN_UP(totalStackSize, STACK_ALIGN_SIZE), sig.IsAsyncCall(), #if defined(TARGET_APPLE) && defined(TARGET_ARM64) m_hasSwiftError, @@ -1757,9 +1779,9 @@ CallStubHeader *CallStubGenerator::GenerateCallStubForSig(MetaSig &sig, MethodDe void* pHeaderStorage = amTracker.Track(SystemDomain::GetGlobalLoaderAllocator()->GetHighFrequencyHeap()->AllocMem(S_SIZE_T(finalCachedCallStubSize))); // hasSwiftReturnLowering is always false here because m_interpreterToNative = true (see line 1601's logic) #if defined(TARGET_APPLE) && defined(TARGET_ARM64) - CachedCallStub *pHeader = new (pHeaderStorage) CachedCallStub(cachedHeaderKey.HashCode, m_routineIndex, m_targetSlotIndex, pRoutines, ALIGN_UP(m_totalStackSize, STACK_ALIGN_SIZE), sig.IsAsyncCall(), m_hasSwiftError, false /* hasSwiftReturnLowering */, m_pInvokeFunction); + CachedCallStub *pHeader = new (pHeaderStorage) CachedCallStub(cachedHeaderKey.HashCode, m_routineIndex, m_targetSlotIndex, pRoutines, ALIGN_UP(totalStackSize, STACK_ALIGN_SIZE), sig.IsAsyncCall(), m_hasSwiftError, false /* hasSwiftReturnLowering */, m_pInvokeFunction); #else - CachedCallStub *pHeader = new (pHeaderStorage) CachedCallStub(cachedHeaderKey.HashCode, m_routineIndex, m_targetSlotIndex, pRoutines, ALIGN_UP(m_totalStackSize, STACK_ALIGN_SIZE), sig.IsAsyncCall(), false, false, m_pInvokeFunction); + CachedCallStub *pHeader = new (pHeaderStorage) CachedCallStub(cachedHeaderKey.HashCode, m_routineIndex, m_targetSlotIndex, pRoutines, ALIGN_UP(totalStackSize, STACK_ALIGN_SIZE), sig.IsAsyncCall(), false, false, m_pInvokeFunction); #endif s_callStubCache->Add(pHeader); amTracker.SuppressRelease(); @@ -2288,20 +2310,6 @@ void CallStubGenerator::ComputeCallStubWorker(bool hasUnmanagedCallConv, CorInfo } } else -#elif defined(TARGET_ARM) && defined(ARM_SOFTFP) - if (argLocDesc.m_cGenReg != 0 && argLocDesc.m_byteStackSize != 0) - { - ArgLocDesc argLocDescReg = {}; - argLocDescReg.m_idxGenReg = argLocDesc.m_idxGenReg; - argLocDescReg.m_cGenReg = argLocDesc.m_cGenReg; - ProcessArgument(&argIt, argLocDescReg, pRoutines); - - ArgLocDesc argLocDescStack = {}; - argLocDescStack.m_byteStackIndex = argLocDesc.m_byteStackIndex; - argLocDescStack.m_byteStackSize = argLocDesc.m_byteStackSize; - ProcessArgument(&argIt, argLocDescStack, pRoutines); - } - else #endif // UNIX_AMD64_ABI { ProcessArgument(&argIt, argLocDesc, pRoutines); @@ -2377,9 +2385,10 @@ void CallStubGenerator::ProcessArgument(ArgIteratorType *pArgIt, ArgLocDesc& arg RoutineType argType = RoutineType::None; #ifdef TARGET_ARM - if (argLocDesc.m_cGenReg == 2 || argLocDesc.m_byteStackSize >= 8) + bool useRoutine4B = false; + if ((argLocDesc.m_cGenReg * 4 + argLocDesc.m_byteStackSize) >= 8) { - /* do nothing */ + useRoutine4B = true; } else #endif // TARGET_ARM @@ -2415,7 +2424,7 @@ void CallStubGenerator::ProcessArgument(ArgIteratorType *pArgIt, ArgLocDesc& arg { LOG2((LF2_INTERPRETER, LL_INFO10000, "m_cGenReg=%d\n", (int)argLocDesc.m_cGenReg)); #ifdef TARGET_ARM - if (argLocDesc.m_cGenReg == 2) + if (useRoutine4B) { pRoutines[m_routineIndex++] = GetRegRoutine_4B(argLocDesc.m_idxGenReg, argLocDesc.m_idxGenReg + argLocDesc.m_cGenReg - 1); } @@ -2499,7 +2508,7 @@ void CallStubGenerator::ProcessArgument(ArgIteratorType *pArgIt, ArgLocDesc& arg { LOG2((LF2_INTERPRETER, LL_INFO10000, "m_byteStackSize=%d\n", (int)argLocDesc.m_byteStackSize)); #ifdef TARGET_ARM - if (argLocDesc.m_byteStackSize >= 8) + if (useRoutine4B) { pRoutines[m_routineIndex++] = GetStackRoutine_4B(); pRoutines[m_routineIndex++] = argLocDesc.m_byteStackIndex; @@ -2602,6 +2611,15 @@ void CallStubGenerator::ProcessArgument(ArgIteratorType *pArgIt, ArgLocDesc& arg } #endif // ENREGISTERED_PARAMTYPE_MAXSIZE #endif // UNIX_AMD64_ABI +#ifdef TARGET_ARM + if (useRoutine4B) + { + if ((argLocDesc.m_cGenReg * 4 + argLocDesc.m_byteStackSize) % INTERP_STACK_SLOT_SIZE != 0) + { + pRoutines[m_routineIndex++] = (PCODE)InjectInterpStackAlign; + } + } +#endif // TARGET_ARM m_currentRoutineType = argType; }