From ef0e27c9f647d943a6aac9f147e2a68be783157a Mon Sep 17 00:00:00 2001 From: Sebastian Nickolls Date: Fri, 3 Oct 2025 12:38:18 +0000 Subject: [PATCH 1/2] Implement UnknownSizeFrame for locals with unknown size Implements a simple bump allocator for TYP_SIMD and TYP_MASK. Locals are allocated to this space when lvaIsUnknownSizeLocal is true for the variable. The frame is implemented on ARM64 as two homogeneous blocks containing either TYP_SIMD or TYP_MASK locals. The x19 register is reserved for addressing locals in the block. Updates codegen for SVE memory transfer instructions to accept indices in multiples of the vector length (or VL / 8 for masks) instead of deriving them from the size of the local. --- src/coreclr/jit/codegen.h | 4 + src/coreclr/jit/codegenarmarch.cpp | 38 +++++++ src/coreclr/jit/codegencommon.cpp | 16 +++ src/coreclr/jit/compiler.h | 170 +++++++++++++++++++++++++++++ src/coreclr/jit/compiler.hpp | 14 ++- src/coreclr/jit/emitarm64.cpp | 100 ++++++++--------- src/coreclr/jit/jit.h | 2 - src/coreclr/jit/lclvars.cpp | 81 ++++++++++++++ src/coreclr/jit/lsra.cpp | 10 ++ src/coreclr/jit/lsrabuild.cpp | 8 ++ src/coreclr/jit/regset.cpp | 4 +- src/coreclr/jit/targetarm64.h | 2 + 12 files changed, 388 insertions(+), 61 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 6b2ca6af8484d4..f9b745f87b2910 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -426,6 +426,10 @@ class CodeGen final : public CodeGenInterface void genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroed); +#if defined(TARGET_ARM64) + void genUnknownSizeFrame(); +#endif + #elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) bool genInstrWithConstant(instruction ins, emitAttr attr, diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index fcef600921e254..f797e743dbadb2 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -4876,9 +4876,47 
@@ void CodeGen::genPushCalleeSavedRegisters() m_compiler->compFrameInfo.calleeSaveSpOffset = calleeSaveSpOffset; m_compiler->compFrameInfo.calleeSaveSpDelta = calleeSaveSpDelta; m_compiler->compFrameInfo.offsetSpToSavedFp = offsetSpToSavedFp; + + if (m_compiler->compUsesUnknownSizeFrame) + { + genUnknownSizeFrame(); + } #endif // TARGET_ARM64 } +#if defined(TARGET_ARM64) +// See Compiler::UnknownSizeFrame for implementation details. +void CodeGen::genUnknownSizeFrame() +{ + assert(m_compiler->compLocallocUsed && m_compiler->compUsesUnknownSizeFrame); + assert(m_compiler->unkSizeFrame.isFinalized); + unsigned totalVectorCount = m_compiler->unkSizeFrame.FrameSizeInVectors(); + + // We reserve REG_UNKBASE for addressing SVE locals. This will always point at the top + // of the UnknownSizeFrame and we index into it. + // TODO-SVE: We may want this to point into the middle of the frame to reduce address + // computations (we have a signed 9-bit indexing immediate). + inst_Mov(TYP_I_IMPL, REG_UNKBASE, REG_SP, false); + + if (0 < totalVectorCount && totalVectorCount <= 32) + { + GetEmitter()->emitIns_R_R_I(INS_sve_addvl, EA_8BYTE, REG_SP, REG_SP, -(ssize_t)totalVectorCount); + } + else + { + // Generate `sp = sp - totalVectorCount * VL` + assert(totalVectorCount != 0); + regNumber rsvd = rsGetRsvdReg(); + // mov rsvd, #totalVectorCount + // rdvl scratch, #1 + // msub sp, rsvd, scratch, sp + instGen_Set_Reg_To_Imm(EA_8BYTE, rsvd, totalVectorCount); + GetEmitter()->emitIns_R_I(INS_sve_rdvl, EA_8BYTE, REG_SCRATCH, 1); + GetEmitter()->emitIns_R_R_R_R(INS_msub, EA_8BYTE, REG_SP, rsvd, REG_SCRATCH, REG_SP); + } +} +#endif + /***************************************************************************** * * Generates code for a function epilog. 
diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 058316551fb377..2e88b7241dcd4f 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -3644,6 +3644,11 @@ void CodeGen::genCheckUseBlockInit() continue; } + if (m_compiler->lvaIsUnknownSizeLocal(varNum)) + { + continue; + } + if (m_compiler->fgVarIsNeverZeroInitializedInProlog(varNum)) { varDsc->lvMustInit = 0; @@ -4001,6 +4006,12 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, noway_assert(varDsc->lvOnFrame); + if (m_compiler->lvaIsUnknownSizeLocal(varNum)) + { + // This local will belong on the UnknownSizeFrame, which will handle zeroing instead. + continue; + } + // lvMustInit can only be set for GC types or TYP_STRUCT types // or when compInitMem is true // or when in debug code @@ -5067,6 +5078,11 @@ void CodeGen::genFnProlog() continue; } + if (m_compiler->lvaIsUnknownSizeLocal(varNum)) + { + continue; + } + signed int loOffs = varDsc->GetStackOffset(); signed int hiOffs = varDsc->GetStackOffset() + m_compiler->lvaLclStackHomeSize(varNum); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index cb1137a8b4d0c5..605475b159e9d6 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -4124,6 +4124,176 @@ class Compiler int lvaOSRLocalTier0FrameOffset(unsigned varNum); + //------------------------- UnknownSizeFrame --------------------------------- + + void lvaInitUnknownSizeFrame(); + void lvaAllocUnknownSizeLocal(unsigned varNum); + + bool compUsesUnknownSizeFrame; + +#if defined(FEATURE_SIMD) && defined(TARGET_ARM64) + // For ARM64, the UnknownSizeFrame lives at the end of the statically + // allocated stack space. This means it belongs to the 'alloca' space on the + // frame, and it is essentially the first dynamically allocated stack + // variable. + // + // Currently, the only locals with unknown size are SIMD types supporting + // Vector, TYP_SIMD and TYP_MASK. 
We do not know the size of these types + // at compile time, so we need to execute the rdvl/addvl instruction to + // learn this size and allocate the UnknownSizeFrame. + // + // We reserve the x19 register to point to the top of the UnknownSizeFrame + // and use this as the base address for local variables with unknown size. + // Reserving a register is simpler than using fp/sp, as fp may point + // to different locations depending on various properties of the frame, and + // the value of sp may change at runtime. + // + // Typically, a vector is loaded using a base address and some index which + // the instruction will scale by VL, for example: `ldr z0, [x19, #3 MUL VL]`. + // A mask is loaded with `ldr p0, [x19, #3 MUL VL]`, but in this case the + // `MUL VL` indicates we are scaling with the length of the predicate + // register rather than the vector. A predicate register is defined to have + // 1/8th the length of a vector register. + // + // We know that sizeof(TYP_SIMD) and sizeof(TYP_MASK) are invariant despite + // being unknown at compile time, so we allocate them in single homogeneous + // blocks per type. An individual local can be referenced from the start of + // its block by an index into the block. + // + // The difference in addressing-mode index scaling means we have to be + // careful where we place the mask locals block with respect to the vector + // locals block. If we place the mask locals after the vector locals, we'll + // need to offset the load index by (8 * nVector) to account for the vector + // locals. + // + // Instead, we choose to pad the mask locals block to VL and place it at the + // beginning of the frame (closest to fp). This way we'll need to offset + // vector load indices by `roundUp(nMask, 8) / 8`. This is less likely to + // put pressure on the immediate encoding range and result in requiring an + // address computation. 
+ // + // The maximum wasted space from the padding is 7/8ths VL (224 bytes with + // the architectural maximum 256 byte vectors), which occurs when 1 mask + // local is spilled to the frame. Alternatively this is 28 bytes for 32 byte + // vectors, for an example closer to today's implementations. + // + // The padding also makes it simple to allocate the UnknownSizeFrame since + // the UnknownSizeFrame will be aligned to VL. The total number of vectors + // to allocate is `(roundUp(nMask, 8) / 8) + nVector`. The stack pointer + // can be adjusted with a single instruction `addvl sp, sp, #totalVectors`. + // + // See the diagram below for a visual representation of this scheme. + // + // ... + // | static space | + // | (totalFrameSize) | + // +----------------------------------+ x19, begin UnknownSizeFrame + // | mask locals block | ^ + // | (nMask * VL/8) | | + // +----------------------------------+ | + // | padding to VL alignment | | + // +----------------------------------+ (roundUp(nMask, 8)/8 + nVector)*VL + // | | | + // | vector locals block | | + // | (nVector * VL) | | + // | | v + // +----------------------------------+ end UnknownSizeFrame + // | | + // | rest of alloca space | + // ... sp + struct UnknownSizeFrame + { + // Number of allocated vectors/masks. These also represent the end of + // the allocation space for each block. The allocator for each block is + // a simple bump allocator. + unsigned nVector = 0; + unsigned nMask = 0; + +#ifdef DEBUG + bool isFinalized = false; +#endif + + // Returns the size of the mask block in number of vector lengths. + unsigned MaskBlockSizeInVectors() + { + assert(roundUp(0U, 8U) == 0); + return roundUp(nMask, 8) / 8; + } + + // Returns the size of the vector block in number of vector lengths. + unsigned VectorBlockSize() + { + return nVector; + } + + // Returns the size of the total UnknownSizeFrame in number of vector + // lengths. 
+ unsigned FrameSizeInVectors() + { + return MaskBlockSizeInVectors() + VectorBlockSize(); + } + + // Allocate a mask, returning an index of the mask in the mask block. + unsigned AllocMask() + { + assert(!isFinalized); + unsigned idx = nMask; + nMask++; + return idx; + } + + // Allocate a vector, returning an index of the vector in the vector + // block. + unsigned AllocVector() + { + assert(!isFinalized); + unsigned idx = nVector; + nVector++; + return idx; + } + + // Returns a negative offset relative to the base of the UnknownSizeFrame + // for addressing an allocated vector or mask local. + // If `isMask == true`, given an index that was assigned to mask local, + // the returned offset is an index measured in units of VL/8. + // Otherwise given an index that was assigned to a vector local, the + // returned offset is measured in units of VL. + // The index parameter should have been obtained through AllocMask() or + // AllocVector(). + int GetOffset(unsigned index, bool isMask = false) + { + // We can't compute addresses if we haven't finished allocating. + assert(isFinalized); + + unsigned offset = UINT32_MAX; + if (isMask) + { + assert(index < nMask); + offset = index; + } + else + { + assert(index < nVector); + offset = MaskBlockSizeInVectors() + index; + } + assert(offset != UINT32_MAX); + // The index is always offset by 1 as we are writing from below fp + // upwards. + return -(int)(offset + 1); + } + + // This system ensures we don't try and generate an address on the frame + // without finishing all allocations. 
+ void Finalize() + { +#ifdef DEBUG + isFinalized = true; +#endif + } + + } unkSizeFrame; +#endif + //------------------------ For splitting types ---------------------------- void lvaInitTypeRef(); diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp index 4484a8ae95075c..ec5e456888552e 100644 --- a/src/coreclr/jit/compiler.hpp +++ b/src/coreclr/jit/compiler.hpp @@ -2744,7 +2744,7 @@ inline #endif // !TARGET_AMD64 } - FPbased = varDsc->lvFramePointerBased; + FPbased = varDsc->lvFramePointerBased && !lvaIsUnknownSizeLocal(varNum); #ifdef DEBUG #if FEATURE_FIXED_OUT_ARGS @@ -2765,7 +2765,17 @@ inline } #endif // DEBUG - varOffset = varDsc->GetStackOffset(); +#ifdef TARGET_ARM64 + if (lvaIsUnknownSizeLocal(varNum) && !varDsc->lvIsStructField) + { + assert(!FPbased); + varOffset = unkSizeFrame.GetOffset(varDsc->GetStackOffset(), varDsc->TypeIs(TYP_MASK)); + } + else +#endif + { + varOffset = varDsc->GetStackOffset(); + } } else // Its a spill-temp { diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 2dd60855390f0f..acc5ed21606ba9 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -8119,6 +8119,17 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va isSimple = false; scale = 0; + if (varx >= 0 && m_compiler->lvaIsUnknownSizeLocal(varx)) + { + assert(offs == 0); + assert(!FPbased); + // We shouldn't be materializing the address of a mask. 
+ assert(m_compiler->lvaGetActualType(varx) != TYP_MASK); + // addvl reg1, x19, #imm + emitIns_R_R_I(INS_sve_addvl, EA_8BYTE, reg1, REG_UNKBASE, imm); + return; + } + if (disp >= 0) { ins = INS_add; @@ -8153,44 +8164,35 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va case INS_sve_ldr: { + assert(isPredicateRegister(reg1) || isVectorRegister(reg1)); + isSimple = false; size = EA_SCALABLE; attr = size; - if (isPredicateRegister(reg1)) - { - assert(offs == 0); - // For predicate, generate based off rsGetRsvdReg() - regNumber rsvdReg = codeGen->rsGetRsvdReg(); + fmt = isPredicateRegister(reg1) ? IF_SVE_ID_2A : IF_SVE_IE_2A; - // add rsvd, fp, #imm - emitIns_R_R_Imm(INS_add, EA_8BYTE, rsvdReg, encodingZRtoSP(reg2), imm); - // str p0, [rsvd, #0, mul vl] - emitIns_R_R_I(ins, attr, reg1, rsvdReg, 0); - - return; - } - - assert(isVectorRegister(reg1)); - fmt = IF_SVE_IE_2A; - - // TODO-SVE: Don't assume 128bit vectors - // Predicate size is vector length / 8 - scale = NaturalScale_helper(isVectorRegister(reg1) ? EA_16BYTE : EA_2BYTE); - ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate - - if (((imm & mask) == 0) && (isValidSimm<9>(imm >> scale))) - { - imm >>= scale; // The immediate is scaled by the size of the ld/st - } - else + if (FPbased) { + // This is loading a field of a struct on the stack. The immediate will be an absolute + // offset to the field, not scaled by VL. + reg2 = REG_FP; + useRegForImm = true; regNumber rsvdReg = codeGen->rsGetRsvdReg(); - // For larger imm values (> 9 bits), calculate base + imm in a reserved register first. codeGen->instGen_Set_Reg_To_Base_Plus_Imm(EA_PTRSIZE, rsvdReg, reg2, imm); + reg2 = rsvdReg; imm = 0; } + else + { + // SVE locals are TYP_SIMD or TYP_MASK, both should be placed on the UnknownSizeFrame. + // The base address of these locals should be REG_UNKBASE (x19). 
+ assert(offs == 0); + // TODO-SVE: Handle generation of base address for large immediate scaled by VL/PL. + assert(isValidSimm<9>(imm)); + reg2 = REG_UNKBASE; + } } break; @@ -8424,46 +8426,34 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va case INS_sve_str: { + assert(isVectorRegister(reg1) || isPredicateRegister(reg1)); isSimple = false; size = EA_SCALABLE; attr = size; + fmt = isPredicateRegister(reg1) ? IF_SVE_JG_2A : IF_SVE_JH_2A; - if (isPredicateRegister(reg1)) + if (FPbased) { - assert(offs == 0); - - // For predicate, generate based off rsGetRsvdReg() - regNumber rsvdReg = codeGen->rsGetRsvdReg(); - - // add rsvd, fp, #imm - emitIns_R_R_Imm(INS_add, EA_8BYTE, rsvdReg, encodingZRtoSP(reg2), imm); - // str p0, [rsvd, #0, mul vl] - emitIns_R_R_I(ins, attr, reg1, rsvdReg, 0); - - return; - } - - assert(isVectorRegister(reg1)); - fmt = IF_SVE_JH_2A; + // This is storing to a field of a struct on the stack. The immediate will be an absolute + // offset to the field, not scaled by VL. + reg2 = REG_FP; - // TODO-SVE: Don't assume 128bit vectors - // Predicate size is vector length / 8 - scale = NaturalScale_helper(isVectorRegister(reg1) ? EA_16BYTE : EA_2BYTE); - ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate - - if (((imm & mask) == 0) && (isValidSimm<9>(imm >> scale))) - { - imm >>= scale; // The immediate is scaled by the size of the ld/st - } - else - { useRegForImm = true; regNumber rsvdReg = codeGen->rsGetRsvdReg(); - // For larger imm values (> 9 bits), calculate base + imm in a reserved register first. codeGen->instGen_Set_Reg_To_Base_Plus_Imm(EA_PTRSIZE, rsvdReg, reg2, imm); + reg2 = rsvdReg; imm = 0; } + else + { + // SVE locals are TYP_SIMD or TYP_MASK, both should be placed on the UnknownSizeFrame. + // The base address of these locals should be REG_UNKBASE (x19). 
+ assert(offs == 0); + // TODO-SVE: Handle generation of base address for large immediate scaled by VL/PL. + assert(isValidSimm<9>(imm)); + reg2 = REG_UNKBASE; + } } break; diff --git a/src/coreclr/jit/jit.h b/src/coreclr/jit/jit.h index 8b6aaa227ef84a..59a719af106b03 100644 --- a/src/coreclr/jit/jit.h +++ b/src/coreclr/jit/jit.h @@ -707,12 +707,10 @@ inline size_t roundUp(size_t size, size_t mult = sizeof(size_t)) return (size + (mult - 1)) & ~(mult - 1); } -#ifdef HOST_64BIT inline unsigned int roundUp(unsigned size, unsigned mult) { return (unsigned int)roundUp((size_t)size, (size_t)mult); } -#endif // HOST_64BIT inline unsigned int unsigned_abs(int x) { diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index acd4be1edf083e..b8f93def3bf5f1 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4231,6 +4231,15 @@ void Compiler::lvaAssignFrameOffsets(FrameLayoutState curState) assert(lvaOutgoingArgSpaceVar != BAD_VAR_NUM); #endif // FEATURE_FIXED_OUT_ARGS + /*------------------------------------------------------------------------- + * + * Initialize tracking information for locals with unknown size. + * + *------------------------------------------------------------------------- + */ + + lvaInitUnknownSizeFrame(); + /*------------------------------------------------------------------------- * * First process the arguments. 
@@ -4277,6 +4286,13 @@ void Compiler::lvaAssignFrameOffsets(FrameLayoutState curState) { codeGen->resetFramePointerUsedWritePhase(); } +#if defined(FEATURE_SIMD) && defined(TARGET_ARM64) + else + { + assert(curState == FINAL_FRAME_LAYOUT); + unkSizeFrame.Finalize(); + } +#endif } /***************************************************************************** @@ -4404,6 +4420,11 @@ void Compiler::lvaFixVirtualFrameOffsets() // Can't be relative to EBP unless we have an EBP noway_assert(!varDsc->lvFramePointerBased || codeGen->doubleAlignOrFramePointerUsed()); + if (lvaIsUnknownSizeLocal(lclNum)) + { + continue; + } + // Is this a non-param promoted struct field? // if so then set doAssignStkOffs to false. // @@ -4599,6 +4620,8 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs() int startOffset; if (lvaGetRelativeOffsetToCallerAllocatedSpaceForParameter(lclNum, &startOffset)) { + assert(!lvaIsUnknownSizeLocal(lclNum)); + dsc->SetStackOffset(startOffset + relativeZero); JITDUMP("Set V%02u to offset %d\n", lclNum, startOffset); @@ -5209,6 +5232,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() continue; } + else if (lvaIsUnknownSizeLocal(lclNum)) + { + // Reserve dynamic stack space for this variable. 
+ lvaAllocUnknownSizeLocal(lclNum); + continue; + } // These need to be located as the very first variables (highest memory address) // and so they have already been assigned an offset @@ -5517,6 +5546,58 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() #endif // TARGET_ARM64 } +void Compiler::lvaInitUnknownSizeFrame() +{ +#if defined(FEATURE_SIMD) && defined(TARGET_ARM64) + compUsesUnknownSizeFrame = false; +#ifdef DEBUG + unkSizeFrame.isFinalized = false; +#endif + unkSizeFrame.nMask = 0; + unkSizeFrame.nVector = 0; +#endif +} + +//------------------------------------------------------------------------------- +// lvaAllocUnknownSizeLocal: Allocate stack space for a local with unknown size +// +// A local with unknown size has a size that is not precisely known at compile time, +// but may be derived dynamically through code. These locals are allocated into +// their own stack space categorized by JIT type. +// +// Ideally, locals are primitive types that can fit into a homogeneous space containing +// objects with the same unknown size. In this case, we can identify them by a simple +// index into the space. +void Compiler::lvaAllocUnknownSizeLocal(unsigned varNum) +{ + LclVarDsc* const varDsc = lvaGetDesc(varNum); + assert(varTypeHasUnknownSize(varDsc)); + +#if defined(FEATURE_SIMD) && defined(TARGET_ARM64) + if (varDsc->TypeIs(TYP_SIMD)) + { + varDsc->SetStackOffset((int)unkSizeFrame.AllocVector()); + } + else if (varDsc->TypeIs(TYP_MASK)) + { + varDsc->SetStackOffset((int)unkSizeFrame.AllocMask()); + } + else +#endif + { + // The only types with unknown size should be SIMD at the moment. + unreached(); + } + + compUsesUnknownSizeFrame = true; + + // Technically we're not using localalloc, but the space these locals use + // will be at the beginning of the alloca space on the stack frame. So we + // should set this and inherit all of its behaviour, e.g. guarantee we get + // a frame pointer. 
+ compLocallocUsed = true; +} + //------------------------------------------------------------------------ // lvaParamHasLocalStackSpace: Check if a local that represents a parameter has // space allocated for it in the local stack frame. diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 734e0e67335d85..ad0fc7e4f0f064 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -2585,6 +2585,16 @@ void LinearScan::setFrameType() } #endif // TARGET_ARM +#if defined(TARGET_ARM64) + if (m_compiler->compUsesUnknownSizeFrame) + { + // We reserve x19 for addressing vector and mask locals on the UnknownSizeFrame. + m_compiler->codeGen->regSet.rsMaskResvd |= RBM_UNKBASE; + JITDUMP(" Reserved REG_UNKBASE (%s) due to presence of UnknownSizeFrame\n", getRegName(REG_UNKBASE)); + removeMask |= RBM_UNKBASE.GetIntRegSet(); + } +#endif + if ((removeMask != RBM_NONE) && ((availableIntRegs & removeMask) != 0)) { // We know that we're already in "read mode" for availableIntRegs. However, diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 79ddcc5b1c732e..30493867fbb133 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -2466,6 +2466,14 @@ void LinearScan::buildIntervals() currentLoc += 2; } +#ifdef TARGET_ARM64 + if (m_compiler->compUsesUnknownSizeFrame) + { + addKillForRegs(RBM_UNKBASE, currentLoc + 1); + currentLoc += 2; + } +#endif + // For frame poisoning we generate code into scratch BB right after prolog since // otherwise the prolog might become too large. In this case we will put the poison immediate // into the scratch register, so it will be killed here. 
diff --git a/src/coreclr/jit/regset.cpp b/src/coreclr/jit/regset.cpp index 61933141334969..58434e6a7912f5 100644 --- a/src/coreclr/jit/regset.cpp +++ b/src/coreclr/jit/regset.cpp @@ -626,7 +626,7 @@ TempDsc* RegSet::tmpGetTemp(var_types type) unsigned size = genTypeSize(type); // If TYP_STRUCT ever gets in here we do bad things (tmpSlot returns -1) - noway_assert(size >= sizeof(int)); + noway_assert(size >= sizeof(int) && size != SIZE_UNKNOWN); /* Find the slot to search for a free temp of the right size */ @@ -688,7 +688,7 @@ void RegSet::tmpPreAllocateTemps(var_types type, unsigned count) unsigned size = genTypeSize(type); // If TYP_STRUCT ever gets in here we do bad things (tmpSlot returns -1) - noway_assert(size >= sizeof(int)); + noway_assert(size >= sizeof(int) && size != SIZE_UNKNOWN); // Find the slot to search for a free temp of the right size. // Note that slots are shared by types of the identical size (e.g., TYP_REF and TYP_LONG on AMD64), diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h index 3f26dfab09ee93..aaa9cf4a485279 100644 --- a/src/coreclr/jit/targetarm64.h +++ b/src/coreclr/jit/targetarm64.h @@ -389,4 +389,6 @@ #define REG_SWIFT_INTRET_ORDER REG_R0,REG_R1,REG_R2,REG_R3 #define REG_SWIFT_FLOATRET_ORDER REG_V0,REG_V1,REG_V2,REG_V3 +#define REG_UNKBASE REG_R19 +#define RBM_UNKBASE RBM_R19 // clang-format on From 38e056e2dcbd98b97b778a0c51ba7c1cf8d5466a Mon Sep 17 00:00:00 2001 From: Sebastian Nickolls Date: Thu, 12 Mar 2026 15:25:36 +0000 Subject: [PATCH 2/2] Reinstate HOST_64BIT ifdef --- src/coreclr/jit/jit.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/coreclr/jit/jit.h b/src/coreclr/jit/jit.h index 59a719af106b03..8b6aaa227ef84a 100644 --- a/src/coreclr/jit/jit.h +++ b/src/coreclr/jit/jit.h @@ -707,10 +707,12 @@ inline size_t roundUp(size_t size, size_t mult = sizeof(size_t)) return (size + (mult - 1)) & ~(mult - 1); } +#ifdef HOST_64BIT inline unsigned int roundUp(unsigned size, unsigned mult) { return 
(unsigned int)roundUp((size_t)size, (size_t)mult); } +#endif // HOST_64BIT inline unsigned int unsigned_abs(int x) {