Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 95 additions & 8 deletions src/coreclr/jit/assertionprop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1145,22 +1145,66 @@ AssertionIndex Compiler::optCreateAssertion(GenTree* op1, GenTree* op2, bool equ
#if defined(FEATURE_HW_INTRINSICS)
case GT_CNS_VEC:
{
// For now, only support SIMD constants up to 16 bytes (SIMD8/12/16).
if (!op1->TypeIs(TYP_SIMD8, TYP_SIMD12, TYP_SIMD16) || (op1->TypeGet() != op2->TypeGet()))
assert(varTypeIsSIMD(op1));
var_types simdType = op1->TypeGet();

if (op2->TypeGet() != simdType)
{
return NO_ASSERTION_INDEX;
}

ValueNum op1VN = optConservativeNormalVN(op1);
ValueNum op2VN = optConservativeNormalVN(op2);

if (!optLocalAssertionProp && (op1VN == ValueNumStore::NoVN || op2VN == ValueNumStore::NoVN))
{
// GlobalAP requires valid VNs.
return NO_ASSERTION_INDEX;
}

GenTreeVecCon* vecCon = op2->AsVecCon();

#if defined(TARGET_XARCH)
// TYP_SIMD32/64 constants are too large to track without a heap allocation.
//
// However, there are many common constants that are effectively broadcasting
// the lowest v128 across the entire vector. By checking for and allowing this
// case through, we can provide pay-for-play support for core scenarios
// without allocating.

if (simdType == TYP_SIMD64)
{
if (memcmp(&vecCon->gtSimdVal.v128[0], &vecCon->gtSimdVal.v128[1], sizeof(simd16_t)) != 0)
{
return NO_ASSERTION_INDEX;
}
else if (memcmp(&vecCon->gtSimdVal.v256[0], &vecCon->gtSimdVal.v256[1], sizeof(simd32_t)) != 0)
{
return NO_ASSERTION_INDEX;
}
simdType = TYP_SIMD16;
}
else if (simdType == TYP_SIMD32)
{
if (memcmp(&vecCon->gtSimdVal.v128[0], &vecCon->gtSimdVal.v128[1], sizeof(simd16_t)) != 0)
{
return NO_ASSERTION_INDEX;
}
simdType = TYP_SIMD16;
}
#elif defined(TARGET_ARM64)
if (simdType == TYP_SIMD)
{
// TODO-SVE: Handle SVE constants
return NO_ASSERTION_INDEX;
}
#endif

// Assert we've fixed up the value to fit one of the supported storage sizes
assert((simdType == TYP_SIMD8) || (simdType == TYP_SIMD12) || (simdType == TYP_SIMD16));

simd16_t simdVal = {};
memcpy(&simdVal, &op2->AsVecCon()->gtSimdVal, genTypeSize(op2->TypeGet()));
memcpy(&simdVal, &vecCon->gtSimdVal, genTypeSize(simdType));

AssertionDsc dsc =
AssertionDsc::CreateConstLclVarAssertion(this, lclNum, op1VN, simdVal, op2VN, equals);
Expand Down Expand Up @@ -1871,19 +1915,28 @@ AssertionInfo Compiler::optAssertionGenJtrue(GenTree* tree)
{
#if defined(TARGET_XARCH)
case NI_Vector128_op_Equality:
case NI_Vector256_op_Equality:
case NI_Vector512_op_Equality:
#elif defined(TARGET_ARM64)
case NI_Vector64_op_Equality:
case NI_Vector128_op_Equality:
#endif
{
break;
}

#if defined(TARGET_XARCH)
case NI_Vector128_op_Inequality:
case NI_Vector256_op_Inequality:
case NI_Vector512_op_Inequality:
#elif defined(TARGET_ARM64)
case NI_Vector64_op_Inequality:
case NI_Vector128_op_Inequality:
#endif
{
equals = !equals;
break;
}

default:
return NO_ASSERTION_INDEX;
Expand All @@ -1902,7 +1955,7 @@ AssertionInfo Compiler::optAssertionGenJtrue(GenTree* tree)
return NO_ASSERTION_INDEX;
}

assert(op1->TypeIs(TYP_SIMD8, TYP_SIMD12, TYP_SIMD16));
assert(varTypeIsSIMD(op1));
assert(op1->TypeIs(op2->TypeGet()));
}
else
Expand Down Expand Up @@ -3262,16 +3315,50 @@ GenTree* Compiler::optConstantAssertionProp(const AssertionDsc& curAssertion,
#if defined(FEATURE_HW_INTRINSICS)
case O2K_CONST_VEC:
{
assert(varTypeIsSIMD(tree));
var_types simdType = tree->TypeGet();

// The assertion was created from a LCL_VAR == CNS_VEC where types matched.
// For now, only support SIMD constants up to 16 bytes (SIMD8/12/16).
if (!tree->TypeIs(TYP_SIMD8, TYP_SIMD12, TYP_SIMD16) || !tree->TypeIs(lvaGetDesc(lclNum)->TypeGet()))
if (lvaGetDesc(lclNum)->TypeGet() != simdType)
{
return nullptr;
}

// We can't bash a LCL_VAR into a GenTreeVecCon (different node size), so allocate a fresh node.
GenTreeVecCon* vecCon = gtNewVconNode(tree->TypeGet());
memcpy(&vecCon->gtSimdVal, &curAssertion.GetOp2().GetSimdConstant(), genTypeSize(tree->TypeGet()));

GenTreeVecCon* vecCon = gtNewVconNode(simdType);
const simd16_t& simdVal = curAssertion.GetOp2().GetSimdConstant();

#if defined(TARGET_XARCH)
// TYP_SIMD32/64 constants are too large to track without a heap allocation.
//
// However, we support them anyways by only allowing through the cases which
// are effectively broadcasting the lowest v128 across the entire vector.

if (simdType == TYP_SIMD64)
{
memcpy(&vecCon->gtSimdVal.v128[1], &simdVal, sizeof(simd16_t));
memcpy(&vecCon->gtSimdVal.v128[2], &simdVal, sizeof(simd16_t));
memcpy(&vecCon->gtSimdVal.v128[3], &simdVal, sizeof(simd16_t));
simdType = TYP_SIMD16;
}
else if (simdType == TYP_SIMD32)
{
memcpy(&vecCon->gtSimdVal.v128[1], &simdVal, sizeof(simd16_t));
simdType = TYP_SIMD16;
}
#elif defined(TARGET_ARM64)
if (simdType == TYP_SIMD)
{
// TODO-SVE: Handle SVE constants
unreached();
}
#endif

// Assert we've fixed up the value to account for one of the supported storage sizes
assert((simdType == TYP_SIMD8) || (simdType == TYP_SIMD12) || (simdType == TYP_SIMD16));

memcpy(&vecCon->gtSimdVal, &simdVal, genTypeSize(simdType));
newTree = vecCon;
break;
}
Expand Down
10 changes: 8 additions & 2 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -8044,13 +8044,19 @@ class Compiler
unsigned m_lclNum;
double m_dconVal;
IntegralRange m_range;
simd16_t m_simdVal; // for O2K_CONST_VEC (TYP_SIMD8/12/16 only). TODO-CQ: support wider SIMD via heap
// allocation.
struct
{
ssize_t m_iconVal;
FieldSeq* m_fieldSeq;
} m_icon;

// O2K_CONST_VEC: This only allows storing TYP_SIMD8/12/16 but
// we still support common cases for TYP_SIMD32/64 by presuming
// that the value is a broadcast. We could fully support other
// sizes in the future by adding m_encodedVconFlags and tracking
// whether a different heap allocated value was used or other
// special cases like Indices or Sequences
simd16_t m_simdVal;
Copy link
Copy Markdown
Member

@EgorBo EgorBo Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think it's worth the complexity to detect SIMD32/SIMD64 that we can fit into simd16_t or we just add simd_t* to this union and allocate it on heap? Fewer changes + will support all kinds of constants

Copy link
Copy Markdown
Member Author

@tannergooding tannergooding Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would assume it is and may even be worthwhile to do for VEC_CNS as well. -- This is based on past throughput and allocation considerations we saw for VN and other phases.

Particularly on xarch, we are most typically hitting the TYP_SIMD32 path for hardware and so this would mean a lot more constants are heap allocating when they don't need to be.

We'd have to have a if (size > 16) { speciallyHandleTheHeapAlloc } anyways with such a scenario, so this approach shouldn't be adding extra overhead. Rather, it just optimizes and avoids the allocation for the most common cases. -- and I'd actually expect assertion prop doesn't care about most constants, particularly "arbitrary" ones, so it may be unimportant to ever add the heap allocation support.

We may even want to consider doing the same for GT_CNS_VEC so that we aren't forced to have a large node for all constants and to avoid needing to do repeat checks like "is this a broadcast".

Copy link
Copy Markdown
Member

@EgorBo EgorBo Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tannergooding I am not sure I follow. Your PR tries to fit CNS_VEC for SIMD32 and SIMD64 into simd16_t in AssertionDsc2 for things where 16 bytes components are the same. I just don't see why if we can support all kinds of constants naturally, just that SIMD32 and SIMD64 will require a small arena allocation because 99% of assertions are used for non-SIMD stuff and we don't want to increase the struct size for this niche case (pay to play). Basically, I suggest this change: https://github.com/EgorBo/runtime-1/pull/new/jit/wide-simd-assertions (just vibe-coded, probably can be simplified). I doubt we will see any noticeable regression anywhere (both TP and alloc size).

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Example, my commit can fold this:

Vector256<long> Test(ref Vector256<long> v)
{
    var a = v;
    if (a == Vector256.Create(1, 2, 3, 4))
        return a + a; // folded into [2,4,6,8]
    return default;
}

while yours cannot.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just don't see why if we can support all kinds of constants naturally, just that SIMD32 and SIMD64 will require a small arena allocation

That would work too. The downside is that it's forcing an allocation for the most common cases when we can trivially avoid it for the majority scenario (most SIMD constants are broadcastable or repeating the V128 value).

Based on what we had seen in VN, there were enough simd32/64 constants that this added up and pessimized throughput. We fixed that by splitting it out into separate maps. We had similarly seen even in GenTreeVecCon and other areas that the cost of the size check and doing different things for each vector size was significantly less than always touching a full cache line.

I would expect that the "optimization" this PR is doing is still beneficial even with your full fix and that we may even want to do something similar for GenTreeVecCon itself to reduce the amount of data that has to be checked for the most common patterns/optimizations.


I'm fine with waiting for the "full fix" to go in and then comparing SPMI metrics or factoring it into this PR so we can compare and choose. Thoughts?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assertions are created only for x VectorX_op_Equality cns today - do we really have so many of them to have an impact on tp?

It's small "now" but may matter more as this expands. It can be useful for optimizing out a lot of edge cases for floating-point, conversions, and other scenarios when we know a value is within a given constraint.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Up to you, https://github.com/EgorBo/runtime-1/pull/new/jit/wide-simd-assertions is less changes, generates -8,949 bytes on Windows-x64 locally

Do you have numbers on the throughput (I'm guessing near 0%) and allocation impact?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

feel free to submit it as a PR to grab real numbers, i'm 100% sure TP is 0, alloc rate might be visible (>0) but not big enough to care IMO

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fyi: this pr produced -3,260 diff on win-x64 and simd_t* is -8,949

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Extract it to #127401 to see the metrics and get full differences. We can take either or both, depending on what the numbers report.

};
public:

Expand Down
Loading