From dd00b22b75cf5781e648219f46adee0ddf554b23 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Mon, 30 Mar 2026 23:17:46 -0700 Subject: [PATCH 1/4] improve codegen for vector byte multiply --- src/coreclr/jit/gentree.cpp | 183 ++++++++++-------------------------- 1 file changed, 48 insertions(+), 135 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 9b166fd05aac3d..f5acab55401560 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21606,181 +21606,94 @@ GenTree* Compiler::gtNewSimdBinOpNode( #if defined(TARGET_XARCH) if (varTypeIsByte(simdBaseType)) { - var_types widenedSimdBaseType; - NamedIntrinsic widenIntrinsic; - NamedIntrinsic narrowIntrinsic; - var_types widenedType; - unsigned widenedSimdSize; - - if (simdSize == 32 && compOpportunisticallyDependsOn(InstructionSet_AVX512)) - { - // Input is SIMD32 [U]Byte and AVX512 is supported: - // - Widen inputs as SIMD64 [U]Short - // - Multiply widened inputs (SIMD64 [U]Short) as widened product (SIMD64 [U]Short) - // - Narrow widened product (SIMD64 [U]Short) as SIMD32 [U]Byte - if (simdBaseType == TYP_BYTE) - { - widenedSimdBaseType = TYP_SHORT; - widenIntrinsic = NI_AVX512_ConvertToVector512Int16; - narrowIntrinsic = NI_AVX512_ConvertToVector256SByte; - } - else - { - widenedSimdBaseType = TYP_USHORT; - widenIntrinsic = NI_AVX512_ConvertToVector512UInt16; - narrowIntrinsic = NI_AVX512_ConvertToVector256Byte; - } + // If we can widen to the next vector size up, we can get by + // with a single multiply, then narrow back down. - widenedType = TYP_SIMD64; - widenedSimdSize = 64; + if ((simdSize == 16 && compOpportunisticallyDependsOn(InstructionSet_AVX2)) || + (simdSize == 32 && compOpportunisticallyDependsOn(InstructionSet_AVX512))) + { + unsigned widenedSimdSize = simdSize * 2; + var_types widenedType = getSIMDTypeForSize(widenedSimdSize); + NamedIntrinsic widenIntrinsic = + (simdSize == 16) ? NI_AVX2_ConvertToVector256Int16 : NI_AVX512_ConvertToVector512Int16; - // Vector512 widenedOp1 = Avx512BW.ConvertToVector512UInt16(op1) + // Vector256 widenedOp1 = Avx2.ConvertToVector256Int16(op1) GenTree* widenedOp1 = gtNewSimdHWIntrinsicNode(widenedType, op1, widenIntrinsic, simdBaseType, widenedSimdSize); - // Vector512 widenedOp2 = Avx512BW.ConvertToVector512UInt16(op2) + // Vector256 widenedOp2 = Avx2.ConvertToVector256Int16(op2) GenTree* widenedOp2 = gtNewSimdHWIntrinsicNode(widenedType, op2, widenIntrinsic, simdBaseType, widenedSimdSize); - // Vector512 widenedProduct = widenedOp1 * widenedOp2; - GenTree* widenedProduct = gtNewSimdBinOpNode(GT_MUL, widenedType, widenedOp1, widenedOp2, - widenedSimdBaseType, widenedSimdSize); + // Vector256 widenedProduct = widenedOp1 * widenedOp2; + GenTree* widenedProduct = + gtNewSimdBinOpNode(GT_MUL, widenedType, widenedOp1, widenedOp2, TYP_SHORT, widenedSimdSize); - // Vector256 product = Avx512BW.ConvertToVector256Byte(widenedProduct) - return gtNewSimdHWIntrinsicNode(type, widenedProduct, narrowIntrinsic, widenedSimdBaseType, - widenedSimdSize); - } - else if (simdSize == 16 && compOpportunisticallyDependsOn(InstructionSet_AVX2)) - { if (compOpportunisticallyDependsOn(InstructionSet_AVX512)) { - // Input is SIMD16 [U]Byte and AVX512 is supported: - // - Widen inputs as SIMD32 [U]Short - // - Multiply widened inputs (SIMD32 [U]Short) as widened product (SIMD32 [U]Short) - // - Narrow widened product (SIMD32 [U]Short) as SIMD16 [U]Byte - widenIntrinsic = NI_AVX2_ConvertToVector256Int16; - - if (simdBaseType == TYP_BYTE) - { - widenedSimdBaseType = TYP_SHORT; - narrowIntrinsic = NI_AVX512_ConvertToVector128SByte; - } - else - { - widenedSimdBaseType = TYP_USHORT; - narrowIntrinsic = NI_AVX512_ConvertToVector128Byte; - } - - widenedType = TYP_SIMD32; - widenedSimdSize = 32; - - // Vector256 widenedOp1 = Avx2.ConvertToVector256Int16(op1).AsUInt16() - GenTree* widenedOp1 = - gtNewSimdHWIntrinsicNode(widenedType, op1, widenIntrinsic, simdBaseType, widenedSimdSize); - - // Vector256 widenedOp2 = Avx2.ConvertToVector256Int16(op2).AsUInt16() - GenTree* widenedOp2 = - gtNewSimdHWIntrinsicNode(widenedType, op2, widenIntrinsic, simdBaseType, widenedSimdSize); + NamedIntrinsic narrowIntrinsic = + (simdSize == 16) ? NI_AVX512_ConvertToVector128Byte : NI_AVX512_ConvertToVector256Byte; - // Vector256 widenedProduct = widenedOp1 * widenedOp2 - GenTree* widenedProduct = gtNewSimdBinOpNode(GT_MUL, widenedType, widenedOp1, widenedOp2, - widenedSimdBaseType, widenedSimdSize); - - // Vector128 product = Avx512BW.VL.ConvertToVector128Byte(widenedProduct) - return gtNewSimdHWIntrinsicNode(type, widenedProduct, narrowIntrinsic, widenedSimdBaseType, + // return Avx512BW.ConvertToVector128Byte(widenedProduct) + return gtNewSimdHWIntrinsicNode(type, widenedProduct, narrowIntrinsic, TYP_USHORT, widenedSimdSize); } else { - // Input is SIMD16 [U]Byte and AVX512 is NOT supported (only AVX2 will be used): - // - Widen inputs as SIMD32 [U]Short - // - Multiply widened inputs (SIMD32 [U]Short) as widened product (SIMD32 [U]Short) - // - Mask widened product (SIMD32 [U]Short) to select relevant bits - // - Pack masked product so that relevant bits are packed together in upper and lower halves - // - Shuffle packed product so that relevant bits are placed together in the lower half - // - Select lower (SIMD16 [U]Byte) from shuffled product (SIMD32 [U]Short) - widenedSimdBaseType = simdBaseType == TYP_BYTE ? TYP_SHORT : TYP_USHORT; - widenIntrinsic = NI_AVX2_ConvertToVector256Int16; - widenedType = TYP_SIMD32; - widenedSimdSize = 32; - - // Vector256 widenedOp1 = Avx2.ConvertToVector256Int16(op1).AsUInt16() - GenTree* widenedOp1 = - gtNewSimdHWIntrinsicNode(widenedType, op1, widenIntrinsic, simdBaseType, simdSize); - - // Vector256 widenedOp2 = Avx2.ConvertToVector256Int16(op2).AsUInt16() - GenTree* widenedOp2 = - gtNewSimdHWIntrinsicNode(widenedType, op2, widenIntrinsic, simdBaseType, simdSize); - - // Vector256 widenedProduct = widenedOp1 * widenedOp2 - GenTree* widenedProduct = gtNewSimdBinOpNode(GT_MUL, widenedType, widenedOp1, widenedOp2, - widenedSimdBaseType, widenedSimdSize); - - // Vector256 vecCon1 = Vector256.Create(0x00FF00FF00FF00FF).AsUInt16() - GenTreeVecCon* vecCon1 = gtNewVconNode(widenedType); - - for (unsigned i = 0; i < (widenedSimdSize / 8); i++) - { - vecCon1->gtSimdVal.u64[i] = 0x00FF00FF00FF00FF; - } + // Vector256 loByteMask = Vector256.Create(byte.MaxValue) + GenTreeVecCon* loByteMask = gtNewVconNode(widenedType); + loByteMask->EvaluateBroadcastInPlace(UINT8_MAX); - // Vector256 maskedProduct = Avx2.And(widenedProduct, vecCon1).AsInt16() - GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, vecCon1, - widenedSimdBaseType, widenedSimdSize); + // Vector256 maskedProduct = Avx2.And(widenedProduct, vecCon1) + GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, loByteMask, + TYP_SHORT, widenedSimdSize); GenTree* maskedProductDup = fgMakeMultiUse(&maskedProduct); - // Vector256 packedProduct = Avx2.PackUnsignedSaturate(maskedProduct, - // maskedProduct).AsUInt64() + // Vector256 packedProduct = Avx2.PackUnsignedSaturate(maskedProduct, maskedProduct) GenTree* packedProduct = gtNewSimdHWIntrinsicNode(widenedType, maskedProduct, maskedProductDup, NI_AVX2_PackUnsignedSaturate, TYP_UBYTE, widenedSimdSize); - var_types permuteBaseType = (simdBaseType == TYP_BYTE) ? TYP_LONG : TYP_ULONG; - - // Vector256 shuffledProduct = Avx2.Permute4x64(w1, 0xD8).AsByte() + // Vector256 shuffledProduct = Avx2.Permute4x64(packedProduct.AsUInt64(), 0b_11_01_10_00) GenTree* shuffledProduct = gtNewSimdHWIntrinsicNode(widenedType, packedProduct, gtNewIconNode(SHUFFLE_WYZX), - NI_AVX2_Permute4x64, permuteBaseType, widenedSimdSize); + NI_AVX2_Permute4x64, TYP_ULONG, widenedSimdSize); - // Vector128 product = shuffledProduct.getLower() + // return shuffledProduct.getLower().AsByte() return gtNewSimdGetLowerNode(type, shuffledProduct, simdBaseType, widenedSimdSize); } } - // No special handling could be performed, apply fallback logic: - // - Widen both inputs lower and upper halves as [U]Short (using helper method) - // - Multiply corrsponding widened input halves together as widened product halves - // - Narrow widened product halves as [U]Byte (using helper method) - widenedSimdBaseType = simdBaseType == TYP_BYTE ? TYP_SHORT : TYP_USHORT; + // Otherwise, we multiply twice and blend the values back together. + // This logic depends on the following facts: + // 1) the low byte of the pmullw(x, y) result is the same regardless of the sources' high bytes. + // 2) pmullw(x, y << 8) shifts the low byte of the result to the high byte. - // op1Dup = op1 GenTree* op1Dup = fgMakeMultiUse(&op1); - - // op2Dup = op2 GenTree* op2Dup = fgMakeMultiUse(&op2); - // Vector256 lowerOp1 = Avx2.ConvertToVector256Int16(op1.GetLower()).AsUInt16() - GenTree* lowerOp1 = gtNewSimdWidenLowerNode(type, op1, simdBaseType, simdSize); + // Vector128 hiByteMask = Vector128.Create(unchecked((short)0xFF00)) + GenTreeVecCon* hiByteMask = gtNewVconNode(type); + hiByteMask->EvaluateBroadcastInPlace(static_cast(0xFF00)); - // Vector256 lowerOp2 = Avx2.ConvertToVector256Int16(op2.GetLower()).AsUInt16() - GenTree* lowerOp2 = gtNewSimdWidenLowerNode(type, op2, simdBaseType, simdSize); + // Vector128 evenProduct = op1.AsInt16() * op1.AsInt16() + GenTree* evenProduct = gtNewSimdBinOpNode(GT_MUL, type, op1, op2, TYP_SHORT, simdSize); - // Vector256 lowerProduct = lowerOp1 * lowerOp2 - GenTree* lowerProduct = - gtNewSimdBinOpNode(GT_MUL, type, lowerOp1, lowerOp2, widenedSimdBaseType, simdSize); + // Vector128 oddOp1 = op1.AsInt16() >>> 8 + GenTree* oddOp1 = gtNewSimdBinOpNode(GT_RSZ, type, op1Dup, gtNewIconNode(8), TYP_SHORT, simdSize); - // Vector256 upperOp1 = Avx2.ConvertToVector256Int16(op1.GetUpper()).AsUInt16() - GenTree* upperOp1 = gtNewSimdWidenUpperNode(type, op1Dup, simdBaseType, simdSize); + // Vector128 oddOp2 = op2.AsInt16() & hiByteMask + GenTree* oddOp2 = gtNewSimdBinOpNode(GT_AND, type, op2Dup, hiByteMask, TYP_SHORT, simdSize); - // Vector256 upperOp2 = Avx2.ConvertToVector256Int16(op2.GetUpper()).AsUInt16() - GenTree* upperOp2 = gtNewSimdWidenUpperNode(type, op2Dup, simdBaseType, simdSize); + // Vector128 oddProduct = oddOp1 * oddOp2 + GenTree* oddProduct = gtNewSimdBinOpNode(GT_MUL, type, oddOp1, oddOp2, TYP_SHORT, simdSize); - // Vector256 upperProduct = upperOp1 * upperOp2 - GenTree* upperProduct = - gtNewSimdBinOpNode(GT_MUL, type, upperOp1, upperOp2, widenedSimdBaseType, simdSize); + // Vector128 evenMasked = evenProduct & ~hiByteMask + GenTree* evenMasked = + gtNewSimdBinOpNode(GT_AND_NOT, type, evenProduct, gtCloneExpr(hiByteMask), TYP_SHORT, simdSize); - // Narrow and merge halves using helper method - return gtNewSimdNarrowNode(type, lowerProduct, upperProduct, simdBaseType, simdSize); + // return (oddProduct | evenMasked).AsByte() + return gtNewSimdBinOpNode(GT_OR, type, oddProduct, evenMasked, simdBaseType, simdSize); } else if (varTypeIsLong(simdBaseType)) { From 67402f839513c40cbe8a8167f53fe6b8c879f37d Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Mon, 30 Mar 2026 23:42:34 -0700 Subject: [PATCH 2/4] fix comment typos --- src/coreclr/jit/gentree.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index f5acab55401560..1842b118d1200e 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21644,7 +21644,7 @@ GenTree* Compiler::gtNewSimdBinOpNode( GenTreeVecCon* loByteMask = gtNewVconNode(widenedType); loByteMask->EvaluateBroadcastInPlace(UINT8_MAX); - // Vector256 maskedProduct = Avx2.And(widenedProduct, vecCon1) + // Vector256 maskedProduct = Avx2.And(widenedProduct, loByteMask) GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, loByteMask, TYP_SHORT, widenedSimdSize); GenTree* maskedProductDup = fgMakeMultiUse(&maskedProduct); @@ -21676,7 +21676,7 @@ GenTree* Compiler::gtNewSimdBinOpNode( GenTreeVecCon* hiByteMask = gtNewVconNode(type); hiByteMask->EvaluateBroadcastInPlace(static_cast(0xFF00)); - // Vector128 evenProduct = op1.AsInt16() * op1.AsInt16() + // Vector128 evenProduct = op1.AsInt16() * op2.AsInt16() GenTree* evenProduct = gtNewSimdBinOpNode(GT_MUL, type, op1, op2, TYP_SHORT, simdSize); // Vector128 oddOp1 = op1.AsInt16() >>> 8 From 380e63acc59b0faed7a19c7f9775b5e5f135442d Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Tue, 31 Mar 2026 01:38:44 -0700 Subject: [PATCH 3/4] fix eval order --- src/coreclr/jit/gentree.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 1842b118d1200e..8f30dcb224a605 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21692,8 +21692,8 @@ GenTree* Compiler::gtNewSimdBinOpNode( GenTree* evenMasked = gtNewSimdBinOpNode(GT_AND_NOT, type, evenProduct, gtCloneExpr(hiByteMask), TYP_SHORT, simdSize); - // return (oddProduct | evenMasked).AsByte() - return gtNewSimdBinOpNode(GT_OR, type, oddProduct, evenMasked, simdBaseType, simdSize); + // return (evenMasked | oddProduct).AsByte() + return gtNewSimdBinOpNode(GT_OR, type, evenMasked, oddProduct, simdBaseType, simdSize); } else if (varTypeIsLong(simdBaseType)) { From 76dd060974856c636512a33c56341f66cd9e8fc6 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Mon, 6 Apr 2026 12:13:44 -0700 Subject: [PATCH 4/4] feedback --- src/coreclr/jit/gentree.cpp | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 8f30dcb224a605..20cb75b238d708 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21609,19 +21609,19 @@ GenTree* Compiler::gtNewSimdBinOpNode( // If we can widen to the next vector size up, we can get by // with a single multiply, then narrow back down. - if ((simdSize == 16 && compOpportunisticallyDependsOn(InstructionSet_AVX2)) || - (simdSize == 32 && compOpportunisticallyDependsOn(InstructionSet_AVX512))) + if (((simdSize == 16) && compOpportunisticallyDependsOn(InstructionSet_AVX2)) || + ((simdSize == 32) && compOpportunisticallyDependsOn(InstructionSet_AVX512))) { unsigned widenedSimdSize = simdSize * 2; var_types widenedType = getSIMDTypeForSize(widenedSimdSize); NamedIntrinsic widenIntrinsic = (simdSize == 16) ? NI_AVX2_ConvertToVector256Int16 : NI_AVX512_ConvertToVector512Int16; - // Vector256 widenedOp1 = Avx2.ConvertToVector256Int16(op1) + // Vector256 widenedOp1 = Avx2.ConvertToVector256Int16(op1); GenTree* widenedOp1 = gtNewSimdHWIntrinsicNode(widenedType, op1, widenIntrinsic, simdBaseType, widenedSimdSize); - // Vector256 widenedOp2 = Avx2.ConvertToVector256Int16(op2) + // Vector256 widenedOp2 = Avx2.ConvertToVector256Int16(op2); GenTree* widenedOp2 = gtNewSimdHWIntrinsicNode(widenedType, op2, widenIntrinsic, simdBaseType, widenedSimdSize); @@ -21634,32 +21634,32 @@ GenTree* Compiler::gtNewSimdBinOpNode( NamedIntrinsic narrowIntrinsic = (simdSize == 16) ? NI_AVX512_ConvertToVector128Byte : NI_AVX512_ConvertToVector256Byte; - // return Avx512BW.ConvertToVector128Byte(widenedProduct) + // return Avx512BW.ConvertToVector128Byte(widenedProduct); return gtNewSimdHWIntrinsicNode(type, widenedProduct, narrowIntrinsic, TYP_USHORT, widenedSimdSize); } else { - // Vector256 loByteMask = Vector256.Create(byte.MaxValue) + // Vector256 loByteMask = Vector256.Create(byte.MaxValue); GenTreeVecCon* loByteMask = gtNewVconNode(widenedType); loByteMask->EvaluateBroadcastInPlace(UINT8_MAX); - // Vector256 maskedProduct = Avx2.And(widenedProduct, loByteMask) + // Vector256 maskedProduct = widenedProduct & loByteMask; GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, loByteMask, TYP_SHORT, widenedSimdSize); GenTree* maskedProductDup = fgMakeMultiUse(&maskedProduct); - // Vector256 packedProduct = Avx2.PackUnsignedSaturate(maskedProduct, maskedProduct) + // Vector256 packedProduct = Avx2.PackUnsignedSaturate(maskedProduct, maskedProduct); GenTree* packedProduct = gtNewSimdHWIntrinsicNode(widenedType, maskedProduct, maskedProductDup, NI_AVX2_PackUnsignedSaturate, TYP_UBYTE, widenedSimdSize); - // Vector256 shuffledProduct = Avx2.Permute4x64(packedProduct.AsUInt64(), 0b_11_01_10_00) + // Vector256 shuffledProduct = Avx2.Permute4x64(packedProduct.AsUInt64(), 0b_11_01_10_00); GenTree* shuffledProduct = gtNewSimdHWIntrinsicNode(widenedType, packedProduct, gtNewIconNode(SHUFFLE_WYZX), - NI_AVX2_Permute4x64, TYP_ULONG, widenedSimdSize); + NI_AVX2_Permute4x64, TYP_LONG, widenedSimdSize); - // return shuffledProduct.getLower().AsByte() + // return shuffledProduct.GetLower().AsByte(); return gtNewSimdGetLowerNode(type, shuffledProduct, simdBaseType, widenedSimdSize); } } @@ -21668,31 +21668,33 @@ GenTree* Compiler::gtNewSimdBinOpNode( // This logic depends on the following facts: // 1) the low byte of the pmullw(x, y) result is the same regardless of the sources' high bytes. // 2) pmullw(x, y << 8) shifts the low byte of the result to the high byte. + // 3) pmullw(x >> 8, y & 0xFF00) is the equivalent of 2) for the odd input bytes, + // leaving the result in the high (odd) byte, with a zero low byte. GenTree* op1Dup = fgMakeMultiUse(&op1); GenTree* op2Dup = fgMakeMultiUse(&op2); - // Vector128 hiByteMask = Vector128.Create(unchecked((short)0xFF00)) + // Vector128 hiByteMask = Vector128.Create(unchecked((short)0xFF00)); GenTreeVecCon* hiByteMask = gtNewVconNode(type); hiByteMask->EvaluateBroadcastInPlace(static_cast(0xFF00)); // Vector128 evenProduct = op1.AsInt16() * op2.AsInt16() GenTree* evenProduct = gtNewSimdBinOpNode(GT_MUL, type, op1, op2, TYP_SHORT, simdSize); - // Vector128 oddOp1 = op1.AsInt16() >>> 8 + // Vector128 oddOp1 = op1.AsInt16() >>> 8; GenTree* oddOp1 = gtNewSimdBinOpNode(GT_RSZ, type, op1Dup, gtNewIconNode(8), TYP_SHORT, simdSize); - // Vector128 oddOp2 = op2.AsInt16() & hiByteMask + // Vector128 oddOp2 = op2.AsInt16() & hiByteMask; GenTree* oddOp2 = gtNewSimdBinOpNode(GT_AND, type, op2Dup, hiByteMask, TYP_SHORT, simdSize); // Vector128 oddProduct = oddOp1 * oddOp2 GenTree* oddProduct = gtNewSimdBinOpNode(GT_MUL, type, oddOp1, oddOp2, TYP_SHORT, simdSize); - // Vector128 evenMasked = evenProduct & ~hiByteMask + // Vector128 evenMasked = evenProduct & ~hiByteMask; GenTree* evenMasked = gtNewSimdBinOpNode(GT_AND_NOT, type, evenProduct, gtCloneExpr(hiByteMask), TYP_SHORT, simdSize); - // return (evenMasked | oddProduct).AsByte() + // return (evenMasked | oddProduct).AsByte(); return gtNewSimdBinOpNode(GT_OR, type, evenMasked, oddProduct, simdBaseType, simdSize); } else if (varTypeIsLong(simdBaseType))