From 4e3f88211b35d9c640555b11793f1ee1816f36f0 Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Thu, 12 Mar 2026 21:22:59 -0400 Subject: [PATCH 1/7] Consult OS support for AVX/AVX-512 --- src/Target.cpp | 66 +++++++++++++++------ src/runtime/x86.ll | 8 +++ src/runtime/x86_cpu_features.cpp | 72 ++++++++++++++++------- test/generator/can_use_target_aottest.cpp | 24 +++++++- 4 files changed, 127 insertions(+), 43 deletions(-) diff --git a/src/Target.cpp b/src/Target.cpp index c9bcd1310917..892de618d4ca 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -71,6 +71,10 @@ void cpuid(int info[4], int infoType, int extra) { __cpuidex(info, infoType, extra); } +uint64_t xgetbv(uint32_t xcr) { + return _xgetbv(xcr); +} + #elif defined(__x86_64__) || defined(__i386__) // CPU feature detection code taken from ispc @@ -83,6 +87,12 @@ void cpuid(int info[4], int infoType, int extra) { : "0"(infoType), "2"(extra)); } +uint64_t xgetbv(uint32_t xcr) { + uint32_t lo, hi; + __asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(xcr)); + return ((uint64_t)hi << 32) | lo; +} + #endif #if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64) @@ -342,13 +352,25 @@ Target calculate_host_target() { unsigned family = 0, model = 0; detect_family_and_model(info[0], family, model); - bool have_sse41 = (info[2] & (1 << 19)) != 0; // ECX[19] - bool have_sse2 = (info[3] & (1 << 26)) != 0; // EDX[26] - bool have_sse3 = (info[2] & (1 << 0)) != 0; // ECX[0] - bool have_avx = (info[2] & (1 << 28)) != 0; // ECX[28] - bool have_f16c = (info[2] & (1 << 29)) != 0; // ECX[29] - bool have_rdrand = (info[2] & (1 << 30)) != 0; // ECX[30] - bool have_fma = (info[2] & (1 << 12)) != 0; // ECX[12] + // Check OS support for AVX/AVX-512 state saving via XSAVE. + // Even if the CPU supports these features, the OS must enable + // the corresponding state components in XCR0 or use will fault. + bool have_osxsave = (info[2] & (1 << 27)) != 0; // ECX[27] + bool os_avx = false; + bool os_avx512 = false; + if (have_osxsave) { + uint64_t xcr0 = xgetbv(0); + os_avx = (xcr0 & 0x6) == 0x6; // XMM (bit 1) + YMM (bit 2) + os_avx512 = os_avx && ((xcr0 & 0xE0) == 0xE0); // opmask (5) + ZMM_Hi256 (6) + Hi16_ZMM (7) + } + + bool have_sse41 = (info[2] & (1 << 19)) != 0; // ECX[19] + bool have_sse2 = (info[3] & (1 << 26)) != 0; // EDX[26] + bool have_sse3 = (info[2] & (1 << 0)) != 0; // ECX[0] + bool have_avx = (info[2] & (1 << 28)) != 0 && os_avx; // ECX[28], requires OS AVX support + bool have_f16c = (info[2] & (1 << 29)) != 0 && os_avx; // ECX[29], VEX-encoded + bool have_rdrand = (info[2] & (1 << 30)) != 0; // ECX[30] + bool have_fma = (info[2] & (1 << 12)) != 0 && os_avx; // ECX[12], VEX-encoded user_assert(have_sse2) << "The x86 backend assumes at least sse2 support. This machine does not appear to have sse2.\n" @@ -364,19 +386,27 @@ Target calculate_host_target() { if (processor == Target::Processor::ZnVer4) { Target t{os, arch, bits, processor, initial_features, vector_bits}; - t.set_features({Target::SSE41, Target::AVX, - Target::F16C, Target::FMA, - Target::AVX2, Target::AVX512, - Target::AVX512_Skylake, Target::AVX512_Cannonlake, - Target::AVX512_Zen4}); + t.set_feature(Target::SSE41); + if (os_avx) { + t.set_features({Target::AVX, Target::F16C, Target::FMA, Target::AVX2}); + } + if (os_avx512) { + t.set_features({Target::AVX512, Target::AVX512_Skylake, + Target::AVX512_Cannonlake, Target::AVX512_Zen4}); + } return t; } else if (processor == Target::Processor::ZnVer5) { Target t{os, arch, bits, processor, initial_features, vector_bits}; - t.set_features({Target::SSE41, Target::AVX, - Target::F16C, Target::FMA, - Target::AVX2, Target::AVXVNNI, Target::AVX512, - Target::AVX512_Skylake, Target::AVX512_Cannonlake, - Target::AVX512_Zen4, Target::AVX512_Zen5}); + t.set_feature(Target::SSE41); + if (os_avx) { + t.set_features({Target::AVX, Target::F16C, Target::FMA, + Target::AVX2, Target::AVXVNNI}); + } + if (os_avx512) { + t.set_features({Target::AVX512, Target::AVX512_Skylake, + Target::AVX512_Cannonlake, + Target::AVX512_Zen4, Target::AVX512_Zen5}); + } return t; } } @@ -421,7 +451,7 @@ Target calculate_host_target() { if ((info2[1] & avx2) == avx2) { initial_features.push_back(Target::AVX2); } - if ((info2[1] & avx512) == avx512) { + if (os_avx512 && (info2[1] & avx512) == avx512) { initial_features.push_back(Target::AVX512); // TODO: port to family/model -based detection. if ((info2[1] & avx512_knl) == avx512_knl) { diff --git a/src/runtime/x86.ll b/src/runtime/x86.ll index 78c96bf5fa5c..ad3df0f7cabe 100644 --- a/src/runtime/x86.ll +++ b/src/runtime/x86.ll @@ -146,3 +146,11 @@ define weak_odr void @x64_cpuid_halide(i32* %info) nounwind uwtable { call void asm sideeffect inteldialect "xchg rbx, rsi\0A\09mov eax, dword ptr $$0 $0\0A\09mov ecx, dword ptr $$4 $0\0A\09cpuid\0A\09mov dword ptr $$0 $0, eax\0A\09mov dword ptr $$4 $0, ebx\0A\09mov dword ptr $$8 $0, ecx\0A\09mov dword ptr $$12 $0, edx\0A\09xchg rbx, rsi", "=*m,~{eax},~{ebx},~{ecx},~{edx},~{esi},~{dirflag},~{fpsr},~{flags}"(i32* elementtype(i32) %info) ret void } + +; xgetbv: info[0] is ECX (input), output is info[0]=EAX, info[1]=EDX. +; Unlike cpuid, xgetbv does not clobber ebx/rbx, so one definition +; works for both 32-bit and 64-bit. +define weak_odr void @xgetbv_halide(i32* %info) nounwind uwtable { + call void asm sideeffect inteldialect "mov ecx, dword ptr $$0 $0\0A\09xgetbv\0A\09mov dword ptr $$0 $0, eax\0A\09mov dword ptr $$4 $0, edx", "=*m,~{eax},~{ecx},~{edx},~{dirflag},~{fpsr},~{flags}"(i32* elementtype(i32) %info) + ret void +} diff --git a/src/runtime/x86_cpu_features.cpp b/src/runtime/x86_cpu_features.cpp index 8e63c2495394..3a17d3fb971b 100644 --- a/src/runtime/x86_cpu_features.cpp +++ b/src/runtime/x86_cpu_features.cpp @@ -7,6 +7,7 @@ namespace Internal { extern "C" void x86_cpuid_halide(int32_t *); extern "C" void x64_cpuid_halide(int32_t *); +extern "C" void xgetbv_halide(int32_t *); namespace { @@ -22,6 +23,13 @@ ALWAYS_INLINE void cpuid(int32_t *info, int32_t fn_id, int32_t extra = 0) { } } +// Returns low 32 bits of XCR specified by xcr_id. +ALWAYS_INLINE uint32_t xgetbv(uint32_t xcr_id) { + int32_t xcr_info[2] = {(int32_t)xcr_id, 0}; + xgetbv_halide(xcr_info); + return (uint32_t)xcr_info[0]; +} + } // namespace extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { @@ -42,6 +50,18 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { int32_t info[4]; cpuid(info, 1); + // Check OS support for AVX/AVX-512 state saving via XSAVE. + // Even if the CPU supports these features, the OS must enable + // the corresponding state components in XCR0 or use will fault. + const bool have_osxsave = (info[2] & (1 << 27)) != 0; // ECX[27] + bool os_avx = false; + bool os_avx512 = false; + if (have_osxsave) { + uint32_t xcr0 = xgetbv(0); + os_avx = (xcr0 & 0x6) == 0x6; // XMM (bit 1) + YMM (bit 2) + os_avx512 = os_avx && ((xcr0 & 0xE0) == 0xE0); // opmask (5) + ZMM_Hi256 (6) + Hi16_ZMM (7) + } + uint32_t family = (info[0] >> 8) & 0xF; // Bits 8..11 uint32_t model = (info[0] >> 4) & 0xF; // Bits 4..7 if (family == 0x6 || family == 0xF) { @@ -58,28 +78,36 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { if (family == 0x19 && model == 0x61) { // Zen4 halide_set_available_cpu_feature(features, halide_target_feature_sse41); - halide_set_available_cpu_feature(features, halide_target_feature_avx); - halide_set_available_cpu_feature(features, halide_target_feature_f16c); - halide_set_available_cpu_feature(features, halide_target_feature_fma); - halide_set_available_cpu_feature(features, halide_target_feature_avx2); - halide_set_available_cpu_feature(features, halide_target_feature_avx512); - halide_set_available_cpu_feature(features, halide_target_feature_avx512_skylake); - halide_set_available_cpu_feature(features, halide_target_feature_avx512_cannonlake); - halide_set_available_cpu_feature(features, halide_target_feature_avx512_zen4); + if (os_avx) { + halide_set_available_cpu_feature(features, halide_target_feature_avx); + halide_set_available_cpu_feature(features, halide_target_feature_f16c); + halide_set_available_cpu_feature(features, halide_target_feature_fma); + halide_set_available_cpu_feature(features, halide_target_feature_avx2); + } + if (os_avx512) { + halide_set_available_cpu_feature(features, halide_target_feature_avx512); + halide_set_available_cpu_feature(features, halide_target_feature_avx512_skylake); + halide_set_available_cpu_feature(features, halide_target_feature_avx512_cannonlake); + halide_set_available_cpu_feature(features, halide_target_feature_avx512_zen4); + } return halide_error_code_success; } else if (family == 0x1a) { // Zen5 halide_set_available_cpu_feature(features, halide_target_feature_sse41); - halide_set_available_cpu_feature(features, halide_target_feature_avx); - halide_set_available_cpu_feature(features, halide_target_feature_f16c); - halide_set_available_cpu_feature(features, halide_target_feature_fma); - halide_set_available_cpu_feature(features, halide_target_feature_avx2); - halide_set_available_cpu_feature(features, halide_target_feature_avxvnni); - halide_set_available_cpu_feature(features, halide_target_feature_avx512); - halide_set_available_cpu_feature(features, halide_target_feature_avx512_skylake); - halide_set_available_cpu_feature(features, halide_target_feature_avx512_cannonlake); - halide_set_available_cpu_feature(features, halide_target_feature_avx512_zen4); - halide_set_available_cpu_feature(features, halide_target_feature_avx512_zen5); + if (os_avx) { + halide_set_available_cpu_feature(features, halide_target_feature_avx); + halide_set_available_cpu_feature(features, halide_target_feature_f16c); + halide_set_available_cpu_feature(features, halide_target_feature_fma); + halide_set_available_cpu_feature(features, halide_target_feature_avx2); + halide_set_available_cpu_feature(features, halide_target_feature_avxvnni); + } + if (os_avx512) { + halide_set_available_cpu_feature(features, halide_target_feature_avx512); + halide_set_available_cpu_feature(features, halide_target_feature_avx512_skylake); + halide_set_available_cpu_feature(features, halide_target_feature_avx512_cannonlake); + halide_set_available_cpu_feature(features, halide_target_feature_avx512_zen4); + halide_set_available_cpu_feature(features, halide_target_feature_avx512_zen5); + } return halide_error_code_success; } } @@ -89,10 +117,10 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { // complicated. const bool have_sse41 = (info[2] & (1 << 19)) != 0; - const bool have_avx = (info[2] & (1 << 28)) != 0; - const bool have_f16c = (info[2] & (1 << 29)) != 0; + const bool have_avx = (info[2] & (1 << 28)) != 0 && os_avx; + const bool have_f16c = (info[2] & (1 << 29)) != 0 && os_avx; const bool have_rdrand = (info[2] & (1 << 30)) != 0; - const bool have_fma = (info[2] & (1 << 12)) != 0; + const bool have_fma = (info[2] & (1 << 12)) != 0 && os_avx; if (have_sse41) { halide_set_available_cpu_feature(features, halide_target_feature_sse41); } @@ -127,7 +155,7 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { if ((info2[1] & avx2) == avx2) { halide_set_available_cpu_feature(features, halide_target_feature_avx2); } - if ((info2[1] & avx512) == avx512) { + if (os_avx512 && (info2[1] & avx512) == avx512) { halide_set_available_cpu_feature(features, halide_target_feature_avx512); if ((info2[1] & avx512_knl) == avx512_knl) { halide_set_available_cpu_feature(features, halide_target_feature_avx512_knl); diff --git a/test/generator/can_use_target_aottest.cpp b/test/generator/can_use_target_aottest.cpp index 64e8b13d5227..bd2624d700ae 100644 --- a/test/generator/can_use_target_aottest.cpp +++ b/test/generator/can_use_target_aottest.cpp @@ -31,6 +31,17 @@ static void cpuid(int info[4], int infoType, int extra) { : "0"(infoType), "2"(extra)); } #endif +#if defined(_MSC_VER) +static uint64_t xgetbv(uint32_t xcr) { + return _xgetbv(xcr); +} +#else +static uint64_t xgetbv(uint32_t xcr) { + uint32_t lo, hi; + __asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(xcr)); + return ((uint64_t)hi << 32) | lo; +} +#endif #endif // TESTING_ON_X86 struct HostFeatures { @@ -52,11 +63,18 @@ int main(int argc, char **argv) { int info[4]; cpuid(info, 1, 0); + // Check OS support for AVX state saving via XSAVE + bool os_avx = false; + if (info[2] & (1 << 27)) { // OSXSAVE + uint64_t xcr0 = xgetbv(0); + os_avx = (xcr0 & 0x6) == 0x6; // XMM (bit 1) + YMM (bit 2) + } + HostFeatures host_features; - if (info[2] & (1 << 28)) host_features.set(halide_target_feature_avx); + if ((info[2] & (1 << 28)) && os_avx) host_features.set(halide_target_feature_avx); if (info[2] & (1 << 19)) host_features.set(halide_target_feature_sse41); - if (info[2] & (1 << 29)) host_features.set(halide_target_feature_f16c); - if (info[2] & (1 << 12)) host_features.set(halide_target_feature_fma); + if ((info[2] & (1 << 29)) && os_avx) host_features.set(halide_target_feature_f16c); + if ((info[2] & (1 << 12)) && os_avx) host_features.set(halide_target_feature_fma); printf("host_features are: "); for (int i = 0; i < host_features.kWordCount; i++) { From f751e14b383b80f905f83b2c798660aaab6e9c17 Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Thu, 12 Mar 2026 23:39:08 -0400 Subject: [PATCH 2/7] clang-format --- src/Target.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Target.cpp b/src/Target.cpp index 892de618d4ca..411a3fdc37f0 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -360,17 +360,17 @@ Target calculate_host_target() { bool os_avx512 = false; if (have_osxsave) { uint64_t xcr0 = xgetbv(0); - os_avx = (xcr0 & 0x6) == 0x6; // XMM (bit 1) + YMM (bit 2) - os_avx512 = os_avx && ((xcr0 & 0xE0) == 0xE0); // opmask (5) + ZMM_Hi256 (6) + Hi16_ZMM (7) + os_avx = (xcr0 & 0x6) == 0x6; // XMM (bit 1) + YMM (bit 2) + os_avx512 = os_avx && ((xcr0 & 0xE0) == 0xE0); // opmask (5) + ZMM_Hi256 (6) + Hi16_ZMM (7) } - bool have_sse41 = (info[2] & (1 << 19)) != 0; // ECX[19] - bool have_sse2 = (info[3] & (1 << 26)) != 0; // EDX[26] - bool have_sse3 = (info[2] & (1 << 0)) != 0; // ECX[0] - bool have_avx = (info[2] & (1 << 28)) != 0 && os_avx; // ECX[28], requires OS AVX support - bool have_f16c = (info[2] & (1 << 29)) != 0 && os_avx; // ECX[29], VEX-encoded - bool have_rdrand = (info[2] & (1 << 30)) != 0; // ECX[30] - bool have_fma = (info[2] & (1 << 12)) != 0 && os_avx; // ECX[12], VEX-encoded + bool have_sse41 = (info[2] & (1 << 19)) != 0; // ECX[19] + bool have_sse2 = (info[3] & (1 << 26)) != 0; // EDX[26] + bool have_sse3 = (info[2] & (1 << 0)) != 0; // ECX[0] + bool have_avx = (info[2] & (1 << 28)) != 0 && os_avx; // ECX[28], requires OS AVX support + bool have_f16c = (info[2] & (1 << 29)) != 0 && os_avx; // ECX[29], VEX-encoded + bool have_rdrand = (info[2] & (1 << 30)) != 0; // ECX[30] + bool have_fma = (info[2] & (1 << 12)) != 0 && os_avx; // ECX[12], VEX-encoded user_assert(have_sse2) << "The x86 backend assumes at least sse2 support. This machine does not appear to have sse2.\n" From 212a23d2289a0c8657cdf6a012c362cfb96077ad Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Fri, 13 Mar 2026 09:25:59 -0400 Subject: [PATCH 3/7] Use the info_avx10 array in Target.cpp --- src/Target.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Target.cpp b/src/Target.cpp index 411a3fdc37f0..59f4716bf70b 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -486,18 +486,18 @@ Target calculate_host_target() { // This checks that the AVX10 version is greater than zero. // It isn't really needed as for now only one version exists, but // the docs indicate bits 0:7 of EBX should be >= 0 so... - if ((info[1] & 0xff) >= 1) { + if ((info_avx10[1] & 0xff) >= 1) { initial_features.push_back(Target::AVX10_1); const uint32_t avx10_128 = 1U << 16; const uint32_t avx10_256 = 1U << 17; const uint32_t avx10_512 = 1U << 18; // Choose the maximum one that is available. - if (info[1] & avx10_512) { + if (info_avx10[1] & avx10_512) { vector_bits = 512; - } else if (info[1] & avx10_256) { + } else if (info_avx10[1] & avx10_256) { vector_bits = 256; - } else if (info[1] & avx10_128) { // Not clear it is worth turning on AVX10 for this case. + } else if (info_avx10[1] & avx10_128) { // Not clear it is worth turning on AVX10 for this case. vector_bits = 128; } } From 15d2fe34a775997f15b43437ff8aaa07477880a2 Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Fri, 13 Mar 2026 09:27:16 -0400 Subject: [PATCH 4/7] Detect SVE2 on ARM Windows --- src/Target.cpp | 8 +++++++- src/runtime/aarch64_cpu_features.cpp | 7 ++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/Target.cpp b/src/Target.cpp index 59f4716bf70b..4c9096ed5d59 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -292,8 +292,9 @@ Target calculate_host_target() { #ifdef _MSC_VER - // Magic value from: https://github.com/dotnet/runtime/blob/7e977dcbe5efaeec2c75ed0c3e200c85b2e55522/src/native/minipal/cpufeatures.c#L19 + // https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent #define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE (46) +#define PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE (47) // This is the strategy used by Google's cpuinfo library for // detecting fp16 arithmetic support on Windows. @@ -312,6 +313,11 @@ Target calculate_host_target() { // has_scalable_vector = true; // } + if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) { + initial_features.push_back(Target::SVE2); + has_scalable_vector = true; + } + #endif #if defined(__aarch64__) diff --git a/src/runtime/aarch64_cpu_features.cpp b/src/runtime/aarch64_cpu_features.cpp index 4ab038fa983c..cc591341e42d 100644 --- a/src/runtime/aarch64_cpu_features.cpp +++ b/src/runtime/aarch64_cpu_features.cpp @@ -79,8 +79,9 @@ extern "C" BOOL IsProcessorFeaturePresent(DWORD feature); #define PF_ARM_FMAC_INSTRUCTIONS_AVAILABLE (27) #define PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE (43) -// Magic value from: https://github.com/dotnet/runtime/blob/7e977dcbe5efaeec2c75ed0c3e200c85b2e55522/src/native/minipal/cpufeatures.c#L19 +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent #define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE (46) +#define PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE (47) namespace { @@ -99,6 +100,10 @@ void set_platform_features(CpuFeatures *features) { if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) { halide_set_available_cpu_feature(features, halide_target_feature_sve); } + + if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) { + halide_set_available_cpu_feature(features, halide_target_feature_sve2); + } } } // namespace From 702dc669cc350c2cefa9b23105d5cc277ac0a8f3 Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Fri, 13 Mar 2026 09:42:47 -0400 Subject: [PATCH 5/7] Enable detecting AVX10 and APX, respecting XSAVE --- src/Target.cpp | 6 ++++-- src/runtime/x86_cpu_features.cpp | 37 +++++++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/Target.cpp b/src/Target.cpp index 4c9096ed5d59..a4f776fe8304 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -364,10 +364,12 @@ Target calculate_host_target() { bool have_osxsave = (info[2] & (1 << 27)) != 0; // ECX[27] bool os_avx = false; bool os_avx512 = false; + bool os_apx = false; if (have_osxsave) { uint64_t xcr0 = xgetbv(0); os_avx = (xcr0 & 0x6) == 0x6; // XMM (bit 1) + YMM (bit 2) os_avx512 = os_avx && ((xcr0 & 0xE0) == 0xE0); // opmask (5) + ZMM_Hi256 (6) + Hi16_ZMM (7) + os_apx = (xcr0 & 0x80000) == 0x80000; // APX extended GPRs (bit 19) } bool have_sse41 = (info[2] & (1 << 19)) != 0; // ECX[19] @@ -485,7 +487,7 @@ Target calculate_host_target() { // AVX10 converged vector instructions. const uint32_t avx10 = 1U << 19; - if (info2[3] & avx10) { + if (os_avx512 && (info2[3] & avx10)) { int info_avx10[4]; cpuid(info_avx10, 0x24, 0x0); @@ -511,7 +513,7 @@ Target calculate_host_target() { // APX register extensions, etc. const uint32_t apx = 1U << 21; - if (info3[3] & apx) { + if (os_apx && (info3[3] & apx)) { initial_features.push_back(Target::X86APX); } } diff --git a/src/runtime/x86_cpu_features.cpp b/src/runtime/x86_cpu_features.cpp index 3a17d3fb971b..c4e831a86ced 100644 --- a/src/runtime/x86_cpu_features.cpp +++ b/src/runtime/x86_cpu_features.cpp @@ -35,14 +35,20 @@ ALWAYS_INLINE uint32_t xgetbv(uint32_t xcr_id) { extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { halide_set_known_cpu_feature(features, halide_target_feature_sse41); halide_set_known_cpu_feature(features, halide_target_feature_avx); - halide_set_known_cpu_feature(features, halide_target_feature_f16c); - halide_set_known_cpu_feature(features, halide_target_feature_fma); halide_set_known_cpu_feature(features, halide_target_feature_avx2); + halide_set_known_cpu_feature(features, halide_target_feature_avxvnni); + halide_set_known_cpu_feature(features, halide_target_feature_fma); + // halide_set_known_cpu_feature(features, halide_target_feature_fma4); + halide_set_known_cpu_feature(features, halide_target_feature_f16c); halide_set_known_cpu_feature(features, halide_target_feature_avx512); halide_set_known_cpu_feature(features, halide_target_feature_avx512_knl); halide_set_known_cpu_feature(features, halide_target_feature_avx512_skylake); halide_set_known_cpu_feature(features, halide_target_feature_avx512_cannonlake); + halide_set_known_cpu_feature(features, halide_target_feature_avx512_zen4); + halide_set_known_cpu_feature(features, halide_target_feature_avx512_zen5); halide_set_known_cpu_feature(features, halide_target_feature_avx512_sapphirerapids); + halide_set_known_cpu_feature(features, halide_target_feature_avx10_1); + halide_set_known_cpu_feature(features, halide_target_feature_x86_apx); // Detect CPU features by specific microarchitecture. int32_t vendor[4]; @@ -56,10 +62,12 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { const bool have_osxsave = (info[2] & (1 << 27)) != 0; // ECX[27] bool os_avx = false; bool os_avx512 = false; + bool os_apx = false; if (have_osxsave) { uint32_t xcr0 = xgetbv(0); os_avx = (xcr0 & 0x6) == 0x6; // XMM (bit 1) + YMM (bit 2) os_avx512 = os_avx && ((xcr0 & 0xE0) == 0xE0); // opmask (5) + ZMM_Hi256 (6) + Hi16_ZMM (7) + os_apx = (xcr0 & 0x80000) == 0x80000; // APX extended GPRs (bit 19) } uint32_t family = (info[0] >> 8) & 0xF; // Bits 8..11 @@ -135,8 +143,10 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { } if (use_64_bits && have_avx && have_f16c && have_rdrand) { - int info2[4]; + int32_t info2[4]; cpuid(info2, 7); + int32_t info3[4]; + cpuid(info3, 7, 1); constexpr uint32_t avx2 = 1U << 5; constexpr uint32_t avx512f = 1U << 16; constexpr uint32_t avx512dq = 1U << 17; @@ -166,8 +176,6 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { if ((info2[1] & avx512_cannonlake) == avx512_cannonlake) { halide_set_available_cpu_feature(features, halide_target_feature_avx512_cannonlake); - int32_t info3[4]; - cpuid(info3, 7, 1); if ((info3[0] & avxvnni) == avxvnni) { halide_set_available_cpu_feature(features, halide_target_feature_avxvnni); if ((info3[0] & avx512bf16) == avx512bf16) { @@ -176,6 +184,25 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { } } } + + // AVX10 converged vector instructions. + // AVX10 uses EVEX encoding with opmask registers at all vector widths, + // so it requires the same OS XSAVE support as AVX-512. + constexpr uint32_t avx10 = 1U << 19; + if (os_avx512 && (info2[3] & avx10)) { + int32_t info_avx10[4]; + cpuid(info_avx10, 0x24, 0x0); + if ((info_avx10[1] & 0xff) >= 1) { + halide_set_available_cpu_feature(features, halide_target_feature_avx10_1); + } + } + + // APX extended GPRs (R16-R31) require OS support via XSAVE + // state component 19 (XCR0 bit 19). + constexpr uint32_t apx = 1U << 21; + if (os_apx && (info3[3] & apx)) { + halide_set_available_cpu_feature(features, halide_target_feature_x86_apx); + } } return halide_error_code_success; } From 3533b4ae3425daa39cf1f27032a7b41c72d34a0d Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Fri, 13 Mar 2026 09:56:33 -0400 Subject: [PATCH 6/7] Enable FMA4 runtime and host detection --- src/Target.cpp | 9 +++++++++ src/runtime/x86_cpu_features.cpp | 12 +++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/Target.cpp b/src/Target.cpp index a4f776fe8304..4785d8c1b6c7 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -380,6 +380,12 @@ Target calculate_host_target() { bool have_rdrand = (info[2] & (1 << 30)) != 0; // ECX[30] bool have_fma = (info[2] & (1 << 12)) != 0 && os_avx; // ECX[12], VEX-encoded + // FMA4 is in CPUID extended leaf 0x80000001, ECX bit 16. + // It uses VEX-encoded YMM instructions, so requires OS AVX support. + int info_ext[4]; + cpuid(info_ext, 0x80000001, 0); + bool have_fma4 = (info_ext[2] & (1 << 16)) != 0 && os_avx; // ECX[16], VEX-encoded + user_assert(have_sse2) << "The x86 backend assumes at least sse2 support. This machine does not appear to have sse2.\n" << "cpuid returned: " @@ -435,6 +441,9 @@ Target calculate_host_target() { if (have_fma) { initial_features.push_back(Target::FMA); } + if (have_fma4) { + initial_features.push_back(Target::FMA4); + } if (use_64_bits && have_avx && have_f16c && have_rdrand) { // So far, so good. AVX2/512? diff --git a/src/runtime/x86_cpu_features.cpp b/src/runtime/x86_cpu_features.cpp index c4e831a86ced..0b762f4058ac 100644 --- a/src/runtime/x86_cpu_features.cpp +++ b/src/runtime/x86_cpu_features.cpp @@ -38,7 +38,7 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { halide_set_known_cpu_feature(features, halide_target_feature_avx2); halide_set_known_cpu_feature(features, halide_target_feature_avxvnni); halide_set_known_cpu_feature(features, halide_target_feature_fma); - // halide_set_known_cpu_feature(features, halide_target_feature_fma4); + halide_set_known_cpu_feature(features, halide_target_feature_fma4); halide_set_known_cpu_feature(features, halide_target_feature_f16c); halide_set_known_cpu_feature(features, halide_target_feature_avx512); halide_set_known_cpu_feature(features, halide_target_feature_avx512_knl); @@ -129,6 +129,13 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { const bool have_f16c = (info[2] & (1 << 29)) != 0 && os_avx; const bool have_rdrand = (info[2] & (1 << 30)) != 0; const bool have_fma = (info[2] & (1 << 12)) != 0 && os_avx; + + // FMA4 is in CPUID extended leaf 0x80000001, ECX bit 16. + // It uses VEX-encoded YMM instructions, so requires OS AVX support. + int32_t info_ext[4]; + cpuid(info_ext, 0x80000001); + const bool have_fma4 = (info_ext[2] & (1 << 16)) != 0 && os_avx; + if (have_sse41) { halide_set_available_cpu_feature(features, halide_target_feature_sse41); } @@ -141,6 +148,9 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { if (have_fma) { halide_set_available_cpu_feature(features, halide_target_feature_fma); } + if (have_fma4) { + halide_set_available_cpu_feature(features, halide_target_feature_fma4); + } if (use_64_bits && have_avx && have_f16c && have_rdrand) { int32_t info2[4]; From 0bd19c099db5bf889d32f6c1ef10f848551bd1ad Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Fri, 13 Mar 2026 10:16:54 -0400 Subject: [PATCH 7/7] Refactor cpuid to be [[nodiscard]] --- src/Target.cpp | 97 ++++++++++++++++---------------- src/runtime/x86_cpu_features.cpp | 74 ++++++++++++------------ 2 files changed, 85 insertions(+), 86 deletions(-) diff --git a/src/Target.cpp b/src/Target.cpp index 4785d8c1b6c7..c1b8ebbdb99a 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -65,13 +65,19 @@ __attribute__((target("+sve"))) int get_sve_vector_length() { } #endif +struct cpuid_result { + int eax, ebx, ecx, edx; +}; + #if defined(_M_IX86) || defined(_M_AMD64) -void cpuid(int info[4], int infoType, int extra) { +[[nodiscard]] cpuid_result cpuid(int infoType, int extra = 0) { + int info[4]; __cpuidex(info, infoType, extra); + return {info[0], info[1], info[2], info[3]}; } -uint64_t xgetbv(uint32_t xcr) { +[[nodiscard]] uint64_t xgetbv(uint32_t xcr) { return _xgetbv(xcr); } @@ -80,14 +86,16 @@ uint64_t xgetbv(uint32_t xcr) { // CPU feature detection code taken from ispc // (https://github.com/ispc/ispc/blob/master/builtins/dispatch.ll) -void cpuid(int info[4], int infoType, int extra) { +[[nodiscard]] cpuid_result cpuid(int infoType, int extra = 0) { + cpuid_result result; __asm__ __volatile__( "cpuid \n\t" - : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) + : "=a"(result.eax), "=b"(result.ebx), "=c"(result.ecx), "=d"(result.edx) : "0"(infoType), "2"(extra)); + return result; } -uint64_t xgetbv(uint32_t xcr) { +[[nodiscard]] uint64_t xgetbv(uint32_t xcr) { uint32_t lo, hi; __asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(xcr)); return ((uint64_t)hi << 32) | lo; @@ -104,20 +112,19 @@ enum class VendorSignatures { }; VendorSignatures get_vendor_signature() { - int info[4]; - cpuid(info, 0, 0); + const auto info = cpuid(0); - if (info[0] < 1) { + if (info.eax < 1) { return VendorSignatures::Unknown; } // "Genu ineI ntel" - if (info[1] == 0x756e6547 && info[3] == 0x49656e69 && info[2] == 0x6c65746e) { + if (info.ebx == 0x756e6547 && info.edx == 0x49656e69 && info.ecx == 0x6c65746e) { return VendorSignatures::GenuineIntel; } // "Auth enti cAMD" - if (info[1] == 0x68747541 && info[3] == 0x69746e65 && info[2] == 0x444d4163) { + if (info.ebx == 0x68747541 && info.edx == 0x69746e65 && info.ecx == 0x444d4163) { return VendorSignatures::AuthenticAMD; } @@ -352,16 +359,15 @@ Target calculate_host_target() { VendorSignatures vendor_signature = get_vendor_signature(); - int info[4]; - cpuid(info, 1, 0); + const auto info = cpuid(1); unsigned family = 0, model = 0; - detect_family_and_model(info[0], family, model); + detect_family_and_model(info.eax, family, model); // Check OS support for AVX/AVX-512 state saving via XSAVE. // Even if the CPU supports these features, the OS must enable // the corresponding state components in XCR0 or use will fault. - bool have_osxsave = (info[2] & (1 << 27)) != 0; // ECX[27] + bool have_osxsave = (info.ecx & (1 << 27)) != 0; // ECX[27] bool os_avx = false; bool os_avx512 = false; bool os_apx = false; @@ -372,27 +378,26 @@ Target calculate_host_target() { os_apx = (xcr0 & 0x80000) == 0x80000; // APX extended GPRs (bit 19) } - bool have_sse41 = (info[2] & (1 << 19)) != 0; // ECX[19] - bool have_sse2 = (info[3] & (1 << 26)) != 0; // EDX[26] - bool have_sse3 = (info[2] & (1 << 0)) != 0; // ECX[0] - bool have_avx = (info[2] & (1 << 28)) != 0 && os_avx; // ECX[28], requires OS AVX support - bool have_f16c = (info[2] & (1 << 29)) != 0 && os_avx; // ECX[29], VEX-encoded - bool have_rdrand = (info[2] & (1 << 30)) != 0; // ECX[30] - bool have_fma = (info[2] & (1 << 12)) != 0 && os_avx; // ECX[12], VEX-encoded + bool have_sse41 = (info.ecx & (1 << 19)) != 0; // ECX[19] + bool have_sse2 = (info.edx & (1 << 26)) != 0; // EDX[26] + bool have_sse3 = (info.ecx & (1 << 0)) != 0; // ECX[0] + bool have_avx = (info.ecx & (1 << 28)) != 0 && os_avx; // ECX[28], requires OS AVX support + bool have_f16c = (info.ecx & (1 << 29)) != 0 && os_avx; // ECX[29], VEX-encoded + bool have_rdrand = (info.ecx & (1 << 30)) != 0; // ECX[30] + bool have_fma = (info.ecx & (1 << 12)) != 0 && os_avx; // ECX[12], VEX-encoded // FMA4 is in CPUID extended leaf 0x80000001, ECX bit 16. // It uses VEX-encoded YMM instructions, so requires OS AVX support. - int info_ext[4]; - cpuid(info_ext, 0x80000001, 0); - bool have_fma4 = (info_ext[2] & (1 << 16)) != 0 && os_avx; // ECX[16], VEX-encoded + const auto info_ext = cpuid(0x80000001); + bool have_fma4 = (info_ext.ecx & (1 << 16)) != 0 && os_avx; // ECX[16], VEX-encoded user_assert(have_sse2) << "The x86 backend assumes at least sse2 support. This machine does not appear to have sse2.\n" << "cpuid returned: " - << std::hex << info[0] - << ", " << info[1] - << ", " << info[2] - << ", " << info[3] + << std::hex << info.eax + << ", " << info.ebx + << ", " << info.ecx + << ", " << info.edx << std::dec << "\n"; if (vendor_signature == VendorSignatures::AuthenticAMD) { @@ -447,11 +452,8 @@ Target calculate_host_target() { if (use_64_bits && have_avx && have_f16c && have_rdrand) { // So far, so good. AVX2/512? - // Call cpuid with eax=7, ecx=0 - int info2[4]; - cpuid(info2, 7, 0); - int info3[4]; - cpuid(info3, 7, 1); + const auto info2 = cpuid(7, 0); + const auto info3 = cpuid(7, 1); const uint32_t avx2 = 1U << 5; const uint32_t avx512f = 1U << 16; const uint32_t avx512dq = 1U << 17; @@ -465,29 +467,29 @@ Target calculate_host_target() { const uint32_t avx512_knl = avx512 | avx512pf | avx512er; const uint32_t avx512_skylake = avx512 | avx512vl | avx512bw | avx512dq; const uint32_t avx512_cannonlake = avx512_skylake | avx512ifma; // Assume ifma => vbmi - if ((info2[1] & avx2) == avx2) { + if ((info2.ebx & avx2) == avx2) { initial_features.push_back(Target::AVX2); } - if (os_avx512 && (info2[1] & avx512) == avx512) { + if (os_avx512 && (info2.ebx & avx512) == avx512) { initial_features.push_back(Target::AVX512); // TODO: port to family/model -based detection. - if ((info2[1] & avx512_knl) == avx512_knl) { + if ((info2.ebx & avx512_knl) == avx512_knl) { initial_features.push_back(Target::AVX512_KNL); } // TODO: port to family/model -based detection. - if ((info2[1] & avx512_skylake) == avx512_skylake) { + if ((info2.ebx & avx512_skylake) == avx512_skylake) { initial_features.push_back(Target::AVX512_Skylake); } // TODO: port to family/model -based detection. - if ((info2[1] & avx512_cannonlake) == avx512_cannonlake) { + if ((info2.ebx & avx512_cannonlake) == avx512_cannonlake) { initial_features.push_back(Target::AVX512_Cannonlake); const uint32_t avxvnni = 1U << 4; // avxvnni (note, not avx512vnni) result in eax const uint32_t avx512bf16 = 1U << 5; // bf16 result in eax, with cpuid(eax=7, ecx=1) // TODO: port to family/model -based detection. - if ((info3[0] & avxvnni) == avxvnni) { + if ((info3.eax & avxvnni) == avxvnni) { initial_features.push_back(Target::AVXVNNI); - if ((info3[0] & avx512bf16) == avx512bf16) { + if ((info3.eax & avx512bf16) == avx512bf16) { initial_features.push_back(Target::AVX512_SapphireRapids); } } @@ -496,25 +498,24 @@ Target calculate_host_target() { // AVX10 converged vector instructions. const uint32_t avx10 = 1U << 19; - if (os_avx512 && (info2[3] & avx10)) { - int info_avx10[4]; - cpuid(info_avx10, 0x24, 0x0); + if (os_avx512 && (info2.edx & avx10)) { + const auto info_avx10 = cpuid(0x24, 0x0); // This checks that the AVX10 version is greater than zero. // It isn't really needed as for now only one version exists, but // the docs indicate bits 0:7 of EBX should be >= 0 so... - if ((info_avx10[1] & 0xff) >= 1) { + if ((info_avx10.ebx & 0xff) >= 1) { initial_features.push_back(Target::AVX10_1); const uint32_t avx10_128 = 1U << 16; const uint32_t avx10_256 = 1U << 17; const uint32_t avx10_512 = 1U << 18; // Choose the maximum one that is available. - if (info_avx10[1] & avx10_512) { + if (info_avx10.ebx & avx10_512) { vector_bits = 512; - } else if (info_avx10[1] & avx10_256) { + } else if (info_avx10.ebx & avx10_256) { vector_bits = 256; - } else if (info_avx10[1] & avx10_128) { // Not clear it is worth turning on AVX10 for this case. + } else if (info_avx10.ebx & avx10_128) { // Not clear it is worth turning on AVX10 for this case. vector_bits = 128; } } @@ -522,7 +523,7 @@ Target calculate_host_target() { // APX register extensions, etc. const uint32_t apx = 1U << 21; - if (os_apx && (info3[3] & apx)) { + if (os_apx && (info3.edx & apx)) { initial_features.push_back(Target::X86APX); } } diff --git a/src/runtime/x86_cpu_features.cpp b/src/runtime/x86_cpu_features.cpp index 0b762f4058ac..afb4ea0d3e46 100644 --- a/src/runtime/x86_cpu_features.cpp +++ b/src/runtime/x86_cpu_features.cpp @@ -13,18 +13,22 @@ namespace { constexpr bool use_64_bits = (sizeof(size_t) == 8); -ALWAYS_INLINE void cpuid(int32_t *info, int32_t fn_id, int32_t extra = 0) { - info[0] = fn_id; - info[1] = extra; +struct cpuid_result { + int32_t eax, ebx, ecx, edx; +}; + +[[nodiscard]] ALWAYS_INLINE cpuid_result cpuid(int32_t fn_id, int32_t extra = 0) { + int32_t info[4] = {fn_id, extra, 0, 0}; if (use_64_bits) { x64_cpuid_halide(info); } else { x86_cpuid_halide(info); } + return {info[0], info[1], info[2], info[3]}; } // Returns low 32 bits of XCR specified by xcr_id. -ALWAYS_INLINE uint32_t xgetbv(uint32_t xcr_id) { +[[nodiscard]] ALWAYS_INLINE uint32_t xgetbv(uint32_t xcr_id) { int32_t xcr_info[2] = {(int32_t)xcr_id, 0}; xgetbv_halide(xcr_info); return (uint32_t)xcr_info[0]; @@ -51,15 +55,13 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { halide_set_known_cpu_feature(features, halide_target_feature_x86_apx); // Detect CPU features by specific microarchitecture. - int32_t vendor[4]; - cpuid(vendor, 0); - int32_t info[4]; - cpuid(info, 1); + const auto vendor = cpuid(0); + const auto info = cpuid(1); // Check OS support for AVX/AVX-512 state saving via XSAVE. // Even if the CPU supports these features, the OS must enable // the corresponding state components in XCR0 or use will fault. - const bool have_osxsave = (info[2] & (1 << 27)) != 0; // ECX[27] + const bool have_osxsave = (info.ecx & (1 << 27)) != 0; // ECX[27] bool os_avx = false; bool os_avx512 = false; bool os_apx = false; @@ -70,18 +72,18 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { os_apx = (xcr0 & 0x80000) == 0x80000; // APX extended GPRs (bit 19) } - uint32_t family = (info[0] >> 8) & 0xF; // Bits 8..11 - uint32_t model = (info[0] >> 4) & 0xF; // Bits 4..7 + uint32_t family = (info.eax >> 8) & 0xF; // Bits 8..11 + uint32_t model = (info.eax >> 4) & 0xF; // Bits 4..7 if (family == 0x6 || family == 0xF) { if (family == 0xF) { // Examine extended family ID if family ID is 0xF. - family += (info[0] >> 20) & 0xFf; // Bits 20..27 + family += (info.eax >> 20) & 0xFf; // Bits 20..27 } // Examine extended model ID if family ID is 0x6 or 0xF. - model += ((info[0] >> 16) & 0xF) << 4; // Bits 16..19 + model += ((info.eax >> 16) & 0xF) << 4; // Bits 16..19 } - if (vendor[1] == 0x68747541 && vendor[3] == 0x69746e65 && vendor[2] == 0x444d4163) { + if (vendor.ebx == 0x68747541 && vendor.edx == 0x69746e65 && vendor.ecx == 0x444d4163) { // AMD if (family == 0x19 && model == 0x61) { // Zen4 @@ -124,17 +126,16 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { // microarchitectures above rather than making the code below more // complicated. - const bool have_sse41 = (info[2] & (1 << 19)) != 0; - const bool have_avx = (info[2] & (1 << 28)) != 0 && os_avx; - const bool have_f16c = (info[2] & (1 << 29)) != 0 && os_avx; - const bool have_rdrand = (info[2] & (1 << 30)) != 0; - const bool have_fma = (info[2] & (1 << 12)) != 0 && os_avx; + const bool have_sse41 = (info.ecx & (1 << 19)) != 0; + const bool have_avx = (info.ecx & (1 << 28)) != 0 && os_avx; + const bool have_f16c = (info.ecx & (1 << 29)) != 0 && os_avx; + const bool have_rdrand = (info.ecx & (1 << 30)) != 0; + const bool have_fma = (info.ecx & (1 << 12)) != 0 && os_avx; // FMA4 is in CPUID extended leaf 0x80000001, ECX bit 16. // It uses VEX-encoded YMM instructions, so requires OS AVX support. - int32_t info_ext[4]; - cpuid(info_ext, 0x80000001); - const bool have_fma4 = (info_ext[2] & (1 << 16)) != 0 && os_avx; + const auto info_ext = cpuid(0x80000001); + const bool have_fma4 = (info_ext.ecx & (1 << 16)) != 0 && os_avx; if (have_sse41) { halide_set_available_cpu_feature(features, halide_target_feature_sse41); @@ -153,10 +154,8 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { } if (use_64_bits && have_avx && have_f16c && have_rdrand) { - int32_t info2[4]; - cpuid(info2, 7); - int32_t info3[4]; - cpuid(info3, 7, 1); + const auto info2 = cpuid(7); + const auto info3 = cpuid(7, 1); constexpr uint32_t avx2 = 1U << 5; constexpr uint32_t avx512f = 1U << 16; constexpr uint32_t avx512dq = 1U << 17; @@ -172,23 +171,23 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { constexpr uint32_t avx512_knl = avx512 | avx512pf | avx512er; constexpr uint32_t avx512_skylake = avx512 | avx512vl | avx512bw | avx512dq; constexpr uint32_t avx512_cannonlake = avx512_skylake | avx512ifma; // Assume ifma => vbmi - if ((info2[1] & avx2) == avx2) { + if ((info2.ebx & avx2) == avx2) { halide_set_available_cpu_feature(features, halide_target_feature_avx2); } - if (os_avx512 && (info2[1] & avx512) == avx512) { + if (os_avx512 && (info2.ebx & avx512) == avx512) { halide_set_available_cpu_feature(features, halide_target_feature_avx512); - if ((info2[1] & avx512_knl) == avx512_knl) { + if ((info2.ebx & avx512_knl) == avx512_knl) { halide_set_available_cpu_feature(features, halide_target_feature_avx512_knl); } - if ((info2[1] & avx512_skylake) == avx512_skylake) { + if ((info2.ebx & avx512_skylake) == avx512_skylake) { halide_set_available_cpu_feature(features, halide_target_feature_avx512_skylake); } - if ((info2[1] & avx512_cannonlake) == avx512_cannonlake) { + if ((info2.ebx & avx512_cannonlake) == avx512_cannonlake) { halide_set_available_cpu_feature(features, halide_target_feature_avx512_cannonlake); - if ((info3[0] & avxvnni) == avxvnni) { + if ((info3.eax & avxvnni) == avxvnni) { halide_set_available_cpu_feature(features, halide_target_feature_avxvnni); - if ((info3[0] & avx512bf16) == avx512bf16) { + if ((info3.eax & avx512bf16) == avx512bf16) { halide_set_available_cpu_feature(features, halide_target_feature_avx512_sapphirerapids); } } @@ -199,10 +198,9 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { // AVX10 uses EVEX encoding with opmask registers at all vector widths, // so it requires the same OS XSAVE support as AVX-512. constexpr uint32_t avx10 = 1U << 19; - if (os_avx512 && (info2[3] & avx10)) { - int32_t info_avx10[4]; - cpuid(info_avx10, 0x24, 0x0); - if ((info_avx10[1] & 0xff) >= 1) { + if (os_avx512 && (info2.edx & avx10)) { + const auto info_avx10 = cpuid(0x24, 0x0); + if ((info_avx10.ebx & 0xff) >= 1) { halide_set_available_cpu_feature(features, halide_target_feature_avx10_1); } } @@ -210,7 +208,7 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { // APX extended GPRs (R16-R31) require OS support via XSAVE // state component 19 (XCR0 bit 19). constexpr uint32_t apx = 1U << 21; - if (os_apx && (info3[3] & apx)) { + if (os_apx && (info3.edx & apx)) { halide_set_available_cpu_feature(features, halide_target_feature_x86_apx); } }