halide · alexreinking · Mar 14, 2026 · Mar 13, 2026 · Mar 13, 2026 · Mar 13, 2026
diff --git a/src/Target.cpp b/src/Target.cpp
@@ -65,22 +65,40 @@ __attribute__((target("+sve"))) int get_sve_vector_length() {
 }
 #endif
 
+struct cpuid_result {
+    int eax, ebx, ecx, edx;  // register values produced by a single cpuid() call
+};
+
 #if defined(_M_IX86) || defined(_M_AMD64)
 
-void cpuid(int info[4], int infoType, int extra) {
+[[nodiscard]] cpuid_result cpuid(int infoType, int extra = 0) {  // wraps the __cpuidex intrinsic
+    int info[4];  // scratch buffer handed to __cpuidex
     __cpuidex(info, infoType, extra);
+    return {info[0], info[1], info[2], info[3]};  // info[0..3] become eax, ebx, ecx, edx
+}
+
+[[nodiscard]] uint64_t xgetbv(uint32_t xcr) {  // reads extended control register `xcr` via the _xgetbv intrinsic
+    return _xgetbv(xcr);
 }
 
 #elif defined(__x86_64__) || defined(__i386__)
 
 // CPU feature detection code taken from ispc
 // (https://github.com/ispc/ispc/blob/master/builtins/dispatch.ll)
 
-void cpuid(int info[4], int infoType, int extra) {
+[[nodiscard]] cpuid_result cpuid(int infoType, int extra = 0) {  // infoType is passed in EAX, extra in ECX
+    cpuid_result result;  // all four fields are outputs of the asm statement below
     __asm__ __volatile__(
         "cpuid                 \n\t"
-        : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
+        : "=a"(result.eax), "=b"(result.ebx), "=c"(result.ecx), "=d"(result.edx)
         : "0"(infoType), "2"(extra));
+    return result;
+}
+
+[[nodiscard]] uint64_t xgetbv(uint32_t xcr) {
+    uint32_t lo, hi;  // receive EAX (low half) and EDX (high half) from the instruction
+    __asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(xcr));  // xcr is passed in ECX
+    return ((uint64_t)hi << 32) | lo;
 }
 
 #endif
@@ -94,20 +112,19 @@ enum class VendorSignatures {
 };
 
 VendorSignatures get_vendor_signature() {
-    int info[4];
-    cpuid(info, 0, 0);
+    const auto info = cpuid(0);
 
-    if (info[0] < 1) {
+    if (info.eax < 1) {
         return VendorSignatures::Unknown;
     }
 
     // "Genu ineI ntel"
-    if (info[1] == 0x756e6547 && info[3] == 0x49656e69 && info[2] == 0x6c65746e) {
+    if (info.ebx == 0x756e6547 && info.edx == 0x49656e69 && info.ecx == 0x6c65746e) {
         return VendorSignatures::GenuineIntel;
     }
 
     // "Auth enti cAMD"
-    if (info[1] == 0x68747541 && info[3] == 0x69746e65 && info[2] == 0x444d4163) {
+    if (info.ebx == 0x68747541 && info.edx == 0x69746e65 && info.ecx == 0x444d4163) {
         return VendorSignatures::AuthenticAMD;
     }
 
@@ -282,8 +299,9 @@ Target calculate_host_target() {
 
 #ifdef _MSC_VER
 
-    // Magic value from: https://github.com/dotnet/runtime/blob/7e977dcbe5efaeec2c75ed0c3e200c85b2e55522/src/native/minipal/cpufeatures.c#L19
+    // https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent
 #define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE (46)
+#define PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE (47)
 
     // This is the strategy used by Google's cpuinfo library for
     // detecting fp16 arithmetic support on Windows.
@@ -302,6 +320,11 @@ Target calculate_host_target() {
     //     has_scalable_vector = true;
     // }
 
+    if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) {
+        initial_features.push_back(Target::SVE2);
+        has_scalable_vector = true;
+    }
+
 #endif
 
 #if defined(__aarch64__)
@@ -336,47 +359,73 @@ Target calculate_host_target() {
 
     VendorSignatures vendor_signature = get_vendor_signature();
 
-    int info[4];
-    cpuid(info, 1, 0);
+    const auto info = cpuid(1);
 
     unsigned family = 0, model = 0;
-    detect_family_and_model(info[0], family, model);
-
-    bool have_sse41 = (info[2] & (1 << 19)) != 0;   // ECX[19]
-    bool have_sse2 = (info[3] & (1 << 26)) != 0;    // EDX[26]
-    bool have_sse3 = (info[2] & (1 << 0)) != 0;     // ECX[0]
-    bool have_avx = (info[2] & (1 << 28)) != 0;     // ECX[28]
-    bool have_f16c = (info[2] & (1 << 29)) != 0;    // ECX[29]
-    bool have_rdrand = (info[2] & (1 << 30)) != 0;  // ECX[30]
-    bool have_fma = (info[2] & (1 << 12)) != 0;     // ECX[12]
+    detect_family_and_model(info.eax, family, model);
+
+    // Check OS support for AVX/AVX-512/APX register state via XCR0: even if CPUID
+    // reports a feature, using it faults unless the OS enabled its state component.
+    const bool lazy_avx512 = (os == Target::OSX);  // macOS sets XCR0[7:5] per thread only after first AVX-512 use
+    bool have_osxsave = (info.ecx & (1 << 27)) != 0;  // ECX[27]
+    bool os_avx = false;
+    bool os_avx512 = false;
+    bool os_apx = false;
+    if (have_osxsave) {
+        uint64_t xcr0 = xgetbv(0);
+        os_avx = (xcr0 & 0x6) == 0x6;                                  // XMM (bit 1) + YMM (bit 2)
+        os_avx512 = os_avx && (lazy_avx512 || (xcr0 & 0xE0) == 0xE0);  // opmask (5) + ZMM_Hi256 (6) + Hi16_ZMM (7)
+        os_apx = (xcr0 & 0x80000) == 0x80000;                          // APX extended GPRs (bit 19)
+    }
+
+    bool have_sse41 = (info.ecx & (1 << 19)) != 0;           // ECX[19]
+    bool have_sse2 = (info.edx & (1 << 26)) != 0;            // EDX[26]
+    bool have_sse3 = (info.ecx & (1 << 0)) != 0;             // ECX[0]
+    bool have_avx = (info.ecx & (1 << 28)) != 0 && os_avx;   // ECX[28], requires OS AVX support
+    bool have_f16c = (info.ecx & (1 << 29)) != 0 && os_avx;  // ECX[29], VEX-encoded
+    bool have_rdrand = (info.ecx & (1 << 30)) != 0;          // ECX[30]
+    bool have_fma = (info.ecx & (1 << 12)) != 0 && os_avx;   // ECX[12], VEX-encoded
+
+    // FMA4 is in CPUID extended leaf 0x80000001, ECX bit 16.
+    // It uses VEX-encoded YMM instructions, so requires OS AVX support.
+    const auto info_ext = cpuid(0x80000001);
+    bool have_fma4 = (info_ext.ecx & (1 << 16)) != 0 && os_avx;  // ECX[16], VEX-encoded
 
     user_assert(have_sse2)
         << "The x86 backend assumes at least sse2 support. This machine does not appear to have sse2.\n"
         << "cpuid returned: "
-        << std::hex << info[0]
-        << ", " << info[1]
-        << ", " << info[2]
-        << ", " << info[3]
+        << std::hex << info.eax
+        << ", " << info.ebx
+        << ", " << info.ecx
+        << ", " << info.edx
         << std::dec << "\n";
 
     if (vendor_signature == VendorSignatures::AuthenticAMD) {
         processor = get_amd_processor(family, model, have_sse3);
 
         if (processor == Target::Processor::ZnVer4) {
             Target t{os, arch, bits, processor, initial_features, vector_bits};
-            t.set_features({Target::SSE41, Target::AVX,
-                            Target::F16C, Target::FMA,
-                            Target::AVX2, Target::AVX512,
-                            Target::AVX512_Skylake, Target::AVX512_Cannonlake,
-                            Target::AVX512_Zen4});
+            t.set_feature(Target::SSE41);
+            if (os_avx) {
+                t.set_features({Target::AVX, Target::F16C, Target::FMA, Target::AVX2});
+            }
+            if (os_avx512) {
+                t.set_features({Target::AVX512, Target::AVX512_Skylake,
+                                Target::AVX512_Cannonlake, Target::AVX512_Zen4});
+            }
             return t;
         } else if (processor == Target::Processor::ZnVer5) {
             Target t{os, arch, bits, processor, initial_features, vector_bits};
-            t.set_features({Target::SSE41, Target::AVX,
-                            Target::F16C, Target::FMA,
-                            Target::AVX2, Target::AVXVNNI, Target::AVX512,
-                            Target::AVX512_Skylake, Target::AVX512_Cannonlake,
-                            Target::AVX512_Zen4, Target::AVX512_Zen5});
+            t.set_feature(Target::SSE41);
+            if (os_avx) {
+                t.set_features({Target::AVX, Target::F16C, Target::FMA,
+                                Target::AVX2, Target::AVXVNNI});
+            }
+            if (os_avx512) {
+                t.set_features({Target::AVX512, Target::AVX512_Skylake,
+                                Target::AVX512_Cannonlake,
+                                Target::AVX512_Zen4, Target::AVX512_Zen5});
+            }
             return t;
         }
     }
@@ -397,14 +446,14 @@ Target calculate_host_target() {
     if (have_fma) {
         initial_features.push_back(Target::FMA);
     }
+    if (have_fma4) {
+        initial_features.push_back(Target::FMA4);
+    }
 
     if (use_64_bits && have_avx && have_f16c && have_rdrand) {
         // So far, so good.  AVX2/512?
-        // Call cpuid with eax=7, ecx=0
-        int info2[4];
-        cpuid(info2, 7, 0);
-        int info3[4];
-        cpuid(info3, 7, 1);
+        const auto info2 = cpuid(7, 0);
+        const auto info3 = cpuid(7, 1);
         const uint32_t avx2 = 1U << 5;
         const uint32_t avx512f = 1U << 16;
         const uint32_t avx512dq = 1U << 17;
@@ -418,29 +467,29 @@ Target calculate_host_target() {
         const uint32_t avx512_knl = avx512 | avx512pf | avx512er;
         const uint32_t avx512_skylake = avx512 | avx512vl | avx512bw | avx512dq;
         const uint32_t avx512_cannonlake = avx512_skylake | avx512ifma;  // Assume ifma => vbmi
-        if ((info2[1] & avx2) == avx2) {
+        if ((info2.ebx & avx2) == avx2) {
             initial_features.push_back(Target::AVX2);
         }
-        if ((info2[1] & avx512) == avx512) {
+        if (os_avx512 && (info2.ebx & avx512) == avx512) {
             initial_features.push_back(Target::AVX512);
             // TODO: port to family/model -based detection.
-            if ((info2[1] & avx512_knl) == avx512_knl) {
+            if ((info2.ebx & avx512_knl) == avx512_knl) {
                 initial_features.push_back(Target::AVX512_KNL);
             }
             // TODO: port to family/model -based detection.
-            if ((info2[1] & avx512_skylake) == avx512_skylake) {
+            if ((info2.ebx & avx512_skylake) == avx512_skylake) {
                 initial_features.push_back(Target::AVX512_Skylake);
             }
             // TODO: port to family/model -based detection.
-            if ((info2[1] & avx512_cannonlake) == avx512_cannonlake) {
+            if ((info2.ebx & avx512_cannonlake) == avx512_cannonlake) {
                 initial_features.push_back(Target::AVX512_Cannonlake);
 
                 const uint32_t avxvnni = 1U << 4;     // avxvnni (note, not avx512vnni) result in eax
                 const uint32_t avx512bf16 = 1U << 5;  // bf16 result in eax, with cpuid(eax=7, ecx=1)
                 // TODO: port to family/model -based detection.
-                if ((info3[0] & avxvnni) == avxvnni) {
+                if ((info3.eax & avxvnni) == avxvnni) {
                     initial_features.push_back(Target::AVXVNNI);
-                    if ((info3[0] & avx512bf16) == avx512bf16) {
+                    if ((info3.eax & avx512bf16) == avx512bf16) {
                         initial_features.push_back(Target::AVX512_SapphireRapids);
                     }
                 }
@@ -449,33 +498,32 @@ Target calculate_host_target() {
 
         // AVX10 converged vector instructions.
         const uint32_t avx10 = 1U << 19;
-        if (info2[3] & avx10) {
-            int info_avx10[4];
-            cpuid(info_avx10, 0x24, 0x0);
+        if (os_avx512 && (info2.edx & avx10)) {
+            const auto info_avx10 = cpuid(0x24, 0x0);
 
             // This checks that the AVX10 version is greater than zero.
             // It isn't really needed as for now only one version exists, but
             // the docs indicate bits 0:7 of EBX should be >= 0 so...
-            if ((info[1] & 0xff) >= 1) {
+            if ((info_avx10.ebx & 0xff) >= 1) {
                 initial_features.push_back(Target::AVX10_1);
 
                 const uint32_t avx10_128 = 1U << 16;
                 const uint32_t avx10_256 = 1U << 17;
                 const uint32_t avx10_512 = 1U << 18;
                 // Choose the maximum one that is available.
-                if (info[1] & avx10_512) {
+                if (info_avx10.ebx & avx10_512) {
                     vector_bits = 512;
-                } else if (info[1] & avx10_256) {
+                } else if (info_avx10.ebx & avx10_256) {
                     vector_bits = 256;
-                } else if (info[1] & avx10_128) {  // Not clear it is worth turning on AVX10 for this case.
+                } else if (info_avx10.ebx & avx10_128) {  // Not clear it is worth turning on AVX10 for this case.
                     vector_bits = 128;
                 }
             }
         }
 
         // APX register extensions, etc.
         const uint32_t apx = 1U << 21;
-        if (info3[3] & apx) {
+        if (os_apx && (info3.edx & apx)) {
             initial_features.push_back(Target::X86APX);
         }
     }

diff --git a/src/runtime/aarch64_cpu_features.cpp b/src/runtime/aarch64_cpu_features.cpp
@@ -79,8 +79,9 @@ extern "C" BOOL IsProcessorFeaturePresent(DWORD feature);
 #define PF_ARM_FMAC_INSTRUCTIONS_AVAILABLE (27)
 #define PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE (43)
 
-// Magic value from: https://github.com/dotnet/runtime/blob/7e977dcbe5efaeec2c75ed0c3e200c85b2e55522/src/native/minipal/cpufeatures.c#L19
+// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent
 #define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE (46)
+#define PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE (47)
 
 namespace {
 
@@ -99,6 +100,10 @@ void set_platform_features(CpuFeatures *features) {
     if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
         halide_set_available_cpu_feature(features, halide_target_feature_sve);
     }
+
+    if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) {
+        halide_set_available_cpu_feature(features, halide_target_feature_sve2);
+    }
 }
 
 }  // namespace

diff --git a/src/runtime/x86.ll b/src/runtime/x86.ll
@@ -146,3 +146,11 @@ define weak_odr void @x64_cpuid_halide(i32* %info) nounwind uwtable {
   call void asm sideeffect inteldialect "xchg rbx, rsi\0A\09mov eax, dword ptr $$0 $0\0A\09mov ecx, dword ptr $$4 $0\0A\09cpuid\0A\09mov dword ptr $$0 $0, eax\0A\09mov dword ptr $$4 $0, ebx\0A\09mov dword ptr $$8 $0, ecx\0A\09mov dword ptr $$12 $0, edx\0A\09xchg rbx, rsi", "=*m,~{eax},~{ebx},~{ecx},~{edx},~{esi},~{dirflag},~{fpsr},~{flags}"(i32* elementtype(i32) %info)
   ret void
 }
+
+; xgetbv: on entry info[0] holds the XCR index (loaded into ECX); on return
+; info[0]=EAX and info[1]=EDX, so info must point to at least two i32s.
+; xgetbv does not clobber ebx/rbx (unlike cpuid), so one definition serves 32- and 64-bit.
+define weak_odr void @xgetbv_halide(i32* %info) nounwind uwtable {
+  call void asm sideeffect inteldialect "mov ecx, dword ptr $$0 $0\0A\09xgetbv\0A\09mov dword ptr $$0 $0, eax\0A\09mov dword ptr $$4 $0, edx", "=*m,~{eax},~{ecx},~{edx},~{dirflag},~{fpsr},~{flags}"(i32* elementtype(i32) %info)
+  ret void
+}