Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 102 additions & 54 deletions src/Target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,22 +65,40 @@ __attribute__((target("+sve"))) int get_sve_vector_length() {
}
#endif

// Plain aggregate holding the four 32-bit register values produced by one
// CPUID query. Member order matters: the cpuid() helpers brace-initialize
// this struct positionally as {eax, ebx, ecx, edx}.
struct cpuid_result {
    int eax;
    int ebx;
    int ecx;
    int edx;
};

#if defined(_M_IX86) || defined(_M_AMD64)

// Runs CPUID through the __cpuidex intrinsic, passing `infoType` and `extra`
// as its query arguments, and returns the four values the intrinsic wrote
// (info[0..3]) as the eax/ebx/ecx/edx fields of a cpuid_result, in that order.
// `extra` defaults to 0 in the [[nodiscard]] signature.
// NOTE(review): the `void cpuid(int info[4], ...)` line directly below looks
// like the pre-change signature retained by the diff view; the [[nodiscard]]
// signature after it is presumably the current one -- confirm against the
// real file.
void cpuid(int info[4], int infoType, int extra) {
[[nodiscard]] cpuid_result cpuid(int infoType, int extra = 0) {
int info[4];
__cpuidex(info, infoType, extra);
return {info[0], info[1], info[2], info[3]};
}

// Returns the 64-bit value of extended control register `xcr`, read through
// the compiler's _xgetbv intrinsic.
[[nodiscard]] uint64_t xgetbv(uint32_t xcr) {
    const uint64_t xcr_value = _xgetbv(xcr);
    return xcr_value;
}

#elif defined(__x86_64__) || defined(__i386__)

// CPU feature detection code taken from ispc
// (https://github.com/ispc/ispc/blob/master/builtins/dispatch.ll)

// Executes the CPUID instruction via inline assembly. `infoType` is tied to
// the EAX operand and `extra` to the ECX operand on input; the EAX, EBX, ECX
// and EDX outputs are returned as a cpuid_result. `extra` defaults to 0 in
// the [[nodiscard]] signature.
// NOTE(review): the `void cpuid(int info[4], ...)` signature and the
// `info[0..3]` output-operand line below look like pre-change lines retained
// by the diff view; the cpuid_result-based lines that follow each of them are
// presumably the current code -- confirm against the real file.
void cpuid(int info[4], int infoType, int extra) {
[[nodiscard]] cpuid_result cpuid(int infoType, int extra = 0) {
cpuid_result result;
__asm__ __volatile__(
"cpuid \n\t"
// Outputs: "=a"/"=b"/"=c"/"=d" bind to EAX/EBX/ECX/EDX respectively.
: "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
: "=a"(result.eax), "=b"(result.ebx), "=c"(result.ecx), "=d"(result.edx)
// Inputs: "0" reuses operand 0 (EAX) for infoType, "2" reuses operand 2 (ECX) for extra.
: "0"(infoType), "2"(extra));
return result;
}

// Reads extended control register `xcr` with the XGETBV instruction.
// The register index is passed in ECX; the instruction delivers the value
// split across EDX (upper 32 bits) and EAX (lower 32 bits), which are
// recombined here into a single 64-bit result.
[[nodiscard]] uint64_t xgetbv(uint32_t xcr) {
    uint32_t low_half;
    uint32_t high_half;
    __asm__ __volatile__("xgetbv" : "=a"(low_half), "=d"(high_half) : "c"(xcr));
    const uint64_t upper = static_cast<uint64_t>(high_half) << 32;
    return upper | low_half;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My .ll comment is to do basically this


#endif
Expand All @@ -94,20 +112,19 @@ enum class VendorSignatures {
};

VendorSignatures get_vendor_signature() {
int info[4];
cpuid(info, 0, 0);
const auto info = cpuid(0);

if (info[0] < 1) {
if (info.eax < 1) {
return VendorSignatures::Unknown;
}

// "Genu ineI ntel"
if (info[1] == 0x756e6547 && info[3] == 0x49656e69 && info[2] == 0x6c65746e) {
if (info.ebx == 0x756e6547 && info.edx == 0x49656e69 && info.ecx == 0x6c65746e) {
return VendorSignatures::GenuineIntel;
}

// "Auth enti cAMD"
if (info[1] == 0x68747541 && info[3] == 0x69746e65 && info[2] == 0x444d4163) {
if (info.ebx == 0x68747541 && info.edx == 0x69746e65 && info.ecx == 0x444d4163) {
return VendorSignatures::AuthenticAMD;
}

Expand Down Expand Up @@ -282,8 +299,9 @@ Target calculate_host_target() {

#ifdef _MSC_VER

// Magic value from: https://github.com/dotnet/runtime/blob/7e977dcbe5efaeec2c75ed0c3e200c85b2e55522/src/native/minipal/cpufeatures.c#L19
// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent
#define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE (46)
#define PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE (47)

// This is the strategy used by Google's cpuinfo library for
// detecting fp16 arithmetic support on Windows.
Expand All @@ -302,6 +320,11 @@ Target calculate_host_target() {
// has_scalable_vector = true;
// }

if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) {
initial_features.push_back(Target::SVE2);
has_scalable_vector = true;
}

#endif

#if defined(__aarch64__)
Expand Down Expand Up @@ -336,47 +359,73 @@ Target calculate_host_target() {

VendorSignatures vendor_signature = get_vendor_signature();

int info[4];
cpuid(info, 1, 0);
const auto info = cpuid(1);

unsigned family = 0, model = 0;
detect_family_and_model(info[0], family, model);

bool have_sse41 = (info[2] & (1 << 19)) != 0; // ECX[19]
bool have_sse2 = (info[3] & (1 << 26)) != 0; // EDX[26]
bool have_sse3 = (info[2] & (1 << 0)) != 0; // ECX[0]
bool have_avx = (info[2] & (1 << 28)) != 0; // ECX[28]
bool have_f16c = (info[2] & (1 << 29)) != 0; // ECX[29]
bool have_rdrand = (info[2] & (1 << 30)) != 0; // ECX[30]
bool have_fma = (info[2] & (1 << 12)) != 0; // ECX[12]
detect_family_and_model(info.eax, family, model);

// Check OS support for saving AVX/AVX-512/APX state via XSAVE.
// Even if the CPU supports these features, the OS must enable the
// corresponding state components in XCR0; otherwise using them will fault.
bool have_osxsave = (info.ecx & (1 << 27)) != 0; // ECX[27]
bool os_avx = false;
bool os_avx512 = false;
bool os_apx = false;
if (have_osxsave) {
uint64_t xcr0 = xgetbv(0);
os_avx = (xcr0 & 0x6) == 0x6; // XMM (bit 1) + YMM (bit 2)
os_avx512 = os_avx && ((xcr0 & 0xE0) == 0xE0); // opmask (5) + ZMM_Hi256 (6) + Hi16_ZMM (7)
os_apx = (xcr0 & 0x80000) == 0x80000; // APX extended GPRs (bit 19)
}

bool have_sse41 = (info.ecx & (1 << 19)) != 0; // ECX[19]
bool have_sse2 = (info.edx & (1 << 26)) != 0; // EDX[26]
bool have_sse3 = (info.ecx & (1 << 0)) != 0; // ECX[0]
bool have_avx = (info.ecx & (1 << 28)) != 0 && os_avx; // ECX[28], requires OS AVX support
bool have_f16c = (info.ecx & (1 << 29)) != 0 && os_avx; // ECX[29], VEX-encoded
bool have_rdrand = (info.ecx & (1 << 30)) != 0; // ECX[30]
bool have_fma = (info.ecx & (1 << 12)) != 0 && os_avx; // ECX[12], VEX-encoded

// FMA4 is in CPUID extended leaf 0x80000001, ECX bit 16.
// It uses VEX-encoded YMM instructions, so requires OS AVX support.
const auto info_ext = cpuid(0x80000001);
bool have_fma4 = (info_ext.ecx & (1 << 16)) != 0 && os_avx; // ECX[16], VEX-encoded

user_assert(have_sse2)
<< "The x86 backend assumes at least sse2 support. This machine does not appear to have sse2.\n"
<< "cpuid returned: "
<< std::hex << info[0]
<< ", " << info[1]
<< ", " << info[2]
<< ", " << info[3]
<< std::hex << info.eax
<< ", " << info.ebx
<< ", " << info.ecx
<< ", " << info.edx
<< std::dec << "\n";

if (vendor_signature == VendorSignatures::AuthenticAMD) {
processor = get_amd_processor(family, model, have_sse3);

if (processor == Target::Processor::ZnVer4) {
Target t{os, arch, bits, processor, initial_features, vector_bits};
t.set_features({Target::SSE41, Target::AVX,
Target::F16C, Target::FMA,
Target::AVX2, Target::AVX512,
Target::AVX512_Skylake, Target::AVX512_Cannonlake,
Target::AVX512_Zen4});
t.set_feature(Target::SSE41);
if (os_avx) {
t.set_features({Target::AVX, Target::F16C, Target::FMA, Target::AVX2});
}
if (os_avx512) {
t.set_features({Target::AVX512, Target::AVX512_Skylake,
Target::AVX512_Cannonlake, Target::AVX512_Zen4});
}
return t;
} else if (processor == Target::Processor::ZnVer5) {
Target t{os, arch, bits, processor, initial_features, vector_bits};
t.set_features({Target::SSE41, Target::AVX,
Target::F16C, Target::FMA,
Target::AVX2, Target::AVXVNNI, Target::AVX512,
Target::AVX512_Skylake, Target::AVX512_Cannonlake,
Target::AVX512_Zen4, Target::AVX512_Zen5});
t.set_feature(Target::SSE41);
if (os_avx) {
t.set_features({Target::AVX, Target::F16C, Target::FMA,
Target::AVX2, Target::AVXVNNI});
}
if (os_avx512) {
t.set_features({Target::AVX512, Target::AVX512_Skylake,
Target::AVX512_Cannonlake,
Target::AVX512_Zen4, Target::AVX512_Zen5});
}
return t;
}
}
Expand All @@ -397,14 +446,14 @@ Target calculate_host_target() {
if (have_fma) {
initial_features.push_back(Target::FMA);
}
if (have_fma4) {
initial_features.push_back(Target::FMA4);
}

if (use_64_bits && have_avx && have_f16c && have_rdrand) {
// So far, so good. AVX2/512?
// Call cpuid with eax=7, ecx=0
int info2[4];
cpuid(info2, 7, 0);
int info3[4];
cpuid(info3, 7, 1);
const auto info2 = cpuid(7, 0);
const auto info3 = cpuid(7, 1);
const uint32_t avx2 = 1U << 5;
const uint32_t avx512f = 1U << 16;
const uint32_t avx512dq = 1U << 17;
Expand All @@ -418,29 +467,29 @@ Target calculate_host_target() {
const uint32_t avx512_knl = avx512 | avx512pf | avx512er;
const uint32_t avx512_skylake = avx512 | avx512vl | avx512bw | avx512dq;
const uint32_t avx512_cannonlake = avx512_skylake | avx512ifma; // Assume ifma => vbmi
if ((info2[1] & avx2) == avx2) {
if ((info2.ebx & avx2) == avx2) {
initial_features.push_back(Target::AVX2);
}
if ((info2[1] & avx512) == avx512) {
if (os_avx512 && (info2.ebx & avx512) == avx512) {
initial_features.push_back(Target::AVX512);
// TODO: port to family/model -based detection.
if ((info2[1] & avx512_knl) == avx512_knl) {
if ((info2.ebx & avx512_knl) == avx512_knl) {
initial_features.push_back(Target::AVX512_KNL);
}
// TODO: port to family/model -based detection.
if ((info2[1] & avx512_skylake) == avx512_skylake) {
if ((info2.ebx & avx512_skylake) == avx512_skylake) {
initial_features.push_back(Target::AVX512_Skylake);
}
// TODO: port to family/model -based detection.
if ((info2[1] & avx512_cannonlake) == avx512_cannonlake) {
if ((info2.ebx & avx512_cannonlake) == avx512_cannonlake) {
initial_features.push_back(Target::AVX512_Cannonlake);

const uint32_t avxvnni = 1U << 4; // avxvnni (note, not avx512vnni) result in eax
const uint32_t avx512bf16 = 1U << 5; // bf16 result in eax, with cpuid(eax=7, ecx=1)
// TODO: port to family/model -based detection.
if ((info3[0] & avxvnni) == avxvnni) {
if ((info3.eax & avxvnni) == avxvnni) {
initial_features.push_back(Target::AVXVNNI);
if ((info3[0] & avx512bf16) == avx512bf16) {
if ((info3.eax & avx512bf16) == avx512bf16) {
initial_features.push_back(Target::AVX512_SapphireRapids);
}
}
Expand All @@ -449,33 +498,32 @@ Target calculate_host_target() {

// AVX10 converged vector instructions.
const uint32_t avx10 = 1U << 19;
if (info2[3] & avx10) {
int info_avx10[4];
cpuid(info_avx10, 0x24, 0x0);
if (os_avx512 && (info2.edx & avx10)) {
const auto info_avx10 = cpuid(0x24, 0x0);

// This checks that the AVX10 version is greater than zero.
// It isn't strictly needed, since only one version exists for now, but
// the docs indicate bits 0:7 of EBX should be >= 1, so check anyway.
if ((info[1] & 0xff) >= 1) {
if ((info_avx10.ebx & 0xff) >= 1) {
initial_features.push_back(Target::AVX10_1);

const uint32_t avx10_128 = 1U << 16;
const uint32_t avx10_256 = 1U << 17;
const uint32_t avx10_512 = 1U << 18;
// Choose the maximum one that is available.
if (info[1] & avx10_512) {
if (info_avx10.ebx & avx10_512) {
vector_bits = 512;
} else if (info[1] & avx10_256) {
} else if (info_avx10.ebx & avx10_256) {
vector_bits = 256;
} else if (info[1] & avx10_128) { // Not clear it is worth turning on AVX10 for this case.
} else if (info_avx10.ebx & avx10_128) { // Not clear it is worth turning on AVX10 for this case.
vector_bits = 128;
}
}
}

// APX register extensions, etc.
const uint32_t apx = 1U << 21;
if (info3[3] & apx) {
if (os_apx && (info3.edx & apx)) {
initial_features.push_back(Target::X86APX);
}
}
Expand Down
7 changes: 6 additions & 1 deletion src/runtime/aarch64_cpu_features.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,9 @@ extern "C" BOOL IsProcessorFeaturePresent(DWORD feature);
#define PF_ARM_FMAC_INSTRUCTIONS_AVAILABLE (27)
#define PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE (43)

// Magic value from: https://github.com/dotnet/runtime/blob/7e977dcbe5efaeec2c75ed0c3e200c85b2e55522/src/native/minipal/cpufeatures.c#L19
// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent
#define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE (46)
#define PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE (47)

namespace {

Expand All @@ -99,6 +100,10 @@ void set_platform_features(CpuFeatures *features) {
if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
halide_set_available_cpu_feature(features, halide_target_feature_sve);
}

if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) {
halide_set_available_cpu_feature(features, halide_target_feature_sve2);
}
}

} // namespace
Expand Down
8 changes: 8 additions & 0 deletions src/runtime/x86.ll
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,11 @@ define weak_odr void @x64_cpuid_halide(i32* %info) nounwind uwtable {
call void asm sideeffect inteldialect "xchg rbx, rsi\0A\09mov eax, dword ptr $$0 $0\0A\09mov ecx, dword ptr $$4 $0\0A\09cpuid\0A\09mov dword ptr $$0 $0, eax\0A\09mov dword ptr $$4 $0, ebx\0A\09mov dword ptr $$8 $0, ecx\0A\09mov dword ptr $$12 $0, edx\0A\09xchg rbx, rsi", "=*m,~{eax},~{ebx},~{ecx},~{edx},~{esi},~{dirflag},~{fpsr},~{flags}"(i32* elementtype(i32) %info)
ret void
}

; xgetbv: on entry info[0] holds the XCR index, which is loaded into ECX;
; on return info[0] = EAX and info[1] = EDX.
; Unlike cpuid, xgetbv does not clobber ebx/rbx, so one definition
; works for both 32-bit and 64-bit.
define weak_odr void @xgetbv_halide(i32* %info) nounwind uwtable {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I realize this is a "me" thing, but I think we should limit the inline ASM to just the minimal required. I know this works across both 32- and 64-bit, but it requires quite a bit of understanding (e.g. that the struct has been prefilled with the XCR register ID, that loading into ECX is ok because the top bits would be zeros anyway, etc).

In other words, can this be written so that just the xgetbv is inline ASM (can pass in the index via a c constraint to get it in the right register, with the other clobbers being the same)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm guilty of cargo-culting the cpuid function above it. I figured, "if cpuid couldn't be written using inline asm, I'm sure I'll hit the same issue here"

call void asm sideeffect inteldialect "mov ecx, dword ptr $$0 $0\0A\09xgetbv\0A\09mov dword ptr $$0 $0, eax\0A\09mov dword ptr $$4 $0, edx", "=*m,~{eax},~{ecx},~{edx},~{dirflag},~{fpsr},~{flags}"(i32* elementtype(i32) %info)
ret void
}
Loading
Loading