From 1af98181b7245110be0075f9850531c0814562ad Mon Sep 17 00:00:00 2001 From: Hans Johnson Date: Sun, 12 Apr 2026 06:20:53 -0500 Subject: [PATCH 1/4] ENH: Add ITK_X86_64_ISA_LEVEL CMake cache variable Define a CMake cache variable with dropdown values for selecting the x86-64 instruction set architecture level: default, x86-64, x86-64-v2, x86-64-v3, x86-64-v4, and native. The helper function itk_isa_level_arch_flag() resolves the selection to a concrete -march= flag (GCC/Clang) or /arch: flag (MSVC). This commit adds the infrastructure only; the next commit integrates it into ITK's compiler flag logic, replacing the old -march=corei7. See https://github.com/InsightSoftwareConsortium/ITK/issues/2634 Co-Authored-By: Claude Opus 4.6 (1M context) --- CMake/ITKSetStandardCompilerFlags.cmake | 101 ++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/CMake/ITKSetStandardCompilerFlags.cmake b/CMake/ITKSetStandardCompilerFlags.cmake index 96fd6235141..4e5b4515118 100644 --- a/CMake/ITKSetStandardCompilerFlags.cmake +++ b/CMake/ITKSetStandardCompilerFlags.cmake @@ -22,6 +22,107 @@ include(CheckCCompilerFlag) include(CheckPIESupported) check_pie_supported() +# =========================================================================== +# ITK_X86_64_ISA_LEVEL +# --------------------------------------------------------------------------- +# Selects the x86-64 instruction set architecture level for ITK compiler +# optimization flags. This replaces the historical hard-coded +# `-march=corei7` (Nehalem, 2008) instruction-set baseline. See issue +# https://github.com/InsightSoftwareConsortium/ITK/issues/2634 . 
+# +# The levels are the standard x86-64 micro-architecture levels published by +# AMD/Intel/Red Hat/SUSE in 2020 and supported by GCC >= 11, Clang >= 12: +# +# Level | -march= flag | Key ISA additions | ~Year +# ---------|-------------|--------------------------------------|------ +# default | (none) | Compiler toolchain default | — +# x86-64 | x86-64 | SSE, SSE2 (AMD64 baseline) | 2003 +# x86-64-v2| x86-64-v2 | + SSE3, SSSE3, SSE4.1/4.2, POPCNT | 2009 +# x86-64-v3| x86-64-v3 | + AVX, AVX2, BMI1/2, FMA | 2013 +# x86-64-v4| x86-64-v4 | + AVX-512F/BW/CD/DQ/VL | 2017 +# native | native | Host CPU's full ISA (not redistributable) | — +# +# Performance note: x86-64-v4 (AVX-512) may trigger CPU frequency throttling +# on Intel Sapphire Rapids and earlier, causing net regressions on +# memory-latency-bound kernels such as BSpline transform evaluation. +# Consider `-mprefer-vector-width=256` if using v4. +# +# On non-x86 platforms this variable is ignored; the toolchain default is +# used unless ITK_C_OPTIMIZATION_FLAGS / ITK_CXX_OPTIMIZATION_FLAGS are +# set explicitly. +# +# Cross-platform override: users can always set +# -DITK_C_OPTIMIZATION_FLAGS="..." / -DITK_CXX_OPTIMIZATION_FLAGS="..." on +# the cmake command line; that escape hatch bypasses this variable entirely. 
+# =========================================================================== +set( + ITK_X86_64_ISA_LEVEL + "x86-64-v2" + CACHE STRING + "x86-64 instruction set architecture level for compiler optimization.\ + default = no flags (compiler toolchain default),\ + x86-64 = SSE/SSE2 baseline (~2003),\ + x86-64-v2 = + SSE4.2/POPCNT (~2009) [DEFAULT],\ + x86-64-v3 = + AVX2/FMA (~2013),\ + x86-64-v4 = + AVX-512 (~2017, may throttle),\ + native = host CPU (not redistributable)" +) +set_property( + CACHE + ITK_X86_64_ISA_LEVEL + PROPERTY + STRINGS + "default" + "x86-64" + "x86-64-v2" + "x86-64-v3" + "x86-64-v4" + "native" +) + +if( + NOT + ITK_X86_64_ISA_LEVEL + MATCHES + "^(default|x86-64|x86-64-v2|x86-64-v3|x86-64-v4|native)$" +) + message( + FATAL_ERROR + "ITK_X86_64_ISA_LEVEL must be one of: default, x86-64, x86-64-v2, x86-64-v3, x86-64-v4, native " + "(got '${ITK_X86_64_ISA_LEVEL}')" + ) +endif() + +# Resolve ITK_X86_64_ISA_LEVEL to a concrete -march= / /arch: flag. +# Returns empty string for "default" or when the toolchain default is appropriate. +function(itk_isa_level_arch_flag _out_var) + set(_arch_flag "") + if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "x86_64|AMD64") + if(ITK_X86_64_ISA_LEVEL STREQUAL "default") + # No flag — use compiler toolchain default + elseif(MSVC) + # MSVC x64 implies SSE2; /arch: is only meaningful at AVX or above. + if(ITK_X86_64_ISA_LEVEL STREQUAL "x86-64-v3") + set(_arch_flag "/arch:AVX2") + elseif(ITK_X86_64_ISA_LEVEL STREQUAL "x86-64-v4") + set(_arch_flag "/arch:AVX512") + elseif(ITK_X86_64_ISA_LEVEL STREQUAL "native") + # MSVC has no -march=native equivalent; AVX2 is a reasonable proxy + set(_arch_flag "/arch:AVX2") + endif() + # x86-64 and x86-64-v2: leave default (SSE2 baseline) + else() + if(ITK_X86_64_ISA_LEVEL STREQUAL "native") + set(_arch_flag "-march=native") + else() + set(_arch_flag "-march=${ITK_X86_64_ISA_LEVEL}") + endif() + endif() + endif() + # Non-x86 platforms: ITK_X86_64_ISA_LEVEL is ignored; leave empty. 
+ set(${_out_var} "${_arch_flag}" PARENT_SCOPE) +endfunction() + function(check_c_compiler_flags c_flag_var) set(local_c_flags "") set(flag_list "${ARGN}") From 1fec712f1b4dbf67e7334a4287be2ac8b2c78850 Mon Sep 17 00:00:00 2001 From: Hans Johnson Date: Sun, 12 Apr 2026 06:21:38 -0500 Subject: [PATCH 2/4] COMP: Replace hardcoded -march=corei7 with ITK_X86_64_ISA_LEVEL Integrate the ITK_X86_64_ISA_LEVEL cache variable (added in the prior commit) into check_compiler_optimization_flags(), replacing the historical hard-coded -march=corei7 (Nehalem, 2008). The default level is x86-64-v2 (SSE4.2, POPCNT), matching the previous corei7 baseline with portable, vendor-neutral level names. When set to "default", no -march or -mtune flags are emitted, leaving the compiler's built-in defaults in effect. When set to "native", both -march=native and -mtune=native are used for maximum performance on the build host (not redistributable). See https://github.com/InsightSoftwareConsortium/ITK/issues/2634 Co-Authored-By: Claude Opus 4.6 (1M context) --- CMake/ITKSetStandardCompilerFlags.cmake | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/CMake/ITKSetStandardCompilerFlags.cmake b/CMake/ITKSetStandardCompilerFlags.cmake index 4e5b4515118..d0f079feec3 100644 --- a/CMake/ITKSetStandardCompilerFlags.cmake +++ b/CMake/ITKSetStandardCompilerFlags.cmake @@ -380,6 +380,8 @@ function( set(${c_optimization_flags_var} "" PARENT_SCOPE) set(${cxx_optimization_flags_var} "" PARENT_SCOPE) + itk_isa_level_arch_flag(_itk_arch_flag) + if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(x86_64|AMD64)") if(MSVC) check_avx_flags(InstructionSetOptimizationFlags) @@ -391,6 +393,9 @@ function( /arch:SSE2 ) endif() + if(_itk_arch_flag) + list(APPEND InstructionSetOptimizationFlags ${_itk_arch_flag}) + endif() elseif(NOT EMSCRIPTEN OR WASI) if(${CMAKE_C_COMPILER} MATCHES "icc.*$") set(USING_INTEL_ICC_COMPILER TRUE) @@ -410,15 +415,16 @@ function( # Check this list on C++ compiler 
only set(cxx_flags "") - # Check this list on both C and C++ compilers - set( - InstructionSetOptimizationFlags - # https://gcc.gnu.org/onlinedocs/gcc-4.8.0/gcc/i386-and-x86_002d64-Options.html - # NOTE the corei7 release date was 2008 - #-mtune=native # Tune the code for the computer used compile ITK, but allow running on generic cpu archetectures - -mtune=generic # for reproducible results https://github.com/InsightSoftwareConsortium/ITK/issues/1939 - -march=corei7 # Use ABI settings to support corei7 (circa 2008 ABI feature sets, core-avx circa 2013) - ) + if(ITK_X86_64_ISA_LEVEL STREQUAL "default") + set(InstructionSetOptimizationFlags "") + elseif(ITK_X86_64_ISA_LEVEL STREQUAL "native") + set(InstructionSetOptimizationFlags -mtune=native) + else() + set(InstructionSetOptimizationFlags -mtune=generic) + endif() + if(_itk_arch_flag) + list(APPEND InstructionSetOptimizationFlags ${_itk_arch_flag}) + endif() endif() set(c_and_cxx_flags ${InstructionSetOptimizationFlags}) endif() From fc9cf8664d427bf92d4e98eff23de97191413de6 Mon Sep 17 00:00:00 2001 From: Hans Johnson Date: Sun, 12 Apr 2026 07:11:27 -0500 Subject: [PATCH 3/4] PERF: Add -mprefer-vector-width=256 for x86-64-v4 to avoid AVX-512 throttle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC with -march=x86-64-v4 auto-vectorises every function with 512-bit zmm registers (53 063 zmm instructions across 4 977 functions in the ResampleBenchmark binary). On Intel Sapphire Rapids (and earlier) any zmm instruction triggers a licence-based frequency downshift that persists ~670 µs. With 4 977 affected functions the CPU never exits the throttled P-state, causing 12–17 % wall-clock regressions on BSpline-dominated workloads compared to the x86-64-v2 default. -mprefer-vector-width=256 tells GCC to prefer 256-bit (ymm) vectors while still using the AVX-512 instruction encoding (EVEX prefix, 32 vector registers, mask registers, new ALU operations). 
This gives access to AVX-512 features without triggering the zmm frequency penalty. Benchmark evidence (Xeon w7-3545, NSLOTS=1, n=1): Binary zmm count: v4-bare: 53 063 v4-pw256: 4 208 (−92 %) Speedup vs v4-bare (pw256 faster = >1.0): DemonsRegistration: 1.21× (regression recovered) Resample (60 var): 1.10× (regression recovered) BinaryAdd: 1.05× UnaryAdd: 1.09× GradMag1Thread: 1.02× (preserved) Compared to the v2 default (-march=x86-64-v2): v4-bare regressed Resample by 13 % and Demons by 16 %. v4-pw256 brings both within 4 % of v2 while retaining AVX-512 encoding benefits on scalar benchmarks. Co-Authored-By: Claude Opus 4.6 (1M context) --- CMake/ITKSetStandardCompilerFlags.cmake | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/CMake/ITKSetStandardCompilerFlags.cmake b/CMake/ITKSetStandardCompilerFlags.cmake index d0f079feec3..27bb3ebb7ff 100644 --- a/CMake/ITKSetStandardCompilerFlags.cmake +++ b/CMake/ITKSetStandardCompilerFlags.cmake @@ -45,7 +45,8 @@ check_pie_supported() # Performance note: x86-64-v4 (AVX-512) may trigger CPU frequency throttling # on Intel Sapphire Rapids and earlier, causing net regressions on # memory-latency-bound kernels such as BSpline transform evaluation. -# Consider `-mprefer-vector-width=256` if using v4. +# The x86-64-v4 level automatically includes `-mprefer-vector-width=256` +# to avoid this; see the PERF commit in this series for benchmark data. # # On non-x86 platforms this variable is ignored; the toolchain default is # used unless ITK_C_OPTIMIZATION_FLAGS / ITK_CXX_OPTIMIZATION_FLAGS are @@ -114,6 +115,23 @@ function(itk_isa_level_arch_flag _out_var) else() if(ITK_X86_64_ISA_LEVEL STREQUAL "native") set(_arch_flag "-march=native") + elseif(ITK_X86_64_ISA_LEVEL STREQUAL "x86-64-v4") + # Use AVX-512 instruction encoding (EVEX prefix, 32 registers, mask + # registers) but prefer 256-bit vector width. 
Without this flag GCC + # auto-vectorises with 512-bit zmm registers, which triggers a CPU + # frequency downshift ("AVX-512 turbo penalty") on Intel Sapphire + # Rapids and earlier. Benchmarks show 53 000 zmm instructions across + # 4 977 functions in a bare -march=x86-64-v4 build, causing 12–17 % + # regressions on BSpline-dominated Resample benchmarks. With + # -mprefer-vector-width=256 the zmm count drops 92 % and the + # regressions are recovered. + # + # The two flags are stored as a semicolon-separated CMake list so that + # check_c_compiler_flag() tests each flag independently. GCC and Clang + # both document -mprefer-vector-width; should some other compiler + # reject it, -march=x86-64-v4 is still retained and only the + # unsupported vector-width hint is gracefully dropped. + set(_arch_flag "-march=x86-64-v4;-mprefer-vector-width=256") else() set(_arch_flag "-march=${ITK_X86_64_ISA_LEVEL}") endif() From bccc55c3dcf4ebd852473fe951f393f04a8ac96c Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Sun, 12 Apr 2026 08:03:00 -0500 Subject: [PATCH 4/4] COMP: default ISA level to x86-64 baseline Pip wheels, Docker images, and hardware translation layers (Rosetta) only guarantee x86-64 baseline. Users building for local use may want to consider x86-64-v2, which is the minimum level targeted by current Linux distributions (Fedora, RHEL 9, SUSE). --- CMake/ITKSetStandardCompilerFlags.cmake | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/CMake/ITKSetStandardCompilerFlags.cmake b/CMake/ITKSetStandardCompilerFlags.cmake index 27bb3ebb7ff..21fd4f53575 100644 --- a/CMake/ITKSetStandardCompilerFlags.cmake +++ b/CMake/ITKSetStandardCompilerFlags.cmake @@ -56,14 +56,20 @@ check_pie_supported() # -DITK_C_OPTIMIZATION_FLAGS="..." / -DITK_CXX_OPTIMIZATION_FLAGS="..." on # the cmake command line; that escape hatch bypasses this variable entirely. 
# =========================================================================== +# Default to "default" (x86-64 baseline) — the safest redistributable +# choice. Pip wheels, Docker images, and hardware translation layers +# (e.g., Rosetta) only guarantee the x86-64 baseline ISA. Users +# building for local use may want to consider x86-64-v2, which is the +# minimum level targeted by current Linux distributions (Fedora, RHEL 9, +# SUSE) and provides a consistent, well-tested ISA baseline. set( ITK_X86_64_ISA_LEVEL - "x86-64-v2" + "default" CACHE STRING "x86-64 instruction set architecture level for compiler optimization.\ - default = no flags (compiler toolchain default),\ + default = no flags (compiler toolchain default, ~x86-64 baseline),\ x86-64 = SSE/SSE2 baseline (~2003),\ - x86-64-v2 = + SSE4.2/POPCNT (~2009) [DEFAULT],\ + x86-64-v2 = + SSE4.2/POPCNT (~2009),\ x86-64-v3 = + AVX2/FMA (~2013),\ x86-64-v4 = + AVX-512 (~2017, may throttle),\ native = host CPU (not redistributable)"