From 886494b66875f2d7f612864af1f6b58dbc815446 Mon Sep 17 00:00:00 2001 From: Hans Johnson Date: Thu, 2 Apr 2026 15:01:51 -0500 Subject: [PATCH 1/2] PERF: Enable FFTW SIMD codelets with per-CPU introspection at configure time FFTW SIMD codelets (NEON, SSE/SSE2, AVX, AVX2) are hand-written assembly routines baked into the library at compile time. Previously all SIMD flags were hardcoded to OFF, producing scalar-only FFTW builds regardless of the host CPU. Add per-CPU SIMD detection at CMake configure time: - ARM64 (aarch64/arm64/ARM64): NEON=ON (mandatory in ARMv8) - x86/x86_64 with GCC/Clang: probe SSE, SSE2, AVX, AVX2 individually via __builtin_cpu_supports() / CheckCSourceRuns - x86/x86_64 with MSVC: skip probes (intrinsic unavailable), default OFF - Cross-compile ARM64: NEON=ON; x86_64: SSE+SSE2 only (conservative) - All other architectures: all SIMD off (safe fallback) Every flag is an individually overridable cache option (e.g. cmake -DFFTW_ENABLE_AVX2=OFF). Cherry-picked from PR #6004 (targeting release-5.4) with review fixes: - ARM64 regex includes all-caps variant for Windows ARM64 - MSVC compiler guard on __builtin_cpu_supports probes - ENABLE_SSE included in documentation comment Co-Authored-By: Claude Opus 4.6 (1M context) --- CMake/itkExternal_FFTW.cmake | 119 ++++++++++++++++++++++++++++++----- 1 file changed, 102 insertions(+), 17 deletions(-) diff --git a/CMake/itkExternal_FFTW.cmake b/CMake/itkExternal_FFTW.cmake index 6d9e6918350..8dc7b57a2fe 100644 --- a/CMake/itkExternal_FFTW.cmake +++ b/CMake/itkExternal_FFTW.cmake @@ -1,19 +1,32 @@ # # Encapsulates building FFTW as an External Project. # -# NOTE: internal building of fftw is for convenience, -# and the version of fftw built here does not -# use modern hardware optimzations. +# SIMD codelet selection +# ---------------------- +# FFTW SIMD codelets are hand-written assembly routines baked into the +# library at compile time. 
Passing -march=native to the ITK build does +# NOT activate them; they must be requested explicitly via FFTW's own +# CMake options (ENABLE_NEON, ENABLE_SSE, ENABLE_SSE2, ENABLE_AVX, ENABLE_AVX2). # -# The build configuration chosen to be -# generalizable to as many hardware platforms. -# Being backward compatible for decades -# old hardware is the goal of this internal -# representation. +# This file detects appropriate defaults at cmake configure time: # -# This is primarily used to support testing -# and should not be used for production -# builds where performance is a concern. +# Native builds (CMAKE_CROSSCOMPILING is false): +# - ARM64 (aarch64/arm64/ARM64): NEON=ON (mandatory in ARMv8); x86 SIMD off. +# - x86/x86_64 with GCC/Clang: each of SSE, SSE2, AVX, AVX2 is probed +# individually via __builtin_cpu_supports() / CheckCSourceRuns so that +# the detected flags match the actual build-host CPU. A pre-AVX +# Nehalem gets SSE+SSE2 only; a Haswell or later gets all four. +# On MSVC the probes are skipped (intrinsic unavailable) and SIMD +# defaults to off; users can override via FFTW_ENABLE_* options. +# - Other architectures: all SIMD off (conservative fallback). +# +# Cross-compiled builds (CMAKE_CROSSCOMPILING is true): +# - ARM64: NEON=ON (mandatory); x86 SIMD off. +# - x86_64: SSE+SSE2 only (baseline; AVX/AVX2 not assumed for target). +# - Other: all SIMD off. +# +# Every flag is an individually overridable cache option, e.g.: +# cmake -DFFTW_ENABLE_AVX2=OFF ... # # These instructions follow the guidance provided for modern cmake usage as described: # https://github.com/dev-cafe/cmake-cookbook/blob/master/chapter-08/recipe-03/c-example/external/upstream/fftw3/CMakeLists.txt @@ -64,6 +77,74 @@ if(NOT ITK_USE_SYSTEM_FFTW) set(FFTW_STAGED_INSTALL_PREFIX "${ITK_BINARY_DIR}/fftw") + # Detect SIMD defaults (see file header for full policy description). + # CheckCSourceRuns results are cached after the first cmake configure run. 
+ include(CheckCSourceRuns) + + set(_fftw_default_neon OFF) + set(_fftw_default_sse OFF) + set(_fftw_default_sse2 OFF) + set(_fftw_default_avx OFF) + set(_fftw_default_avx2 OFF) + + if(NOT CMAKE_CROSSCOMPILING) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") + # NEON is mandatory in ARMv8/AArch64 — every arm64 CPU has it. + set(_fftw_default_neon ON) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i686") + # Probe each x86 SIMD level individually via CPUID so the defaults + # are accurate for the actual build-host CPU (e.g. pre-AVX Nehalem + # or pre-AVX2 Ivy Bridge get only the levels their hardware supports). + # __builtin_cpu_supports is a GCC/Clang intrinsic; skip on MSVC. + if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang|AppleClang") + foreach(_fftw_simd IN ITEMS sse sse2 avx avx2) + check_c_source_runs( + "int main(void){return __builtin_cpu_supports(\"${_fftw_simd}\")?0:1;}" + _fftw_cpu_has_${_fftw_simd} + ) + if(_fftw_cpu_has_${_fftw_simd}) + set(_fftw_default_${_fftw_simd} ON) + endif() + endforeach() + endif() + endif() + else() + # Cross-compiling: conservative architecture-level fallback. + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") + set(_fftw_default_neon ON) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64") + # SSE/SSE2 are baseline on all 64-bit x86 CPUs; AVX/AVX2 not assumed. 
+ set(_fftw_default_sse ON) + set(_fftw_default_sse2 ON) + endif() + endif() + + option( + FFTW_ENABLE_NEON + "Enable FFTW NEON SIMD codelets (ARM64)" + ${_fftw_default_neon} + ) + option( + FFTW_ENABLE_SSE + "Enable FFTW SSE SIMD codelets (x86)" + ${_fftw_default_sse} + ) + option( + FFTW_ENABLE_SSE2 + "Enable FFTW SSE2 SIMD codelets (x86)" + ${_fftw_default_sse2} + ) + option( + FFTW_ENABLE_AVX + "Enable FFTW AVX SIMD codelets (x86)" + ${_fftw_default_avx} + ) + option( + FFTW_ENABLE_AVX2 + "Enable FFTW AVX2 SIMD codelets (x86)" + ${_fftw_default_avx2} + ) + # Macro to generate library filename with appropriate prefix/suffix # Args: output_var library_base_name macro(_library_name_to_filename output_var library_base_name) @@ -111,10 +192,12 @@ if(NOT ITK_USE_SYSTEM_FFTW) -DCMAKE_INSTALL_PREFIX:PATH=${FFTW_STAGED_INSTALL_PREFIX} -DCMAKE_INSTALL_LIBDIR:STRING=${CMAKE_INSTALL_LIBDIR} -DCMAKE_INSTALL_BINDIR:STRING=${CMAKE_INSTALL_BINDIR} - -DDISABLE_FORTRAN:BOOL=ON -DENABLE_AVX:BOOL=OFF -DENABLE_AVX2:BOOL=OFF - -DENABLE_FLOAT:BOOL=ON -DENABLE_LONG_DOUBLE:BOOL=OFF + -DDISABLE_FORTRAN:BOOL=ON -DENABLE_AVX:BOOL=${FFTW_ENABLE_AVX} + -DENABLE_AVX2:BOOL=${FFTW_ENABLE_AVX2} -DENABLE_FLOAT:BOOL=ON + -DENABLE_LONG_DOUBLE:BOOL=OFF -DENABLE_NEON:BOOL=${FFTW_ENABLE_NEON} -DENABLE_OPENMP:BOOL=OFF -DENABLE_QUAD_PRECISION:BOOL=OFF - -DENABLE_SSE:BOOL=OFF -DENABLE_SSE2:BOOL=OFF -DENABLE_THREADS:BOOL=ON + -DENABLE_SSE:BOOL=${FFTW_ENABLE_SSE} + -DENABLE_SSE2:BOOL=${FFTW_ENABLE_SSE2} -DENABLE_THREADS:BOOL=ON -DCMAKE_APPLE_SILICON_PROCESSOR:STRING=${CMAKE_APPLE_SILICON_PROCESSOR} -DCMAKE_C_COMPILER_LAUNCHER:PATH=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_C_COMPILER:PATH=${CMAKE_C_COMPILER} @@ -175,10 +258,12 @@ if(NOT ITK_USE_SYSTEM_FFTW) -DCMAKE_INSTALL_PREFIX:PATH=${FFTW_STAGED_INSTALL_PREFIX} -DCMAKE_INSTALL_LIBDIR:STRING=${CMAKE_INSTALL_LIBDIR} -DCMAKE_INSTALL_BINDIR:STRING=${CMAKE_INSTALL_BINDIR} - -DDISABLE_FORTRAN:BOOL=ON -DENABLE_AVX:BOOL=OFF -DENABLE_AVX2:BOOL=OFF - 
-DENABLE_FLOAT:BOOL=OFF -DENABLE_LONG_DOUBLE:BOOL=OFF + -DDISABLE_FORTRAN:BOOL=ON -DENABLE_AVX:BOOL=${FFTW_ENABLE_AVX} + -DENABLE_AVX2:BOOL=${FFTW_ENABLE_AVX2} -DENABLE_FLOAT:BOOL=OFF + -DENABLE_LONG_DOUBLE:BOOL=OFF -DENABLE_NEON:BOOL=${FFTW_ENABLE_NEON} -DENABLE_OPENMP:BOOL=OFF -DENABLE_QUAD_PRECISION:BOOL=OFF - -DENABLE_SSE:BOOL=OFF -DENABLE_SSE2:BOOL=OFF -DENABLE_THREADS:BOOL=ON + -DENABLE_SSE:BOOL=${FFTW_ENABLE_SSE} + -DENABLE_SSE2:BOOL=${FFTW_ENABLE_SSE2} -DENABLE_THREADS:BOOL=ON -DCMAKE_APPLE_SILICON_PROCESSOR:STRING=${CMAKE_APPLE_SILICON_PROCESSOR} -DCMAKE_C_COMPILER_LAUNCHER:PATH=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_C_COMPILER:PATH=${CMAKE_C_COMPILER} From a9b11c83657987eb9cf17ee14102bbfaabb50e72 Mon Sep 17 00:00:00 2001 From: Hans Johnson Date: Thu, 2 Apr 2026 15:23:15 -0500 Subject: [PATCH 2/2] STYLE: Add SIMD status message and fix SSE forwarding to fftwd - Add message(STATUS) showing detected FFTW SIMD flags at configure time so users can verify detection without inspecting the cache. - Remove ENABLE_SSE from the fftwd (double-precision) ExternalProject block; SSE1 codelets are float-only and have no effect on fftwd. - Document in the file header that option() defaults only apply on first configure and that ENABLE_SSE is not forwarded to fftwd. Addresses review comments from @greptile-apps on PR #6006. Co-Authored-By: Claude Opus 4.6 (1M context) --- CMake/itkExternal_FFTW.cmake | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/CMake/itkExternal_FFTW.cmake b/CMake/itkExternal_FFTW.cmake index 8dc7b57a2fe..7c70f641c57 100644 --- a/CMake/itkExternal_FFTW.cmake +++ b/CMake/itkExternal_FFTW.cmake @@ -27,6 +27,12 @@ # # Every flag is an individually overridable cache option, e.g.: # cmake -DFFTW_ENABLE_AVX2=OFF ... +# Note: option() defaults are only applied on the first configure. +# To re-detect after a toolchain change, delete the CMake cache or use +# cmake --fresh, or pass explicit -DFFTW_ENABLE_*= overrides. 
+# +# ENABLE_SSE (SSE1) is float-only and is not forwarded to the +# double-precision fftwd build. # # These instructions follow the guidance provided for modern cmake usage as described: # https://github.com/dev-cafe/cmake-cookbook/blob/master/chapter-08/recipe-03/c-example/external/upstream/fftw3/CMakeLists.txt @@ -145,6 +151,11 @@ if(NOT ITK_USE_SYSTEM_FFTW) ${_fftw_default_avx2} ) + message( + STATUS + "FFTW SIMD: NEON=${FFTW_ENABLE_NEON} SSE=${FFTW_ENABLE_SSE} SSE2=${FFTW_ENABLE_SSE2} AVX=${FFTW_ENABLE_AVX} AVX2=${FFTW_ENABLE_AVX2}" + ) + # Macro to generate library filename with appropriate prefix/suffix # Args: output_var library_base_name macro(_library_name_to_filename output_var library_base_name) @@ -262,7 +273,7 @@ if(NOT ITK_USE_SYSTEM_FFTW) -DENABLE_AVX2:BOOL=${FFTW_ENABLE_AVX2} -DENABLE_FLOAT:BOOL=OFF -DENABLE_LONG_DOUBLE:BOOL=OFF -DENABLE_NEON:BOOL=${FFTW_ENABLE_NEON} -DENABLE_OPENMP:BOOL=OFF -DENABLE_QUAD_PRECISION:BOOL=OFF - -DENABLE_SSE:BOOL=${FFTW_ENABLE_SSE} + -DENABLE_SSE:BOOL=OFF # SSE1 codelets are 32-bit float only; no effect on double-precision -DENABLE_SSE2:BOOL=${FFTW_ENABLE_SSE2} -DENABLE_THREADS:BOOL=ON -DCMAKE_APPLE_SILICON_PROCESSOR:STRING=${CMAKE_APPLE_SILICON_PROCESSOR} -DCMAKE_C_COMPILER_LAUNCHER:PATH=${CMAKE_C_COMPILER_LAUNCHER}