Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions .github/workflows/testing-arm-linux.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# CI workflow: build and test on ARM Linux (aarch64 and 32-bit armv7l),
# across several LLVM versions selected via uv dependency groups.
name: ARM Linux

on:
  pull_request:
    types: [ opened, synchronize, reopened ]
  workflow_dispatch:

# Cancel superseded runs for the same PR (or ref, for manual dispatch).
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  arm-linux:
    name: arm-${{ matrix.bits }} / ${{ matrix.uv_group }}
    # Native ARM runner; 32-bit builds cross-compile (and run via multiarch)
    # on the same aarch64 host.
    runs-on: ubuntu-24.04-arm
    strategy:
      fail-fast: false
      matrix:
        bits: [ "64", "32" ]
        # Each uv group pins a different LLVM toolchain to test against.
        uv_group: [ "ci-llvm-main", "ci-llvm-22", "ci-llvm-21", "ci-llvm-20" ]
        include:
          # NOTE(review): `bits` is unquoted here but quoted ("32"/"64") in the
          # matrix axis above — GitHub Actions appears to match these loosely,
          # but quoting consistently would be safer; confirm the include rows
          # actually attach `arch`/`python` to the intended matrix entries.
          - bits: 32
            arch: armv7l
            python: 3.11-armv7-gnueabihf # needed for piwheels
          - bits: 64
            arch: aarch64
            python: linux-aarch64-gnu

    steps:
      - uses: actions/checkout@v4

      - uses: astral-sh/setup-uv@v5

      - name: Install system dependencies
        run: |
          # 32-bit builds need armhf multiarch packages alongside the native ones.
          if [[ "${{ matrix.bits }}" == "32" ]]; then
            sudo dpkg --add-architecture armhf
          fi

          # apt mirrors are occasionally flaky; retry `apt-get update` up to
          # 3 times with a 10s backoff before giving up.
          apt_update() {
            for i in 1 2 3; do
              if sudo apt-get update; then return 0; fi
              echo "apt-get update failed (attempt $i/3), retrying in 10s..."
              sleep 10
            done
            return 1
          }
          apt_update

          if [[ "${{ matrix.bits }}" == "32" ]]; then
            # Cross toolchain plus armhf runtime/dev libraries for the 32-bit build.
            sudo apt-get install -y \
              binutils-arm-linux-gnueabihf \
              g++-arm-linux-gnueabihf \
              gcc-arm-linux-gnueabihf \
              libc6:armhf \
              libstdc++6:armhf \
              libatomic1:armhf \
              libpng-dev:armhf \
              libjpeg-dev:armhf
          else
            sudo apt-get install -y \
              libpng-dev \
              libjpeg-dev
          fi

      - name: Sync CI environment
        # Run uv under `setarch` so the reported machine matches the target
        # architecture (e.g. armv7l for 32-bit), which steers wheel selection.
        # The outer double quotes mean ${GITHUB_WORKSPACE}, $GITHUB_PATH and
        # $GITHUB_ENV expand in the *outer* step shell before bash -c runs.
        run: |
          setarch ${{ matrix.arch }} bash -ec "
          uv sync --python '${{ matrix.python }}' --group '${{ matrix.uv_group }}' --no-install-project
          echo '${GITHUB_WORKSPACE}/.venv/bin' >> '$GITHUB_PATH'
          echo 'VIRTUAL_ENV=${GITHUB_WORKSPACE}/.venv' >> '$GITHUB_ENV'
          "

      - name: Configure LLVM
        # Export the LLVM install prefix from the synced environment for CMake.
        run: echo "Halide_LLVM_ROOT=$(halide-llvm --prefix)" >> "$GITHUB_ENV"

      - name: Configure CMake
        run: |
          # 32-bit builds cross-compile via the in-tree arm32 toolchain file.
          TOOLCHAIN_ARGS=()
          if [[ "${{ matrix.bits }}" == "32" ]]; then
            TOOLCHAIN_ARGS+=("-DCMAKE_TOOLCHAIN_FILE=${GITHUB_WORKSPACE}/cmake/toolchain.linux-arm32.cmake")
          fi

          cmake -G Ninja -S . -B build \
            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DHalide_LLVM_ROOT="${Halide_LLVM_ROOT}" \
            -DWITH_PYTHON_BINDINGS=OFF \
            "${TOOLCHAIN_ARGS[@]}"

      - name: Initial build
        run: cmake --build build

      - name: Detect host target
        # Query the runner's actual CPU features so later steps can decide
        # whether a plain `host` test run adds coverage beyond the NEON run.
        run: |
          HOST_TARGET=$(./build/src/autoschedulers/common/get_host_target)
          echo "HAS_SVE2=$([[ "$HOST_TARGET" == *sve2* ]] && echo true || echo false)" >> "$GITHUB_ENV"
          echo "Detected host target: ${HOST_TARGET}"

      - name: Test (host)
        # Presumably: always test host on 32-bit; on 64-bit only when SVE2 is
        # present, since otherwise `host` duplicates the NEON run below — TODO confirm.
        if: matrix.bits == '32' || env.HAS_SVE2 == 'true'
        run: |
          cmake -S . -B build -DHalide_TARGET=host
          cmake --build build
          ctest --test-dir build --build-config RelWithDebInfo --output-on-failure -j "$(nproc)"

      - name: Test (NEON)
        if: matrix.bits == '64'
        run: |
          cmake -S . -B build -DHalide_TARGET=arm-64-linux-arm_dot_prod-arm_fp16
          cmake --build build
          ctest --test-dir build --build-config RelWithDebInfo --output-on-failure -j "$(nproc)"

      - name: Test (no extensions)
        # `cmake` is a meta-target resolved by the build system (see
        # HalideTestHelpers.cmake in this repo); it exercises the baseline
        # target with no optional CPU extensions.
        run: |
          cmake -S . -B build -DHalide_TARGET=cmake
          cmake --build build
          ctest --test-dir build --build-config RelWithDebInfo --output-on-failure -j "$(nproc)"
19 changes: 14 additions & 5 deletions cmake/HalideTestHelpers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,20 @@ function(add_halide_test TARGET)
#
# target_link_libraries("${TARGET}" PRIVATE Halide::TerminateHandler)

set_tests_properties(${TARGET} PROPERTIES
LABELS "${args_GROUPS}"
ENVIRONMENT "HL_TARGET=${Halide_TARGET};HL_JIT_TARGET=${Halide_TARGET}"
SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
WILL_FAIL ${args_EXPECT_FAILURE})
# Resolve the "cmake" meta-target
string(REGEX REPLACE "^cmake" "${Halide_CMAKE_TARGET}" _resolved_target "${Halide_TARGET}")

set_tests_properties(
${TARGET}
PROPERTIES
LABELS "${args_GROUPS}"
ENVIRONMENT "HL_TARGET=${_resolved_target};HL_JIT_TARGET=${_resolved_target}"
SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
WILL_FAIL ${args_EXPECT_FAILURE}
)
if ("autoschedulers_cpu" IN_LIST args_GROUPS)
set_tests_properties(${TARGET} PROPERTIES RUN_SERIAL TRUE)
endif ()

if (NOT args_USE_EXIT_CODE_ONLY)
set_tests_properties(${TARGET} PROPERTIES
Expand Down
3 changes: 3 additions & 0 deletions cmake/toolchain.linux-arm32.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

set(CMAKE_C_FLAGS_INIT "-mfp16-format=ieee -Wno-psabi")
set(CMAKE_CXX_FLAGS_INIT "-mfp16-format=ieee -Wno-psabi")

# add_custom_command() will make bad decisions about running the command
# when crosscompiling (it won't expand the target into a full path).
# Setting CMAKE_CROSSCOMPILING_EMULATOR to /usr/bin/env tricks it into
Expand Down
15 changes: 13 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ readme = "./packaging/pip/README.md"
requires-python = ">=3.10"
dependencies = [
"imageio>=2",
"pillow; platform_machine == 'armv8l' or platform_machine == 'armv7l'",
"numpy>=1.26",
]
dynamic = ['version']
Expand All @@ -38,7 +39,6 @@ classifiers = [
"Environment :: WebAssembly",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Operating System :: MacOS",
"Operating System :: Microsoft :: Windows",
Expand Down Expand Up @@ -68,7 +68,8 @@ dev = [
"setuptools-scm>=8.3.1",
]
apps = [
"onnx==1.18.0", # for apps/onnx
"onnx==1.18.0; platform_machine != 'armv8l' and platform_machine != 'armv7l'", # for apps/onnx
"onnx==1.17.0; platform_machine == 'armv8l' or platform_machine == 'armv7l'", # for apps/onnx
"pytest", # unspecified onnx dependency
]
tools = [
Expand Down Expand Up @@ -202,8 +203,18 @@ conflicts = [

[tool.uv.sources]
halide-llvm = { index = "halide" }
imageio = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
numpy = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
onnx = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
pillow = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
protobuf = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }

[[tool.uv.index]]
name = "halide"
url = "https://pypi.halide-lang.org/simple"
explicit = true

[[tool.uv.index]]
name = "piwheels"
url = "https://piwheels.org/simple"
explicit = true
27 changes: 25 additions & 2 deletions src/CodeGen_ARM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ Target complete_arm_target(Target t) {
}
};

// ARMFp16 implies ARMv8.2-A; we don't know of any devices where
// that doesn't hold. The cascade loop below will set ARMv81a and ARMv8a.
add_implied_feature_if_supported(t, Target::ARMFp16, Target::ARMv82a);

constexpr int num_arm_v8_features = 10;
static const Target::Feature arm_v8_features[num_arm_v8_features] = {
Target::ARMv89a,
Expand Down Expand Up @@ -1681,6 +1685,7 @@ void CodeGen_ARM::visit(const Store *op) {
vpred_val = convert_fixed_or_scalable_vector_type(vpred_val, pred_type);
if (is_predicated_store) {
Value *sliced_store_vpred_val = slice_vector(store_pred_val, i, natural_lanes);
sliced_store_vpred_val = convert_fixed_or_scalable_vector_type(sliced_store_vpred_val, pred_type);
vpred_val = builder->CreateAnd(vpred_val, sliced_store_vpred_val);
}

Expand Down Expand Up @@ -1854,6 +1859,7 @@ void CodeGen_ARM::visit(const Load *op) {
Value *vpred_val = codegen(vpred);
if (is_predicated_load) {
Value *sliced_load_vpred_val = slice_vector(load_pred_val, i, natural_lanes);
sliced_load_vpred_val = convert_fixed_or_scalable_vector_type(sliced_load_vpred_val, vpred_val->getType());
vpred_val = builder->CreateAnd(vpred_val, sliced_load_vpred_val);
}

Expand Down Expand Up @@ -1904,8 +1910,14 @@ Value *CodeGen_ARM::interleave_vectors(const std::vector<Value *> &vecs) {
return CodeGen_Posix::interleave_vectors(vecs);
}

// Lower into llvm.vector.interleave intrinsic
// Lower into llvm.vector.interleave intrinsic.
// LLVM only supports non-power-of-2 strides (e.g. 3) for scalable
// vectors starting in LLVM 22.
#if LLVM_VERSION >= 220
const std::set<int> supported_strides{2, 3, 4, 8};
#else
const std::set<int> supported_strides{2, 4, 8};
#endif
const int stride = vecs.size();
const int src_lanes = get_vector_num_elements(vecs[0]->getType());

Expand Down Expand Up @@ -1957,7 +1969,11 @@ Value *CodeGen_ARM::shuffle_vectors(Value *a, Value *b, const std::vector<int> &
}

// Lower slice with stride into llvm.vector.deinterleave intrinsic
#if LLVM_VERSION >= 220
const std::set<int> supported_strides{2, 3, 4, 8};
#else
const std::set<int> supported_strides{2, 4, 8};
#endif
if (supported_strides.find(slice_stride) != supported_strides.end() &&
dst_lanes * slice_stride == src_lanes &&
indices.front() < slice_stride && // Start position cannot be larger than stride
Expand Down Expand Up @@ -2410,6 +2426,10 @@ string CodeGen_ARM::mcpu_target() const {
if (target.bits == 32) {
if (target.has_feature(Target::ARMv7s)) {
return "swift";
} else if (target.has_feature(Target::ARMv82a)) {
return "cortex-a55";
} else if (target.has_feature(Target::ARMv8a)) {
return "cortex-a32";
} else {
return "cortex-a9";
}
Expand All @@ -2436,7 +2456,10 @@ string CodeGen_ARM::mattrs() const {
attrs.emplace_back("+fullfp16");
}
if (target.has_feature(Target::ARMv8a)) {
attrs.emplace_back("+v8a");
// The ARM (32-bit) backend calls this feature "v8"; the AArch64
// backend calls it "v8a". The dotted sub-versions (v8.1a, v8.2a,
// etc.) use the same names in both backends.
attrs.emplace_back(target.bits == 32 ? "+v8" : "+v8a");
}
if (target.has_feature(Target::ARMv81a)) {
attrs.emplace_back("+v8.1a");
Expand Down
60 changes: 58 additions & 2 deletions src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1515,6 +1515,17 @@ void CodeGen_LLVM::visit(const Reinterpret *op) {
llvm::Type *llvm_dst_fixed = get_vector_type(llvm_type_of(dst.element_of()), dst.lanes(), VectorTypeConstraint::Fixed);
value = builder->CreateBitOrPointerCast(value, llvm_dst_fixed);
value = fixed_to_scalable_vector_type(value);
} else if (isa<FixedVectorType>(value->getType()) && isa<ScalableVectorType>(llvm_dst)) {
// Cannot bitcast/ptrtoint directly between fixed and scalable vectors.
// First cast to a fixed vector of the destination element type, then convert to scalable.
llvm::Type *llvm_dst_fixed = get_vector_type(llvm_dst->getScalarType(), dst.lanes(), VectorTypeConstraint::Fixed);
value = builder->CreateBitOrPointerCast(value, llvm_dst_fixed);
value = fixed_to_scalable_vector_type(value);
} else if (isa<ScalableVectorType>(value->getType()) && isa<FixedVectorType>(llvm_dst)) {
// Cannot bitcast/ptrtoint directly between scalable and fixed vectors.
// First convert to a fixed vector of the source element type, then cast.
value = scalable_to_fixed_vector_type(value);
value = builder->CreateBitOrPointerCast(value, llvm_dst);
} else {
// Our `Reinterpret` expr directly maps to LLVM IR bitcast/ptrtoint/inttoptr
// instructions with no additional handling required:
Expand Down Expand Up @@ -4314,10 +4325,12 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini
const int input_lanes = val.type().lanes();
const int input_bytes = input_lanes * val.type().bytes();
const int vscale = std::max(effective_vscale, 1);
// LLVM added VECREDUCE_MUL/FMUL lowering for SVE in LLVM 22.
const bool mul_ok = LLVM_VERSION >= 220 || effective_vscale == 0;
const bool llvm_has_intrinsic =
// Must be one of these ops
((op->op == VectorReduce::Add ||
op->op == VectorReduce::Mul ||
(op->op == VectorReduce::Mul && mul_ok) ||
op->op == VectorReduce::Min ||
op->op == VectorReduce::Max) &&
(use_llvm_vp_intrinsics ||
Expand Down Expand Up @@ -4920,6 +4933,13 @@ Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) {
// otherwise.
llvm::Type *scalar_type = vec->getType()->getScalarType();

if (scalar_type->isIntegerTy(1)) {
auto *result_type = cast<VectorType>(get_vector_type(scalar_type, size / effective_vscale, VectorTypeConstraint::VScale));
return handle_bool_as_i8(vec, result_type, [&](Value *v) {
return slice_vector(v, start, size);
});
}

int intermediate_lanes = std::min(size, vec_lanes - start);
llvm::Type *intermediate_type = get_vector_type(scalar_type, intermediate_lanes, VectorTypeConstraint::Fixed);

Expand Down Expand Up @@ -5190,6 +5210,18 @@ llvm::Value *CodeGen_LLVM::match_vector_type_scalable(llvm::Value *value, llvm::
return match_vector_type_scalable(value, guide->getType());
}

// Apply a vector transformation `fn` to a vector of i1 (boolean) values by
// round-tripping through i8: zero-extend `arg` to a same-shaped i8 vector,
// run `fn` on it, then truncate the result back to `result_i1_type`.
// Presumably this exists because some vector ops used by the callers do not
// accept i1 element types directly (notably with scalable vectors) — TODO confirm.
//
// `arg` must be a (fixed or scalable) vector value; the i8 intermediate
// preserves its scalability and minimum element count. `result_i1_type` is
// the caller-chosen i1 vector type of the final result; `fn` may change the
// element count, so the result shape is dictated by `result_i1_type`, not `arg`.
llvm::Value *CodeGen_LLVM::handle_bool_as_i8(llvm::Value *arg, llvm::VectorType *result_i1_type,
                                             const std::function<llvm::Value *(llvm::Value *)> &fn) {
    auto *arg_vty = cast<llvm::VectorType>(arg->getType());
    bool scalable = isa<llvm::ScalableVectorType>(arg_vty);
    // Minimum element count: for scalable vectors this is the per-vscale count.
    int min_elts = scalable ? cast<llvm::ScalableVectorType>(arg_vty)->getMinNumElements() : cast<llvm::FixedVectorType>(arg_vty)->getNumElements();
    auto constraint = scalable ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed;
    // Build an i8 vector type matching arg's shape and scalability.
    llvm::Type *arg_i8 = get_vector_type(i8_t, min_elts, constraint);
    llvm::Value *widened = builder->CreateZExt(arg, arg_i8);
    llvm::Value *result = fn(widened);
    // Narrow back to i1; values stay 0/1 because the input was zero-extended
    // from i1 and `fn` is expected to only move/reshape lanes — TODO confirm.
    return builder->CreateTrunc(result, result_i1_type);
}

llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *arg,
llvm::Type *desired_type) {
llvm::Type *arg_type = arg->getType();
Expand All @@ -5199,6 +5231,18 @@ llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *ar
}

internal_assert(arg_type->getScalarType() == desired_type->getScalarType());

if (arg_type->isVectorTy() && desired_type->isVectorTy() &&
arg_type->getScalarType()->isIntegerTy(1)) {
bool dst_scalable = isa<llvm::ScalableVectorType>(desired_type);
int dst_elts = get_vector_num_elements(desired_type);
llvm::Type *dst_i8 = get_vector_type(i8_t, dst_scalable ? dst_elts / effective_vscale : dst_elts,
dst_scalable ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed);
return handle_bool_as_i8(arg, cast<VectorType>(desired_type), [&](Value *v) {
return convert_fixed_or_scalable_vector_type(v, dst_i8);
});
}

if (!arg_type->isVectorTy()) {
arg = create_broadcast(arg, 1);
arg_type = arg->getType();
Expand Down Expand Up @@ -5280,6 +5324,12 @@ llvm::Value *CodeGen_LLVM::fixed_to_scalable_vector_type(llvm::Value *fixed_arg)
internal_assert(fixed_type->getElementType() == scalable_type->getElementType());
internal_assert(lanes == (scalable_type->getMinNumElements() * effective_vscale));

if (fixed_type->getElementType()->isIntegerTy(1)) {
return handle_bool_as_i8(fixed_arg, scalable_type, [&](Value *v) {
return fixed_to_scalable_vector_type(v);
});
}

// E.g. <vscale x 2 x i64> llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
const char *type_designator;
if (fixed_type->getElementType()->isIntegerTy()) {
Expand All @@ -5297,7 +5347,7 @@ llvm::Value *CodeGen_LLVM::fixed_to_scalable_vector_type(llvm::Value *fixed_arg)

std::vector<llvm::Value *> args;
args.push_back(result_vec);
args.push_back(value);
args.push_back(fixed_arg);
args.push_back(ConstantInt::get(i64_t, 0));

return simple_call_intrin(intrin, args, scalable_type);
Expand All @@ -5316,6 +5366,12 @@ llvm::Value *CodeGen_LLVM::scalable_to_fixed_vector_type(llvm::Value *scalable_a
internal_assert(fixed_type->getElementType() == scalable_type->getElementType());
internal_assert(fixed_type->getNumElements() == (scalable_type->getMinNumElements() * effective_vscale));

if (scalable_type->getElementType()->isIntegerTy(1)) {
return handle_bool_as_i8(scalable_arg, fixed_type, [&](Value *v) {
return scalable_to_fixed_vector_type(v);
});
}

// E.g. <64 x i8> @llvm.vector.extract.v64i8.nxv8i8(<vscale x 8 x i8> %vresult, i64 0)
const char *type_designator;
if (scalable_type->getElementType()->isIntegerTy()) {
Expand Down
Loading
Loading