Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions .github/workflows/testing-arm-linux.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# CI workflow: build and test on ARM Linux (aarch64 and 32-bit armv7l),
# across several LLVM versions selected via uv dependency groups.
name: ARM Linux

on:
  pull_request:
    types: [ opened, synchronize, reopened ]
  workflow_dispatch:

# Cancel superseded runs for the same PR (or ref, for manual dispatch).
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  arm-linux:
    name: arm-${{ matrix.bits }} / ${{ matrix.uv_group }}
    # Native ARM runner; 32-bit builds cross-compile (and run via multiarch)
    # on the same aarch64 host.
    runs-on: ubuntu-24.04-arm
    strategy:
      fail-fast: false
      matrix:
        bits: [ "64", "32" ]
        # Each uv group pins a different LLVM toolchain to test against.
        uv_group: [ "ci-llvm-main", "ci-llvm-22", "ci-llvm-21", "ci-llvm-20" ]
        include:
          # NOTE(review): `bits` is unquoted here but quoted ("32"/"64") in the
          # matrix axis above — GitHub Actions appears to match these loosely,
          # but quoting consistently would be safer; confirm the include rows
          # actually attach `arch`/`python` to the intended matrix entries.
          - bits: 32
            arch: armv7l
            python: 3.11-armv7-gnueabihf # needed for piwheels
          - bits: 64
            arch: aarch64
            python: linux-aarch64-gnu

    steps:
      - uses: actions/checkout@v4

      - uses: astral-sh/setup-uv@v5

      - name: Install system dependencies
        run: |
          # 32-bit builds need armhf multiarch packages alongside the native ones.
          if [[ "${{ matrix.bits }}" == "32" ]]; then
            sudo dpkg --add-architecture armhf
          fi

          # apt mirrors are occasionally flaky; retry `apt-get update` up to
          # 3 times with a 10s backoff before giving up.
          apt_update() {
            for i in 1 2 3; do
              if sudo apt-get update; then return 0; fi
              echo "apt-get update failed (attempt $i/3), retrying in 10s..."
              sleep 10
            done
            return 1
          }
          apt_update

          if [[ "${{ matrix.bits }}" == "32" ]]; then
            # Cross toolchain plus armhf runtime/dev libraries for the 32-bit build.
            sudo apt-get install -y \
              binutils-arm-linux-gnueabihf \
              g++-arm-linux-gnueabihf \
              gcc-arm-linux-gnueabihf \
              libc6:armhf \
              libstdc++6:armhf \
              libatomic1:armhf \
              libpng-dev:armhf \
              libjpeg-dev:armhf
          else
            sudo apt-get install -y \
              libpng-dev \
              libjpeg-dev
          fi

      - name: Sync CI environment
        # Run uv under `setarch` so the reported machine matches the target
        # architecture (e.g. armv7l for 32-bit), which steers wheel selection.
        # The outer double quotes mean ${GITHUB_WORKSPACE}, $GITHUB_PATH and
        # $GITHUB_ENV expand in the *outer* step shell before bash -c runs.
        run: |
          setarch ${{ matrix.arch }} bash -ec "
          uv sync --python '${{ matrix.python }}' --group '${{ matrix.uv_group }}' --no-install-project
          echo '${GITHUB_WORKSPACE}/.venv/bin' >> '$GITHUB_PATH'
          echo 'VIRTUAL_ENV=${GITHUB_WORKSPACE}/.venv' >> '$GITHUB_ENV'
          "

      - name: Configure LLVM
        # Export the LLVM install prefix from the synced environment for CMake.
        run: echo "Halide_LLVM_ROOT=$(halide-llvm --prefix)" >> "$GITHUB_ENV"

      - name: Configure CMake
        run: |
          # 32-bit builds cross-compile via the in-tree arm32 toolchain file.
          TOOLCHAIN_ARGS=()
          if [[ "${{ matrix.bits }}" == "32" ]]; then
            TOOLCHAIN_ARGS+=("-DCMAKE_TOOLCHAIN_FILE=${GITHUB_WORKSPACE}/cmake/toolchain.linux-arm32.cmake")
          fi

          cmake -G Ninja -S . -B build \
            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DHalide_LLVM_ROOT="${Halide_LLVM_ROOT}" \
            -DWITH_PYTHON_BINDINGS=OFF \
            "${TOOLCHAIN_ARGS[@]}"

      - name: Initial build
        run: cmake --build build

      - name: Detect host target
        # Query the runner's actual CPU features so later steps can decide
        # whether a plain `host` test run adds coverage beyond the NEON run.
        run: |
          HOST_TARGET=$(./build/src/autoschedulers/common/get_host_target)
          echo "HAS_SVE2=$([[ "$HOST_TARGET" == *sve2* ]] && echo true || echo false)" >> "$GITHUB_ENV"
          echo "Detected host target: ${HOST_TARGET}"

      - name: Test (host)
        # Presumably: always test host on 32-bit; on 64-bit only when SVE2 is
        # present, since otherwise `host` duplicates the NEON run below — TODO confirm.
        if: matrix.bits == '32' || env.HAS_SVE2 == 'true'
        run: |
          cmake -S . -B build -DHalide_TARGET=host
          cmake --build build
          ctest --test-dir build --build-config RelWithDebInfo --output-on-failure -j "$(nproc)"

      - name: Test (NEON)
        if: matrix.bits == '64'
        run: |
          cmake -S . -B build -DHalide_TARGET=arm-64-linux-arm_dot_prod-arm_fp16
          cmake --build build
          ctest --test-dir build --build-config RelWithDebInfo --output-on-failure -j "$(nproc)"

      - name: Test (no extensions)
        # `cmake` is a meta-target resolved by the build system (see
        # HalideTestHelpers.cmake in this repo); it exercises the baseline
        # target with no optional CPU extensions.
        run: |
          cmake -S . -B build -DHalide_TARGET=cmake
          cmake --build build
          ctest --test-dir build --build-config RelWithDebInfo --output-on-failure -j "$(nproc)"
19 changes: 14 additions & 5 deletions cmake/HalideTestHelpers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,20 @@ function(add_halide_test TARGET)
#
# target_link_libraries("${TARGET}" PRIVATE Halide::TerminateHandler)

set_tests_properties(${TARGET} PROPERTIES
LABELS "${args_GROUPS}"
ENVIRONMENT "HL_TARGET=${Halide_TARGET};HL_JIT_TARGET=${Halide_TARGET}"
SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
WILL_FAIL ${args_EXPECT_FAILURE})
# Resolve the "cmake" meta-target
string(REGEX REPLACE "^cmake" "${Halide_CMAKE_TARGET}" _resolved_target "${Halide_TARGET}")

set_tests_properties(
${TARGET}
PROPERTIES
LABELS "${args_GROUPS}"
ENVIRONMENT "HL_TARGET=${_resolved_target};HL_JIT_TARGET=${_resolved_target}"
SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
WILL_FAIL ${args_EXPECT_FAILURE}
)
if ("autoschedulers_cpu" IN_LIST args_GROUPS)
set_tests_properties(${TARGET} PROPERTIES RUN_SERIAL TRUE)
endif ()

if (NOT args_USE_EXIT_CODE_ONLY)
set_tests_properties(${TARGET} PROPERTIES
Expand Down
3 changes: 3 additions & 0 deletions cmake/toolchain.linux-arm32.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

set(CMAKE_C_FLAGS_INIT "-mfp16-format=ieee -Wno-psabi")
set(CMAKE_CXX_FLAGS_INIT "-mfp16-format=ieee -Wno-psabi")

# add_custom_command() will make bad decisions about running the command
# when crosscompiling (it won't expand the target into a full path).
# Setting CMAKE_CROSSCOMPILING_EMULATOR to /usr/bin/env tricks it into
Expand Down
15 changes: 13 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ readme = "./packaging/pip/README.md"
requires-python = ">=3.10"
dependencies = [
"imageio>=2",
"pillow; platform_machine == 'armv8l' or platform_machine == 'armv7l'",
"numpy>=1.26",
]
dynamic = ['version']
Expand All @@ -38,7 +39,6 @@ classifiers = [
"Environment :: WebAssembly",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Operating System :: MacOS",
"Operating System :: Microsoft :: Windows",
Expand Down Expand Up @@ -68,7 +68,8 @@ dev = [
"setuptools-scm>=8.3.1",
]
apps = [
"onnx==1.18.0", # for apps/onnx
"onnx==1.18.0; platform_machine != 'armv8l' and platform_machine != 'armv7l'", # for apps/onnx
"onnx==1.17.0; platform_machine == 'armv8l' or platform_machine == 'armv7l'", # for apps/onnx
"pytest", # unspecified onnx dependency
]
tools = [
Expand Down Expand Up @@ -202,8 +203,18 @@ conflicts = [

[tool.uv.sources]
halide-llvm = { index = "halide" }
imageio = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
numpy = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
onnx = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
pillow = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
protobuf = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }

[[tool.uv.index]]
name = "halide"
url = "https://pypi.halide-lang.org/simple"
explicit = true

[[tool.uv.index]]
name = "piwheels"
url = "https://piwheels.org/simple"
explicit = true
27 changes: 25 additions & 2 deletions src/CodeGen_ARM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ Target complete_arm_target(Target t) {
}
};

// ARMFp16 implies ARMv8.2-A; we don't know of any devices where
// that doesn't hold. The cascade loop below will set ARMv81a and ARMv8a.
add_implied_feature_if_supported(t, Target::ARMFp16, Target::ARMv82a);

constexpr int num_arm_v8_features = 10;
static const Target::Feature arm_v8_features[num_arm_v8_features] = {
Target::ARMv89a,
Expand Down Expand Up @@ -1681,6 +1685,7 @@ void CodeGen_ARM::visit(const Store *op) {
vpred_val = convert_fixed_or_scalable_vector_type(vpred_val, pred_type);
if (is_predicated_store) {
Value *sliced_store_vpred_val = slice_vector(store_pred_val, i, natural_lanes);
sliced_store_vpred_val = convert_fixed_or_scalable_vector_type(sliced_store_vpred_val, pred_type);
vpred_val = builder->CreateAnd(vpred_val, sliced_store_vpred_val);
}

Expand Down Expand Up @@ -1854,6 +1859,7 @@ void CodeGen_ARM::visit(const Load *op) {
Value *vpred_val = codegen(vpred);
if (is_predicated_load) {
Value *sliced_load_vpred_val = slice_vector(load_pred_val, i, natural_lanes);
sliced_load_vpred_val = convert_fixed_or_scalable_vector_type(sliced_load_vpred_val, vpred_val->getType());
vpred_val = builder->CreateAnd(vpred_val, sliced_load_vpred_val);
}

Expand Down Expand Up @@ -1904,8 +1910,14 @@ Value *CodeGen_ARM::interleave_vectors(const std::vector<Value *> &vecs) {
return CodeGen_Posix::interleave_vectors(vecs);
}

// Lower into llvm.vector.interleave intrinsic
// Lower into llvm.vector.interleave intrinsic.
// LLVM only supports non-power-of-2 strides (e.g. 3) for scalable
// vectors starting in LLVM 22.
#if LLVM_VERSION >= 220
const std::set<int> supported_strides{2, 3, 4, 8};
#else
const std::set<int> supported_strides{2, 4, 8};
#endif
const int stride = vecs.size();
const int src_lanes = get_vector_num_elements(vecs[0]->getType());

Expand Down Expand Up @@ -1957,7 +1969,11 @@ Value *CodeGen_ARM::shuffle_vectors(Value *a, Value *b, const std::vector<int> &
}

// Lower slice with stride into llvm.vector.deinterleave intrinsic
#if LLVM_VERSION >= 220
const std::set<int> supported_strides{2, 3, 4, 8};
#else
const std::set<int> supported_strides{2, 4, 8};
#endif
if (supported_strides.find(slice_stride) != supported_strides.end() &&
dst_lanes * slice_stride == src_lanes &&
indices.front() < slice_stride && // Start position cannot be larger than stride
Expand Down Expand Up @@ -2410,6 +2426,10 @@ string CodeGen_ARM::mcpu_target() const {
if (target.bits == 32) {
if (target.has_feature(Target::ARMv7s)) {
return "swift";
} else if (target.has_feature(Target::ARMv82a)) {
return "cortex-a55";
} else if (target.has_feature(Target::ARMv8a)) {
return "cortex-a32";
} else {
return "cortex-a9";
}
Expand All @@ -2436,7 +2456,10 @@ string CodeGen_ARM::mattrs() const {
attrs.emplace_back("+fullfp16");
}
if (target.has_feature(Target::ARMv8a)) {
attrs.emplace_back("+v8a");
// The ARM (32-bit) backend calls this feature "v8"; the AArch64
// backend calls it "v8a". The dotted sub-versions (v8.1a, v8.2a,
// etc.) use the same names in both backends.
attrs.emplace_back(target.bits == 32 ? "+v8" : "+v8a");
}
if (target.has_feature(Target::ARMv81a)) {
attrs.emplace_back("+v8.1a");
Expand Down
60 changes: 58 additions & 2 deletions src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1515,6 +1515,17 @@ void CodeGen_LLVM::visit(const Reinterpret *op) {
llvm::Type *llvm_dst_fixed = get_vector_type(llvm_type_of(dst.element_of()), dst.lanes(), VectorTypeConstraint::Fixed);
value = builder->CreateBitOrPointerCast(value, llvm_dst_fixed);
value = fixed_to_scalable_vector_type(value);
} else if (isa<FixedVectorType>(value->getType()) && isa<ScalableVectorType>(llvm_dst)) {
// Cannot bitcast/ptrtoint directly between fixed and scalable vectors.
// First cast to a fixed vector of the destination element type, then convert to scalable.
llvm::Type *llvm_dst_fixed = get_vector_type(llvm_dst->getScalarType(), dst.lanes(), VectorTypeConstraint::Fixed);
value = builder->CreateBitOrPointerCast(value, llvm_dst_fixed);
value = fixed_to_scalable_vector_type(value);
} else if (isa<ScalableVectorType>(value->getType()) && isa<FixedVectorType>(llvm_dst)) {
// Cannot bitcast/ptrtoint directly between scalable and fixed vectors.
// First convert to a fixed vector of the source element type, then cast.
value = scalable_to_fixed_vector_type(value);
value = builder->CreateBitOrPointerCast(value, llvm_dst);
} else {
// Our `Reinterpret` expr directly maps to LLVM IR bitcast/ptrtoint/inttoptr
// instructions with no additional handling required:
Expand Down Expand Up @@ -4314,10 +4325,12 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini
const int input_lanes = val.type().lanes();
const int input_bytes = input_lanes * val.type().bytes();
const int vscale = std::max(effective_vscale, 1);
// LLVM added VECREDUCE_MUL/FMUL lowering for SVE in LLVM 22.
const bool mul_ok = LLVM_VERSION >= 220 || effective_vscale == 0;
const bool llvm_has_intrinsic =
// Must be one of these ops
((op->op == VectorReduce::Add ||
op->op == VectorReduce::Mul ||
(op->op == VectorReduce::Mul && mul_ok) ||
op->op == VectorReduce::Min ||
op->op == VectorReduce::Max) &&
(use_llvm_vp_intrinsics ||
Expand Down Expand Up @@ -4920,6 +4933,13 @@ Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) {
// otherwise.
llvm::Type *scalar_type = vec->getType()->getScalarType();

if (scalar_type->isIntegerTy(1)) {
auto *result_type = cast<VectorType>(get_vector_type(scalar_type, size / effective_vscale, VectorTypeConstraint::VScale));
return handle_bool_as_i8(vec, result_type, [&](Value *v) {
return slice_vector(v, start, size);
});
}

int intermediate_lanes = std::min(size, vec_lanes - start);
llvm::Type *intermediate_type = get_vector_type(scalar_type, intermediate_lanes, VectorTypeConstraint::Fixed);

Expand Down Expand Up @@ -5190,6 +5210,18 @@ llvm::Value *CodeGen_LLVM::match_vector_type_scalable(llvm::Value *value, llvm::
return match_vector_type_scalable(value, guide->getType());
}

// Apply a vector transformation `fn` to a vector of i1 (boolean) values by
// round-tripping through i8: zero-extend `arg` to a same-shaped i8 vector,
// run `fn` on it, then truncate the result back to `result_i1_type`.
// Presumably this exists because some vector ops used by the callers do not
// accept i1 element types directly (notably with scalable vectors) — TODO confirm.
//
// `arg` must be a (fixed or scalable) vector value; the i8 intermediate
// preserves its scalability and minimum element count. `result_i1_type` is
// the caller-chosen i1 vector type of the final result; `fn` may change the
// element count, so the result shape is dictated by `result_i1_type`, not `arg`.
llvm::Value *CodeGen_LLVM::handle_bool_as_i8(llvm::Value *arg, llvm::VectorType *result_i1_type,
                                             const std::function<llvm::Value *(llvm::Value *)> &fn) {
    auto *arg_vty = cast<llvm::VectorType>(arg->getType());
    bool scalable = isa<llvm::ScalableVectorType>(arg_vty);
    // Minimum element count: for scalable vectors this is the per-vscale count.
    int min_elts = scalable ? cast<llvm::ScalableVectorType>(arg_vty)->getMinNumElements() : cast<llvm::FixedVectorType>(arg_vty)->getNumElements();
    auto constraint = scalable ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed;
    // Build an i8 vector type matching arg's shape and scalability.
    llvm::Type *arg_i8 = get_vector_type(i8_t, min_elts, constraint);
    llvm::Value *widened = builder->CreateZExt(arg, arg_i8);
    llvm::Value *result = fn(widened);
    // Narrow back to i1; values stay 0/1 because the input was zero-extended
    // from i1 and `fn` is expected to only move/reshape lanes — TODO confirm.
    return builder->CreateTrunc(result, result_i1_type);
}

llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *arg,
llvm::Type *desired_type) {
llvm::Type *arg_type = arg->getType();
Expand All @@ -5199,6 +5231,18 @@ llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *ar
}

internal_assert(arg_type->getScalarType() == desired_type->getScalarType());

if (arg_type->isVectorTy() && desired_type->isVectorTy() &&
arg_type->getScalarType()->isIntegerTy(1)) {
bool dst_scalable = isa<llvm::ScalableVectorType>(desired_type);
int dst_elts = get_vector_num_elements(desired_type);
llvm::Type *dst_i8 = get_vector_type(i8_t, dst_scalable ? dst_elts / effective_vscale : dst_elts,
dst_scalable ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed);
return handle_bool_as_i8(arg, cast<VectorType>(desired_type), [&](Value *v) {
return convert_fixed_or_scalable_vector_type(v, dst_i8);
});
}

if (!arg_type->isVectorTy()) {
arg = create_broadcast(arg, 1);
arg_type = arg->getType();
Expand Down Expand Up @@ -5280,6 +5324,12 @@ llvm::Value *CodeGen_LLVM::fixed_to_scalable_vector_type(llvm::Value *fixed_arg)
internal_assert(fixed_type->getElementType() == scalable_type->getElementType());
internal_assert(lanes == (scalable_type->getMinNumElements() * effective_vscale));

if (fixed_type->getElementType()->isIntegerTy(1)) {
return handle_bool_as_i8(fixed_arg, scalable_type, [&](Value *v) {
return fixed_to_scalable_vector_type(v);
});
}

// E.g. <vscale x 2 x i64> llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
const char *type_designator;
if (fixed_type->getElementType()->isIntegerTy()) {
Expand All @@ -5297,7 +5347,7 @@ llvm::Value *CodeGen_LLVM::fixed_to_scalable_vector_type(llvm::Value *fixed_arg)

std::vector<llvm::Value *> args;
args.push_back(result_vec);
args.push_back(value);
args.push_back(fixed_arg);
args.push_back(ConstantInt::get(i64_t, 0));

return simple_call_intrin(intrin, args, scalable_type);
Expand All @@ -5316,6 +5366,12 @@ llvm::Value *CodeGen_LLVM::scalable_to_fixed_vector_type(llvm::Value *scalable_a
internal_assert(fixed_type->getElementType() == scalable_type->getElementType());
internal_assert(fixed_type->getNumElements() == (scalable_type->getMinNumElements() * effective_vscale));

if (scalable_type->getElementType()->isIntegerTy(1)) {
return handle_bool_as_i8(scalable_arg, fixed_type, [&](Value *v) {
return scalable_to_fixed_vector_type(v);
});
}

// E.g. <64 x i8> @llvm.vector.extract.v64i8.nxv8i8(<vscale x 8 x i8> %vresult, i64 0)
const char *type_designator;
if (scalable_type->getElementType()->isIntegerTy()) {
Expand Down
Loading
Loading