diff --git a/.claude/skills/building/SKILL.md b/.claude/skills/building/SKILL.md index 7ff7be38df1..d1322cdecae 100644 --- a/.claude/skills/building/SKILL.md +++ b/.claude/skills/building/SKILL.md @@ -1,23 +1,223 @@ --- name: building -description: Build ExecuTorch runners or C++ libraries. Use when compiling runners for Llama, Whisper, or other models, or building the C++ runtime. +description: Build ExecuTorch from source — Python package, C++ runtime, runners, cross-compilation, and backend-specific builds. Use when compiling anything in the ExecuTorch repo, diagnosing build failures, or setting up platform-specific builds. --- -# Building +# Building ExecuTorch -## Runners (Makefile) +## Step 1: Ensure Python environment (detect and fix automatically) + +**Path A — conda (preferred):** +```bash +# Initialize conda for non-interactive shells (required in Claude Code / CI) +eval "$(conda shell.bash hook 2>/dev/null)" + +# Check if executorch conda env exists; create if not +conda env list 2>/dev/null | grep executorch || \ + ls "$(conda info --base 2>/dev/null)/envs/" 2>/dev/null | grep executorch || \ + conda create -yn executorch python=3.12 + +# Activate +conda activate executorch +``` + +**Path B — no conda (fall back to venv):** +```bash +# Find a compatible Python (3.10–3.13). On macOS with only Homebrew Python 3.14+, +# install a compatible version first: brew install python@3.12 +python3.12 -m venv .executorch-venv # or python3.11, python3.10, python3.13 +source .executorch-venv/bin/activate +pip install --upgrade pip +``` + +**Then verify (either path):** + +Run `python --version` and `cmake --version`. Fix automatically: +- **Python not 3.10–3.13**: recreate the env with a correct Python version. +- **cmake missing or < 3.24**: run `pip install 'cmake>=3.24'` inside the env. +- **cmake >= 4.0**: works in practice, no action needed. + +Parallel jobs: `$(sysctl -n hw.ncpu)` on macOS, `$(nproc)` on Linux. 
+ +## Step 2: Build + +Route based on what the user asks for: +- User mentions **Android** → skip to [Cross-compilation: Android](#cross-compilation) +- User mentions **iOS** or **frameworks** → skip to [Cross-compilation: iOS](#cross-compilation) +- User mentions a **model name** (llama, whisper, etc.) → skip to [LLM / ASR model runner](#llm--asr-model-runner-simplest-path-for-running-models) +- User mentions **C++ runtime** or **cmake** → skip to [C++ runtime](#c-runtime-standalone) +- Otherwise → default to **Python package** below + +### Python package (default) ```bash -make help # list all targets -make llama-cpu # Llama -make whisper-metal # Whisper on Metal -make gemma3-cuda # Gemma3 on CUDA +conda activate executorch +./install_executorch.sh --editable # editable install from source ``` +This handles everything: submodules, deps, C++ build, Python install. Takes ~10 min on Apple Silicon. + +For subsequent rebuilds (deps already present): `pip install -e . --no-build-isolation` + +For minimal install (skip example deps): `./install_executorch.sh --minimal` + +Enable additional backends: +```bash +CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON" ./install_executorch.sh --editable +``` + +Verify: `python -c "from executorch.exir import to_edge_transform_and_lower; print('OK')"` + +### LLM / ASR model runner (simplest path for running models) + +```bash +conda activate executorch +make - +``` + +Available targets (run `make help` for full list): + +| Target | Backend | macOS | Linux | +|--------|---------|-------|-------| +| `llama-cpu` | CPU | yes | yes | +| `llama-cuda` | CUDA | — | yes | +| `llama-cuda-debug` | CUDA (debug) | — | yes | +| `llava-cpu` | CPU | yes | yes | +| `whisper-cpu` | CPU | yes | yes | +| `whisper-metal` | Metal | yes | — | +| `whisper-cuda` | CUDA | — | yes | +| `parakeet-cpu` | CPU | yes | yes | +| `parakeet-metal` | Metal | yes | — | +| `parakeet-cuda` | CUDA | — | yes | +| `voxtral-cpu` | CPU | yes | yes | +| 
`voxtral-cuda` | CUDA | — | yes | +| `voxtral-metal` | Metal | yes | — | +| `voxtral_realtime-cpu` | CPU | yes | yes | +| `voxtral_realtime-cuda` | CUDA | — | yes | +| `voxtral_realtime-metal` | Metal | yes | — | +| `gemma3-cpu` | CPU | yes | yes | +| `gemma3-cuda` | CUDA | — | yes | +| `sortformer-cpu` | CPU | yes | yes | +| `sortformer-cuda` | CUDA | — | yes | +| `silero-vad-cpu` | CPU | yes | yes | +| `clean` | — | yes | yes | Output: `cmake-out/examples/models//` -## C++ Libraries (CMake) +### C++ runtime (standalone) + +**With presets (recommended):** + +| Platform | Command | +|----------|---------| +| macOS | `cmake -B cmake-out --preset macos` (uses Xcode generator — requires Xcode) | +| Linux | `cmake -B cmake-out --preset linux -DCMAKE_BUILD_TYPE=Release` | +| Windows | `cmake -B cmake-out --preset windows -T ClangCL` | + +Then: `cmake --build cmake-out --config Release -j$(sysctl -n hw.ncpu)` (macOS) or `cmake --build cmake-out -j$(nproc)` (Linux) + +**LLM libraries via workflow presets** (configure + build + install in one command): +```bash +cmake --workflow --preset llm-release # CPU +cmake --workflow --preset llm-release-metal # Metal (macOS) +cmake --workflow --preset llm-release-cuda # CUDA (Linux/Windows) +``` + +**Manual CMake (custom flags):** +```bash +cmake -B cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON +cmake --build cmake-out --parallel "$(nproc 2>/dev/null || sysctl -n hw.ncpu)" +``` + +Run `cmake --list-presets` to see all available presets. + +### Cross-compilation + +**iOS/macOS frameworks:** +```bash +./scripts/build_apple_frameworks.sh --coreml --mps --xnnpack +``` +Link in Xcode with `-all_load` linker flag. 
+ +**Android:** + +Requires `ANDROID_NDK` on PATH (typically set by Android Studio or standalone NDK install). ```bash -cmake --list-presets # list presets -cmake --workflow --preset llm-release # LLM CPU -cmake --workflow --preset llm-release-metal # LLM Metal +# Verify NDK is available +echo $ANDROID_NDK # must point to NDK root, e.g. ~/Library/Android/sdk/ndk/ +export ANDROID_ABIS=arm64-v8a BUILD_AAR_DIR=aar-out +mkdir -p $BUILD_AAR_DIR && sh scripts/build_android_library.sh ``` + +## Key build options + +Most commonly needed flags (full list: `CMakeLists.txt`): + +| Flag | What it enables | +|------|-----------------| +| `EXECUTORCH_BUILD_XNNPACK` | XNNPACK CPU backend | +| `EXECUTORCH_BUILD_COREML` | Core ML (macOS/iOS) | +| `EXECUTORCH_BUILD_MPS` | MPS GPU (macOS/iOS) | +| `EXECUTORCH_BUILD_METAL` | Metal compute (macOS, requires EXTENSION_TENSOR) | +| `EXECUTORCH_BUILD_CUDA` | CUDA GPU (Linux/Windows, requires EXTENSION_TENSOR) | +| `EXECUTORCH_BUILD_KERNELS_OPTIMIZED` | Optimized kernels | +| `EXECUTORCH_BUILD_KERNELS_QUANTIZED` | Quantized kernels | +| `EXECUTORCH_BUILD_EXTENSION_MODULE` | Module extension (requires DATA_LOADER + FLAT_TENSOR + NAMED_DATA_MAP) | +| `EXECUTORCH_BUILD_EXTENSION_LLM` | LLM extension | +| `EXECUTORCH_BUILD_TESTS` | Unit tests (`ctest --test-dir cmake-out --output-on-failure`) | +| `EXECUTORCH_BUILD_DEVTOOLS` | DevTools (Inspector, ETDump) | +| `EXECUTORCH_OPTIMIZE_SIZE` | Size-optimized build (`-Os`, no exceptions/RTTI) | +| `CMAKE_BUILD_TYPE` | `Release` or `Debug` (5-10x slower). Some presets (e.g. `llm-release`) set this; others require it explicitly. 
| + +## Troubleshooting + +| Symptom | Fix | +|---------|-----| +| Missing headers / `CMakeLists.txt not found` in third-party | `git submodule sync --recursive && git submodule update --init --recursive` | +| Mysterious failures after `git pull` or branch switch | `rm -rf cmake-out/ pip-out/ && git submodule sync && git submodule update --init --recursive` | +| `conda env list` PermissionError | Use `CONDA_NO_PLUGINS=true conda env list` or check env dir directly | +| CMake >= 4.0 | Works in practice despite `< 4.0` in docs; only fix if build actually fails | +| `externally-managed-environment` / PEP 668 error | You're using system Python, not conda. Activate conda env first. | +| pip conflicts with torch versions | Fresh conda env; or `./install_executorch.sh --use-pt-pinned-commit` | +| Missing `Python.h` (Linux) | `sudo apt install python3.X-dev` | +| Missing operator registrations at runtime | Link kernel libs with `-Wl,-force_load,` (macOS) or `-Wl,--whole-archive -Wl,--no-whole-archive` (Linux) | +| `install_executorch.sh` fails on Intel Mac | No prebuilt PyTorch wheels; use `--use-pt-pinned-commit --minimal` | +| XNNPACK build errors about cpuinfo/pthreadpool | Ensure `EXECUTORCH_BUILD_CPUINFO=ON` and `EXECUTORCH_BUILD_PTHREADPOOL=ON` (both ON by default) | +| Duplicate kernel registration abort | Only link one `gen_operators_lib` per target | + +## Build output + +**From `./install_executorch.sh` (Python package):** + +| Artifact | Location | +|----------|----------| +| Python package | `site-packages/executorch` | + +**From CMake builds** (`cmake --install` with `CMAKE_INSTALL_PREFIX=cmake-out`): + +| Artifact | Location | +|----------|----------| +| Core runtime | `cmake-out/lib/libexecutorch.a` | +| XNNPACK backend | `cmake-out/lib/libxnnpack_backend.a` | +| executor_runner | `cmake-out/executor_runner` (Ninja/Make) or `cmake-out/Release/executor_runner` (Xcode) | +| Model runners | `cmake-out/examples/models//` | + +**From cross-compilation:** + +| 
Artifact | Location | +|----------|----------| +| iOS frameworks | `cmake-out/*.xcframework` | +| Android AAR | `aar-out/` | + +## Tips +- Always use `Release` for benchmarking; `Debug` is 5–10x slower +- `ccache` is auto-detected if installed (`brew install ccache`) +- `Ninja` is faster than Make (`-G Ninja`) — but `--preset macos` uses Xcode generator +- For LLM workflows, `make -` is the simplest path +- After `git pull`, clean and re-init submodules before rebuilding diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index d88996ff8cb..0652c805b53 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -1057,7 +1057,8 @@ jobs: test-samsung-quantmodels-linux: name: test-samsung-quantmodels-linux - # if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request' + # Skip this job if the pull request is from a fork (secrets are not available) + if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request' uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write @@ -1094,7 +1095,8 @@ jobs: test-samsung-models-linux: name: test-samsung-models-linux - # if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request' + # Skip this job if the pull request is from a fork (secrets are not available) + if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request' uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write diff --git a/backends/cadence/build_cadence_runner.sh b/backends/cadence/build_cadence_runner.sh index a8f44719dc7..82968b196b3 100755 --- a/backends/cadence/build_cadence_runner.sh +++ b/backends/cadence/build_cadence_runner.sh @@ -31,12 +31,19 @@ main() { local example_dir=backends/cadence local build_dir="cmake-out/${example_dir}" - local 
cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" + # Detect lib vs lib64 + if [ -d "${PWD}/cmake-out/lib64/cmake/ExecuTorch" ]; then + libdir="lib64" + else + libdir="lib" + fi + local cmake_prefix_path="${PWD}/cmake-out/${libdir}/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" rm -rf ${build_dir} CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \ -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_CADENCE_CPU_RUNNER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DPYTHON_EXECUTABLE="$(which python3)" \ -B"${build_dir}" \ "${example_dir}" cmake --build "${build_dir}" --config Release -j16 diff --git a/backends/cadence/generic/operators/CMakeLists.txt b/backends/cadence/generic/operators/CMakeLists.txt index b9afdc01cde..77d0b4949a3 100644 --- a/backends/cadence/generic/operators/CMakeLists.txt +++ b/backends/cadence/generic/operators/CMakeLists.txt @@ -79,21 +79,9 @@ target_include_directories( ) # Custom ops that are needed to run the test model. -add_library( - custom_ops - "quantized_add_out.cpp" - "quantized_linear_out.cpp" - "quantized_conv2d_nchw_out.cpp" - "quantized_conv2d_nhwc_out.cpp" - "quantized_relu_out.cpp" - "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" - "quantized_fully_connected_out.cpp" - "dequantize_per_tensor.cpp" - "quantized_matmul_out.cpp" - "op_requantize_out.cpp" - "im2row_out.cpp" -) +file(GLOB custom_ops_srcs "*.cpp") +add_library(custom_ops ${custom_ops_srcs}) + target_include_directories( custom_ops PUBLIC ${ROOT_DIR}/.. 
${CMAKE_BINARY_DIR} ${_common_include_directories} diff --git a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h index 4b9fdaebdf7..4672f05e777 100644 --- a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h +++ b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h @@ -7,10 +7,8 @@ */ #pragma once -#include "cortex_m_ops_common.h" -extern "C" { #include "arm_nnfunctions.h" -} +#include "cortex_m_ops_common.h" namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/cortex_m_ops_common.h b/backends/cortex_m/ops/cortex_m_ops_common.h index 1b31367881f..4c0f83d6eb6 100644 --- a/backends/cortex_m/ops/cortex_m_ops_common.h +++ b/backends/cortex_m/ops/cortex_m_ops_common.h @@ -16,12 +16,12 @@ #include #include +#include #include #include -extern "C" { #include "arm_nn_types.h" -} +#include "arm_nnfunctions.h" using Tensor = torch::executor::Tensor; using ScalarType = executorch::aten::ScalarType; @@ -47,19 +47,19 @@ inline void validate_cmsis_nn_tensor_requirements( // Basic dtype validation ET_CHECK_MSG( input1.scalar_type() == expected_dtype, - "Input1 dtype must be %hhd, got %hhd", - expected_dtype, - input1.scalar_type()); + "Input1 dtype must be %d, got %d", + static_cast(expected_dtype), + static_cast(input1.scalar_type())); ET_CHECK_MSG( input2.scalar_type() == expected_dtype, - "Input2 dtype must be %hhd, got %hhd", - expected_dtype, - input2.scalar_type()); + "Input2 dtype must be %d, got %d", + static_cast(expected_dtype), + static_cast(input2.scalar_type())); ET_CHECK_MSG( output.scalar_type() == expected_dtype, - "Output dtype must be %hhd, got %hhd", - expected_dtype, - output.scalar_type()); + "Output dtype must be %d, got %d", + static_cast(expected_dtype), + static_cast(output.scalar_type())); if (require_same_sizes) { ET_CHECK_MSG( input1.sizes() == input2.sizes(), @@ -78,16 +78,17 @@ inline void validate_single_quant_params( const int64_t multiplier, const int64_t shift, 
const char* param_name) { + (void)zero_point; ET_CHECK_MSG( multiplier >= std::numeric_limits::min() && multiplier <= std::numeric_limits::max(), - "%s multiplier must be in int32 range [Value: %d]", + "%s multiplier must be in int32 range [Value: %" PRIi64 "]", param_name, multiplier); ET_CHECK_MSG( shift >= -31 && shift <= 31, - "%s shift must be in range [-31, 31] [Value: %d]", + "%s shift must be in range [-31, 31] [Value: %" PRIi64 "]", param_name, shift); } @@ -172,7 +173,7 @@ inline bool check_int32_within_range( value > std::numeric_limits::max()) { ET_LOG( Error, - "%s: %s value (%ld) exceeds int32_t range", + "%s: %s value (%" PRIi64 ") exceeds int32_t range", op_name, value_name, value); @@ -354,14 +355,14 @@ inline bool validate_per_channel_quant_params( if (multipliers[i] <= ARM_NN_Q31_MIN || multipliers[i] > ARM_NN_Q31_MAX) { ET_LOG( Error, - "weight_multiplier[%d] out of CMSIS-NN range: %d", + "weight_multiplier[%d] out of CMSIS-NN range: %" PRIi64, i, multipliers[i]); return false; } // Shift: {-31, 30} for arm_nn_requantize if (shifts[i] < -31 || shifts[i] > 30) { - ET_LOG(Error, "weight_shift[%d] out of range: %d", i, shifts[i]); + ET_LOG(Error, "weight_shift[%d] out of range: %" PRIi64, i, shifts[i]); return false; } } diff --git a/backends/cortex_m/ops/op_maximum.cpp b/backends/cortex_m/ops/op_maximum.cpp index 71a907f12ea..fc76f5c8c48 100644 --- a/backends/cortex_m/ops/op_maximum.cpp +++ b/backends/cortex_m/ops/op_maximum.cpp @@ -7,11 +7,6 @@ #include "cortex_m_ops_common.h" -// Include CMSIS-NN headers with C linkage -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/op_minimum.cpp b/backends/cortex_m/ops/op_minimum.cpp index f220aa2664b..5a75cb8a1dc 100644 --- a/backends/cortex_m/ops/op_minimum.cpp +++ b/backends/cortex_m/ops/op_minimum.cpp @@ -9,11 +9,6 @@ #include "cortex_m_ops_common.h" -// Include CMSIS-NN headers with C linkage -extern "C" { -#include 
"arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/op_pad.cpp b/backends/cortex_m/ops/op_pad.cpp index 739c584c419..b400f4c7e19 100644 --- a/backends/cortex_m/ops/op_pad.cpp +++ b/backends/cortex_m/ops/op_pad.cpp @@ -8,10 +8,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/op_quantized_add.cpp b/backends/cortex_m/ops/op_quantized_add.cpp index 2cab7dc37fb..b4bbfdaffce 100644 --- a/backends/cortex_m/ops/op_quantized_add.cpp +++ b/backends/cortex_m/ops/op_quantized_add.cpp @@ -9,11 +9,6 @@ #include "cortex_m_ops_common.h" -// Include CMSIS-NN headers with C linkage -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { using KernelRuntimeContext = torch::executor::KernelRuntimeContext; diff --git a/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp b/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp index ad77bb54aff..293c6ea6957 100644 --- a/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp +++ b/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp @@ -7,10 +7,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp index 3eae9507ba7..0fa6a3f8536 100644 --- a/backends/cortex_m/ops/op_quantized_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp @@ -7,10 +7,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp index b3cf926c2e1..8dec61e0af1 100644 --- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp @@ -7,10 +7,6 @@ #include 
"cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/op_quantized_linear.cpp b/backends/cortex_m/ops/op_quantized_linear.cpp index f04b65fa1fb..5d018cbc0c4 100644 --- a/backends/cortex_m/ops/op_quantized_linear.cpp +++ b/backends/cortex_m/ops/op_quantized_linear.cpp @@ -9,10 +9,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { using KernelRuntimeContext = torch::executor::KernelRuntimeContext; diff --git a/backends/cortex_m/ops/op_quantized_max_pool2d.cpp b/backends/cortex_m/ops/op_quantized_max_pool2d.cpp index 470a7ae791e..181a29c1b65 100644 --- a/backends/cortex_m/ops/op_quantized_max_pool2d.cpp +++ b/backends/cortex_m/ops/op_quantized_max_pool2d.cpp @@ -7,10 +7,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/op_quantized_mul.cpp b/backends/cortex_m/ops/op_quantized_mul.cpp index 3d9d6ab54a4..524e74a6b9f 100644 --- a/backends/cortex_m/ops/op_quantized_mul.cpp +++ b/backends/cortex_m/ops/op_quantized_mul.cpp @@ -7,11 +7,6 @@ #include "cortex_m_ops_common.h" -// Include CMSIS-NN headers with C linkage -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { namespace { diff --git a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp index 7126a2b2cf7..e3f6135c7b9 100644 --- a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp @@ -8,10 +8,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/op_softmax.cpp b/backends/cortex_m/ops/op_softmax.cpp index a2b8f27fac1..c07a538db84 100644 --- 
a/backends/cortex_m/ops/op_softmax.cpp +++ b/backends/cortex_m/ops/op_softmax.cpp @@ -11,11 +11,6 @@ #include #include -// Include CMSIS-NN headers with C linkage -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/op_transpose.cpp b/backends/cortex_m/ops/op_transpose.cpp index 25458435a3c..7fcbc034283 100644 --- a/backends/cortex_m/ops/op_transpose.cpp +++ b/backends/cortex_m/ops/op_transpose.cpp @@ -11,11 +11,6 @@ #include #include -// Include CMSIS-NN headers with C linkage -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/xnnpack/runtime/XNNWorkspace.h b/backends/xnnpack/runtime/XNNWorkspace.h index 507953a10ab..b7ef442c460 100644 --- a/backends/xnnpack/runtime/XNNWorkspace.h +++ b/backends/xnnpack/runtime/XNNWorkspace.h @@ -34,6 +34,9 @@ class XNNWorkspace { XNNWorkspace& operator=(XNNWorkspace&&) = delete; std::pair, xnn_workspace_t> acquire() { + if (!lock_required_) { + return {std::unique_lock{}, workspace_.get()}; + } auto lock = std::unique_lock(mutex_); return {std::move(lock), workspace_.get()}; } @@ -52,6 +55,10 @@ class XNNWorkspace { return id_; } + void disable_locking() { + lock_required_ = false; + } + static runtime::Result> create() { // Because this class can't be moved, we need to construct it in-place. 
 xnn_workspace_t workspace = nullptr; @@ -72,6 +79,7 @@ static inline std::atomic<uint64_t> next_id_{0}; std::mutex mutex_; uint64_t id_; + bool lock_required_ = true; WorkspacePtr workspace_; }; diff --git a/backends/xnnpack/runtime/XNNWorkspaceManager.cpp b/backends/xnnpack/runtime/XNNWorkspaceManager.cpp index d8c6dae4d6d..5af3395ed89 100644 --- a/backends/xnnpack/runtime/XNNWorkspaceManager.cpp +++ b/backends/xnnpack/runtime/XNNWorkspaceManager.cpp @@ -56,6 +56,7 @@ XNNWorkspaceManager::get_or_create_workspace(uintptr_t program_id) const { return create_result.error(); } + create_result.get()->disable_locking(); return create_result.get(); } else if (mode == WorkspaceSharingMode::PerModel) { return get_or_create_model_workspace(program_id); diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt index 395fb01d189..3d9c77d6ad6 100644 --- a/backends/xnnpack/test/CMakeLists.txt +++ b/backends/xnnpack/test/CMakeLists.txt @@ -17,7 +17,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) -set(_test_srcs runtime/test_xnnexecutor.cpp +set(_test_srcs runtime/test_xnnexecutor.cpp runtime/test_workspace_manager.cpp ${EXECUTORCH_ROOT}/extension/threadpool/test/threadpool_test.cpp ) diff --git a/backends/xnnpack/test/runtime/test_workspace_manager.cpp b/backends/xnnpack/test/runtime/test_workspace_manager.cpp index 8d3203f3f40..a7689966635 100644 --- a/backends/xnnpack/test/runtime/test_workspace_manager.cpp +++ b/backends/xnnpack/test/runtime/test_workspace_manager.cpp @@ -107,6 +107,18 @@ TEST_F(XNNWorkspaceManagerTest, DisabledMode) { workspace2->unsafe_get_workspace(), workspace3->unsafe_get_workspace()); } +TEST_F(XNNWorkspaceManagerTest, DisabledModeAcquireDoesNotLock) { + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Disabled); + + auto workspace_result = workspace_manager_->get_or_create_workspace(12345); + ASSERT_TRUE(workspace_result.ok()); + auto workspace = workspace_result.get(); + + auto [lock, ptr] = workspace->acquire(); + ASSERT_NE(ptr, nullptr); + EXPECT_FALSE(lock.owns_lock()); +} + TEST_F(XNNWorkspaceManagerTest, PerModelMode) { // In PerModel mode, calls with the same program_id should return the same // workspace. @@ -139,6 +151,18 @@ TEST_F(XNNWorkspaceManagerTest, PerModelMode) { workspace1->unsafe_get_workspace(), workspace3->unsafe_get_workspace()); } +TEST_F(XNNWorkspaceManagerTest, PerModelAcquireStillLocks) { + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::PerModel); + + auto workspace_result = workspace_manager_->get_or_create_workspace(12345); + ASSERT_TRUE(workspace_result.ok()); + auto workspace = workspace_result.get(); + + auto [lock, ptr] = workspace->acquire(); + ASSERT_NE(ptr, nullptr); + EXPECT_TRUE(lock.owns_lock()); +} + TEST_F(XNNWorkspaceManagerTest, GlobalMode) { // In Global mode, all calls should return the same workspace. 
workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Global); diff --git a/examples/arm/image_classification_example_ethos_u/runtime/CMakeLists.txt b/examples/arm/image_classification_example_ethos_u/runtime/CMakeLists.txt index 9d9f0645bd5..6704c0d6fda 100644 --- a/examples/arm/image_classification_example_ethos_u/runtime/CMakeLists.txt +++ b/examples/arm/image_classification_example_ethos_u/runtime/CMakeLists.txt @@ -118,9 +118,11 @@ set(LINK_FILE_OUT # Shared_Sram, in the application, we set ETHOSU_ARENA to 0 so that the # intermediate tensors are placed in the SRAM. If you generate a pte for a # different memory mode, you need to change the placement in the linker script. -# Read -# https://docs.pytorch.org/executorch/stable/backends-arm-ethos-u.html#ethos-u-memory-modes -# for more information. +# For more information, see the stable documentation: +# https://docs.pytorch.org/executorch/stable/backends/arm-ethos-u/arm-ethos-u-overview.html#ethos-u-memory-modes + +# For 1.0 compatibility (if required) +# https://docs.pytorch.org/executorch/1.0/backends-arm-ethos-u.html#ethos-u-memory-modes set(ETHOSU_ARENA "0") # Generate linker script - we have a few if/else statements in # Corstone-320.ld/Corstone-300.ld that are compiled into a final linker script. diff --git a/examples/qualcomm/custom_op/custom_ops_1.py b/examples/qualcomm/custom_op/custom_ops_1.py index 31b3b6ff3ec..ed99eabc9c8 100644 --- a/examples/qualcomm/custom_op/custom_ops_1.py +++ b/examples/qualcomm/custom_op/custom_ops_1.py @@ -70,11 +70,10 @@ def annotate_custom(gm: torch.fx.GraphModule) -> None: This function is specific for custom op. 
The source_fn of the rewritten nn module turns out to be "my_ops.mul3.default" """ - from executorch.backends.qualcomm.quantizer.annotators import _is_annotated - from executorch.backends.qualcomm.quantizer.qconfig import ( get_ptq_per_channel_quant_config, ) + from executorch.backends.qualcomm.quantizer.rules import _is_annotated from torch.fx import Node from torchao.quantization.pt2e.quantizer import QuantizationAnnotation from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY diff --git a/examples/qualcomm/oss_scripts/fastvit.py b/examples/qualcomm/oss_scripts/fastvit.py index 3e620ab0300..87d90bb61b7 100644 --- a/examples/qualcomm/oss_scripts/fastvit.py +++ b/examples/qualcomm/oss_scripts/fastvit.py @@ -12,16 +12,13 @@ import numpy as np import torch -from executorch.backends.qualcomm.quantizer.annotators import ( - QuantizationConfig, - QuantizationSpec, -) from executorch.backends.qualcomm.quantizer.observers.per_channel_param_observer import ( PerChannelParamObserver, ) from executorch.backends.qualcomm.quantizer.qconfig import ( _derived_bias_quant_spec, MovingAverageMinMaxObserver, + QuantizationConfig, ) from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype @@ -40,6 +37,7 @@ SimpleADB, topk_accuracy, ) +from torchao.quantization.pt2e.quantizer import QuantizationSpec def get_instance(repo_path: str, checkpoint_path: str): diff --git a/extension/apple/BUCK b/extension/apple/BUCK index 05371edfbdb..5fca78dd7c2 100644 --- a/extension/apple/BUCK +++ b/extension/apple/BUCK @@ -1,6 +1,5 @@ load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target") load("@fbsource//tools/build_defs:platform_defs.bzl", "IOS") -load("@fbsource//tools/build_defs/apple:autoglob.bzl", "EXPORT_UNLESS_INTERNAL") load("@fbsource//tools/build_defs/apple:fb_apple_library.bzl", "fb_apple_library") load("@fbsource//tools/build_defs/apple:fb_apple_resource.bzl", "fb_apple_resource") 
load("@fbsource//xplat/executorch/build/fb:clients.bzl", "EXECUTORCH_CLIENTS") diff --git a/extension/llm/apple/BUCK b/extension/llm/apple/BUCK index 26dd36145ba..36da3c77935 100644 --- a/extension/llm/apple/BUCK +++ b/extension/llm/apple/BUCK @@ -1,6 +1,5 @@ load("@fbcode_macros//build_defs:build_file_migration.bzl", "non_fbcode_target") load("@fbsource//tools/build_defs:platform_defs.bzl", "IOS") -load("@fbsource//tools/build_defs/apple:autoglob.bzl", "EXPORT_UNLESS_INTERNAL") load("@fbsource//tools/build_defs/apple:fb_apple_library.bzl", "fb_apple_library") load("@fbsource//xplat/executorch/build/fb:clients.bzl", "EXECUTORCH_CLIENTS") load("@fbsource//tools/build_defs/apple:fb_apple_resource.bzl", "fb_apple_resource") diff --git a/kernels/portable/cpu/op_constant_pad_nd.cpp b/kernels/portable/cpu/op_constant_pad_nd.cpp index d3f3fdd75d7..2127cca3d5c 100644 --- a/kernels/portable/cpu/op_constant_pad_nd.cpp +++ b/kernels/portable/cpu/op_constant_pad_nd.cpp @@ -51,9 +51,17 @@ void apply_padding_to_dim( size_t pad_before = 0; size_t pad_after = 0; - if (pad_i >= 0 && pad_i < pad.size() / 2) { - pad_before = pad[2 * pad_i]; - pad_after = pad[2 * pad_i + 1]; + if (pad_i < pad.size() / 2) { + int64_t pb = pad[2 * pad_i]; + int64_t pa = pad[2 * pad_i + 1]; + ET_KERNEL_CHECK_MSG( + ctx, + pb >= 0 && pa >= 0, + InvalidArgument, + /* void */, + "Padding values must be non-negative."); + pad_before = static_cast(pb); + pad_after = static_cast(pa); } size_t out_step_len = out_strides[dim]; @@ -62,6 +70,12 @@ void apply_padding_to_dim( // Do not copy padding beyond the out tensor bounds. // Use division to avoid potential overflow in multiplication. if (pad_before > 0) { + ET_KERNEL_CHECK_MSG( + ctx, + out_data <= out_data_end, + InvalidArgument, + /* void */, + "Out data pointer exceeds buffer bounds."); size_t remaining = out_data_end - out_data; ET_KERNEL_CHECK_MSG( ctx, @@ -92,7 +106,12 @@ void apply_padding_to_dim( /* void */, "Out tensor overlaps with the input tensor. 
This is not supported."); // Bounds check before memcpy - // Use overflow-safe check for remaining >= copy_len + ET_KERNEL_CHECK_MSG( + ctx, + out_data <= out_data_end, + InvalidArgument, + /* void */, + "Out data pointer exceeds buffer bounds."); size_t remaining = out_data_end - out_data; ET_KERNEL_CHECK_MSG( ctx, @@ -123,6 +142,10 @@ void apply_padding_to_dim( last_padded_dim, dim + 1); + if (ctx.failure_state() != Error::Ok) { + return; + } + out_data += out_step_len; self_data += in_step_len; } @@ -131,6 +154,12 @@ void apply_padding_to_dim( // Do not copy padding beyond the out tensor bounds. // Use division to avoid potential overflow in multiplication. if (pad_after > 0) { + ET_KERNEL_CHECK_MSG( + ctx, + out_data <= out_data_end, + InvalidArgument, + /* void */, + "Out data pointer exceeds buffer bounds."); size_t remaining = out_data_end - out_data; ET_KERNEL_CHECK_MSG( ctx, diff --git a/kernels/portable/cpu/util/kernel_ops_util.cpp b/kernels/portable/cpu/util/kernel_ops_util.cpp index daa85f6beec..46fac7bde39 100644 --- a/kernels/portable/cpu/util/kernel_ops_util.cpp +++ b/kernels/portable/cpu/util/kernel_ops_util.cpp @@ -564,6 +564,14 @@ bool check_constant_pad_args( pad.size() / 2, in.dim()); + for (size_t i = 0; i < pad.size(); ++i) { + ET_CHECK_OR_RETURN_FALSE( + pad[i] >= 0, + "Padding values must be non-negative, but got pad[%zu] = %" PRId64, + i, + pad[i]); + } + return true; } diff --git a/shim_et/xplat/executorch/build/runtime_wrapper.bzl b/shim_et/xplat/executorch/build/runtime_wrapper.bzl index 92fafc78bab..01004595ff1 100644 --- a/shim_et/xplat/executorch/build/runtime_wrapper.bzl +++ b/shim_et/xplat/executorch/build/runtime_wrapper.bzl @@ -123,6 +123,16 @@ def _patch_build_mode_flags(kwargs): # @oss-disable: "fbsource//xplat/assistant/oacr/native/scripts:compiler_flag_O2": ["-O2"], }) + # Add pthread flags for Emscripten/WASM builds with threading support. + # Required when linking into WASM binaries that use -sUSE_PTHREADS=1. 
+ # Without these flags, wasm-ld fails with: + # "error: --shared-memory is disallowed by .o because it was not + # compiled with 'atomics' or 'bulk-memory' features." + kwargs["compiler_flags"] = kwargs["compiler_flags"] + select({ + "DEFAULT": [], + # @oss-disable: "ovr_config//runtime:wasm-emscripten": ["-pthread", "-matomics", "-mbulk-memory"], + }) + return kwargs def _has_pytorch_dep(dep_list):