diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 7dd16f856cd..3770189b447 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -54,13 +54,13 @@ case "${IMAGE_NAME}" in executorch-ubuntu-22.04-mediatek-sdk) MEDIATEK_SDK=yes CLANG_VERSION=12 - ANDROID_NDK_VERSION=r27b + ANDROID_NDK_VERSION=r28c ;; executorch-ubuntu-22.04-clang12-android) LINTRUNNER="" CLANG_VERSION=12 # From https://developer.android.com/ndk/downloads - ANDROID_NDK_VERSION=r27b + ANDROID_NDK_VERSION=r28c ;; *) echo "Invalid image name ${IMAGE_NAME}" diff --git a/.ci/docker/ci_commit_pins/optimum-executorch.txt b/.ci/docker/ci_commit_pins/optimum-executorch.txt index ef3282ba6cc..49b079047a3 100644 --- a/.ci/docker/ci_commit_pins/optimum-executorch.txt +++ b/.ci/docker/ci_commit_pins/optimum-executorch.txt @@ -1 +1 @@ -40b02a2dc61bbf901a2df91719f47c98d65368ec +44d8d54e38c0258357d4e92e1fefe21e845947a3 diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 8c9330d6f2c..aafc7565373 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -4d4abec80f03cd8fdefe1d9cb3a60d3690cd777e +cf9d09490c7f6685ec68d5db3acf2e0d73c54d00 diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index dcd2afa7a13..5527b9b4d6d 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -16,18 +16,21 @@ hypothesis==6.84.2 parameterized==0.9.0 # Doc build requirements, same as https://github.com/pytorch/pytorch/blob/main/.ci/docker/requirements-docs.txt -sphinx==5.3.0 +sphinx==7.2.6 +sphinxcontrib.katex==0.9.10 +breathe==4.36.0 # only if generating C++ +exhale==0.3.7 # only if generating C++ docs +docutils==0.18.1,<0.21 +sphinx-design==0.6.1 +sphinxcontrib-mermaid==1.0.0 +myst-parser==3.0.1 # if want to contribute in markdown +sphinx-gallery==0.14.0 # only if hosting interactive tutorials +sphinx-sitemap==2.7.1 sphinx-reredirects==0.1.4 -sphinx-gallery==0.14.0 
-breathe==4.34.0 -exhale==0.2.3 -docutils==0.16 matplotlib>=3.9.4 +sphinx-copybutton==0.5.2 # PyTorch Theme --e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme -myst-parser==0.18.1 -sphinx_design==0.4.1 -sphinx-copybutton==0.5.0 +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2 # script unit test requirements yaspin==3.1.0 diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index 7f34e8afb63..30835cf5085 100755 --- a/.ci/scripts/build-qnn-sdk.sh +++ b/.ci/scripts/build-qnn-sdk.sh @@ -38,6 +38,7 @@ set_up_aot() { -DEXECUTORCH_BUILD_EXTENSION_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_EXTENSION_EXTENSION_LLM_RUNNER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 diff --git a/.ci/scripts/setup-openvino.sh b/.ci/scripts/setup-openvino.sh index ff667619125..587494f46ac 100755 --- a/.ci/scripts/setup-openvino.sh +++ b/.ci/scripts/setup-openvino.sh @@ -10,19 +10,17 @@ set -ex # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" -git clone https://github.com/openvinotoolkit/openvino.git -cd openvino && git checkout releases/2025/1 -git submodule update --init --recursive -sudo ./install_build_dependencies.sh -mkdir build && cd build -cmake .. -DCMAKE_BUILD_TYPE=Release -DENABLE_PYTHON=ON -make -j$(nproc) +# Download and install OpenVINO from release packages +OPENVINO_VERSION="2025.3" +OPENVINO_BUILD="2025.3.0.19807.44526285f24" +OPENVINO_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION}/linux/openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64.tgz" -cd .. 
-cmake --install build --prefix dist +curl -Lo /tmp/openvino_toolkit.tgz --retry 3 --fail ${OPENVINO_URL} +tar -xzf /tmp/openvino_toolkit.tgz +mv openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64 openvino -source dist/setupvars.sh -cd ../backends/openvino +source openvino/setupvars.sh +cd backends/openvino pip install -r requirements.txt cd scripts ./openvino_build.sh --enable_python diff --git a/.ci/scripts/setup-samsung-linux-deps.sh b/.ci/scripts/setup-samsung-linux-deps.sh index ed704b2bfbd..c1f2912713b 100644 --- a/.ci/scripts/setup-samsung-linux-deps.sh +++ b/.ci/scripts/setup-samsung-linux-deps.sh @@ -11,9 +11,9 @@ set -ex download_ai_lite_core() { API_BASE="https://soc-developer.semiconductor.samsung.com/api/v1/resource/ai-litecore/download" - API_KEY="kn10SoSY3hkC-9Qny5TqD2mnqVrlupv3krnjLeBt5cY" + API_KEY=$SAMSUNG_AI_LITECORE_KEY - VERSION="0.5" + VERSION="0.7" OS_NAME="Ubuntu 22.04" OUT_FILE="/tmp/exynos-ai-litecore-v${VERSION}.tar.gz" TARGET_PATH="/tmp/exynos_ai_lite_core" @@ -52,7 +52,7 @@ download_ai_lite_core() { install_enn_backend() { NDK_INSTALLATION_DIR=/opt/ndk rm -rf "${NDK_INSTALLATION_DIR}" && sudo mkdir -p "${NDK_INSTALLATION_DIR}" - ANDROID_NDK_VERSION=r27b + ANDROID_NDK_VERSION=r28c # build Exynos backend export ANDROID_NDK_ROOT=${ANDROID_NDK_ROOT:-/opt/ndk} @@ -62,7 +62,7 @@ install_enn_backend() { export PYTHONPATH=${PYTHONPATH:-}:${EXECUTORCH_ROOT}/.. } -AI_LITE_CORE_VERSION=0.5.0 +AI_LITE_CORE_VERSION=0.7.0 download_ai_lite_core ${AI_LITE_CORE_VERSION} install_enn_backend diff --git a/.ci/scripts/test-cuda-build.sh b/.ci/scripts/test-cuda-build.sh new file mode 100755 index 00000000000..bae7dd6af16 --- /dev/null +++ b/.ci/scripts/test-cuda-build.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -exu + +CUDA_VERSION=${1:-"12.6"} + +echo "=== Testing ExecuTorch CUDA ${CUDA_VERSION} Build ===" + +# Function to build and test ExecuTorch with CUDA support +test_executorch_cuda_build() { + local cuda_version=$1 + + echo "Building ExecuTorch with CUDA ${cuda_version} support..." + echo "ExecuTorch will automatically detect CUDA and install appropriate PyTorch wheel" + + # Check available resources before starting + echo "=== System Information ===" + echo "Available memory: $(free -h | grep Mem | awk '{print $2}')" + echo "Available disk space: $(df -h . | tail -1 | awk '{print $4}')" + echo "CPU cores: $(nproc)" + echo "CUDA version check:" + nvcc --version || echo "nvcc not found" + nvidia-smi || echo "nvidia-smi not found" + + # Set CMAKE_ARGS to enable CUDA build - ExecuTorch will handle PyTorch installation automatically + export CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" + + echo "=== Starting ExecuTorch Installation ===" + # Install ExecuTorch with CUDA support with timeout and error handling + timeout 5400 ./install_executorch.sh || { + local exit_code=$? 
+ echo "ERROR: install_executorch.sh failed with exit code: $exit_code" + if [ $exit_code -eq 124 ]; then + echo "ERROR: Installation timed out after 90 minutes" + fi + exit $exit_code + } + + echo "SUCCESS: ExecuTorch CUDA build completed" + + # Verify the installation + echo "=== Verifying ExecuTorch CUDA Installation ===" + + # Test that ExecuTorch was built successfully + python -c " +import executorch +print('SUCCESS: ExecuTorch imported successfully') +" + + # Test CUDA availability and show details + python -c " +try: + import torch + print('INFO: PyTorch version:', torch.__version__) + print('INFO: CUDA available:', torch.cuda.is_available()) + + if torch.cuda.is_available(): + print('SUCCESS: CUDA is available for ExecuTorch') + print('INFO: CUDA version:', torch.version.cuda) + print('INFO: GPU device count:', torch.cuda.device_count()) + print('INFO: Current GPU device:', torch.cuda.current_device()) + print('INFO: GPU device name:', torch.cuda.get_device_name()) + + # Test basic CUDA tensor operation + device = torch.device('cuda') + x = torch.randn(10, 10).to(device) + y = torch.randn(10, 10).to(device) + z = torch.mm(x, y) + print('SUCCESS: CUDA tensor operation completed on device:', z.device) + print('INFO: Result tensor shape:', z.shape) + + print('SUCCESS: ExecuTorch CUDA integration verified') + else: + print('WARNING: CUDA not detected, but ExecuTorch built successfully') + exit(1) +except Exception as e: + print('ERROR: ExecuTorch CUDA test failed:', e) + exit(1) +" + + echo "SUCCESS: ExecuTorch CUDA ${cuda_version} build and verification completed successfully" +} + +# Main execution +echo "Current working directory: $(pwd)" +echo "Directory contents:" +ls -la + +# Run the CUDA build test +test_executorch_cuda_build "${CUDA_VERSION}" diff --git a/.ci/scripts/test_backend_linux.sh b/.ci/scripts/test_backend.sh similarity index 57% rename from .ci/scripts/test_backend_linux.sh rename to .ci/scripts/test_backend.sh index d230860875d..a48cc9ec41a 
100755 --- a/.ci/scripts/test_backend_linux.sh +++ b/.ci/scripts/test_backend.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -10,16 +11,26 @@ SUITE=$1 FLOW=$2 ARTIFACT_DIR=$3 -REPORT_FILE="$ARTIFACT_DIR/test-report-$FLOW-$SUITE.csv" +REPORT_FILE="$ARTIFACT_DIR/test-report-$FLOW-$SUITE.json" echo "Running backend test job for suite $SUITE, flow $FLOW." echo "Saving job artifacts to $ARTIFACT_DIR." -# The generic Linux job chooses to use base env, not the one setup by the image eval "$(conda shell.bash hook)" CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" +if [[ "$(uname)" == "Darwin" ]]; then + bash .ci/scripts/setup-conda.sh + eval "$(conda shell.bash hook)" + CONDA_RUN_CMD="${CONDA_RUN} --no-capture-output" + ${CONDA_RUN_CMD} pip install awscli==1.37.21 + IS_MACOS=1 +else + CONDA_RUN_CMD="" + IS_MACOS=0 +fi + export PYTHON_EXECUTABLE=python # CMake options to use, in addition to the defaults. @@ -48,13 +59,23 @@ fi if [[ "$FLOW" == *arm* ]]; then # Setup ARM deps. .ci/scripts/setup-arm-baremetal-tools.sh + source examples/arm/ethos-u-scratch/setup_path.sh + + if [[ "$FLOW" == *ethos_u* ]]; then + # Prepare a test runner binary that can run on the Corstone-3x0 FVPs + backends/arm/scripts/build_executorch.sh + backends/arm/test/setup_testing.sh + fi fi -# We need the runner to test the built library. 
-PYTHON_EXECUTABLE=python CMAKE_ARGS="$EXTRA_BUILD_ARGS" .ci/scripts/setup-linux.sh --build-tool cmake --build-mode Release --editable true
+if [[ $IS_MACOS -eq 1 ]]; then
+  SETUP_SCRIPT=.ci/scripts/setup-macos.sh
+else
+  SETUP_SCRIPT=.ci/scripts/setup-linux.sh
+fi
+CMAKE_ARGS="$EXTRA_BUILD_ARGS" ${CONDA_RUN_CMD} $SETUP_SCRIPT --build-tool cmake --build-mode Release --editable true
 EXIT_CODE=0
-python -m executorch.backends.test.suite.runner $SUITE --flow $FLOW --report "$REPORT_FILE" || EXIT_CODE=$?
-
+${CONDA_RUN_CMD} pytest -c /dev/null -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
 # Generate markdown summary.
-python -m executorch.backends.test.suite.generate_markdown_summary "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE
+${CONDA_RUN_CMD} python -m executorch.backends.test.suite.generate_markdown_summary_json "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE
diff --git a/.ci/scripts/test_backend_macos.sh b/.ci/scripts/test_backend_macos.sh
deleted file mode 100755
index c31fd504b03..00000000000
--- a/.ci/scripts/test_backend_macos.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-set -eux
-
-SUITE=$1
-FLOW=$2
-ARTIFACT_DIR=$3
-
-REPORT_FILE="$ARTIFACT_DIR/test-report-$FLOW-$SUITE.csv"
-
-echo "Running backend test job for suite $SUITE, flow $FLOW."
-echo "Saving job artifacts to $ARTIFACT_DIR."
- -${CONDA_RUN} --no-capture-output pip install awscli==1.37.21 - -bash .ci/scripts/setup-conda.sh -eval "$(conda shell.bash hook)" - -PYTHON_EXECUTABLE=python -${CONDA_RUN} --no-capture-output .ci/scripts/setup-macos.sh --build-tool cmake --build-mode Release - -EXIT_CODE=0 -${CONDA_RUN} --no-capture-output python -m executorch.backends.test.suite.runner $SUITE --flow $FLOW --report "$REPORT_FILE" || EXIT_CODE=$? - -# Generate markdown summary. -${CONDA_RUN} --no-capture-output python -m executorch.backends.test.suite.generate_markdown_summary "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE diff --git a/.ci/scripts/test_huggingface_optimum_model.py b/.ci/scripts/test_huggingface_optimum_model.py index 05b25299522..e5d815cfc00 100644 --- a/.ci/scripts/test_huggingface_optimum_model.py +++ b/.ci/scripts/test_huggingface_optimum_model.py @@ -43,7 +43,9 @@ def cli_export(command, model_dir): def check_causal_lm_output_quality( - model_id: str, generated_tokens: List[int], max_perplexity_threshold: float = 100.0 + model_id: str, + generated_tokens: List[int], + max_perplexity_threshold: float = 100.0, ): """ Evaluates the quality of text generated by a causal language model by calculating its perplexity. 
@@ -58,12 +60,24 @@ def check_causal_lm_output_quality( """ logging.info(f"Starting perplexity check with model '{model_id}' ...") # Load model - model = AutoModelForCausalLM.from_pretrained( - model_id, - low_cpu_mem_usage=True, - use_cache=False, - torch_dtype=torch.bfloat16, - ) + cls_name = AutoModelForCausalLM + if "llava" in model_id: + from transformers import LlavaForConditionalGeneration + + cls_name = LlavaForConditionalGeneration + try: + model = cls_name.from_pretrained( + model_id, + low_cpu_mem_usage=True, + use_cache=False, + torch_dtype=torch.bfloat16, + ) + except TypeError: + model = cls_name.from_pretrained( + model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + ) with torch.no_grad(): outputs = model(input_ids=generated_tokens, labels=generated_tokens) @@ -156,6 +170,86 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only assert check_causal_lm_output_quality(model_id, generated_tokens) is True +def test_llm_with_image_modality( + model_id, model_dir, recipe, *, quantize=True, run_only=False +): + command = [ + "optimum-cli", + "export", + "executorch", + "--model", + model_id, + "--task", + "multimodal-text-to-text", + "--recipe", + recipe, + "--output_dir", + model_dir, + "--use_custom_sdpa", + "--use_custom_kv_cache", + "--qlinear", + "8da4w", + "--qembedding", + "8w", + ] + if not run_only: + cli_export(command, model_dir) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.save_pretrained(model_dir) + + # input + processor = AutoProcessor.from_pretrained(model_id) + image_url = "https://llava-vl.github.io/static/images/view.jpg" + conversation = [ + { + "role": "system", + "content": [ + { + "type": "text", + "text": "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.", + } + ], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": image_url}, + { + "type": "text", + "text": "What are the things I should be cautious about when I visit here?", + }, + ], + }, + ] + inputs = processor.apply_chat_template( + conversation, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + + from executorch.extension.llm.runner import GenerationConfig, MultimodalRunner + + runner = MultimodalRunner(f"{model_dir}/model.pte", f"{model_dir}/tokenizer.model") + generated_text = runner.generate_text_hf( + inputs, + GenerationConfig(max_new_tokens=128, temperature=0, echo=False), + processor.image_token_id, + ) + print(f"\nGenerated text:\n\t{generated_text}") + # Free memory before loading eager for quality check + del runner + gc.collect() + assert ( + check_causal_lm_output_quality( + model_id, tokenizer.encode(generated_text, return_tensors="pt") + ) + is True + ) + + def test_fill_mask(model_id, model_dir, recipe, *, quantize=True, run_only=False): command = [ "optimum-cli", @@ -353,6 +447,9 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): required=False, help="When provided, write the pte file to this directory. 
Otherwise, a temporary directory is created for the test.", ) + parser.add_argument( + "--run_only", action="store_true", help="Skip export and only run the test" + ) args = parser.parse_args() _text_generation_mapping = { @@ -384,8 +481,16 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): "vit": ("google/vit-base-patch16-224", test_vit), } + _multimodal_model_mapping = { + "gemma3-4b": ("google/gemma-3-4b-it", test_llm_with_image_modality), + "llava": ("llava-hf/llava-1.5-7b-hf", test_llm_with_image_modality), + } + model_to_model_id_and_test_function = ( - _text_generation_mapping | _mask_fill_mapping | _misc_model_mapping + _text_generation_mapping + | _mask_fill_mapping + | _misc_model_mapping + | _multimodal_model_mapping ) if args.model not in model_to_model_id_and_test_function: @@ -400,4 +505,5 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): model_dir=tmp_dir if args.model_dir is None else args.model_dir, recipe=args.recipe, quantize=args.quantize, + run_only=args.run_only, ) diff --git a/.ci/scripts/test_ios_ci.sh b/.ci/scripts/test_ios_ci.sh index a89c2cc5809..46c3f71f021 100755 --- a/.ci/scripts/test_ios_ci.sh +++ b/.ci/scripts/test_ios_ci.sh @@ -36,6 +36,7 @@ say() { say "Cloning the Demo App" +git config --global http.postBuffer 524288000 git clone --depth 1 https://github.com/meta-pytorch/executorch-examples.git say "Installing CoreML Backend Requirements" diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 84278e290f6..d9e527e7c78 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -159,6 +159,7 @@ cmake_install_executorch_libraries() { -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -DEXECUTORCH_BUILD_QNN="$QNN" \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ -DQNN_SDK_ROOT="$QNN_SDK_ROOT" cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE" } @@ -236,7 +237,7 @@ if [[ "${CUSTOM}" == "ON" ]]; then 
EXPORT_ARGS="${EXPORT_ARGS} model.use_sdpa_with_kv_cache=true" fi if [[ "${QE}" == "ON" ]]; then - EXPORT_ARGS="${EXPORT_ARGS} quantization.embedding_quantize=\"8,1024\"" + EXPORT_ARGS="${EXPORT_ARGS} quantization.embedding_quantize=\"8,768\"" fi if [[ "${MPS}" == "ON" ]]; then EXPORT_ARGS="${EXPORT_ARGS} backend.mps.enabled=true model.enable_dynamic_shape=false debug.verbose=true" diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh index 5f472fad63b..a7ded52ccc6 100644 --- a/.ci/scripts/test_llama_torchao_lowbit.sh +++ b/.ci/scripts/test_llama_torchao_lowbit.sh @@ -31,6 +31,7 @@ cmake -DPYTHON_EXECUTABLE=python \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 3deefe1d5bf..d8cb9596ffc 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -38,6 +38,7 @@ EXECUTORCH_COMMON_CMAKE_ARGS=" \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ @@ -107,7 +108,7 @@ cmake_build_llava_runner_for_android() { # only export the one without custom op for now since it's export_llava() { echo "Starting to export Llava. 
This will take about 6 mins" - $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts + $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts --max-context-len 768 } # Download a new image diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index 74eb75c6ddd..34063a23374 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -48,22 +48,33 @@ prepare_artifacts_upload() { fi } + build_cmake_executor_runner() { local backend_string_select="${1:-}" echo "Building executor_runner" rm -rf ${CMAKE_OUTPUT_DIR} mkdir ${CMAKE_OUTPUT_DIR} + # Common options: + COMMON="-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE" if [[ "$backend_string_select" == "XNNPACK" ]]; then echo "Backend $backend_string_select selected" - (cd ${CMAKE_OUTPUT_DIR} \ - && cmake -DCMAKE_BUILD_TYPE=Release \ + cmake -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) + ${COMMON} \ + -B${CMAKE_OUTPUT_DIR} . + cmake --build ${CMAKE_OUTPUT_DIR} -j4 + elif [[ "$backend_string_select" == "CUDA" ]]; then + echo "Backend $backend_string_select selected" + cmake -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_CUDA=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + ${COMMON} \ + -B${CMAKE_OUTPUT_DIR} . cmake --build ${CMAKE_OUTPUT_DIR} -j4 else cmake -DCMAKE_BUILD_TYPE=Debug \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + ${COMMON} \ -B${CMAKE_OUTPUT_DIR} . 
cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug fi @@ -131,13 +142,13 @@ test_model_with_xnnpack() { return 0 fi - # Delegation + # Delegation and test with pybindings if [[ ${WITH_QUANTIZATION} == true ]]; then SUFFIX="q8" - "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --quantize + "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --quantize --test_after_export else SUFFIX="fp32" - "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate + "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --test_after_export fi OUTPUT_MODEL_PATH="${MODEL_NAME}_xnnpack_${SUFFIX}.pte" @@ -320,6 +331,13 @@ test_model_with_mediatek() { EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "*.pte" -print -quit) } +test_model_with_cuda() { + # Export a basic .pte and .ptd, then run the model. + "${PYTHON_EXECUTABLE}" -m examples.cuda.scripts.export --model_name="${MODEL_NAME}" --output_dir "./" + build_cmake_executor_runner "CUDA" + ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./${MODEL_NAME}.pte" --data_path "./aoti_cuda_blob.ptd" +} + if [[ "${BACKEND}" == "portable" ]]; then echo "Testing ${MODEL_NAME} with portable kernels..." @@ -372,6 +390,12 @@ elif [[ "${BACKEND}" == "mediatek" ]]; then if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi +elif [[ "${BACKEND}" == "cuda" ]]; then + echo "Testing ${MODEL_NAME} with cuda..." + test_model_with_cuda + if [[ $? 
-eq 0 ]]; then + prepare_artifacts_upload + fi else set +e if [[ "${BACKEND}" == *"quantization"* ]]; then diff --git a/.ci/scripts/test_openvino.sh b/.ci/scripts/test_openvino.sh index 85884a6475b..2bb2115b1ec 100755 --- a/.ci/scripts/test_openvino.sh +++ b/.ci/scripts/test_openvino.sh @@ -10,7 +10,7 @@ set -ex # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" -source openvino/dist/setupvars.sh +source openvino/setupvars.sh cd backends/openvino/tests python test_runner.py --test_type ops python test_runner.py --test_type models diff --git a/.ci/scripts/test_qnn_static_llama_eval.sh b/.ci/scripts/test_qnn_static_llama_eval.sh new file mode 100644 index 00000000000..4baa28fe591 --- /dev/null +++ b/.ci/scripts/test_qnn_static_llama_eval.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -euo pipefail + +echo ">>> Script invoked with arguments: $@" + +source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" + +# Download QNN_SDK. If already downloaded, export environment path +source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh" +install_qnn + +export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" +export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang" +export PYTHONPATH=".." 
+cp schema/program.fbs exir/_serialize/program.fbs
+cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
+cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
+cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+which "${PYTHON_EXECUTABLE}"
+
+# -------------------------------
+# Parse args
+# -------------------------------
+EXTRA_FLAGS=""
+THRESHOLD=62.0 # default fallback
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --flags)
+      EXTRA_FLAGS="$2"
+      shift 2
+      ;;
+    --threshold)
+      THRESHOLD="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1"
+      exit 1
+      ;;
+  esac
+done
+
+# Config
+PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}"
+MODEL="qwen2_5-0_5b"
+MAX_SEQ=1024
+PTQ="16a4w"
+
+# NOTE: EXTRA_FLAGS was parsed via --flags above; "$@" is empty here after shift.
+
+# Run command and capture *both stdout and stderr*
+LOG_FILE="eval_${MODEL}_$(date +%Y%m%d_%H%M%S).log"
+
+echo ">>> Running evaluation with flags: $EXTRA_FLAGS | threshold: $THRESHOLD"
+$PYTHON_EXECUTABLE -m executorch.examples.qualcomm.oss_scripts.llama.eval_llama_qnn \
+  --decoder_model "$MODEL" \
+  --quant_linear_only \
+  --max_seq_length "$MAX_SEQ" \
+  --ptq "$PTQ" \
+  $EXTRA_FLAGS 2>&1 | tee "$LOG_FILE"
+
+# Extract last word_perplexity
+LAST_PERP=$(grep "INFO:root:wikitext:" "$LOG_FILE" | tail -n 1 | sed -E "s/.*'word_perplexity,none': ([0-9.]+).*/\1/")
+
+if [[ -z "$LAST_PERP" ]]; then
+  echo "❌ Could not find word_perplexity in logs!"
+  exit 1
+fi
+
+echo ">>> Last word_perplexity = $LAST_PERP"
+
+# Compare against threshold
+awk -v val="$LAST_PERP" -v thr="$THRESHOLD" 'BEGIN {exit (val > thr)}'
+if [[ $?
-ne 0 ]]; then + echo "❌ Regression detected: word_perplexity ($LAST_PERP) > threshold ($THRESHOLD)" + exit 1 +fi + +echo "✅ Check passed: word_perplexity ($LAST_PERP) <= $THRESHOLD" diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index 3c9ac598f8f..da50d28800a 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh @@ -5,6 +5,7 @@ set -euxo pipefail # Args / flags # ------------------------- TEST_WITH_RUNNER=0 +USE_TORCHAO_KERNELS=0 MODEL_NAME="" # Parse args @@ -22,10 +23,14 @@ while [[ $# -gt 0 ]]; do --test_with_runner) TEST_WITH_RUNNER=1 ;; + --use_torchao_kernels) + USE_TORCHAO_KERNELS=1 + ;; -h|--help) - echo "Usage: $0 [--test_with_runner]" + echo "Usage: $0 [--test_with_runner] [--use_torchao_kernels]" echo " model_name: qwen3_4b | phi_4_mini" echo " --test_with_runner: build ET + run llama_main to sanity-check the export" + echo " --use_torchao_kernels: use torchao kernels for linear and tied embedding" exit 0 ;; *) @@ -42,6 +47,13 @@ fi MODEL_OUT=model.pte + +# Default to XNNPACK +BACKEND_ARGS="-X --xnnpack-extended-ops" +if [[ "$USE_TORCHAO_KERNELS" -eq 1 ]]; then + BACKEND_ARGS="--use-torchao-kernels" +fi + case "$MODEL_NAME" in qwen3_4b) echo "Running Qwen3-4B export..." 
@@ -58,12 +70,12 @@ case "$MODEL_NAME" in --output_name $MODEL_OUT \ -kv \ --use_sdpa_with_kv_cache \ - -X \ - --xnnpack-extended-ops \ --max_context_length 1024 \ --max_seq_length 1024 \ + --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' \ + --verbose \ --dtype fp32 \ - --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' + ${BACKEND_ARGS} ;; phi_4_mini) @@ -81,12 +93,12 @@ case "$MODEL_NAME" in --output_name $MODEL_OUT \ -kv \ --use_sdpa_with_kv_cache \ - -X \ - --xnnpack-extended-ops \ --max_context_length 1024 \ --max_seq_length 1024 \ + --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' \ + --verbose \ --dtype fp32 \ - --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' + ${BACKEND_ARGS} ;; *) @@ -104,6 +116,10 @@ if [[ $MODEL_SIZE -gt $EXPECTED_MODEL_SIZE_UPPER_BOUND ]]; then fi # Install ET with CMake +EXECUTORCH_BUILD_KERNELS_TORCHAO="OFF" +if [[ "$USE_TORCHAO_KERNELS" -eq 1 ]]; then + EXECUTORCH_BUILD_KERNELS_TORCHAO="ON" +fi if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then echo "[runner] Building and testing llama_main ..." cmake -DPYTHON_EXECUTABLE=python \ @@ -113,6 +129,7 @@ if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ @@ -120,6 +137,7 @@ if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ + -DEXECUTORCH_BUILD_KERNELS_TORCHAO=${EXECUTORCH_BUILD_KERNELS_TORCHAO} \ -Bcmake-out . 
cmake --build cmake-out -j16 --config Release --target install diff --git a/.ci/scripts/test_wheel_package_qnn.sh b/.ci/scripts/test_wheel_package_qnn.sh index 39c52a4a396..4207f0392be 100644 --- a/.ci/scripts/test_wheel_package_qnn.sh +++ b/.ci/scripts/test_wheel_package_qnn.sh @@ -98,7 +98,7 @@ PYTHON_VERSION=$1 # Check wheel does NOT contain qualcomm/sdk # ---------------------------- echo "Checking wheel does not contain qualcomm/sdk..." -SDK_FILES=$(unzip -l "$WHEEL_FILE" | awk '{print $4}' | grep "executorch/backends/qualcomm/sdk" || true) +SDK_FILES=$(unzip -l "$WHEEL_FILE" | awk '{print $4}' | grep -E "executorch/backends/qualcomm/sdk" || true) if [ -n "$SDK_FILES" ]; then echo "ERROR: Wheel package contains unexpected qualcomm/sdk files:" echo "$SDK_FILES" @@ -111,7 +111,7 @@ fi # Check .so files in the wheel # ---------------------------- echo "Checking for .so files inside the wheel..." -WHEEL_SO_FILES=$(unzip -l "$WHEEL_FILE" | awk '{print $4}' | grep "executorch/backends/qualcomm/python" || true) +WHEEL_SO_FILES=$(unzip -l "$WHEEL_FILE" | awk '{print $4}' | grep -E "executorch/backends/qualcomm/python" || true) if [ -z "$WHEEL_SO_FILES" ]; then echo "ERROR: No .so files found in wheel under executorch/backends/qualcomm/python" exit 1 @@ -139,12 +139,35 @@ run_core_tests () { echo "=== [$LABEL] Installing wheel & deps ===" "$PIPBIN" install --upgrade pip "$PIPBIN" install "$WHEEL_FILE" - "$PIPBIN" install torch=="2.9.0.dev20250906" --index-url "https://download.pytorch.org/whl/nightly/cpu" - "$PIPBIN" install --pre torchao --index-url "https://download.pytorch.org/whl/nightly/cpu" + TORCH_VERSION=$( + "$PYBIN" - <<'PY' +import runpy +module_vars = runpy.run_path("torch_pin.py") +print(module_vars["TORCH_VERSION"]) +PY +) + + NIGHTLY_VERSION=$( + "$PYBIN" - <<'PY' +import runpy +module_vars = runpy.run_path("torch_pin.py") +print(module_vars["NIGHTLY_VERSION"]) +PY +) + echo "=== [$LABEL] Install torch==${TORCH_VERSION}.${NIGHTLY_VERSION} ===" + + # 
Install torchao based on the pinned PyTorch version + "$PIPBIN" install torch=="${TORCH_VERSION}.${NIGHTLY_VERSION}" --index-url "https://download.pytorch.org/whl/nightly/cpu" + + # Install torchao based on the pinned commit from third-party/ao submodule + pushd "$REPO_ROOT/third-party/ao" > /dev/null + USE_CPP=0 "$PYBIN" setup.py develop + popd > /dev/null echo "=== [$LABEL] Import smoke tests ===" "$PYBIN" -c "import executorch; print('executorch imported successfully')" "$PYBIN" -c "import executorch.backends.qualcomm; print('executorch.backends.qualcomm imported successfully')" + "$PYBIN" -c "from executorch.export.target_recipes import get_android_recipe; recipe = get_android_recipe('android-arm64-snapdragon-fp16'); print(f'executorch.export.target_recipes imported successfully: {recipe}')" echo "=== [$LABEL] List installed executorch/backends/qualcomm/python ===" local SITE_DIR diff --git a/.ci/scripts/test_yolo12.sh b/.ci/scripts/test_yolo12.sh index e3f20d5f970..594ddbf86ed 100755 --- a/.ci/scripts/test_yolo12.sh +++ b/.ci/scripts/test_yolo12.sh @@ -119,6 +119,8 @@ cmake_install_executorch_libraries() { -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -B"${build_dir}" @@ -131,6 +133,8 @@ cmake_install_executorch_libraries() { -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ diff --git a/.ci/scripts/unittest-buck2.sh b/.ci/scripts/unittest-buck2.sh index 340f7438f02..e78e682faac 100755 --- 
a/.ci/scripts/unittest-buck2.sh +++ b/.ci/scripts/unittest-buck2.sh @@ -15,7 +15,8 @@ buck2 query "//backends/apple/... + //backends/arm: + //backends/arm/debug/... + //backends/arm/_passes/... + //backends/arm/runtime/... + //backends/arm/tosa/... \ + //backends/example/... + \ //backends/mediatek/... + //backends/transforms/... + \ -//backends/xnnpack/... + //configurations/... + //extension/flat_tensor: + \ +//backends/xnnpack/... + //codegen/tools/... + \ +//configurations/... + //extension/flat_tensor: + \ //extension/llm/runner: + //kernels/aten/... + //kernels/optimized/... + \ //kernels/portable/... + //kernels/quantized/... + //kernels/test/... + \ //runtime/... + //schema/... + //test/... + //util/..." @@ -34,7 +35,17 @@ BUILDABLE_KERNELS_PRIM_OPS_TARGETS=$(buck2 query //kernels/prim_ops/... | grep - for op in "build" "test"; do buck2 $op $BUILDABLE_OPTIMIZED_OPS \ //examples/selective_build:select_all_dtype_selective_lib_portable_lib \ + //extension/llm/custom_ops/spinquant/test:fast_hadamard_transform_test \ + //extension/llm/runner/test:test_multimodal_input \ + //extension/llm/runner/test:test_generation_config \ //kernels/portable/... \ $BUILDABLE_KERNELS_PRIM_OPS_TARGETS //runtime/backend/... //runtime/core/... \ //runtime/executor: //runtime/kernel/... //runtime/platform/... done + +# Build only without testing +buck2 build //codegen/tools/... \ + //extension/llm/runner/io_manager:io_manager \ + //extension/llm/modules/... 
\ + //extension/llm/runner:multimodal_runner_lib \ + //extension/llm/runner:text_decoder_runner diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index f6f6ece786b..f896d3f1d40 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -125,14 +125,15 @@ build_executorch_runner_cmake() { clean_executorch_install_folders mkdir "${CMAKE_OUTPUT_DIR}" - pushd "${CMAKE_OUTPUT_DIR}" || return if [[ $1 == "Debug" ]]; then CXXFLAGS="-fsanitize=address,undefined" else CXXFLAGS="" fi - CXXFLAGS="$CXXFLAGS" retry cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE="${1:-Release}" .. - popd || return + CXXFLAGS="$CXXFLAGS" retry cmake \ + -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \ + -DCMAKE_BUILD_TYPE="${1:-Release}" \ + -B${CMAKE_OUTPUT_DIR} . if [ "$(uname)" == "Darwin" ]; then CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 )) diff --git a/.ci/scripts/wheel/test_base.py b/.ci/scripts/wheel/test_base.py index f8a7309a6c2..278e46fe75a 100644 --- a/.ci/scripts/wheel/test_base.py +++ b/.ci/scripts/wheel/test_base.py @@ -41,6 +41,18 @@ class ModelTest: def run_tests(model_tests: List[ModelTest]) -> None: + # Test that we can import the portable_lib module - verifies RPATH is correct + print("Testing portable_lib import...") + try: + from executorch.extension.pybindings._portable_lib import ( # noqa: F401 + _load_for_executorch, + ) + + print("✓ Successfully imported _load_for_executorch from portable_lib") + except ImportError as e: + print(f"✗ Failed to import portable_lib: {e}") + raise + # Why are we doing this envvar shenanigans? Since we build the testers, which # uses buck, we cannot run as root. This is a sneaky of getting around that # test. 
diff --git a/.github/scripts/cherry_pick.py b/.github/scripts/cherry_pick.py index 1239ee030dd..8de5279f51b 100755 --- a/.github/scripts/cherry_pick.py +++ b/.github/scripts/cherry_pick.py @@ -39,7 +39,15 @@ def parse_args() -> Any: ) parser.add_argument( "--classification", - choices=["regression", "critical", "fixnewfeature", "docs", "release"], + choices=[ + "regression", + "critical", + "fixnewfeature", + "docs", + "release", + "examples", + "testci", + ], required=True, help="the cherry pick category", ) diff --git a/.github/scripts/propose_ghstack_orig_pr.py b/.github/scripts/propose_ghstack_orig_pr.py index 53b796adaa3..3abcc6cdcf9 100644 --- a/.github/scripts/propose_ghstack_orig_pr.py +++ b/.github/scripts/propose_ghstack_orig_pr.py @@ -86,6 +86,17 @@ def get_pr_stack_from_number(ref: str, repo: Repository) -> List[int]: return pr_stack +def get_differential_revision(pr, repo: Repository) -> str: + body = repo.get_pull(pr.number).body + matches = re.findall(r"Differential Revision: .*", body) + count = len(matches) + if count == 1: + # If there's more than one Differential Revision, let's just return empty + # so that we can disambiguate manually. + return matches[0] + return "" + + def create_prs_for_orig_branch(pr_stack: List[int], repo: Repository): # For the first PR, we want to merge to `main` branch, and we will update # as we go through the stack @@ -100,6 +111,7 @@ def create_prs_for_orig_branch(pr_stack: List[int], repo: Repository): # The PR we want to create is then "branch_to_merge" <- gh/user/x/orig # gh/user/x/orig is the clean diff between gh/user/x/base <- gh/user/x/head orig_branch_merge_head = pr.base.ref.replace("base", "orig") + differential_revision_text = get_differential_revision(pr, repo) bot_metadata = f"""This PR was created by the merge bot to help merge the original PR into the main branch. 
ghstack PR number: https://github.com/pytorch/executorch/pull/{pr.number} by @{pr.user.login} ^ Please use this as the source of truth for the PR details, comments, and reviews @@ -107,6 +119,7 @@ def create_prs_for_orig_branch(pr_stack: List[int], repo: Repository): ghstack PR head: https://github.com/pytorch/executorch/tree/{pr.head.ref} Merge bot PR base: https://github.com/pytorch/executorch/tree/{orig_branch_merge_base} Merge bot PR head: https://github.com/pytorch/executorch/tree/{orig_branch_merge_head} +{differential_revision_text} @diff-train-skip-merge""" existing_orig_pr = repo.get_pulls( diff --git a/.github/workflows/_android.yml b/.github/workflows/_android.yml index 2449e94b2af..7b67c340350 100644 --- a/.github/workflows/_android.yml +++ b/.github/workflows/_android.yml @@ -48,26 +48,13 @@ jobs: bash examples/models/llama/install_requirements.sh bash ".ci/scripts/test_llama.sh" -model stories110M -build_tool cmake -dtype fp16 -mode portable -upload ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom - mkdir -p examples/demo-apps/android/LlamaDemo/app/libs - cp aar-out/executorch.aar examples/demo-apps/android/LlamaDemo/app/libs - pushd examples/demo-apps/android/LlamaDemo - ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest - popd - - DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo" - # The app directory is named using its build flavor as a suffix. 
- mkdir -p "${DEMO_APP_DIR}" - # Collect the app and its test suite - cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk "${DEMO_APP_DIR}" - cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk "${DEMO_APP_DIR}" - # Running Android emulator directly on the runner and not using Docker run-emulator: needs: build-llm-demo # NB: Use metal install for KVM support to run the emulator faster runs-on: linux.24xl.spr-metal env: - ANDROID_NDK_VERSION: r27b + ANDROID_NDK_VERSION: r28c API_LEVEL: 34 steps: - name: Setup SSH (Click me for login details) @@ -103,8 +90,6 @@ jobs: shell: bash run: | set -eux - curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug.apk - curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug-androidTest.apk curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/fp32-xnnpack-custom/model.zip curl -o android-test-debug-androidTest.apk https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/library_test_dir/executorch_android-debug-androidTest.apk unzip model.zip diff --git a/.github/workflows/_test_backend.yml b/.github/workflows/_test_backend.yml index 5f41faa8cc7..ec426af8892 100644 --- a/.github/workflows/_test_backend.yml +++ b/.github/workflows/_test_backend.yml @@ -57,7 +57,7 @@ jobs: script: | set -eux - source .ci/scripts/test_backend_linux.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}" + source .ci/scripts/test_backend.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}" test-backend-macos: if: ${{ inputs.run-macos }} @@ -81,4 +81,4 @@ jobs: # This is needed to get the prebuilt PyTorch wheel from S3 ${CONDA_RUN} --no-capture-output pip install awscli==1.37.21 - source .ci/scripts/test_backend_macos.sh "${{ matrix.suite }}" "${{ 
matrix.flow }}" "${RUNNER_ARTIFACT_DIR}" + source .ci/scripts/test_backend.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}" diff --git a/.github/workflows/add-unanswered-to-project.yml b/.github/workflows/add-unanswered-to-project.yml index ba2bc6c8436..8b8114d0c04 100644 --- a/.github/workflows/add-unanswered-to-project.yml +++ b/.github/workflows/add-unanswered-to-project.yml @@ -12,7 +12,7 @@ jobs: - name: Add open issues and open, non-draft PRs to org project (excluding certain authors) uses: actions/github-script@v7 with: - github-token: ${{ secrets.GITHUB_TOKEN }} + github-token: ${{ secrets.ET_EXT_CONTRIB }} script: | const projectId = "PVT_kwDOAUB9vs4A_PUL"; // PyTorch org project 136 const owner = 'pytorch'; @@ -20,20 +20,31 @@ jobs: // List of authors to exclude const excludedAuthors = new Set([ - "nil-is-all", "cbilgin", "KimishPatel", "psiddh", "digantdesai", "SS-JIA", "ahmtox", "mcr229", "shoumikhin", + "nil-is-all", "cbilgin", "kimishpatel", "psiddh", "digantdesai", "SS-JIA", "ahmtox", "mcr229", "shoumikhin", "manuelcandales", "metascroy", "cccclai", "rohansjoshi", "kirklandsign", "abhinaykukkadapu", "JacobSzwejbka", "Conarnar", "lucylq", "larryliu0820", "BujSet", "Gasoonjia", "Juntian777", "guangy10", "jackzhxng", "GregoryComer", "leafs1", "swolchok", "mergennachin", "tarun292", "byjlw", "jathu", "Jack-Khuu", "georgehong", - "zhenyan-zhang-meta", "silverguo", "dbort", "jorgep31415", "huydhn", "mcremon-meta", "trivedivivek", "angelayi", - "helunwencser", "hsharma35", "zhxchen17", "iseeyuan", "svekars", "nathanaelsee", "dulinriley", "jerryzh168", + "zhenyan-zhang-meta", "silverguo", "harishs88ss", "AlannaBurke", "dbort", "huydhn", "mcremon-meta", "trivedivivek", + "angelayi", "helunwencser", "hsharma35", "zhxchen17", "iseeyuan", "svekars", "nathanaelsee", "dulinriley", "jerryzh168", "cmodi-meta", "bigfootjon", "sxu", "ydwu4", "Riandy", "tugsbayasgalan", "bsoyluoglu", "yangw-dev", "YIWENX14", "namanahuja", "yushangdi", "limintang", 
"pianpwk", "viveknayakatmeta", "andreanicastro", "JakeStevens", - "gmagogsfm", "zonglinpeng", "eigen-k", "derekxu", "salilsdesai", "skrtskrtfb", "pssrawat", "r-barnes", "pytorchbot", - "pytorchmergebot", "pytorchupdatebot", "facebook-github-bot", "Erik-Lundell", "zingo", "AdrianLundell", - "oscarandersson8218", "per", "Sebastian-Larsson", "SaoirseARM", "robell", "mansnils", "martinlsm", "freddan80", - "YufengShi-dudu", "tom-arm", "perheld", "Jerry-Ge", "gggekov", "fumchin", "wwwind", "haowhsu-quic", "shewu-quic", - "winskuo-quic", "chunit-quic", "DannyYuyang-quic", "chuntl", "cymbalrush", "DenisVieriu97", "billmguo", - "StrycekSimon", "jirioc", "robert-kalmar", "skywall", "neuropilot-captain" + "gmagogsfm", "zonglinpeng", "eigen-k", "derekxu", "salilsdesai", "skrtskrtfb", "pssrawat", "r-barnes", + "kalpit-meta-1", "Will-MingLun-Li", "KapJI", "piyengar", "j-bahr", "BoyuanFeng", "fgasperij", "DariusHolmgren", + "sammarden-meta", "kushrast", "meta-emilian", "Rittzz", "jeanschmidt", "copyrightly", "mikekgfb", "vmpuri", + "zonglinpengmeta", "maggiemoss", "aorenste", "hoangminhle98", "Solumin", "meyering", "rchen152", + "AishwaryaSivaraman", "migeed-z", "ebgraham", "Esteb37", "nausicaasnow", "Camyll", "ezyang", "huiyujie", + "dltn", "cjhopman", "blackm00n", "agunapal", "SamGondelman", "Ninja91", "ivayloen", "DrJessop", "rodrigos01meta", + "akrieger", "cmt0", "yiming0416", "ethansfng", "ThomasJannaud", "nirvanagth", "marcinkwiatkowski", "3l1", + "omerjerk", "nitish2112", "yipjustin", "ejnguyen", "andrewor14", "phaiting", "mgiordy", "LeeOHzzZ", "adicatana", + "Polyomino", "ezrilow", "navsud", "YifanShenSZ", "RdoubleA", "Olivia-liu", "Abhi-hpp", "Vysarat", "azad-meta", + "pytorchbot", "pytorchmergebot", "pytorchupdatebot", "facebook-github-bot", "app/dependabot", "Erik-Lundell", + "zingo", "AdrianLundell", "oscarandersson8218", "per", "Sebastian-Larsson", "SaoirseARM", "robell", "mansnils", + "martinlsm", "freddan80", "YufengShi-dudu", "tom-arm", "perheld", "Jerry-Ge", 
"gggekov", "fumchin", "wwwind", + "benkli01", "Tessil", "maddun01", "Michiel-Olieslagers", "armwaheed", "agrima1304", "emmakujala", "annietllnd", + "haowhsu-quic", "shewu-quic", "winskuo-quic", "chunit-quic", "DannyYuyang-quic", "chuntl", "thchenqti", + "jethroqti", "cymbalrush", "DenisVieriu97", "billmguo", "StrycekSimon", "jirioc", "robert-kalmar", "skywall", + "MartinPavella", "roman-janik-nxp", "novak-vaclav ", "neuropilot-captain", "dijopaul", "cad-rlc", "cad-audio", + "ynimmaga", "daniil-lyakhov", "emmanuel-ferdman", "cavusmustafa", "Jiseong-oh", "alexdean08" ]); async function addItem(contentId, type, number) { @@ -80,11 +91,10 @@ jobs: owner, repo, state: 'open', - draft: false, } ); for (const pr of prs) { - if (!excludedAuthors.has(pr.user.login)) { + if (!pr.draft && !excludedAuthors.has(pr.user.login)) { await addItem(pr.node_id, 'pr', pr.number); } } diff --git a/.github/workflows/android-release-artifacts.yml b/.github/workflows/android-release-artifacts.yml index f0b74342eb8..beda0f77c83 100644 --- a/.github/workflows/android-release-artifacts.yml +++ b/.github/workflows/android-release-artifacts.yml @@ -15,15 +15,11 @@ on: type: choice options: - "xnnpack" - - "vulkan+xnnpack" + - "vulkan" - "qnn" schedule: - cron: 0 10 * * * -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - jobs: check-if-aar-exists: name: check-if-aar-exists @@ -34,12 +30,13 @@ jobs: shell: bash run: | VERSION="${{ inputs.version }}" + FLAVOR="${{ inputs.flavor }}" if [ -z "$VERSION" ]; then echo "No version name specified. 
Will create a snapshot AAR" exit 0 fi - if curl -I "https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar" | grep "200 OK"; then - echo "AAR already exists at https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar" + if curl -I "https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}-${FLAVOR}/executorch.aar" | grep "200 OK"; then + echo "AAR already exists at https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}-${FLAVOR}/executorch.aar" echo "Will skip build/upload" exit 1 fi @@ -93,7 +90,14 @@ jobs: fi FLAVOR="${{ inputs.flavor }}" - if [[ "$FLAVOR" == "vulkan+xnnpack" || -z "$FLAVOR" ]]; then + if [ ! -z "$FLAVOR" ]; then + GRADLE_ARGS+=" -Dflavor=${FLAVOR}" + fi + + if [[ "$FLAVOR" == "vulkan" || -z "$FLAVOR" ]]; then + curl -O https://sdk.lunarg.com/sdk/download/1.4.321.1/linux/vulkansdk-linux-x86_64-1.4.321.1.tar.xz + tar xf vulkansdk-linux-x86_64-1.4.321.1.tar.xz -C /tmp + export PATH="/tmp/1.4.321.1/x86_64/bin:$PATH" export EXECUTORCH_BUILD_VULKAN=ON fi @@ -145,8 +149,12 @@ jobs: pip install awscli==1.32.18 AWS_CMD="aws s3 cp" VERSION="${{ inputs.version }}" + FLAVOR="${{ inputs.flavor }}" if [ -z "$VERSION" ]; then VERSION="snapshot-$(date +"%Y%m%d")" fi - ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION}/executorch.aar --acl public-read - ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION}/executorch.aar.sha256sums --acl public-read + if [ -z "$FLAVOR" ]; then + FLAVOR="xnnpack" + fi + ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION}-${FLAVOR}/executorch.aar --acl public-read + ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION}-${FLAVOR}/executorch.aar.sha256sums --acl public-read diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml new file mode 100644 index 00000000000..c1b22e692ab --- /dev/null +++ b/.github/workflows/cuda.yml @@ 
-0,0 +1,282 @@ +# Test ExecuTorch CUDA Build Compatibility +# This workflow tests whether ExecuTorch can be successfully built with CUDA support +# across different CUDA versions (12.6, 12.8, 12.9) using the command: +# CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh +# +# Note: ExecuTorch automatically detects the system CUDA version using nvcc and +# installs the appropriate PyTorch wheel. No manual CUDA/PyTorch installation needed. + +name: Test CUDA Builds + +on: + pull_request: + push: + branches: + - main + - release/* + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: false + +jobs: + test-cuda-builds: + strategy: + fail-fast: false + matrix: + cuda-version: ["12.6", "12.8", "13.0"] + + name: test-executorch-cuda-build-${{ matrix.cuda-version }} + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: ${{ matrix.cuda-version }} + use-custom-docker-registry: false + submodules: recursive + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + # Test ExecuTorch CUDA build - ExecuTorch will automatically detect CUDA version + # and install the appropriate PyTorch wheel when CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" + source .ci/scripts/test-cuda-build.sh "${{ matrix.cuda-version }}" + + # This job will fail if any of the CUDA versions fail + check-all-cuda-builds: + needs: test-cuda-builds + runs-on: ubuntu-latest + if: always() + steps: + - name: Check if all CUDA builds succeeded + run: | + if [[ "${{ needs.test-cuda-builds.result }}" != "success" ]]; then + echo "ERROR: One or more ExecuTorch CUDA builds failed!" 
+ echo "CUDA build results: ${{ needs.test-cuda-builds.result }}" + exit 1 + else + echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9) completed successfully!" + fi + + test-models-cuda: + name: test-models-cuda + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: + model: [linear, add, add_mul, resnet18] + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda + + export-voxtral-cuda-artifact: + name: export-voxtral-cuda-artifact + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + secrets: inherit + strategy: + fail-fast: false + with: + timeout: 90 + secrets-env: EXECUTORCH_HF_TOKEN + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + upload-artifact: voxtral-cuda-export + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch" + CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install 
git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + pip install mistral-common librosa + pip list + echo "::endgroup::" + + echo "::group::Export Voxtral" + optimum-cli export executorch \ + --model "mistralai/Voxtral-Mini-3B-2507" \ + --task "multimodal-text-to-text" \ + --recipe "cuda" \ + --dtype bfloat16 \ + --device cuda \ + --max_seq_len 1024 \ + --output_dir ./ + python -m executorch.extension.audio.mel_spectrogram \ + --feature_size 128 \ + --stack_output \ + --max_audio_len 300 \ + --output_file voxtral_preprocessor.pte + + test -f model.pte + test -f aoti_cuda_blob.ptd + test -f voxtral_preprocessor.pte + echo "::endgroup::" + + echo "::group::Store Voxtral Artifacts" + mkdir -p "${RUNNER_ARTIFACT_DIR}" + cp model.pte "${RUNNER_ARTIFACT_DIR}/" + cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/" + cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/" + ls -al "${RUNNER_ARTIFACT_DIR}" + echo "::endgroup::" + + benchmark-voxtral-cuda: + name: benchmark-voxtral-cuda + needs: export-voxtral-cuda-artifact + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + download-artifact: voxtral-cuda-export + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch Requirements" + CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh + pip list + echo "::endgroup::" + + echo "::group::Prepare Voxtral Artifacts" + cp "${RUNNER_ARTIFACT_DIR}/model.pte" . + cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" . 
+ ls -al model.pte aoti_cuda_blob.ptd + echo "::endgroup::" + + echo "::group::Build Voxtral Benchmark" + cmake -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_CUDA=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DEXECUTORCH_BUILD_TESTS=ON \ + -Bcmake-out . + cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner + echo "::endgroup::" + + echo "::group::Run Voxtral Benchmark" + + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd + + echo "::endgroup::" + + test-voxtral-cuda-e2e: + name: test-voxtral-cuda-e2e + needs: export-voxtral-cuda-artifact + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + download-artifact: voxtral-cuda-export + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch Requirements" + CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh + pip list + echo "::endgroup::" + + echo "::group::Prepare Voxtral Artifacts" + cp "${RUNNER_ARTIFACT_DIR}/model.pte" . + cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" . + cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" . 
+ TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json" + curl -L $TOKENIZER_URL -o tekken.json + ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json + echo "::endgroup::" + + echo "::group::Download Test Audio File" + AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav" + curl -L $AUDIO_URL -o poem.wav + echo "::endgroup::" + + echo "::group::Build Voxtral Runner" + cmake --preset llm \ + -DEXECUTORCH_BUILD_CUDA=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -Bcmake-out -S. + cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release + + cmake -DEXECUTORCH_BUILD_CUDA=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -Sexamples/models/voxtral \ + -Bcmake-out/examples/models/voxtral/ + cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release + echo "::endgroup::" + + echo "::group::Run Voxtral Runner" + set +e + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \ + --model_path model.pte \ + --data_path aoti_cuda_blob.ptd \ + --tokenizer_path tekken.json \ + --audio_path poem.wav \ + --processor_path voxtral_preprocessor.pte \ + --temperature 0 2>&1) + EXIT_CODE=$? + set -e + + echo "$OUTPUT" + + if ! 
echo "$OUTPUT" | grep -iq "poem"; then + echo "Expected output 'poem' not found in output" + exit 1 + fi + + if [ $EXIT_CODE -ne 0 ]; then + echo "Unexpected exit code: $EXIT_CODE" + exit $EXIT_CODE + fi + echo "::endgroup::" diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 585522a8d01..540c6cc05f6 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -31,7 +31,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux.2xlarge] + runner: [linux.4xlarge] docker-image-name: [ executorch-ubuntu-22.04-gcc9, executorch-ubuntu-22.04-clang12, diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ac9d1c7e6a0..a9d0f466e55 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -148,8 +148,6 @@ jobs: extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/*.java \ extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/*.java \ extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/*.java \ - examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java \ - examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/*.java \ extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java \ extension/benchmark/android/benchmark/app/src/androidTest/java/org/pytorch/minibench/*.java) if [ -n "$FILES_NEEDS_FORMAT" ]; then diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index d8c551e8982..5b646cba9d1 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -286,15 +286,20 @@ jobs: # Test selective build PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}" - test-llava-runner-linux: - name: test-llava-runner-linux + test-multimodal-linux: + if: ${{ !github.event.pull_request.head.repo.fork }} + name: 
test-multimodal-linux uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write contents: read + secrets: inherit strategy: fail-fast: false + matrix: + model: ["gemma3-4b"] # llava gives segfault so not covering. with: + secrets-env: EXECUTORCH_HF_TOKEN runner: linux.24xlarge docker-image: ci-image:executorch-ubuntu-22.04-clang12 submodules: 'recursive' @@ -305,17 +310,20 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" + echo "::group::Setup ExecuTorch" PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" + echo "::endgroup::" - # install Llava requirements - bash examples/models/llama/install_requirements.sh - bash examples/models/llava/install_requirements.sh + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + echo "::endgroup::" - # run python unittest - python -m unittest examples.models.llava.test.test_llava - - # run e2e (export, tokenizer and runner) - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh + echo "::group::Test ${{ matrix.model }}" + python .ci/scripts/test_huggingface_optimum_model.py --model ${{ matrix.model }} --quantize --recipe xnnpack + echo "::endgroup::" test-moshi-linux: name: test-moshi-linux @@ -738,8 +746,8 @@ jobs: # Install llama requirements bash examples/models/llama/install_requirements.sh - # install a recent version of torchtune. 
- PYTHON_EXECUTABLE=python python -m pip install torchtune==0.7.0.dev20250730 --extra-index-url https://download.pytorch.org/whl/nightly/cpu + # install a recent version of torchtune (>= 20250730) + PYTHON_EXECUTABLE=python python -m pip install torchtune==0.7.0.dev20250929 --extra-index-url https://download.pytorch.org/whl/nightly/cpu # run llama runner in eager mode PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_lora.sh @@ -779,7 +787,6 @@ jobs: contents: read strategy: fail-fast: false - if: false # TODO Re-enable after fixing timeouts (#14314) with: runner: linux.2xlarge docker-image: ci-image:executorch-ubuntu-22.04-gcc9 @@ -900,7 +907,9 @@ jobs: permissions: id-token: write contents: read + secrets: inherit with: + secrets-env: SAMSUNG_AI_LITECORE_KEY runner: linux.2xlarge docker-image: ci-image:executorch-ubuntu-22.04-clang12-android submodules: 'recursive' @@ -917,6 +926,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" # Setup Samsung SDK (AI Lite Core) and install enn backend + export SAMSUNG_AI_LITECORE_KEY=$SECRET_SAMSUNG_AI_LITECORE_KEY source .ci/scripts/setup-samsung-linux-deps.sh # Test models serially @@ -925,6 +935,12 @@ jobs: python -m executorch.examples.samsung.aot_compiler --model_name=$model -c E9955 done + # Test quant models + model_scripts="deeplab_v3 edsr inception_v3 inception_v4 mobilenet_v2 mobilenet_v3 resnet18 resnet50 vit wav2letter" + for m_script in $model_scripts; do + python -m executorch.examples.samsung.scripts.${m_script} -c e9955 -p A8W8 + done + # Test ops python -m unittest discover -s backends/samsung/test/ops -p "test_*.py" @@ -959,11 +975,16 @@ jobs: PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_model.sh --build # Test models serially - models="mv2 mv3 edsr resnet18 resnet50 dl3" + models="mv2 mv3 edsr resnet18 resnet50 dl3 w2l ic3 ic4" for model in $models; do python -m examples.vulkan.export --model_name=$model --test done + # For selected vision models, 
test with dynamic shapes + models="mv2 resnet18 resnet50 ic3 densenet161" + for model in $models; do + python -m examples.vulkan.export --model_name=$model --test -d + done test-vulkan-operators-linux: name: test-vulkan-operators-linux @@ -998,6 +1019,8 @@ jobs: ./cmake-out/backends/vulkan/test/custom_ops/q8csw_conv2d ./cmake-out/backends/vulkan/test/custom_ops/q4gsw_linear ./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row + ./cmake-out/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations + ./cmake-out/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add # "Classic" Operator tests PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build diff --git a/.github/workflows/test-backend-arm.yml b/.github/workflows/test-backend-arm.yml index e57be2704a2..22e3d524f6b 100644 --- a/.github/workflows/test-backend-arm.yml +++ b/.github/workflows/test-backend-arm.yml @@ -4,12 +4,17 @@ on: schedule: - cron: 0 2 * * * push: + branches: + - release/* tags: - ciflow/nightly/* pull_request: paths: - .github/workflows/test-backend-arm.yml - .github/workflows/_test_backend.yml + - .ci/scripts/test_backend.sh + - backends/test/suite/flow.py + - backends/test/suite/flows/arm.py workflow_dispatch: concurrency: @@ -21,7 +26,7 @@ jobs: uses: ./.github/workflows/_test_backend.yml with: backend: arm - flows: '["arm_tosa"]' + flows: '["arm_tosa_fp", "arm_tosa_int", "arm_ethos_u55", "arm_ethos_u85"]' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 120 run-linux: true diff --git a/.github/workflows/test-backend-coreml.yml b/.github/workflows/test-backend-coreml.yml index c6970ddff61..247f9576595 100644 --- a/.github/workflows/test-backend-coreml.yml +++ b/.github/workflows/test-backend-coreml.yml @@ -4,6 +4,8 @@ on: schedule: - cron: 0 2 * * * push: + branches: + - release/* tags: - ciflow/nightly/* pull_request: diff --git a/.github/workflows/test-backend-qnn.yml 
b/.github/workflows/test-backend-qnn.yml index 00933d6c74e..907c4d2dac0 100644 --- a/.github/workflows/test-backend-qnn.yml +++ b/.github/workflows/test-backend-qnn.yml @@ -4,6 +4,8 @@ on: schedule: - cron: 0 2 * * * push: + branches: + - release/* tags: - ciflow/nightly/* pull_request: diff --git a/.github/workflows/test-backend-vulkan.yml b/.github/workflows/test-backend-vulkan.yml index f04fdcdd1f1..cb2478fc825 100644 --- a/.github/workflows/test-backend-vulkan.yml +++ b/.github/workflows/test-backend-vulkan.yml @@ -4,6 +4,8 @@ on: schedule: - cron: 0 2 * * * push: + branches: + - release/* tags: - ciflow/nightly/* pull_request: diff --git a/.github/workflows/test-backend-xnnpack.yml b/.github/workflows/test-backend-xnnpack.yml index 2ae423dd99b..086c9625a38 100644 --- a/.github/workflows/test-backend-xnnpack.yml +++ b/.github/workflows/test-backend-xnnpack.yml @@ -4,6 +4,8 @@ on: schedule: - cron: 0 2 * * * push: + branches: + - release/* tags: - ciflow/nightly/* pull_request: diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 975a8ebbb30..8add54af49c 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -289,6 +289,7 @@ jobs: - test_arm_baremetal: test_models_ethos-u55 - test_arm_baremetal: test_models_ethos-u85 - test_arm_baremetal: test_smaller_stories_llama + - test_arm_baremetal: test_memory_allocation fail-fast: false with: runner: linux.2xlarge.memory @@ -345,7 +346,7 @@ jobs: elif [[ ${{ matrix.os}} == "zephyr-preset" ]]; then setup_script_args="--target-toolchain zephyr" toolchain_prefix=arm-zephyr-eabi- - threshold="135168" # 132 KiB + threshold="135240" # 132 KiB toolchain_cmake=examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake else echo "Fail unsupport OS selection ${{ matrix.os }}" @@ -594,15 +595,22 @@ jobs: strategy: matrix: model: [qwen3_4b, phi_4_mini] + runner: [linux.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12] + backend: [xnnpack] include: - model: qwen3_4b - 
test_with_runner: true + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + backend: torchao - model: phi_4_mini - test_with_runner: false + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + backend: torchao fail-fast: false with: - runner: linux.2xlarge - docker-image: ci-image:executorch-ubuntu-22.04-clang12 + runner: ${{ matrix.runner }} + docker-image: ci-image:${{ matrix.docker-image }} submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 900 @@ -612,38 +620,54 @@ jobs: conda activate "${CONDA_ENV}" PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake + + if [[ "${{ matrix.backend }}" == "torchao" ]]; then + BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 TORCHAO_PARALLEL_BACKEND=OPENMP pip install third-party/ao + fi + pip install -U "huggingface_hub[cli]" - bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }} - - # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. 
- # test-llava-runner-macos: - # name: test-llava-runner-macos - # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - # strategy: - # fail-fast: false - # with: - # runner: macos-14-xlarge - # python-version: '3.11' - # submodules: 'recursive' - # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # timeout: 900 - # script: | - # BUILD_TOOL=cmake - - # bash .ci/scripts/setup-conda.sh - # # Setup MacOS dependencies as there is no Docker support on MacOS atm - # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" - - # # install Llava requirements - # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh - # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh - - # # run python unittest - # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava - - # # run e2e (export, tokenizer and runner) - # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh + bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.model != 'phi_4_mini' && '--test_with_runner' || '' }} ${{ matrix.backend == 'torchao' && '--use_torchao_kernels' || '' }} + + test-multimodal-macos: + if: ${{ !github.event.pull_request.head.repo.fork }} + name: test-multimodal-macos + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + permissions: + id-token: write + contents: read + secrets: inherit + strategy: + fail-fast: false + matrix: + model: ["gemma3-4b"] # llava gives segfault so not covering. 
+ with: + secrets-env: EXECUTORCH_HF_TOKEN + runner: macos-15-xlarge + python-version: '3.11' + submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + echo "::group::Set up ExecuTorch" + bash .ci/scripts/setup-conda.sh + eval "$(conda shell.bash hook)" + + # Install requirements + ${CONDA_RUN} python install_executorch.py + echo "::endgroup::" + + echo "::group::Set up Huggingface" + ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate + ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + ${CONDA_RUN} pip list + echo "::endgroup::" + + echo "::group::Test ${{ matrix.model }}" + ${CONDA_RUN} python .ci/scripts/test_huggingface_optimum_model.py --model ${{ matrix.model }} --quantize --recipe xnnpack + echo "::endgroup::" test-qnn-model: name: test-qnn-model @@ -800,11 +824,26 @@ jobs: echo "Recipe: $RECIPE" echo "Quantize: $QUANTIZE" - echo "::group::Set up ExecuTorch" # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake + + echo "::group::Setup ExecuTorch" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + echo "::endgroup::" + + echo "::group::Test MODEL: $MODEL RECIPE: $RECIPE QUANTIZE: $QUANTIZE" + 
export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}" + python .ci/scripts/test_huggingface_optimum_model.py --model "$MODEL" --recipe "$RECIPE" $QUANTIZE --model_dir "$OUTPUT_DIR" + echo "::endgroup::" + # Build executor_runner with ETdump enabled PYTHON_EXECUTABLE=python cmake -DPYTHON_EXECUTABLE=python \ -DCMAKE_INSTALL_PREFIX=cmake-out \ @@ -813,6 +852,7 @@ jobs: -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ @@ -822,25 +862,6 @@ jobs: -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -Bcmake-out . cmake --build cmake-out -j16 --target install --config Release - echo "::endgroup::" - - echo "::group::Set up Hugging Face" - pip install -U "huggingface_hub[cli]" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) - git clone https://github.com/huggingface/optimum-executorch - pushd optimum-executorch - # There is no release yet, for CI stability, always test from the same commit on main - git checkout $OPTIMUM_ET_COMMIT - python install_dev.py --skip_override_torch - popd - pip list - echo "::endgroup::" - - echo "::group::Run tests" - export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}" - python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} --model_dir ${OUTPUT_DIR} - echo "::endgroup::" echo "::group::Generate artifacts for performance profiling" ./cmake-out/executor_runner \ @@ -907,16 +928,11 @@ jobs: ${CONDA_RUN} python install_executorch.py echo "::endgroup::" - echo "::group::Set up Hugging Face" - pip install -U "huggingface_hub[cli]" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) - git clone 
https://github.com/huggingface/optimum-executorch - pushd optimum-executorch - # There is no release yet, for CI stability, always test from the same commit on main - git checkout $OPTIMUM_ET_COMMIT - ${CONDA_RUN} python install_dev.py --skip_override_torch - popd + echo "::group::Set up Huggingface" + ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate + ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} ${CONDA_RUN} pip list echo "::endgroup::" @@ -962,6 +978,60 @@ jobs: # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}" + # this is for filtering out the qnn changes such that qnn jobs only triggered when the specific files are changed + changes: + runs-on: ubuntu-latest + outputs: + qnn: ${{ steps.filter.outputs.qnn }} + steps: + - uses: actions/checkout@v4 + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + qnn: + - 'backends/qualcomm/**' + - 'examples/qualcomm/**' + - 'examples/models/llama/**' + + test-static-llama-qnn-eval-linux: + needs: changes # has dependency on changes jobs defined above + if: needs.changes.outputs.qnn == 'true' + name: test-static-llama-qnn-eval-linux + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: + config: + - name: "baseline" + flags: "" + threshold: 62.0 + with: + runner: linux.2xlarge + docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk + submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 180 + script: | + # The generic Linux job chooses to use base env, not the one setup by the 
image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + BUILD_TOOL="cmake" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh + # Setup executorch + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" + # Setup install_requirements for llama + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh + + echo ">>> Running config: ${{ matrix.config.name }}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama_eval.sh \ + --flags "${{ matrix.config.flags }}" \ + --threshold "${{ matrix.config.threshold }}" + unittest-release: uses: ./.github/workflows/_unittest.yml permissions: @@ -1016,8 +1086,8 @@ jobs: strategy: fail-fast: false matrix: - model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe] - backend: [portable, xnnpack-f32, xnnpack-q8] + model: [mv3, resnet50, vit, mobilebert, emformer_transcribe] + backend: [portable, xnnpack-q8] with: submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} diff --git a/.lintrunner.toml b/.lintrunner.toml index 0b6a6eb8908..b366c141799 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -206,6 +206,7 @@ exclude_patterns = [ '**/*.png', '**/*.webp', '**/*.jpeg', + '**/*.mp3', '**/*.mp4', '**/*.pte', '**/*.pth', @@ -216,6 +217,9 @@ exclude_patterns = [ '**/*.jpg', '**/*.jar', '**/*.gif', + 'extension/llm/tokenizers', + 'extension/llm/tokenizers/**', + 'examples/cuda', # File contains @generated 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', diff --git a/CMakeLists.txt b/CMakeLists.txt index fc427d517a9..10e2eb437e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -226,7 +226,7 @@ 
if(EXECUTORCH_BUILD_CPUINFO) install( TARGETS cpuinfo EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${_common_include_directories} ) @@ -266,10 +266,22 @@ if(EXECUTORCH_BUILD_PTHREADPOOL) executorch_move_interface_include_directories_to_build_time_only( pthreadpool_interface ) + + if(APPLE) + # Use hidden visibility for pthreadpool on Apple platforms to avoid issues + # with pthreadpool symbols from libtorch_cpu taking precedence over the ones + # from the pthreadpool library statically linked in _portable_lib. The + # pthreadpool public APIs are marked as weak by default on some Apple + # platforms, so setting to hidden visibility works around this by not + # putting the symbol in the indirection table. See + # https://github.com/pytorch/executorch/issues/14321 for more details. + target_compile_options(pthreadpool PRIVATE -fvisibility=hidden) + endif() + install( TARGETS pthreadpool pthreadpool_interface fxdiv EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${_common_include_directories} ) @@ -284,7 +296,10 @@ if(EXECUTORCH_BUILD_TESTS) endif() # TODO(dbort): Fix these warnings and remove this flag. -set(_common_compile_options -Wno-deprecated-declarations -fPIC) +set(_common_compile_options + $<$:/wd4996> + $<$>:-Wno-deprecated-declarations -fPIC> +) # Let files say "include ". 
# TODO(#6475): This requires/assumes that the repo lives in a directory named @@ -587,6 +602,16 @@ endif() if(EXECUTORCH_BUILD_CORTEX_M) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m) + list(APPEND _executorch_backends coretex_m_backend) +endif() + +if(EXECUTORCH_BUILD_CUDA) + # Build common AOTI functionality (required for CUDA) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti) + # Build CUDA-specific AOTI functionality + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda) + # Add aoti_cuda to backends - it already depends on aoti_common + list(APPEND _executorch_backends aoti_cuda) endif() if(EXECUTORCH_BUILD_EXTENSION_APPLE) @@ -630,6 +655,11 @@ if(EXECUTORCH_BUILD_EXTENSION_MODULE) list(APPEND _executorch_extensions extension_module_static) endif() +if(EXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/named_data_map) + list(APPEND _executorch_extensions extension_named_data_map) +endif() + if(EXECUTORCH_BUILD_EXTENSION_LLM) if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) set(SUPPORT_REGEX_LOOKAHEAD ON) @@ -650,15 +680,6 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM) list(APPEND _executorch_extensions tokenizers) endif() -if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) - list(APPEND _executorch_extensions extension_llm_runner) -endif() - -if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple) -endif() - if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) install( @@ -717,7 +738,7 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO) install( TARGETS torchao_ops_executorch torchao_kernels_aarch64 EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${_common_include_directories} ) @@ -728,7 +749,7 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO) install( TARGETS kleidiai EXPORT 
ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${_common_include_directories} ) @@ -738,9 +759,6 @@ endif() if(EXECUTORCH_BUILD_PYBIND) - # Add codegen tools subdirectory for selective_build pybind module - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/codegen/tools) - if(NOT EXECUTORCH_BUILD_EXTENSION_DATA_LOADER) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) endif() @@ -749,6 +767,9 @@ if(EXECUTORCH_BUILD_PYBIND) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() + # Add codegen tools subdirectory for selective_build pybind module + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/codegen/tools) + # Create bundled_module target only for pybindings when bundled_program exists # This target has hard dependencies on devtools generated headers if(TARGET bundled_program) @@ -769,7 +790,10 @@ if(EXECUTORCH_BUILD_PYBIND) bundled_module PUBLIC ${_common_include_directories} ) target_compile_options( - bundled_module PUBLIC -Wno-deprecated-declarations -fPIC + bundled_module + PUBLIC $<$:/wd4996> + $<$>:-Wno-deprecated-declarations + -fPIC> ) endif() @@ -841,8 +865,14 @@ if(EXECUTORCH_BUILD_PYBIND) endif() # compile options for pybind - set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti - -fexceptions + set(_pybind_compile_options + $<$:/EHsc + /GR + /wd4996> + $<$>:-Wno-deprecated-declarations + -fPIC + -frtti + -fexceptions> ) # util lib @@ -869,6 +899,21 @@ if(EXECUTORCH_BUILD_PYBIND) target_compile_options(portable_lib PUBLIC ${_pybind_compile_options}) target_link_libraries(portable_lib PRIVATE ${_dep_libs}) + # Set RPATH to find PyTorch libraries relative to the installation location + # This goes from executorch/extension/pybindings up to site-packages, then to + # torch/lib + if(APPLE) + set_target_properties( + portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib" + INSTALL_RPATH "@loader_path/../../../torch/lib" + ) + else() + set_target_properties( + 
portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib" + INSTALL_RPATH "$ORIGIN/../../../torch/lib" + ) + endif() + install( TARGETS portable_lib EXPORT ExecuTorchTargets @@ -889,6 +934,15 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING) list(APPEND _executorch_extensions extension_training) endif() +if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) + list(APPEND _executorch_extensions extension_llm_runner) +endif() + +if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple) +endif() + if(EXECUTORCH_BUILD_KERNELS_LLM) # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops) @@ -984,7 +1038,7 @@ if(NOT EXECUTORCH_SELECT_OPS_YAML STREQUAL "" install( TARGETS executorch_selected_kernels EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} ) else() # No selective build - link the full library. 
@@ -1006,6 +1060,10 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) extension_runner_util gflags executorch_backends ) + if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR) + list(APPEND _executor_runner_libs extension_flat_tensor) + endif() + if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib) elseif(EXECUTORCH_BUILD_CADENCE) diff --git a/CMakePresets.json b/CMakePresets.json index bcf3bbc8d83..379f4f418ed 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -63,7 +63,8 @@ "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/third-party/ios-cmake/ios.toolchain.cmake", "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/ios.cmake", "PLATFORM": "OS64", - "DEPLOYMENT_TARGET": "17.0" + "DEPLOYMENT_TARGET": "17.0", + "CMAKE_OSX_DEPLOYMENT_TARGET": "12.0" }, "condition": { "lhs": "${hostSystemName}", @@ -80,7 +81,8 @@ "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/third-party/ios-cmake/ios.toolchain.cmake", "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/ios.cmake", "PLATFORM": "SIMULATORARM64", - "DEPLOYMENT_TARGET": "17.0" + "DEPLOYMENT_TARGET": "17.0", + "CMAKE_OSX_DEPLOYMENT_TARGET": "12.0" }, "condition": { "lhs": "${hostSystemName}", diff --git a/CODEOWNERS b/CODEOWNERS index 10baed9ede4..11f3ca07615 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -14,6 +14,7 @@ /backends/transforms @kimishpatel /backends/vulkan @SS-JIA /backends/xnnpack @digantdesai @mcr229 +/backends/nxp @robert-kalmar /devtools @Gasoonjia @@ -33,6 +34,7 @@ /examples/qualcomm @cccclai /examples/selective_build @lucylq @larryliu0820 @JacobSzwejbka /examples/xnnpack @digantdesai @mcr229 +/examples/nxp @robert-kalmar /exir/backend @cccclai @kimishpatel @JacobSzwejbka /exir @JacobSzwejbka @larryliu0820 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2f4de863dad..45e03bd36e1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -199,8 +199,7 @@ We use [`lintrunner`](https://pypi.org/project/lintrunner/) to help make sure th code follows our standards. 
Set it up with: ``` -pip install lintrunner==0.12.7 -pip install lintrunner-adapters==0.12.4 +./install_requirements.sh # (automatically run by install_executorch.sh) lintrunner init ``` diff --git a/README-wheel.md b/README-wheel.md index a59af8ea05f..7ae9b0aa2e0 100644 --- a/README-wheel.md +++ b/README-wheel.md @@ -25,6 +25,6 @@ tutorials and documentation. Here are some starting points: * [Exporting to ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) * Learn the fundamentals of exporting a PyTorch `nn.Module` to ExecuTorch, and optimizing its performance using quantization and hardware delegation. -* Running etLLM on [iOS](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/apple) and [Android](docs/source/llm/llama-demo-android.md) devices. +* Running etLLM on [iOS](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/apple) and [Android](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android) devices. * Build and run LLaMA in a demo mobile app, and learn how to integrate models with your own apps. diff --git a/README.md b/README.md index 17327990a1d..531fcc3b4ef 100644 --- a/README.md +++ b/README.md @@ -1,72 +1,250 @@
- Logo -

ExecuTorch: A powerful on-device AI Framework

+ ExecuTorch logo mark +

ExecuTorch

+

On-device AI inference powered by PyTorch

-
- Contributors - Stargazers - Join our Discord community - Check out the documentation -
+ PyPI - Version + GitHub - Contributors + GitHub - Stars + Discord - Chat with Us + Documentation
-**ExecuTorch** is an end-to-end solution for on-device inference and training. It powers much of Meta's on-device AI experiences across Facebook, Instagram, Meta Quest, Ray-Ban Meta Smart Glasses, WhatsApp, and more. +**ExecuTorch** is PyTorch's unified solution for deploying AI models on-device—from smartphones to microcontrollers—built for privacy, performance, and portability. It powers Meta's on-device AI across **Instagram, WhatsApp, Quest 3, Ray-Ban Meta Smart Glasses**, and [more](https://docs.pytorch.org/executorch/main/success-stories.html). + +Deploy **LLMs, vision, speech, and multimodal models** with the same PyTorch APIs you already know—accelerating research to production with seamless model export, optimization, and deployment. No manual C++ rewrites. No format conversions. No vendor lock-in. + +
+ 📘 Table of Contents + +- [Why ExecuTorch?](#why-executorch) +- [How It Works](#how-it-works) +- [Quick Start](#quick-start) + - [Installation](#installation) + - [Export and Deploy in 3 Steps](#export-and-deploy-in-3-steps) + - [Run on Device](#run-on-device) + - [LLM Example: Llama](#llm-example-llama) +- [Platform & Hardware Support](#platform--hardware-support) +- [Production Deployments](#production-deployments) +- [Examples & Models](#examples--models) +- [Key Features](#key-features) +- [Documentation](#documentation) +- [Community & Contributing](#community--contributing) +- [License](#license) + +
+ +## Why ExecuTorch? + +- **🔒 Native PyTorch Export** — Direct export from PyTorch. No .onnx, .tflite, or intermediate format conversions. Preserve model semantics. +- **⚡ Production-Proven** — Powers billions of users at [Meta with real-time on-device inference](https://engineering.fb.com/2025/07/28/android/executorch-on-device-ml-meta-family-of-apps/). +- **💾 Tiny Runtime** — 50KB base footprint. Runs on microcontrollers to high-end smartphones. +- **🚀 [12+ Hardware Backends](https://docs.pytorch.org/executorch/main/backends-overview.html)** — Open-source acceleration for Apple, Qualcomm, ARM, MediaTek, Vulkan, and more. +- **🎯 One Export, Multiple Backends** — Switch hardware targets with a single line change. Deploy the same model everywhere. + +## How It Works + +ExecuTorch uses **ahead-of-time (AOT) compilation** to prepare PyTorch models for edge deployment: + +1. **🧩 Export** — Capture your PyTorch model graph with `torch.export()` +2. **⚙️ Compile** — Quantize, optimize, and partition to hardware backends → `.pte` +3. **🚀 Execute** — Load `.pte` on-device via lightweight C++ runtime + +Models use a standardized [Core ATen operator set](https://docs.pytorch.org/executorch/main/compiler-ir-advanced.html#intermediate-representation). [Partitioners](https://docs.pytorch.org/executorch/main/compiler-delegate-and-partitioner.html) delegate subgraphs to specialized hardware (NPU/GPU) with CPU fallback. + +Learn more: [How ExecuTorch Works](https://docs.pytorch.org/executorch/main/intro-how-it-works.html) • [Architecture Guide](https://docs.pytorch.org/executorch/main/getting-started-architecture.html) + +## Quick Start + +### Installation + +```bash +pip install executorch +``` + +For platform-specific setup (Android, iOS, embedded systems), see the [Quick Start](https://docs.pytorch.org/executorch/main/quick-start-section.html) documentation for additional info. 
+ +### Export and Deploy in 3 Steps + +```python +import torch +from executorch.exir import to_edge_transform_and_lower +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + +# 1. Export your PyTorch model +model = MyModel().eval() +example_inputs = (torch.randn(1, 3, 224, 224),) +exported_program = torch.export.export(model, example_inputs) + +# 2. Optimize for target hardware (switch backends with one line) +program = to_edge_transform_and_lower( + exported_program, + partitioner=[XnnpackPartitioner()] # CPU | CoreMLPartitioner() for iOS | QnnPartitioner() for Qualcomm +).to_executorch() + +# 3. Save for deployment +with open("model.pte", "wb") as f: + f.write(program.buffer) + +# Test locally via ExecuTorch runtime's pybind API (optional) +from executorch.runtime import Runtime +runtime = Runtime.get() +method = runtime.load_program("model.pte").load_method("forward") +outputs = method.execute([torch.randn(1, 3, 224, 224)]) +``` + +### Run on Device + +**[C++](https://docs.pytorch.org/executorch/main/using-executorch-cpp.html)** +```cpp +#include +#include + +Module module("model.pte"); +auto tensor = make_tensor_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f}); +auto outputs = module.forward({tensor}); +``` + +**[Swift (iOS)](https://docs.pytorch.org/executorch/main/ios-section.html)** +```swift +let module = Module(filePath: "model.pte") +let input = Tensor([1.0, 2.0, 3.0, 4.0]) +let outputs: [Value] = try module.forward([input]) +``` + +**[Kotlin (Android)](https://docs.pytorch.org/executorch/main/android-section.html)** +```kotlin +val module = Module.load("model.pte") +val inputTensor = Tensor.fromBlob(floatArrayOf(1.0f, 2.0f, 3.0f, 4.0f), longArrayOf(2, 2)) +val outputs = module.forward(EValue.from(inputTensor)) +``` + +### LLM Example: Llama + +Export Llama models using the [`export_llm`](https://docs.pytorch.org/executorch/main/llm/export-llm.html) script or 
[Optimum-ExecuTorch](https://github.com/huggingface/optimum-executorch): + +```bash +# Using export_llm +python -m executorch.extension.llm.export.export_llm --model llama3_2 --output llama.pte + +# Using Optimum-ExecuTorch +optimum-cli export executorch \ + --model meta-llama/Llama-3.2-1B \ + --task text-generation \ + --recipe xnnpack \ + --output_dir llama_model +``` -It supports a wide range of models including LLMs (Large Language Models), CV (Computer Vision), ASR (Automatic Speech Recognition), and TTS (Text to Speech). +Run on-device with the LLM runner API: -Platform Support: -- Operating Systems: - - iOS - - MacOS (ARM64) - - Android - - Linux - - Microcontrollers +**[C++](https://docs.pytorch.org/executorch/main/llm/run-with-c-plus-plus.html)** +```cpp +#include -- Hardware Acceleration: - - Apple - - Arm - - Cadence - - MediaTek - - NXP - - OpenVINO - - Qualcomm - - Vulkan - - XNNPACK +auto runner = create_llama_runner("llama.pte", "tiktoken.bin"); +executorch::extension::llm::GenerationConfig config{ + .seq_len = 128, .temperature = 0.8f}; +runner->generate("Hello, how are you?", config); +``` -Key value propositions of ExecuTorch are: +**[Swift (iOS)](https://docs.pytorch.org/executorch/main/llm/run-on-ios.html)** +```swift +let runner = TextRunner(modelPath: "llama.pte", tokenizerPath: "tiktoken.bin") +try runner.generate("Hello, how are you?", Config { + $0.sequenceLength = 128 +}) { token in + print(token, terminator: "") +} +``` -- **Portability:** Compatibility with a wide variety of computing platforms, - from high-end mobile phones to highly constrained embedded systems and - microcontrollers. -- **Productivity:** Enabling developers to use the same toolchains and Developer - Tools from PyTorch model authoring and conversion, to debugging and deployment - to a wide variety of platforms. 
-- **Performance:** Providing end users with a seamless and high-performance - experience due to a lightweight runtime and utilizing full hardware - capabilities such as CPUs, NPUs, and DSPs. +**Kotlin (Android)** — [API Docs](https://docs.pytorch.org/executorch/main/javadoc/org/pytorch/executorch/extension/llm/package-summary.html) • [Demo App](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android/LlamaDemo) +```kotlin +val llmModule = LlmModule("llama.pte", "tiktoken.bin", 0.8f) +llmModule.load() +llmModule.generate("Hello, how are you?", 128, object : LlmCallback { + override fun onResult(result: String) { print(result) } + override fun onStats(stats: String) { } +}) +``` -## Getting Started -To get started you can: +For multimodal models (vision, audio), use the [MultiModal runner API](extension/llm/runner) which extends the LLM runner to handle image and audio inputs alongside text. See [Llava](examples/models/llava/README.md) and [Voxtral](examples/models/voxtral/README.md) examples. -- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/stable/getting-started.html) to get things running locally and deploy a model to a device -- Use this [Colab Notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) to start playing around right away -- Jump straight into LLM use cases by following specific instructions for popular open-source models such as [Llama](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), [Llava](examples/models/llava/README.md), [Voxtral](examples/models/voxtral/README.md), and [LFM2](examples/models/lfm2/README.md). +See [examples/models/llama](examples/models/llama/README.md) for complete workflow including quantization, mobile deployment, and advanced options. 
-## Feedback and Engagement +**Next Steps:** +- 📖 [Step-by-step tutorial](https://docs.pytorch.org/executorch/main/getting-started.html) — Complete walkthrough for your first model +- ⚡ [Colab notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) — Try ExecuTorch instantly in your browser +- 🤖 [Deploy Llama models](examples/models/llama/README.md) — LLM workflow with quantization and mobile demos -We welcome any feedback, suggestions, and bug reports from the community to help -us improve our technology. Check out the [Discussion Board](https://github.com/pytorch/executorch/discussions) or chat real time with us on [Discord](https://discord.gg/Dh43CKSAdc) +## Platform & Hardware Support -## Contributing +| **Platform** | **Supported Backends** | +|------------------|----------------------------------------------------------| +| Android | XNNPACK, Vulkan, Qualcomm, MediaTek, Samsung Exynos | +| iOS | XNNPACK, MPS, CoreML (Neural Engine) | +| Linux / Windows | XNNPACK, OpenVINO, CUDA *(experimental)* | +| macOS | XNNPACK, MPS, Metal *(experimental)* | +| Embedded / MCU | XNNPACK, ARM Ethos-U, NXP, Cadence DSP | -We welcome contributions. To get started review the [guidelines](CONTRIBUTING.md) and chat with us on [Discord](https://discord.gg/Dh43CKSAdc) +See [Backend Documentation](https://docs.pytorch.org/executorch/main/backends-overview.html) for detailed hardware requirements and optimization guides. +## Production Deployments -## Directory Structure +ExecuTorch powers on-device AI at scale across Meta's family of apps, VR/AR devices, and partner deployments. [View success stories →](https://docs.pytorch.org/executorch/main/success-stories.html) -Please refer to the [Codebase structure](CONTRIBUTING.md#codebase-structure) section of the [Contributing Guidelines](CONTRIBUTING.md) for more details. 
+## Examples & Models + +**LLMs:** [Llama 3.2/3.1/3](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), [LiquidAI LFM2](examples/models/lfm2/README.md) + +**Multimodal:** [Llava](examples/models/llava/README.md) (vision-language), [Voxtral](examples/models/voxtral/README.md) (audio-language) + +**Vision/Speech:** [MobileNetV2](https://github.com/meta-pytorch/executorch-examples/tree/main/mv2), [DeepLabV3](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3) + +**Resources:** [`examples/`](examples/) directory • [executorch-examples](https://github.com/meta-pytorch/executorch-examples) mobile demos • [Optimum-ExecuTorch](https://github.com/huggingface/optimum-executorch) for HuggingFace models + +## Key Features + +ExecuTorch provides advanced capabilities for production deployment: + +- **Quantization** — Built-in support via [torchao](https://docs.pytorch.org/ao) for 8-bit, 4-bit, and dynamic quantization +- **Memory Planning** — Optimize memory usage with ahead-of-time allocation strategies +- **Developer Tools** — ETDump profiler, ETRecord inspector, and model debugger +- **Selective Build** — Strip unused operators to minimize binary size +- **Custom Operators** — Extend with domain-specific kernels +- **Dynamic Shapes** — Support variable input sizes with bounded ranges + +See [Advanced Topics](https://docs.pytorch.org/executorch/main/advanced-topics-section.html) for quantization techniques, custom backends, and compiler passes. 
+ +## Documentation + +- [**Documentation Home**](https://docs.pytorch.org/executorch/main/index.html) — Complete guides and tutorials +- [**API Reference**](https://docs.pytorch.org/executorch/main/api-section.html) — Python, C++, Java/Kotlin APIs +- [**Backend Integration**](https://docs.pytorch.org/executorch/main/backend-delegates-integration.html) — Build custom hardware backends +- [**Troubleshooting**](https://docs.pytorch.org/executorch/main/using-executorch-troubleshooting.html) — Common issues and solutions + +## Community & Contributing + +We welcome contributions from the community! + +- 💬 [**GitHub Discussions**](https://github.com/pytorch/executorch/discussions) — Ask questions and share ideas +- 🎮 [**Discord**](https://discord.gg/Dh43CKSAdc) — Chat with the team and community +- 🐛 [**Issues**](https://github.com/pytorch/executorch/issues) — Report bugs or request features +- 🤝 [**Contributing Guide**](CONTRIBUTING.md) — Guidelines and codebase structure ## License -ExecuTorch is BSD licensed, as found in the LICENSE file. + +ExecuTorch is BSD licensed, as found in the [LICENSE](LICENSE) file. + +

+ +--- + +
+

Part of the PyTorch ecosystem

+

+ GitHub • + Documentation +

+
diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt new file mode 100644 index 00000000000..fcabb0a3f2b --- /dev/null +++ b/backends/aoti/CMakeLists.txt @@ -0,0 +1,57 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Build AOTI backend for runtime. +# +# ### Editing this file ### +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +# Use ExecuTorch's standard way to find PyTorch libraries for AOTI +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +find_package_torch() + +# Common AOTI functionality - combines all AOTI common components +set(_aoti_common_sources aoti_model_container.cpp common_shims.cpp) +add_library(aoti_common STATIC ${_aoti_common_sources}) +target_include_directories( + aoti_common + PUBLIC $ + $ + $ + # PyTorch AOTI headers from ExecuTorch's torch detection + ${TORCH_INCLUDE_DIRS} +) +target_compile_options( + aoti_common + PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> +) +# Ensure symbols are exported properly +target_link_options( + aoti_common PUBLIC $<$>:-Wl,--export-dynamic> +) + +# Link against ExecuTorch libraries and standard libraries +target_link_libraries(aoti_common PUBLIC extension_tensor ${CMAKE_DL_LIBS}) +executorch_target_link_options_shared_lib(aoti_common) + +install( + TARGETS aoti_common + EXPORT ExecuTorchTargets + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) diff --git a/backends/aoti/README.md b/backends/aoti/README.md new file mode 100644 index 00000000000..74b45a35e5d --- /dev/null +++ b/backends/aoti/README.md @@ -0,0 +1,28 @@ +# AOTI Common Library + +This directory 
contains **common library components** for AOTI (Ahead-of-Time Inductor) driven backends in ExecuTorch, **not a standalone backend**. + +## Purpose + +The code in this directory provides shared functionality and utilities that are used by actual AOTI-driven backends such as: + +- **CUDA backend** - Uses AOTI for GPU acceleration +- Other AOTI-powered backends + +## Components + +- **`common_shims.cpp/h`** - Common shim functions that bridge ExecuTorch tensor operations with AOTI requirements +- **`aoti_model_container.cpp/h`** - Model container functionality for AOTI models +- **`utils.h`** - Utility functions and type definitions +- **`tests/`** - Unit tests for the common functionality + +## Usage + +This library is intended to be used as a dependency by actual AOTI backend implementations. It is not a backend that can be used directly for model execution. + +For example backend implementations that use this common library, see: +- `executorch/backends/cuda/` - CUDA AOTI backend + +## Building + +The common library components are built as part of the AOTI backend build process. See the `TARGETS` file for build configurations. diff --git a/backends/aoti/TARGETS b/backends/aoti/TARGETS new file mode 100644 index 00000000000..77871de4469 --- /dev/null +++ b/backends/aoti/TARGETS @@ -0,0 +1,3 @@ +load("targets.bzl", "define_common_targets") + +define_common_targets() diff --git a/backends/aoti/aoti_model_container.cpp b/backends/aoti/aoti_model_container.cpp new file mode 100644 index 00000000000..46a246faeb8 --- /dev/null +++ b/backends/aoti/aoti_model_container.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace executorch { +namespace backends { +namespace aoti { + +extern "C" { + +// Global function pointers for AOT Inductor model container operations +// These will be loaded dynamically from the shared library +AOTInductorModelContainerCreateWithDeviceFunc + AOTInductorModelContainerCreateWithDevice = nullptr; +AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete = nullptr; +AOTInductorModelContainerGetNumInputsFunc + AOTInductorModelContainerGetNumInputs = nullptr; +AOTInductorModelContainerGetNumOutputsFunc + AOTInductorModelContainerGetNumOutputs = nullptr; +AOTInductorModelContainerRunFunc AOTInductorModelContainerRun = nullptr; + +// Additional global function pointers for AOT Inductor model container +// operations needed by Metal backend +AOTInductorModelContainerGetInputNameFunc + AOTInductorModelContainerGetInputName = nullptr; +AOTInductorModelContainerGetNumConstantsFunc + AOTInductorModelContainerGetNumConstants = nullptr; + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/aoti_model_container.h b/backends/aoti/aoti_model_container.h new file mode 100644 index 00000000000..877f019c457 --- /dev/null +++ b/backends/aoti/aoti_model_container.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + +extern "C" { + +// Type definitions +using AOTITensorHandle = Tensor*; +using AOTIRuntimeError = Error; + +// Forward declarations for AOT Inductor model container +struct AOTInductorModelContainerOpaque; +using AOTInductorModelContainerHandle = AOTInductorModelContainerOpaque*; +using AOTInductorStreamHandle = void*; +using AOTIProxyExecutorHandle = void*; + +// Function pointer types for AOT Inductor model container operations +using AOTInductorModelContainerCreateWithDeviceFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + const char* device_str, + const char* cubin_dir); + +using AOTInductorModelContainerDeleteFunc = + AOTIRuntimeError (*)(AOTInductorModelContainerHandle container_handle); + +using AOTInductorModelContainerGetNumInputsFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + size_t* num_inputs); + +using AOTInductorModelContainerGetNumOutputsFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + size_t* num_outputs); + +using AOTInductorModelContainerRunFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + Tensor** input_handles, // array of input Tensor*; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + Tensor** output_handles, // array for writing output Tensor*; handles + // will be stolen by the caller; the array itself + // is borrowed + size_t n_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle); + +// Global function pointers (will be loaded dynamically) +extern AOTInductorModelContainerCreateWithDeviceFunc + AOTInductorModelContainerCreateWithDevice; +extern AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete; +extern 
AOTInductorModelContainerGetNumInputsFunc + AOTInductorModelContainerGetNumInputs; +extern AOTInductorModelContainerGetNumOutputsFunc + AOTInductorModelContainerGetNumOutputs; +extern AOTInductorModelContainerRunFunc AOTInductorModelContainerRun; + +// Retrieves the name of an input tensor by index from the AOTI model container. +// Needed by Metal backend +using AOTInductorModelContainerGetInputNameFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + size_t input_idx, + const char** input_name); + +// Retrieves the number of constants from the AOTI model container. +// Needed by Metal backend +using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + size_t* num_constants); + +// Global function pointers (will be loaded dynamically). +// Needed by Metal backend +extern AOTInductorModelContainerGetInputNameFunc + AOTInductorModelContainerGetInputName; +extern AOTInductorModelContainerGetNumConstantsFunc + AOTInductorModelContainerGetNumConstants; + +} // extern "C" + +// AOTI Delegate Handle structure +struct AOTIDelegateHandle { + void* so_handle; + std::string so_path; + AOTInductorModelContainerHandle container_handle; + void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header + // dependency +}; + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp new file mode 100644 index 00000000000..1afd137aa26 --- /dev/null +++ b/backends/aoti/common_shims.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +namespace internal { +// Global storage for tensor metadata +std::unordered_map> tensor_to_sizes; +std::unordered_map> tensor_to_strides; +} // namespace internal + +extern "C" { + +// Autograd mode functions +int32_t aoti_torch_grad_mode_is_enabled() { + // No autograd ever + return false; +} + +void aoti_torch_grad_mode_set_enabled(bool enabled) { + if (enabled) { + throw std::runtime_error("Cannot enable autograd"); + } +} + +// Tensor attribute operations +AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr) { + *ret_data_ptr = tensor->mutable_data_ptr(); + return Error::Ok; +} + +AOTITorchError aoti_torch_get_storage_offset( + Tensor* tensor, + int64_t* ret_storage_offset) { + // Storage offset is always 0 in ET + *ret_storage_offset = 0; + + return Error::Ok; +} + +AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) { + auto it = internal::tensor_to_strides.find(tensor); + bool needs_update = false; + + if (it == internal::tensor_to_strides.end()) { + needs_update = true; + } else { + // CRITICAL: Multimodal models reuse tensors with different shapes across + // executions (e.g., variable-length audio). We MUST validate cached + // metadata matches current tensor state, or CUDA kernels will receive + // incorrect shapes leading to memory corruption and segfaults. 
+ auto tensor_strides = tensor->strides(); + needs_update = !std::equal( + it->second.begin(), + it->second.end(), + tensor_strides.begin(), + tensor_strides.end()); + } + + if (needs_update) { + std::vector strides(tensor->dim()); + auto tensor_strides = tensor->strides(); + for (int i = 0; i < tensor->dim(); i++) { + strides[i] = tensor_strides[i]; + } + it = + internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)) + .first; + } + + // For 0D tensors, data() returns nullptr on empty vectors, but we need to + // return a valid pointer + if (it->second.empty()) { + static int64_t empty_strides_placeholder = 0; + *ret_strides = &empty_strides_placeholder; + } else { + *ret_strides = it->second.data(); + } + + return Error::Ok; +} + +AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) { + *ret_dtype = static_cast(tensor->scalar_type()); + + return Error::Ok; +} + +AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) { + auto it = internal::tensor_to_sizes.find(tensor); + bool needs_update = false; + + if (it == internal::tensor_to_sizes.end()) { + needs_update = true; + } else { + // CRITICAL: Multimodal models reuse tensors with different shapes across + // executions (e.g., variable-length audio). We MUST validate cached + // metadata matches current tensor state, or CUDA kernels will receive + // incorrect shapes leading to memory corruption and segfaults. 
+ auto tensor_sizes = tensor->sizes(); + needs_update = !std::equal( + it->second.begin(), + it->second.end(), + tensor_sizes.begin(), + tensor_sizes.end()); + } + + if (needs_update) { + std::vector sizes(tensor->dim()); + auto tensor_sizes = tensor->sizes(); + for (int i = 0; i < tensor->dim(); i++) { + sizes[i] = tensor_sizes[i]; + } + it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)) + .first; + } + + // For 0D tensors, data() returns nullptr on empty vectors, but we need to + // return a valid pointer + if (it->second.empty()) { + static int64_t empty_sizes_placeholder = 0; + *ret_sizes = &empty_sizes_placeholder; + } else { + *ret_sizes = it->second.data(); + } + + return Error::Ok; +} + +AOTITorchError aoti_torch_get_device_index( + Tensor* tensor, + int32_t* ret_device_index) { + // Let's assume all tensors AOTI using are on CUDA:0 + *ret_device_index = 0; + return Error::Ok; +} + +AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim) { + *ret_dim = static_cast(tensor->dim()); + return Error::Ok; +} + +// Device and layout utility functions +int32_t aoti_torch_device_type_cpu() { + // Let's say cpu is 0 for ET as well + return 0; +} + +int32_t aoti_torch_layout_strided() { + // ET only support strided layout, the return value will always be 0, a.k.a + // at::Layout::Strided; + return 0; +} + +// Dtype constants - these return the PyTorch dtype codes +int32_t aoti_torch_dtype_float32() { + return 6; // PyTorch's float32 dtype code +} + +int32_t aoti_torch_dtype_bfloat16() { + return 15; // PyTorch's bfloat16 dtype code +} + +int32_t aoti_torch_dtype_int64() { + return 4; // PyTorch's int64 dtype code +} + +// Dtype utility function needed by Metal backend. +// Returns the size of the dtype in bytes. 
+size_t aoti_torch_dtype_element_size(int32_t dtype) { + return dtype_to_element_size(dtype); +} + +// Cleanup functions +void cleanup_tensor_metadata() { + internal::tensor_to_sizes.clear(); + internal::tensor_to_strides.clear(); +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h new file mode 100644 index 00000000000..b79e4c86715 --- /dev/null +++ b/backends/aoti/common_shims.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +// Common using declarations for ExecuTorch types +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + +extern "C" { + +// Common AOTI type aliases +using AOTIRuntimeError = Error; +using AOTITorchError = Error; + +// Global storage for tensor metadata +extern std::unordered_map> tensor_to_sizes; +extern std::unordered_map> tensor_to_strides; + +// Attribute-related operations (memory-irrelevant) +AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); + +AOTITorchError aoti_torch_get_storage_offset( + Tensor* tensor, + int64_t* ret_storage_offset); + +AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); + +AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); + +AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); + +AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); + +AOTITorchError aoti_torch_get_device_index( + Tensor* tensor, + int32_t* ret_device_index); + +AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); + +// Utility 
functions for device and layout information +int32_t aoti_torch_device_type_cpu(); +int32_t aoti_torch_layout_strided(); +int32_t aoti_torch_dtype_float32(); +int32_t aoti_torch_dtype_bfloat16(); +int32_t aoti_torch_dtype_int64(); + +// Dtype utility function needed by Metal backend +size_t aoti_torch_dtype_element_size(int32_t dtype); + +// Autograd mode functions +int32_t aoti_torch_grad_mode_is_enabled(); +void aoti_torch_grad_mode_set_enabled(bool enabled); + +// Cleanup functions for clearing global state +void cleanup_tensor_metadata(); + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/targets.bzl b/backends/aoti/targets.bzl new file mode 100644 index 00000000000..8bf44573bb3 --- /dev/null +++ b/backends/aoti/targets.bzl @@ -0,0 +1,58 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + # AOTI common shims functionality + runtime.cxx_library( + name = "common_shims", + srcs = [ + "common_shims.cpp", + ], + headers = [ + "common_shims.h", + "utils.h", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + # Constructor needed for backend registration. + compiler_flags = ["-Wno-global-constructors"], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core/exec_aten:lib", + ], + ) + + # AOTI model container functionality + runtime.cxx_library( + name = "model_container", + srcs = [ + "aoti_model_container.cpp", + ], + headers = [ + "aoti_model_container.h", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + # Constructor needed for backend registration. 
+ compiler_flags = ["-Wno-global-constructors"], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "//executorch/runtime/backend:interface", + "//executorch/runtime/core:core", + ], + ) + + # Common AOTI functionality (combining both common_shims and model_container) + runtime.cxx_library( + name = "aoti_common", + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + visibility = ["@EXECUTORCH_CLIENTS"], + exported_deps = [ + ":common_shims", + ":model_container", + ], + ) diff --git a/backends/aoti/tests/TARGETS b/backends/aoti/tests/TARGETS new file mode 100644 index 00000000000..8daa8abd4d7 --- /dev/null +++ b/backends/aoti/tests/TARGETS @@ -0,0 +1,22 @@ +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") + +oncall("executorch") + +cpp_unittest( + name = "test_common_shims", + srcs = [ + "test_common_shims.cpp", + ], + headers = [ + "utils.h", + ], + deps = [ + "//executorch/backends/aoti:common_shims", + "//executorch/extension/tensor:tensor", + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/extension/tensor:tensor", + ], +) diff --git a/backends/aoti/tests/test_common_shims.cpp b/backends/aoti/tests/test_common_shims.cpp new file mode 100644 index 00000000000..980eae96122 --- /dev/null +++ b/backends/aoti/tests/test_common_shims.cpp @@ -0,0 +1,324 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::aoti; +using namespace executorch::backends::aoti::test; +using namespace executorch::runtime; +using executorch::runtime::etensor::Tensor; + +// Test fixture for common shims tests +class CommonShimsTest : public ::testing::Test { + protected: + void SetUp() override { + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + } + + void TearDown() override { + // Clean up metadata and free any tensor data + cleanup_tensor_metadata(); + for (auto& tensor : test_tensors_) { + free_tensor_data(tensor.get()); + } + test_tensors_.clear(); + } + + // Helper to create and track test tensors for cleanup + Tensor* create_tracked_tensor(const std::vector& sizes) { + auto tensor = create_test_tensor(sizes); + Tensor* ptr = tensor.get(); + test_tensors_.push_back(tensor); + return ptr; + } + + private: + std::vector> test_tensors_; +}; + +// Test aoti_torch_get_sizes basic functionality +TEST_F(CommonShimsTest, GetSizesBasicFunctionality) { + // Test 1D tensor + auto tensor_1d = create_tracked_tensor({5}); + int64_t* sizes_ptr; + AOTITorchError error = aoti_torch_get_sizes(tensor_1d, &sizes_ptr); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(sizes_ptr, nullptr); + EXPECT_EQ(sizes_ptr[0], 5); + + // Test 2D tensor + auto tensor_2d = create_tracked_tensor({3, 4}); + error = aoti_torch_get_sizes(tensor_2d, &sizes_ptr); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(sizes_ptr, nullptr); + EXPECT_EQ(sizes_ptr[0], 3); + EXPECT_EQ(sizes_ptr[1], 4); + + // Test 3D tensor + auto tensor_3d = create_tracked_tensor({2, 3, 4}); + error = aoti_torch_get_sizes(tensor_3d, &sizes_ptr); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(sizes_ptr, nullptr); + EXPECT_EQ(sizes_ptr[0], 2); + EXPECT_EQ(sizes_ptr[1], 3); + EXPECT_EQ(sizes_ptr[2], 4); +} + +// Test aoti_torch_get_strides basic functionality +TEST_F(CommonShimsTest, GetStridesBasicFunctionality) { + // 
Test 1D tensor + auto tensor_1d = create_tracked_tensor({5}); + int64_t* strides_ptr; + AOTITorchError error = aoti_torch_get_strides(tensor_1d, &strides_ptr); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(strides_ptr, nullptr); + EXPECT_EQ(strides_ptr[0], 1); + + // Test 2D tensor - row major: [3, 4] should have strides [4, 1] + auto tensor_2d = create_tracked_tensor({3, 4}); + error = aoti_torch_get_strides(tensor_2d, &strides_ptr); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(strides_ptr, nullptr); + EXPECT_EQ(strides_ptr[0], 4); + EXPECT_EQ(strides_ptr[1], 1); + + // Test 3D tensor - row major: [2, 3, 4] should have strides [12, 4, 1] + auto tensor_3d = create_tracked_tensor({2, 3, 4}); + error = aoti_torch_get_strides(tensor_3d, &strides_ptr); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(strides_ptr, nullptr); + EXPECT_EQ(strides_ptr[0], 12); + EXPECT_EQ(strides_ptr[1], 4); + EXPECT_EQ(strides_ptr[2], 1); +} + +// Test caching logic for sizes +TEST_F(CommonShimsTest, SizesCachingLogic) { + auto tensor = create_tracked_tensor({2, 3, 4}); + + // First call should cache the sizes + int64_t* sizes_ptr1; + AOTITorchError error = aoti_torch_get_sizes(tensor, &sizes_ptr1); + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(sizes_ptr1, nullptr); + + // Second call should return the same cached pointer + int64_t* sizes_ptr2; + error = aoti_torch_get_sizes(tensor, &sizes_ptr2); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(sizes_ptr1, sizes_ptr2); // Should be the exact same pointer + + // Values should still be correct + EXPECT_EQ(sizes_ptr2[0], 2); + EXPECT_EQ(sizes_ptr2[1], 3); + EXPECT_EQ(sizes_ptr2[2], 4); +} + +// Test caching logic for strides +TEST_F(CommonShimsTest, StridesCachingLogic) { + auto tensor = create_tracked_tensor({2, 3, 4}); + + // First call should cache the strides + int64_t* strides_ptr1; + AOTITorchError error = aoti_torch_get_strides(tensor, &strides_ptr1); + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(strides_ptr1, nullptr); + + // Second call should return 
the same cached pointer + int64_t* strides_ptr2; + error = aoti_torch_get_strides(tensor, &strides_ptr2); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(strides_ptr1, strides_ptr2); // Should be the exact same pointer + + // Values should still be correct + EXPECT_EQ(strides_ptr2[0], 12); + EXPECT_EQ(strides_ptr2[1], 4); + EXPECT_EQ(strides_ptr2[2], 1); +} + +// Test that different tensors have different cached entries +TEST_F(CommonShimsTest, DifferentTensorsCacheSeparately) { + auto tensor1 = create_tracked_tensor({2, 3}); + auto tensor2 = create_tracked_tensor({4, 5}); + + // Get sizes for both tensors + int64_t* sizes1_ptr; + int64_t* sizes2_ptr; + + EXPECT_EQ(aoti_torch_get_sizes(tensor1, &sizes1_ptr), Error::Ok); + EXPECT_EQ(aoti_torch_get_sizes(tensor2, &sizes2_ptr), Error::Ok); + + // Pointers should be different (different cache entries) + EXPECT_NE(sizes1_ptr, sizes2_ptr); + + // Values should be correct + EXPECT_EQ(sizes1_ptr[0], 2); + EXPECT_EQ(sizes1_ptr[1], 3); + EXPECT_EQ(sizes2_ptr[0], 4); + EXPECT_EQ(sizes2_ptr[1], 5); + + // Test strides as well + int64_t* strides1_ptr; + int64_t* strides2_ptr; + + EXPECT_EQ(aoti_torch_get_strides(tensor1, &strides1_ptr), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor2, &strides2_ptr), Error::Ok); + + // Pointers should be different (different cache entries) + EXPECT_NE(strides1_ptr, strides2_ptr); + + // Values should be correct + EXPECT_EQ(strides1_ptr[0], 3); + EXPECT_EQ(strides1_ptr[1], 1); + EXPECT_EQ(strides2_ptr[0], 5); + EXPECT_EQ(strides2_ptr[1], 1); +} + +// Test cache persistence across multiple calls +TEST_F(CommonShimsTest, CachePersistence) { + auto tensor = create_tracked_tensor({3, 4, 5}); + + // Multiple calls to sizes should all return the same pointer + int64_t* sizes_ptr1; + int64_t* sizes_ptr2; + int64_t* sizes_ptr3; + + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); + EXPECT_EQ(aoti_torch_get_sizes(tensor, 
&sizes_ptr3), Error::Ok); + + EXPECT_EQ(sizes_ptr1, sizes_ptr2); + EXPECT_EQ(sizes_ptr2, sizes_ptr3); + + // Multiple calls to strides should all return the same pointer + int64_t* strides_ptr1; + int64_t* strides_ptr2; + int64_t* strides_ptr3; + + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr3), Error::Ok); + + EXPECT_EQ(strides_ptr1, strides_ptr2); + EXPECT_EQ(strides_ptr2, strides_ptr3); +} + +// Test 0D tensor (scalar) +TEST_F(CommonShimsTest, ScalarTensor) { + auto tensor_0d = create_tracked_tensor({}); + + // Test sizes for 0D tensor + int64_t* sizes_ptr; + AOTITorchError error = aoti_torch_get_sizes(tensor_0d, &sizes_ptr); + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(sizes_ptr, nullptr); + + // Test strides for 0D tensor + int64_t* strides_ptr; + error = aoti_torch_get_strides(tensor_0d, &strides_ptr); + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(strides_ptr, nullptr); + + // Cache should work for 0D tensors too + int64_t* sizes_ptr2; + error = aoti_torch_get_sizes(tensor_0d, &sizes_ptr2); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(sizes_ptr, sizes_ptr2); +} + +// Test large tensor dimensions +TEST_F(CommonShimsTest, LargeTensorDimensions) { + auto tensor = create_tracked_tensor({100, 200, 300, 400}); + + // Test sizes + int64_t* sizes_ptr; + AOTITorchError error = aoti_torch_get_sizes(tensor, &sizes_ptr); + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(sizes_ptr, nullptr); + EXPECT_EQ(sizes_ptr[0], 100); + EXPECT_EQ(sizes_ptr[1], 200); + EXPECT_EQ(sizes_ptr[2], 300); + EXPECT_EQ(sizes_ptr[3], 400); + + // Test strides - expected: [24000000, 120000, 400, 1] + int64_t* strides_ptr; + error = aoti_torch_get_strides(tensor, &strides_ptr); + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(strides_ptr, nullptr); + EXPECT_EQ(strides_ptr[0], 24000000); + EXPECT_EQ(strides_ptr[1], 120000); + EXPECT_EQ(strides_ptr[2], 400); + 
EXPECT_EQ(strides_ptr[3], 1); +} + +// Test that cleanup_tensor_metadata clears the cache +TEST_F(CommonShimsTest, CleanupFunctionality) { + auto tensor = create_tracked_tensor({2, 3}); + + // Cache some data + int64_t* sizes_ptr1; + int64_t* strides_ptr1; + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); + + // Clear the cache + cleanup_tensor_metadata(); + + // Getting sizes/strides again should create new cache entries + // (We can't directly test if the pointers are different since that would be + // implementation-dependent, but we can at least verify the functions still + // work) + int64_t* sizes_ptr2; + int64_t* strides_ptr2; + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); + + // Values should still be correct + EXPECT_EQ(sizes_ptr2[0], 2); + EXPECT_EQ(sizes_ptr2[1], 3); + EXPECT_EQ(strides_ptr2[0], 3); + EXPECT_EQ(strides_ptr2[1], 1); +} + +// Test mixed operations to ensure caches are independent +TEST_F(CommonShimsTest, IndependentCaches) { + auto tensor = create_tracked_tensor({2, 3, 4}); + + // Get sizes first + int64_t* sizes_ptr1; + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); + + // Get strides + int64_t* strides_ptr1; + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); + + // Get sizes again - should be cached + int64_t* sizes_ptr2; + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); + EXPECT_EQ(sizes_ptr1, sizes_ptr2); + + // Get strides again - should be cached + int64_t* strides_ptr2; + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); + EXPECT_EQ(strides_ptr1, strides_ptr2); + + // Sizes and strides pointers should be different (different caches) + EXPECT_NE(sizes_ptr1, strides_ptr1); +} diff --git a/backends/aoti/tests/utils.h b/backends/aoti/tests/utils.h new file mode 100644 index 
00000000000..1f26f7e2d51 --- /dev/null +++ b/backends/aoti/tests/utils.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { +namespace test { + +// Use the same type aliases as in common_shims.h +using executorch::runtime::etensor::Tensor; + +/** + * Creates a test tensor with the specified shape and scalar type + */ +inline std::shared_ptr create_test_tensor( + const std::vector& sizes, + exec_aten::ScalarType dtype = exec_aten::ScalarType::Float) { + // Calculate total number of elements + int64_t total_elements = 1; + for (int64_t size : sizes) { + total_elements *= size; + } + + // Calculate strides (row-major layout) + std::vector strides(sizes.size()); + if (sizes.size() > 0) { + strides[sizes.size() - 1] = 1; + for (int i = sizes.size() - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + } + + // Allocate data buffer + size_t dtype_size = exec_aten::elementSize(dtype); + void* data = malloc(total_elements * dtype_size); + + // Convert sizes and strides to the required type + std::vector sizes_converted( + sizes.begin(), sizes.end()); + std::vector strides_converted( + strides.begin(), strides.end()); + + // Create the tensor with the correct argument types and count + auto tensor = executorch::extension::from_blob( + data, sizes_converted, strides_converted, dtype); + + return tensor; +} + +/** + * Helper to clean up tensor data that was allocated with malloc + */ +inline void free_tensor_data(Tensor* tensor) { + if (tensor && tensor->mutable_data_ptr()) { + free(tensor->mutable_data_ptr()); + } +} + +} // namespace test +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/utils.h 
b/backends/aoti/utils.h new file mode 100644 index 00000000000..78c07bcea6e --- /dev/null +++ b/backends/aoti/utils.h @@ -0,0 +1,99 @@ + +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +// Common using declarations for ExecuTorch types +using executorch::runtime::Error; + +extern "C" { + +// Common AOTI type aliases +using AOTITorchError = Error; + +// Map int32_t dtype to ExecuTorch ScalarType (robust version of hardcoded +// ScalarType::Float) +inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) { + // Convert based on known PyTorch dtype codes (without CUDA-specific + // dependency) + switch (dtype) { + case 4: // PyTorch's int64 dtype code + return executorch::aten::ScalarType::Long; + case 6: // PyTorch's float32 dtype code + return executorch::aten::ScalarType::Float; + case 15: // PyTorch's bfloat16 dtype code + return executorch::aten::ScalarType::BFloat16; + // Future support for additional dtypes can be added here + default: + ET_LOG(Error, "Unsupported dtype: %d for ScalarType conversion", dtype); + return executorch::aten::ScalarType::Undefined; + } +} + +// Map int32_t dtype to number of bytes per element (reusing ExecuTorch's +// elementSize function) +inline size_t dtype_to_element_size(int32_t dtype) { + // First convert int32_t dtype to ExecuTorch ScalarType, then use existing + // elementSize function + executorch::aten::ScalarType scalar_type = dtype_to_scalar_type(dtype); + if (scalar_type == executorch::aten::ScalarType::Undefined) { + ET_LOG(Error, "Unsupported dtype: %d for element size calculation", dtype); + return 0; // Return 0 to indicate error + } + + // Reuse ExecuTorch's existing elementSize 
function from scalar_type_util.h + return executorch::runtime::elementSize(scalar_type); +} + +// Storage offset validation utility function +inline AOTITorchError validate_storage_offset(int64_t storage_offset) { + // Storage offset must always be 0 + if (storage_offset != 0) { + ET_LOG( + Error, + "Storage offset must be 0. Got storage_offset: %ld", + storage_offset); + return Error::InvalidArgument; + } + return Error::Ok; +} + +// Check if tensor is in contiguous memory format (NCHW for 4D tensors) +// Contiguous format means strides decrease from left to right: +// For NCHW: strides = [C*H*W, H*W, W, 1] +inline bool is_tensor_contiguous( + int64_t ndim, + const int64_t* sizes, + const int64_t* strides) { + int64_t expected_stride = 1; + for (int64_t i = ndim - 1; i >= 0; i--) { + if (strides[i] != expected_stride) { + return false; + } + expected_stride *= sizes[i]; + } + return true; +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt index 9879a05e3dc..17e2d94e336 100644 --- a/backends/apple/coreml/CMakeLists.txt +++ b/backends/apple/coreml/CMakeLists.txt @@ -115,7 +115,7 @@ if(APPLE) endif() target_compile_options(coreml_util PUBLIC -fPIC) -install(TARGETS coreml_util DESTINATION lib) +install(TARGETS coreml_util DESTINATION ${CMAKE_INSTALL_LIBDIR}) install( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util @@ -154,7 +154,7 @@ target_compile_options(coreml_inmemoryfs PUBLIC -fPIC) install( TARGETS coreml_inmemoryfs - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${_common_include_directories} ) @@ -251,7 +251,7 @@ if(APPLE) install( TARGETS coremldelegate coreml_util coreml_inmemoryfs EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py 
b/backends/apple/coreml/compiler/coreml_preprocess.py index d1614f30451..16ace2e7a88 100644 --- a/backends/apple/coreml/compiler/coreml_preprocess.py +++ b/backends/apple/coreml/compiler/coreml_preprocess.py @@ -6,6 +6,7 @@ import logging import shutil +import tempfile import uuid from dataclasses import asdict, dataclass from enum import Enum @@ -415,7 +416,7 @@ def preprocess_model( mlmodel: ct.models.MLModel, model_type: MODEL_TYPE ) -> PreprocessResult: identifier = "executorch_" + str(uuid.uuid4()) - dir_path: Path = Path("tmp") / identifier + dir_path: Path = Path(tempfile.gettempdir()) / identifier model_dir_path: Path = dir_path / "lowered_module" model_spec: ct.proto.Model_pb2 = mlmodel.get_spec() logger.warning( diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h index a9e06efa90d..11d957044e9 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h @@ -99,17 +99,6 @@ NS_ASSUME_NONNULL_BEGIN - (NSUInteger)compact:(NSUInteger)sizeInBytes error:(NSError* __autoreleasing*)error; -/// Executes a block with a unique temporary directory. -/// -/// A new temporary subdirectory URL is created inside the receiver’s designated -/// base directory. The directory is passed to the block, which can use it to -/// perform temporary file operations. After the block finishes executing, -/// the directory and its contents are removed. -/// -/// @param block A block to execute. The block receives a unique URL. -- (void)withTemporaryDirectory:(void (^)(NSURL* directoryURL))block; - - /// Purges the assets storage. The assets are moved to the trash directory and are asynchronously /// deleted. /// @@ -128,12 +117,6 @@ NS_ASSUME_NONNULL_BEGIN /// contents are deleted asynchronously. 
@property (copy, readonly, nonatomic) NSURL* trashDirectoryURL; - -/// The staging directory URL, used to hold assets that are being prepared or processed -/// before they are moved into their final location. The contents of this directory -/// are temporary and may be cleared when no longer needed. -@property (copy, readonly, nonatomic) NSURL* stagingDirectoryURL; - /// The file manager. @property (strong, readonly, nonatomic) NSFileManager* fileManager; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm index 53c3d1cdc69..256026e1f09 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm @@ -254,29 +254,6 @@ BOOL is_asset_alive(NSMapTable *assets_in_use_map, return assets; } - -NSURL * _Nullable move_to_directory(NSURL *url, - NSURL *directoryURL, - NSFileManager *fileManager, - NSError * __autoreleasing *error) { - if (!url) { - ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorInternalError, "Move operation failed: source URL is nil."); - return nil; - } - - if (!directoryURL) { - ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorInternalError, "Move operation failed: destination URL is nil."); - return nil; - } - - NSURL *dstURL = [directoryURL URLByAppendingPathComponent:[NSUUID UUID].UUIDString]; - if (![fileManager moveItemAtURL:url toURL:dstURL error:error]) { - return nil; - } - - return dstURL; -} - } //namespace @interface ETCoreMLAssetManager () { @@ -322,17 +299,12 @@ - (nullable instancetype)initWithDatabase:(const std::shared_ptr&)data if (!managedAssetsDirectoryURL) { return nil; } - + NSURL *managedTrashDirectoryURL = ::create_directory_if_needed(trashDirectoryURL, @"models", fileManager, error); if (!managedTrashDirectoryURL) { return nil; } - - NSURL *managedStagingDirectoryURL = ::create_directory_if_needed(assetsDirectoryURL, @"staging", fileManager, error); - if 
(!managedStagingDirectoryURL) { - return nil; - } - + // If directory is empty then purge the stores if (::is_directory_empty(managedAssetsDirectoryURL, fileManager, nil)) { assetsMetaStore.impl()->purge(ec); @@ -343,7 +315,6 @@ - (nullable instancetype)initWithDatabase:(const std::shared_ptr&)data _assetsStore = std::move(assetsStore); _assetsMetaStore = std::move(assetsMetaStore); _assetsDirectoryURL = managedAssetsDirectoryURL; - _stagingDirectoryURL = managedStagingDirectoryURL; _trashDirectoryURL = managedTrashDirectoryURL; _estimatedSizeInBytes = sizeInBytes.value(); _maxAssetsSizeInBytes = maxAssetsSizeInBytes; @@ -375,15 +346,15 @@ - (nullable instancetype)initWithDatabaseURL:(NSURL *)databaseURL error:error]; } -- (void)withTemporaryDirectory:(void (^)(NSURL *directoryURL))block { - NSURL *dstURL = [self.stagingDirectoryURL URLByAppendingPathComponent:[NSUUID UUID].UUIDString]; - block(dstURL); - if (![self.fileManager fileExistsAtPath:dstURL.path]) { - return; +- (nullable NSURL *)moveURL:(NSURL *)url + toUniqueURLInDirectory:(NSURL *)directoryURL + error:(NSError * __autoreleasing *)error { + NSURL *dstURL = [directoryURL URLByAppendingPathComponent:[NSUUID UUID].UUIDString]; + if (![self.fileManager moveItemAtURL:url toURL:dstURL error:error]) { + return nil; } - - move_to_directory(dstURL, self.trashDirectoryURL, self.fileManager, nil); - [self cleanupTrashDirectory]; + + return dstURL; } - (void)cleanupAssetIfNeeded:(ETCoreMLAsset *)asset { @@ -436,8 +407,9 @@ - (nullable ETCoreMLAsset *)_storeAssetAtURL:(NSURL *)srcURL return false; } - // If a file already exists at `dstURL`, move it to the trash for removal. - move_to_directory(dstURL, self.trashDirectoryURL, self.fileManager, nil); + // If an asset exists move it + [self moveURL:dstURL toUniqueURLInDirectory:self.trashDirectoryURL error:nil]; + // Move the asset to assets directory. 
if (![self.fileManager moveItemAtURL:srcURL toURL:dstURL error:error]) { return false; @@ -461,25 +433,16 @@ - (nullable ETCoreMLAsset *)_storeAssetAtURL:(NSURL *)srcURL } - (void)triggerCompaction { - if (self.estimatedSizeInBytes >= self.maxAssetsSizeInBytes) { - __weak __typeof(self) weakSelf = self; - dispatch_async(self.syncQueue, ^{ - NSError *localError = nil; - if (![weakSelf _compact:self.maxAssetsSizeInBytes error:&localError]) { - ETCoreMLLogError(localError, "Failed to compact asset store."); - } - }); + if (self.estimatedSizeInBytes < self.maxAssetsSizeInBytes) { + return; } - - // Always clean the trash directory to ensure a minimal footprint. - // The `trashQueue` is serialized, so only one cleanup will run at a time. - [self cleanupTrashDirectory]; -} - -- (void)cleanupTrashDirectory { + __weak __typeof(self) weakSelf = self; - dispatch_async(self.trashQueue, ^{ - [weakSelf removeFilesInTrashDirectory]; + dispatch_async(self.syncQueue, ^{ + NSError *localError = nil; + if (![weakSelf _compact:self.maxAssetsSizeInBytes error:&localError]) { + ETCoreMLLogError(localError, "Failed to compact asset store."); + } }); } @@ -585,7 +548,7 @@ - (BOOL)_removeAssetWithIdentifier:(NSString *)identifier NSURL *assetURL = ::get_asset_url(assetValue); if ([self.fileManager fileExistsAtPath:assetURL.path] && - !move_to_directory(assetURL, self.trashDirectoryURL, self.fileManager, error)) { + ![self moveURL:assetURL toUniqueURLInDirectory:self.trashDirectoryURL error:error]) { return false; } @@ -686,7 +649,13 @@ - (NSUInteger)_compact:(NSUInteger)sizeInBytes error:(NSError * __autoreleasing identifier); } } - + + // Trigger cleanup. 
+ __weak __typeof(self) weakSelf = self; + dispatch_async(self.trashQueue, ^{ + [weakSelf removeFilesInTrashDirectory]; + }); + return _estimatedSizeInBytes; } @@ -695,10 +664,7 @@ - (NSUInteger)compact:(NSUInteger)sizeInBytes error:(NSError * __autoreleasing * dispatch_sync(self.syncQueue, ^{ result = [self _compact:sizeInBytes error:error]; }); - - // Always clean the trash directory to ensure a minimal footprint. - // The `trashQueue` is serialized, so only one cleanup will run at a time. - [self cleanupTrashDirectory]; + return result; } @@ -742,7 +708,7 @@ - (BOOL)_purge:(NSError * __autoreleasing *)error { } // Move the the whole assets directory to the temp directory. - if (!move_to_directory(self.assetsDirectoryURL, self.trashDirectoryURL, self.fileManager, error)) { + if (![self moveURL:self.assetsDirectoryURL toUniqueURLInDirectory:self.trashDirectoryURL error:error]) { return false; } @@ -758,7 +724,13 @@ - (BOOL)_purge:(NSError * __autoreleasing *)error { ::set_error_from_error_code(ec, error); // Trigger cleanup - [self cleanupTrashDirectory]; + if (status) { + __weak __typeof(self) weakSelf = self; + dispatch_async(self.trashQueue, ^{ + [weakSelf removeFilesInTrashDirectory]; + }); + } + return static_cast(status); } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm index 9e8ae04842e..05aa910d954 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm @@ -62,12 +62,21 @@ + (nullable ETCoreMLModel *)loadModelWithContentsOfURL:(NSURL *)compiledModelURL if (model) { return model; } - - if (error) { - *error = localError; + + if (localError) { + ETCoreMLLogError(localError, + "Failed to load model from compiled asset with identifier = %@", + identifier); } - - return nil; + + // If store failed then we will load the model from compiledURL. 
+ auto backingAsset = Asset::make(compiledModelURL, identifier, assetManager.fileManager, error); + if (!backingAsset) { + return nil; + } + + asset = [[ETCoreMLAsset alloc] initWithBackingAsset:backingAsset.value()]; + return ::get_model_from_asset(asset, configuration, metadata, error); } @end diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index 524ceaf7e28..2347936fd34 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -345,10 +345,6 @@ void add_compute_unit(std::string& identifier, MLComputeUnits compute_units) { return [ETCoreMLModelDebugInfo modelDebugInfoFromData:file_data error:error]; } -NSString *raw_model_identifier(NSString *identifier) { - return [NSString stringWithFormat:@"raw_%@", identifier]; -} - #endif } //namespace @@ -412,7 +408,7 @@ - (nullable ETCoreMLAsset *)assetWithIdentifier:(NSString *)identifier { return modelAsset; } - __block NSError *localError = nil; + NSError *localError = nil; modelAsset = [self.assetManager assetWithIdentifier:identifier error:&localError]; if (localError) { ETCoreMLLogError(localError, @@ -424,9 +420,8 @@ - (nullable ETCoreMLAsset *)assetWithIdentifier:(NSString *)identifier { } - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier - modelURL:(nullable NSURL *)modelURL inMemoryFS:(const inmemoryfs::InMemoryFileSystem*)inMemoryFS - dstURL:(NSURL *)dstURL + assetManager:(ETCoreMLAssetManager *)assetManager error:(NSError * __autoreleasing *)error { auto modelAssetType = get_model_asset_type(inMemoryFS); if (!modelAssetType) { @@ -435,135 +430,80 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier "AOT blob is missing model file."); return nil; } - - // If modelURL is not provided, write model files to the destination directory (dstURL) - // and obtain a URL pointing to them. 
Otherwise, use the provided modelURL. - modelURL = (modelURL == nil) ? ::write_model_files(dstURL, self.fileManager, identifier, modelAssetType.value(), inMemoryFS, error) : modelURL; - if (!modelURL) { - // Failed to generate or locate model files, return nil. - return nil; - } - - // Handle based on the type of the model asset. + + NSURL *dstURL = [self.assetManager.trashDirectoryURL URLByAppendingPathComponent:[NSUUID UUID].UUIDString]; + NSURL *modelURL = ::write_model_files(dstURL, self.fileManager, identifier, modelAssetType.value(), inMemoryFS, error); switch (modelAssetType.value()) { case ModelAssetType::CompiledModel: { - // The model is already compiled; no further action needed. - // Return the existing model URL. + // Model is already compiled. ETCoreMLLogInfo("The model in the pte file is pre-compiled. Skipping compilation."); return modelURL; } - + case ModelAssetType::Model: { - // The model is not compiled yet. - // Compile the model at the specified URL with a maximum wait time of 5 minutes. + // Compile the model. ETCoreMLLogInfo("The model in the pte file is not pre-compiled. Compiling with a 5 min timeout."); NSURL *compiledModelURL = [ETCoreMLModelCompiler compileModelAtURL:modelURL maxWaitTimeInSeconds:(5 * 60) error:error]; - // Return the URL of the compiled model or nil if compilation fails. 
+ return compiledModelURL; } } } -- (nullable ETCoreMLAsset *)compiledModelAssetWithMetadata:(const ModelMetadata&)metadata - modelURL:(nullable NSURL *)modelURL - inMemoryFS:(const inmemoryfs::InMemoryFileSystem*)inMemoryFS - error:(NSError * __autoreleasing *)error { - NSString *identifier = @(metadata.identifier.c_str()); - __block ETCoreMLAsset *compiledModelAsset = [self assetWithIdentifier:identifier]; - if (compiledModelAsset) { - ETCoreMLLogInfo("Cache Hit: Successfully retrieved compiled model with identifier=%@ from the models cache.", identifier); - } else { - ETCoreMLLogInfo("Cache Miss: Compiled Model with identifier=%@ was not found in the models cache.", identifier); - } - - [self.assetManager withTemporaryDirectory:^(NSURL * _Nonnull directoryURL) { - if (compiledModelAsset) { - return; - } - - // The directory specified by `directoryURL` is unique and will be automatically cleaned up - // once the enclosing block completes. - NSURL *compiledModelURL = [self compiledModelURLWithIdentifier:identifier - modelURL:modelURL - inMemoryFS:inMemoryFS - dstURL:directoryURL - error:error]; - if (compiledModelURL) { - // Move the compiled model to the asset manager to transfer ownership. 
- ETCoreMLLogInfo("Storing compiled asset with identifier=%@ in the asset manager.", identifier); - compiledModelAsset = [self.assetManager storeAssetAtURL:compiledModelURL withIdentifier:identifier error:error]; - } - }]; - - return compiledModelAsset; -} - #if ET_EVENT_TRACER_ENABLED -- (nullable ETCoreMLAsset *)modelAssetWithMetadata:(const ModelMetadata&)metadata - inMemoryFS:(const inmemoryfs::InMemoryFileSystem*)inMemoryFS - error:(NSError * __autoreleasing *)error { +- (nullable id)modelExecutorWithMetadata:(const ModelMetadata&)metadata + inMemoryFS:(const inmemoryfs::InMemoryFileSystem*)inMemoryFS + configuration:(MLModelConfiguration *)configuration + error:(NSError * __autoreleasing *)error { NSString *identifier = @(metadata.identifier.c_str()); - NSString *rawIdentifier = raw_model_identifier(identifier); - __block ETCoreMLAsset *modelAsset = [self assetWithIdentifier:rawIdentifier]; - if (modelAsset) { + // Otherwise try to retrieve the compiled asset. + ETCoreMLAsset *compiledModelAsset = [self assetWithIdentifier:identifier]; + if (compiledModelAsset) { ETCoreMLLogInfo("Cache Hit: Successfully retrieved model with identifier=%@ from the models cache.", identifier); } else { ETCoreMLLogInfo("Cache Miss: Model with identifier=%@ was not found in the models cache.", identifier); } - - [self.assetManager withTemporaryDirectory:^(NSURL * _Nonnull directoryURL) { - if (modelAsset) { - return; - } - - auto modelAssetType = get_model_asset_type(inMemoryFS); - if (modelAssetType != ModelAssetType::Model) { - return; - } - - // The directory specified by `directoryURL` is unique and will be automatically cleaned up - // once the enclosing block completes. - NSURL *modelURL = ::write_model_files(directoryURL, - self.fileManager, - identifier, - modelAssetType.value(), - inMemoryFS, - error); + + // Create a unique directory for writing model files. 
+ NSURL *dstURL = [self.assetManager.trashDirectoryURL URLByAppendingPathComponent:[NSUUID UUID].UUIDString]; + auto modelAssetType = get_model_asset_type(inMemoryFS); + ETCoreMLAsset *modelAsset = nil; + // Write the model files. + if (modelAssetType == ModelAssetType::Model) { + NSURL *modelURL = ::write_model_files(dstURL, self.fileManager, identifier, modelAssetType.value(), inMemoryFS, error); if (modelURL) { - // Move the model to the asset manager to transfer ownership. - modelAsset = [self.assetManager storeAssetAtURL:modelURL withIdentifier:rawIdentifier error:error]; + modelAsset = make_asset(modelURL, + identifier, + self.fileManager, + error); } - }]; - - return modelAsset; -} - -- (nullable id)modelExecutorWithMetadata:(const ModelMetadata&)metadata - inMemoryFS:(const inmemoryfs::InMemoryFileSystem*)inMemoryFS - configuration:(MLModelConfiguration *)configuration - error:(NSError * __autoreleasing *)error { - NSError *localError = nil; - ETCoreMLAsset *modelAsset = [self modelAssetWithMetadata:metadata inMemoryFS:inMemoryFS error:&localError]; - if (localError) { - if (error) { - *error = localError; - } - - return nil; } - - ETCoreMLAsset *compiledModelAsset = [self compiledModelAssetWithMetadata:metadata - modelURL:modelAsset.contentURL - inMemoryFS:inMemoryFS - error:error]; + + if (!compiledModelAsset) { + // Compile the model. 
+ NSURL *compiledModelURL = [self compiledModelURLWithIdentifier:identifier + inMemoryFS:inMemoryFS + assetManager:self.assetManager + error:error]; + compiledModelAsset = make_asset(compiledModelURL, + identifier, + self.fileManager, + error); + } + if (!compiledModelAsset) { return nil; } + + NSError *localError = nil; + ETCoreMLModelDebugInfo *debug_info = get_model_debug_info(inMemoryFS, &localError); + if (localError) { + ETCoreMLLogError(localError, "Failed to parse debug info file"); + } + - ETCoreMLModelDebugInfo *debug_info = get_model_debug_info(inMemoryFS, error); - // The analyzer requires both the raw (uncompiled) asset and the compiled model asset to perform analysis. return [[ETCoreMLModelAnalyzer alloc] initWithCompiledModelAsset:compiledModelAsset modelAsset:modelAsset modelDebugInfo:debug_info @@ -572,33 +512,41 @@ - (nullable ETCoreMLAsset *)modelAssetWithMetadata:(const ModelMetadata&)metadat assetManager:self.assetManager error:error]; } + #else - (nullable id)modelExecutorWithMetadata:(const ModelMetadata&)metadata inMemoryFS:(const inmemoryfs::InMemoryFileSystem*)inMemoryFS configuration:(MLModelConfiguration *)configuration error:(NSError * __autoreleasing *)error { - ETCoreMLAsset *compiledModelAsset = [self compiledModelAssetWithMetadata:metadata - modelURL:nil - inMemoryFS:inMemoryFS - error:error]; - if (!compiledModelAsset) { - return nil; + NSString *identifier = @(metadata.identifier.c_str()); + // Otherwise try to retrieve the compiled asset. + ETCoreMLAsset *asset = [self assetWithIdentifier:identifier]; + ETCoreMLModel *model = asset ? 
get_model_from_asset(asset, configuration, metadata, error) : nil; + if (model) { + ETCoreMLLogInfo("Cache Hit: Successfully retrieved model with identifier=%@ from the models cache.", identifier); + return [[ETCoreMLDefaultModelExecutor alloc] initWithModel:model]; } - - ETCoreMLModel *model = [ETCoreMLModelLoader loadModelWithContentsOfURL:compiledModelAsset.contentURL - configuration:configuration - metadata:metadata - assetManager:self.assetManager - error:error]; - if (!model) { + + ETCoreMLLogInfo("Cache Miss: Model with identifier=%@ was not found in the models cache.", identifier); + // Compile the model. + NSURL *compiledModelURL = [self compiledModelURLWithIdentifier:identifier + inMemoryFS:inMemoryFS + assetManager:self.assetManager + error:error]; + if (!compiledModelURL) { return nil; } - + + model = [ETCoreMLModelLoader loadModelWithContentsOfURL:compiledModelURL + configuration:configuration + metadata:metadata + assetManager:self.assetManager + error:error]; + return [[ETCoreMLDefaultModelExecutor alloc] initWithModel:model]; } #endif - - (nullable id)_modelExecutorWithAOTData:(NSData *)data configuration:(MLModelConfiguration *)configuration error:(NSError * __autoreleasing *)error { @@ -783,7 +731,6 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle args.count); return result; } - NSError *localError = nil; @autoreleasepool { NSArray *inputs = [args subarrayWithRange:NSMakeRange(0, model.orderedInputNames.count)]; @@ -803,11 +750,11 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle result = YES; } } - - if (localError && error) { - *error = localError; + if (!result) { + if (error) { + *error = localError; + } } - return result; } diff --git a/backends/apple/mps/CMakeLists.txt b/backends/apple/mps/CMakeLists.txt index 5a253347b01..99a8afa16ac 100644 --- a/backends/apple/mps/CMakeLists.txt +++ b/backends/apple/mps/CMakeLists.txt @@ -77,7 +77,7 @@ target_compile_options(mpsdelegate PRIVATE "-fno-objc-arc") install( TARGETS mpsdelegate 
mps_schema EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${_common_include_directories} ) diff --git a/backends/arm/README.md b/backends/arm/README.md index e495a8e40cb..0abf5e9bf55 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -6,7 +6,7 @@ PyTorch models to a TOSA representation. This representation is used to deploy to the following targets: - **Arm® Ethos™-U55/65/85** - Compiled using the Ethos-U Vela compiler. -- **VGF (Vulkan® Graph Format)** – SPIR-V™ representation for Vulkan-capable devices. +- **VGF Format, for ML extensions for Vulkan®** – a format containing SPIR-V™ ML operators for Vulkan-capable devices. The backend provides an ahead-of-time (AOT) flow, that produces a PTE file for your chosen target. The AOT flow supports the following development operating systems: diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS index a78ab252739..a737c4bc9de 100644 --- a/backends/arm/TARGETS +++ b/backends/arm/TARGETS @@ -106,3 +106,17 @@ runtime.python_library( "//caffe2:torch", ] ) +runtime.python_library( + name = "_factory", + srcs = [ + "util/_factory.py" + ], + deps = [ + ":ethosu", + ":vgf", + ":arm_compile_spec", + "//executorch/backends/arm/quantizer:lib", + "//executorch/exir/backend:operator_support", + "//executorch/exir/backend:compile_spec_schema", + ] +) diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py index f9e23f73cc5..b1337c38a58 100644 --- a/backends/arm/_passes/__init__.py +++ b/backends/arm/_passes/__init__.py @@ -27,6 +27,7 @@ from .convert_to_clamp import ConvertToClampPass # noqa from .decompose_acosh_pass import DecomposeAcoshPass # noqa from .decompose_adaptive_avg_pool2d_pass import DecomposeAdaptiveAvgPool2dPass # noqa +from .decompose_add_sub_alpha_pass import DecomposeAddSubAlphaPass # noqa from .decompose_addmm_pass import DecomposeAddmmPass # noqa from .decompose_asin_and_acos_pass import DecomposeAsinAndAcosPass # 
noqa from .decompose_asinh_pass import DecomposeAsinhPass # noqa @@ -46,6 +47,9 @@ from .decompose_glu_pass import DecomposeGluPass # noqa from .decompose_grouped_conv import DecomposeGroupedConv # noqa from .decompose_groupnorm_pass import DecomposeGroupNormPass # noqa +from .decompose_int16_activation_conv2d_pass import ( # noqa + DecomposeConv2dWithInt16ActivationPass, +) from .decompose_layernorm_pass import DecomposeLayerNormPass # noqa from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass # noqa from .decompose_linalg_vector_norm_pass import DecomposeLinearVectorNormPass # noqa @@ -78,7 +82,7 @@ from .insert_int32_casts_after_int64_placeholders import ( # noqa InsertInt32CastsAfterInt64PlaceholdersPass, ) -from .insert_rescales_pass import InsertRescalePass # noqa +from .insert_rescales_pass import InsertRescaleInt32Pass, InsertRescalePass # noqa from .insert_table_ops import InsertTableOpsPass # noqa from .match_arg_dtype_pass import MatchArgDtypePass # noqa from .match_arg_ranks_pass import MatchArgRanksPass # noqa @@ -88,6 +92,8 @@ ReplaceScalarWithTensorArgPassTOSABI, ReplaceScalarWithTensorArgPassTOSAMI, ) +from .rewrite_matmul import RewriteMatmulPass # noqa +from .rewrite_upsample import RewriteUpsamplePass # noqa from .scalars_to_attribute_pass import ScalarsToAttributePass # noqa from .size_adjust_input_pass import SizeAdjustInputPass # noqa from .to_tosa_memory_format_pass import ToTosaMemoryFormatPass # noqa diff --git a/backends/arm/_passes/_debug_passes.py b/backends/arm/_passes/_debug_passes.py index 7809885d465..4c1661e50a9 100644 --- a/backends/arm/_passes/_debug_passes.py +++ b/backends/arm/_passes/_debug_passes.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.devtools.visualization.visualization_utils import visualize_graph from executorch.exir import ExportedProgram @@ -14,6 +16,8 @@ class VisualizePass(ExportPass): This pass visualizes the graph at the point of insertion in the pass manager """ + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program: ExportedProgram) -> None: super().__init__() self.exported_program = exported_program diff --git a/backends/arm/_passes/add_bias_pass.py b/backends/arm/_passes/add_bias_pass.py index 31c0c0505cb..fd5476f51b8 100644 --- a/backends/arm/_passes/add_bias_pass.py +++ b/backends/arm/_passes/add_bias_pass.py @@ -3,13 +3,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor +from executorch.backends.arm.tosa.mapping import TosaSpecialDtype from executorch.backends.transforms.utils import create_constant_placeholder from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult from torch.export.graph_signature import InputKind @@ -19,6 +22,8 @@ class AddBiasPass(ArmPass): The bias is set to zero. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + targeted_ops = (exir_ops.edge.aten.convolution.default,) def call(self, graph_module): @@ -55,6 +60,10 @@ def call(self, graph_module): persistent_buffer=True, name=f"{node.name}_bias", ) + if node.args[0].meta["val"].dtype == torch.int16: + bias_node.meta[TosaSpecialDtype.meta_key()] = ( + TosaSpecialDtype.INT48 + ) node.update_arg(2, bias_node) if modified: diff --git a/backends/arm/_passes/annotate_decomposed_matmul.py b/backends/arm/_passes/annotate_decomposed_matmul.py index 8156ca0b89d..72ae46c76c1 100644 --- a/backends/arm/_passes/annotate_decomposed_matmul.py +++ b/backends/arm/_passes/annotate_decomposed_matmul.py @@ -7,10 +7,13 @@ import itertools import operator -from typing import cast, List +from typing import cast, List, Set, Type import torch from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( + FoldAndAnnotateQParamsPass, +) from executorch.backends.arm.constants import DQ_OPS, Q_OPS from executorch.exir.dialects._ops import ops as exir_ops @@ -29,6 +32,8 @@ class AnnotateDecomposedMatmulPass(ExportPass): matmul-op (can be mm or bmm). 
""" + _passes_required_after: Set[Type[ExportPass]] = {FoldAndAnnotateQParamsPass} + def _match_partition_to_node( self, node: torch.fx.Node, partitioned_inputs: List[torch.fx.Node] ) -> torch.fx.Node: @@ -68,7 +73,10 @@ def call(self, graph_module: GraphModule) -> PassResult: node for node in partition.nodes if node.target in matmul_targets ][0] - if quantized_input: + if quantized_input and not all( + input_node.target in DQ_OPS + for input_node in matmul_node.all_input_nodes + ): matmul_args = matmul_node.all_input_nodes for node in matmul_args: # Find the dq-node connected to this mm/bmm arg @@ -94,7 +102,9 @@ def call(self, graph_module: GraphModule) -> PassResult: partition_output = list(partition.output_nodes[0].users)[0] quantized_output = partition_output.target in Q_OPS - if quantized_output: + if quantized_output and not all( + user.target in Q_OPS for user in matmul_node.users + ): with graph_module.graph.inserting_after(matmul_node): # Create q-node after matmul q_node = create_node( diff --git a/backends/arm/_passes/annotate_output_dim_order_pass.py b/backends/arm/_passes/annotate_output_dim_order_pass.py index 08f93383a9c..8dc13326e4a 100644 --- a/backends/arm/_passes/annotate_output_dim_order_pass.py +++ b/backends/arm/_passes/annotate_output_dim_order_pass.py @@ -3,9 +3,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import get_output_dim_orders -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult class AnnotateOutputDimOrderPass(ArmPass): @@ -14,6 +17,8 @@ class AnnotateOutputDimOrderPass(ArmPass): for verifying that the dim order does not change unexpectedly in later passes. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def call(self, graph_module): output_node = graph_module.graph.output_node() output_node.meta["original_dim_orders"] = get_output_dim_orders(graph_module) diff --git a/backends/arm/_passes/arm_pass.py b/backends/arm/_passes/arm_pass.py index 085267a174e..c76b5d157a7 100644 --- a/backends/arm/_passes/arm_pass.py +++ b/backends/arm/_passes/arm_pass.py @@ -6,7 +6,8 @@ # pyre-unsafe import traceback -from typing import Optional +from abc import abstractmethod +from typing import List, Optional, Set, Type import torch from executorch.exir.pass_base import ExportPass, NodeMetadata @@ -19,6 +20,36 @@ def __init__(self, exported_program: Optional[torch.export.ExportedProgram] = No super(ArmPass, self).__init__() self.exported_program = exported_program + @property + @abstractmethod + def _passes_required_after(self) -> Set[Type[ExportPass]]: + """The subclass defines passes that must run after it""" + pass + + @staticmethod + def get_required_passes(pass_) -> List[str]: + """ + Returns the list of passes that must be run after this pass, sorted by name. + """ + if hasattr(pass_, "_passes_required_after"): + return sorted([ArmPass.get_name(p) for p in pass_._passes_required_after]) + else: + return [] + + @staticmethod + def get_name(pass_) -> str: + """ + Returns the name of the pass. + """ + if isinstance(pass_, ExportPass): + return pass_.__class__.__name__ + elif hasattr(pass_, "__name__"): + return pass_.__name__ + else: + raise ValueError( + f"Cannot get name for pass: {pass_}. It must be an instance of ExportPass or have a __name__ attribute." 
+ ) + def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False): if not updated: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index f49206da67e..325f667f0ac 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -7,6 +7,9 @@ # pyre-unsafe + +from collections import defaultdict + import executorch.backends.arm.tosa.dialect # noqa: unused from executorch.backends.arm._passes import ( AddBiasPass, @@ -33,12 +36,14 @@ DecomposeAcoshPass, DecomposeAdaptiveAvgPool2dPass, DecomposeAddmmPass, + DecomposeAddSubAlphaPass, DecomposeAsinAndAcosPass, DecomposeAsinhPass, DecomposeAtanhPass, DecomposeAtanPass, DecomposeAvgPool2d, DecomposeBatchNormNoStatsPass, + DecomposeConv2dWithInt16ActivationPass, DecomposeCoshPass, DecomposeCosineSimilarityPass, DecomposeCumsumPass, @@ -77,6 +82,7 @@ FuseEqualPlaceholdersPass, FuseQuantizedActivationPass, InsertInt32CastsAfterInt64PlaceholdersPass, + InsertRescaleInt32Pass, InsertRescalePass, InsertTableOpsPass, MatchArgDtypePass, @@ -87,6 +93,8 @@ ReplaceScalarWithTensorArgPassTOSABI, ReplaceScalarWithTensorArgPassTOSAMI, RetraceFoldedDtypesPass, + RewriteMatmulPass, + RewriteUpsamplePass, ScalarsToAttributePass, SizeAdjustInputPass, ToTosaMemoryFormatPass, @@ -94,6 +102,7 @@ UnsqueezeScalarPlaceholdersPass, ) +from executorch.backends.arm._passes.arm_pass import ArmPass from executorch.backends.arm.tosa.specification import ( TosaLoweringContext, TosaSpecification, @@ -107,6 +116,8 @@ from executorch.exir.pass_manager import PassManager from executorch.exir.passes.remove_graph_asserts_pass import RemoveGraphAssertsPass from torch.fx import GraphModule +from torch.fx.passes.infra.pass_base import PassResult +from torch.nn.modules import Module class ArmPassManager(PassManager): @@ -115,6 +126,32 @@ def __init__(self, tosa_spec: TosaSpecification) -> None: 
self.tosa_spec = tosa_spec super().__init__() + def validate_constraints_mandatory(self): + """ + Validates that necessary passes have run before transforming to backend. + + Note that this differs from the original validate_constraints function, which + only checks the order of passes. + """ + passes_to_run = defaultdict(list) + + for current_pass in self.passes: + current_pass_name = ArmPass.get_name(current_pass) + for required_pass_name in ArmPass.get_required_passes(current_pass): + passes_to_run[required_pass_name].append(current_pass_name) + + passes_to_run.pop(current_pass_name, None) + + if len(passes_to_run) > 0: + error_msg = "The following constraints for passes are not met:\n" + for required_pass, requiring_passes in passes_to_run.items(): + for requiring_pass in requiring_passes: + error_msg += ( + f" - {required_pass} must run after {requiring_pass}\n" + ) + + raise RuntimeError(error_msg) + def _transform(self, graph_module: GraphModule): with TosaLoweringContext(self.tosa_spec): return self(graph_module).graph_module @@ -125,7 +162,6 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(RemoveGetItemPass()) self.add_pass(ConvertSplitToSlicePass()) self.add_pass(ConvertMmToBmmPass()) - self.add_pass(DecomposeLinearVectorNormPass()) self.add_pass( DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec) ) @@ -154,6 +190,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(ComputeConstantOpsAOT(exported_program)) self.add_pass(DecomposeGroupedConv()) + self.add_pass(ConvertExpandCopyToRepeatPass()) self.add_pass(UnsqueezeBeforeRepeatPass()) self.add_pass(CastInt64BuffersToInt32Pass(exported_program)) @@ -167,14 +204,23 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(FuseViewCopyTransform()) self.add_pass(FuseConstantArgsPass(exported_program)) + self.add_pass(InsertTableOpsPass(exported_program)) + # If 
we have a conv2d with int16 activation split up into a convolution + # and an addition, to work-around the lack of support for int48 in torch + # needs to happen before AddBiasPass, but after the table ops are inserted + # to be able to validate that conv2d has right dtype arguments. + self.add_pass(DecomposeConv2dWithInt16ActivationPass()) + self.add_pass(RewriteUpsamplePass(exported_program)) self.add_pass(AddBiasPass(exported_program)) - self.add_pass(InsertTableOpsPass(exported_program)) + self.add_pass(RewriteMatmulPass(exported_program)) self.add_pass(FuseEqualPlaceholdersPass(exported_program)) self.add_pass(ToTosaMemoryFormatPass(exported_program)) self.add_pass(RemoveNoopPass()) self.add_pass(InsertRescalePass()) + self.add_pass(InsertRescaleInt32Pass()) + self.validate_constraints_mandatory() return self._transform(exported_program.graph_module) def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule: @@ -217,6 +263,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule: ) self.add_pass(DecomposeNotEqualPass()) self.add_pass(DecomposeDivPass()) + self.add_pass(DecomposeAddSubAlphaPass()) self.add_pass(DecomposeSoftmaxPass()) self.add_pass(DecomposeGeluPass()) self.add_pass(ConvertFullLikeToFullPass()) @@ -251,13 +298,16 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(FuseViewCopyTransform()) self.add_pass(FuseConstantArgsPass(exported_program)) self.add_pass(CastInt64BuffersToInt32Pass(exported_program)) + self.add_pass(RewriteUpsamplePass(exported_program)) self.add_pass(AddBiasPass(exported_program)) self.add_pass(InsertTableOpsPass(exported_program)) + self.add_pass(RewriteMatmulPass(exported_program)) self.add_pass(FuseEqualPlaceholdersPass(exported_program)) self.add_pass(ToTosaMemoryFormatPass(exported_program)) self.add_pass(RemoveNoopPass()) self.add_pass(InsertRescalePass()) + self.validate_constraints_mandatory() return 
self._transform(exported_program.graph_module) def transform_to_backend_pipeline(self, exported_program: ExportedProgram): @@ -286,6 +336,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule): self.add_pass(DecomposeSignPass()) self.add_pass(DecomposeAddmmPass()) self.add_pass(DecomposeDivTensorModePass()) + self.add_pass(DecomposeAddSubAlphaPass()) self.add_pass(ReplaceScalarWithTensorArgPassTOSABI()) self.add_pass(ScalarsToAttributePass()) self.add_pass(DecomposeGroupNormPass()) @@ -317,3 +368,20 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule): self.add_pass(DecomposeMaskedFill()) return self._transform(graph_module) + + def __call__(self, module: Module) -> PassResult: + try: + return super().__call__(module) + except Exception as e: + first_exception = e.__cause__ or e.__context__ or e + import re + + message = e.args[0] + m = re.search(r"An error occurred when running the '([^']+)' pass", message) + if m: + pass_name = m.group(1) + first_exception.args = ( + f"{pass_name}: {first_exception.args[0]}", + *first_exception.args[1:], + ) + raise first_exception diff --git a/backends/arm/_passes/broadcast_args_pass.py b/backends/arm/_passes/broadcast_args_pass.py index f125ba13ff4..659e6aca686 100644 --- a/backends/arm/_passes/broadcast_args_pass.py +++ b/backends/arm/_passes/broadcast_args_pass.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import ( @@ -12,7 +14,7 @@ from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult from torch.fx import GraphModule, Node @@ -22,6 +24,8 @@ class BroadcastArgsPass(ArmPass): This is done when more than one arg needs broadcasting. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + targeted_ops = { exir_ops.edge.aten.add.Tensor, exir_ops.edge.aten.sub.Tensor, diff --git a/backends/arm/_passes/cast_bool_to_int8_pass.py b/backends/arm/_passes/cast_bool_to_int8_pass.py index 1352671b01e..771b6d9e174 100644 --- a/backends/arm/_passes/cast_bool_to_int8_pass.py +++ b/backends/arm/_passes/cast_bool_to_int8_pass.py @@ -6,6 +6,8 @@ # The TOSA BITWISE_AND, BITWISE_OR, and BITWISE_XOR don't handle bool as input # If input/output is bool lest add a cast/conversion pass before/after to/from int8. +from typing import Set, Type + import torch from executorch.exir.dialects._ops import ops as exir_ops @@ -15,6 +17,8 @@ class CastBoolToInt8Pass(ExportPass): """Casts the input to int8 if it is not already and casts back the output to the original input dtype.""" + _passes_required_after: Set[Type[ExportPass]] = set() + targeted_ops = { exir_ops.edge.aten.bitwise_and.Tensor, exir_ops.edge.aten.bitwise_or.Tensor, diff --git a/backends/arm/_passes/cast_int64_pass.py b/backends/arm/_passes/cast_int64_pass.py index 8052c8fd2ce..d7b2a6b6b43 100644 --- a/backends/arm/_passes/cast_int64_pass.py +++ b/backends/arm/_passes/cast_int64_pass.py @@ -6,6 +6,7 @@ # pyre-unsafe import logging +from typing import Set, Type import torch from executorch.exir.pass_base import ExportPass, PassResult @@ -19,6 +20,8 @@ class CastInt64BuffersToInt32Pass(ExportPass): Cast int64 buffers to int32 if the int64 data is in int32 range. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program: torch.export.ExportedProgram): super(CastInt64BuffersToInt32Pass, self).__init__() self.exported_program = exported_program diff --git a/backends/arm/_passes/cast_to_int32_pass.py b/backends/arm/_passes/cast_to_int32_pass.py index c4b009e2b88..2e574568235 100644 --- a/backends/arm/_passes/cast_to_int32_pass.py +++ b/backends/arm/_passes/cast_to_int32_pass.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + import torch from executorch.exir.dialects._ops import ops as exir_ops @@ -12,6 +14,8 @@ class CastToInt32Pass(ExportPass): """Casts the input to int32 if it is not already and casts back the output to the original input dtype.""" + _passes_required_after: Set[Type[ExportPass]] = set() + targeted_ops = { exir_ops.edge.aten.bitwise_left_shift.Tensor, exir_ops.edge.aten.bitwise_right_shift.Tensor, diff --git a/backends/arm/_passes/conv1d_unsqueeze_pass.py b/backends/arm/_passes/conv1d_unsqueeze_pass.py index 56f674e9066..b228da6766f 100644 --- a/backends/arm/_passes/conv1d_unsqueeze_pass.py +++ b/backends/arm/_passes/conv1d_unsqueeze_pass.py @@ -6,6 +6,11 @@ # LICENSE file in the root directory of this source tree. +from typing import Set, Type + +from executorch.backends.arm._passes.add_bias_pass import AddBiasPass +from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass + from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -21,6 +26,8 @@ class Conv1dUnsqueezePass(ExportPass): 3) squeeze the output back down to 3d. 
""" + _passes_required_after: Set[Type[ExportPass]] = {AddBiasPass, SizeAdjustInputPass} + def call_operator(self, op, args, kwargs, meta): if op != exir_ops.edge.aten.convolution.default: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/convert_any_default_dim_dims_pass.py b/backends/arm/_passes/convert_any_default_dim_dims_pass.py index 7085f17add0..8c8e5086b6d 100644 --- a/backends/arm/_passes/convert_any_default_dim_dims_pass.py +++ b/backends/arm/_passes/convert_any_default_dim_dims_pass.py @@ -3,7 +3,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + import torch +from executorch.backends.arm._passes.convert_squeezes_to_view import ( + ConvertSqueezesToViewPass, +) from executorch.exir.dialects._ops import ( # type: ignore[import-not-found] ops as exir_ops, ) @@ -44,6 +49,8 @@ class ConvertAnyDefaultDimDimsPass(ExportPass): squeeze(dim = [dim1, dim2]) """ + _passes_required_after: Set[Type[ExportPass]] = {ConvertSqueezesToViewPass} + def call(self, graph_module: torch.fx.GraphModule): modified = False for node in graph_module.graph.nodes: diff --git a/backends/arm/_passes/convert_expand_copy_to_repeat.py b/backends/arm/_passes/convert_expand_copy_to_repeat.py index ee509c7ebb5..83b47d31755 100644 --- a/backends/arm/_passes/convert_expand_copy_to_repeat.py +++ b/backends/arm/_passes/convert_expand_copy_to_repeat.py @@ -6,10 +6,13 @@ # pyre-unsafe import logging -from typing import cast +from typing import cast, Set, Type import torch +from executorch.backends.arm._passes.unsqueeze_before_repeat_pass import ( + UnsqueezeBeforeRepeatPass, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -50,6 +53,8 @@ class ConvertExpandCopyToRepeatPass(ExportPass): Replace expand copy with repeat since it is a repeat that can only repeat singleton dimensions. 
""" + _passes_required_after: Set[Type[ExportPass]] = {UnsqueezeBeforeRepeatPass} + expand_copy = exir_ops.edge.aten.expand_copy.default repeat = exir_ops.edge.aten.repeat.default diff --git a/backends/arm/_passes/convert_full_like_to_full_pass.py b/backends/arm/_passes/convert_full_like_to_full_pass.py index 234e2ecda82..06822a4abcf 100644 --- a/backends/arm/_passes/convert_full_like_to_full_pass.py +++ b/backends/arm/_passes/convert_full_like_to_full_pass.py @@ -3,11 +3,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + +from executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT + from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass -class ConvertFullLikeToFullPass(ExportPass): +class ConvertFullLikeToFullPass(ArmPass): """As per the full_like pytorch documentation, `torch.full_like(input, fill_value)` is equivalent to `torch.full(input.size(), @@ -19,6 +24,8 @@ class ConvertFullLikeToFullPass(ExportPass): Skip layout and device since it's not relevant for our backend. """ + _passes_required_after: Set[Type[ExportPass]] = {ComputeConstantOpsAOT} + def call_operator(self, op, args, kwargs, meta): if op not in [ exir_ops.edge.aten.full_like.default, diff --git a/backends/arm/_passes/convert_int64_const_ops_to_int32.py b/backends/arm/_passes/convert_int64_const_ops_to_int32.py index 704c89dbd78..2bf305a13f6 100644 --- a/backends/arm/_passes/convert_int64_const_ops_to_int32.py +++ b/backends/arm/_passes/convert_int64_const_ops_to_int32.py @@ -7,6 +7,7 @@ import logging +from typing import Set, Type import torch from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT @@ -30,6 +31,8 @@ class ConvertInt64ConstOpsToInt32Pass(ExportPass): 5. 
`torch.tensor` """ + _passes_required_after: Set[Type[ExportPass]] = {ComputeConstantOpsAOT} + torch_ops = [ torch.ops.aten.full.default, torch.ops.aten.arange.default, diff --git a/backends/arm/_passes/convert_int64_output_ops_to_int32.py b/backends/arm/_passes/convert_int64_output_ops_to_int32.py index 788201be6c8..d0d29d14e30 100644 --- a/backends/arm/_passes/convert_int64_output_ops_to_int32.py +++ b/backends/arm/_passes/convert_int64_output_ops_to_int32.py @@ -7,6 +7,7 @@ import logging +from typing import Set, Type import torch from executorch.backends.arm._passes.arm_pass_utils import ( @@ -44,6 +45,8 @@ class ConvertInt64OutputOpsToInt32Pass(ExportPass): the int32 range. """ + _passes_required_after: Set[Type[ExportPass]] = set() + aten_cast_ops = ( torch.ops.aten.to.dtype, torch.ops.aten.to.dtype_layout, diff --git a/backends/arm/_passes/convert_int_pow_to_mul.py b/backends/arm/_passes/convert_int_pow_to_mul.py index f22a2fd0b3c..8f9b3a9cb4b 100644 --- a/backends/arm/_passes/convert_int_pow_to_mul.py +++ b/backends/arm/_passes/convert_int_pow_to_mul.py @@ -5,8 +5,11 @@ # pyre-unsafe +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass class ConvertIntPowToMuls(ArmPass): @@ -16,6 +19,8 @@ class ConvertIntPowToMuls(ArmPass): Needs to be run before doing scalar to tensor conversion. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op != exir_ops.edge.aten.pow.Tensor_Scalar: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/convert_minmax_pass.py b/backends/arm/_passes/convert_minmax_pass.py index 9f409632c20..79bb6e2db0c 100644 --- a/backends/arm/_passes/convert_minmax_pass.py +++ b/backends/arm/_passes/convert_minmax_pass.py @@ -3,7 +3,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import cast, Set, Type + import torch +from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor +from executorch.backends.arm._passes.convert_squeezes_to_view import ( + ConvertSqueezesToViewPass, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -29,6 +35,8 @@ class ConvertMinMaxPass(ExportPass): squeeze(dim = [dim1, dim2]) """ + _passes_required_after: Set[Type[ExportPass]] = {ConvertSqueezesToViewPass} + def check_argmax(self, node): """ Raises a RuntimeError if the argmax value returned by the min/max op is used in the graph. @@ -94,20 +102,28 @@ def call(self, graph_module: torch.fx.GraphModule): replace_node, op, squeeze_op = self.get_variables(node) # Unwrap args - if len(node.args) == 2: + if len(node.args) == 1: + # If dims is unspecified, min/max over all dims. 
+ input_node = cast(torch.fx.Node, node.args[0]) + input_shape = get_first_fake_tensor(input_node).shape + dims = range(len(input_shape)) + keepdims = False + elif len(node.args) == 2: input_node, dims = node.args keepdims = False elif len(node.args) == 3: input_node, dims, keepdims = node.args else: - raise RuntimeError(f"Unexpected arg size in {node.name}") + raise RuntimeError( + f"Unexpected arg size {len(node.args)} in {node.name}" + ) try: - iter(dims) - except: - dims = [dims] + iter(dims) # type:ignore[assignment] + except Exception: + dims = [dims] # type:ignore[assignment] else: - dims = list(dims) + dims = list(dims) # type:ignore[assignment] # Unroll multi-dimensional reduction and keep-dims arg with graph_module.graph.inserting_before(node): diff --git a/backends/arm/_passes/convert_split_to_slice.py b/backends/arm/_passes/convert_split_to_slice.py index 67bd9d73e81..7578c07ca53 100644 --- a/backends/arm/_passes/convert_split_to_slice.py +++ b/backends/arm/_passes/convert_split_to_slice.py @@ -5,6 +5,8 @@ # pyre-unsafe +from typing import Set, Type + import torch.fx from executorch.backends.arm._passes.arm_pass_utils import ( create_node, @@ -19,6 +21,8 @@ class ConvertSplitToSlicePass(ExportPass): Replace a split operation with many slice operations. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + split_ops = ( exir_ops.edge.aten.split_with_sizes_copy.default, exir_ops.edge.aten.split_copy.Tensor, diff --git a/backends/arm/_passes/convert_squeezes_to_view.py b/backends/arm/_passes/convert_squeezes_to_view.py index 889dbe74172..70f4625f0ff 100644 --- a/backends/arm/_passes/convert_squeezes_to_view.py +++ b/backends/arm/_passes/convert_squeezes_to_view.py @@ -6,6 +6,10 @@ # pyre-unsafe +from typing import Set, Type + +from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform + from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -15,6 +19,8 @@ class ConvertSqueezesToViewPass(ExportPass): Replaces squeeze/unsqueeze operators with view. These are simply special cases of the view op, so removing them gives us less cases to handle in the node visitiors. """ + _passes_required_after: Set[Type[ExportPass]] = {FuseViewCopyTransform} + def call_operator(self, op, args, kwargs, meta): if op not in [ exir_ops.edge.aten.squeeze_copy.dims, diff --git a/backends/arm/_passes/convert_to_clamp.py b/backends/arm/_passes/convert_to_clamp.py index 8f2c9b16f9a..0199d6798bc 100644 --- a/backends/arm/_passes/convert_to_clamp.py +++ b/backends/arm/_passes/convert_to_clamp.py @@ -3,7 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import Tuple +from typing import Set, Tuple, Type + +from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( + QuantizeOperatorArguments, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -24,6 +28,8 @@ def get_clamp_params(op, args) -> Tuple[float | None, float | None]: class ConvertToClampPass(ExportPass): + _passes_required_after: Set[Type[ExportPass]] = {QuantizeOperatorArguments} + def call_operator(self, op, args, kwargs, meta): if op not in edge_operators: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_acosh_pass.py b/backends/arm/_passes/decompose_acosh_pass.py index 1d92dd68c4a..509849fce4e 100644 --- a/backends/arm/_passes/decompose_acosh_pass.py +++ b/backends/arm/_passes/decompose_acosh_pass.py @@ -5,8 +5,18 @@ # pyre-unsafe +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.decompose_sqrt_pass import DecomposeSqrtPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass # noqa +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case edge_acosh_op = exir_ops.edge.aten.acosh.default @@ -19,6 +29,14 @@ class DecomposeAcoshPass(ArmPass): acosh(x) = log(x + sqrt((x-1)(x+1)) """ + _passes_required_after: Set[Type[ExportPass]] = { + DecomposeSqrtPass, + InsertTableOpsPass, + MatchArgRanksPass, + ReplaceScalarWithTensorArgPassTOSAMI, + MatchArgDtypePass, + } + def call_operator(self, op, args, kwargs, meta, updated=False): if op is not edge_acosh_op: diff --git 
a/backends/arm/_passes/decompose_adaptive_avg_pool2d_pass.py b/backends/arm/_passes/decompose_adaptive_avg_pool2d_pass.py index abfcc8e3945..52ddb77151d 100644 --- a/backends/arm/_passes/decompose_adaptive_avg_pool2d_pass.py +++ b/backends/arm/_passes/decompose_adaptive_avg_pool2d_pass.py @@ -4,12 +4,15 @@ # LICENSE file in the root directory of this source tree. from math import ceil, floor +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.decompose_avg_pool2d import DecomposeAvgPool2d from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_ops = (exir_ops.edge.aten._adaptive_avg_pool2d.default,) aten_ops = (torch.ops.aten.adaptive_avg_pool2d.default,) @@ -41,6 +44,8 @@ class DecomposeAdaptiveAvgPool2dPass(ArmPass): The output is of size output_size_h x output_size_w for any input. """ + _passes_required_after: Set[Type[ExportPass]] = {DecomposeAvgPool2d} + def call_operator(self, op, args, kwargs, meta, updated=False): if op not in (edge_ops + aten_ops): return super().call_operator(op, args, kwargs, meta, updated) diff --git a/backends/arm/_passes/decompose_add_sub_alpha_pass.py b/backends/arm/_passes/decompose_add_sub_alpha_pass.py new file mode 100644 index 00000000000..c0ed1bae09b --- /dev/null +++ b/backends/arm/_passes/decompose_add_sub_alpha_pass.py @@ -0,0 +1,94 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from __future__ import annotations + +import numbers +from typing import Set, Type + +import torch +from executorch.backends.arm._passes import ArmPass +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + + +_ADD_OPS = ( + exir_ops.edge.aten.add.Tensor, + torch.ops.aten.add.Tensor, +) + +_SUB_OPS = ( + exir_ops.edge.aten.sub.Tensor, + torch.ops.aten.sub.Tensor, +) + + +def _get_ops(op): + if op in _ADD_OPS: + if op is exir_ops.edge.aten.add.Tensor: + return ( + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.full.default, + exir_ops.edge.aten.add.Tensor, + ) + return ( + torch.ops.aten.mul.Tensor, + torch.ops.aten.full.default, + torch.ops.aten.add.Tensor, + ) + if op in _SUB_OPS: + if op is exir_ops.edge.aten.sub.Tensor: + return ( + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.full.default, + exir_ops.edge.aten.sub.Tensor, + ) + return ( + torch.ops.aten.mul.Tensor, + torch.ops.aten.full.default, + torch.ops.aten.sub.Tensor, + ) + raise RuntimeError(f"Unsupported operator {op}") + + +def _should_decompose(alpha) -> bool: + if isinstance(alpha, numbers.Number): + return alpha != 1 + return False + + +class DecomposeAddSubAlphaPass(ArmPass): + """Rewrite add/sub with alpha into a mul followed by add/sub.""" + + _passes_required_after: Set[Type[ExportPass]] = set() + + def call_operator(self, op, args, kwargs, meta, updated: bool | None = False): + if op not in _ADD_OPS + _SUB_OPS: + return super().call_operator(op, args, kwargs, meta, updated) + + alpha = kwargs.get("alpha", 1) + if not _should_decompose(alpha): + return super().call_operator(op, args, kwargs, meta, updated) + + mul_op, full_op, binary_op = _get_ops(op) + lhs, rhs = args + + alpha_full = super().call_operator( + full_op, ((1,), float(alpha)), {}, meta, updated=True + ) + scaled_rhs = super().call_operator( + mul_op, + (rhs, alpha_full), + {}, + meta, + updated=True, + ) + return super().call_operator( + binary_op, + (lhs, 
scaled_rhs), + {}, + meta, + updated=True, + ) diff --git a/backends/arm/_passes/decompose_addmm_pass.py b/backends/arm/_passes/decompose_addmm_pass.py index b59a8cb02d3..a95c1cc7fec 100644 --- a/backends/arm/_passes/decompose_addmm_pass.py +++ b/backends/arm/_passes/decompose_addmm_pass.py @@ -3,10 +3,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.mm_to_bmm_pass import ConvertMmToBmmPass # noqa from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case @@ -36,6 +42,12 @@ def get_ops(op): class DecomposeAddmmPass(ArmPass): """Decomposes the addmm operator into tensor multiplication and addition.""" + _passes_required_after: Set[Type[ExportPass]] = { + ConvertMmToBmmPass, + MatchArgRanksPass, + MatchArgDtypePass, + } + def call_operator(self, op, args, kwargs, meta): if op not in [edge_addmm, aten_addmm]: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_asin_and_acos_pass.py b/backends/arm/_passes/decompose_asin_and_acos_pass.py index e067f17b0ca..5b1c575e9c9 100644 --- a/backends/arm/_passes/decompose_asin_and_acos_pass.py +++ b/backends/arm/_passes/decompose_asin_and_acos_pass.py @@ -7,11 +7,23 @@ import logging from math import pi +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.convert_full_like_to_full_pass import ( + ConvertFullLikeToFullPass, +) +from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass +from executorch.backends.arm._passes.decompose_sqrt_pass 
import DecomposeSqrtPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case edge_asin_op = (exir_ops.edge.aten.asin.default,) @@ -54,6 +66,15 @@ class DecomposeAsinAndAcosPass(ArmPass): """ + _passes_required_after: Set[Type[ExportPass]] = { + DecomposeSqrtPass, + DecomposeDivPass, + ConvertFullLikeToFullPass, + MatchArgRanksPass, + MatchArgDtypePass, + ReplaceScalarWithTensorArgPassTOSAMI, + } + def _build_polynomial( self, coefficients: list[float], variable: torch.Tensor, meta: dict[str, str] ) -> torch.Tensor: diff --git a/backends/arm/_passes/decompose_asinh_pass.py b/backends/arm/_passes/decompose_asinh_pass.py index a0b78c51a77..088230ca4b2 100644 --- a/backends/arm/_passes/decompose_asinh_pass.py +++ b/backends/arm/_passes/decompose_asinh_pass.py @@ -6,8 +6,18 @@ # pyre-unsafe +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.decompose_sqrt_pass import DecomposeSqrtPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case edge_asinh_op = (exir_ops.edge.aten.asinh.default,) @@ -20,6 +30,14 @@ class DecomposeAsinhPass(ArmPass): asinh(x) = log(x + sqrt(x^2 + 1)) """ + _passes_required_after: 
Set[Type[ExportPass]] = { + DecomposeSqrtPass, + InsertTableOpsPass, + MatchArgRanksPass, + ReplaceScalarWithTensorArgPassTOSAMI, + MatchArgDtypePass, + } + def call_operator(self, op, args, kwargs, meta): if op not in edge_asinh_op: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_atan_pass.py b/backends/arm/_passes/decompose_atan_pass.py index 57b9dde5216..03ed62e7870 100644 --- a/backends/arm/_passes/decompose_atan_pass.py +++ b/backends/arm/_passes/decompose_atan_pass.py @@ -5,9 +5,17 @@ import logging from math import pi +from typing import Set, Type from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_atan = exir_ops.edge.aten.atan.default # MI case @@ -35,6 +43,13 @@ def _get_atan_ops(op): class DecomposeAtanPass(ArmPass): """Decomposes the atan operator into a rational (Padé) approximation.""" + _passes_required_after: Set[Type[ExportPass]] = { + InsertTableOpsPass, + MatchArgRanksPass, + MatchArgDtypePass, + ReplaceScalarWithTensorArgPassTOSAMI, + } + def _rational_approximation(self, z, ops, meta): """Creates a (2,1) Padé approximation for atan(x) on [-1, 1].""" diff --git a/backends/arm/_passes/decompose_atanh_pass.py b/backends/arm/_passes/decompose_atanh_pass.py index dfdad41e556..2c8347e7e9f 100644 --- a/backends/arm/_passes/decompose_atanh_pass.py +++ b/backends/arm/_passes/decompose_atanh_pass.py @@ -3,8 +3,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this 
source tree. +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_atanh = exir_ops.edge.aten.atanh.default # MI case @@ -30,6 +39,13 @@ class DecomposeAtanhPass(ArmPass): atanh(x) = 0.5 * log((1 + x) / (1 - x)) """ + _passes_required_after: Set[Type[ExportPass]] = { + InsertTableOpsPass, + MatchArgRanksPass, + MatchArgDtypePass, + ReplaceScalarWithTensorArgPassTOSAMI, + } + def call_operator(self, op, args, kwargs, meta): if op is not edge_atanh: return super().call_operator(op, args, kwargs, meta, updated=False) diff --git a/backends/arm/_passes/decompose_avg_pool2d.py b/backends/arm/_passes/decompose_avg_pool2d.py index 21ed6b518c7..bbb8ceba129 100644 --- a/backends/arm/_passes/decompose_avg_pool2d.py +++ b/backends/arm/_passes/decompose_avg_pool2d.py @@ -4,7 +4,10 @@ # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT from executorch.backends.arm.operators.operator_validation_utils import ( adjust_pooling_pad_if_needed, ) @@ -30,11 +33,11 @@ def get_decomposition(op) -> tuple: torch.ops.aten.avg_pool2d.default, torch.ops.aten.mul.Tensor, ) - raise RuntimeError(f"Can't get div decomposition for op {op}") + raise RuntimeError(f"Can't get avg_pool2d decomposition for op {op}") class DecomposeAvgPool2d(ExportPass): - """ """ + _passes_required_after: Set[Type[ExportPass]] = {ComputeConstantOpsAOT} def call_operator(self, op, args, kwargs, meta): if op not in (edge_div_ops + aten_div_ops): diff --git a/backends/arm/_passes/decompose_batch_norm_no_stats.py b/backends/arm/_passes/decompose_batch_norm_no_stats.py index 5fdb8db2d7c..b18bd4d9ac8 100644 --- a/backends/arm/_passes/decompose_batch_norm_no_stats.py +++ b/backends/arm/_passes/decompose_batch_norm_no_stats.py @@ -6,12 +6,16 @@ # pyre-unsafe import operator +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT + +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult class DecomposeBatchNormNoStatsPass(ArmPass): @@ -33,6 +37,11 @@ class DecomposeBatchNormNoStatsPass(ArmPass): Source: https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html """ + _passes_required_after: Set[Type[ExportPass]] = { + ComputeConstantOpsAOT, + InsertTableOpsPass, + } + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # noqa: C901 bn_ops = ( exir_ops.edge.aten._native_batch_norm_legit.no_stats, diff 
--git a/backends/arm/_passes/decompose_cosh_pass.py b/backends/arm/_passes/decompose_cosh_pass.py index a94cf9ecff0..cbfbd5783e2 100644 --- a/backends/arm/_passes/decompose_cosh_pass.py +++ b/backends/arm/_passes/decompose_cosh_pass.py @@ -3,8 +3,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case edge_cosh = exir_ops.edge.aten.cosh.default @@ -19,6 +28,13 @@ class DecomposeCoshPass(ArmPass): """ + _passes_required_after: Set[Type[ExportPass]] = { + InsertTableOpsPass, + MatchArgRanksPass, + ReplaceScalarWithTensorArgPassTOSAMI, + MatchArgDtypePass, + } + def call_operator(self, op, args, kwargs, meta, updated=False): if op is not edge_cosh: return super().call_operator(op, args, kwargs, meta, updated) diff --git a/backends/arm/_passes/decompose_cosine_similarity_pass.py b/backends/arm/_passes/decompose_cosine_similarity_pass.py index 9978e653408..965dad54697 100644 --- a/backends/arm/_passes/decompose_cosine_similarity_pass.py +++ b/backends/arm/_passes/decompose_cosine_similarity_pass.py @@ -3,7 +3,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch +from executorch.backends.arm._passes.convert_full_like_to_full_pass import ( + ConvertFullLikeToFullPass, +) + +from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass +from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.pass_base import ExportPass torch_cosine_similarity = (torch.ops.aten.cosine_similarity.default,) @@ -22,6 +31,13 @@ class DecomposeCosineSimilarityPass(ExportPass): out = div(dot, denom) """ + _passes_required_after: Set[Type[ExportPass]] = { + DecomposeDivPass, + DecomposeSumPass, + ConvertFullLikeToFullPass, + InsertTableOpsPass, + } + def call_operator(self, op, args, kwargs, meta): if op not in torch_cosine_similarity: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_cumsum_pass.py b/backends/arm/_passes/decompose_cumsum_pass.py index 155ccd11594..32c59f6d793 100644 --- a/backends/arm/_passes/decompose_cumsum_pass.py +++ b/backends/arm/_passes/decompose_cumsum_pass.py @@ -4,15 +4,17 @@ # LICENSE file in the root directory of this source tree. from math import prod +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.add_bias_pass import AddBiasPass from executorch.backends.arm._passes.arm_pass_utils import create_node from executorch.backends.arm._passes.quant_args import QuantArgs from executorch.backends.transforms.utils import create_constant_placeholder from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult from torch.export.graph_signature import InputKind @@ -39,6 +41,8 @@ class DecomposeCumsumPass(ArmPass): And the convolution is applied over dimension H. 
""" + _passes_required_after: Set[Type[ExportPass]] = {AddBiasPass} + def call(self, graph_module): graph = graph_module.graph targets = (exir_ops.edge.aten.cumsum.default, torch.ops.aten.cumsum.default) diff --git a/backends/arm/_passes/decompose_div_pass.py b/backends/arm/_passes/decompose_div_pass.py index 893531dac69..b6db103930e 100644 --- a/backends/arm/_passes/decompose_div_pass.py +++ b/backends/arm/_passes/decompose_div_pass.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -6,7 +6,10 @@ # pyre-unsafe +from typing import Set, Type + import torch +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -37,6 +40,8 @@ class DecomposeDivPass(ExportPass): y = mul(a,x) """ + _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass} + def call_operator(self, op, args, kwargs, meta): if op not in (edge_div_ops + aten_div_ops): return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_div_tensor_mode.py b/backends/arm/_passes/decompose_div_tensor_mode.py index 0e6b40afbb2..b5352475d51 100644 --- a/backends/arm/_passes/decompose_div_tensor_mode.py +++ b/backends/arm/_passes/decompose_div_tensor_mode.py @@ -5,7 +5,10 @@ # pyre-unsafe +from typing import Set, Type + import torch +from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -48,6 +51,8 @@ class DecomposeDivTensorModePass(ExportPass): rounding_mode='trunc' -> where(div(a,b) < 0, ceil(div(a,b)), floor(div(a,b))) """ + _passes_required_after: Set[Type[ExportPass]] = {DecomposeDivPass} + def call_operator(self, op, args, 
kwargs, meta): if op not in (edge_div_mode_ops + aten_div_mode_ops): return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_elu_pass.py b/backends/arm/_passes/decompose_elu_pass.py index 743f1b46f4d..ba3d32b7529 100644 --- a/backends/arm/_passes/decompose_elu_pass.py +++ b/backends/arm/_passes/decompose_elu_pass.py @@ -3,8 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_elu_ops = (exir_ops.edge.aten.elu.default,) @@ -55,6 +58,8 @@ class DecomposeEluPass(ArmPass): - exir_ops.edge.aten.mul.Scalar """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op not in edge_elu_ops: return super().call_operator(op, args, kwargs, meta, updated=False) diff --git a/backends/arm/_passes/decompose_embedding_pass.py b/backends/arm/_passes/decompose_embedding_pass.py index 6de971f402f..01226a7a38e 100644 --- a/backends/arm/_passes/decompose_embedding_pass.py +++ b/backends/arm/_passes/decompose_embedding_pass.py @@ -8,8 +8,10 @@ import logging from math import prod +from typing import Set, Type import torch +from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -33,6 +35,8 @@ class DecomposeEmbeddingPass(ExportPass): i = indices is expected to be int32 before this pass """ + _passes_required_after: Set[Type[ExportPass]] = {FuseViewCopyTransform} + aten_ops = (torch.ops.aten.embedding.default,) edge_ops = (exir_ops.edge.aten.embedding.default,) diff --git a/backends/arm/_passes/decompose_expm1_pass.py b/backends/arm/_passes/decompose_expm1_pass.py index 
5b1b90495b5..5de03cbf102 100644 --- a/backends/arm/_passes/decompose_expm1_pass.py +++ b/backends/arm/_passes/decompose_expm1_pass.py @@ -3,8 +3,19 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.convert_int_pow_to_mul import ConvertIntPowToMuls +from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_expm1_ops = (exir_ops.edge.aten.expm1.default,) # MI case @@ -68,6 +79,15 @@ class DecomposeExpm1Pass(ArmPass): - exir_ops.edge.aten.logical_and.default """ + _passes_required_after: Set[Type[ExportPass]] = { + ConvertIntPowToMuls, + InsertTableOpsPass, + DecomposeDivPass, + ReplaceScalarWithTensorArgPassTOSAMI, + MatchArgDtypePass, + MatchArgRanksPass, + } + def call_operator(self, op, args, kwargs, meta): if op not in edge_expm1_ops: return super().call_operator(op, args, kwargs, meta, updated=False) diff --git a/backends/arm/_passes/decompose_gelu_pass.py b/backends/arm/_passes/decompose_gelu_pass.py index 6e72175e68b..237b8199e82 100644 --- a/backends/arm/_passes/decompose_gelu_pass.py +++ b/backends/arm/_passes/decompose_gelu_pass.py @@ -3,8 +3,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.backends.arm._passes.arm_pass_utils import get_node_arg +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -77,6 +83,13 @@ class DecomposeGeluPass(ExportPass): %op7 = mul(%op6, %FULL_0_5) """ + _passes_required_after: Set[Type[ExportPass]] = { + ComputeConstantOpsAOT, + InsertTableOpsPass, + MatchArgDtypePass, + MatchArgRanksPass, + } + def call_operator(self, op, args, kwargs, meta): if op not in torch_gelu + edge_gelu: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_glu_pass.py b/backends/arm/_passes/decompose_glu_pass.py index 183dc89cf61..373b31c5995 100644 --- a/backends/arm/_passes/decompose_glu_pass.py +++ b/backends/arm/_passes/decompose_glu_pass.py @@ -3,9 +3,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For FP case @@ -36,6 +40,8 @@ def get_ops(op): class DecomposeGluPass(ArmPass): """Decomposes the GLU operator into hadamard product and sigmoid.""" + _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass} + def call_operator(self, op, args, kwargs, meta): if op not in [edge_glu, aten_glu]: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_grouped_conv.py b/backends/arm/_passes/decompose_grouped_conv.py index ce9fe9c9937..916e43ee9a4 100644 --- a/backends/arm/_passes/decompose_grouped_conv.py +++ b/backends/arm/_passes/decompose_grouped_conv.py @@ -4,8 +4,10 @@ # LICENSE file in the root directory of this source tree. from copy import copy +from typing import Set, Type import torch +from executorch.backends.arm._passes.conv1d_unsqueeze_pass import Conv1dUnsqueezePass from executorch.backends.arm._passes.quant_args import QuantArgs from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -33,6 +35,8 @@ class DecomposeGroupedConv(ExportPass): x = cat(x1, x2) """ + _passes_required_after: Set[Type[ExportPass]] = {Conv1dUnsqueezePass} + @staticmethod def _get_decomposition(op): match op: diff --git a/backends/arm/_passes/decompose_groupnorm_pass.py b/backends/arm/_passes/decompose_groupnorm_pass.py index c6cb1b05e40..29d68234b29 100644 --- a/backends/arm/_passes/decompose_groupnorm_pass.py +++ b/backends/arm/_passes/decompose_groupnorm_pass.py @@ -6,12 +6,17 @@ # pyre-unsafe import operator +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import create_node +from 
executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass +from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult def get_group_norm_decomposition(op) -> tuple: @@ -57,6 +62,13 @@ class DecomposeGroupNormPass(ArmPass): Source: https://pytorch.org/docs/stable/generated/torch.nn.GroupNorm.html """ + _passes_required_after: Set[Type[ExportPass]] = { + InsertTableOpsPass, + DecomposeMeanDimPass, + DecomposeVarPass, + SizeAdjustInputPass, + } + def call(self, graph_module: torch.fx.GraphModule): modified = False for node in graph_module.graph.nodes: diff --git a/backends/arm/_passes/decompose_int16_activation_conv2d_pass.py b/backends/arm/_passes/decompose_int16_activation_conv2d_pass.py new file mode 100644 index 00000000000..d43c2a8c89c --- /dev/null +++ b/backends/arm/_passes/decompose_int16_activation_conv2d_pass.py @@ -0,0 +1,145 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +from typing import cast + +import torch +from executorch.backends.arm._passes.quant_args import QuantArgs + +from executorch.backends.arm.tosa.specification import get_context_spec, Tosa_1_00 +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + + +class DecomposeConv2dWithInt16ActivationPass(ExportPass): + """ + This pass decomposes a convolution with input dtype int16 and bias + into a convolution without bias followed by an addition of the bias + since the TOSA op requires the bias to be int48 which is hard to represent + in torch. Instead rescale the int48 output to int16 and add the bias in int16. + """ + + def call_operator(self, op, args, kwargs, meta): + if op != exir_ops.edge.aten.convolution.default: + return super().call_operator(op, args, kwargs, meta) + + tosa_spec = get_context_spec() + if not tosa_spec.support_integer(): + return super().call_operator(op, args, kwargs, meta) + + # return if no bias + if args[2] is None: + return super().call_operator(op, args, kwargs, meta) + + if args[0].data.dtype == torch.int8: + return super().call_operator(op, args, kwargs, meta) + elif args[0].data.dtype == torch.int16: + if isinstance(tosa_spec, Tosa_1_00) and not tosa_spec.support_extension( + "int16" + ): + raise ValueError( + "int16 activation for convolution requires TOSA int16 extension" + ) + else: + raise NotImplementedError( + "Decomposition to conv+add only implemented for activation of int16 type" + ) + + # convolution with bias and activation is int16 + # The bias is assumed to be quantized with the same quantization parameters as + # as the output of the convolution + bias = args[2] + assert ( + meta.data["output_qparams"][0].dtype == bias.data.dtype + ), "Bias needs to have same type as quantized output type" + no_bias_args = list(args) + no_bias_args[2] = None + # split up to convolution + bias + convolution = super().call_operator(op, tuple(no_bias_args), kwargs, meta) 
+ + # create a copy of the meta without the qparams, to be used with the new nodes + new_meta = meta.copy() + new_meta.data.pop("output_qparams", None) + new_meta.data.pop("input_qparams", None) + + # reshape the tensor to the same rank as the convolution output to add the bias to the channels + channel_bias = super().call_operator( + exir_ops.edge.aten.view_copy.default, + (bias, [1, len(bias.data), 1, 1]), + {}, + new_meta, + ) + + output_dtype = meta.data["output_qparams"][0].dtype + + if output_dtype == torch.int16: + # The conv will get the output int48 scaled to int32 in serialization step. + # To be able to add the bias we need to first scale (cast?) the output to int32. + # The resulting i32 sum will then need to be scaled back to the output dtype. + + # calculate common rescale factor from convolution output and bias quantization + output_qparams = cast(QuantArgs, meta.data["output_qparams"][0]) + conv_output_scale = output_qparams.scale + bias_qparams = cast(QuantArgs, meta.data["input_qparams"][2]) + bias_scale = bias_qparams.scale + + common_scale = max(bias_scale, conv_output_scale) + + # calculate how we can rescale bias and conv to a common scale and maximize the output range + bias_rescale_factor = bias_scale / common_scale + conv_rescale_factor = conv_output_scale / common_scale + + # Either of conv output or bias now covers the full int16 range and the other one a smaller range. + # Since we are upscaling to int32 we have 16 additional bits to work with to maximize the output range. + # Worst case here is that both bias and conv output covers the full int16 range so we leave one bit + # and then one for the sign bit. 
+ bits_left_to_shift = 14 + + # update rescale factors + bias_rescale_factor *= 1 << bits_left_to_shift + conv_rescale_factor *= 1 << bits_left_to_shift + + conv_output = super().call_operator( + exir_ops.backend.tosa.RESCALE.default, + (convolution, torch.int32, conv_rescale_factor, 0, 0), + {}, + new_meta, + ) + + bias_rescaled = super().call_operator( + exir_ops.backend.tosa.RESCALE.default, + (channel_bias, torch.int32, bias_rescale_factor, 0, 0), + {}, + new_meta, + ) + + add = super().call_operator( + exir_ops.edge.aten.add.Tensor, + (conv_output, bias_rescaled), + {}, + new_meta, + ) + + res_rescale = super().call_operator( + exir_ops.backend.tosa.RESCALE.default, + ( + add, + output_dtype, + (common_scale / (conv_output_scale * (1 << bits_left_to_shift))), + 0, + 0, + ), + {}, + new_meta, + ) + + else: + raise NotImplementedError( + f"Decomposition to conv+add only implemented for activation of int16 type, not for {output_dtype}" + ) + + return res_rescale diff --git a/backends/arm/_passes/decompose_layernorm_pass.py b/backends/arm/_passes/decompose_layernorm_pass.py index e6cbdfb91a0..c73806b0022 100644 --- a/backends/arm/_passes/decompose_layernorm_pass.py +++ b/backends/arm/_passes/decompose_layernorm_pass.py @@ -6,12 +6,17 @@ # pyre-unsafe import operator +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass +from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult def 
get_layer_norm_decomposition(op) -> tuple: @@ -56,6 +61,13 @@ class DecomposeLayerNormPass(ArmPass): Source: https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html """ + _passes_required_after: Set[Type[ExportPass]] = { + ComputeConstantOpsAOT, + DecomposeMeanDimPass, + DecomposeVarPass, + InsertTableOpsPass, + } + def call(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: if node.op != "call_function" or node.target not in ( diff --git a/backends/arm/_passes/decompose_leaky_relu_pass.py b/backends/arm/_passes/decompose_leaky_relu_pass.py index e896cc584be..8ae13a76eb0 100644 --- a/backends/arm/_passes/decompose_leaky_relu_pass.py +++ b/backends/arm/_passes/decompose_leaky_relu_pass.py @@ -6,9 +6,12 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_ops = (exir_ops.edge.aten.leaky_relu.default,) torch_ops = (torch.ops.aten.leaky_relu.default,) @@ -46,6 +49,8 @@ class DecomposeLeakyReLUPass(ArmPass): %op5 = add(%op1,%op4) """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op not in (edge_ops + torch_ops): return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_linalg_vector_norm_pass.py b/backends/arm/_passes/decompose_linalg_vector_norm_pass.py index 9f036c0524f..ea5dd2d9b55 100644 --- a/backends/arm/_passes/decompose_linalg_vector_norm_pass.py +++ b/backends/arm/_passes/decompose_linalg_vector_norm_pass.py @@ -3,7 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch +from executorch.backends.arm._passes.decompose_sqrt_pass import DecomposeSqrtPass +from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass from executorch.exir.pass_base import ExportPass @@ -28,6 +32,11 @@ class DecomposeLinearVectorNormPass(ExportPass): dtype prior, but we dont know this from FX graph. """ + _passes_required_after: Set[Type[ExportPass]] = { + DecomposeSqrtPass, + DecomposeSumPass, + } + torch_linalg_vector_norm = (torch.ops.aten.linalg_vector_norm.default,) def call_operator(self, op, args, kwargs, meta): diff --git a/backends/arm/_passes/decompose_linear_pass.py b/backends/arm/_passes/decompose_linear_pass.py index 3d154d9b81e..70268c77a1d 100644 --- a/backends/arm/_passes/decompose_linear_pass.py +++ b/backends/arm/_passes/decompose_linear_pass.py @@ -5,6 +5,8 @@ # pyre-unsafe +from typing import Set, Type + import numpy as np from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import ( @@ -12,7 +14,7 @@ get_first_fake_tensor, ) from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult class DecomposeLinearPass(ArmPass): @@ -25,6 +27,8 @@ class DecomposeLinearPass(ArmPass): output = view(conv2d) """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call(self, graph_module): for node in graph_module.graph.nodes: if node.op != "call_function": diff --git a/backends/arm/_passes/decompose_logit_pass.py b/backends/arm/_passes/decompose_logit_pass.py index 40e2b22cb54..213b8f038e8 100644 --- a/backends/arm/_passes/decompose_logit_pass.py +++ b/backends/arm/_passes/decompose_logit_pass.py @@ -3,10 +3,19 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For FP case @@ -60,6 +69,13 @@ class DecomposeLogitPass(ArmPass): log(y * reciprocal((-1) * y + 1)) """ + _passes_required_after: Set[Type[ExportPass]] = { + InsertTableOpsPass, + MatchArgRanksPass, + MatchArgDtypePass, + ReplaceScalarWithTensorArgPassTOSAMI, + } + def call_operator(self, op, args, kwargs, meta): if op not in [edge_logit, aten_logit]: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_masked_fill.py b/backends/arm/_passes/decompose_masked_fill.py index fbf3079c92b..8c41c1a11bc 100644 --- a/backends/arm/_passes/decompose_masked_fill.py +++ b/backends/arm/_passes/decompose_masked_fill.py @@ -6,10 +6,16 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.convert_full_like_to_full_pass import ( + ConvertFullLikeToFullPass, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_ops = (exir_ops.edge.aten.masked_fill.Scalar,) @@ -37,6 +43,8 @@ class DecomposeMaskedFill(ArmPass): Decomposed to a where and a full_like operator. 
""" + _passes_required_after: Set[Type[ExportPass]] = {ConvertFullLikeToFullPass} + def call_operator(self, op, args, kwargs, meta, updated=False): if op not in (edge_ops + aten_ops): return super().call_operator(op, args, kwargs, meta, updated) diff --git a/backends/arm/_passes/decompose_maxpool2d_with_dilation.py b/backends/arm/_passes/decompose_maxpool2d_with_dilation.py index ff6db260099..22d2ec1d85b 100644 --- a/backends/arm/_passes/decompose_maxpool2d_with_dilation.py +++ b/backends/arm/_passes/decompose_maxpool2d_with_dilation.py @@ -6,9 +6,12 @@ # pyre-unsafe import operator +from typing import Set, Type from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # We'll decompose only the EXIR edge max_pool2d ops when dilation > 1 EDGE_MAXPOOL2D = ( @@ -22,6 +25,10 @@ class DecomposeMaxPool2DPass(ArmPass): Decompose dilated max_pool2d (EXIR edge ops) into space-to-batch -> maxpool -> batch-to-space. 
""" + _passes_required_after: Set[Type[ExportPass]] = { + SizeAdjustInputPass, + } + def call_operator(self, op, args, kwargs, meta): # Only intercept EXIR edge max_pool2d ops if op not in EDGE_MAXPOOL2D: diff --git a/backends/arm/_passes/decompose_meandim_pass.py b/backends/arm/_passes/decompose_meandim_pass.py index a78514b6af5..4d4c0ee75b1 100644 --- a/backends/arm/_passes/decompose_meandim_pass.py +++ b/backends/arm/_passes/decompose_meandim_pass.py @@ -5,12 +5,17 @@ from copy import copy from math import prod +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import get_node_arg +from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT +from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass from executorch.exir.backend.utils import WhyNoPartitionReporter from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass def get_meandim_decomposition(op) -> tuple: @@ -62,6 +67,12 @@ class DecomposeMeanDimPass(ArmPass): x = view_copy.default(x, new_shape=(h)) # Squeeze dims since keepdims = False """ + _passes_required_after: Set[Type[ExportPass]] = { + ComputeConstantOpsAOT, + DecomposeSumPass, + SizeAdjustInputPass, + } + def __init__(self, graph_module, tosa_spec): super().__init__() self._graph_module = graph_module @@ -83,6 +94,8 @@ def call_operator(self, op, args, kwargs, meta): input_shape = list(x.data.shape) output_shape = list(meta["val"].shape) dims_to_reduce = get_node_arg(args, 1) + if dims_to_reduce is None: + dims_to_reduce = range(len(input_shape)) dims_to_reduce = [dim % len(input_shape) for dim in dims_to_reduce] dims_to_reduce = [dim for dim in dims_to_reduce if input_shape[dim] != 1] diff --git a/backends/arm/_passes/decompose_ne_pass.py 
b/backends/arm/_passes/decompose_ne_pass.py index 16443d5d2fb..3bd4f4540bb 100644 --- a/backends/arm/_passes/decompose_ne_pass.py +++ b/backends/arm/_passes/decompose_ne_pass.py @@ -3,9 +3,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_ne_ops = (exir_ops.edge.aten.ne.Tensor,) aten_ne_ops = (torch.ops.aten.ne.Tensor, torch.ops.aten.ne_.Tensor) @@ -53,6 +56,8 @@ class DecomposeNotEqualPass(ArmPass): - followed by aten.logical_not.default or its edge equivalent """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op not in (edge_ne_ops + aten_ne_ops): return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_round_pass.py b/backends/arm/_passes/decompose_round_pass.py index edfa3817064..35d36e80396 100644 --- a/backends/arm/_passes/decompose_round_pass.py +++ b/backends/arm/_passes/decompose_round_pass.py @@ -3,10 +3,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload +from executorch.exir.pass_base import ExportPass from torch._ops import OpOverload @@ -56,6 +59,8 @@ class DecomposeRoundPass(ArmPass): %result = where(%is_non_negative, %floor, %ceil) """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta, updated=False): if op not in (exir_ops.edge.aten.round.default, torch.ops.aten.round.default): return super().call_operator(op, args, kwargs, meta, updated) diff --git a/backends/arm/_passes/decompose_select.py b/backends/arm/_passes/decompose_select.py index 99c89f474ea..049409af6fd 100644 --- a/backends/arm/_passes/decompose_select.py +++ b/backends/arm/_passes/decompose_select.py @@ -6,11 +6,16 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes.arm_pass_utils import ( create_node, get_first_fake_tensor, ) +from executorch.backends.arm._passes.convert_squeezes_to_view import ( + ConvertSqueezesToViewPass, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -20,6 +25,8 @@ class DecomposeSelectPass(ExportPass): This pass decomposes select into slice + squeeze to ensure that Aten and TOSA outputs has the same rank (input rank -1) """ + _passes_required_after: Set[Type[ExportPass]] = {ConvertSqueezesToViewPass} + def call(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: diff --git a/backends/arm/_passes/decompose_sign_pass.py b/backends/arm/_passes/decompose_sign_pass.py index 1038ff0f3fa..c4cb964316d 100644 --- a/backends/arm/_passes/decompose_sign_pass.py +++ b/backends/arm/_passes/decompose_sign_pass.py @@ -3,10 +3,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in 
the root directory of this source tree. +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case @@ -42,6 +45,8 @@ def get_ops(op): class DecomposeSignPass(ArmPass): """Decomposes the sign operator into a sequence of operations that are supported by the Arm backend.""" + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op not in (edge_sign, aten_sign): return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_silu_pass.py b/backends/arm/_passes/decompose_silu_pass.py index 68ebb3f4515..3d31552cf35 100644 --- a/backends/arm/_passes/decompose_silu_pass.py +++ b/backends/arm/_passes/decompose_silu_pass.py @@ -5,7 +5,10 @@ # pyre-unsafe +from typing import Set, Type + import torch +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.pass_base import ExportPass aten_silu_ops = (torch.ops.aten.silu.default, torch.ops.aten.silu_.default) @@ -22,6 +25,8 @@ class DecomposeSiluPass(ExportPass): y = mul(a,x) """ + _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass} + def call_operator(self, op, args, kwargs, meta): if op not in (aten_silu_ops): return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_sinh_pass.py b/backends/arm/_passes/decompose_sinh_pass.py index 7192eb9bf74..acb18df3134 100644 --- a/backends/arm/_passes/decompose_sinh_pass.py +++ b/backends/arm/_passes/decompose_sinh_pass.py @@ -4,8 +4,17 @@ # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case @@ -24,6 +33,13 @@ class DecomposeSinhPass(ArmPass): and scalar multiplication. """ + _passes_required_after: Set[Type[ExportPass]] = { + InsertTableOpsPass, + MatchArgRanksPass, + ReplaceScalarWithTensorArgPassTOSAMI, + MatchArgDtypePass, + } + def call_operator(self, op, args, kwargs, meta): if op is not edge_sinh: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_softmax_pass.py b/backends/arm/_passes/decompose_softmax_pass.py index a735501f711..52df7cf6700 100644 --- a/backends/arm/_passes/decompose_softmax_pass.py +++ b/backends/arm/_passes/decompose_softmax_pass.py @@ -3,7 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch +from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -62,6 +66,11 @@ class DecomposeSoftmaxPass(ExportPass): (in logsoftmax case: %op7 = log(%op6)) """ + _passes_required_after: Set[Type[ExportPass]] = { + DecomposeSumPass, + InsertTableOpsPass, + } + def call_operator(self, op, args, kwargs, meta): if op not in torch_softmax + edge_softmax: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_softmax_unstable_pass.py b/backends/arm/_passes/decompose_softmax_unstable_pass.py index b6f5e11b66b..04e99a46b3e 100644 --- a/backends/arm/_passes/decompose_softmax_unstable_pass.py +++ b/backends/arm/_passes/decompose_softmax_unstable_pass.py @@ -5,9 +5,14 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For BI case torch_softmax = (torch.ops.aten.softmax.int, torch.ops.aten.log_softmax.int) @@ -57,6 +62,11 @@ class DecomposeSoftmaxUnstablePass(ArmPass): (in logsoftmax case: %op5 = log(%op4)) """ + _passes_required_after: Set[Type[ExportPass]] = { + DecomposeSumPass, + InsertTableOpsPass, + } + def call_operator(self, op, args, kwargs, meta): if op not in torch_softmax + edge_softmax: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_sqrt_pass.py b/backends/arm/_passes/decompose_sqrt_pass.py index 547d0091e90..3f4e608c4b9 100644 --- a/backends/arm/_passes/decompose_sqrt_pass.py +++ 
b/backends/arm/_passes/decompose_sqrt_pass.py @@ -4,9 +4,10 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe -from typing import Tuple, Union +from typing import Set, Tuple, Type, Union import torch +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -27,6 +28,7 @@ def get_sqrt_decomposition(op) -> Union[Tuple, torch._ops.OpOverload]: class DecomposeSqrtPass(ExportPass): + _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass} def call_operator(self, op, args, kwargs, meta): """ diff --git a/backends/arm/_passes/decompose_sum_pass.py b/backends/arm/_passes/decompose_sum_pass.py index 52b9c10c49f..16027ccec2b 100644 --- a/backends/arm/_passes/decompose_sum_pass.py +++ b/backends/arm/_passes/decompose_sum_pass.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -40,6 +42,8 @@ class DecomposeSumPass(ExportPass): view(shape = squeezed_shape) -> squeezed_shape """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op not in [ exir_ops.edge.aten.sum.dim_IntList, diff --git a/backends/arm/_passes/decompose_var_pass.py b/backends/arm/_passes/decompose_var_pass.py index 15872738f3e..db5d820ac70 100644 --- a/backends/arm/_passes/decompose_var_pass.py +++ b/backends/arm/_passes/decompose_var_pass.py @@ -7,10 +7,16 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import get_node_arg +from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass +from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass def get_var_decomposition(op) -> tuple: @@ -47,6 +53,12 @@ class DecomposeVarPass(ArmPass): y = div(sum, max(0, N-correction)) """ + _passes_required_after: Set[Type[ExportPass]] = { + ComputeConstantOpsAOT, + DecomposeMeanDimPass, + DecomposeSumPass, + } + def call_operator(self, op, args, kwargs, meta): if op not in ( exir_ops.edge.aten.var.correction, diff --git a/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py b/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py index 17a682c0a8e..9d704520302 100644 --- a/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py +++ b/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py @@ -6,10 +6,13 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes 
import ArmPass from executorch.backends.arm._passes.arm_pass_utils import get_node_arg from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass def _get_decorated_ops(op): @@ -40,6 +43,8 @@ class DecorateFp32toInt32CastingPass(ArmPass): output = to_dim_order_copy(decorated_x, dtype=torch.int32) """ + _passes_required_after: Set[Type[ExportPass]] = set() + targets = [ exir_ops.edge.dim_order_ops._to_dim_order_copy.default, ] diff --git a/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py b/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py index 491b404f0a4..477e007b8bf 100644 --- a/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py +++ b/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py @@ -8,15 +8,17 @@ import copy -from typing import cast, Dict, Set, Tuple +from typing import cast, Dict, Set, Tuple, Type from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import ( get_param_tensor, is_param_node, ) +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.backends.arm._passes.quant_args import QuantArgs +from executorch.backends.arm._passes.remove_noop_pass import RemoveNoopPass from executorch.backends.arm.constants import DQ_OPS, Q_OPS from executorch.exir.dialects._ops import ops as exir_ops @@ -70,6 +72,44 @@ def get_output_qparams(node: Node) -> dict[int, QuantArgs]: return output_qparams +class RetraceFoldedDtypesPass(ExportPass): + """ + FoldAndAnnotateQParamsPass folds dq and q nodes. When the graph is retraced + some operators are retraced to types that cannot be handled by TOSA. One + such example is sum.dim_IntList: + q (int8) -> dq (fp32) -> sum (fp32) -> q (int8) ... + After folding it becomes: + q (int8) -> sum (int64) -> ... + This pass changes types of ops in self.targeted_ops, such as sum, so that + the output type of that matches the type of the output_qparams. 
+ """ + + _passes_required_after: Set[Type[ExportPass]] = set() + + targeted_ops: Set[EdgeOpOverload] = { + exir_ops.edge.aten.sum.dim_IntList, + } + + def call_operator( + self, + op, # pyre-ignore + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op not in self.targeted_ops: + return super().call_operator(op, args, kwargs, meta) + + node_kwargs = kwargs.copy() + output_qparams = meta["output_qparams"] + if len(output_qparams) == 0: + return super().call_operator(op, args, kwargs, meta) + + output_dtype = output_qparams[0].dtype + node_kwargs["dtype"] = output_dtype + return super().call_operator(op, args, node_kwargs, meta) + + class FoldAndAnnotateQParamsPass(ArmPass): """ A pass that walks the graph and removes any DQ and Q nodes before and after the target @@ -100,6 +140,12 @@ class FoldAndAnnotateQParamsPass(ArmPass): """ + _passes_required_after: Set[Type[ExportPass]] = { + RetraceFoldedDtypesPass, + InsertTableOpsPass, + RemoveNoopPass, + } + def fold_and_annotate_arg( self, graph_module: GraphModule, node: Node, arg_list: list[Node], i: int ) -> None: @@ -210,6 +256,8 @@ class QuantizeOperatorArguments(ExportPass): - Makes sure the min and max values to clamp.default are quantized, if it's a quantized operator. """ + _passes_required_after: Set[Type[ExportPass]] = {FoldAndAnnotateQParamsPass} + def call(self, graph_module: GraphModule) -> PassResult: modified = False # Loop over the graph nodes and find full.default nodes. @@ -243,39 +291,3 @@ def call(self, graph_module: GraphModule) -> PassResult: modified = True return PassResult(graph_module, modified) - - -class RetraceFoldedDtypesPass(ExportPass): - """ - FoldAndAnnotateQParamsPass folds dq and q nodes. When the graph is retraced - some operators are retraced to types that cannot be handled by TOSA. One - such example is sum.dim_IntList: - q (int8) -> dq (fp32) -> sum (fp32) -> q (int8) ... 
- After folding it becomes: - q (int8) -> sum (int64) -> ... - This pass changes types of ops in self.targeted_ops, such as sum, so that - the output type of that matches the type of the output_qparams. - """ - - targeted_ops: Set[EdgeOpOverload] = { - exir_ops.edge.aten.sum.dim_IntList, - } - - def call_operator( - self, - op, # pyre-ignore - args: Tuple[Argument, ...], - kwargs: Dict[str, Argument], - meta: NodeMetadata, - ) -> ProxyValue: - if op not in self.targeted_ops: - return super().call_operator(op, args, kwargs, meta) - - node_kwargs = kwargs.copy() - output_qparams = meta["output_qparams"] - if len(output_qparams) == 0: - return super().call_operator(op, args, kwargs, meta) - - output_dtype = output_qparams[0].dtype - node_kwargs["dtype"] = output_dtype - return super().call_operator(op, args, node_kwargs, meta) diff --git a/backends/arm/_passes/fuse_batchnorm2d_pass.py b/backends/arm/_passes/fuse_batchnorm2d_pass.py index 2dbdfa84cec..8be6b61d25c 100644 --- a/backends/arm/_passes/fuse_batchnorm2d_pass.py +++ b/backends/arm/_passes/fuse_batchnorm2d_pass.py @@ -5,11 +5,14 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes.arm_pass_utils import ( create_node, get_first_fake_tensor, ) +from executorch.backends.arm.common.debug import get_node_debug_info from executorch.backends.transforms.utils import ( create_constant_placeholder, delete_constant_placeholder, @@ -28,6 +31,8 @@ class FuseBatchnorm2DPass(ExportPass): the weights and bias of the convolution and removing the batchnorm. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program: ExportedProgram): self.exported_program = exported_program super().__init__() @@ -56,8 +61,16 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # noqa: C901 input_node = node.all_input_nodes[0] is_single_user = len(input_node.users) == 1 bn_weight_node, bn_bias_node, bn_mean_node, bn_var_node = node.args[1:5] - assert bn_mean_node is not None, "Batchnorm mean node cannot be None." - assert bn_var_node is not None, "Batchnorm var node cannot be None." + if bn_mean_node is None: + raise RuntimeError( + "BatchNorm mean buffer missing for node: " + f"{get_node_debug_info(node, graph_module)}" + ) + if bn_var_node is None: + raise RuntimeError( + "BatchNorm variance buffer missing for node: " + f"{get_node_debug_info(node, graph_module)}" + ) epsilon = node.args[-1] @@ -129,14 +142,23 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # noqa: C901 input_node = new_input_node else: input_weight_node, input_bias_node = input_node.args[1:3] - assert ( - isinstance(input_weight_node, Node) - and input_weight_node.op == "placeholder" - ), "Parameter weight of convolution must be a placeholder" - assert (input_bias_node is None) or ( + if not ( isinstance(input_weight_node, Node) and input_weight_node.op == "placeholder" - ), "Parameter bias of convolution must be a placeholder or None" + ): + raise RuntimeError( + "Parameter weight of convolution must be a placeholder" + ) + if not ( + (input_bias_node is None) + or ( + isinstance(input_weight_node, Node) + and input_weight_node.op == "placeholder" + ) + ): + raise RuntimeError( + "Parameter bias of convolution must be a placeholder or None" + ) input_weight_tensor = torch.Tensor( get_param(self.exported_program, input_weight_node) diff --git a/backends/arm/_passes/fuse_constant_ops_pass.py b/backends/arm/_passes/fuse_constant_ops_pass.py index f49565e3c38..c48fc008b5d 100644 --- 
a/backends/arm/_passes/fuse_constant_ops_pass.py +++ b/backends/arm/_passes/fuse_constant_ops_pass.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. import logging +from typing import Set, Type import torch._export.utils import torch.fx @@ -13,6 +14,9 @@ get_param_tensor, is_persistent_buffer, ) +from executorch.backends.arm._passes.fuse_equal_placeholders_pass import ( + FuseEqualPlaceholdersPass, +) from executorch.backends.transforms.utils import ( create_constant_placeholder, delete_constant_placeholder, @@ -41,6 +45,8 @@ def f(): return x """ + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program: ExportedProgram) -> None: super().__init__() self.exported_program = exported_program @@ -108,8 +114,10 @@ def call(self, graph_module): if node.op != "call_function": continue if node.target in [ - exir_ops.backend.tosa.TABLE.default, + exir_ops.backend.tosa.MATMUL.default, exir_ops.backend.tosa.RESCALE.default, + exir_ops.backend.tosa.RESIZE.default, + exir_ops.backend.tosa.TABLE.default, exir_ops.backend.tosa.TRANSPOSE.default, ]: continue @@ -168,6 +176,8 @@ def f(node_name_pre_computed): return node_name_pre_computed """ + _passes_required_after: Set[Type[ExportPass]] = {FuseEqualPlaceholdersPass} + targeted_ops = [ exir_ops.edge.aten.full.default, exir_ops.edge.aten.arange.start_step, diff --git a/backends/arm/_passes/fuse_equal_placeholders_pass.py b/backends/arm/_passes/fuse_equal_placeholders_pass.py index 5631e2f32e9..b8b8143e6c5 100644 --- a/backends/arm/_passes/fuse_equal_placeholders_pass.py +++ b/backends/arm/_passes/fuse_equal_placeholders_pass.py @@ -5,13 +5,16 @@ import hashlib from collections import defaultdict +from typing import Set, Type import torch + from executorch.backends.arm._passes.arm_pass_utils import ( get_constant_placeholder_kind, get_param_tensor, is_param_node, ) +from executorch.backends.arm.tosa.mapping import TosaSpecialDtype from 
executorch.backends.transforms.utils import ( create_constant_placeholder, delete_constant_placeholder, @@ -27,6 +30,8 @@ class FuseEqualPlaceholdersPass(ExportPass): with multiple users, using a cache for faster comparison. """ + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program: ExportedProgram): self.exported_program = exported_program super().__init__() @@ -44,9 +49,14 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: continue # Create a lightweight fingerprint: dtype + shape + SHA1 of raw bytes # Ensure tensor is on CPU and contiguous + + # ensure we don't merge any special case int48_t tensors with int32_t tensors + # since int48_t tensors needs to be instantiated separately. + is_int48 = node.meta.get(TosaSpecialDtype.meta_key(), None) t_cpu = tensor.detach().cpu().contiguous() data_bytes = t_cpu.numpy().tobytes() key = ( + is_int48, str(t_cpu.dtype), tuple(t_cpu.shape), hashlib.sha1(data_bytes).hexdigest(), diff --git a/backends/arm/_passes/fuse_quantized_activation_pass.py b/backends/arm/_passes/fuse_quantized_activation_pass.py index 46a7d7f6f98..1076a3df658 100644 --- a/backends/arm/_passes/fuse_quantized_activation_pass.py +++ b/backends/arm/_passes/fuse_quantized_activation_pass.py @@ -5,15 +5,28 @@ # pyre-unsafe +from typing import Set, Type + import torch +from executorch.backends.arm._passes.convert_to_clamp import ConvertToClampPass +from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( + FoldAndAnnotateQParamsPass, +) from executorch.backends.arm._passes.quant_args import QuantArgs from executorch.backends.arm.constants import Q_OPS +from executorch.backends.transforms.remove_getitem_op import RemoveGetItemPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult from torch.fx import Node class FuseQuantizedActivationPass(ExportPass): + _passes_required_after: Set[Type[ExportPass]] = { + 
ConvertToClampPass, + FoldAndAnnotateQParamsPass, + RemoveGetItemPass, + } + @staticmethod def _is_fuseable_quantized_activation(node: Node): """Fuse activations that have a 0 lower bound and quantized with a qmin zero-point""" diff --git a/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py b/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py index 4b619af790c..c6e6f70a630 100644 --- a/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py +++ b/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py @@ -8,8 +8,13 @@ import logging +from typing import Set, Type + import torch from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.backends.arm._passes.decompose_embedding_pass import ( + DecomposeEmbeddingPass, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import EdgeOpOverload, ExportPass, PassResult from torch._subclasses.fake_tensor import FakeTensor @@ -26,6 +31,8 @@ class InsertInt32CastsAfterInt64PlaceholdersPass(ExportPass): the int32 range. """ + _passes_required_after: Set[Type[ExportPass]] = {DecomposeEmbeddingPass} + # Ops that require i64 inputs → positions of args to upcast. # Key: op overload; Value: zero-based indices of positional args that must be i64. I64_INPUT_ARG_POSITIONS = { diff --git a/backends/arm/_passes/insert_rescales_pass.py b/backends/arm/_passes/insert_rescales_pass.py index 7f75aecf24c..d56e70e78b3 100644 --- a/backends/arm/_passes/insert_rescales_pass.py +++ b/backends/arm/_passes/insert_rescales_pass.py @@ -4,9 +4,14 @@ # LICENSE file in the root directory of this source tree. 
from copy import copy -from typing import cast +from typing import cast, Dict, Optional, Set, Tuple, Type -from executorch.backends.arm._passes.arm_pass_utils import create_node +import torch +from executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.arm_pass_utils import create_node, set_node_arg +from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( + get_output_qparams, +) from executorch.backends.arm._passes.quant_args import QuantArgs from executorch.backends.arm.constants import DQ_OPS, Q_OPS from executorch.exir.dialects._ops import ops as exir_ops @@ -24,6 +29,8 @@ class InsertRescalePass(ExportPass): in the fake implementation of. """ + _passes_required_after: Set[Type[ExportPass]] = set() + def fold_dq_q_to_rescale(self, node: Node, user: Node, graph_module: GraphModule): dq_args = QuantArgs.from_operator(node.target, node.args) q_args = QuantArgs.from_operator(user.target, user.args) @@ -63,3 +70,234 @@ def call(self, graph_module: GraphModule) -> PassResult: graph_module = super().call(graph_module).graph_module graph_module.recompile() return PassResult(graph_module, modified) + + +class InsertRescaleInt32Pass(ArmPass): + """ + Numerous TOSA ops require inputs and outputs to be 32-bit integers in their + quantized implementations. This pass treats such operator nodes by + inserting rescale ops before and after them if needed. Note that extra logic + that handles the scales and zero points must be in place because the affected + TOSA have naive implementations that do not account for the quantization + parameters. 
+ """ + + _passes_required_after: Set[Type[ExportPass]] = set() + + included_targets = [ + exir_ops.edge.aten.abs.default, + exir_ops.edge.aten.eq.Tensor, + exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.le.Tensor, + exir_ops.edge.aten.lt.Tensor, + exir_ops.edge.aten.maximum.default, + exir_ops.edge.aten.minimum.default, + ] + + def _int32_qargs(self, s): + """Helper creator function for INT32-based QuantArgs""" + + return QuantArgs( + scale=s, + zp=0, + qmin=torch.iinfo(torch.int32).min, + qmax=torch.iinfo(torch.int32).max, + dtype=torch.int32, + ) + + def _get_inputs_rescaled_qparams( + self, target, input_qparams: Dict[int, QuantArgs] + ) -> Dict[int, QuantArgs]: + """Get the qparams for the INT32 operands to the op ``target`` + + Inputs to the INT32-based operator must be rescaled from INT8 to INT32. + This function computes the ``QuantArgs`` for each of the operands and returns + it as a dict, mapping tensor index to ``QuantArgs``. + """ + + if target in [ + exir_ops.edge.aten.abs.default, + exir_ops.edge.aten.eq.Tensor, + exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.le.Tensor, + exir_ops.edge.aten.lt.Tensor, + exir_ops.edge.aten.minimum.default, + exir_ops.edge.aten.maximum.default, + ]: + # For these ops, use the smallest scale among the INT8 operands. 
+ min_scale = min( + [qp.get_scale_per_tensor() for qp in input_qparams.values()] + ) + qparams = { + i: self._int32_qargs(min_scale) for i in range(len(input_qparams)) + } + else: + raise ValueError(f"Not a valid target: {target}") + + return qparams + + def _get_output_qparams( + self, target, inputs_qparams: Dict[int, QuantArgs] + ) -> Optional[QuantArgs]: + """Given an op ``target`` and the ``QuantArgs`` for each of its inputs, compute + the scale of the output based on how the operator itself affects it.""" + + if target in [ + exir_ops.edge.aten.abs.default, + exir_ops.edge.aten.maximum.default, + exir_ops.edge.aten.minimum.default, + ]: + # The op has not altered the scale; the output scale is equal to + # the operands' scales. + return self._int32_qargs(inputs_qparams[0].get_scale_per_tensor()) + elif target in [ + exir_ops.edge.aten.eq.Tensor, + exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.le.Tensor, + exir_ops.edge.aten.lt.Tensor, + ]: + # Output is bool for these ops and thus no qparams are present + return None + else: + raise ValueError(f"Not a valid target: {target}") + + def _get_rescale_qparams( + self, target, input_qparams: Dict[int, QuantArgs] + ) -> Tuple[Dict[int, QuantArgs], Optional[QuantArgs]]: + """ + Get the quantization parameters of the INT32 inputs/outputs that will + surround the node after the new RESCALE ops have been inserted. 
+ """ + + inputs_rescaled_qparams = self._get_inputs_rescaled_qparams( + target, input_qparams + ) + output_qparams = self._get_output_qparams(target, inputs_rescaled_qparams) + + return (inputs_rescaled_qparams, output_qparams) + + def _rescale_inputs(self, graph, node, rescale_qargs: Dict[int, QuantArgs]) -> bool: + qargs = node.meta["input_qparams"] + + args_copy = list(node.args) + seen_args = set() + modified = False + for i in qargs: + qp = qargs[i] + if qp.dtype != torch.int8: + continue + + arg_node = args_copy[i] + if arg_node in seen_args: + continue + seen_args.add(arg_node) + + with graph.inserting_after(arg_node): + rescale_node = create_node( + graph, + exir_ops.backend.tosa.RESCALE.default, + ( + arg_node, + torch.int32, + qp.get_scale_per_tensor() + / rescale_qargs[ + i + ].get_scale_per_tensor(), # Old scale / new scale + qp.get_zp_per_tensor(), # Old zero point + rescale_qargs[i].get_zp_per_tensor(), # New zero point + ), + from_node=node, + ) + + node.replace_input_with(arg_node, rescale_node) + modified = True + + return modified + + def _rescale_outputs(self, graph, node, rescale_qargs: Optional[QuantArgs]) -> bool: + if "output_qparams" not in node.meta or len(node.meta["output_qparams"]) == 0: + return False + + qargs = get_output_qparams(node) + assert len(qargs) == 1 + assert rescale_qargs is not None + + qarg = qargs[0] + if qarg.dtype != torch.int8: + return False + + users_copy = list(node.users) + + with graph.inserting_after(node): + rescale_node = create_node( + graph, + exir_ops.backend.tosa.RESCALE.default, + ( + node, + torch.int8, + rescale_qargs.get_scale_per_tensor() + / qarg.get_scale_per_tensor(), # Old scale / new scale + rescale_qargs.get_zp_per_tensor(), # Old zero point + qarg.get_zp_per_tensor(), # New zero point + ), + from_node=node, + ) + + for user in users_copy: + user.replace_input_with(node, rescale_node) + + return True + + def call(self, graph_module: GraphModule) -> PassResult: + graph = graph_module.graph + + 
modified = False + for node in list(graph.nodes): + node = cast(Node, node) + + if node.op != "call_function" or node.target not in self.included_targets: + continue + + if "input_qparams" not in node.meta or len(node.meta["input_qparams"]) == 0: + continue + input_qparams = node.meta["input_qparams"] + + inputs_rescale_qargs, output_rescale_qargs = self._get_rescale_qparams( + node.target, input_qparams + ) + + inputs_was_rescaled = self._rescale_inputs( + graph, node, inputs_rescale_qargs + ) + outputs_was_rescaled = False + if inputs_was_rescaled: + outputs_was_rescaled = self._rescale_outputs( + graph, node, output_rescale_qargs + ) + modified = True + + # Update node metadata + + if inputs_was_rescaled: + assert len(inputs_rescale_qargs) == len(node.meta["input_qparams"]) + node.meta["input_qparams"] = inputs_rescale_qargs + + if outputs_was_rescaled: + assert len(node.meta["output_qparams"]) == 1 + node.meta["output_qparams"] = {0: output_rescale_qargs} + + # If the output type is specified in the node, change it such + # that it matches the subsequent rescale node(s) that this node + # now has output edges to. 
+ if "dtype" in node.kwargs: + set_node_arg(node, "dtype", torch.int32) + + if modified: + # Retrace the graph to update the fake tensor types + graph_module = super().call(graph_module).graph_module + graph_module.recompile() + + return PassResult(graph_module, modified) diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py index fb5d7de5e12..d838ddc823d 100644 --- a/backends/arm/_passes/insert_table_ops.py +++ b/backends/arm/_passes/insert_table_ops.py @@ -6,7 +6,7 @@ # pyre-unsafe from itertools import chain -from typing import Callable, cast, Dict, Iterator, Set +from typing import Callable, cast, Dict, Iterator, Set, Type import torch from executorch.backends.arm._passes.arm_pass_utils import create_node @@ -117,6 +117,8 @@ class InsertTableOpsPass(ExportPass): which will be used to produce the table values in operators/op_table.py. """ + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program: ExportedProgram) -> None: super().__init__() self.exported_program = exported_program diff --git a/backends/arm/_passes/match_arg_dtype_pass.py b/backends/arm/_passes/match_arg_dtype_pass.py index e7bf3b2d60e..d482614b03f 100644 --- a/backends/arm/_passes/match_arg_dtype_pass.py +++ b/backends/arm/_passes/match_arg_dtype_pass.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.backends.arm._passes.arm_pass_utils import create_node, get_node_arg from executorch.exir.dialects._ops import ops as exir_ops @@ -38,6 +40,8 @@ class MatchArgDtypePass(ExportPass): """ + _passes_required_after: Set[Type[ExportPass]] = set() + targeted_ops = {exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.where.self} def call(self, graph_module: torch.fx.GraphModule): diff --git a/backends/arm/_passes/match_arg_ranks_pass.py b/backends/arm/_passes/match_arg_ranks_pass.py index d6cdfacb612..c411f3b8083 100644 --- a/backends/arm/_passes/match_arg_ranks_pass.py +++ b/backends/arm/_passes/match_arg_ranks_pass.py @@ -7,7 +7,7 @@ # pyre-unsafe -from typing import cast +from typing import cast, Set, Type from executorch.backends.arm._passes.arm_pass_utils import ( create_node, @@ -36,6 +36,8 @@ class MatchArgRanksPass(ExportPass): input2 = shape(1, 3, 1) """ + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program): super().__init__() self.exported_program = exported_program diff --git a/backends/arm/_passes/mm_to_bmm_pass.py b/backends/arm/_passes/mm_to_bmm_pass.py index 69d8573013e..c6f4786365d 100644 --- a/backends/arm/_passes/mm_to_bmm_pass.py +++ b/backends/arm/_passes/mm_to_bmm_pass.py @@ -6,12 +6,20 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes.arm_pass_utils import ( create_node, get_first_fake_tensor, insert_q_dq_pair, ) +from executorch.backends.arm._passes.convert_squeezes_to_view import ( + ConvertSqueezesToViewPass, +) +from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( + FoldAndAnnotateQParamsPass, +) from executorch.backends.arm.constants import DQ_OPS, Q_OPS from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -28,6 +36,11 @@ class ConvertMmToBmmPass(ExportPass): 3) Squeeze output tensor to rank 2. 
""" + _passes_required_after: Set[Type[ExportPass]] = { + ConvertSqueezesToViewPass, + FoldAndAnnotateQParamsPass, + } + def call(self, graph_module: torch.fx.GraphModule): modified_graph = False graph = graph_module.graph diff --git a/backends/arm/_passes/remove_noop_pass.py b/backends/arm/_passes/remove_noop_pass.py index 623517aac59..55c4f71f0a8 100644 --- a/backends/arm/_passes/remove_noop_pass.py +++ b/backends/arm/_passes/remove_noop_pass.py @@ -7,6 +7,7 @@ # pyre-unsafe import logging +from typing import Set, Type from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -17,6 +18,8 @@ class RemoveNoopPass(ExportPass): """Remove no-ops from graph_module""" + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op not in ( exir_ops.edge.dim_order_ops._clone_dim_order.default, diff --git a/backends/arm/_passes/replace_inf_values_pass.py b/backends/arm/_passes/replace_inf_values_pass.py index 8c721eda3d8..506030d82d7 100644 --- a/backends/arm/_passes/replace_inf_values_pass.py +++ b/backends/arm/_passes/replace_inf_values_pass.py @@ -7,6 +7,8 @@ # This pass is based on backends/qualcomm/_passes/replace_inf_values.py # with some modification to replaced inf values. +from typing import Set, Type + import torch from executorch.exir.pass_base import ExportPass, PassResult @@ -16,6 +18,8 @@ class ReplaceInfValues(ExportPass): Due to limitation in Quantizer, we need to change inf/-inf to more quantizable values. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self): super(ReplaceInfValues, self).__init__() diff --git a/backends/arm/_passes/replace_scalar_with_tensor_pass.py b/backends/arm/_passes/replace_scalar_with_tensor_pass.py index 249eb9ffd41..f6ef056f677 100644 --- a/backends/arm/_passes/replace_scalar_with_tensor_pass.py +++ b/backends/arm/_passes/replace_scalar_with_tensor_pass.py @@ -6,7 +6,7 @@ # pyre-unsafe -from typing import Dict, Union +from typing import Dict, Set, Type, Union import torch from executorch.backends.transforms.replace_scalar_with_tensor import ( @@ -15,6 +15,7 @@ from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload +from executorch.exir.pass_base import ExportPass # Operators that are included for both TOSA profiles @@ -56,6 +57,8 @@ class ReplaceScalarWithTensorArgPassTOSAMI(ReplaceScalarWithTensorArgPass): + _passes_required_after: Set[Type[ExportPass]] = set() + scalar_to_tensor_ops = _common_ops | { exir_ops.edge.aten.pow.Tensor_Scalar: exir_ops.edge.aten.pow.Tensor_Tensor, torch.ops.aten.pow.Tensor_Scalar: torch.ops.aten.pow.Tensor_Tensor, @@ -66,6 +69,8 @@ def __init__(self): class ReplaceScalarWithTensorArgPassTOSABI(ReplaceScalarWithTensorArgPass): + _passes_required_after: Set[Type[ExportPass]] = set() + scalar_to_tensor_ops = _common_ops def __init__(self): diff --git a/backends/arm/_passes/rewrite_matmul.py b/backends/arm/_passes/rewrite_matmul.py new file mode 100644 index 00000000000..28ff800792b --- /dev/null +++ b/backends/arm/_passes/rewrite_matmul.py @@ -0,0 +1,97 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Set, Type + +import torch +from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.arm_pass_utils import ( + create_node, + get_first_fake_tensor, +) +from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( + get_input_qparams, + get_output_qparams, +) +from executorch.backends.arm.tosa.mapping import TosaSpecialDtype +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class RewriteMatmulPass(ArmPass): + """Rewrites aten.bmm to tosa.MATMUL and inserts a tosa.RESCALE op if needed.""" + + _passes_required_after: Set[Type[ExportPass]] = set() + + def _insert_output_rescale(self, graph_module, node, tosa_matmul_node, dtype): + input_qparams = get_input_qparams(node) + output_qparams = get_output_qparams(node)[0] + scale = ( + input_qparams[0].get_scale_per_tensor() + * input_qparams[1].get_scale_per_tensor() + ) / output_qparams.get_scale_per_tensor() + + with graph_module.graph.inserting_after(tosa_matmul_node): + # If the input is int8, we need to cast the output to int32 + rescale_node = create_node( + graph_module.graph, + op_target=exir_ops.backend.tosa.RESCALE.default, + from_node=tosa_matmul_node, + ) + tosa_matmul_node.replace_all_uses_with(rescale_node) + rescale_node.args = ( + tosa_matmul_node, + dtype, + scale, + 0, + output_qparams.get_zp_per_tensor(), + ) + + def call(self, graph_module): + modified = False + for node in graph_module.graph.nodes: + if ( + node.op != "call_function" + or node.target != exir_ops.edge.aten.bmm.default + ): + continue + modified = True + + x1, x2 = node.args + tosa_matmul_target = exir_ops.backend.tosa.MATMUL.default + with graph_module.graph.inserting_before(node): + tosa_matmul_node = create_node( + graph_module.graph, + op_target=tosa_matmul_target, + args=(x1, x2), + kwargs={}, + from_node=node, + ) + node.replace_all_uses_with(tosa_matmul_node) + 
graph_module.graph.erase_node(node) + + x1_fake_tensor = get_first_fake_tensor(x1) + x2_fake_tensor = get_first_fake_tensor(x2) + output_fake_tensor = tosa_matmul_target(x1_fake_tensor, x2_fake_tensor) + node_output_fake_tensor = get_first_fake_tensor(node) + if ( + output_fake_tensor.dtype == torch.int32 + and node_output_fake_tensor.dtype in (torch.int8, torch.int16) + ): + self._insert_output_rescale( + graph_module, + node, + tosa_matmul_node, + dtype=node_output_fake_tensor.dtype, + ) + if x1_fake_tensor.dtype == torch.int16: + tosa_matmul_node.meta[TosaSpecialDtype.meta_key()] = ( + TosaSpecialDtype.INT48 + ) + + if modified: + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + return PassResult(graph_module, modified) diff --git a/backends/arm/_passes/rewrite_upsample.py b/backends/arm/_passes/rewrite_upsample.py new file mode 100644 index 00000000000..c9f25a1e845 --- /dev/null +++ b/backends/arm/_passes/rewrite_upsample.py @@ -0,0 +1,84 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Set, Type + +import torch +from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.arm_pass_utils import ( + create_node, + get_first_fake_tensor, +) +from executorch.backends.arm.tosa.utils import get_resize_parameters +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class RewriteUpsamplePass(ArmPass): + """Rewrite upsample2d nodes to TOSA.RESIZE nodes.""" + + targeted_ops = ( + exir_ops.edge.aten.upsample_nearest2d.vec, + exir_ops.edge.aten.upsample_bilinear2d.vec, + ) + + _passes_required_after: Set[Type[ExportPass]] = set() + + def call(self, graph_module): + modified = False + for node in graph_module.graph.nodes: + if node.op != "call_function" or node.target not in self.targeted_ops: + continue + modified = True + + if node.target == exir_ops.edge.aten.upsample_bilinear2d.vec: + x, output_size, align_corners, scale_factors = node.args + resize_mode = "bilinear" + else: + x, output_size, scale_factors = node.args + align_corners = False + resize_mode = "nearest" + + with graph_module.graph.inserting_before(node): + tosa_resize_node = create_node( + graph_module.graph, + op_target=exir_ops.backend.tosa.RESIZE.default, + args=(x, output_size, align_corners, scale_factors), + kwargs={"resize_mode": resize_mode}, + from_node=node, + ) + node.replace_all_uses_with(tosa_resize_node) + graph_module.graph.erase_node(node) + input_dtype = get_first_fake_tensor(x).dtype + if input_dtype == torch.int8 and resize_mode == "bilinear": + input_size = get_first_fake_tensor(x).shape + input_size_xy = input_size[2:] + output_size = get_first_fake_tensor(node).shape + output_size_xy = output_size[2:] + scale_n_yx, _, _, _ = get_resize_parameters( + input_size_xy=input_size_xy, + output_size_xy=output_size_xy, + resize_mode=1, + align_corners=align_corners, + ) + output_dtype = get_first_fake_tensor(node).dtype + output_scale = float(1 / 
(scale_n_yx[0] * scale_n_yx[1])) + with graph_module.graph.inserting_after(tosa_resize_node): + rescale_node = create_node( + graph_module.graph, + exir_ops.backend.tosa.RESCALE.default, + ) + tosa_resize_node.replace_all_uses_with(rescale_node) + rescale_node.args = ( + tosa_resize_node, + output_dtype, + output_scale, + 0, # zero point + 0, # zero point + ) + + if modified: + graph_module = super().call(graph_module).graph_module + return PassResult(graph_module, modified) diff --git a/backends/arm/_passes/scalars_to_attribute_pass.py b/backends/arm/_passes/scalars_to_attribute_pass.py index 89468bff1ff..9ad3e318011 100644 --- a/backends/arm/_passes/scalars_to_attribute_pass.py +++ b/backends/arm/_passes/scalars_to_attribute_pass.py @@ -6,10 +6,11 @@ # pyre-unsafe -from typing import cast, Union +from typing import cast, Set, Type, Union import torch from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass from executorch.exir.pass_base import ExportPass, PassResult from torch.fx import GraphModule, Node @@ -22,6 +23,8 @@ class ScalarsToAttributePass(ExportPass): to attribute Nodes that output the same value. """ + _passes_required_after: Set[Type[ExportPass]] = {MatchArgRanksPass} + targeted_ops = [ torch.ops.aten.add.Tensor, torch.ops.aten.add_.Tensor, diff --git a/backends/arm/_passes/size_adjust_input_pass.py b/backends/arm/_passes/size_adjust_input_pass.py index e87d65c450f..5eb77dc56df 100644 --- a/backends/arm/_passes/size_adjust_input_pass.py +++ b/backends/arm/_passes/size_adjust_input_pass.py @@ -5,7 +5,7 @@ # pyre-unsafe -from typing import cast, TypeAlias +from typing import cast, Set, Type, TypeAlias import torch.fx from executorch.backends.arm._passes.arm_pass_utils import create_node @@ -185,6 +185,8 @@ class SizeAdjustInputPass(ExportPass): input. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: graph = graph_module.graph modified_graph = False diff --git a/backends/arm/_passes/to_tosa_memory_format_pass.py b/backends/arm/_passes/to_tosa_memory_format_pass.py index e4436d638f4..b906c06b329 100644 --- a/backends/arm/_passes/to_tosa_memory_format_pass.py +++ b/backends/arm/_passes/to_tosa_memory_format_pass.py @@ -7,15 +7,29 @@ import logging +from typing import Set, Type import torch -from executorch.backends.arm._passes import AnnotateOutputDimOrderPass +from executorch.backends.arm._passes.annotate_decomposed_matmul import ( + AnnotateDecomposedMatmulPass, +) from executorch.backends.arm._passes.arm_pass_utils import ( create_node, get_first_fake_tensor, - get_output_dim_orders, is_param_node, ) +from executorch.backends.arm.constants import ( + HWCM_ORDER, + NCHW_ORDER, + NHWC_INVERSE_ORDER, + NHWC_ORDER, + NNCHW_ORDER, + NNHWC_INVERSE_ORDER, + NNHWC_ORDER, + NNNCHW_ORDER, + NNNHWC_INVERSE_ORDER, + NNNHWC_ORDER, +) from executorch.exir import ExportedProgram from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -38,11 +52,7 @@ class ToTosaMemoryFormatPass(ExportPass): The annotated tosa_dim_order is used to permute the node's shape such that it gives a TOSA-compliant shape. 
""" - NHWC_order = (0, 2, 3, 1) - NHWC_inverse_order = (0, 3, 1, 2) - HWCM_order = (2, 3, 0, 1) - NNHWC_order = (0, 1, 3, 4, 2) - NNHWC_inverse_order = (0, 1, 4, 2, 3) + _passes_required_after: Set[Type[ExportPass]] = set() def __init__(self, exported_program: ExportedProgram) -> None: self.exported_program = exported_program @@ -80,7 +90,11 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node): @staticmethod def memory_format_differs(shape): """Returns true if the shape will have a different memory layout in (N)NCHW and (N)NHWC format""" - if len(shape) >= 5: + if len(shape) >= 6: + C = shape[3] + H = shape[4] + W = shape[5] + elif len(shape) == 5: C = shape[2] H = shape[3] W = shape[4] @@ -99,25 +113,26 @@ def memory_format_differs(shape): @staticmethod def is_channel_reshape(input_shape, output_shape): - """Returns true if the reshape changes the channel dimension""" - if not ( - (len(input_shape) == len(output_shape) and (len(output_shape) in (4, 5))) - or (len(input_shape) == 4 and len(output_shape) == 5) - or (len(input_shape) == 5 and len(output_shape) == 4) - ): + """Returns true if reshape changes the channel dimension or batch product dimension(s)""" + + valid_ranks = {4, 5, 6} + + if not (len(input_shape) in valid_ranks and len(output_shape) in valid_ranks): return False C_old = input_shape[-3] C_new = output_shape[-3] - N_new = ( - output_shape[0] - if len(output_shape) == 4 - else output_shape[0] * output_shape[1] - ) - N_old = ( - input_shape[0] if len(input_shape) == 4 else input_shape[0] * input_shape[1] - ) + def get_batch_prod_dim(shape): + product = 1 + + for dim in shape[:-3]: + product = product * dim + + return product + + N_old = get_batch_prod_dim(input_shape) + N_new = get_batch_prod_dim(output_shape) return (N_old != N_new) or (C_old != C_new) @@ -128,17 +143,27 @@ def insert_input_transpose(node, input_node, graph_module): node.replace_input_with(input_node, pre_permute_node) return + if 
len(get_first_fake_tensor(input_node).size()) == 6: + mem_format = NNNHWC_INVERSE_ORDER + elif len(get_first_fake_tensor(input_node).size()) == 5: + mem_format = NNHWC_INVERSE_ORDER + else: + mem_format = NHWC_INVERSE_ORDER + # Guard: mem_format must be a true permutation for the current rank + _rank_ = len( + get_first_fake_tensor(input_node).size() + ) # or (node) in output path + assert sorted(mem_format) == list( + range(_rank_) + ), f"bad perm {mem_format} for rank {_rank_} in insert_input_transpose" + with graph_module.graph.inserting_before(node): permute_node = create_node( graph_module.graph, exir_ops.backend.tosa.TRANSPOSE.default, args=( input_node, - list( - ToTosaMemoryFormatPass.NNHWC_inverse_order - if len(get_first_fake_tensor(input_node).size()) == 5 - else ToTosaMemoryFormatPass.NHWC_inverse_order - ), + list(mem_format), ), from_node=node, ) @@ -150,26 +175,38 @@ def insert_input_transpose(node, input_node, graph_module): @staticmethod def insert_output_transpose(node, graph_module): + + if len(get_first_fake_tensor(node).size()) == 6: + mem_format = NNNHWC_ORDER + elif len(get_first_fake_tensor(node).size()) == 5: + mem_format = NNHWC_ORDER + else: + mem_format = NHWC_ORDER + # Guard: mem_format must be a true permutation for the current rank + _rank_ = len(get_first_fake_tensor(node).size()) # or (node) in output path + assert sorted(mem_format) == list( + range(_rank_) + ), f"bad perm {mem_format} for rank {_rank_} in insert_input_transpose" + with graph_module.graph.inserting_after(node): permute_node = create_node( graph_module.graph, exir_ops.backend.tosa.TRANSPOSE.default, args=( node, - list( - ToTosaMemoryFormatPass.NNHWC_order - if len(get_first_fake_tensor(node).size()) == 5 - else ToTosaMemoryFormatPass.NHWC_order - ), + list(mem_format), ), from_node=node, ) - permute_node.meta["tosa_dim_order"] = ( - ToTosaMemoryFormatPass.NNHWC_order - if len(get_first_fake_tensor(node).size()) == 5 - else ToTosaMemoryFormatPass.NHWC_order - ) + 
rank = len(get_first_fake_tensor(node).size()) + if rank == 6: + permute_node.meta["tosa_dim_order"] = NNNHWC_ORDER + elif rank == 5: + permute_node.meta["tosa_dim_order"] = NNHWC_ORDER + else: + permute_node.meta["tosa_dim_order"] = NHWC_ORDER + node.meta["tosa_dim_order"] = tuple( range(len(get_first_fake_tensor(node).size())) ) @@ -218,7 +255,7 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: # call_function and placeholder allowed due to # index.Tensor being able to come in as both - if node.op not in ["call_function", "placeholder", "output"]: + if node.op != "call_function": continue # Transpose views @@ -240,21 +277,34 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule): graph_module, ) - # Transpose inputs - elif _is_input(node, self.exported_program): - input_shape = get_first_fake_tensor(node).size() - if len(input_shape) in (4, 5): - ToTosaMemoryFormatPass.insert_output_transpose(node, graph_module) + output_node = graph_module.graph.output_node() - # Transpose outputs - elif node.op == "output": - output_shape = get_first_fake_tensor(node).size() + # Transpose inputs if they are in (N)NCHW format + inputs = [ + n for n in graph_module.graph.nodes if _is_input(n, self.exported_program) + ] + for input_node in inputs: + input_dim_order = get_first_fake_tensor(input_node).dim_order() + if input_dim_order in (NCHW_ORDER, NNCHW_ORDER, NNNCHW_ORDER): + self.insert_output_transpose(input_node, graph_module) + + # Transpose outputs if they are in (N)NCHW format + outputs = output_node.args[0] + output_dim_orders = output_node.meta.get("original_dim_orders") + if output_dim_orders is None: + raise RuntimeError( + f"{AnnotateDecomposedMatmulPass.__name__} is required to run at the beginning of the pass pipeline when using {ToTosaMemoryFormatPass.__name__}." 
+ ) - if len(output_shape) in (4, 5): - for input_node in node.all_input_nodes: - ToTosaMemoryFormatPass.insert_input_transpose( - node, input_node, graph_module - ) + for output_node_input, output_dim_order in zip(outputs, output_dim_orders): # type: ignore[arg-type] + if output_dim_order in ( + NCHW_ORDER, + NNCHW_ORDER, + NNNCHW_ORDER, + ): + self.insert_input_transpose( + output_node, output_node_input, graph_module + ) def remove_dim_order_kwargs( self, graph_module: torch.fx.GraphModule, node: torch.fx.Node @@ -277,17 +327,19 @@ def call(self, graph_module: torch.fx.GraphModule): node_data = get_first_fake_tensor(node).data self.remove_dim_order_kwargs(graph_module, node) - # Inputs and outputs are always in (N)NCHW format + # Inputs and outputs may vary in dim_order if _is_input(node, self.exported_program) or node.op == "output": - dim_order = tuple(range(node_data.dim())) + dim_order = node_data.dim_order() elif node_data.dim() == 4: - dim_order = self.NHWC_order + dim_order = NHWC_ORDER if self.is_weight_node_for_depthwise_conv2d(node): # The weights of TOSA DEPTHWISE_CONV2D have shape (H, W, C, M) which corresponds to # dim_order = (2, 3, 0, 1) (https://www.mlplatform.org/tosa/tosa_spec.html#_depthwise_conv2d). - dim_order = self.HWCM_order + dim_order = HWCM_ORDER elif node_data.dim() == 5: - dim_order = self.NNHWC_order + dim_order = NNHWC_ORDER + elif node_data.dim() == 6: + dim_order = NNNHWC_ORDER else: dim_order = tuple(range(node_data.dim())) # type: ignore[assignment] @@ -300,32 +352,3 @@ def call(self, graph_module: torch.fx.GraphModule): graph_module = super().call(graph_module).graph_module return PassResult(graph_module, True) - - def requires(self, graph_module) -> None: - """ - This is the only pass which handles dim_orders, so verify that the output dim_orders has not changed since the beginning of the lowering pipeline. 
- """ - - dim_orders = get_output_dim_orders(graph_module) - original_dim_orders = graph_module.graph.output_node().meta.get( - "original_dim_orders" - ) - output_node = graph_module.graph.output_node() - - if original_dim_orders is None: - raise RuntimeError( - f"{AnnotateOutputDimOrderPass.__name__} must be run in the beginning of the pass pipeline to verify that the dim order has not changed unexpectedly during its run." - ) - - if len(dim_orders) != len(original_dim_orders): - raise RuntimeError( - f"The number of outputs has changed since {AnnotateOutputDimOrderPass.__name__} was run." - ) - - for node, dim_order, original_dim_order in zip( - output_node.args[0], dim_orders, original_dim_orders - ): - if dim_order != original_dim_order: - raise RuntimeError( - f"The dim order of output {node.name} has changed from {original_dim_order} to {dim_order} since {AnnotateOutputDimOrderPass.__name__} was run." - ) diff --git a/backends/arm/_passes/unsqueeze_before_repeat_pass.py b/backends/arm/_passes/unsqueeze_before_repeat_pass.py index 01983baa9ab..66286b6a954 100644 --- a/backends/arm/_passes/unsqueeze_before_repeat_pass.py +++ b/backends/arm/_passes/unsqueeze_before_repeat_pass.py @@ -1,9 +1,11 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
# pyre-unsafe +from typing import Set, Type + import torch import torch.fx from executorch.backends.arm._passes.arm_pass_utils import ( @@ -29,6 +31,8 @@ class UnsqueezeBeforeRepeatPass(ExportPass): repeat(multiples) """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call(self, graph_module: torch.fx.GraphModule): modified_graph = False for node in graph_module.graph.nodes: diff --git a/backends/arm/_passes/unsqueeze_scalar_placeholders_pass.py b/backends/arm/_passes/unsqueeze_scalar_placeholders_pass.py index ccae9b503cf..d3932dd1217 100644 --- a/backends/arm/_passes/unsqueeze_scalar_placeholders_pass.py +++ b/backends/arm/_passes/unsqueeze_scalar_placeholders_pass.py @@ -5,6 +5,8 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.exir.pass_base import ExportPass, PassResult from torch._export.utils import is_buffer, is_param @@ -16,6 +18,8 @@ class UnsqueezeScalarPlaceholdersPass(ExportPass): This pass unsqueezes the placeholders to make sure shape is at least (1,). 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program): self.exported_program = exported_program super().__init__() diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index 90f9dcb8324..5e2af9c5f39 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -34,7 +34,10 @@ def vela_bin_pack_io(prefix, data): io_elem_size = data[prefix + "_elem_size"][i] io_offset = data[prefix + "_offset"][i] io_region = data[prefix + "_region"][i] - assert len(io_shape) == vela_io_shape_dims + if len(io_shape) != vela_io_shape_dims: + raise ValueError( + f"Expected {vela_io_shape_dims}D shape, got {len(io_shape)}D" + ) inp_pad = io_shape.tolist() io_struct = struct.pack( " bytes: tosaname = "out.tosa" - tosa_path = os.path.join(tmpdir, tosaname) + tosa_path = os.path.join(dir, tosaname) with open(tosa_path, "wb") as f: f.write(tosa_flatbuffer) # invoke vela - output_dir = os.path.join(tmpdir, "output") + output_dir = os.path.join(dir, "output") args.append(f"--output-dir={output_dir}") args.append(tosa_path) if verbose: @@ -72,9 +80,9 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False) if any("ethos-u85" in arg for arg in args) or any( "debug-force-regor" in arg for arg in args ): - np_path = os.path.join(tmpdir, "output", "out_vela.npz") + np_path = os.path.join(dir, "output", "out_vela.npz") else: - np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") + np_path = os.path.join(dir, "output", "out_sg0_vela.npz") blocks = b"" with np.load(np_path, allow_pickle=False) as data: @@ -122,3 +130,9 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False) blocks = blocks + block return blocks + + if intermediate_path is not None: + return run(intermediate_path) + else: + with tempfile.TemporaryDirectory() as tmpdir: + return run(tmpdir) diff --git a/backends/arm/common/arm_compile_spec.py b/backends/arm/common/arm_compile_spec.py index 
c6818e2716a..b38fe72b29c 100644 --- a/backends/arm/common/arm_compile_spec.py +++ b/backends/arm/common/arm_compile_spec.py @@ -126,7 +126,8 @@ def validate(self): def to_list(self): """Get the ArmCompileSpec in list form.""" - assert self.tosa_spec + if not self.tosa_spec: + raise ValueError("tosa_spec must be set before calling to_list()") # Always supply a TOSA version compile_spec = [ diff --git a/backends/arm/constants.py b/backends/arm/constants.py index fd8710d3ead..0e562f12e88 100644 --- a/backends/arm/constants.py +++ b/backends/arm/constants.py @@ -29,3 +29,18 @@ DEQUANT_PER_TENSOR_OP_T, ) PER_CHANNEL_QDQ_OPS: Final = (QUANT_PER_CHANNEL_OP, DEQUANT_PER_CHANNEL_OP) + +NHWC_ORDER: Final = (0, 2, 3, 1) +NHWC_INVERSE_ORDER: Final = (0, 3, 1, 2) +NNHWC_ORDER: Final = (0, 1, 3, 4, 2) +NNHWC_INVERSE_ORDER: Final = (0, 1, 4, 2, 3) +NNNHWC_ORDER: Final = (0, 1, 2, 4, 5, 3) +NNNHWC_INVERSE_ORDER: Final = (0, 1, 2, 5, 3, 4) + +NCHW_ORDER: Final = (0, 1, 2, 3) +NNCHW_ORDER: Final = (0, 1, 2, 3, 4) +NNNCHW_ORDER: Final = (0, 1, 2, 3, 4, 5) + +HWCM_ORDER: Final = (2, 3, 0, 1) + +MAX_RANK: Final = 6 diff --git a/backends/arm/ethosu/backend.py b/backends/arm/ethosu/backend.py index b7b8798c3e6..00da88ef60b 100644 --- a/backends/arm/ethosu/backend.py +++ b/backends/arm/ethosu/backend.py @@ -56,6 +56,7 @@ def _compile_tosa_flatbuffer( tosa_flatbuffer, compile_flags, verbose=logger.getEffectiveLevel() == logging.INFO, + intermediate_path=compile_spec.get_intermediate_path(), ) return binary diff --git a/backends/arm/operator_support/__init__.py b/backends/arm/operator_support/__init__.py index 7b73cddad37..53d37407ee6 100644 --- a/backends/arm/operator_support/__init__.py +++ b/backends/arm/operator_support/__init__.py @@ -16,8 +16,8 @@ pool_2d_support, reduce_sum_support, right_shift_support, - sin_cos_support, slice_copy_support, to_dim_order_copy_support, tosa_supported_operators, + where_support, ) diff --git a/backends/arm/operator_support/convolution_support.py 
b/backends/arm/operator_support/convolution_support.py index 6e9d3b3528e..f335c5046f5 100644 --- a/backends/arm/operator_support/convolution_support.py +++ b/backends/arm/operator_support/convolution_support.py @@ -2,6 +2,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Declare operator support for ``aten.convolution`` in TOSA. + +Provide general checks and hardware-specific constraints (e.g., U55 subset) for +convolution nodes prior to delegation to the TOSA backend. + +""" from typing import cast @@ -18,6 +24,8 @@ @register_tosa_support_check class ConvolutionSupported(SupportedTOSAOperatorCheck): + """Provide TOSA support check for convolutions.""" + targets = [exir_ops.edge.aten.convolution.default] tosa_specs = [ @@ -25,8 +33,15 @@ class ConvolutionSupported(SupportedTOSAOperatorCheck): TosaSpecification.create_from_string("TOSA-1.0+FP"), ] - def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): + def is_node_tosa_supported( + self, node: fx.Node, tosa_spec: TosaSpecification + ) -> bool: + """Return True if the node is supported by TOSA. + Reject transposed convolutions and convolutions with non-zero output + padding. Apply additional hardware-specific constraints for U55. + + """ # Not implemented transposed = cast(bool, node.args[6]) output_padding = cast(list[int], node.args[7]) @@ -46,9 +61,19 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): else: return True - def _is_node_supported_u55(self, node: fx.Node): - """Hardware constraints for Ethos-U-55 case, Vela 4.2.0 (25.02 release)""" + def _is_node_supported_u55(self, node: fx.Node) -> bool: + """Enforce Ethos-U55-specific constraints (Vela 4.2.0). + + Check channel dimensions, kernel sizes, and stride/pad/dilation + combinations permitted on U55. + Args: + node (fx.Node): Convolution node to validate. 
+ + Returns: + bool: True if supported; otherwise, False. + + """ shape_in = cast(torch.Tensor, node.all_input_nodes[0].meta["val"]).shape shape_out = node.meta["val"].shape kernel = cast(fx.Node, node.args[1]).meta["val"].shape @@ -98,13 +123,17 @@ def _is_node_supported_u55(self, node: fx.Node): return True def _stride_condition(self, node: fx.Node) -> bool: - """This condition is somewhat complex but boils down - to not supporting stride > 3, unless we have some special conditions. - This condition is a simplified, relaxed version of the hardware constraint, - since the actual constraint requires information not available - here (without a lot of work). + """Check a simplified stride/padding/dilation constraint. + + Disallow strides greater than 3 unless there is no padding and the + dilation is 1. For 3D convolutions, enforce ``stride_z <= 1``. + + Args: + node (fx.Node): Convolution node to evaluate. + + Returns: + bool: True if the condition is satisfied. - This means that we might accept ops that are not actually supported. """ strides = cast(list[int], node.args[3]) has_padding = any(pad > 0 for pad in cast(list[int], node.args[4])) diff --git a/backends/arm/operator_support/embedding_support.py b/backends/arm/operator_support/embedding_support.py index bf95014e575..24395d56cbf 100644 --- a/backends/arm/operator_support/embedding_support.py +++ b/backends/arm/operator_support/embedding_support.py @@ -27,11 +27,16 @@ class EmbeddingSupported(SupportedTOSAOperatorCheck): def is_node_tosa_supported( self, node: fx.Node, tosa_spec: TosaSpecification ) -> bool: # type: ignore[override, misc] - # Note aten.embedding.default requires int64 indices and TOSA does not support it. - # Int32 indices here for aten.embedding.default is ok since it will be decomposed into ops that can handle it. 
- assert ( - len(node.all_input_nodes) == 2 - ), "Number of inputs to aten.embedding is not 2" + # Note aten.embedding.default requires int64 indices and TOSA does not + # support it. Int32 indices here for aten.embedding.default is ok since + # it will be decomposed into ops that can handle it. + + if len(node.all_input_nodes) != 2: + self.reporter.report_reject( + node, + (f"Expected exactly two input nodes, got {len(node.all_input_nodes)}"), + ) + return False indices_val = node.all_input_nodes[1].meta["val"] indices_dtype = indices_val.dtype diff --git a/backends/arm/operator_support/ethos_u55_support.py b/backends/arm/operator_support/ethos_u55_support.py index bf9e29d5cb7..27ddb95637b 100644 --- a/backends/arm/operator_support/ethos_u55_support.py +++ b/backends/arm/operator_support/ethos_u55_support.py @@ -2,6 +2,13 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Provide Ethos-U55 specific operator support checks. + +Contains dtype validation, explicit unsupported-op filtering, and shape/ +permutation constraints for view and permute operations when targeting the +Ethos-U55 subset of TOSA. + +""" # pyre-unsafe @@ -21,6 +28,19 @@ def _try_determine_dtype(node: fx.Node) -> torch.dtype | None: + """Return an inferred dtype for a node when possible. + + Uses fake tensor metadata and nearby quantize/dequantize nodes to infer the + integer dtype used by the operator. Returns ``None`` when the dtype cannot + be determined reliably. + + Args: + node (fx.Node): FX node to inspect. + + Returns: + torch.dtype | None: Inferred dtype or ``None`` if unknown. + + """ dtype = get_first_fake_tensor(node).dtype if not dtype.is_floating_point: return dtype @@ -34,8 +54,23 @@ def _try_determine_dtype(node: fx.Node) -> torch.dtype | None: class EthosU55DtypeSupport(OperatorSupportBase): + """Validate dtypes for U55-supported operators. 
+ + Ensures operators use a supported integer dtype according to U55 + constraints, with specific rules for convolution, matmul, and table ops. + + Attributes: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ def __init__(self, reporter: WhyNoPartitionReporter): + """Initialize the check with a reporter. + + Args: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ super().__init__() self.reporter = reporter @@ -52,7 +87,20 @@ def __init__(self, reporter: WhyNoPartitionReporter): def is_node_supported( # noqa: C901 self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node ) -> bool: + """Return True if the node uses supported dtypes. + Applies per-operator dtype rules for U55, including specialized input + and weight constraints for convolution and int8-only checks for table + operations and matmul variants. + + Args: + submodules (typing.Mapping[str, torch.nn.Module]): Exported modules. + node (fx.Node): FX node to check. + + Returns: + bool: True if supported; otherwise, False. + + """ dtype = _try_determine_dtype(node) if dtype is None: # If we couldn't determine dtype, just return ok. @@ -112,10 +160,12 @@ def is_node_supported( # noqa: C901 class EthosU55NotSupported(OperatorSupportBase): - """ - Certain operators are not supported on U55. These are listed in `unsupported_ops`. - The comment mentions the unsupported TOSA operator that the aten operator maps to where it is not obvious. - For unimplemented operators, this is the anticipated mapping, and it might be incorrect. + """Reject operators not supported by Ethos-U55. + + The ``unsupported_ops`` list contains aten ops that either map to TOSA + operators the U55 cannot run or remain unimplemented. The mapping comments + capture expected TOSA equivalents when not obvious. 
+ """ unsupported_ops = [ @@ -128,7 +178,7 @@ class EthosU55NotSupported(OperatorSupportBase): exir_ops.edge.aten.bitwise_and.Scalar, exir_ops.edge.aten.bitwise_or.Scalar, exir_ops.edge.aten.bitwise_xor.Scalar, - exir_ops.edge.aten.bitwise_not, + exir_ops.edge.aten.bitwise_not.default, exir_ops.edge.aten.logical_and.default, exir_ops.edge.aten.logical_or.default, exir_ops.edge.aten.logical_xor.default, @@ -165,12 +215,27 @@ class EthosU55NotSupported(OperatorSupportBase): ] def __init__(self, reporter: WhyNoPartitionReporter): + """Initialize the check with a reporter. + + Args: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ self.reporter = reporter def is_node_supported( self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node ) -> bool: + """Return False for nodes explicitly unsupported on U55. + + Args: + submodules (typing.Mapping[str, torch.nn.Module]): Exported modules. + node (fx.Node): FX node to check. + Returns: + bool: False if ``node.target`` is in ``unsupported_ops``; else True. + + """ if node.target in self.unsupported_ops: self.reporter.report_reject(node, "Op is not supported on U55.") return False @@ -182,12 +247,37 @@ def is_node_supported( class EthosU55ViewCheck(OperatorSupportBase): + """Validate view/select shapes and dtypes for U55. + + Performs lightweight checks on output shape rank and product constraints, + with awareness that transposes may be inserted around view/select during + lowering to channels-last. + + Attributes: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ def __init__(self, reporter: WhyNoPartitionReporter): + """Initialize the check with a reporter. + + Args: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ super().__init__() self.reporter = reporter def axes_product(self, nhwc_shape: shape_t) -> int: + """Return the product of all axes in ``nhwc_shape``. + + Args: + nhwc_shape (list[int]): Shape in NHWC order. 
+ + Returns: + int: Product of the axis sizes. + + """ product = 1 for axes in nhwc_shape: product *= axes @@ -197,26 +287,27 @@ def axes_product(self, nhwc_shape: shape_t) -> int: def is_node_supported( self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node ) -> bool: - """ - Check whether a given view node is supported on U55. + """Check whether a given view/select node is U55-supported. Currently only checks dtypes and product of axes. - It is not the view operator itself that is not supported on U55. In order for the - view operator to be compatible with the channels-last format of TosaBackend, - transposes may need to be inserted before and after the view op. If that happens - and that transpose operator does not adhere to the limitations then it will - result in the following error: + It is not the view operator itself that is not supported on U55. In + order for the view operator to be compatible with the channels-last + format of TosaBackend, transposes may need to be inserted before and + after the view op. If that happens and that transpose operator does not + adhere to the limitations then it will result in the following error: CPU performance estimation for "Transpose" not implemented. ... CPU operations are not supported for GraphAPI input Args: - node: The FX node representing the view_copy operator. + submodules (typing.Mapping[str, torch.nn.Module]): Exported modules. + node (fx.Node): FX node for ``view_copy`` or ``select``. Returns: - False if the operator is not support and True if it is supported. + bool: False if rejected by constraints; otherwise, True. + """ # Select decomposes into squeeze, which in turn becomes a view. Therefore, # perform the same check on select operators as view operators. 
@@ -236,18 +327,20 @@ def is_node_supported( shape = input_node.meta["val"].shape rank = len(shape) if not -rank <= dim < rank: - raise IndexError( - f"Dim {dim} is outside of the range for tensor '{node.target}' of " - f"rank {rank}" + self.reporter.report_reject( + node, + (f"Dimension {dim} out of range for rank {rank}."), ) + return False dim = dim % rank size = shape[dim] if not -size <= index < size: - raise IndexError( - f"Index {index} is outside of the range for dim {dim} with size " - f"{size} for tensor {node.target}" + self.reporter.report_reject( + node, + (f"Index {index} out of range for dim {dim} with size {size}."), ) + return False index = index % size # Shape after squeeze. This may get converted into a view which may become @@ -277,14 +370,40 @@ def is_node_supported( class EthosU55TransposeCheck(OperatorSupportBase): + """Validate permute nodes against U55 reshape/transpose limits. + + Applies dtype- and rank-specific constraints to permutations. Tests both + NCHW and NHWC interpretations for rank-3/4 shapes since dim order is unknown + at partition time. + + Attributes: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ def __init__(self, reporter: WhyNoPartitionReporter): + """Initialize the check with a reporter. + + Args: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ super().__init__() self.reporter = reporter def _pad_to_rank_4( self, shape: shape_t, permutation: list[int] ) -> tuple[shape_t, shape_t]: + """Pad shape/permutation to rank 4 by prepending ones/indices. + + Args: + shape (list[int]): Original shape. + permutation (list[int]): Original permutation indices. + + Returns: + tuple[list[int], list[int]]: Padded shape and permutation. 
+ + """ diff = 4 - len(shape) padded_shape = [1] * diff + shape for i in range(len(permutation)): @@ -293,6 +412,15 @@ def _pad_to_rank_4( return padded_shape, padded_permutation def axes_product(self, nhwc_shape: shape_t) -> int: + """Return the product of all axes in ``nhwc_shape``. + + Args: + nhwc_shape (list[int]): Shape in NHWC order. + + Returns: + int: Product of the axis sizes. + + """ product = 1 for axes in nhwc_shape: product *= axes @@ -301,7 +429,7 @@ def axes_product(self, nhwc_shape: shape_t) -> int: def _permute_constraint_i8_i16( self, nhwc_shape: list[int], permutation: list[int] ) -> bool: - """Returns True if the constraints are ok.""" + """Return True if permutation meets i8/i16 constraints.""" N, H, W, C = nhwc_shape match permutation: case (0, 1, 2, 3): # NHWC -> NHWC @@ -314,7 +442,7 @@ def _permute_constraint_i8_i16( def _permute_constraint_i32( self, nhwc_shape: list[int], permutation: list[int] ) -> bool: - """Returns True if the constraints are ok.""" + """Return True if permutation meets i32 constraints.""" N, H, W, C = nhwc_shape match permutation: case (0, 1, 2, 3): # NHWC -> NHWC @@ -327,6 +455,7 @@ def _permute_constraint_i32( return False def _permute_constraint(self, shape, permutation, dtype): + """Return True if permutation meets dtype-specific constraints.""" if dtype in (torch.int8, torch.int16): return self._permute_constraint_i8_i16(shape, permutation) if dtype == torch.int32: @@ -336,7 +465,19 @@ def _permute_constraint(self, shape, permutation, dtype): def is_node_supported( self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node ) -> bool: + """Return True if a permute node satisfies U55 constraints. + + Tests both NCHW and NHWC interpretations for rank-3/4 shapes, and + applies dtype-specific limits to shapes and permutations. + + Args: + submodules (typing.Mapping[str, torch.nn.Module]): Exported modules. + node (fx.Node): FX node to check. + Returns: + bool: True if supported; otherwise, False. 
+ + """ if not node.target == exir_ops.edge.aten.permute_copy.default: return True @@ -382,3 +523,63 @@ def is_node_supported( return False return True + + +class EthosU55CastCheck(OperatorSupportBase): + """Reject unsupported casts on U55. + + U55 does not support casting from INT32 or any casts involving BOOL. Note that + casting from one dtype to the same dtype is a no-op and is supported. + + + Attributes: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ + + targets = [ + exir_ops.edge.dim_order_ops._to_dim_order_copy.default, + ] + + def __init__(self, reporter: WhyNoPartitionReporter): + """Initialize the check with a reporter. + + Args: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ + super().__init__() + self.reporter = reporter + + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: + """Return True if the node satisfies the cast constraints of U55. + + Args: + submodules (typing.Mapping[str, torch.nn.Module]): Exported modules. + node (fx.Node): FX node to check. + + Returns: + bool: True if supported; otherwise, False. + + """ + if node.target not in self.targets: + return True + input_dtype = get_first_fake_tensor(node.all_input_nodes[0]).dtype + output_dtype = get_first_fake_tensor(node).dtype + if input_dtype == output_dtype: + # This is ok as this will not result in a cast + return True + if input_dtype in (torch.bool, torch.int32): + self.reporter.report_reject( + node, f"Casting from {input_dtype} is not supported on U55." + ) + return False + if output_dtype in (torch.bool,): + self.reporter.report_reject( + node, f"Casting to {output_dtype} is not supported on U55." 
+ ) + return False + + return True diff --git a/backends/arm/operator_support/index_tensor_support.py b/backends/arm/operator_support/index_tensor_support.py index 4b226a9c407..92b0ce48a32 100644 --- a/backends/arm/operator_support/index_tensor_support.py +++ b/backends/arm/operator_support/index_tensor_support.py @@ -2,6 +2,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Provide TOSA support checks for ``aten.index.Tensor``. + +Reject unsupported patterns such as high-rank index tensors, front-positioned +slice/ellipsis/None markers, and cases that exceed ``int32`` element limits. + +""" import math @@ -18,7 +24,8 @@ @register_tosa_support_check class IndexTensorSupported(SupportedTOSAOperatorCheck): - """ + """Prevent partitioning of unsupported ``index.Tensor`` usages. + This support check is intended to prevent the partitioning of currently unsupported usages of the index.Tensor operator. @@ -95,6 +102,7 @@ class IndexTensorSupported(SupportedTOSAOperatorCheck): t[1:3, torch.arange(5), 2:3, torch.arange(3).reshape(3,1)] are also possible and can result in some unintuitive behaviors where batching and indexing are mixed together. + """ targets = [exir_ops.edge.aten.index.Tensor] @@ -107,20 +115,43 @@ class IndexTensorSupported(SupportedTOSAOperatorCheck): def is_node_tosa_supported( self, node: fx.Node, tosa_spec: TosaSpecification ) -> bool: # type: ignore[override, misc] + """Return True if ``aten.index.Tensor`` usage fits supported patterns. + + Enforces the following constraints: + - No ``None`` (unsqueeze), slice, or ellipsis before an indexing tensor. + - Indexing tensors have rank <= 3. + - The value tensor element count fits in ``int32``. 
+ + """ indices = node.args[1] for index in indices: # type: ignore[union-attr] # Usage 2 guard if index is None: + self.reporter.report_reject( + node, + ( + "None (from slice/unsqueeze/ellipsis) before an indexing tensor" + " is not supported." + ), + ) return False # Usage 1 guard fake_tensor = get_first_fake_tensor(index) # type: ignore[arg-type] if len(fake_tensor.size()) > 3: + self.reporter.report_reject( + node, + ("Indexing tensors of rank >= 4 is not supported."), + ) return False # Usage 3 guard total_vals = math.prod(get_first_fake_tensor(node.args[0]).shape) # type: ignore[arg-type] if total_vals > torch.iinfo(torch.int32).max: + self.reporter.report_reject( + node, + ("Value size exceeds int32 range; would overflow flattened indexing."), + ) return False return True diff --git a/backends/arm/operator_support/minmax_support.py b/backends/arm/operator_support/minmax_support.py index edbf7f61818..68433819f4b 100644 --- a/backends/arm/operator_support/minmax_support.py +++ b/backends/arm/operator_support/minmax_support.py @@ -32,6 +32,13 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): ) if not (no_argmax or no_argmax_users): + self.reporter.report_reject( + node, + ( + "Using the indices output is not supported; only usage of the " + "values output is supported." + ), + ) return False return True diff --git a/backends/arm/operator_support/pool_2d_support.py b/backends/arm/operator_support/pool_2d_support.py index ff453741f1f..c0428e45e03 100644 --- a/backends/arm/operator_support/pool_2d_support.py +++ b/backends/arm/operator_support/pool_2d_support.py @@ -2,6 +2,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Provide TOSA support checks for 2D pooling. + +Validate ``avg_pool2d`` and ``max_pool2d_with_indices`` against U55 profile +constraints including kernel size, stride, padding, and dimensionality. 
+ +""" from typing import cast @@ -20,16 +26,48 @@ def kernel_check(kernel: tuple[int, int]) -> bool: + """Check if kernel size is within U55 constraints. + + Checks that ``kernel_x * kernel_y`` is in ``[1, 65536]`` and + ``kernel_y`` is in ``[1, 256]`` as required by the U55 profile. + + Args: + kernel (tuple[int, int]): Kernel height and width ``(kh, kw)``. + + Returns: + bool: True if the kernel passes validation. + + """ if not (1 <= kernel[0] * kernel[1] <= 65536): return False return 1 <= kernel[1] <= 256 def stride_check(strides: tuple[int, int]) -> bool: + """Check if strides are within U55 constraints. + + Args: + strides (tuple[int, int]): Vertical and horizontal strides. + + Returns: + bool: True if each stride is in ``[1, 3]``. + + """ return all(1 <= stride <= 3 for stride in strides) def dim_check(shape=torch.Size) -> bool: + """Check if non-batch dims are within U55 constraints. + + Verifies that all dimensions except batch are in ``[1, 65536]``. + + Args: + shape (torch.Size): Input tensor shape. + + Returns: + bool: True if all checked dimensions pass. + + """ check = True for dim in shape[1:]: check &= 1 <= dim <= 65536 @@ -38,6 +76,13 @@ def dim_check(shape=torch.Size) -> bool: @register_tosa_support_check class AvgPool2dSupported(SupportedTOSAOperatorCheck): + """Provide TOSA support checks for ``aten.avg_pool2d``. + + Applies additional constraints when targeting the U55 subset, including + limits on kernel size, stride, padding behavior, and tensor ranks. + + """ + targets = [ exir_ops.edge.aten.avg_pool2d.default, ] @@ -48,6 +93,12 @@ class AvgPool2dSupported(SupportedTOSAOperatorCheck): ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): + """Return True if ``avg_pool2d`` satisfies U55 constraints. + + Computes the effective TOSA padding (depending on ``count_include_pad`` + and ``divisor_override``) and validates kernel, stride, and shape limits. 
+ + """ if not tosa_spec.is_U55_subset: return True @@ -115,6 +166,13 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): @register_tosa_support_check class MaxPool2dSupported(SupportedTOSAOperatorCheck): + """Provide TOSA support checks for ``aten.max_pool2d_with_indices``. + + Applies additional constraints when targeting the U55 subset, including + limits on kernel size, stride, and tensor ranks. + + """ + targets = [ exir_ops.edge.aten.max_pool2d_with_indices.default, ] @@ -125,6 +183,9 @@ class MaxPool2dSupported(SupportedTOSAOperatorCheck): ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): + """Return True if ``max_pool2d_with_indices`` satisfies U55 + constraints. + """ if not tosa_spec.is_U55_subset: return True diff --git a/backends/arm/operator_support/right_shift_support.py b/backends/arm/operator_support/right_shift_support.py index 5d3896e3643..df124319887 100644 --- a/backends/arm/operator_support/right_shift_support.py +++ b/backends/arm/operator_support/right_shift_support.py @@ -2,6 +2,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Declare operator support for bitwise right-shift in TOSA. + +Provide support checks for ``aten.bitwise_right_shift`` and ``__rshift__`` +targets across integer and float TOSA profiles. 
+ +""" # pyre-unsafe @@ -21,6 +27,8 @@ @register_tosa_support_check class RightShiftSupported(SupportedTOSAOperatorCheck): + """Provide TOSA support check for right-shift operations.""" + targets = [ exir_ops.edge.aten.bitwise_right_shift.Tensor, exir_ops.edge.aten.__rshift__.Scalar, @@ -31,8 +39,15 @@ class RightShiftSupported(SupportedTOSAOperatorCheck): TosaSpecification.create_from_string("TOSA-1.0+FP"), ] - def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): + def is_node_tosa_supported( + self, node: fx.Node, tosa_spec: TosaSpecification + ) -> bool: + """Return True if the node is supported by TOSA. + + Emit a warning on U55 subsets where one-off errors may occur. Otherwise + accept all matching targets. + """ # TODO MLETORCH-525 Remove warning if tosa_spec.is_U55_subset: logging.warning(f"{node.target} may introduce one-off errors.") diff --git a/backends/arm/operator_support/sin_cos_support.py b/backends/arm/operator_support/sin_cos_support.py deleted file mode 100644 index dcdc20f8e4a..00000000000 --- a/backends/arm/operator_support/sin_cos_support.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-unsafe - - -import torch.fx as fx -from executorch.backends.arm.operator_support.tosa_supported_operators import ( - register_tosa_support_check, - SupportedTOSAOperatorCheck, -) -from executorch.backends.arm.tosa import TosaSpecification -from executorch.exir.dialects._ops import ops as exir_ops - - -@register_tosa_support_check -class SinCosSupported(SupportedTOSAOperatorCheck): - targets = [ - exir_ops.edge.aten.cos.default, - exir_ops.edge.aten.sin.default, - ] - - tosa_specs = [ - TosaSpecification.create_from_string("TOSA-1.0+INT"), - TosaSpecification.create_from_string("TOSA-1.0+FP"), - ] - - def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): - return True diff --git a/backends/arm/operator_support/to_dim_order_copy_support.py b/backends/arm/operator_support/to_dim_order_copy_support.py index e21f8a68ad6..3cc587d99d3 100644 --- a/backends/arm/operator_support/to_dim_order_copy_support.py +++ b/backends/arm/operator_support/to_dim_order_copy_support.py @@ -2,6 +2,13 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Declare operator support for ``_to_dim_order_copy`` in TOSA. + +Provide dtype-compatibility checks for casting when converting to a specific +dimension order. Supported input/output dtype pairs depend on the active TOSA +profile (integer and/or float). + +""" # pyre-unsafe import copy @@ -25,6 +32,16 @@ @register_tosa_support_check class ToCopySupported(SupportedTOSAOperatorCheck): + """Provide TOSA support check for ``_to_dim_order_copy``. + + Attributes: + SUPPORTED_INT_PROFILE_DTYPES (dict[torch.dtype, list[torch.dtype]]): + Allowed output dtypes for each integer input dtype. + SUPPORTED_FP_PROFILE_DTYPES (dict[torch.dtype, list[torch.dtype]]): + Allowed output dtypes for each floating input dtype. 
+ + """ + targets = [ exir_ops.edge.dim_order_ops._to_dim_order_copy.default, ] @@ -40,21 +57,31 @@ def _merge_supported_types( dtypes1: SupportedTypeDict, dtypes2: SupportedTypeDict, ) -> SupportedTypeDict: + """Return a merged mapping of supported dtype transitions. + + Args: + dtypes1 (dict[torch.dtype, list[torch.dtype]]): Base mapping. + dtypes2 (dict[torch.dtype, list[torch.dtype]]): Mapping to merge in. + + Returns: + dict[torch.dtype, list[torch.dtype]]: Combined mapping. + + """ merged_dtypes = copy.deepcopy( dtypes1 - ) # Use deepcopy to avoid unintentionally modifying SUPPORTED_INT_TYPES + ) # Use deepcopy to avoid unintentionally modifying SUPPORTED_INT_PROFILE_DTYPES for k, v in dtypes2.items(): merged_dtypes[k] = merged_dtypes.get(k, []) + v return merged_dtypes - SUPPORTED_INT_TYPES: SupportedTypeDict = { + SUPPORTED_INT_PROFILE_DTYPES: SupportedTypeDict = { torch.bool: [torch.bool, torch.int8, torch.int16, torch.int32], torch.int8: [torch.bool, torch.int8, torch.int16, torch.int32], torch.int16: [torch.bool, torch.int8, torch.int16, torch.int32], torch.int32: [torch.bool, torch.int8, torch.int16, torch.int32], torch.int64: [torch.bool, torch.int8, torch.int16, torch.int32], } - SUPPORTED_FLOAT_TYPES: SupportedTypeDict = { + SUPPORTED_FP_PROFILE_DTYPES: SupportedTypeDict = { torch.int8: [torch.int8, torch.float16, torch.bfloat16, torch.float32], torch.int16: [torch.int16, torch.float16, torch.bfloat16, torch.float32], torch.int32: [torch.int32, torch.float16, torch.bfloat16, torch.float32], @@ -89,24 +116,28 @@ def _merge_supported_types( torch.int32, torch.bfloat16, torch.float16, + torch.float32, ], } - ALL_SUPPORTED_TYPES = _merge_supported_types( - SUPPORTED_INT_TYPES, SUPPORTED_FLOAT_TYPES - ) def is_node_tosa_supported( self, node: fx.Node, tosa_spec: TosaSpecification ) -> bool: + """Return True if the node is supported by TOSA. 
+ + Check FakeTensor metadata, validate input dtype is supported for the + active profile, and ensure the output dtype is allowed for the given + input dtype. + """ supported_dtypes: SupportedTypeDict = {} if tosa_spec.support_integer(): supported_dtypes = self._merge_supported_types( - self.SUPPORTED_INT_TYPES, supported_dtypes + self.SUPPORTED_INT_PROFILE_DTYPES, supported_dtypes ) if tosa_spec.support_float(): supported_dtypes = self._merge_supported_types( - self.SUPPORTED_FLOAT_TYPES, supported_dtypes + self.SUPPORTED_FP_PROFILE_DTYPES, supported_dtypes ) if len(node.all_input_nodes) != 1: diff --git a/backends/arm/operator_support/tosa_profile_supported_op_lists.py b/backends/arm/operator_support/tosa_profile_supported_op_lists.py index d3207c65dff..86db2d9b0b6 100644 --- a/backends/arm/operator_support/tosa_profile_supported_op_lists.py +++ b/backends/arm/operator_support/tosa_profile_supported_op_lists.py @@ -2,6 +2,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Define TOSA profile support lists for INT and FP. + +Expose static sets of EXIR operator overloads used by the TOSA partitioner to +seed positive support checks for different profiles. + +""" import operator from typing import Final, Set @@ -12,6 +18,7 @@ # INT profile: ops supported via native TOSA ops, decompositions/transformations, precompute, TableOps, etc. +# Note that ops supported via pre-quantization decompositions are not included here. 
TOSA_PRO_INT_SupportList: Final[Set] = { exir_ops.edge.aten.abs.default, exir_ops.edge.aten.add.Tensor, @@ -24,6 +31,7 @@ exir_ops.edge.aten.bitwise_and.Scalar, exir_ops.edge.aten.bitwise_or.Scalar, exir_ops.edge.aten.bitwise_xor.Scalar, + exir_ops.edge.aten.cos.default, exir_ops.edge.aten.logical_and.default, exir_ops.edge.aten.logical_or.default, exir_ops.edge.aten.logical_xor.default, @@ -39,8 +47,6 @@ exir_ops.edge.aten.hardsigmoid.default, exir_ops.edge.aten.hardtanh.default, exir_ops.edge.aten.hardswish.default, - exir_ops.edge.aten.div.Tensor, - exir_ops.edge.aten.div.Tensor_mode, exir_ops.edge.aten.eq.Tensor, exir_ops.edge.aten.eq.Scalar, exir_ops.edge.aten.erf.default, @@ -61,16 +67,7 @@ exir_ops.edge.aten.lt.Tensor, exir_ops.edge.aten.lt.Scalar, exir_ops.edge.aten.mul.Tensor, - exir_ops.edge.aten.ne.Tensor, - exir_ops.edge.aten.ne.Scalar, exir_ops.edge.aten.neg.default, - exir_ops.edge.aten.add.Scalar, - exir_ops.edge.aten.sub.Scalar, - exir_ops.edge.aten.mul.Scalar, - exir_ops.edge.aten.div.Scalar, - exir_ops.edge.aten._native_batch_norm_legit_no_training.default, - exir_ops.edge.aten.native_layer_norm.default, - exir_ops.edge.aten.native_group_norm.default, exir_ops.edge.aten.sigmoid.default, exir_ops.edge.aten.mean.dim, exir_ops.edge.aten.mm.default, @@ -79,25 +76,17 @@ exir_ops.edge.aten.repeat.default, exir_ops.edge.aten.reciprocal.default, exir_ops.edge.aten.relu.default, - exir_ops.edge.aten.leaky_relu.default, - exir_ops.edge.aten.sqrt.default, exir_ops.edge.aten.rsqrt.default, - exir_ops.edge.aten.round.default, - exir_ops.edge.aten._softmax.default, exir_ops.edge.aten.select_copy.int, - exir_ops.edge.aten._log_softmax.default, exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.tanh.default, exir_ops.edge.aten.upsample_bilinear2d.vec, exir_ops.edge.aten.upsample_nearest2d.vec, - exir_ops.edge.aten.var.correction, - exir_ops.edge.aten.var.dim, exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.unsqueeze_copy.default, 
exir_ops.edge.aten.squeeze_copy.dims, exir_ops.edge.aten.pow.Tensor_Scalar, exir_ops.edge.aten.pow.Tensor_Tensor, - exir_ops.edge.aten.where.self, operator.getitem, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, exir_ops.edge.quantized_decomposed.quantize_per_channel.default, @@ -113,6 +102,7 @@ torch.ops.aten.scalar_tensor.default, exir_ops.edge.aten.gelu.default, exir_ops.edge.aten.alias_copy.default, + exir_ops.edge.aten.sin.default, exir_ops.edge.aten.sinh.default, exir_ops.edge.aten.atan.default, exir_ops.edge.aten.acosh.default, @@ -120,14 +110,12 @@ exir_ops.edge.aten.sign.default, exir_ops.edge.aten.asin.default, exir_ops.edge.aten.atanh.default, - exir_ops.edge.aten.addmm.default, exir_ops.edge.aten.masked_fill.Scalar, exir_ops.edge.aten.asinh.default, exir_ops.edge.aten.cosh.default, - exir_ops.edge.aten.glu.default, - exir_ops.edge.aten.logit.default, exir_ops.edge.aten.acos.default, exir_ops.edge.aten.elu.default, + exir_ops.edge.aten.bitwise_not.default, } @@ -147,6 +135,7 @@ exir_ops.edge.aten.cat.default, exir_ops.edge.aten.ceil.default, exir_ops.edge.aten.clamp.default, + exir_ops.edge.aten.cos.default, exir_ops.edge.aten.cumsum.default, exir_ops.edge.aten.bmm.default, exir_ops.edge.aten.permute_copy.default, @@ -211,7 +200,6 @@ exir_ops.edge.aten.squeeze_copy.dims, exir_ops.edge.aten.pow.Tensor_Scalar, exir_ops.edge.aten.pow.Tensor_Tensor, - exir_ops.edge.aten.where.self, operator.getitem, exir_ops.edge.aten.constant_pad_nd.default, exir_ops.edge.aten.amax.default, @@ -223,6 +211,7 @@ torch.ops.aten.scalar_tensor.default, exir_ops.edge.aten.gelu.default, exir_ops.edge.aten.alias_copy.default, + exir_ops.edge.aten.sin.default, exir_ops.edge.aten.sinh.default, exir_ops.edge.aten.atan.default, exir_ops.edge.aten.acosh.default, diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py index b580fbb9a9a..f7857894d40 100644 --- 
a/backends/arm/operator_support/tosa_supported_operators.py +++ b/backends/arm/operator_support/tosa_supported_operators.py @@ -19,8 +19,9 @@ FuseQuantizedActivationPass, ) from executorch.backends.arm._passes.insert_table_ops import TableOps -from executorch.backends.arm.constants import DQ_OPS, Q_OPS +from executorch.backends.arm.constants import DQ_OPS, MAX_RANK, Q_OPS from executorch.backends.arm.operator_support.ethos_u55_support import ( + EthosU55CastCheck, EthosU55DtypeSupport, EthosU55NotSupported, EthosU55TransposeCheck, @@ -126,7 +127,7 @@ def tosa_support_factory( negative_checks: list[OperatorSupportBase] = [ CheckInt64InputsAndOutputs(exported_program, reporter), CheckFloat64Inputs(exported_program, reporter), - RankCheck(reporter, max_rank=5), + RankCheck(reporter, max_rank=MAX_RANK), *[ reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}") for check in (additional_checks if additional_checks else []) @@ -134,13 +135,13 @@ def tosa_support_factory( ] if not tosa_spec.support_float(): - negative_checks.append(NeedsDecompositionCheck(reporter)) negative_checks.append(CheckProperQuantization(reporter)) if tosa_spec.is_U55_subset: negative_checks.append(EthosU55NotSupported(reporter)) negative_checks.append(EthosU55DtypeSupport(reporter)) negative_checks.append(EthosU55TransposeCheck(reporter)) negative_checks.append(EthosU55ViewCheck(reporter)) + negative_checks.append(EthosU55CastCheck(reporter)) return chain( reporter.wrap_check( @@ -154,7 +155,8 @@ def tosa_support_factory( class TOSAProINTSupportList(OperatorSupportBase): """ TOSA_PRO_INT_SupportList: - Ops supported in INT profile via native TOSA ops, decomposition/transformation, pre-compute, or TableOps + Ops supported in INT profile via native TOSA ops, decomposition/transformation, pre-compute, or TableOps. + Note that ops supported via pre-quantization decompositions are not included here. 
""" def is_node_supported( @@ -177,57 +179,6 @@ def is_node_supported( return node.op == "call_function" and node.target in TOSA_PRO_FP_SupportList -class NeedsDecompositionCheck(OperatorSupportBase): - """ - Targeted operators need to be decomposed prior to quantization in order to get a pair of q-dq-nodes surrounding - the operator, and to get optimal quantization parameters for each operator. This check will reject operators - that need to be decomposed. - """ - - def __init__(self, reporter: WhyNoPartitionReporter): - self.reporter = reporter - - def is_node_supported( - self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node - ) -> bool: - - if node.op != "call_function": - return True - - needs_decomp_dict = { - exir_ops.edge.aten.div.Tensor: None, - exir_ops.edge.aten._native_batch_norm_legit_no_training.default: "BatchNorm2D with track_running_stats==True not immediately following a convolution is not supported for quantized TOSA backends.", - exir_ops.edge.aten.native_layer_norm.default: None, - exir_ops.edge.aten.native_group_norm.default: None, - exir_ops.edge.aten._softmax.default: None, - exir_ops.edge.aten._log_softmax.default: None, - exir_ops.edge.aten.var.correction: None, - exir_ops.edge.aten.var.dim: None, - exir_ops.edge.aten.add.Scalar: None, - exir_ops.edge.aten.sqrt.default: None, - exir_ops.edge.aten.sub.Scalar: None, - exir_ops.edge.aten.mul.Scalar: None, - exir_ops.edge.aten.ne.Tensor: None, - exir_ops.edge.aten.ne.Scalar: None, - exir_ops.edge.aten.div.Scalar: None, - exir_ops.edge.aten.leaky_relu.default: None, - exir_ops.edge.aten.round.default: None, - exir_ops.edge.aten.addmm.default: None, - exir_ops.edge.aten.glu.default: None, - exir_ops.edge.aten.logit.default: None, - } - - if node.target in needs_decomp_dict: - reject_message = needs_decomp_dict[node.target] - if reject_message is None: - reject_message = "Op needs to be decomposed into other ops before quantization to get quantized properly." 
- - self.reporter.report_reject(node, reject_message) - return False - else: - return True - - class CheckProperQuantization(OperatorSupportBase): """ For targeted nodes, check that it has been quantized as expected. In most cases this means that a pair of quantize diff --git a/backends/arm/operator_support/where_support.py b/backends/arm/operator_support/where_support.py new file mode 100644 index 00000000000..2ec7c30827d --- /dev/null +++ b/backends/arm/operator_support/where_support.py @@ -0,0 +1,77 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import torch + +import torch.fx as fx +from executorch.backends.arm.constants import DQ_OPS +from executorch.backends.arm.operator_support.tosa_supported_operators import ( + register_tosa_support_check, + SupportedTOSAOperatorCheck, +) +from executorch.backends.arm.tosa import TosaSpecification +from executorch.exir.dialects._ops import ops as exir_ops + + +@register_tosa_support_check +class WhereSupported(SupportedTOSAOperatorCheck): + targets = [exir_ops.edge.aten.where.self] + + tosa_specs = [ + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), + ] + + def is_node_tosa_supported( + self, node: fx.Node, tosa_spec: TosaSpecification + ) -> bool: # type: ignore[override, misc] + + if len(node.all_input_nodes) != 3: + self.reporter.report_reject( + node, + ( + "Expected exactly three input nodes, " + f"got {len(node.all_input_nodes)} for {node.target}." 
+ ), + ) + return False + + condition, x, y = node.all_input_nodes + if condition.meta["val"].dtype != torch.bool: + self.reporter.report_reject( + node, + f"Type of condition in {node.target} is not torch.bool", + ) + return False + + x_dtype, y_dtype = x.meta["val"].dtype, y.meta["val"].dtype + if tosa_spec.support_float(): + if x_dtype in (torch.bool, torch.float16, torch.float32) and y_dtype in ( + torch.bool, + torch.float16, + torch.float32, + ): + return True + + if tosa_spec.support_integer(): + if ( + x_dtype in (torch.bool, torch.int8, torch.int16, torch.int32) + or (x_dtype == torch.float32 and x.target in DQ_OPS) + ) and ( + y_dtype in (torch.bool, torch.int8, torch.int16, torch.int32) + or (y_dtype == torch.float32 and y.target in DQ_OPS) + ): + return True + + self.reporter.report_reject( + node, + ( + f"Tensor x dtype {x_dtype} and/or tensor y dtype {y_dtype} is not supported in {node.target} " + f"for tosa specification {tosa_spec}" + ), + ) + + return False diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index f7a9638254e..9278d25959f 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -13,7 +13,7 @@ op_amin, op_any, op_avg_pool2d, - op_bmm, + op_bitwise_not, op_cat, op_ceil, op_clamp, @@ -41,7 +41,6 @@ op_pow, op_reciprocal, op_repeat, - op_rescale, op_rshift_tensor, op_rsqrt, op_sigmoid, @@ -49,12 +48,13 @@ op_slice, op_sub, op_sum, - op_table, op_tanh, op_to_dim_order_copy, - op_transpose, - op_upsample_bilinear2d, - op_upsample_nearest2d, + op_tosa_matmul, + op_tosa_rescale, + op_tosa_resize, + op_tosa_table, + op_tosa_transpose, op_view, op_where, ops_binary, diff --git a/backends/arm/operators/op_abs.py b/backends/arm/operators/op_abs.py index ec76eb5517f..943c4778867 100644 --- a/backends/arm/operators/op_abs.py +++ b/backends/arm/operators/op_abs.py @@ -6,9 +6,6 @@ # pyre-unsafe from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as 
tqutils -import executorch.backends.arm.tosa.utils as tutils - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -18,22 +15,20 @@ validate_same_dtype, validate_valid_dtype, ) -from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.mapping import TosaArg +from executorch.backends.arm.tosa.specification import TosaSpecification from torch.fx import Node @register_node_visitor -class AbsVisitor_INT(NodeVisitor): +class AbsVisitor(NodeVisitor): target = "aten.abs.default" tosa_specs = [ TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] - def __init__(self, *args): - super().__init__(*args) - def define_node( self, node: Node, @@ -47,89 +42,18 @@ def define_node( validate_num_inputs(self.target, inputs, 1) validate_same_dtype(self.target, [*inputs, output], ts) - # Handle int8 (quantized) and int32 validate_valid_dtype( self.target, [*inputs, output], - [ts.DType.INT8, ts.DType.INT32], + [ts.DType.INT32, ts.DType.FP32], output.tosa_spec, ) - scale_back = 1.0 - if inputs[0].dtype == ts.DType.INT8: - rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) # type: ignore[possibly-undefined] - else: - # input[0].dtype == ts.DType.INT32 - # Non quantized input, natively support by TOSA.abs - rescaled_inputs = inputs - - if output.dtype == ts.DType.INT8: - broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order) - abs_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32) - else: - # output.dtype == ts.DType.INT32 - abs_output = output - - # Do the INT32 Abs - self._serialize_operator( - node, - tosa_graph, + tosa_graph.addOperator( ts.TosaOp.Op().ABS, [ - rescaled_inputs[0].name, + inputs[0].name, ], - [abs_output.name], + [output.name], None, ) - - if output.dtype == ts.DType.INT8: - # Scale output back to 8 bit - # pyre-ignore - 
tqutils.insert_rescale_op_to_int8( - tosa_graph, abs_output, scale_back, node, self.tosa_spec - ) # type: ignore[possibly-undefined] - - -@register_node_visitor -class AbsVisitor_FP(AbsVisitor_INT): - # inheriting 'target' from BI class - - tosa_specs = [TosaSpecification.create_from_string("TOSA-1.0+FP")] - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: Node, - tosa_graph: Any, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - - import serializer.tosa_serializer as ts # type: ignore - - validate_num_inputs(self.target, inputs, 1) - validate_same_dtype(self.target, [*inputs, output], ts) - - if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]: - # Call the inherited define_node for handling integers - super().define_node(node, tosa_graph, inputs, output) - else: - # FP32 Abs lowering - - validate_valid_dtype( - self.target, [*inputs, output], ts.DType.FP32, output.tosa_spec - ) - - # MI lowering - self._serialize_operator( - node, - tosa_graph, - ts.TosaOp.Op().ABS, - [inputs[0].name], - [output.name], - None, - ) diff --git a/backends/arm/operators/op_add.py b/backends/arm/operators/op_add.py index a8f0c3fe14d..81b415363ea 100644 --- a/backends/arm/operators/op_add.py +++ b/backends/arm/operators/op_add.py @@ -64,12 +64,18 @@ def define_node( rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32_maxscale( tosa_graph, inputs, node, self.tosa_spec ) + elif inputs[0].dtype == ts.DType.INT16: + rescaled_inputs, scale_back = ( + tqutils.insert_rescale_ops_int16_to_int32_maxscale( + tosa_graph, inputs, node, self.tosa_spec + ) + ) else: # input[0].dtype == ts.DType.INT16 or ts.DType.INT32 # Non quantized input, natively support by TOSA.ADD rescaled_inputs = inputs - if output.dtype == ts.DType.INT8: + if output.dtype in [ts.DType.INT8, ts.DType.INT16]: broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order) add_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32) else: @@ 
-99,6 +105,15 @@ def define_node( compute_rescale=False, tosa_spec=self.tosa_spec, ) # type: ignore[possibly-undefined] + elif output.dtype == ts.DType.INT16: + tqutils.insert_rescale_op_to_int16( + tosa_graph, + add_output, + scale_back, + node, + compute_rescale=False, + tosa_spec=self.tosa_spec, + ) # type: ignore[possibly-undefined] @register_node_visitor diff --git a/backends/arm/operators/op_bitwise_not.py b/backends/arm/operators/op_bitwise_not.py new file mode 100644 index 00000000000..908cf68e9b2 --- /dev/null +++ b/backends/arm/operators/op_bitwise_not.py @@ -0,0 +1,59 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, List + +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.operators.operator_validation_utils import ( + validate_num_inputs, + validate_same_dtype, + validate_valid_dtype, +) +from executorch.backends.arm.tosa.mapping import TosaArg +from executorch.backends.arm.tosa.specification import TosaSpecification +from torch.fx import Node + + +@register_node_visitor +class BitwiseNotVisitor(NodeVisitor): + target = "aten.bitwise_not.default" + + # bitwise_not is not supported on the FP profile + tosa_specs = [ + TosaSpecification.create_from_string("TOSA-1.0+INT"), + ] + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: Any, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + + import serializer.tosa_serializer as ts # type: ignore + + validate_num_inputs(self.target, inputs, 1) + validate_same_dtype(self.target, [*inputs, output], ts) + validate_valid_dtype( + self.target, + [*inputs, output], + [ts.DType.INT8, ts.DType.INT16, ts.DType.INT32], + output.tosa_spec, + ) + + self._serialize_operator( + node, + tosa_graph, + 
ts.TosaOp.Op().BITWISE_NOT, + [inputs[0].name], + [output.name], + ) diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index 6bfe0ab21eb..933e353387b 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -4,6 +4,8 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe +"""Provide a visitor for lowering 2D convolution to TOSA (INT/FP).""" + import itertools from typing import Any, List @@ -19,15 +21,22 @@ ) from executorch.backends.arm.operators.operator_validation_utils import ( validate_num_inputs, + validate_valid_dtype, ) -from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.mapping import TosaArg from executorch.backends.arm.tosa.quant_utils import build_rescale +from executorch.backends.arm.tosa.specification import Tosa_1_00, TosaSpecification from executorch.backends.arm.tosa.utils import tosa_shape @register_node_visitor class Conv2dVisitor(NodeVisitor): + """Provide a visitor that lowers ``aten.convolution`` to TOSA. + + Map to ``CONV2D`` or ``DEPTHWISE_CONV2D`` as appropriate. + + """ + target = "aten.convolution.default" tosa_specs = [ @@ -38,13 +47,32 @@ class Conv2dVisitor(NodeVisitor): def __init__(self, *args): super().__init__(*args) - # torch.nn.Conv2d does not require the result of - # `(input + 2 * pad - dilation * (weight - 1) - 1) / stride` - # to be an integer, but tosa currently strictly require this property. - # This function adjusts the pad value to meet the requirement. def adjust_pad_if_needed( self, input_size: int, input_weight: int, stride: int, pad: int, dilation: int ) -> int: + """Adjust padding to satisfy TOSA's integer output-size requirement. + + Torch ``Conv2d`` does not require the result of + ``(input + 2 * pad - dilation * (weight - 1) - 1) / stride`` to be an + integer, but TOSA does. This helper reduces the provided padding so + that the expression becomes divisible by ``stride``. 
+ + Args: + input_size (int): Spatial input size along the dimension (H or W). + input_weight (int): Kernel size along the same dimension. + stride (int): Stride along the same dimension. + pad (int): Padding value to adjust (bottom or right after duplication). + dilation (int): Dilation along the same dimension. + + Returns: + int: Adjusted padding value that yields an integer output size. + + Raises: + RuntimeError: If the required adjustment exceeds the provided + padding, which should be handled by the ``SizeAdjustInputPass`` + pass instead. + + """ mod_remainder = ( input_size + 2 * pad - dilation * (input_weight - 1) - 1 ) % stride @@ -55,7 +83,8 @@ def adjust_pad_if_needed( if mod_remainder > pad: raise RuntimeError( - "This case should be handled by the SizeAdjustConv2d pass, is it enabled?" + "This case should be handled by the SizeAdjustInputPass pass, " + "is it enabled?" ) return pad - mod_remainder @@ -66,13 +95,39 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - + """Define the TOSA CONV2D/DEPTHWISE_CONV2D operator and post-rescale.""" import serializer.tosa_serializer as ts # type: ignore from tosa.RoundingMode import RoundingMode # type: ignore input, weight, bias, stride, pad, dilation, _, _, group = inputs validate_num_inputs(self.target, inputs, 9) + valid_input_dtypes = [] + if self.tosa_spec.support_float(): + valid_input_dtypes.append(ts.DType.FP32) + if self.tosa_spec.support_integer(): + valid_input_dtypes.append(ts.DType.INT8) + + if isinstance(self.tosa_spec, Tosa_1_00) and self.tosa_spec.support_extension( + "int16" + ): + valid_input_dtypes.append(ts.DType.INT16) + # Check constraints for int16 activations + if inputs[0].dtype == ts.DType.INT16: + validate_valid_dtype( + self.target, [inputs[1]], [ts.DType.INT8], self.tosa_spec + ) + validate_valid_dtype( + self.target, [inputs[2]], [ts.DType.INT48], self.tosa_spec + ) + + validate_valid_dtype( + self.target, + [inputs[0]], + valid_input_dtypes, + self.tosa_spec, 
+ ) + # Get the attributes of convolution. attr = ts.TosaSerializerAttribute() pad_attr = [val for val in pad.special for _ in (0, 1)] @@ -97,8 +152,8 @@ def define_node( ) input_zp = 0 - if inputs[0].dtype == ts.DType.INT8: - # int8 input requires quantization information + if inputs[0].dtype in (ts.DType.INT8, ts.DType.INT16): + # int8 and int16 input requires quantization information input_qparams = get_input_qparams(node) input_zp = input_qparams[0].get_zp_per_tensor() @@ -109,22 +164,29 @@ def define_node( weight_zp = input_qparams[1].zp # type: ignore[assignment] # The output type is int32 when input type is int8. - conv2d_output_name = output.name - if output.dtype == ts.DType.INT8: + if inputs[0].dtype == ts.DType.INT8: conv2d_res = tosa_graph.addIntermediate( tosa_shape(output.shape, output.dim_order), ts.DType.INT32 ) conv2d_output_name = conv2d_res.name - acc_type = ( - inputs[0].dtype if inputs[0].dtype == ts.DType.FP32 else ts.DType.INT32 - ) + acc_type = ts.DType.INT32 + elif inputs[0].dtype == ts.DType.INT16: + conv2d_res = tosa_graph.addIntermediate( + tosa_shape(output.shape, output.dim_order), ts.DType.INT48 + ) + conv2d_output_name = conv2d_res.name + acc_type = ts.DType.INT48 + else: + conv2d_output_name = output.name + conv2d_res = output + acc_type = ts.DType.FP32 tosa_graph.addConst( - [1], output.dtype, [input_zp], name=f"{conv2d_output_name}_input_zp" + [1], inputs[0].dtype, [input_zp], name=f"{conv2d_output_name}_input_zp" ) tosa_graph.addConst( [1], - output.dtype, + inputs[1].dtype, weight_zp, name=f"{conv2d_output_name}_weight_zp", ) @@ -133,7 +195,7 @@ def define_node( in_channels = input.shape[1] out_channels = weight.shape[0] if (in_channels == group.number) and (out_channels % in_channels) == 0: - """Depthwise convolution case""" + """Depthwise convolution case.""" # Reshape torch shape format of weight tensor to tosa required format. 
# https://www.mlplatform.org/tosa/tosa_spec.html#_depthwise_conv2d m_length = int(out_channels / in_channels) @@ -178,7 +240,7 @@ def define_node( acc_type=acc_type, ) else: - """Regular convolution case""" + """Regular convolution case.""" tosa_op = ts.TosaOp.Op().CONV2D weight_name = weight.name @@ -207,7 +269,7 @@ def define_node( # For quantized convolution, rescale the output value back to the same # integer value domain of the next op. Otherwise return float32 output. - if inputs[0].dtype == ts.DType.INT8: + if output.dtype == ts.DType.INT8 or output.dtype == ts.DType.INT16: # Get scale_factor from input, weight, and output. input_scale = input_qparams[0].get_scale_per_tensor() # type: ignore[possibly-undefined] # pyre-ignore [61] per_channel_quant = input_qparams[1].per_channel # pyre-ignore [61] diff --git a/backends/arm/operators/op_eq.py b/backends/arm/operators/op_eq.py index 2136fe2e946..76b6e67cd8d 100644 --- a/backends/arm/operators/op_eq.py +++ b/backends/arm/operators/op_eq.py @@ -7,8 +7,6 @@ from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as tqutils - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -56,23 +54,12 @@ def define_node( ) validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec) - input_nodes = inputs - # Handle quantization - if inputs[0].dtype == ts.DType.INT8: - # Rescale inputs to 32 bit - rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - # Update IO - input_nodes = rescaled_inputs - # Do the equal comparison self._serialize_operator( node, tosa_graph, ts.TosaOp.Op().EQUAL, - [input_nodes[0].name, input_nodes[1].name], + [inputs[0].name, inputs[1].name], [output.name], None, ) diff --git a/backends/arm/operators/op_ge.py b/backends/arm/operators/op_ge.py index c538e735880..4bb20cac77f 100644 --- a/backends/arm/operators/op_ge.py +++ b/backends/arm/operators/op_ge.py @@ -7,8 
+7,6 @@ from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as tqutils - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -56,22 +54,11 @@ def define_node( ) validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec) - input_nodes = inputs - # Handle quantization - if inputs[0].dtype == ts.DType.INT8: - # Rescale inputs to 32 bit - rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - # Update IO - input_nodes = rescaled_inputs - self._serialize_operator( node, tosa_graph, ts.TosaOp.Op().GREATER_EQUAL, - [input_nodes[0].name, input_nodes[1].name], + [inputs[0].name, inputs[1].name], [output.name], None, ) diff --git a/backends/arm/operators/op_gt.py b/backends/arm/operators/op_gt.py index d407e28c1b6..c25c959681e 100644 --- a/backends/arm/operators/op_gt.py +++ b/backends/arm/operators/op_gt.py @@ -7,8 +7,6 @@ from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as tqutils - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -56,22 +54,11 @@ def define_node( ) validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec) - input_nodes = inputs - # Handle quantization - if inputs[0].dtype == ts.DType.INT8: - # Rescale inputs to 32 bit - rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - # Update IO - input_nodes = rescaled_inputs - self._serialize_operator( node, tosa_graph, ts.TosaOp.Op().GREATER, - [input_nodes[0].name, input_nodes[1].name], + [inputs[0].name, inputs[1].name], [output.name], None, ) diff --git a/backends/arm/operators/op_le.py b/backends/arm/operators/op_le.py index 403c6c233d3..e62d669814f 100644 --- a/backends/arm/operators/op_le.py +++ b/backends/arm/operators/op_le.py @@ -7,8 +7,6 @@ from typing import Any, List -import 
executorch.backends.arm.tosa.quant_utils as tqutils - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -56,22 +54,11 @@ def define_node( ) validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec) - input_nodes = inputs - # Handle quantization - if inputs[0].dtype == ts.DType.INT8: - # Rescale inputs to 32 bit - rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - # Update IO - input_nodes = rescaled_inputs - self._serialize_operator( node, tosa_graph, ts.TosaOp.Op().GREATER_EQUAL, - [input_nodes[1].name, input_nodes[0].name], + [inputs[1].name, inputs[0].name], [output.name], None, ) diff --git a/backends/arm/operators/op_lt.py b/backends/arm/operators/op_lt.py index f5132dd4feb..cccb0abd5d7 100644 --- a/backends/arm/operators/op_lt.py +++ b/backends/arm/operators/op_lt.py @@ -7,8 +7,6 @@ from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as tqutils - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -56,22 +54,11 @@ def define_node( ) validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec) - input_nodes = inputs - # Handle quantization - if inputs[0].dtype == ts.DType.INT8: - # Rescale inputs to 32 bit - rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - # Update IO - input_nodes = rescaled_inputs - self._serialize_operator( node, tosa_graph, ts.TosaOp.Op().GREATER, - [input_nodes[1].name, input_nodes[0].name], + [inputs[1].name, inputs[0].name], [output.name], None, ) diff --git a/backends/arm/operators/op_maximum.py b/backends/arm/operators/op_maximum.py index 66437f8af1d..50c6e06a4bb 100644 --- a/backends/arm/operators/op_maximum.py +++ b/backends/arm/operators/op_maximum.py @@ -7,12 +7,6 @@ from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as tqutils - -from 
executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( - get_input_qparams, -) - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -22,9 +16,8 @@ validate_same_dtype, validate_valid_dtype, ) -from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.mapping import TosaArg -from executorch.backends.arm.tosa.utils import tosa_shape +from executorch.backends.arm.tosa.specification import TosaSpecification from torch.fx import Node @@ -56,35 +49,12 @@ def define_node( validate_valid_dtype( self.target, [*inputs, output], - [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32], + [ts.DType.INT32, ts.DType.FP32], output.tosa_spec, ) - scale_back = 1.0 - max_output = output - if inputs[0].dtype == ts.DType.INT8: - input_qparams = get_input_qparams(node) - if len(input_qparams) != 2: - raise ValueError( - f"Both inputs need to have quantization information for {node}" - ) - if input_qparams[0] != input_qparams[1]: - raise ValueError( - "Both inputs must have the same quantization parameters for MAX" - ) - - operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - output.shape = tosa_shape(output.shape, output.dim_order) - max_output = tosa_graph.addIntermediate(output.shape, ts.DType.INT32) - else: - operand_inputs = inputs - attr_maximum = ts.TosaSerializerAttribute() - - # Set to PROPOGATE as default + # Set to PROPAGATE as default attr_maximum.MaximumAttribute(nan_mode=NanPropagationMode.PROPAGATE) self._serialize_operator( @@ -92,15 +62,9 @@ def define_node( tosa_graph, ts.TosaOp.Op().MAXIMUM, [ - operand_inputs[0].name, - operand_inputs[1].name, + inputs[0].name, + inputs[1].name, ], - [max_output.name], + [output.name], attr_maximum, ) - - if output.dtype == ts.DType.INT8: - # insert RESCALE from int32 back to int8 - tqutils.insert_rescale_op_to_int8( - tosa_graph, max_output, scale_back, node, self.tosa_spec 
- ) diff --git a/backends/arm/operators/op_minimum.py b/backends/arm/operators/op_minimum.py index 518366d5463..d5b97f186d3 100644 --- a/backends/arm/operators/op_minimum.py +++ b/backends/arm/operators/op_minimum.py @@ -7,11 +7,6 @@ from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as tqutils - -from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( - get_input_qparams, -) from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -23,7 +18,6 @@ ) from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.mapping import TosaArg -from executorch.backends.arm.tosa.utils import tosa_shape from torch.fx import Node @@ -55,35 +49,12 @@ def define_node( validate_valid_dtype( self.target, [*inputs, output], - [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32], + [ts.DType.INT32, ts.DType.FP32], output.tosa_spec, ) - scale_back = 1.0 - min_output = output - if inputs[0].dtype == ts.DType.INT8: - input_qparams = get_input_qparams(node) - if len(input_qparams) != 2: - raise ValueError( - f"Both inputs need to have quantization information for {node}" - ) - if input_qparams[0] != input_qparams[1]: - raise ValueError( - "Both inputs must have the same quantization parameters for MIN" - ) - - operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - output.shape = tosa_shape(output.shape, output.dim_order) - min_output = tosa_graph.addIntermediate(output.shape, ts.DType.INT32) - else: - operand_inputs = inputs - attr_minimum = ts.TosaSerializerAttribute() - - # Set to PROPOGATE as default + # Set to PROPAGATE as default attr_minimum.MinimumAttribute(nan_mode=NanPropagationMode.PROPAGATE) self._serialize_operator( @@ -91,15 +62,9 @@ def define_node( tosa_graph, ts.TosaOp.Op().MINIMUM, [ - operand_inputs[0].name, - operand_inputs[1].name, + inputs[0].name, + inputs[1].name, ], - 
[min_output.name], + [output.name], attr_minimum, ) - - if output.dtype == ts.DType.INT8: - # insert RESCALE from int32 back to int8 - tqutils.insert_rescale_op_to_int8( - tosa_graph, min_output, scale_back, node, self.tosa_spec - ) diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py index 5db7ce9347c..9ee4e9fedf8 100644 --- a/backends/arm/operators/op_repeat.py +++ b/backends/arm/operators/op_repeat.py @@ -44,7 +44,7 @@ def define_node( validate_valid_dtype( self.target, [inputs[0], output], - [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32], + [ts.DType.INT8, ts.DType.INT32, ts.DType.INT16, ts.DType.FP32], output.tosa_spec, ) diff --git a/backends/arm/operators/op_sub.py b/backends/arm/operators/op_sub.py index 9c27fddf68a..5f037dc3d1c 100644 --- a/backends/arm/operators/op_sub.py +++ b/backends/arm/operators/op_sub.py @@ -50,7 +50,7 @@ def define_node( validate_valid_dtype( self.target, [*inputs, output], - [ts.DType.INT8, ts.DType.INT32], + [ts.DType.INT8, ts.DType.INT16, ts.DType.INT32], output.tosa_spec, ) @@ -59,12 +59,18 @@ def define_node( rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32_maxscale( tosa_graph, inputs, node, self.tosa_spec ) + elif inputs[0].dtype == ts.DType.INT16: + rescaled_inputs, scale_back = ( + tqutils.insert_rescale_ops_int16_to_int32_maxscale( + tosa_graph, inputs, node, self.tosa_spec + ) + ) else: # input[0].dtype == ts.DType.INT32 # Non quantized input, natively support by TOSA.SUB rescaled_inputs = inputs - if output.dtype == ts.DType.INT8: + if output.dtype in [ts.DType.INT8, ts.DType.INT16]: broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order) sub_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32) else: @@ -95,6 +101,15 @@ def define_node( compute_rescale=False, tosa_spec=self.tosa_spec, ) # type: ignore[possibly-undefined] + elif output.dtype == ts.DType.INT16: + tqutils.insert_rescale_op_to_int16( + tosa_graph, + sub_output, + scale_back, 
+ node, + compute_rescale=False, + tosa_spec=self.tosa_spec, + ) # type: ignore[possibly-undefined] @register_node_visitor diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_tosa_matmul.py similarity index 53% rename from backends/arm/operators/op_bmm.py rename to backends/arm/operators/op_tosa_matmul.py index 382386ffa26..b177fd2ba37 100644 --- a/backends/arm/operators/op_bmm.py +++ b/backends/arm/operators/op_tosa_matmul.py @@ -5,13 +5,14 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe +"""Provide a visitor for lowering batched matmul (BMM) to TOSA.""" + from typing import Any, List import torch from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, - get_output_qparams, ) from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, @@ -24,13 +25,13 @@ ) from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.mapping import TosaArg -from executorch.backends.arm.tosa.quant_utils import build_rescale -from tosa.RoundingMode import RoundingMode # type: ignore @register_node_visitor -class BMMVisitor(NodeVisitor): - target = "aten.bmm.default" +class MatmulVisitor(NodeVisitor): + """Provide a visitor that serializes TOSA ``MATMUL``.""" + + target = "tosa.MATMUL.default" tosa_specs = [ TosaSpecification.create_from_string("TOSA-1.0+INT"), @@ -47,35 +48,36 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - + """Define the TOSA ``MATMUL`` operator.""" import serializer.tosa_serializer as ts # type: ignore validate_num_inputs(self.target, inputs, 2) - validate_same_dtype(self.target, [*inputs, output], ts) + validate_same_dtype(self.target, [*inputs], ts) validate_valid_dtype( self.target, - [*inputs, output], + [*inputs], [ts.DType.INT8, ts.DType.INT16, ts.DType.FP32], output.tosa_spec, ) + validate_valid_dtype( + self.target, + [output], + [ts.DType.INT32, ts.DType.INT48, ts.DType.FP32], + 
output.tosa_spec, + ) - # aten.bmm maps directly to MATMUL - - # For INT8, we need to get the zero points and add an intermediate tensor - # for a later rescale. - - if inputs[0].dtype == ts.DType.INT8: + # We need to get the zero points and add an intermediate tensor for INT16 case + if inputs[0].dtype in (ts.DType.INT8, ts.DType.INT16): input_qparams = get_input_qparams(node) input0_zp = input_qparams[0].get_zp_per_tensor() input1_zp = input_qparams[1].get_zp_per_tensor() - bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32) - bmm_output_name = bmm_result.name else: - bmm_output_name = output.name input0_zp, input1_zp = 0, 0 - tosa_graph.addConst([1], inputs[0].dtype, [input0_zp], name=f"{node.name}_A_ZP") - tosa_graph.addConst([1], inputs[1].dtype, [input1_zp], name=f"{node.name}_B_ZP") + input_A_ZP_name = f"{node.name}_A_ZP" + input_B_ZP_name = f"{node.name}_B_ZP" + tosa_graph.addConst([1], inputs[0].dtype, [input0_zp], name=input_A_ZP_name) + tosa_graph.addConst([1], inputs[1].dtype, [input1_zp], name=input_B_ZP_name) # Add the MATMUL to the TOSA graph. self._serialize_operator( @@ -85,27 +87,8 @@ def define_node( [ inputs[0].name, inputs[1].name, - f"{node.name}_A_ZP", - f"{node.name}_B_ZP", + input_A_ZP_name, + input_B_ZP_name, ], - [bmm_output_name], + [output.name], ) - - # As INT8 accumulates into INT32, we need to rescale it back to INT8 - if output.dtype == ts.DType.INT8: - output_qparams = get_output_qparams(node)[0] - final_output_scale = ( - input_qparams[0].get_scale_per_tensor() * input_qparams[1].get_scale_per_tensor() # type: ignore[possibly-undefined] # pyre-ignore[61] - ) / output_qparams.get_scale_per_tensor() - - build_rescale( - tosa_fb=tosa_graph, - scale=[final_output_scale], - # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined. 
- input_node=bmm_result, # type: ignore[possibly-undefined] - output_name=output.name, - output_type=ts.DType.INT8, - input_zp=[0], - output_zp=[output_qparams.get_zp_per_tensor()], - rounding_mode=RoundingMode.SINGLE_ROUND, - ) diff --git a/backends/arm/operators/op_rescale.py b/backends/arm/operators/op_tosa_rescale.py similarity index 100% rename from backends/arm/operators/op_rescale.py rename to backends/arm/operators/op_tosa_rescale.py diff --git a/backends/arm/operators/op_upsample_nearest2d.py b/backends/arm/operators/op_tosa_resize.py similarity index 82% rename from backends/arm/operators/op_upsample_nearest2d.py rename to backends/arm/operators/op_tosa_resize.py index 3c3ca67c9f5..020395ee7c2 100644 --- a/backends/arm/operators/op_upsample_nearest2d.py +++ b/backends/arm/operators/op_tosa_resize.py @@ -24,8 +24,8 @@ @register_node_visitor -class UpsampleNearest2dVisitor(NodeVisitor): - target = "aten.upsample_nearest2d.vec" +class ResizeVisitor(NodeVisitor): + target = "tosa.RESIZE.default" tosa_specs = NodeVisitor.tosa_specs @@ -41,12 +41,18 @@ def define_node( ) -> None: import serializer.tosa_serializer as ts - validate_num_inputs(self.target, inputs, 3) - validate_same_dtype(self.target, [inputs[0], output], ts) + validate_num_inputs(self.target, inputs, [3, 4]) + if node.kwargs.get("resize_mode") == "bilinear": + resize_mode = ResizeMode.BILINEAR + align_corners = bool(node.args[2]) + else: + resize_mode = ResizeMode.NEAREST + align_corners = False + validate_same_dtype(self.target, [inputs[0], output], ts) validate_valid_dtype( self.target, [inputs[0], output], - [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32], + [ts.DType.INT8, ts.DType.INT32, ts.DType.FP16, ts.DType.FP32], output.tosa_spec, ) @@ -59,7 +65,7 @@ def define_node( # Align corners shouldn't make a difference for nearest upsampling. We set to False so # half pixel centers are used for resize parameter logic. 
scale_n_yx, scale_d_yx, offset_yx, border_yx = get_resize_parameters( - input_size_yx, output_size_yx, ResizeMode.NEAREST, align_corners=False + input_size_yx, output_size_yx, resize_mode, align_corners=align_corners ) def in_int16_range(x): @@ -86,7 +92,7 @@ def in_int16_range(x): ) attr = ts.TosaSerializerAttribute() attr.ResizeAttribute( - mode=ResizeMode.NEAREST, + mode=resize_mode, ) self._serialize_operator( diff --git a/backends/arm/operators/op_table.py b/backends/arm/operators/op_tosa_table.py similarity index 100% rename from backends/arm/operators/op_table.py rename to backends/arm/operators/op_tosa_table.py diff --git a/backends/arm/operators/op_transpose.py b/backends/arm/operators/op_tosa_transpose.py similarity index 100% rename from backends/arm/operators/op_transpose.py rename to backends/arm/operators/op_tosa_transpose.py diff --git a/backends/arm/operators/op_upsample_bilinear2d.py b/backends/arm/operators/op_upsample_bilinear2d.py deleted file mode 100644 index 3cc620727e0..00000000000 --- a/backends/arm/operators/op_upsample_bilinear2d.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright 2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-unsafe -from typing import Any, List - -import torch - -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.arm.operators.operator_validation_utils import ( - validate_num_inputs, - validate_same_dtype, - validate_valid_dtype, -) -from executorch.backends.arm.tosa.mapping import TosaArg -from executorch.backends.arm.tosa.quant_utils import build_rescale -from executorch.backends.arm.tosa.utils import get_resize_parameters, tosa_shape - - -@register_node_visitor -class UpsampleBilinear2dVisitor(NodeVisitor): - - target = "aten.upsample_bilinear2d.vec" - tosa_specs = NodeVisitor.tosa_specs - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: Any, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - import serializer.tosa_serializer as ts - from tosa.ResizeMode import ResizeMode # type: ignore - from tosa.RoundingMode import RoundingMode # type: ignore - - validate_num_inputs(self.target, inputs, 4) - validate_same_dtype(self.target, [inputs[0], output], ts) - validate_valid_dtype( - self.target, - [inputs[0], output], - [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32], - output.tosa_spec, - ) - - if inputs[0].shape is None or output.shape is None: - raise ValueError("Only static shapes are supported") - - input_dtype = inputs[0].dtype - - # tosa_shape output is NHWC, take HW - input_size_yx = tuple([inputs[0].shape[dim] for dim in inputs[0].dim_order])[ - 1:3 - ] - output_size_yx = tuple([output.shape[dim] for dim in output.dim_order])[1:3] - - # Get align_corners value from the node arguments. 
- align_corners = bool(node.args[2]) - scale_n_yx, scale_d_yx, offset_yx, border_yx = get_resize_parameters( - input_size_yx, - output_size_yx, - ResizeMode.NEAREST, - align_corners=align_corners, - ) - - def in_int16_range(x): - return torch.all(x >= -(2**15)) and torch.all(x <= 2**15 - 1) - - if not in_int16_range(scale_n_yx): - raise ValueError("scale_n_yx is out of the int16 range") - if not in_int16_range(scale_d_yx): - raise ValueError("scale_d_yx is out of the int16 range") - if not in_int16_range(border_yx): - raise ValueError("border_yx is out of the int16 range") - - scales = [scale_n_yx[0], scale_d_yx[0], scale_n_yx[1], scale_d_yx[1]] - - attr = ts.TosaSerializerAttribute() - attr.ResizeAttribute(mode=ResizeMode.BILINEAR) - - scales_tensor = tosa_graph.addConst( - [len(scales)], ts.DType.SHAPE, scales, node.name + "_scales" - ) - offset = offset_yx.tolist() - offset_tensor = tosa_graph.addConst( - [len(offset)], ts.DType.SHAPE, offset, node.name + "_offset" - ) - border = border_yx.tolist() - border_tensor = tosa_graph.addConst( - [len(border)], ts.DType.SHAPE, border, node.name + "_border" - ) - if input_dtype == output.dtype == ts.DType.FP32: - self._serialize_operator( - node, - tosa_graph, - ts.TosaOp.Op().RESIZE, - [ - inputs[0].name, - scales_tensor.name, - offset_tensor.name, - border_tensor.name, - ], - [output.name], - attr, - ) - return - elif input_dtype == output.dtype == ts.DType.INT8: - intermediate = tosa_graph.addIntermediate( - tosa_shape(output.shape, output.dim_order), ts.DType.INT32 - ) - self._serialize_operator( - node, - tosa_graph, - ts.TosaOp.Op().RESIZE, - [ - inputs[0].name, - scales_tensor.name, - offset_tensor.name, - border_tensor.name, - ], - [intermediate.name], - attr, - ) - - final_output_scale = float(1 / (scale_n_yx[0] * scale_n_yx[1])) - - build_rescale( - tosa_fb=tosa_graph, - scale=[final_output_scale], - input_node=intermediate, - output_name=output.name, - output_type=ts.DType.INT8, - input_zp=[0], - 
output_zp=[0], - rounding_mode=RoundingMode.SINGLE_ROUND, - ) - else: - raise ValueError( - "Input/output dtype not in {float32, int8}: {input_dtype=} {output.dtype=}" - ) diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py index 9ca435c60c5..8865513a6dd 100644 --- a/backends/arm/process_node.py +++ b/backends/arm/process_node.py @@ -12,7 +12,7 @@ import torch import torch.fx from executorch.backends.arm.operators.node_visitor import NodeVisitor -from executorch.backends.arm.tosa.mapping import TosaArg +from executorch.backends.arm.tosa.mapping import TosaArg, TosaSpecialDtype from executorch.backends.arm.tosa.specification import TosaSpecification from executorch.backends.arm.tosa.utils import tosa_shape from torch._export.utils import ( @@ -70,13 +70,6 @@ def process_inputs( tosa_spec: TosaSpecification, ): """Serialize an input node""" - # inputs need to be in default dim_order (contiguous memory format) - meta = node.meta["val"] - if meta.dim_order() != tuple(range(meta.dim())): - raise RuntimeError( - f"Arm backend only supports contiguous memory format for inputs. 
" - f"Expected dim_order: {tuple(range(meta.dim()))}, but got: {meta.dim_order()} for node {node.name}" - ) try: tosa_arg = TosaArg(node, tosa_spec) except ValueError as e: @@ -113,16 +106,28 @@ def process_inputs_to_parameters( ) from e parameter_data = get_param(edge_program, node) - assert isinstance(parameter_data, torch.Tensor), "Expect Attr to be tensor" + if not isinstance(parameter_data, torch.Tensor): + raise TypeError( + f"Expected parameter '{node.name}' to be a torch.Tensor, got " + f"{type(parameter_data).__name__}" + ) parameter_values = parameter_data.detach().numpy() if tosa_arg.dtype == torch.float32: - assert tosa_spec.support_float(), f"{tosa_spec} doesn't support float" + if not tosa_spec.support_float(): + raise ValueError(f"{tosa_spec} doesn't support float operations") + + # Handle special case for INT48 tensors + special_type = node.meta.get(TosaSpecialDtype.meta_key(), None) + if isinstance(special_type, TosaSpecialDtype): + tosa_dtype = special_type.get_tosa_dtype() + else: + tosa_dtype = tosa_arg.dtype parameter_values = np.transpose(parameter_values, tosa_arg.dim_order) tosa_graph.addConst( - parameter_values.shape, tosa_arg.dtype, parameter_values, name=tosa_arg.name + parameter_values.shape, tosa_dtype, parameter_values, name=tosa_arg.name ) @@ -142,7 +147,11 @@ def process_inputs_to_buffers( ) from e buffer_data = get_buffer(edge_program, node) - assert isinstance(buffer_data, torch.Tensor), "Expect Attr to be tensor" + if not isinstance(buffer_data, torch.Tensor): + raise TypeError( + f"Expected buffer '{node.name}' to be a torch.Tensor, got " + f"{type(buffer_data).__name__}" + ) buffer_values = buffer_data.detach().numpy() # TODO: fragile code for temporary fix @@ -183,8 +192,12 @@ def process_placeholder( tosa_spec: TosaSpecification, ): """Wrapper for processing and serializing all types of placeholders""" - assert node.name == node.target, "Expect placeholder name and target to match" - assert 0 == len(node.args), "Can't handle 
default input values" + if node.name != node.target: + raise ValueError( + f"Placeholder name '{node.name}' does not match target '{node.target}'" + ) + if len(node.args) != 0: + raise ValueError(f"Placeholder '{node.name}' must not have default values") if node.name in edge_program.graph_signature.user_inputs: process_inputs(node, tosa_graph, tosa_spec) diff --git a/backends/arm/quantizer/__init__.py b/backends/arm/quantizer/__init__.py index 5cb5c834a98..e36c683416a 100644 --- a/backends/arm/quantizer/__init__.py +++ b/backends/arm/quantizer/__init__.py @@ -2,7 +2,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Expose quantizer APIs and load optional quantized kernels. +Import the public quantizer classes and configuration helpers for Arm +backends. Attempt to load portable and quantized libraries; fall back to a +log message if unavailable. +""" from .quantization_config import QuantizationConfig # noqa # usort: skip from .arm_quantizer import ( # noqa diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index 838dd44733e..90876386aa6 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -6,10 +6,12 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe +"""Provide utilities for quantization annotations. -# -# Utility functions for TOSAQuantizer -# +Use these helpers to check and mark annotation state when working with +``QuantizationAnnotation`` entries in FX node metadata. + +""" from typing import cast @@ -20,7 +22,15 @@ def is_annotated(node: Node) -> bool: - """Given a node return whether the node is annotated.""" + """Return True if the node is annotated. + + Args: + node (Node): FX node to inspect. + + Returns: + bool: True if ``Q_ANNOTATION_KEY`` exists and ``_annotated`` is set. 
+ + """ return ( Q_ANNOTATION_KEY in node.meta and cast(QuantizationAnnotation, node.meta[Q_ANNOTATION_KEY])._annotated @@ -28,7 +38,15 @@ def is_annotated(node: Node) -> bool: def is_output_annotated(node: Node) -> bool: - """Given a node, return whether the output of the node is annotated.""" + """Return True if the node's output is annotated. + + Args: + node (Node): FX node to inspect. + + Returns: + bool: True if annotated and an output qspec is present. + + """ if Q_ANNOTATION_KEY in node.meta: annotation = cast(QuantizationAnnotation, node.meta[Q_ANNOTATION_KEY]) return annotation._annotated and annotation.output_qspec is not None @@ -37,8 +55,14 @@ def is_output_annotated(node: Node) -> bool: def mark_node_as_annotated(node: Node) -> None: - """Marks node as annotated. If needed, an empty QuantizationAnnotation is added - to the quantization_annotation node meta entry. + """Mark a node as annotated. + + Create an empty ``QuantizationAnnotation`` on the node when missing and set + its ``_annotated`` flag to True. + + Args: + node (Node): FX node to update. + """ if Q_ANNOTATION_KEY not in node.meta: node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation() diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py index ff1ad50e517..349aa3e6b21 100644 --- a/backends/arm/quantizer/quantization_annotator.py +++ b/backends/arm/quantizer/quantization_annotator.py @@ -6,7 +6,7 @@ import logging import operator from dataclasses import dataclass -from typing import Callable, List, Optional, Sequence +from typing import Callable, cast, List, Optional, Sequence import torch import torch.fx @@ -137,11 +137,18 @@ def _is_large_scalar(node: Node, gm: torch.fx.GraphModule): node since histc op (in HistogramObserver) only works for values up to certain upper bound. 
""" + HISTC_UPPER_BOUND = 3.4028235e15 if node.op == "get_attr" and isinstance(node.target, str): tensor = _get_node_target(gm, node.target) # torch.histc works until this upper bound - HISTC_UPPER_BOUND = 3.4028235e15 return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND + if node.op == "call_function" and node.target in ( + torch.ops.aten.full.default, + torch.ops.aten.full, + torch.ops.aten.fill_.Scalar, + ): + fill_value = cast(float, node.args[1]) + return abs(fill_value) > HISTC_UPPER_BOUND return False @@ -358,13 +365,13 @@ def _match_pattern( torch.ops.aten.permute_copy.default, torch.ops.aten.avg_pool2d.default, torch.ops.aten.max_pool2d.default, - torch.ops.aten.full.default, - torch.ops.aten.full, torch.ops.aten.flatten.using_ints, torch.ops.aten.dropout.default, torch.ops.aten.dropout_.default, torch.ops.aten.adaptive_avg_pool2d.default, torch.ops.aten.alias_copy.default, + torch.ops.aten.pixel_shuffle.default, + torch.ops.aten.pixel_unshuffle.default, ] @@ -391,7 +398,11 @@ def any_or_hardtanh_min_zero(n: Node): torch.ops.aten.conv2d.padding, ], [torch.ops.aten.batch_norm.default, F.batch_norm], - [torch.ops.aten.relu.default, torch.ops.aten.hardtanh.default], + [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + torch.ops.aten.hardtanh.default, + ], ], filter_fn=any_or_hardtanh_min_zero, ): @@ -407,6 +418,7 @@ def any_or_hardtanh_min_zero(n: Node): ] elif node.target in ( torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, torch.ops.aten.hardtanh.default, ): quant_properties.quant_output = _QuantProperty(0, output_act_qspec) @@ -443,7 +455,11 @@ def any_or_hardtanh_min_zero(n: Node): torch.ops.aten.linear.default, torch.ops.aten.conv2d.padding, ], - [torch.ops.aten.relu.default, torch.ops.aten.hardtanh.default], + [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + torch.ops.aten.hardtanh.default, + ], ], any_or_hardtanh_min_zero, ): @@ -508,9 +524,6 @@ def any_or_hardtanh_min_zero(n: Node): ] 
quant_properties.quant_output = _QuantProperty(0, shared_qspec) # type: ignore[arg-type] elif node.target in _one_to_one_shared_input_or_input_act_qspec: - if not isinstance(node.args[0], Node): - return None - input_qspec = ( SharedQuantizationSpec(node.args[0]) # type: ignore[arg-type] if is_output_annotated(node.args[0]) # type: ignore @@ -568,7 +581,12 @@ def any_or_hardtanh_min_zero(n: Node): ), ] quant_properties.quant_output = None - elif node.target in [torch.ops.aten.scalar_tensor.default]: + elif node.target in [ + torch.ops.aten.scalar_tensor.default, + torch.ops.aten.full.default, + torch.ops.aten.full, + torch.ops.aten.fill_.Scalar, + ]: quant_properties.quant_inputs = [] quant_properties.quant_output = _QuantProperty(0, output_act_qspec) elif node.target in [operator.getitem]: @@ -625,6 +643,7 @@ def annotate_graph( # type: ignore[return] torch.ops.aten.full_like.default, torch.ops.aten.full.default, torch.ops.aten.full, + torch.ops.aten.fill_.Scalar, torch.ops.aten.scalar_tensor.default, ]: node.kwargs = {} diff --git a/backends/arm/quantizer/quantization_config.py b/backends/arm/quantizer/quantization_config.py index d5c3aab1060..7495ff22ac6 100644 --- a/backends/arm/quantizer/quantization_config.py +++ b/backends/arm/quantizer/quantization_config.py @@ -3,6 +3,13 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Provide quantization configuration helpers for the Arm backend. + +Define a small dataclass to carry activation/weight/bias specs and helper +accessors that validate specs before use. Use this module to build and validate +quantization specs consumed by the annotator. + +""" # pyre-unsafe @@ -19,13 +26,38 @@ @dataclass(eq=True, frozen=True) class QuantizationConfig: + """Provide a container for quantization specs. + + Hold optional specs for input/output activations, weights, and bias, and + expose validated accessors. 
+ + Attributes: + input_activation (QuantizationSpec | None): Spec for input activations. + output_activation (QuantizationSpec | None): Spec for output activations. + weight (QuantizationSpec | None): Spec for weights. + bias (QuantizationSpec | None): Spec for bias values. + + """ + input_activation: QuantizationSpec | None output_activation: QuantizationSpec | None weight: QuantizationSpec | None bias: QuantizationSpec | None def get_input_act_qspec(self) -> QuantizationSpec | None: - """Returns QuantizationSpec 'input_activation' after asserting that input_activation.qscheme is valid.""" + """Get the validated input activation spec. + + Validate that the input activation qscheme is supported before + returning the spec. + + Returns: + QuantizationSpec | None: Input activation spec, or ``None`` when + unset. + + Raises: + ValueError: If the qscheme is not per-tensor affine or symmetric. + + """ if self.input_activation is None: return None # Validate that input_activation uses a supported qscheme @@ -39,7 +71,19 @@ def get_input_act_qspec(self) -> QuantizationSpec | None: return self.input_activation def get_output_act_qspec(self) -> QuantizationSpec | None: - """Returns QuantizationSpec 'output_activation' after asserting that output_activation.qscheme is valid.""" + """Get the validated output activation spec. + + Validate that the output activation qscheme is supported before + returning the spec. + + Returns: + QuantizationSpec | None: Output activation spec, or ``None`` when + unset. + + Raises: + ValueError: If the qscheme is not per-tensor affine or symmetric. + + """ if self.output_activation is None: return None # Validate that output_activation uses a supported qscheme @@ -53,7 +97,18 @@ def get_output_act_qspec(self) -> QuantizationSpec | None: return self.output_activation def get_weight_qspec(self) -> QuantizationSpec | None: - """Returns QuantizationSpec 'weight' after asserting that weight.qscheme is valid.""" + """Get the validated weight spec. 
+ + Validate that the weight qscheme is supported (per-tensor or + per-channel symmetric) before returning the spec. + + Returns: + QuantizationSpec | None: Weight spec, or ``None`` when unset. + + Raises: + ValueError: If the qscheme is not a supported symmetric scheme. + + """ if self.weight is None: return None # Validate that weight uses a supported qscheme @@ -65,11 +120,46 @@ def get_weight_qspec(self) -> QuantizationSpec | None: return self.weight def get_bias_qspec(self, node: torch.fx.Node) -> QuantizationSpec | None: - """Returns QuantizationSpec 'bias' after asserting that bias.dtype is torch.float.""" + """Get the derived or validated bias spec. + + For conv/linear ops, derive bias qparams from the input/weight observers. + Otherwise, validate a user-provided floating-point bias spec. + + Args: + node (torch.fx.Node): Node whose bias spec is requested. + + Returns: + QuantizationSpec | None: Derived or provided bias spec, or ``None`` + when unset. + + Raises: + ValueError: If deriving qparams sees an unexpected number of + observers/fake-quantizers, or if a provided bias dtype is not + floating-point. + + """ def _derive_qparams_fn( obs_or_fqs: list[ObserverOrFakeQuantize], ) -> tuple[torch.Tensor, torch.Tensor]: + """Compute bias scale/zero-point from activation/weight observers. + + Expect two observers or fake-quantize modules: one for the input + activation and one for the weight. The bias scale is the product of + input and weight scales, and the zero-point is a tensor of zeros. + + Args: + obs_or_fqs (list[ObserverOrFakeQuantize]): Observers/fake-quant + in order ``[act, weight]``. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Bias scale tensor and + integer zero-point tensor. + + Raises: + ValueError: If the list does not contain exactly two items. 
+ + """ # Validate expected number of observers/fake-quantizes if len(obs_or_fqs) != 2: raise ValueError( @@ -89,29 +179,48 @@ def _derive_qparams_fn( torch.ops.aten.linear.default, torch.ops.aten.conv2d.padding, ]: - input_act = node.args[0] - weight = node.args[1] - # If the weights are quantized per_tensor, do the same with bias - qscheme = ( - torch.per_tensor_symmetric - if self.weight is None - else self.weight.qscheme - ) - ch_axis = None - if self.weight is not None: - if qscheme == torch.per_channel_symmetric: - ch_axis = self.weight.ch_axis - - quantization_spec = DerivedQuantizationSpec( - derived_from=[(input_act, node), (weight, node)], # type: ignore[list-item] - derive_qparams_fn=_derive_qparams_fn, - dtype=torch.int32, - quant_min=torch.iinfo(torch.int32).min, - quant_max=torch.iinfo(torch.int32).max - 1, - qscheme=qscheme, - ch_axis=ch_axis, - ) - return quantization_spec # type: ignore[return-value] + if self.input_activation is None or self.weight is None: + raise ValueError( + "Input activation and weight QuantizationConfig must be specified." 
+ ) + if self.input_activation.dtype == self.weight.dtype == torch.int8: + # This is the default int8 quantization which uses the derived quantization + # calculated from the activation and weight scale + input_act = node.args[0] + weight = node.args[1] + + # If the weights are quantized per_tensor, do the same with bias + qscheme = ( + torch.per_tensor_symmetric + if self.weight is None + else self.weight.qscheme + ) + ch_axis = None + if self.weight is not None: + if qscheme == torch.per_channel_symmetric: + ch_axis = self.weight.ch_axis + + quantization_spec = DerivedQuantizationSpec( + derived_from=[(input_act, node), (weight, node)], # type: ignore[list-item] + derive_qparams_fn=_derive_qparams_fn, + dtype=torch.int32, + quant_min=torch.iinfo(torch.int32).min, + quant_max=torch.iinfo(torch.int32).max - 1, + qscheme=qscheme, + ch_axis=ch_axis, + ) + return quantization_spec # type: ignore[return-value] + elif ( + self.input_activation.dtype == torch.int16 + and self.weight.dtype == torch.int8 + ): + # In case the activation is quantized to int16, the bias needs to be + # added after the convolution, so use the output quantization for this case. + return self.output_activation + else: + raise NotImplementedError( + f"Bias quantization of types: i:{self.input_activation.dtype}, w:{self.weight.dtype} not implemented" + ) if self.bias is None: return None diff --git a/backends/arm/requirements-arm-ethos-u.txt b/backends/arm/requirements-arm-ethos-u.txt index a26fb014234..9076aa08852 100644 --- a/backends/arm/requirements-arm-ethos-u.txt +++ b/backends/arm/requirements-arm-ethos-u.txt @@ -3,4 +3,4 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-ethos-u-vela == 4.4.0 +ethos-u-vela == 4.4.1 \ No newline at end of file diff --git a/backends/arm/requirements-arm-tosa.txt b/backends/arm/requirements-arm-tosa.txt index 0f9c2f702a4..16aa01a6c23 100644 --- a/backends/arm/requirements-arm-tosa.txt +++ b/backends/arm/requirements-arm-tosa.txt @@ -8,4 +8,4 @@ flatbuffers == 24.3.25 tosa-adapter-model-explorer == 0.0.1 ai-edge-model-explorer >= 0.1.16 -tosa-tools @ git+https://git.gitlab.arm.com/tosa/tosa-reference-model.git@v2025.07.0 +tosa-tools @ git+https://git.gitlab.arm.com/tosa/tosa-reference-model.git@v2025.07.1 diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index 8f63569eece..08589c34c69 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -249,15 +249,6 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { handles.inputs->io[i].elem_size); return Error::InvalidProgram; } - supported = executorch::runtime::is_contiguous_dim_order( - tensor_in.dim_order().data(), tensor_in.dim()); - if (!supported) { - ET_LOG( - Error, - "Input %d expected contiguous dim_order, but got non-contiguous dim_order", - i); - return Error::InvalidProgram; - } // Select a compatible copy routine including checking for input layouts // which require permutation. diff --git a/backends/arm/runtime/VGFSetup.cpp b/backends/arm/runtime/VGFSetup.cpp index abb4c50d8be..fa8c7ead220 100644 --- a/backends/arm/runtime/VGFSetup.cpp +++ b/backends/arm/runtime/VGFSetup.cpp @@ -24,6 +24,13 @@ namespace vgf { /* static function to map format to byte count */ static uint32_t get_format_size(VkFormat format); +// SPV_ARM_tensor does not support rank-0 representations according to the spec. +// Use an unsqueezed dimension when the resource table contains an empty +// shape. Tensors are output as rank 0 when copied back from the vgf backend. 
+namespace { +constexpr int64_t kScalarSentinelDimension = 1; +} + // Debug function to inspect memory properties static string memory_flags_to_string(VkMemoryPropertyFlags flags) { if (flags == 0) @@ -264,7 +271,11 @@ static void debug_print_resources( the_shape.size(), the_stride.size()); for (int j = 0; j < the_shape.size(); j++) { - ET_LOG(Info, " %d: dim %ld", j, the_shape[j]); + ET_LOG( + Info, + " %d: dim %lld", + j, + static_cast(the_shape[j])); } // Allocate a tensor with bound memory break; @@ -387,6 +398,7 @@ bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef specs) { // Get tensor shape and strides auto shape = resource_decoder->getTensorShape(i); auto stride = resource_decoder->getTensorStride(i); + const auto shape_size = shape.size(); switch (resource_decoder->getCategory(i)) { case vgflib::ResourceCategory::INPUT: @@ -409,9 +421,9 @@ bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef specs) { result = allocate_tensor( vk_physical, vk_device, - vgflib::ToVkFormat(resource_decoder->getVkFormat(i)), - static_cast(shape.size()), - shape.begin(), + resource_format, + shape_size == 0 ? 1 : static_cast(shape_size), + shape_size == 0 ? 
&kScalarSentinelDimension : shape.begin(), static_cast(stride.size()), stride.begin(), &tensor_description, @@ -422,8 +434,7 @@ bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef specs) { ET_LOG(Error, "Failed to allocate tensor for VGF resource %d", i); return false; } - size_t e_size = get_format_size( - vgflib::ToVkFormat(resource_decoder->getVkFormat(i))); + size_t e_size = get_format_size(resource_format); if (0 == e_size) { ET_LOG(Error, "failed to get element size of VkFormat"); return false; @@ -449,9 +460,11 @@ bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef specs) { .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM, .pNext = nullptr, .tiling = VK_TENSOR_TILING_LINEAR_ARM, - .format = vgflib::ToVkFormat(resource_decoder->getVkFormat(i)), - .dimensionCount = static_cast(shape.size()), - .pDimensions = shape.begin(), + .format = resource_format, + .dimensionCount = + shape_size == 0 ? 1 : static_cast(shape_size), + .pDimensions = + shape_size == 0 ? &kScalarSentinelDimension : shape.begin(), // Note: stride_data of 0's causes size==0, null means stride==size .pStrides = (0 == stride.size() ? 
nullptr : stride.begin()), .usage = VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM, diff --git a/backends/arm/scripts/TOSA_minimal_example.ipynb b/backends/arm/scripts/TOSA_minimal_example.ipynb index b79780c6a07..a249f03a873 100644 --- a/backends/arm/scripts/TOSA_minimal_example.ipynb +++ b/backends/arm/scripts/TOSA_minimal_example.ipynb @@ -62,7 +62,7 @@ "model = Add()\n", "model = model.eval()\n", "exported_program = torch.export.export(model, example_inputs)\n", - "graph_module = exported_program.module()\n", + "graph_module = exported_program.graph_module\n", "\n", "_ = graph_module.print_readable()" ] @@ -201,7 +201,7 @@ " config=ExecutorchBackendConfig(extract_delegate_segments=False)\n", " )\n", "\n", - "executorch_program_manager.exported_program().module().print_readable()\n", + "executorch_program_manager.exported_program().graph_module.print_readable()\n", "\n", "# Save pte file\n", "pte_name = base_name + \".pte\"\n", diff --git a/backends/arm/scripts/build_executor_runner_vkml.sh b/backends/arm/scripts/build_executor_runner_vkml.sh index 1df63acc425..afca02c6299 100755 --- a/backends/arm/scripts/build_executor_runner_vkml.sh +++ b/backends/arm/scripts/build_executor_runner_vkml.sh @@ -69,6 +69,7 @@ cmake \ -DCMAKE_BUILD_TYPE=${build_type} \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ diff --git a/backends/arm/scripts/mlsdk_utils.sh b/backends/arm/scripts/mlsdk_utils.sh index 7a7d2585e52..2c6553df3d3 100755 --- a/backends/arm/scripts/mlsdk_utils.sh +++ b/backends/arm/scripts/mlsdk_utils.sh @@ -38,6 +38,28 @@ function download_ai_mlsdk_manifest() { --manifest-url ${mlsdk_manifest_url} \ --manifest-branch ${mlsdk_manifest_tag} \ -g model-converter,emulation-layer,vgf-library + +# Update dependencies to use gitlab tosa-mlir-translator +# Do not indent the 
xml. Heredoc indentation is significant. +mkdir -p .repo/local_manifests/ +cat > ".repo/local_manifests/tosa_gitlab.xml" <<'XML' + + + + + + + + + +XML + ./repo sync -j$(nproc) popd @@ -109,7 +131,7 @@ function setup_mlsdk() { -DSPIRV_TOOLS_PATH=../../dependencies/SPIRV-Tools \ -DVULKAN_HEADERS_PATH=../../dependencies/Vulkan-Headers - cmake --build build + cmake --build build -j$(nproc) cmake --install build --prefix deploy popd fi diff --git a/backends/arm/scripts/parse_test_names.py b/backends/arm/scripts/parse_test_names.py index c6eaafa597b..54f8aa7421d 100644 --- a/backends/arm/scripts/parse_test_names.py +++ b/backends/arm/scripts/parse_test_names.py @@ -26,6 +26,8 @@ "_native_batch_norm_legit_no_training.default", "_native_batch_norm_legit.no_stats", "alias_copy.default", + "pixel_shuffle.default", + "pixel_unshuffle.default", ] ALL_EDGE_OPS = SAMPLE_INPUT.keys() | CUSTOM_EDGE_OPS @@ -95,6 +97,9 @@ def parse_test_name( op = op.removesuffix("_1d") op = op.removesuffix("_2d") + # Remove suffix for 16 bit activation and 8 bit weight test cases + op = op.removesuffix("_16a8w") + assert target != "None", f"{test_name} does not contain one of {TARGETS}" assert ( op in op_name_map.keys() diff --git a/backends/arm/scripts/run_fvp.sh b/backends/arm/scripts/run_fvp.sh index 0f76d0496de..5d3088c865a 100755 --- a/backends/arm/scripts/run_fvp.sh +++ b/backends/arm/scripts/run_fvp.sh @@ -22,6 +22,7 @@ data_file="" target="ethos-u55-128" timeout="600" etrecord_file="" +trace_file="" help() { echo "Usage: $(basename $0) [options]" @@ -31,6 +32,7 @@ help() { echo " --target= Target to build and run for Default: ${target}" echo " --timeout= Maximum target runtime, used to detect hanging, might need to be higer on large models Default: ${timeout}" echo " --etrecord= If ETDump is used you can supply a ETRecord file matching the PTE" + echo " --trace_file= File to write PMU trace output to" exit 0 } @@ -42,6 +44,7 @@ for arg in "$@"; do --target=*) target="${arg#*=}";; 
--timeout=*) timeout="${arg#*=}";; --etrecord=*) etrecord_file="${arg#*=}";; + --trace_file=*) trace_file="${arg#*=}";; *) ;; esac @@ -86,6 +89,14 @@ fi log_file=$(mktemp) +extra_args_u55=() +extra_args_u85=() + +if [[ -n "${trace_file}" ]]; then + extra_args_u55+=(-C "ethosu.extra_args=--pmu-trace ${trace_file}") + extra_args_u85+=(-C "mps4_board.subsystem.ethosu.extra_args=--pmu-trace ${trace_file}") +fi + if [[ ${target} == *"ethos-u55"* ]]; then ${nobuf} ${fvp_model} \ -C ethosu.num_macs=${num_macs} \ @@ -93,6 +104,7 @@ if [[ ${target} == *"ethos-u55"* ]]; then -C mps3_board.telnetterminal0.start_telnet=0 \ -C mps3_board.uart0.out_file='-' \ -C mps3_board.uart0.shutdown_on_eot=1 \ + "${extra_args_u55[@]}" \ -a "${elf_file}" \ ${data_file} \ --timelimit ${timeout} 2>&1 | sed 's/\r$//' | tee ${log_file} || true # seconds @@ -105,6 +117,7 @@ elif [[ ${target} == *"ethos-u85"* ]]; then -C mps4_board.telnetterminal0.start_telnet=0 \ -C mps4_board.uart0.out_file='-' \ -C mps4_board.uart0.shutdown_on_eot=1 \ + "${extra_args_u85[@]}" \ -a "${elf_file}" \ ${data_file} \ --timelimit ${timeout} 2>&1 | sed 's/\r$//' | tee ${log_file} || true # seconds diff --git a/backends/arm/test/TARGETS b/backends/arm/test/TARGETS index ec35b63f8f6..fd7d894fbf0 100644 --- a/backends/arm/test/TARGETS +++ b/backends/arm/test/TARGETS @@ -1,3 +1,8 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load(":targets.bzl", "define_arm_tests") @@ -58,6 +63,7 @@ runtime.python_library( "//executorch/backends/arm/quantizer:lib", "//executorch/backends/arm/tosa:mapping", "//executorch/backends/arm:vgf", + "//executorch/backends/arm:_factory", "//executorch/devtools/backend_debug:delegation_info", "//executorch/exir/backend:operator_support", "fbsource//third-party/pypi/tabulate:tabulate", diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index 963084d6091..3b5dd8bd4db 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -14,6 +14,7 @@ import pytest from executorch.backends.arm.ethosu import EthosUCompileSpec + from executorch.backends.arm.test.runner_utils import ( arm_executor_runner_exists, corstone300_installed, @@ -226,6 +227,7 @@ def parametrize( test_data: dict[str, Any], xfails: dict[str, xfail_type] | None = None, strict: bool = True, + flakies: dict[str, int] | None = None, ): """ Custom version of pytest.mark.parametrize with some syntatic sugar and added xfail functionality @@ -236,12 +238,17 @@ def parametrize( """ if xfails is None: xfails = {} + if flakies is None: + flakies = {} def decorator_func(func): """Test data is transformed from a dict of (id, data) pairs to a list of pytest params to work with the native pytests parametrize function""" pytest_testsuite = [] for id, test_parameters in test_data.items(): - if id in xfails: + if id in flakies: + # Mark this parameter as flaky with given reruns + marker = (pytest.mark.flaky(reruns=flakies[id]),) + elif id in xfails: xfail_info = xfails[id] reason = "" raises = None diff --git a/backends/arm/test/conftest.py b/backends/arm/test/conftest.py index 6fc9e7e5adc..0060bf0ea63 100644 --- a/backends/arm/test/conftest.py +++ b/backends/arm/test/conftest.py @@ -118,7 +118,7 @@ def is_option_enabled(option: str, fail_if_not_enabled: bool = False) -> bool: a RuntimeError instead of returning 
False. """ - if option in pytest._test_options and pytest._test_options[option]: # type: ignore[attr-defined] + if hasattr(pytest, "_test_options") and option in pytest._test_options and pytest._test_options[option]: # type: ignore[attr-defined] return True else: if fail_if_not_enabled: diff --git a/backends/arm/test/misc/test_conv_relu_residual_add.py b/backends/arm/test/misc/test_conv_relu_residual_add.py index fdd6ec972a6..d88a9c74b7c 100644 --- a/backends/arm/test/misc/test_conv_relu_residual_add.py +++ b/backends/arm/test/misc/test_conv_relu_residual_add.py @@ -85,7 +85,6 @@ def test_tosa_u55_INT(per_channel_quantization): model_inputs, [], [], - run_on_fvp=True, use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, qtol=0, @@ -102,7 +101,6 @@ def test_tosa_u85_INT(per_channel_quantization): model_inputs, [], [], - run_on_fvp=True, use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, qtol=0, diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index 3796d3dce4a..c2f28f4e9d8 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -262,9 +262,10 @@ def forward(self, x): @common.parametrize("test_data", Add.inputs) +@common.XfailIfNoCorstone300 def test_fail_dump_tosa_ops(caplog, test_data: input_t1): pipeline = EthosU55PipelineINT[input_t1]( - Add(), test_data, [], [], use_to_edge_transform_and_lower=True, run_on_fvp=False + Add(), test_data, [], [], use_to_edge_transform_and_lower=True ) pipeline.dump_operator_distribution("to_edge_transform_and_lower") pipeline.run() diff --git a/backends/arm/test/misc/test_dim_order.py b/backends/arm/test/misc/test_dim_order.py new file mode 100644 index 00000000000..6b0b79add99 --- /dev/null +++ b/backends/arm/test/misc/test_dim_order.py @@ -0,0 +1,123 @@ +# Copyright 2024-2025 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineINT, + EthosU85PipelineINT, + TosaPipelineFP, + TosaPipelineINT, +) + + +input_t1 = Tuple[torch.Tensor] # Input x + + +class ChannelsLastInput(torch.nn.Module): + """ + Test a complex case with (channels last, channels first) input, + and (channels first, channels last) output. + """ + + inputs: input_t1 = ( + torch.arange(1, 25, dtype=torch.float32) + .reshape((1, 2, 3, 4)) + .to(memory_format=torch.channels_last), + torch.arange(1, 25, dtype=torch.float32).reshape((1, 2, 3, 4)), + ) + + def forward(self, x, y): + x = x * x + return y, x + + +class ChannelsFirstOutput(torch.nn.Module): + """ + Test coverting to channels_first inside the delegate. + """ + + inputs: input_t1 = ( + torch.arange(1, 25, dtype=torch.float32) + .reshape((1, 2, 3, 4)) + .to(memory_format=torch.channels_last), + ) + + def forward(self, x): + x = x.clone(memory_format=torch.contiguous_format) * x + return x + + +class ChannelsLastOutput(torch.nn.Module): + """ + Test changing of dim_order inside the delegate. + """ + + inputs: input_t1 = (torch.arange(1, 9, dtype=torch.float32).reshape((1, 2, 2, 2)),) + + def forward(self, x): + x = x * x + x = x.clone(memory_format=torch.channels_last) + return x + + +class ChannelsLastInsidePartition(torch.nn.Module): + """ + Test dim_order changes inside the partiton, but no dim_order changes at input/output. 
+ """ + + inputs: input_t1 = (torch.randn((1, 2, 3, 3)),) + + def __init__(self): + super().__init__() + self.conv2d = torch.nn.Conv2d(in_channels=2, out_channels=2, kernel_size=(3, 3)) + + def forward(self, x): + return ( + self.conv2d(x.clone(memory_format=torch.channels_last)).clone( + memory_format=torch.contiguous_format + ) + * 1 + ) + + +test_modules = { + "channels_last_input": ChannelsLastInput, + "channels_first_output": ChannelsFirstOutput, + "channels_last_output": ChannelsLastOutput, + "channels_last_inside_partition": ChannelsLastInsidePartition, +} + + +@common.parametrize("module", test_modules) +def test_dim_order_tosa_FP(module): + pipeline = TosaPipelineFP[input_t1](module(), module.inputs, []) + pipeline.run() + + +@common.parametrize("module", test_modules) +def test_dim_order_tosa_INT(module): + pipeline = TosaPipelineINT[input_t1]( + module(), module.inputs, [], symmetric_io_quantization=True + ) + pipeline.run() + + +@common.XfailIfNoCorstone300 +@common.parametrize("module", test_modules) +def test_dim_order_u55_INT(module): + pipeline = EthosU55PipelineINT[input_t1](module(), module.inputs, []) + pipeline.run() + + +@common.XfailIfNoCorstone320 +@common.parametrize("module", test_modules) +def test_dim_order_u85_INT(module): + pipeline = EthosU85PipelineINT[input_t1](module(), module.inputs, []) + pipeline.run() diff --git a/backends/arm/test/misc/test_dim_order_guards.py b/backends/arm/test/misc/test_dim_order_guards.py deleted file mode 100644 index 80a3c014abc..00000000000 --- a/backends/arm/test/misc/test_dim_order_guards.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2024-2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- - -from typing import Tuple - -import pytest - -import torch -from executorch.backends.arm.test import common - -from executorch.backends.arm.test.tester.test_pipeline import ( - TosaPipelineFP, - TosaPipelineINT, -) - - -input_t1 = Tuple[torch.Tensor] # Input x - - -class Conv2D(torch.nn.Module): - inputs: dict[str, input_t1] = { - "randn": (torch.randn(1, 2, 20, 20).to(memory_format=torch.channels_last),), - } - - def __init__(self): - super().__init__() - self.conv2d = torch.nn.Conv2d(in_channels=2, out_channels=3, kernel_size=(3, 3)) - - def forward(self, x): - return self.conv2d(x) - - -@common.parametrize("test_data", Conv2D.inputs) -def test_tosa_FP_pipeline(test_data: input_t1): - module = Conv2D() - pipeline = TosaPipelineFP[input_t1]( - module, - test_data, - [], - [], - use_to_edge_transform_and_lower=False, - ) - pos = pipeline.find_pos("partition") - pipeline._stages = pipeline._stages[:pos] - pipeline.run() - with pytest.raises(RuntimeError): - pipeline.tester.partition() - - -@common.parametrize("test_data", Conv2D.inputs) -def test_tosa_INT_pipeline(test_data: input_t1): - module = Conv2D() - pipeline = TosaPipelineINT[input_t1]( - module, - test_data, - [], - [], - use_to_edge_transform_and_lower=False, - ) - pos = pipeline.find_pos("partition") - pipeline._stages = pipeline._stages[:pos] - pipeline.run() - with pytest.raises(RuntimeError): - pipeline.tester.partition() diff --git a/backends/arm/test/misc/test_multiple_delegates.py b/backends/arm/test/misc/test_multiple_delegates.py index f716bc45385..8dad25f4180 100644 --- a/backends/arm/test/misc/test_multiple_delegates.py +++ b/backends/arm/test/misc/test_multiple_delegates.py @@ -23,7 +23,7 @@ class MultipleDelegatesModule(torch.nn.Module): def forward(self, x: torch.Tensor, y: torch.Tensor): z = x + y - s = torch.tan(z) + s = torch.max(z) return s * z diff --git a/backends/arm/test/misc/test_pass_required_order.py b/backends/arm/test/misc/test_pass_required_order.py new file mode 100644 
index 00000000000..2745d25a498 --- /dev/null +++ b/backends/arm/test/misc/test_pass_required_order.py @@ -0,0 +1,95 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import re +from typing import List, Set, Type + +import pytest +from executorch.backends.arm._passes.arm_pass_manager import ArmPass, ArmPassManager +from executorch.backends.arm.tosa.specification import TosaSpecification +from executorch.exir.pass_base import ExportPass + + +class PassC(ArmPass): + _passes_required_after: Set[Type[ExportPass]] = set() + + +class PassB(ArmPass): + _passes_required_after = {PassC} + + +class PassA(ArmPass): + _passes_required_after = {PassB, PassC} + + +class IndependentPass(ArmPass): + _passes_required_after: Set[Type[ExportPass]] = set() + + +def _setup_pass_manager(passes: List[ArmPass] | None = None): + tosa_spec = TosaSpecification.create_from_string("TOSA-1.00+INT") + pass_manager = ArmPassManager(tosa_spec) + if passes is not None: + for p in passes: + pass_manager.add_pass(p) + return pass_manager + + +def test_no_passes(): + pass_manager = _setup_pass_manager() + pass_manager.validate_constraints_mandatory() + + +def test_correct_order(): + pass_manager = _setup_pass_manager([PassA(), PassB(), PassC()]) + pass_manager.validate_constraints_mandatory() + + +def test_run_pass_twice(): + pass_manager = _setup_pass_manager([PassA(), PassB(), PassB(), PassC()]) + pass_manager.validate_constraints_mandatory() + + +def test_independent_pass(): + pass_manager = _setup_pass_manager( + [ + IndependentPass(), + PassA(), + IndependentPass(), + PassB(), + IndependentPass(), + PassC(), + IndependentPass(), + ] + ) + pass_manager.validate_constraints_mandatory() + + +def test_duplicated_requiring_pass_put_last(): + error_msg = """The following constraints for passes are not met: + - PassC must run after PassB +""" + pass_manager = 
_setup_pass_manager([PassA(), PassB(), PassC(), PassB()]) + with pytest.raises(RuntimeError, match=re.escape(error_msg)): + pass_manager.validate_constraints_mandatory() + + +def test_two_passes_wrong_order(): + error_msg = """The following constraints for passes are not met: + - PassC must run after PassB +""" + pass_manager = _setup_pass_manager([PassC(), PassB()]) + with pytest.raises(RuntimeError, match=re.escape(error_msg)): + pass_manager.validate_constraints_mandatory() + + +def test_missing_passes(): + error_msg = """The following constraints for passes are not met: + - PassC must run after PassA + - PassC must run after PassB +""" + pass_manager = _setup_pass_manager([PassA(), PassB()]) + with pytest.raises(RuntimeError, match=re.escape(error_msg)): + pass_manager.validate_constraints_mandatory() diff --git a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py index 0e99f3f5bfa..fad31b57537 100644 --- a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py +++ b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py @@ -4,9 +4,8 @@ # LICENSE file in the root directory of this source tree. -import unittest +from typing import Tuple -import pytest import torch from executorch.backends.arm._passes import ( ConvertInt64ConstOpsToInt32Pass, @@ -18,26 +17,41 @@ from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import ( CLIP_text_encoder_config, ) -from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) from transformers import CLIPTextModelWithProjection +input_t = Tuple[torch.Tensor] + -class TestCLIPTextModelWithProjection(unittest.TestCase): +class TestCLIPTextModelWithProjection: """ Test class of CLIPTextModelWithProjection. 
CLIPTextModelWithProjection is one of the text_encoder used by Stable Diffusion 3.5 Medium """ - # Adjust nbr below as we increase op support. Note: most of the delegates - # calls are directly consecutive to each other in the .pte. The reason - # for that is some assert ops are removed by passes in the - # .to_executorch step, i.e. after Arm partitioner. - ops_after_partitioner = { + # Adjust nbr below as we increase op support. + ops_after_partitioner_FP = { "executorch_exir_dialects_edge__ops_aten_argmax_default": 1, "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2, "torch.ops.higher_order.executorch_call_delegate": 2, } + ops_after_partitioner_INT = { + "executorch_exir_dialects_edge__ops_aten_argmax_default": 1, + "executorch_exir_dialects_edge__ops_aten_full_default": 1, + "executorch_exir_dialects_edge__ops_aten_index_select_default": 1, + "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1, + "executorch_exir_dialects_edge__ops_aten_where_self": 1, + "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2, + "torch.ops.aten.scalar_tensor.default": 1, + "torch.ops.higher_order.executorch_call_delegate": 2, + } + def _prepare_inputs( self, batch_size=12, @@ -61,46 +75,93 @@ def prepare_model_and_inputs(self): return text_encoder_model, text_encoder_model_inputs - def test_CLIPTextModelWithProjection_tosa_FP(self): - text_encoder_model, text_encoder_model_inputs = self.prepare_model_and_inputs() - with torch.no_grad(): - ( - ArmTester( - text_encoder_model, - example_inputs=text_encoder_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"), - transform_passes=[ - ConvertInt64ConstOpsToInt32Pass(), - ConvertInt64OutputOpsToInt32Pass(), - InsertInt32CastsAfterInt64PlaceholdersPass(), - ], - ) - .export() - .to_edge_transform_and_lower() - .dump_operator_distribution() - 
.check_count(self.ops_after_partitioner) - .to_executorch() - .run_method_and_compare_outputs( - inputs=text_encoder_model_inputs, - ) - ) - - @pytest.mark.xfail(raises=AssertionError, reason="Output difference.") - def test_CLIPTextModelWithProjection_tosa_INT(self): - text_encoder_model, text_encoder_model_inputs = self.prepare_model_and_inputs() - with torch.no_grad(): - ( - ArmTester( - text_encoder_model, - example_inputs=text_encoder_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"), - ) - .quantize() - .export() - .to_edge_transform_and_lower() - .dump_operator_distribution() - .to_executorch() - .run_method_and_compare_outputs( - inputs=text_encoder_model_inputs, - ) - ) + +def test_CLIPTextModelWithProjection_tosa_FP(): + text_encoder_model, text_encoder_model_inputs = ( + TestCLIPTextModelWithProjection().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineFP[input_t]( + text_encoder_model, + text_encoder_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + transform_passes=[ + ConvertInt64ConstOpsToInt32Pass(), + ConvertInt64OutputOpsToInt32Pass(), + InsertInt32CastsAfterInt64PlaceholdersPass(), + ], + ) + pipeline.change_args( + "check_count.exir", TestCLIPTextModelWithProjection.ops_after_partitioner_FP + ) + pipeline.run() + + +def test_CLIPTextModelWithProjection_tosa_INT(): + text_encoder_model, text_encoder_model_inputs = ( + TestCLIPTextModelWithProjection().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineINT[input_t]( + text_encoder_model, + text_encoder_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + atol=0.8, + ) + pipeline.change_args( + "check_count.exir", + TestCLIPTextModelWithProjection.ops_after_partitioner_INT, + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_CLIPTextModelWithProjection_vgf_FP(): + text_encoder_model, text_encoder_model_inputs = ( + 
TestCLIPTextModelWithProjection().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + text_encoder_model, + text_encoder_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+FP", + use_to_edge_transform_and_lower=True, + atol=4, # TODO: Investiage numerical issue: MAX Diff ~50% + transform_passes=[ + ConvertInt64ConstOpsToInt32Pass(), + ConvertInt64OutputOpsToInt32Pass(), + InsertInt32CastsAfterInt64PlaceholdersPass(), + ], + ) + pipeline.change_args( + "check_count.exir", TestCLIPTextModelWithProjection.ops_after_partitioner_FP + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_CLIPTextModelWithProjection_vgf_INT(): + text_encoder_model, text_encoder_model_inputs = ( + TestCLIPTextModelWithProjection().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + text_encoder_model, + text_encoder_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+INT", + use_to_edge_transform_and_lower=True, + atol=0.8, + ) + pipeline.change_args( + "check_count.exir", + TestCLIPTextModelWithProjection.ops_after_partitioner_INT, + ) + pipeline.run() diff --git a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py index f9d814d044b..9506fe727db 100644 --- a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py +++ b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py @@ -4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. 
-import unittest +from typing import Tuple import torch from diffusers.models.transformers import SD3Transformer2DModel @@ -13,10 +13,16 @@ from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import ( SD3Transformer2DModel_init_dict, ) -from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) + +input_t4 = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] -class TestSD3Transformer2DModel(unittest.TestCase): +class TestSD3Transformer2DModel: """ Test class of AutoenSD3Transformer2DModelcoderKL. SD3Transformer2DModel is the transformer model used by Stable Diffusion 3.5 Medium @@ -24,16 +30,12 @@ class TestSD3Transformer2DModel(unittest.TestCase): # Adjust nbr below as we increase op support. ops_after_partitioner_FP = { - "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1, "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 1, - "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2, "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 1, "torch.ops.higher_order.executorch_call_delegate": 1, } ops_after_partitioner_INT = { - "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1, - "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2, "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2, "torch.ops.higher_order.executorch_call_delegate": 2, } @@ -93,48 +95,88 @@ def forward(self, *args, **kwargs): return sd35_transformer2D_model, sd35_transformer2D_model_inputs - def test_SD3Transformer2DModel_tosa_FP(self): - sd35_transformer2D_model, sd35_transformer2D_model_inputs = ( - self.prepare_model_and_inputs() - ) - with torch.no_grad(): - ( - ArmTester( - sd35_transformer2D_model, - example_inputs=sd35_transformer2D_model_inputs, - 
compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"), - ) - .export() - .to_edge_transform_and_lower() - .check_count(self.ops_after_partitioner_FP) - .to_executorch() - .run_method_and_compare_outputs( - inputs=sd35_transformer2D_model_inputs, - rtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT - atol=4.0, - ) - ) - def test_SD3Transformer2DModel_tosa_INT(self): - sd35_transformer2D_model, sd35_transformer2D_model_inputs = ( - self.prepare_model_and_inputs() +def test_SD3Transformer2DModel_tosa_FP(): + sd35_transformer2D_model, sd35_transformer2D_model_inputs = ( + TestSD3Transformer2DModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineFP[input_t4]( + sd35_transformer2D_model, + sd35_transformer2D_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + rtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT + atol=4.0, ) - with torch.no_grad(): - ( - ArmTester( - sd35_transformer2D_model, - example_inputs=sd35_transformer2D_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"), - ) - .quantize() - .export() - .to_edge_transform_and_lower() - .check_count(self.ops_after_partitioner_INT) - .to_executorch() - .run_method_and_compare_outputs( - inputs=sd35_transformer2D_model_inputs, - qtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT - rtol=1.0, - atol=4.0, - ) - ) + pipeline.change_args( + "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_FP + ) + pipeline.run() + + +def test_SD3Transformer2DModel_tosa_INT(): + sd35_transformer2D_model, sd35_transformer2D_model_inputs = ( + TestSD3Transformer2DModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineINT[input_t4]( + sd35_transformer2D_model, + sd35_transformer2D_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + 
qtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT + rtol=1.0, + atol=4.0, + ) + pipeline.change_args( + "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_INT + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_SD3Transformer2DModel_vgf_FP(): + sd35_transformer2D_model, sd35_transformer2D_model_inputs = ( + TestSD3Transformer2DModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t4]( + sd35_transformer2D_model, + sd35_transformer2D_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+FP", + use_to_edge_transform_and_lower=True, + rtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT + atol=4.0, + ) + pipeline.change_args( + "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_FP + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_SD3Transformer2DModel_vgf_INT(): + sd35_transformer2D_model, sd35_transformer2D_model_inputs = ( + TestSD3Transformer2DModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t4]( + sd35_transformer2D_model, + sd35_transformer2D_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+INT", + use_to_edge_transform_and_lower=True, + qtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT + rtol=1.0, + atol=4.0, + ) + pipeline.change_args( + "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_INT + ) + pipeline.run() diff --git a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py index 22a47042eb1..20b92e4a258 100644 --- a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py +++ b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py @@ -4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. 
-import unittest +from typing import Tuple import torch from executorch.backends.arm._passes import ( @@ -17,11 +17,17 @@ from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import ( T5_encoder_config, ) -from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) from transformers import T5EncoderModel +input_t = Tuple[torch.Tensor] + -class TestT5EncoderModel(unittest.TestCase): +class TestT5EncoderModel: """ Test class of T5EncoderModel. T5EncoderModel is one of the text_encoder used by Stable Diffusion 3.5 Medium @@ -61,46 +67,88 @@ def prepare_model_and_inputs(self): return t5_encoder_model, t5_encoder_model_inputs - def test_T5EncoderModel_tosa_FP(self): - t5_encoder_model, t5_encoder_model_inputs = self.prepare_model_and_inputs() - with torch.no_grad(): - ( - ArmTester( - t5_encoder_model, - example_inputs=t5_encoder_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"), - transform_passes=[ - ConvertInt64ConstOpsToInt32Pass(), - ConvertInt64OutputOpsToInt32Pass(), - InsertInt32CastsAfterInt64PlaceholdersPass(), - ], - ) - .export() - .to_edge_transform_and_lower() - .dump_operator_distribution() - .check_count(self.ops_after_partitioner_FP) - .to_executorch() - .run_method_and_compare_outputs( - inputs=t5_encoder_model_inputs, - ) - ) - - def test_T5EncoderModel_tosa_INT(self): - t5_encoder_model, t5_encoder_model_inputs = self.prepare_model_and_inputs() - with torch.no_grad(): - ( - ArmTester( - t5_encoder_model, - example_inputs=t5_encoder_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"), - ) - .quantize() - .export() - .to_edge_transform_and_lower() - .dump_operator_distribution() - .check_count(self.ops_after_partitioner_INT) - .to_executorch() - .run_method_and_compare_outputs( - inputs=t5_encoder_model_inputs, - ) - ) + 
+def test_T5EncoderModel_tosa_FP(): + t5_encoder_model, t5_encoder_model_inputs = ( + TestT5EncoderModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineFP[input_t]( + t5_encoder_model, + t5_encoder_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + transform_passes=[ + ConvertInt64ConstOpsToInt32Pass(), + ConvertInt64OutputOpsToInt32Pass(), + InsertInt32CastsAfterInt64PlaceholdersPass(), + ], + ) + pipeline.change_args( + "check_count.exir", TestT5EncoderModel.ops_after_partitioner_FP + ) + pipeline.run() + + +def test_T5EncoderModel_tosa_INT(): + t5_encoder_model, t5_encoder_model_inputs = ( + TestT5EncoderModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineINT[input_t]( + t5_encoder_model, + t5_encoder_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + ) + pipeline.change_args( + "check_count.exir", TestT5EncoderModel.ops_after_partitioner_INT + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_T5EncoderModel_vgf_FP(): + t5_encoder_model, t5_encoder_model_inputs = ( + TestT5EncoderModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + t5_encoder_model, + t5_encoder_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+FP", + use_to_edge_transform_and_lower=True, + transform_passes=[ + ConvertInt64ConstOpsToInt32Pass(), + ConvertInt64OutputOpsToInt32Pass(), + InsertInt32CastsAfterInt64PlaceholdersPass(), + ], + ) + pipeline.change_args( + "check_count.exir", TestT5EncoderModel.ops_after_partitioner_FP + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_T5EncoderModel_vgf_INT(): + t5_encoder_model, t5_encoder_model_inputs = ( + TestT5EncoderModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + t5_encoder_model, + t5_encoder_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+INT", + 
use_to_edge_transform_and_lower=True, + ) + pipeline.change_args( + "check_count.exir", TestT5EncoderModel.ops_after_partitioner_INT + ) + pipeline.run() diff --git a/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py b/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py index ab0f4892fb8..a3c3a018131 100644 --- a/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py +++ b/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py @@ -4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. -import unittest +from typing import Tuple import torch from diffusers.models.autoencoders import AutoencoderKL @@ -14,10 +14,16 @@ from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import ( AutoencoderKL_config, ) -from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) + +input_t = Tuple[torch.Tensor] -class TestAutoencoderKL(unittest.TestCase): +class TestAutoencoderKL: """ Test class of AutoencoderKL. 
AutoencoderKL is the encoder/decoder used by Stable Diffusion 3.5 Medium @@ -41,40 +47,68 @@ def forward(self, *args, **kwargs): return auto_encoder_model, auto_encoder_model_inputs - def test_AutoencoderKL_tosa_FP(self): - auto_encoder_model, auto_encoder_model_inputs = self.prepare_model_and_inputs() - with torch.no_grad(): - ( - ArmTester( - auto_encoder_model, - example_inputs=auto_encoder_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"), - ) - .export() - .to_edge_transform_and_lower() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs( - inputs=auto_encoder_model_inputs, - ) - ) - - def test_AutoencoderKL_tosa_INT(self): - auto_encoder_model, auto_encoder_model_inputs = self.prepare_model_and_inputs() - with torch.no_grad(): - ( - ArmTester( - auto_encoder_model, - example_inputs=auto_encoder_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"), - ) - .quantize() - .export() - .to_edge_transform_and_lower() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs( - inputs=auto_encoder_model_inputs, - atol=1.0, # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT - ) - ) + +def test_AutoencoderKL_tosa_FP(): + auto_encoder_model, auto_encoder_model_inputs = ( + TestAutoencoderKL().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineFP[input_t]( + auto_encoder_model, + auto_encoder_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +def test_AutoencoderKL_tosa_INT(): + auto_encoder_model, auto_encoder_model_inputs = ( + TestAutoencoderKL().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineINT[input_t]( + auto_encoder_model, + auto_encoder_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + atol=1.0, # 
TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_AutoencoderKL_vgf_FP(): + auto_encoder_model, auto_encoder_model_inputs = ( + TestAutoencoderKL().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + auto_encoder_model, + auto_encoder_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+FP", + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_AutoencoderKL_vgf_INT(): + auto_encoder_model, auto_encoder_model_inputs = ( + TestAutoencoderKL().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + auto_encoder_model, + auto_encoder_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+INT", + use_to_edge_transform_and_lower=True, + atol=1.0, # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT + ) + pipeline.run() diff --git a/backends/arm/test/models/test_conformer.py b/backends/arm/test/models/test_conformer.py index 3119145aef1..dacf14dc0e7 100644 --- a/backends/arm/test/models/test_conformer.py +++ b/backends/arm/test/models/test_conformer.py @@ -92,7 +92,6 @@ def test_conformer_u55_INT(): aten_ops=TestConformer.aten_ops, exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( "run_method_and_compare_outputs", @@ -114,7 +113,6 @@ def test_conformer_u85_INT(): aten_ops=TestConformer.aten_ops, exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( "run_method_and_compare_outputs", @@ -136,18 +134,9 @@ def test_conformer_vgf_INT(): exir_op=[], tosa_version="TOSA-1.0+INT", use_to_edge_transform_and_lower=True, + run_on_vulkan_runtime=False, # TODO: run on vulkan runtime ) pipeline.pop_stage("check_count.exir") - - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", - # 
get_test_inputs( - # TestConformer.dim, TestConformer.lengths, TestConformer.num_examples - # ), - # rtol=1.0, - # atol=3.0, - # ) pipeline.run() diff --git a/backends/arm/test/models/test_dl3_arm.py b/backends/arm/test/models/test_dl3_arm.py index 2000ac34794..c9eab58dda6 100644 --- a/backends/arm/test/models/test_dl3_arm.py +++ b/backends/arm/test/models/test_dl3_arm.py @@ -66,7 +66,6 @@ def test_dl3_u55_INT(): TestDl3.model_example_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.change_args( "run_method_and_compare_outputs", rtol=1.0, atol=1.0 @@ -82,7 +81,6 @@ def test_dl3_u85_INT(): TestDl3.model_example_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.change_args( "run_method_and_compare_outputs", rtol=1.0, atol=1.0 @@ -99,11 +97,8 @@ def test_dl3_vgf_INT(): exir_op=[], tosa_version="TOSA-1.0+INT", use_to_edge_transform_and_lower=True, + run_on_vulkan_runtime=False, # TODO: run on vulkan runtime ) - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", rtol=1.0, atol=1.0 - # ) pipeline.run() @@ -117,8 +112,4 @@ def test_dl3_vgf_FP(): tosa_version="TOSA-1.0+FP", use_to_edge_transform_and_lower=True, ) - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", rtol=1.0, atol=1.0 - # ) pipeline.run() diff --git a/backends/arm/test/models/test_inception_v3_arm.py b/backends/arm/test/models/test_inception_v3_arm.py index f973521c1fa..2cb180a87ea 100644 --- a/backends/arm/test/models/test_inception_v3_arm.py +++ b/backends/arm/test/models/test_inception_v3_arm.py @@ -66,7 +66,6 @@ def test_ic3_u55_BI(): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, atol=0.6, qtol=1, @@ -83,7 +82,6 @@ def test_ic3_u85_BI(): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, atol=0.6, qtol=1, diff --git 
a/backends/arm/test/models/test_lstm_arm.py b/backends/arm/test/models/test_lstm_arm.py index 1e63472f5f4..6ee16b6a31a 100644 --- a/backends/arm/test/models/test_lstm_arm.py +++ b/backends/arm/test/models/test_lstm_arm.py @@ -77,7 +77,6 @@ def test_lstm_u55_INT(): aten_ops=[], exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0 @@ -93,7 +92,6 @@ def test_lstm_u85_INT(): aten_ops=[], exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0 @@ -111,10 +109,6 @@ def test_lstm_vgf_INT(): tosa_version="TOSA-1.0+INT", use_to_edge_transform_and_lower=True, ) - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0 - # ) pipeline.run() @@ -128,8 +122,4 @@ def test_lstm_vgf_FP(): tosa_version="TOSA-1.0+FP", use_to_edge_transform_and_lower=True, ) - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0 - # ) pipeline.run() diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index d4e3bbc8e28..f06e1b74bbd 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -46,6 +46,23 @@ def test_mv2_tosa_FP(): pipeline.run() +def test_mv2_tosa_FP_channels_last(): + input_tensor = model_inputs[0].to(memory_format=torch.channels_last) + pipeline = TosaPipelineFP[input_t]( + mv2, + (input_tensor,), + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + ) + # Changing memory format leads to an unsupported as_strided_copy op being inserted into the graph, + # leading to a graph break. 
+ pipeline.change_args( + "check_count.exir", {"torch.ops.higher_order.executorch_call_delegate": 2} + ) + pipeline.run() + + @common.parametrize("per_channel_quantization", quant_test_data) def test_mv2_tosa_INT(per_channel_quantization): pipeline = TosaPipelineINT[input_t]( @@ -70,7 +87,6 @@ def test_mv2_u55_INT(per_channel_quantization): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, atol=0.25, @@ -88,7 +104,6 @@ def test_mv2_u85_INT(per_channel_quantization): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, atol=0.25, @@ -110,11 +125,8 @@ def test_mv2_vgf_INT(per_channel_quantization): per_channel_quantization=per_channel_quantization, atol=0.25, qtol=1, + run_on_vulkan_runtime=False, ) - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0 - # ) pipeline.run() @@ -127,9 +139,6 @@ def test_mv2_vgf_FP(): exir_op=[], tosa_version="TOSA-1.0+FP", use_to_edge_transform_and_lower=True, + run_on_vulkan_runtime=False, ) - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0 - # ) # TODO: MLETORCH-1036 decrease tolerance pipeline.run() diff --git a/backends/arm/test/models/test_mobilenet_v3_arm.py b/backends/arm/test/models/test_mobilenet_v3_arm.py index 0dcbd9757ac..f3a8f27428b 100644 --- a/backends/arm/test/models/test_mobilenet_v3_arm.py +++ b/backends/arm/test/models/test_mobilenet_v3_arm.py @@ -61,7 +61,6 @@ def test_mv3_u55_INT(): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, atol=0.5, qtol=1, @@ -77,7 +76,6 @@ def test_mv3_u85_INT(): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, 
use_to_edge_transform_and_lower=True, atol=0.5, qtol=1, diff --git a/backends/arm/test/models/test_resnet18.py b/backends/arm/test/models/test_resnet18.py index 6e965daeb8b..3cb21abd772 100644 --- a/backends/arm/test/models/test_resnet18.py +++ b/backends/arm/test/models/test_resnet18.py @@ -23,7 +23,8 @@ model = model.eval() normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) -model_inputs = (normalize(torch.randn((1, 3, 224, 224))),) +# Using torch.rand * 2 - 1 to generate numbers in the range [-1;1] like an RGB image +model_inputs = (normalize(torch.rand((1, 3, 224, 224)) * 2 - 1),) input_t = Tuple[torch.Tensor] @@ -54,7 +55,7 @@ def test_resnet_tosa_INT(per_channel_quantization): exir_op=[], use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, - atol=0.5, + atol=0.25, qtol=1, ) pipeline.run() @@ -69,10 +70,9 @@ def test_resnet_u55_INT(per_channel_quantization): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, - atol=0.5, + atol=0.25, qtol=1, ) pipeline.run() @@ -90,10 +90,9 @@ def test_resnet_u85_INT(per_channel_quantization): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, - atol=0.5, + atol=0.25, qtol=1, ) pipeline.run() diff --git a/backends/arm/test/models/test_torch_functions.py b/backends/arm/test/models/test_torch_functions.py index 580438f6da8..de45dbe0356 100644 --- a/backends/arm/test/models/test_torch_functions.py +++ b/backends/arm/test/models/test_torch_functions.py @@ -101,7 +101,6 @@ def forward(self, *args): "Requires dynamic output shape.", "topk": "NotImplementedError: No registered serialization name for found", "sort": "NotImplementedError: No registered serialization name for found", - "norm": "An error occurred when running the 'KeepDimsFalseToSqueezePass' pass after the 
following passes:", }, ) def test_torch_fns_FP(test_data): diff --git a/backends/arm/test/models/test_w2l_arm.py b/backends/arm/test/models/test_w2l_arm.py index c627cd7f887..d62d92f5fa2 100644 --- a/backends/arm/test/models/test_w2l_arm.py +++ b/backends/arm/test/models/test_w2l_arm.py @@ -91,7 +91,6 @@ def test_w2l_u55_INT(): aten_ops=[], exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.run() @@ -106,7 +105,6 @@ def test_w2l_u85_INT(): aten_ops=[], exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_abs.py b/backends/arm/test/ops/test_abs.py index 4ebcf7393c1..26495b9df3a 100644 --- a/backends/arm/test/ops/test_abs.py +++ b/backends/arm/test/ops/test_abs.py @@ -55,7 +55,10 @@ def test_abs_tosa_INT(test_data: torch.Tensor): @common.XfailIfNoCorstone300 def test_abs_u55_INT(test_data: torch.Tensor): pipeline = EthosU55PipelineINT[input_t1]( - Abs(), test_data(), aten_op, exir_op, run_on_fvp=True + Abs(), + test_data(), + aten_op, + exir_op, ) pipeline.run() @@ -64,7 +67,10 @@ def test_abs_u55_INT(test_data: torch.Tensor): @common.XfailIfNoCorstone320 def test_abs_u85_INT(test_data: torch.Tensor): pipeline = EthosU85PipelineINT[input_t1]( - Abs(), test_data(), aten_op, exir_op, run_on_fvp=True + Abs(), + test_data(), + aten_op, + exir_op, ) pipeline.run() diff --git a/backends/arm/test/ops/test_acos.py b/backends/arm/test/ops/test_acos.py index 28dadcf95be..f078f46f98e 100644 --- a/backends/arm/test/ops/test_acos.py +++ b/backends/arm/test/ops/test_acos.py @@ -4,7 +4,6 @@ # LICENSE file in the root directory of this source tree. 
from typing import Tuple -import pytest import torch from executorch.backends.arm.test import common @@ -105,10 +104,7 @@ def test_acos_vgf_FP(test_data: Tuple): tosa_version="TOSA-1.0+FP", run_on_vulkan_runtime=True, ) - try: - pipeline.run() - except FileNotFoundError as e: - pytest.skip(f"VKML executor_runner not found - not built - skip {e}") + pipeline.run() @common.parametrize("test_data", test_data_suite) @@ -122,7 +118,4 @@ def test_acos_vgf_INT(test_data: Tuple): tosa_version="TOSA-1.0+INT", run_on_vulkan_runtime=True, ) - try: - pipeline.run() - except FileNotFoundError as e: - pytest.skip(f"VKML executor_runner not found - not built - skip {e}") + pipeline.run() diff --git a/backends/arm/test/ops/test_acosh.py b/backends/arm/test/ops/test_acosh.py index 25ba2b1a83b..db0bd1c3281 100644 --- a/backends/arm/test/ops/test_acosh.py +++ b/backends/arm/test/ops/test_acosh.py @@ -87,7 +87,6 @@ def test_acosh_u55_INT_xfail(test_data: Tuple): Acosh(), (test_data(),), aten_ops=[], - run_on_fvp=False, ) pipeline.run() @@ -110,7 +109,6 @@ def test_acosh_u85_INT_xfail(test_data: Tuple): Acosh(), (test_data(),), aten_ops=[], - run_on_fvp=False, ) pipeline.run() diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 24fdfbb5457..09c9d8fa224 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -7,7 +7,6 @@ from typing import cast, Tuple -import pytest import torch from executorch.backends.arm.quantizer import arm_quantizer from executorch.backends.arm.quantizer.arm_quantizer import ( @@ -78,7 +77,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): class Add3(torch.nn.Module): def forward(self, x: torch.Tensor, y: torch.Tensor): - return x + y + return torch.add(x, y, alpha=1.5) test_data: list[input_t2] = { "3d_randn_diff_rank": lambda: (torch.randn(1, 4, 5), torch.randn(4, 1)), @@ -144,7 +143,10 @@ def test_add_tensor_tosa_INT_i32(test_data: input_t1): @common.XfailIfNoCorstone300 def 
test_add_tensor_u55_INT(test_data: input_t1): pipeline = EthosU55PipelineINT[input_t1]( - Add(), test_data(), aten_op, exir_op, run_on_fvp=True + Add(), + test_data(), + aten_op, + exir_op, ) pipeline.run() @@ -153,7 +155,10 @@ def test_add_tensor_u55_INT(test_data: input_t1): @common.XfailIfNoCorstone320 def test_add_tensor_u85_INT(test_data: input_t1): pipeline = EthosU85PipelineINT[input_t1]( - Add(), test_data(), aten_op, exir_op, run_on_fvp=True + Add(), + test_data(), + aten_op, + exir_op, ) pipeline.run() @@ -186,7 +191,10 @@ def test_add_tensor_tosa_INT_2(test_data: input_t2): @common.XfailIfNoCorstone300 def test_add_tensor_u55_INT_2(test_data: input_t2): pipeline = EthosU55PipelineINT[input_t2]( - Add2(), test_data(), aten_op, exir_op, run_on_fvp=True + Add2(), + test_data(), + aten_op, + exir_op, ) pipeline.run() @@ -195,7 +203,10 @@ def test_add_tensor_u55_INT_2(test_data: input_t2): @common.XfailIfNoCorstone320 def test_add_tensor_u85_INT_2(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( - Add2(), test_data(), aten_op, exir_op, run_on_fvp=True + Add2(), + test_data(), + aten_op, + exir_op, ) pipeline.run() @@ -211,10 +222,7 @@ def test_add_tensor_vgf_FP(test_data: input_t1): tosa_version="TOSA-1.0+FP", run_on_vulkan_runtime=True, ) - try: - pipeline.run() - except FileNotFoundError as e: - pytest.skip(f"VKML executor_runner not found - not built - skip {e}") + pipeline.run() @common.parametrize("test_data", Add.test_data) @@ -228,10 +236,7 @@ def test_add_tensor_vgf_INT(test_data: input_t1): tosa_version="TOSA-1.0+INT", run_on_vulkan_runtime=True, ) - try: - pipeline.run() - except FileNotFoundError as e: - pytest.skip(f"VKML executor_runner not found - not built - skip {e}") + pipeline.run() def get_symmetric_a16w8_add_quantizer(per_channel_quantization=False): @@ -254,9 +259,6 @@ def get_symmetric_a16w8_add_quantizer(per_channel_quantization=False): @common.parametrize("test_data", Add.test_data) -@pytest.mark.xfail( - reason="missing 
int16 add ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13730" -) def test_add_tensor_16a8w_tosa_INT(test_data: input_t1): """Test add operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -282,9 +284,6 @@ def test_add_tensor_16a8w_tosa_INT(test_data: input_t1): @common.parametrize("test_data", Add.test_data) @common.XfailIfNoCorstone300 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 add operations. See: https://github.com/pytorch/executorch/issues/13730" -) def test_add_tensor_16a8w_u55_INT16(test_data: input_t1): """Test add operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -296,7 +295,6 @@ def test_add_tensor_16a8w_u55_INT16(test_data: input_t1): exir_op, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -310,9 +308,6 @@ def test_add_tensor_16a8w_u55_INT16(test_data: input_t1): @common.parametrize("test_data", Add.test_data) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 add operations. 
See: https://github.com/pytorch/executorch/issues/13730" -) def test_add_tensor_16a8w_u85_INT16(test_data: input_t1): """Test add operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -324,7 +319,6 @@ def test_add_tensor_16a8w_u85_INT16(test_data: input_t1): exir_op, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_addmm.py b/backends/arm/test/ops/test_addmm.py index 753cb599b2b..685b69b3541 100644 --- a/backends/arm/test/ops/test_addmm.py +++ b/backends/arm/test/ops/test_addmm.py @@ -211,9 +211,6 @@ def get_symmetric_a16w8_addmm_quantizer(per_channel_quantization=False): @common.parametrize("test_data", test_data_suite) -@pytest.mark.xfail( - reason="missing int16 addmm ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13979" -) def test_addmm_16a8w_tosa_INT(test_data: input_t1): """Test addmm (FC layer) operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -253,7 +250,6 @@ def test_addmm_16a8w_u55_INT16(test_data: input_t1): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -267,9 +263,6 @@ def test_addmm_16a8w_u55_INT16(test_data: input_t1): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 addmm operations" -) def test_addmm_16a8w_u85_INT16(test_data: input_t1): """Test addmm (FC layer) operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -281,7 +274,6 @@ def test_addmm_16a8w_u85_INT16(test_data: input_t1): exir_ops=[], per_channel_quantization=per_channel_quantization, 
use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_amax.py b/backends/arm/test/ops/test_amax.py index 080dddda92e..e69e9163325 100644 --- a/backends/arm/test/ops/test_amax.py +++ b/backends/arm/test/ops/test_amax.py @@ -103,7 +103,6 @@ def test_amax_u85_INT(test_data: Amax.input_t): Amax(dim, keep_dims), data, Amax.aten_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_amin.py b/backends/arm/test/ops/test_amin.py index a24da9e1ba0..09d9018c73e 100644 --- a/backends/arm/test/ops/test_amin.py +++ b/backends/arm/test/ops/test_amin.py @@ -29,12 +29,16 @@ def __init__(self, dim, keep_dims): super().__init__() def forward(self, x): - return torch.amin(x, self.dim, self.keep_dims) + if self.dim is None: + return torch.amin(x, keepdim=self.keep_dims) + else: + return torch.amin(x, self.dim, self.keep_dims) - test_data: Dict[str, input_t] = { + test_data: Dict = { "rank_1_dim_0": lambda: ((torch.rand([10]),), 0, False), "rank_2_dim_1_keep_dims": lambda: ((torch.rand([2, 2]),), (1,), True), "rank_4_all_dim": lambda: ((torch.rand([1, 2, 5, 5]),), (0, 1, 2, 3), False), + "rank_4_no_dim": lambda: ((torch.rand([1, 2, 5, 5]),), None, False), "rank_4_0,3_keep_dims": lambda: ((torch.rand([1, 2, 2, 2]),), (0, 3), True), "rank_4_mult_batches": lambda: ((torch.rand([2, 2, 2, 2]),), (0), True), } @@ -52,7 +56,7 @@ def forward(self, x): x = torch.min(x, self.dim) return x[0] - test_data: Dict[str, input_t] = { + test_data: Dict = { "rank_1_dim_0": lambda: ((torch.rand([10]),), 0), "rank_2_dim_1": lambda: ((torch.rand([2, 2]),), 1), "rank_4_dim_2": lambda: ((torch.rand([2, 2, 2, 2]),), 2), @@ -112,7 +116,6 @@ def test_amin_u85_INT(test_data: Amin.input_t): Amin(dim, keep_dims), data, Amin.aten_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_any.py b/backends/arm/test/ops/test_any.py index ae738480048..3eccff0a64e 100644 --- 
a/backends/arm/test/ops/test_any.py +++ b/backends/arm/test/ops/test_any.py @@ -177,7 +177,6 @@ def test_any_u85_INT(test_data: input_t1): test_input(), op.aten_op, op.exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, diff --git a/backends/arm/test/ops/test_avg_pool2d.py b/backends/arm/test/ops/test_avg_pool2d.py index be54c76e68b..8310d1e40a4 100644 --- a/backends/arm/test/ops/test_avg_pool2d.py +++ b/backends/arm/test/ops/test_avg_pool2d.py @@ -151,7 +151,6 @@ def test_avg_pool2d_u55_INT(test_module): input_tensor, aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -166,7 +165,6 @@ def test_avg_pool2d_u85_INT(test_module): input_tensor, aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_batch_norm.py b/backends/arm/test/ops/test_batch_norm.py index a28180b7b57..fc5e11645dd 100644 --- a/backends/arm/test/ops/test_batch_norm.py +++ b/backends/arm/test/ops/test_batch_norm.py @@ -220,7 +220,6 @@ def test_native_batch_norm_legit_no_training_u55_INT_conv(test_data: Tuple): BatchNorm2dConv(*model_params), (test_data,), aten_ops=BatchNorm2dConv.aten_ops[0], # Bn is removed before check - run_on_fvp=True, qtol=1, ) pipeline.run() @@ -234,7 +233,6 @@ def test_native_batch_norm_legit_no_training_u85_INT_conv(test_data: Tuple): BatchNorm2dConv(*model_params), (test_data,), aten_ops=BatchNorm2dConv.aten_ops[0], # Bn is removed before check - run_on_fvp=True, qtol=1, ) pipeline.run() @@ -336,7 +334,6 @@ def test_native_batch_norm_legit_no_stats_u55_INT(test_data: Tuple): BatchNorm2dNoStats(*model_params), (test_data,), aten_op=BatchNorm2dNoStats.aten_ops, - run_on_fvp=True, qtol=1, ) pipeline.run() @@ -353,7 +350,6 @@ def test_native_batch_norm_legit_no_stats_u85_INT(test_data: Tuple): BatchNorm2dNoStats(*model_params), (test_data,), aten_op=BatchNorm2dNoStats.aten_ops, - run_on_fvp=False, qtol=1, ) pipeline.run() diff --git a/backends/arm/test/ops/test_bitwise.py b/backends/arm/test/ops/test_bitwise.py index 
218f2290cab..f9b20e5dbdd 100644 --- a/backends/arm/test/ops/test_bitwise.py +++ b/backends/arm/test/ops/test_bitwise.py @@ -235,7 +235,6 @@ def test_bitwise_and_scalar_u85_INT(test_data: input_t2): test_data(), AndScalar.aten_op, AndScalar.exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -253,7 +252,6 @@ def test_bitwise_and_tensor_u85_INT(test_data: input_t2): test_data(), And().aten_op, And().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -418,7 +416,6 @@ def test_bitwise_xor_tensor_u85_INT(test_data: input_t2): test_data(), Xor().aten_op, Xor().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -436,7 +433,6 @@ def test_bitwise_xor_scalar_u85_INT(test_data: input_t2): test_data(), XorScalar.aten_op, XorScalar.exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -601,7 +597,6 @@ def test_bitwise_or_tensor_u85_INT(test_data: input_t2): test_data(), Or().aten_op, Or().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -619,7 +614,6 @@ def test_bitwise_or_scalar_u85_INT(test_data: input_t2): test_data(), OrScalar.aten_op, OrScalar.exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, diff --git a/backends/arm/test/ops/test_bitwise_not.py b/backends/arm/test/ops/test_bitwise_not.py new file mode 100644 index 00000000000..4f48bc134ba --- /dev/null +++ b/backends/arm/test/ops/test_bitwise_not.py @@ -0,0 +1,120 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Tuple + +import torch + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU85PipelineINT, + OpNotSupportedPipeline, + TosaPipelineINT, + VgfPipeline, +) + +aten_op = "torch.ops.aten.bitwise_not.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_bitwise_not_default" + +input_t1 = Tuple[torch.Tensor] + +test_data_suite = { + "zeros": torch.zeros(1, 10, 10, 10, dtype=torch.int32), + "ones": torch.ones(10, 2, 3, dtype=torch.int8), + "pattern1_int8": 0xAA * torch.ones(1, 2, 2, 2, dtype=torch.int8), + "pattern1_int16": 0xAAAA * torch.ones(1, 2, 2, 2, dtype=torch.int16), + "pattern1_int32": 0xAAAAAAAA * torch.ones(1, 2, 2, 2, dtype=torch.int32), + "pattern2_int8": 0xCC * torch.ones(1, 2, 2, 2, dtype=torch.int8), + "pattern2_int16": 0xCCCC * torch.ones(1, 2, 2, 2, dtype=torch.int16), + "pattern2_int32": 0xCCCCCCCC * torch.ones(1, 2, 2, 2, dtype=torch.int32), + "rand_rank2": torch.randint(-128, 127, (10, 10), dtype=torch.int8), + "rand_rank4": torch.randint(-128, 127, (1, 10, 10, 10), dtype=torch.int8), +} + + +class BitwiseNot(torch.nn.Module): + + def forward(self, x: torch.Tensor): + return torch.bitwise_not(x) + + +@common.parametrize("test_data", test_data_suite) +def test_bitwise_not_tosa_FP(test_data: Tuple): + # We don't delegate bitwise_not since it is not supported on the FP profile. 
+ pipeline = OpNotSupportedPipeline[input_t1]( + BitwiseNot(), + (test_data,), + {exir_op: 1}, + quantize=False, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_bitwise_not_tosa_INT(test_data: Tuple): + pipeline = TosaPipelineINT[input_t1]( + BitwiseNot(), + (test_data,), + aten_op=aten_op, + exir_op=exir_op, + ) + pipeline.pop_stage("quantize") + pipeline.pop_stage("check.quant_nodes") + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_bitwise_not_u55_INT(test_data: Tuple): + # We don't delegate bitwise_not since it is not supported on U55. + pipeline = OpNotSupportedPipeline[input_t1]( + BitwiseNot(), + (test_data,), + {exir_op: 1}, + quantize=True, + u55_subset=True, + ) + pipeline.run() + + +@common.XfailIfNoCorstone320 +@common.parametrize("test_data", test_data_suite) +def test_bitwise_not_u85_INT(test_data: Tuple): + pipeline = EthosU85PipelineINT[input_t1]( + BitwiseNot(), + (test_data,), + aten_ops=aten_op, + exir_ops=exir_op, + ) + pipeline.pop_stage("quantize") + pipeline.pop_stage("check.quant_nodes") + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.SkipIfNoModelConverter +def test_bitwise_not_vgf_FP(test_data: Tuple): + # We don't delegate bitwise_not since it is not supported on the FP profile. 
+ pipeline = OpNotSupportedPipeline[input_t1]( + BitwiseNot(), + (test_data,), + {exir_op: 1}, + quantize=False, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.SkipIfNoModelConverter +def test_bitwise_not_vgf_INT(test_data: Tuple): + pipeline = VgfPipeline[input_t1]( + BitwiseNot(), + (test_data,), + aten_op, + exir_op, + tosa_version="TOSA-1.0+INT", + ) + pipeline.pop_stage("quantize") + pipeline.pop_stage("check.quant_nodes") + pipeline.run() diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 7c0fc1665bb..f69b1419c8d 100644 --- a/backends/arm/test/ops/test_bmm.py +++ b/backends/arm/test/ops/test_bmm.py @@ -97,7 +97,6 @@ def test_bmm_u55_INT(test_data: input_t1): test_data(), aten_op_bmm, exir_op_bmm, - run_on_fvp=True, ) pipeline.run() @@ -110,7 +109,6 @@ def test_bmm_u85_INT(test_data: input_t1): test_data(), aten_op_bmm, exir_op_bmm, - run_on_fvp=True, ) pipeline.run() @@ -123,7 +121,6 @@ def test_bmm_u55_INT_single_input(test_data: input_t1): test_data(), aten_op_bmm, exir_op_bmm, - run_on_fvp=True, ) pipeline.run() @@ -136,7 +133,6 @@ def test_bmm_u85_INT_single_input(test_data: input_t1): test_data(), aten_op_bmm, exir_op_bmm, - run_on_fvp=True, ) pipeline.run() @@ -150,7 +146,11 @@ def test_bmm_vgf_FP(test_data: input_t1): pipeline.run() -@common.parametrize("test_data", BMMSingleInput.test_data_generators) +@common.parametrize( + "test_data", + BMMSingleInput.test_data_generators, + flakies={"rand_big_1": 3}, +) @common.SkipIfNoModelConverter def test_bmm_vgf_FP_single_input(test_data: input_t1): pipeline = VgfPipeline[input_t1]( @@ -186,6 +186,4 @@ def test_bmm_vgf_INT_single_input(test_data: input_t1): exir_op_bmm, tosa_version="TOSA-1.0+INT", ) - # TODO: MLETORCH-1136 Change args of run_method_and_compare_outputs of the vgf tests - # pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() diff --git a/backends/arm/test/ops/test_cat.py 
b/backends/arm/test/ops/test_cat.py index 84ecd8641b5..254edbc411f 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -8,7 +8,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.quantizer.arm_quantizer import ( get_symmetric_a16w8_quantization_config, @@ -120,7 +119,6 @@ def test_cat_u55_INT(test_data: Tuple): test_data(), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -133,7 +131,6 @@ def test_cat_u85_INT(test_data: Tuple): test_data(), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -180,9 +177,6 @@ def get_symmetric_a16w8_cat_quantizer(per_channel_quantization=False): @common.parametrize("test_data", Cat.test_parameters) -@pytest.mark.xfail( - reason="missing int16 cat ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13978" -) def test_cat_16a8w_tosa_INT(test_data: Tuple): """Test cat operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -208,9 +202,6 @@ def test_cat_16a8w_tosa_INT(test_data: Tuple): @common.parametrize("test_data", Cat.test_parameters) @common.XfailIfNoCorstone300 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 cat operations" -) def test_cat_16a8w_u55_INT16(test_data: Tuple): """Test cat operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -222,7 +213,6 @@ def test_cat_16a8w_u55_INT16(test_data: Tuple): exir_op, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -236,9 +226,6 @@ def test_cat_16a8w_u55_INT16(test_data: Tuple): @common.parametrize("test_data", Cat.test_parameters) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 cat operations" -) def 
test_cat_16a8w_u85_INT16(test_data: Tuple): """Test cat operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -250,7 +237,6 @@ def test_cat_16a8w_u85_INT16(test_data: Tuple): exir_op, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_ceil.py b/backends/arm/test/ops/test_ceil.py index 64e9040a974..ed304bbd9df 100644 --- a/backends/arm/test/ops/test_ceil.py +++ b/backends/arm/test/ops/test_ceil.py @@ -78,7 +78,6 @@ def test_ceil_u55_INT(test_data: input_t1): (data,), module.aten_op, module.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -92,7 +91,6 @@ def test_ceil_u85_INT(test_data: input_t1): (data,), module.aten_op, module.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_clamp.py b/backends/arm/test/ops/test_clamp.py index ba490ccc0c6..a5561802e44 100644 --- a/backends/arm/test/ops/test_clamp.py +++ b/backends/arm/test/ops/test_clamp.py @@ -96,7 +96,6 @@ def test_clamp_u55_INT(test_data): (input_tensor,), aten_op, exir_op, - run_on_fvp=True, ) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -115,7 +114,6 @@ def test_clamp_u85_INT(test_data): (input_tensor,), aten_op, exir_op, - run_on_fvp=True, ) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -149,6 +147,4 @@ def test_clamp_vgf_INT(test_data): exir_op, tosa_version="TOSA-1.0+INT", ) - # TODO: MLETORCH-1136 Change args of run_method_and_compare_outputs of the vgf tests - # pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index b240fb1ea07..8a6d3714b8b 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -102,7 +102,6 @@ def test_clone_u55_INT(input_data): input_tensor, aten_op, exir_op, - run_on_fvp=True, ) 
pipeline.run() @@ -118,7 +117,6 @@ def test_clone_u85_INT(input_data): input_tensor, aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py index ac66bc1556b..d58cdb5ff61 100644 --- a/backends/arm/test/ops/test_conv1d.py +++ b/backends/arm/test/ops/test_conv1d.py @@ -307,7 +307,6 @@ def test_convolution_1d_u55_INT(test_data): model.get_inputs(), aten_op, exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, qtol=1, ) @@ -323,7 +322,6 @@ def test_convolution_1d_u85_INT(test_data): model.get_inputs(), aten_op, exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, qtol=1, ) diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 0300f7c2049..bf47e3fa084 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -426,7 +426,6 @@ def test_convolution_2d_u55_INT(test_data): model.get_inputs(), aten_op, exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -441,7 +440,6 @@ def test_convolution_u85_INT(test_data): model.get_inputs(), aten_op, exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() diff --git a/backends/arm/test/ops/test_conv3d.py b/backends/arm/test/ops/test_conv3d.py index b26f75daa1a..46986103aa0 100644 --- a/backends/arm/test/ops/test_conv3d.py +++ b/backends/arm/test/ops/test_conv3d.py @@ -367,7 +367,6 @@ def test_convolution_3d_u55_INT(test_data): model.get_inputs(), aten_op, exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -382,7 +381,6 @@ def test_convolution_3d_u85_INT(test_data): model.get_inputs(), aten_op, exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 
a7a031468ea..f0f8b404594 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -258,7 +258,6 @@ def test_convolution_2d_u55_INT_meandim(): model.get_inputs(), aten_ops=[], exir_ops=ComboConv2dMeandim.edge_op_list, - run_on_fvp=True, ) pipeline.run() @@ -271,7 +270,6 @@ def test_convolution_2d_u85_INT_meandim(): model.get_inputs(), aten_ops=[], exir_ops=ComboConv2dMeandim.edge_op_list, - run_on_fvp=True, ) pipeline.run() @@ -346,7 +344,6 @@ def test_convolution_2d_u55_INT_batchnorm_relu6(test_data): model.get_inputs(), aten_ops=[], exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -362,7 +359,6 @@ def test_convolution_2d_u85_INT_batchnorm_relu6(test_data): model.get_inputs(), aten_ops=[], exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -441,7 +437,6 @@ def test_convolution_2d_u55_INT_relu6(test_data): input, aten_ops=[], exir_ops=ComboConvRelu6.edge_op_list, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -457,7 +452,6 @@ def test_convolution_2d_u85_INT_relu6(test_data): input, aten_ops=[], exir_ops=ComboConvRelu6.edge_op_list, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -533,7 +527,6 @@ def test_convolution_2d_u55_INT_block_bottleneck(test_data): model.get_inputs(), aten_ops=[], exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -549,7 +542,6 @@ def test_convolution_2d_u85_INT_block_bottleneck(test_data): model.get_inputs(), aten_ops=[], exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -581,8 +573,6 @@ def test_convolution_2d_vgf_INT_block_bottleneck(test_data): tosa_version="TOSA-1.0+INT", per_channel_quantization=per_channel_quantization, ) - # TODO: MLETORCH-1136 Change args of run_method_and_compare_outputs of the 
vgf tests - # pipeline.change_args("run_method_and_compare_outputs", model.get_inputs(), qtol=1) pipeline.run() @@ -628,7 +618,6 @@ def test_convolution_2d_u55_INT_avgpool2d(test_data): input, aten_ops=[], exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -644,7 +633,6 @@ def test_convolution_2d_u85_INT_avgpool2d(test_data): input, aten_ops=[], exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() diff --git a/backends/arm/test/ops/test_cos.py b/backends/arm/test/ops/test_cos.py index acb950f2a2e..b0c35bf7878 100644 --- a/backends/arm/test/ops/test_cos.py +++ b/backends/arm/test/ops/test_cos.py @@ -66,25 +66,25 @@ def test_cos_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_cos_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Cos(), (test_data,), aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 def test_cos_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Cos(), (test_data,), aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 0f8b34d3d47..e49ab236d86 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -260,7 +260,6 @@ def test_convolution_2d_u55_INT_depthwise(test_data): model.get_inputs(), aten_ops=[], exir_ops=exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -275,7 +274,6 @@ def test_convolution_1d_u55_INT_depthwise(test_data): model.get_inputs(), aten_ops=[], exir_ops=exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -290,7 +288,6 @@ def test_convolution_2d_u85_INT_depthwise(test_data): 
model.get_inputs(), aten_ops=[], exir_ops=exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -305,7 +302,6 @@ def test_convolution_1d_u85_INT_depthwise(test_data): model.get_inputs(), aten_ops=[], exir_ops=exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index 5bacac1c962..612622b46cb 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -109,7 +109,6 @@ def test_div_tensor_u55_INT(test_data: Tuple): test_data(), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -122,7 +121,6 @@ def test_div_tensor_u85_INT(test_data: Tuple): test_data(), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_div_tensor_mode.py b/backends/arm/test/ops/test_div_tensor_mode.py index 909b83bd97f..e1f6036a487 100644 --- a/backends/arm/test/ops/test_div_tensor_mode.py +++ b/backends/arm/test/ops/test_div_tensor_mode.py @@ -96,7 +96,6 @@ def test_div_tensor_mode_u55_INT(data): aten_ops=model.aten_ops_int, exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.run() @@ -113,7 +112,6 @@ def test_div_tensor_mode_u85_INT(data): aten_ops=model.aten_ops_int, exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_eq.py b/backends/arm/test/ops/test_eq.py index b840869ba48..8f783240a2c 100644 --- a/backends/arm/test/ops/test_eq.py +++ b/backends/arm/test/ops/test_eq.py @@ -165,7 +165,6 @@ def test_eq_scalar_u85_INT_tensor(test_module): test_module().get_inputs(), Equal.aten_op_Tensor, Equal.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -185,7 +184,6 @@ def test_eq_scalar_u85_INT(test_module): test_module().get_inputs(), Equal.aten_op_Tensor, Equal.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git 
a/backends/arm/test/ops/test_erf.py b/backends/arm/test/ops/test_erf.py index 363b1e2d8c9..e6b28255d6b 100644 --- a/backends/arm/test/ops/test_erf.py +++ b/backends/arm/test/ops/test_erf.py @@ -50,7 +50,10 @@ def test_erf_tosa_INT(test_data: input_t1): @common.XfailIfNoCorstone300 def test_erf_u55_INT(test_data: input_t1): pipeline = EthosU55PipelineINT[input_t1]( - Erf(), test_data(), aten_op, exir_op, run_on_fvp=True + Erf(), + test_data(), + aten_op, + exir_op, ) pipeline.run() @@ -59,7 +62,10 @@ def test_erf_u55_INT(test_data: input_t1): @common.XfailIfNoCorstone320 def test_erf_u85_INT(test_data: input_t1): pipeline = EthosU85PipelineINT[input_t1]( - Erf(), test_data(), aten_op, exir_op, run_on_fvp=True + Erf(), + test_data(), + aten_op, + exir_op, ) pipeline.run() diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py index 6eaacc71d86..56d258944c2 100644 --- a/backends/arm/test/ops/test_exp.py +++ b/backends/arm/test/ops/test_exp.py @@ -68,7 +68,6 @@ def test_exp_u55_INT(test_data: Tuple): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -81,7 +80,6 @@ def test_exp_u85_INT(test_data: Tuple): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py index b5784c9ff93..34694469bbf 100644 --- a/backends/arm/test/ops/test_expand.py +++ b/backends/arm/test/ops/test_expand.py @@ -79,7 +79,6 @@ def test_expand_u55_INT(test_data: Tuple): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -92,7 +91,6 @@ def test_expand_u85_INT(test_data: Tuple): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_fill_scalar.py b/backends/arm/test/ops/test_fill_scalar.py new file mode 100644 index 00000000000..fb84d993575 --- /dev/null +++ b/backends/arm/test/ops/test_fill_scalar.py @@ -0,0 +1,108 @@ +# Copyright 2025 Arm Limited and/or its 
affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineINT, + EthosU85PipelineINT, + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) + +aten_op = "torch.ops.aten.fill_.Scalar" +exir_op = "executorch_exir_dialects_edge__ops_aten_full_like_default" + +input_t1 = Tuple[torch.Tensor] + +test_data_suite = { + "ones_float": [torch.ones(2, 3), 5.0], + "ones_int": [torch.ones(2, 3), -3], +} + + +class FillScalar(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, y: torch.Tensor, fill_value: int | float): + mask = torch.full_like(y, 0) + mask.fill_(fill_value) + return mask * y + + +@common.parametrize("test_data", test_data_suite) +def test_fill_scalar_tosa_FP(test_data: Tuple): + pipeline = TosaPipelineFP[input_t1]( + FillScalar(), + (*test_data,), + aten_op=aten_op, + exir_op=exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_fill_scalar_tosa_INT(test_data: Tuple): + pipeline = TosaPipelineINT[input_t1]( + FillScalar(), + (*test_data,), + aten_op=aten_op, + exir_op=exir_op, + ) + pipeline.run() + + +@common.XfailIfNoCorstone300 +@common.parametrize("test_data", test_data_suite) +def test_fill_scalar_u55_INT(test_data: Tuple): + pipeline = EthosU55PipelineINT[input_t1]( + FillScalar(), + (*test_data,), + aten_ops=[aten_op], + exir_ops=exir_op, + ) + pipeline.run() + + +@common.XfailIfNoCorstone320 +@common.parametrize("test_data", test_data_suite) +def test_fill_scalar_u85_INT(test_data: Tuple): + pipeline = EthosU85PipelineINT[input_t1]( + FillScalar(), + (*test_data,), + aten_ops=[aten_op], + exir_ops=exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.SkipIfNoModelConverter +def 
test_fill_scalar_vgf_FP(test_data: input_t1): + pipeline = VgfPipeline[input_t1]( + FillScalar(), + (*test_data,), + aten_op, + exir_op, + tosa_version="TOSA-1.0+FP", + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.SkipIfNoModelConverter +def test_fill_scalar_vgf_INT(test_data: input_t1): + pipeline = VgfPipeline[input_t1]( + FillScalar(), + (*test_data,), + aten_op, + exir_op, + tosa_version="TOSA-1.0+INT", + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_floor.py b/backends/arm/test/ops/test_floor.py index c66ef1c5d27..475fe18679a 100644 --- a/backends/arm/test/ops/test_floor.py +++ b/backends/arm/test/ops/test_floor.py @@ -78,7 +78,6 @@ def test_floor_u55_INT(test_data: input_t1): (data,), module.aten_op, module.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -92,7 +91,6 @@ def test_floor_u85_INT(test_data: input_t1): (data,), module.aten_op, module.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index 9e2c9b4d8be..8ab063e9957 100644 --- a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -202,7 +202,6 @@ def test_full_u85_INT(test_data: Tuple): test_data, aten_ops=[], exir_ops=exir_op, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -216,7 +215,6 @@ def test_full_u55_INT(test_data: Tuple): test_data, aten_ops=[], exir_ops=exir_op, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_ge.py b/backends/arm/test/ops/test_ge.py index 94f33d28630..ede5be76eda 100644 --- a/backends/arm/test/ops/test_ge.py +++ b/backends/arm/test/ops/test_ge.py @@ -161,7 +161,6 @@ def test_ge_tensor_u85_INT(test_module): test_module().get_inputs(), GreaterEqual.aten_op_tensor, GreaterEqual.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -177,7 +176,6 @@ def test_ge_scalar_u85_INT(test_module): test_module().get_inputs(), 
GreaterEqual.aten_op_tensor, GreaterEqual.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_group_norm.py b/backends/arm/test/ops/test_group_norm.py index 0f314064548..8f2c0f0d6a5 100644 --- a/backends/arm/test/ops/test_group_norm.py +++ b/backends/arm/test/ops/test_group_norm.py @@ -118,7 +118,6 @@ def test_native_group_norm_u55_INT(test_data): test_data[1], test_data[0], "torch.ops.aten.sub.Tensor", # 'sub' op arbitrarily chosen to confirm groupnorm was decomposed - run_on_fvp=True, atol=0.1, # TODO: "MLETORCH-925: Fix numerical issue for aten.native_group_norm" ) pipeline.change_args("run_method_and_compare_outputs", atol=1, qtol=1) @@ -142,7 +141,6 @@ def test_native_group_norm_u85_INT(test_data): test_data[1], test_data[0], "torch.ops.aten.sub.Tensor", # 'sub' op arbitrarily chosen to confirm groupnorm was decomposed - run_on_fvp=True, atol=0.1, # TODO: "MLETORCH-925: Fix numerical issue for aten.native_group_norm" ) pipeline.change_args("run_method_and_compare_outputs", atol=1, qtol=1) diff --git a/backends/arm/test/ops/test_gt.py b/backends/arm/test/ops/test_gt.py index 41229397eb5..0e50b6b78be 100644 --- a/backends/arm/test/ops/test_gt.py +++ b/backends/arm/test/ops/test_gt.py @@ -162,7 +162,6 @@ def test_gt_tensor_u85_INT(test_module): test_module().get_inputs(), Greater.aten_op_tensor, Greater.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -178,7 +177,6 @@ def test_gt_scalar_u85_INT(test_module): test_module().get_inputs(), Greater.aten_op_tensor, Greater.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_hardsigmoid.py b/backends/arm/test/ops/test_hardsigmoid.py index 5f591c15617..568eb069f8b 100644 --- a/backends/arm/test/ops/test_hardsigmoid.py +++ b/backends/arm/test/ops/test_hardsigmoid.py @@ -70,7 +70,6 @@ def test_hardsigmoid_u55_INT(test_data: torch.Tensor): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -84,7 
+83,6 @@ def test_hardsigmoid_u85_INT(test_data: torch.Tensor): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_hardswish.py b/backends/arm/test/ops/test_hardswish.py index 00db0cb296b..760293ec492 100644 --- a/backends/arm/test/ops/test_hardswish.py +++ b/backends/arm/test/ops/test_hardswish.py @@ -62,7 +62,6 @@ def test_hardswish_u55_INT(test_data): (test_data(),), aten_op, exir_op, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ).run() @@ -75,7 +74,6 @@ def test_hardswish_u85_INT(test_data): (test_data(),), aten_op, exir_op, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ).run() diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py index 28f7e717351..3bb8e212cc9 100644 --- a/backends/arm/test/ops/test_hardtanh.py +++ b/backends/arm/test/ops/test_hardtanh.py @@ -71,7 +71,6 @@ def test_hardtanh_u55_INT(test_data: torch.Tensor): (test_data(),), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -84,7 +83,6 @@ def test_hardtanh_u85_INT(test_data: torch.Tensor): (test_data(),), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_index_select.py b/backends/arm/test/ops/test_index_select.py index 95ebaa62a38..6d2a6d73b70 100644 --- a/backends/arm/test/ops/test_index_select.py +++ b/backends/arm/test/ops/test_index_select.py @@ -174,8 +174,4 @@ def test_index_select_vgf_INT_rand(test_data: input_params): op.exir_op, tosa_version="TOSA-1.0+INT", ) - # TODO: MLETORCH-1136 Change args of run_method_and_compare_outputs of the vgf tests - # pipeline.change_args( - # "run_method_and_compare_outputs", inputs=test_input, atol=0.9, rtol=0.2, qtol=1 - # ) pipeline.run() diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py index 2c9b83dc7e7..2659bc2eab4 100644 --- a/backends/arm/test/ops/test_layer_norm.py +++ 
b/backends/arm/test/ops/test_layer_norm.py @@ -95,7 +95,6 @@ def test_native_layer_norm_u55_INT(test_data): model, test_data, "torch.ops.aten.sub.Tensor", # Just check for sub op included in the layernorm decomposition - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.run() @@ -109,7 +108,6 @@ def test_native_layer_norm_u85_INT(test_data): model, test_data, "torch.ops.aten.sub.Tensor", # Just check for sub op included in the layernorm decomposition - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_le.py b/backends/arm/test/ops/test_le.py index 31422302a2d..fd0e63e9beb 100644 --- a/backends/arm/test/ops/test_le.py +++ b/backends/arm/test/ops/test_le.py @@ -163,7 +163,6 @@ def test_le_tensor_u85_INT(test_module): test_module().get_inputs(), LessEqual.aten_op_tensor, LessEqual.exir_op, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -180,7 +179,6 @@ def test_le_scalar_u85_INT(test_module): test_module().get_inputs(), LessEqual.aten_op_tensor, LessEqual.exir_op, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_leaky_relu.py b/backends/arm/test/ops/test_leaky_relu.py index 432c4da7ecc..a7ae4cb8564 100644 --- a/backends/arm/test/ops/test_leaky_relu.py +++ b/backends/arm/test/ops/test_leaky_relu.py @@ -73,7 +73,6 @@ def test_leaky_relu_u55_INT(test_data): LeakyReLU(slope), data, [], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) @@ -88,7 +87,6 @@ def test_leaky_relu_u85_INT(test_data): LeakyReLU(slope), data, [], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) diff --git a/backends/arm/test/ops/test_linalg_vector_norm.py b/backends/arm/test/ops/test_linalg_vector_norm.py index 1777cffb0a7..df3bef38cc1 100644 --- 
a/backends/arm/test/ops/test_linalg_vector_norm.py +++ b/backends/arm/test/ops/test_linalg_vector_norm.py @@ -103,7 +103,6 @@ def test_vector_norm_u55_INT_fvp(test_module): input_tensor, aten_op_q_decomposed_q, exir_op_q_decomposed, - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.pop_stage("check_not.exir") @@ -121,7 +120,6 @@ def test_vector_norm_u85_INT_fvp(test_module): input_tensor, aten_op_q_decomposed_q, exir_op_q_decomposed, - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.pop_stage("check_not.exir") diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index e5d00c83e9f..4029fcef54e 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -8,7 +8,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.quantizer.arm_quantizer import ( get_symmetric_a16w8_quantization_config, @@ -181,7 +180,6 @@ def test_linear_u55_INT(test_data: torch.Tensor): (test_data,), aten_op, exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, ).run() @@ -204,7 +202,6 @@ def test_linear_u85_INT(test_data: torch.Tensor): (test_data,), aten_op, exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, ).run() @@ -276,10 +273,14 @@ def get_symmetric_a16w8_linear_quantizer( ) -@common.parametrize("test_data", test_data_rank1_INT | test_data_rank4_INT) -@pytest.mark.xfail( - reason="missing int16 linear ops support; fails at TOSA reference model run with Invalid TOSA graph" -) +test_data_all_16a8w = test_data_rank1_INT | test_data_rank4_INT +# TODO: Remove large rand test as they are flaky until sorted out why: MLETORCH-1377 +for k in list(test_data_all_16a8w.keys()): + if "large_rand" in k: + test_data_all_16a8w.pop(k) + + +@common.parametrize("test_data", test_data_all_16a8w) def test_linear_16a8w_tosa_INT(test_data: 
torch.Tensor): """Test linear operation with 16A8W quantization (16-bit activations, 8-bit weights)""" test_data, out_features, has_bias, per_channel_quantization = test_data() @@ -308,3 +309,63 @@ def test_linear_16a8w_tosa_INT(test_data: torch.Tensor): ) # Run the pipeline pipeline.run() + + +@common.parametrize("test_data", test_data_all_16a8w) +@common.XfailIfNoCorstone300 +def test_linear_16a8w_u55_INT16(test_data: torch.Tensor): + """Test linear operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)""" + test_data, out_features, has_bias, per_channel_quantization = test_data() + in_features = test_data.shape[-1] + + pipeline = EthosU55PipelineINT[input_t1]( + Linear( + in_features=in_features, + out_features=out_features, + bias=has_bias, + ), + (test_data,), + aten_op, + exir_ops=[], + per_channel_quantization=per_channel_quantization, + use_to_edge_transform_and_lower=True, + run_on_fvp=True, + ) + + pipeline.change_args( + "quantize", + get_symmetric_a16w8_linear_quantizer( + per_channel_quantization=per_channel_quantization + ), + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_all_16a8w) +@common.XfailIfNoCorstone320 +def test_linear_16a8w_u85_INT16(test_data: torch.Tensor): + """Test linear operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" + test_data, out_features, has_bias, per_channel_quantization = test_data() + in_features = test_data.shape[-1] + + pipeline = EthosU85PipelineINT[input_t1]( + Linear( + in_features=in_features, + out_features=out_features, + bias=has_bias, + ), + (test_data,), + aten_op, + exir_ops=[], + per_channel_quantization=per_channel_quantization, + use_to_edge_transform_and_lower=True, + run_on_fvp=True, + ) + + pipeline.change_args( + "quantize", + get_symmetric_a16w8_linear_quantizer( + per_channel_quantization=per_channel_quantization + ), + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py index 
1ed5c57f1ab..44811715407 100644 --- a/backends/arm/test/ops/test_log.py +++ b/backends/arm/test/ops/test_log.py @@ -60,7 +60,6 @@ def test_log_u55_INT(test_data: input_t1): (test_data(),), aten_op, exir_op, - run_on_fvp=True, ).run() @@ -72,7 +71,6 @@ def test_log_u85_INT(test_data: input_t1): (test_data(),), aten_op, exir_op, - run_on_fvp=True, ).run() diff --git a/backends/arm/test/ops/test_logical.py b/backends/arm/test/ops/test_logical.py index 2b160ce7b50..e772840e6e6 100644 --- a/backends/arm/test/ops/test_logical.py +++ b/backends/arm/test/ops/test_logical.py @@ -137,7 +137,6 @@ def test_logical_and_u85_INT(test_data: input_t2): test_data(), And().aten_op, And().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -231,7 +230,6 @@ def test_logical_xor_u85_INT(test_data: input_t2): test_data(), Xor().aten_op, Xor().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -325,7 +323,6 @@ def test_logical_or_u85_INT(test_data: input_t2): test_data(), Or().aten_op, Or().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -419,7 +416,6 @@ def test_logical_not_u85_INT(test_data: input_t2): test_data(), Not().aten_op, Not().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index c4a68caabac..f0411847dd3 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -72,7 +72,6 @@ def test_log_softmax_u55_INT(test_data): LogSoftmax(dim), data, [], - run_on_fvp=True, ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -87,7 +86,6 @@ def test_log_softmax_u85_INT(test_data): LogSoftmax(dim), data, [], - run_on_fvp=True, ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -119,6 +117,4 @@ def test_log_softmax_vgf_INT(test_data): 
tosa_version="TOSA-1.0+INT", ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) - # TODO: MLETORCH-1136 Change args of run_method_and_compare_outputs of the vgf tests - # pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() diff --git a/backends/arm/test/ops/test_lshift.py b/backends/arm/test/ops/test_lshift.py index bab364a4528..3af49cd4dc2 100644 --- a/backends/arm/test/ops/test_lshift.py +++ b/backends/arm/test/ops/test_lshift.py @@ -103,7 +103,6 @@ def test_bitwise_left_shift_tensor_u55_INT_scalar(test_data): test_data, LshiftScalar.torch_op_INT, LshiftScalar.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -117,7 +116,6 @@ def test_bitwise_left_shift_tensor_u85_INT_scalar(test_data): test_data, LshiftScalar.torch_op_INT, LshiftScalar.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -178,28 +176,26 @@ def test_bitwise_left_shift_tensor_tosa_INT(test_data): @common.parametrize("test_data", LshiftTensor.test_data) -@XfailIfNoCorstone300 +@common.XfailIfNoCorstone300 def test_bitwise_left_shift_tensor_u55_INT(test_data): pipeline = EthosU55PipelineINT[scalar_input_t]( LshiftTensor(), test_data, LshiftTensor.torch_op, LshiftTensor.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @common.parametrize("test_data", LshiftTensor.test_data) -@XfailIfNoCorstone320 +@common.XfailIfNoCorstone320 def test_bitwise_left_shift_tensor_u85_INT(test_data): pipeline = EthosU85PipelineINT[scalar_input_t]( LshiftTensor(), test_data, LshiftTensor.torch_op, LshiftTensor.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() diff --git a/backends/arm/test/ops/test_lt.py b/backends/arm/test/ops/test_lt.py index 98d0298b195..d0ed1a34185 100644 --- a/backends/arm/test/ops/test_lt.py +++ b/backends/arm/test/ops/test_lt.py @@ -162,7 +162,6 @@ def test_lt_tensor_u85_INT(test_module): 
test_module().get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -178,7 +177,6 @@ def test_lt_scalar_u85_INT(test_module): test_module().get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_matmul.py b/backends/arm/test/ops/test_matmul.py index d1a21684325..f564672e98f 100644 --- a/backends/arm/test/ops/test_matmul.py +++ b/backends/arm/test/ops/test_matmul.py @@ -22,6 +22,7 @@ class MatMul(torch.nn.Module): test_data_generators = { + "rand_rand_2d": lambda: (torch.rand(5, 5), torch.rand(5, 2)), "rand_rand_3d": lambda: (torch.rand(2, 3, 5), torch.rand(2, 5, 2)), "rand_rand_4d": lambda: (torch.rand(1, 2, 3, 5), torch.rand(1, 2, 5, 2)), } @@ -32,6 +33,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): class MatMulSingleInput(torch.nn.Module): test_data_generators = { + "rand_2d": lambda: (torch.rand(5, 5),), "rand_3d": lambda: (torch.rand(2, 5, 5),), "rand_4d": lambda: (torch.rand(1, 2, 5, 5),), } @@ -42,6 +44,11 @@ def forward(self, x: torch.Tensor): class MatMulCombo(torch.nn.Module): test_data_generators = { + "rand_rand_rand_2d": lambda: ( + torch.rand(5, 5), + torch.rand(5, 2), + torch.rand(2, 5), + ), "rand_rand_rand_3d": lambda: ( torch.rand(2, 5, 5), torch.rand(2, 5, 2), @@ -122,7 +129,6 @@ def test_matmul_u55_INT(test_data: input_t1): test_data(), aten_op_mm, exir_op_mm, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -136,7 +142,6 @@ def test_matmul_single_input_u55_INT(test_data: input_t1): test_data(), aten_op_mm, exir_op_mm, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -150,7 +155,6 @@ def test_matmul_combo_u55_INT(test_data: input_t1): test_data(), aten_op_mm, exir_op_mm, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -164,7 +168,6 @@ def test_matmul_u85_INT(test_data: input_t1): test_data(), aten_op_mm, exir_op_mm, - run_on_fvp=True, 
use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -178,7 +181,6 @@ def test_matmul_single_input_u85_INT(test_data: input_t1): test_data(), aten_op_mm, exir_op_mm, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -192,7 +194,6 @@ def test_matmul_combo_u85_INT(test_data: input_t1): test_data(), aten_op_mm, exir_op_mm, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py index 7db56311837..559932848e4 100644 --- a/backends/arm/test/ops/test_max_pool.py +++ b/backends/arm/test/ops/test_max_pool.py @@ -142,7 +142,6 @@ def test_max_pool2d_u55_INT(test_data: torch.Tensor): (test_data,), aten_op, exir_ops=[], - run_on_fvp=True, ).run() @@ -155,7 +154,6 @@ def test_max_pool2d_u85_INT(test_data: torch.Tensor): (test_data,), aten_op, exir_ops=[], - run_on_fvp=True, ).run() diff --git a/backends/arm/test/ops/test_maximum.py b/backends/arm/test/ops/test_maximum.py index eb0d4b86efc..ed3a5247d3d 100644 --- a/backends/arm/test/ops/test_maximum.py +++ b/backends/arm/test/ops/test_maximum.py @@ -61,7 +61,6 @@ def test_maximum_u55_INT(test_data: Tuple): Maximum(), test_data(), aten_op, - run_on_fvp=True, ).run() @@ -72,7 +71,6 @@ def test_maximum_u85_INT(test_data: Tuple): Maximum(), test_data(), aten_op, - run_on_fvp=True, ).run() diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index 061e8da14f1..970340c352b 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -4,7 +4,6 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
- import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -66,7 +65,6 @@ def test_adaptive_avg_pool2d_u55_INT(test_data): test_data(), AdaptiveAveragePool2d.aten_op, AdaptiveAveragePool2d.exir_op, - run_on_fvp=True, symmetric_io_quantization=True, ).run() @@ -79,7 +77,6 @@ def test_adaptive_avg_pool2d_u85_INT(test_data): test_data(), AdaptiveAveragePool2d.aten_op, AdaptiveAveragePool2d.exir_op, - run_on_fvp=True, symmetric_io_quantization=True, ).run() @@ -115,7 +112,7 @@ class MeanDim(torch.nn.Module): test_data_suite: dict[str, tuple] = { "rank_1_keepdim": lambda: ( torch.rand(7), - (0), + 0, True, ), "rank_2_keepdim": lambda: ( @@ -168,6 +165,11 @@ class MeanDim(torch.nn.Module): (0, 1, 2, 3), True, ), + "rand_none_keepdim": lambda: ( + torch.rand(1, 5, 7, 3), + None, + True, + ), "rank_1": lambda: ( torch.rand(7), (-1), @@ -280,7 +282,6 @@ def test_mean_dim_tosa_INT(test_data): (test_data,), [], # Might be sum, avgpool, or both symmetric_io_quantization=True, - custom_path="MEANDIM", ) pipeline.run() @@ -301,7 +302,6 @@ def test_mean_dim_u55_INT(test_data): MeanDim(dim, keep_dim), (test_data,), [], # Might be sum, avgpool, or both - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.add_stage_after( @@ -321,7 +321,6 @@ def test_mean_dim_u85_INT(test_data): MeanDim(dim, keep_dim), (test_data,), [], # Might be sum, avgpool, or both - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_minimum.py b/backends/arm/test/ops/test_minimum.py index 88ae2c2b8da..3e87e64acbd 100644 --- a/backends/arm/test/ops/test_minimum.py +++ b/backends/arm/test/ops/test_minimum.py @@ -61,7 +61,6 @@ def test_minimum_u55_INT(test_data: Tuple): Minimum(), test_data(), aten_op, - run_on_fvp=True, ).run() @@ -72,7 +71,6 @@ def test_minimum_u85_INT(test_data: Tuple): Minimum(), test_data(), aten_op, - run_on_fvp=True, ).run() diff --git 
a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py index 1b76baaeff0..afb7a6d7d30 100644 --- a/backends/arm/test/ops/test_mm.py +++ b/backends/arm/test/ops/test_mm.py @@ -53,7 +53,6 @@ def test_mm_u55_INT(test_data: Tuple): MM(), test_data(), MM.aten_op, - run_on_fvp=True, ).run() @@ -65,7 +64,6 @@ def test_mm_u85_INT(test_data: Tuple): test_data(), MM.aten_op, MM.exir_op, - run_on_fvp=True, ).run() diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index b2db55d90fd..02447e40c4e 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -8,7 +8,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.quantizer.arm_quantizer import ( get_symmetric_a16w8_quantization_config, @@ -200,7 +199,6 @@ def test_mul_tensor_u55_INT(test_data: torch.Tensor): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -213,7 +211,6 @@ def test_mul_tensor_u85_INT(test_data: torch.Tensor): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -226,7 +223,6 @@ def test_mul_tensor_u55_INT_int32(test_data: torch.Tensor): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -240,7 +236,6 @@ def test_mul_tensor_u85_INT_int32(test_data: torch.Tensor): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -314,9 +309,6 @@ def get_symmetric_a16w8_mul_quantizer(per_channel_quantization=False): @common.parametrize("test_data", test_data_suite) -@pytest.mark.xfail( - reason="missing int16 mul ops support; fails at TOSA reference model with Unsupported operation type or rank. 
See: https://github.com/pytorch/executorch/issues/13947" -) def test_mul_tensor_16a8w_tosa_INT(test_data: input_t1): """Test mul operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -342,9 +334,6 @@ def test_mul_tensor_16a8w_tosa_INT(test_data: input_t1): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone300 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 mul operations. See: https://github.com/pytorch/executorch/issues/13947" -) def test_mul_tensor_16a8w_u55_INT16(test_data: input_t1): """Test mul operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -356,7 +345,6 @@ def test_mul_tensor_16a8w_u55_INT16(test_data: input_t1): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -370,9 +358,6 @@ def test_mul_tensor_16a8w_u55_INT16(test_data: input_t1): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 mul operations. 
See: https://github.com/pytorch/executorch/issues/13947" -) def test_mul_tensor_16a8w_u85_INT16(test_data: input_t1): """Test mul operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -384,7 +369,6 @@ def test_mul_tensor_16a8w_u85_INT16(test_data: input_t1): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_multihead_attention.py b/backends/arm/test/ops/test_multihead_attention.py index 71cf076a157..cbc2ccb32f4 100644 --- a/backends/arm/test/ops/test_multihead_attention.py +++ b/backends/arm/test/ops/test_multihead_attention.py @@ -3,7 +3,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import pytest import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -69,7 +68,6 @@ def test_multihead_attention_tosa_INT(test_data): "test_data", test_suite, ) -@pytest.mark.xfail(reason="MLETORCH-1102: Numerical issues on FVP") @common.XfailIfNoCorstone300 def test_multihead_attention_u55_INT(test_data: input_t1): test_data, module = test_data() @@ -79,7 +77,6 @@ def test_multihead_attention_u55_INT(test_data: input_t1): [], [], use_to_edge_transform_and_lower=True, - run_on_fvp=True, # TODO: Per-channel quantization is broken (MLETORCH-1144) per_channel_quantization=False, ) @@ -91,7 +88,6 @@ def test_multihead_attention_u55_INT(test_data: input_t1): "test_data", test_suite, ) -@pytest.mark.xfail(reason="MLETORCH-1102: Numerical issues on FVP") @common.XfailIfNoCorstone320 def test_multihead_attention_u85_INT(test_data: input_t1): test_data, module = test_data() @@ -101,7 +97,6 @@ def test_multihead_attention_u85_INT(test_data: input_t1): [], [], use_to_edge_transform_and_lower=True, - run_on_fvp=True, # TODO: Per-channel 
quantization is broken (MLETORCH-1144) per_channel_quantization=False, ) diff --git a/backends/arm/test/ops/test_ne.py b/backends/arm/test/ops/test_ne.py index 60f07ad9fdd..e20953b64dc 100644 --- a/backends/arm/test/ops/test_ne.py +++ b/backends/arm/test/ops/test_ne.py @@ -171,7 +171,6 @@ def test_ne_tensor_u85_INT(test_module): test_module.get_inputs(), NotEqual.decomposed_ops, NotEqual.decomposed_exir_ops, - run_on_fvp=True, ) pipeline.run() @@ -192,7 +191,6 @@ def test_ne_scalar_u85_INT(test_module): test_module.get_inputs(), NotEqual.decomposed_ops, NotEqual.decomposed_exir_ops, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_neg.py b/backends/arm/test/ops/test_neg.py index 395a4815b62..f0afe7bd23b 100644 --- a/backends/arm/test/ops/test_neg.py +++ b/backends/arm/test/ops/test_neg.py @@ -53,7 +53,10 @@ def test_neg_tosa_INT(test_data: input_t1): @common.XfailIfNoCorstone300 def test_neg_u55_INT(test_data: input_t1): pipeline = EthosU55PipelineINT[input_t1]( - Neg(), test_data, Neg.aten_op, Neg.exir_op, run_on_fvp=True + Neg(), + test_data, + Neg.aten_op, + Neg.exir_op, ) pipeline.run() @@ -62,7 +65,10 @@ def test_neg_u55_INT(test_data: input_t1): @common.XfailIfNoCorstone320 def test_neg_u85_INT(test_data: input_t1): pipeline = EthosU85PipelineINT[input_t1]( - Neg(), test_data, Neg.aten_op, Neg.exir_op, run_on_fvp=True + Neg(), + test_data, + Neg.aten_op, + Neg.exir_op, ) pipeline.run() diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index eb482bcee54..6fd8555b56b 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -85,7 +85,6 @@ def test_permute_u55_INT(test_data): (test_data,), aten_op, exir_ops="executorch_exir_dialects_edge__ops_aten_permute_copy_default", - run_on_fvp=True, ) pipeline.run() @@ -99,7 +98,6 @@ def test_permute_u85_INT(test_data: torch.Tensor): (test_data,), aten_op, 
exir_ops="executorch_exir_dialects_edge__ops_aten_permute_copy_default", - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_pixel_shuffling.py b/backends/arm/test/ops/test_pixel_shuffling.py new file mode 100644 index 00000000000..5aeb8b2d1bb --- /dev/null +++ b/backends/arm/test/ops/test_pixel_shuffling.py @@ -0,0 +1,233 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Tuple + +import pytest + +import torch + +from executorch.backends.arm.constants import MAX_RANK + +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineINT, + EthosU85PipelineINT, + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) +from torch import nn + +aten_op_pixel_unshuffle = "torch.ops.aten.pixel_unshuffle.default" +exir_op_pixel_unshuffle = ( + "executorch_exir_dialects_edge__ops_aten_pixel_unshuffle_default" +) + +aten_op_pixel_shuffle = "torch.ops.aten.pixel_shuffle.default" +exir_op_pixel_shuffle = "executorch_exir_dialects_edge__ops_aten_pixel_shuffle_default" + +input_t1 = Tuple[torch.Tensor] # single positional input (1-tuple) + +max_rank_input_supported = MAX_RANK - 2 + + +class PixelUnShuffle(nn.Module): + + upscale_factor = 2 + test_data_generators = { + "rand_4d": lambda: (torch.randn(1, 12, 64, 64),), + "test_4d": lambda: (torch.tensor([[[[10.0, 20.0], [30.0, 40.0]]]]),), + "test_3d": lambda: (torch.tensor([[[10.0, 20.0], [30.0, 40.0]]]),), + } + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.space_to_depth = nn.PixelUnshuffle(self.upscale_factor) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + if inputs.dim() > max_rank_input_supported: + raise RuntimeError( + f"Max rank of input for pixel_unshuffle is currently {max_rank_input_supported}, got {inputs.dim()}" + 
) + return self.space_to_depth(inputs) + + +class PixelShuffle(nn.Module): + + upscale_factor = 2 + test_data_generators = { + "rand_4d": lambda: (torch.randn(1, 12, 64, 64),), + "test_4d": lambda: (torch.tensor([[[[10.0]], [[20.0]], [[30.0]], [[40.0]]]]),), + "test_3d": lambda: (torch.tensor([[[10.0]], [[20.0]], [[30.0]], [[40.0]]]),), + } + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.depth_to_space = nn.PixelShuffle(self.upscale_factor) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + if inputs.dim() > max_rank_input_supported: + raise RuntimeError( + f"Max rank of input for pixel_shuffle is currently {max_rank_input_supported}, got {inputs.dim()}" + ) + return self.depth_to_space(inputs) + + +@common.parametrize("test_data", PixelUnShuffle.test_data_generators) +def test_pixel_unshuffle_tosa_FP(test_data: input_t1): + pipeline = TosaPipelineFP[input_t1]( + PixelUnShuffle(), + test_data(), + aten_op_pixel_unshuffle, + exir_op_pixel_unshuffle, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelUnShuffle.test_data_generators) +def test_pixel_unshuffle_tosa_INT(test_data: input_t1): + pipeline = TosaPipelineINT[input_t1]( + PixelUnShuffle(), + test_data(), + aten_op_pixel_unshuffle, + exir_op_pixel_unshuffle, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelShuffle.test_data_generators) +def test_pixel_shuffle_tosa_FP(test_data: input_t1): + pipeline = TosaPipelineFP[input_t1]( + PixelShuffle(), + test_data(), + aten_op_pixel_shuffle, + exir_op_pixel_shuffle, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelShuffle.test_data_generators) +def test_pixel_shuffle_tosa_INT(test_data: input_t1): + pipeline = TosaPipelineINT[input_t1]( + PixelShuffle(), + test_data(), + aten_op_pixel_shuffle, + exir_op_pixel_shuffle, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelUnShuffle.test_data_generators) +@common.SkipIfNoModelConverter +def 
test_pixel_unshuffle_vgf_FP(test_data: input_t1): + pipeline = VgfPipeline[input_t1]( + PixelUnShuffle(), + test_data(), + aten_op_pixel_unshuffle, + exir_op_pixel_unshuffle, + tosa_version="TOSA-1.0+FP", + run_on_vulkan_runtime=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelUnShuffle.test_data_generators) +@common.SkipIfNoModelConverter +def test_pixel_unshuffle_vgf_INT(test_data: input_t1): + pipeline = VgfPipeline[input_t1]( + PixelUnShuffle(), + test_data(), + aten_op_pixel_unshuffle, + exir_op_pixel_unshuffle, + tosa_version="TOSA-1.0+INT", + run_on_vulkan_runtime=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelShuffle.test_data_generators) +@common.SkipIfNoModelConverter +def test_pixel_shuffle_vgf_FP(test_data: input_t1): + pipeline = VgfPipeline[input_t1]( + PixelShuffle(), + test_data(), + aten_op_pixel_shuffle, + exir_op_pixel_shuffle, + tosa_version="TOSA-1.0+FP", + run_on_vulkan_runtime=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelShuffle.test_data_generators) +@common.SkipIfNoModelConverter +def test_pixel_shuffle_vgf_INT(test_data: input_t1): + pipeline = VgfPipeline[input_t1]( + PixelShuffle(), + test_data(), + aten_op_pixel_shuffle, + exir_op_pixel_shuffle, + tosa_version="TOSA-1.0+INT", + run_on_vulkan_runtime=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelUnShuffle.test_data_generators) +@common.XfailIfNoCorstone300 +def test_pixel_unshuffle_u55_INT(test_data: input_t1): + pipeline = EthosU55PipelineINT[input_t1]( + PixelUnShuffle(), + test_data(), + aten_op_pixel_unshuffle, + exir_op_pixel_unshuffle, + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelUnShuffle.test_data_generators) +@common.XfailIfNoCorstone320 +@pytest.mark.xfail(reason="MLETORCH-1424: rand test fails") +def test_pixel_unshuffle_u85_INT(test_data: input_t1): + pipeline = EthosU85PipelineINT[input_t1]( + PixelUnShuffle(), + test_data(), + 
aten_op_pixel_unshuffle, + exir_op_pixel_unshuffle, + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelShuffle.test_data_generators) +@common.XfailIfNoCorstone300 +def test_pixel_shuffle_u55_INT(test_data: input_t1): + pipeline = EthosU55PipelineINT[input_t1]( + PixelShuffle(), + test_data(), + aten_op_pixel_shuffle, + exir_op_pixel_shuffle, + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelShuffle.test_data_generators) +@common.XfailIfNoCorstone320 +@pytest.mark.xfail(reason="MLETORCH-1424: rand test fails") +def test_pixel_shuffle_u85_INT(test_data: input_t1): + pipeline = EthosU85PipelineINT[input_t1]( + PixelShuffle(), + test_data(), + aten_op_pixel_shuffle, + exir_op_pixel_shuffle, + run_on_fvp=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_pow.py b/backends/arm/test/ops/test_pow.py index 016c3e97265..377d1355992 100644 --- a/backends/arm/test/ops/test_pow.py +++ b/backends/arm/test/ops/test_pow.py @@ -159,7 +159,6 @@ def test_pow_tensor_scalar_u55_INT(test_data: Pow_TensorScalar.input_t): (base,), Pow_TensorScalar.aten_op, Pow_TensorScalar.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -173,7 +172,6 @@ def test_pow_tensor_scalar_u85_INT(test_data: Pow_TensorScalar.input_t): (base,), Pow_TensorScalar.aten_op, Pow_TensorScalar.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_reciprocal.py b/backends/arm/test/ops/test_reciprocal.py index 78edbb980e8..3e4d7c18b40 100644 --- a/backends/arm/test/ops/test_reciprocal.py +++ b/backends/arm/test/ops/test_reciprocal.py @@ -71,7 +71,6 @@ def test_reciprocal_u55_INT(test_data: torch.Tensor): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() @@ -84,7 +83,6 @@ def test_reciprocal_u85_INT(test_data: torch.Tensor): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=False, symmetric_io_quantization=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_relu.py 
b/backends/arm/test/ops/test_relu.py index 0b29bc24e75..fad6e7a9162 100644 --- a/backends/arm/test/ops/test_relu.py +++ b/backends/arm/test/ops/test_relu.py @@ -43,6 +43,28 @@ def forward(self, x): return self.relu(x) +test_data_conv_relu = { + # (test_name, test_data) + "4d_randn_inplace=True": (lambda: (torch.randn(1, 64, 96, 96) * 1000, True)), + "4d_randn_inplace=False": (lambda: (torch.randn(1, 64, 96, 96) * 1000, False)), +} + + +class Conv2d_Relu_Add(torch.nn.Module): + def __init__(self, inplace: bool = True): + super().__init__() + self.conv1 = torch.nn.Conv2d( + in_channels=64, out_channels=64, kernel_size=7, padding="same" + ) + self.relu = torch.nn.ReLU(inplace=inplace) + + def forward(self, x: torch.Tensor): + y = self.conv1(x) + z = self.relu(y) + out = x + z + return out + + @common.parametrize("test_data", test_data_suite) def test_relu_tosa_FP(test_data: torch.Tensor): pipeline = TosaPipelineFP[input_t1]( @@ -54,6 +76,35 @@ def test_relu_tosa_FP(test_data: torch.Tensor): pipeline.run() +# Test the folding of Conv2D with ReLU +@common.parametrize("test_data", test_data_conv_relu) +def test_conv_relu_folding_tosa_INT(test_data: torch.Tensor): + input_data, inplace = test_data() + pipeline = TosaPipelineINT[input_t1]( + Conv2d_Relu_Add(inplace=inplace), + (input_data,), + [], + [], + ) + # We should have : + # 3 quantize_per_tensor nodes: input activation , output of the conv-relu sequence, out of the add + # 4 dequantize_per_tensor nodes: into the conv2d input, into the add, output of the conv-relu sequence, before returning + # 2 dequantize_per_channel nodes: one for the weights and another one for the bias + # In case of incorrect annotation of the ReLU, we get separate Q/DR around both the conv2d and the ReLU and + # therefore more quantize_per_tensor and dequantize_per_tensor nodes + pipeline.add_stage_after( + "quantize", + pipeline.tester.check_count, + { + "quantized_decomposed.quantize_per_tensor.default": 3, + 
"torch.ops.quantized_decomposed.dequantize_per_tensor.default": 4, + "quantized_decomposed.dequantize_per_channel.default": 2, + }, + suffix="quant_nodes", + ) + pipeline.run() + + @common.parametrize("test_data", test_data_suite) def test_relu_tosa_INT(test_data: torch.Tensor): pipeline = TosaPipelineINT[input_t1]( @@ -66,25 +117,25 @@ def test_relu_tosa_INT(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_relu_u55_INT(test_data: torch.Tensor): pipeline = EthosU55PipelineINT[input_t1]( Relu(), (test_data(),), aten_op, exir_op, - run_on_fvp=False, ) pipeline.run() @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 def test_relu_u85_INT(test_data: torch.Tensor): pipeline = EthosU85PipelineINT[input_t1]( Relu(), (test_data(),), aten_op, exir_op, - run_on_fvp=False, ) pipeline.run() diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py index 3236515b661..56986a54781 100644 --- a/backends/arm/test/ops/test_repeat.py +++ b/backends/arm/test/ops/test_repeat.py @@ -88,6 +88,7 @@ def test_repeat_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_repeat_u55_INT(test_data: Tuple): module, test_data = test_data() pipeline = EthosU55PipelineINT[input_t1]( @@ -95,12 +96,12 @@ def test_repeat_u55_INT(test_data: Tuple): test_data, module.aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 def test_repeat_u85_INT(test_data: Tuple): module, test_data = test_data() pipeline = EthosU85PipelineINT[input_t1]( @@ -108,7 +109,6 @@ def test_repeat_u85_INT(test_data: Tuple): test_data, module.aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() diff --git a/backends/arm/test/ops/test_rshift.py b/backends/arm/test/ops/test_rshift.py index e97bfb840ae..f7a821e3a63 100644 --- 
a/backends/arm/test/ops/test_rshift.py +++ b/backends/arm/test/ops/test_rshift.py @@ -96,14 +96,13 @@ def test_bitwise_right_shift_tensor_tosa_INT_scalar(test_data): @common.parametrize("test_data", RshiftScalar.test_data) -@XfailIfNoCorstone300 +@common.XfailIfNoCorstone300 def test_bitwise_right_shift_tensor_u55_INT_scalar(test_data): pipeline = EthosU55PipelineINT[scalar_input_t]( RshiftScalar(), test_data(), RshiftScalar.torch_op_INT, RshiftScalar.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") @@ -113,14 +112,13 @@ def test_bitwise_right_shift_tensor_u55_INT_scalar(test_data): @common.parametrize("test_data", RshiftScalar.test_data) -@XfailIfNoCorstone320 +@common.XfailIfNoCorstone320 def test_bitwise_right_shift_tensor_u85_INT_scalar(test_data): pipeline = EthosU85PipelineINT[scalar_input_t]( RshiftScalar(), test_data(), RshiftScalar.torch_op_INT, RshiftScalar.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -188,7 +186,6 @@ def test_bitwise_right_shift_tensor_u55_INT(test_data): test_data(), RshiftTensor.torch_op, RshiftTensor.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") @@ -205,7 +202,6 @@ def test_bitwise_right_shift_tensor_u85_INT(test_data): test_data(), RshiftTensor.torch_op, RshiftTensor.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() diff --git a/backends/arm/test/ops/test_rsqrt.py b/backends/arm/test/ops/test_rsqrt.py index d146a83287e..23bb9dc1a4b 100644 --- a/backends/arm/test/ops/test_rsqrt.py +++ b/backends/arm/test/ops/test_rsqrt.py @@ -66,7 +66,6 @@ def test_rsqrt_u55_INT(test_tensor: torch.Tensor): test_tensor(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -79,7 +78,6 @@ def test_rsqrt_u85_INT(test_tensor: torch.Tensor): test_tensor(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_scalar_tensor.py b/backends/arm/test/ops/test_scalar_tensor.py index 
22c1cc0373d..d5e5b365da1 100644 --- a/backends/arm/test/ops/test_scalar_tensor.py +++ b/backends/arm/test/ops/test_scalar_tensor.py @@ -2,7 +2,6 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. - import torch from executorch.backends.arm.test import common @@ -86,7 +85,6 @@ def test_scalar_tensor_u55_INT(test_data): ScalarTensor(scalar, dtype), tuple(data), ScalarTensor.aten_op, - run_on_fvp=True, ).run() @@ -98,7 +96,6 @@ def test_scalar_tensor_u85_INT(test_data): ScalarTensor(scalar, dtype), tuple(data), ScalarTensor.aten_op, - run_on_fvp=True, ).run() diff --git a/backends/arm/test/ops/test_select.py b/backends/arm/test/ops/test_select.py index 4c3887f1e18..23046c34fe4 100644 --- a/backends/arm/test/ops/test_select.py +++ b/backends/arm/test/ops/test_select.py @@ -110,7 +110,6 @@ def test_select_int_u55_INT_copy(test_data: Tuple): test_data(), aten_op_copy, exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -124,7 +123,6 @@ def test_select_int_u55_INT(test_data: Tuple): test_data(), aten_op_int, exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -151,7 +149,6 @@ def test_select_int_u85_INT_copy(test_data: Tuple): test_data(), aten_op_copy, exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -165,7 +162,6 @@ def test_select_int_u85_INT(test_data: Tuple): test_data(), aten_op_int, exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_sigmoid.py b/backends/arm/test/ops/test_sigmoid.py index aac2ee1c9b1..a9b9ef11b48 100644 --- a/backends/arm/test/ops/test_sigmoid.py +++ b/backends/arm/test/ops/test_sigmoid.py @@ -141,25 +141,25 @@ def test_sigmoid_tosa_INT_3(): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_sigmoid_u55_INT(test_data: Tuple): pipeline = 
EthosU55PipelineINT[input_t1]( Sigmoid(), (test_data(),), aten_op, exir_op, - run_on_fvp=False, ) pipeline.run() @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 def test_sigmoid_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Sigmoid(), (test_data(),), aten_op, exir_op, - run_on_fvp=False, ) pipeline.run() @@ -324,7 +324,6 @@ def test_sigmoid_16a8w_u55_INT16(test_data: torch.Tensor): exir_op, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -352,7 +351,6 @@ def test_sigmoid_16a8w_u85_INT16(test_data: torch.Tensor): exir_op, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_sigmoid_16bit.py b/backends/arm/test/ops/test_sigmoid_16bit.py index ad8c49b234c..587ba99222a 100644 --- a/backends/arm/test/ops/test_sigmoid_16bit.py +++ b/backends/arm/test/ops/test_sigmoid_16bit.py @@ -125,6 +125,7 @@ def test_sigmoid_tosa_INT_add_sigmoid(test_data): "test_data", test_data_suite, ) +@common.XfailIfNoCorstone300 def test_sigmoid_u55_INT(test_data): pipeline = OpNotSupportedPipeline( Sigmoid(), @@ -141,6 +142,7 @@ def test_sigmoid_u55_INT(test_data): "test_data", test_data_suite, ) +@common.XfailIfNoCorstone300 def test_sigmoid_u55_INT_add_sigmoid(test_data): pipeline = OpNotSupportedPipeline( SigmoidAddSigmoid(), @@ -163,7 +165,6 @@ def test_sigmoid_u85_INT(test_data): (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, - run_on_fvp=True, ) pipeline.change_args("quantize", get_16bit_sigmoid_quantizer()) pipeline.run() @@ -184,7 +185,6 @@ def test_sigmoid_u85_INT_add_sigmoid(test_data): (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, - run_on_fvp=True, ) pipeline.change_args("quantize", get_16bit_sigmoid_quantizer()) pipeline.run() diff --git a/backends/arm/test/ops/test_sigmoid_32bit.py 
b/backends/arm/test/ops/test_sigmoid_32bit.py index 70863cd4757..389f1d8a278 100644 --- a/backends/arm/test/ops/test_sigmoid_32bit.py +++ b/backends/arm/test/ops/test_sigmoid_32bit.py @@ -131,6 +131,7 @@ def test_sigmoid_tosa_INT_add_sigmoid(test_data): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_sigmoid_u55_INT(test_data): pipeline = OpNotSupportedPipeline( Sigmoid(), @@ -145,6 +146,7 @@ def test_sigmoid_u55_INT(test_data): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_sigmoid_u55_INT_add_sigmoid(test_data): pipeline = OpNotSupportedPipeline( SigmoidAddSigmoid(), @@ -167,7 +169,6 @@ def test_sigmoid_u85_INT(test_data): (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, - run_on_fvp=True, ) pipeline.change_args("quantize", get_32bit_sigmoid_quantizer()) pipeline.run() @@ -184,7 +185,6 @@ def test_sigmoid_u85_INT_add_sigmoid(test_data): (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, - run_on_fvp=True, ) pipeline.change_args("quantize", get_32bit_sigmoid_quantizer()) pipeline.run() diff --git a/backends/arm/test/ops/test_silu.py b/backends/arm/test/ops/test_silu.py index edc7d769be1..362358d0813 100644 --- a/backends/arm/test/ops/test_silu.py +++ b/backends/arm/test/ops/test_silu.py @@ -79,7 +79,9 @@ def test_silu_tosa_INT_inplace(test_data: input_t): def test_silu_u55_INT(test_data: input_t): silu_data = (test_data(), False) pipeline = EthosU55PipelineINT[input_t]( - Silu(), silu_data, Silu.aten_op_INT, run_on_fvp=True + Silu(), + silu_data, + Silu.aten_op_INT, ) pipeline.run() @@ -89,7 +91,9 @@ def test_silu_u55_INT(test_data: input_t): def test_silu_u55_INT_inplace(test_data: input_t): silu_data = (test_data(), True) pipeline = EthosU55PipelineINT[input_t]( - Silu(), silu_data, Silu.aten_op_INT, run_on_fvp=True + Silu(), + silu_data, + Silu.aten_op_INT, ) pipeline.run() @@ -99,7 +103,9 @@ def test_silu_u55_INT_inplace(test_data: input_t): def test_silu_u85_INT(test_data: 
input_t): silu_data = (test_data(), False) pipeline = EthosU85PipelineINT[input_t]( - Silu(), silu_data, Silu.aten_op_INT, run_on_fvp=True + Silu(), + silu_data, + Silu.aten_op_INT, ) pipeline.run() @@ -109,7 +115,9 @@ def test_silu_u85_INT(test_data: input_t): def test_silu_u85_INT_inplace(test_data: input_t): silu_data = (test_data(), True) pipeline = EthosU85PipelineINT[input_t]( - Silu(), silu_data, Silu.aten_op_INT, run_on_fvp=True + Silu(), + silu_data, + Silu.aten_op_INT, ) pipeline.run() diff --git a/backends/arm/test/ops/test_sin.py b/backends/arm/test/ops/test_sin.py index 3ca593ad608..06d06e3b11d 100644 --- a/backends/arm/test/ops/test_sin.py +++ b/backends/arm/test/ops/test_sin.py @@ -61,25 +61,25 @@ def test_sin_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_sin_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Sin(), (test_data,), aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 def test_sin_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Sin(), (test_data,), aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py index eafeb04320e..7e71a51899a 100644 --- a/backends/arm/test/ops/test_slice.py +++ b/backends/arm/test/ops/test_slice.py @@ -7,7 +7,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.quantizer.arm_quantizer import ( get_symmetric_a16w8_quantization_config, @@ -34,11 +33,11 @@ test_data_suite = { "ones_neg_3": lambda: (torch.ones(10), [(3, -3)]), "ones_neg_8": lambda: (torch.ones(10), [(-8, 3)]), - "ones_slice_2": lambda: (torch.ones(10, 10), [(1, 3), (3, None)]), - "ones_slice_3": lambda: (torch.ones(10, 10, 10), [(0, 7), (0, None), (0, 8)]), + "ones_slice_2": lambda: (torch.ones(10, 10), [(1, 3), (3, 10)]), + 
"ones_slice_3": lambda: (torch.ones(10, 10, 10), [(0, 7), (0, 10), (0, 8)]), "ones_slice_4": lambda: ( torch.ones((1, 12, 10, 10)), - [(None, None), (None, 5), (3, 5), (4, 10)], + [(0, 1), (0, 5), (3, 5), (4, 10)], ), } @@ -78,26 +77,32 @@ def test_slice_tensor_tosa_INT_nhwc(test_data: torch.Tensor): pipeline.run() -@common.parametrize("test_data", test_data_suite) +x_fails = { + "ones_slice_3": "MLETORCH-1402: Slice operator has incorrect number of inputs", + "ones_slice_4": "MLETORCH-1402: Slice operator has incorrect number of inputs", +} + + +@common.parametrize("test_data", test_data_suite, x_fails) +@common.XfailIfNoCorstone300 def test_slice_tensor_u55_INT(test_data: torch.Tensor): pipeline = EthosU55PipelineINT[input_t1]( Slice(), test_data(), aten_ops=[], exir_ops=[], - run_on_fvp=False, ) pipeline.run() -@common.parametrize("test_data", test_data_suite) +@common.parametrize("test_data", test_data_suite, x_fails) +@common.XfailIfNoCorstone320 def test_slice_tensor_u85_INT(test_data: torch.Tensor): pipeline = EthosU85PipelineINT[input_t1]( Slice(), test_data(), aten_ops=[], exir_ops=[], - run_on_fvp=False, ) pipeline.run() @@ -148,9 +153,6 @@ def get_symmetric_a16w8_slice_quantizer(per_channel_quantization=False): @common.parametrize("test_data", test_data_suite) -@pytest.mark.xfail( - reason="missing int16 slice ops support; fails at TOSA reference model with Unsupported operation type or rank. 
See: https://github.com/pytorch/executorch/issues/13976" -) def test_slice_tensor_16a8w_tosa_INT(test_data: torch.Tensor): """Test slice operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -176,9 +178,6 @@ def test_slice_tensor_16a8w_tosa_INT(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone300 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 slice operations" -) def test_slice_tensor_16a8w_u55_INT16(test_data: torch.Tensor): """Test slice operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -190,7 +189,6 @@ def test_slice_tensor_16a8w_u55_INT16(test_data: torch.Tensor): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -204,9 +202,6 @@ def test_slice_tensor_16a8w_u55_INT16(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 slice operations" -) def test_slice_tensor_16a8w_u85_INT16(test_data: torch.Tensor): """Test slice operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -218,7 +213,6 @@ def test_slice_tensor_16a8w_u85_INT16(test_data: torch.Tensor): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index 6b4455fc702..22bd919fccd 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -65,7 +65,11 @@ def test_softmax_tosa_INT(test_data): @common.XfailIfNoCorstone300 def test_softmax_u55_INT(test_data): data, dim = test_data() - 
pipeline = EthosU55PipelineINT[input_t1](Softmax(dim), data, [], run_on_fvp=True) + pipeline = EthosU55PipelineINT[input_t1]( + Softmax(dim), + data, + [], + ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() @@ -75,7 +79,11 @@ def test_softmax_u55_INT(test_data): @common.XfailIfNoCorstone320 def test_softmax_u85_INT(test_data): data, dim = test_data() - pipeline = EthosU85PipelineINT[input_t1](Softmax(dim), data, [], run_on_fvp=True) + pipeline = EthosU85PipelineINT[input_t1]( + Softmax(dim), + data, + [], + ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() diff --git a/backends/arm/test/ops/test_split.py b/backends/arm/test/ops/test_split.py index 388e85762af..284c142a34e 100644 --- a/backends/arm/test/ops/test_split.py +++ b/backends/arm/test/ops/test_split.py @@ -132,17 +132,24 @@ def test_split_with_sizes_tosa_INT(test_data: input_t1): pipeline.run() +x_fails = { + "split_3d_2_sizes_dim": "MLETORCH-1403: Split operator is running out of memory when reading input file", + "split_4d_2_sizes_dim_neg": "MLETORCH-1403: Split operator is running out of memory when reading input file", +} + + @common.parametrize( "test_data", (Split.test_data | Split.test_data_list), + x_fails, ) +@common.XfailIfNoCorstone300 def test_split_with_sizes_u55_INT(test_data: input_t1): pipeline = EthosU55PipelineINT[input_t1]( Split(), test_data(), aten_ops=[], exir_ops=exir_op, - run_on_fvp=False, ) pipeline.run() @@ -150,7 +157,9 @@ def test_split_with_sizes_u55_INT(test_data: input_t1): @common.parametrize( "test_data", (Split.test_data | Split.test_data_list), + x_fails, ) +@common.XfailIfNoCorstone320 def test_split_with_sizes_u85_INT(test_data: input_t1): pipeline = EthosU85PipelineINT[input_t1]( @@ -158,7 +167,6 @@ def test_split_with_sizes_u85_INT(test_data: input_t1): 
test_data(), aten_ops=[], exir_ops=exir_op, - run_on_fvp=False, ) pipeline.run() diff --git a/backends/arm/test/ops/test_sqrt.py b/backends/arm/test/ops/test_sqrt.py index 15e2dd45322..13a2366b17c 100644 --- a/backends/arm/test/ops/test_sqrt.py +++ b/backends/arm/test/ops/test_sqrt.py @@ -70,7 +70,6 @@ def test_sqrt_u55_INT(test_data: Sqrt.input_t): test_data(), Sqrt.aten_op_INT, Sqrt.exir_op_INT, - run_on_fvp=True, ) pipeline.run() @@ -83,7 +82,6 @@ def test_sqrt_u85_INT(test_data: Sqrt.input_t): test_data(), Sqrt.aten_op_INT, Sqrt.exir_op_INT, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_squeeze.py b/backends/arm/test/ops/test_squeeze.py index 0de51673496..3c2014cdcda 100644 --- a/backends/arm/test/ops/test_squeeze.py +++ b/backends/arm/test/ops/test_squeeze.py @@ -95,7 +95,6 @@ def test_squeeze_dim_u55_INT(test_data: Tuple): test_data(), aten_ops="torch.ops.aten.squeeze.default", exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -108,7 +107,6 @@ def test_squeeze_dim_u85_INT(test_data: Tuple): test_data(), aten_ops="torch.ops.aten.squeeze.default", exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -174,7 +172,6 @@ def test_squeeze_dim_u55_INT_2(test_data: Tuple): test_data(), aten_ops="torch.ops.aten.squeeze.dim", exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -187,7 +184,6 @@ def test_squeeze_dim_u85_INT_2(test_data: Tuple): test_data(), aten_ops="torch.ops.aten.squeeze.dim", exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -253,7 +249,6 @@ def test_squeeze_dims_u55_INT(test_data: Tuple): test_data(), aten_ops="torch.ops.aten.squeeze.dims", exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -266,7 +261,6 @@ def test_squeeze_dims_u85_INT(test_data: Tuple): test_data(), aten_ops="torch.ops.aten.squeeze.dims", exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_stack.py b/backends/arm/test/ops/test_stack.py new file mode 100644 index 00000000000..873a599992a --- /dev/null +++ 
b/backends/arm/test/ops/test_stack.py @@ -0,0 +1,150 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +import torch.nn as nn + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineINT, + EthosU85PipelineINT, + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) + +test_data_suite = { + # (test_name, test_data) + "ones_two_tensors": lambda: ((torch.ones(1), torch.ones(1)), 0), + "ones_and_rand_three_tensors": lambda: ( + (torch.ones(1, 2), torch.randn(1, 2), torch.randn(1, 2)), + 1, + ), + "ones_and_rand_four_tensors": lambda: ( + ( + torch.ones(1, 2, 5), + torch.randn(1, 2, 5), + torch.randn(1, 2, 5), + torch.randn(1, 2, 5), + ), + -1, + ), + "rand_two_tensors": lambda: ( + (torch.randn(2, 2, 4), torch.randn(2, 2, 4)), + 2, + ), + "rand_two_tensors_dim_0": lambda: ( + (torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 4)), + ), + "rand_two_tensors_dim_2": lambda: ( + (torch.randn(2, 2, 3, 5), torch.randn(2, 2, 3, 5)), + 2, + ), + "rand_large": lambda: ( + ( + 10000 * torch.randn(2, 3, 1, 4), + torch.randn(2, 3, 1, 4), + torch.randn(2, 3, 1, 4), + ), + -3, + ), +} + + +class Stack(nn.Module): + aten_op = "torch.ops.aten.stack.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_cat_default" + + def forward(self, n: tuple[torch.Tensor, ...], dim: int = 0): + return torch.stack(n, dim) + + +input_t1 = Tuple[torch.Tensor] + + +@common.parametrize("test_module", test_data_suite) +def test_stack_tosa_FP(test_module: input_t1): + test_data = test_module() + pipeline = TosaPipelineFP[input_t1]( + Stack(), + test_data, + aten_op=Stack.aten_op, + exir_op=Stack.exir_op, + use_to_edge_transform_and_lower=False, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_suite) +def 
test_stack_tosa_INT(test_module: input_t1): + test_data = test_module() + pipeline = TosaPipelineINT[input_t1]( + Stack(), + test_data, + aten_op=Stack.aten_op, + exir_op=Stack.exir_op, + use_to_edge_transform_and_lower=False, + ) + pipeline.run() + + +@common.XfailIfNoCorstone300 +@common.parametrize("test_module", test_data_suite) +def test_stack_u55_INT(test_module: input_t1): + test_data = test_module() + pipeline = EthosU55PipelineINT[input_t1]( + Stack(), + test_data, + aten_ops=Stack.aten_op, + exir_ops=Stack.exir_op, + use_to_edge_transform_and_lower=False, + ) + pipeline.run() + + +@common.XfailIfNoCorstone320 +@common.parametrize("test_module", test_data_suite) +def test_stack_u85_INT(test_module: input_t1): + test_data = test_module() + pipeline = EthosU85PipelineINT[input_t1]( + Stack(), + test_data, + aten_ops=Stack.aten_op, + exir_ops=Stack.exir_op, + use_to_edge_transform_and_lower=False, + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +@common.parametrize("test_module", test_data_suite) +def test_stack_vgf_FP(test_module: input_t1): + test_data = test_module() + pipeline = VgfPipeline[input_t1]( + Stack(), + test_data, + aten_op=Stack.aten_op, + exir_op=Stack.exir_op, + tosa_version="TOSA-1.0+FP", + use_to_edge_transform_and_lower=False, + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +@common.parametrize("test_module", test_data_suite) +def test_stack_vgf_INT(test_module: input_t1): + test_data = test_module() + pipeline = VgfPipeline[input_t1]( + Stack(), + test_data, + aten_op=Stack.aten_op, + exir_op=Stack.exir_op, + tosa_version="TOSA-1.0+INT", + use_to_edge_transform_and_lower=False, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index c691506beb2..68b6ad5fb93 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -10,8 +10,12 @@ from typing import Tuple import torch +from executorch.backends.arm.quantizer.arm_quantizer import ( + 
get_symmetric_a16w8_quantization_config, + TOSAQuantizer, +) -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.test_pipeline import ( EthosU55PipelineINT, EthosU85PipelineINT, @@ -19,6 +23,8 @@ TosaPipelineINT, VgfPipeline, ) +from executorch.backends.arm.tosa import TosaSpecification +from executorch.backends.xnnpack.test.tester import Quantize aten_op = "torch.ops.aten.sub.Tensor" exir_op = "executorch_exir_dialects_edge__ops_aten_sub_Tensor" @@ -73,6 +79,11 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): return x - y +class SubAlpha(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor): + return torch.sub(x, y, alpha=5) + + class SubTan(torch.nn.Module): def forward(self, x: torch.Tensor, y: torch.Tensor): @@ -109,6 +120,18 @@ def test_sub_tensor_tosa_FP_2(test_data: Tuple[torch.Tensor, torch.Tensor]): pipeline.run() +@common.parametrize("test_data", sub_tan_test_data) +def test_sub_tensor_tosa_FP_alpha(test_data: Tuple[torch.Tensor, torch.Tensor]): + """Test Two-Operand Subtraction with alpha (TOSA FP)""" + pipeline = TosaPipelineFP[input_t2]( + SubAlpha(), + test_data(), + aten_op, + exir_op, + ) + pipeline.run() + + @common.parametrize("test_data", sub_test_data) def test_sub_tensor_tosa_INT(test_data): """Test Subtraction (TOSA INT)""" @@ -132,6 +155,15 @@ def test_sub_tensor_tosa_INT_3(test_data: Tuple[torch.Tensor, torch.Tensor]): pipeline.run() +@common.parametrize("test_data", sub_tan_test_data) +def test_sub_tensor_tosa_INT_alpha(test_data: Tuple[torch.Tensor, torch.Tensor]): + """Test Two-Operand Subtraction with alpha (TOSA INT)""" + pipeline = TosaPipelineINT[input_t2]( + SubAlpha(), test_data(), aten_op, exir_op, qtol=0 + ) + pipeline.run() + + @common.parametrize("test_data", sub_test_data) @common.XfailIfNoCorstone300 def test_sub_tensor_u55_INT(test_data): @@ -141,7 +173,6 @@ def test_sub_tensor_u55_INT(test_data): 
test_data(), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -155,7 +186,6 @@ def test_sub_tensor_u55_INT_2(test_data: Tuple[torch.Tensor, torch.Tensor]): test_data(), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -169,7 +199,6 @@ def test_sub_tensor_u85_INT_2(test_data): test_data(), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -183,7 +212,6 @@ def test_sub_tensor_u85_INT(test_data: Tuple[torch.Tensor, torch.Tensor]): test_data(), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -242,3 +270,96 @@ def test_sub_tensor_vgf_INT_2(test_data: Tuple[torch.Tensor, torch.Tensor]): tosa_version="TOSA-1.0+INT", ) pipeline.run() + + +def get_symmetric_a16w8_sub_quantizer(per_channel_quantization=False): + tosa_version = conftest.get_option("tosa_version") + tosa_profiles = { + "1.0": TosaSpecification.create_from_string("TOSA-1.0+INT+int16"), + } + + quantizer = TOSAQuantizer(tosa_profiles[tosa_version]) + quantizer.set_global( + get_symmetric_a16w8_quantization_config(is_per_channel=per_channel_quantization) + ) + + return Quantize( + quantizer, + get_symmetric_a16w8_quantization_config( + is_per_channel=per_channel_quantization + ), + ) + + +@common.parametrize("test_data", sub_test_data) +def test_sub_tensor_16a8w_tosa_INT(test_data: input_t1): + """Test sub operation with 16A8W quantization (16-bit activations, 8-bit weights)""" + per_channel_quantization = False + + pipeline = TosaPipelineINT[input_t1]( + Sub(), + test_data(), + aten_op, + exir_op=[], + per_channel_quantization=per_channel_quantization, + use_to_edge_transform_and_lower=True, + tosa_extensions=["int16"], + ) + + pipeline.change_args( + "quantize", + get_symmetric_a16w8_sub_quantizer( + per_channel_quantization=per_channel_quantization + ), + ) + pipeline.run() + + +@common.parametrize("test_data", sub_test_data) +@common.XfailIfNoCorstone300 +def test_sub_tensor_16a8w_u55_INT16(test_data: input_t1): + """Test sub operation with 16A8W quantization on U55 (16-bit 
activations, 8-bit weights)""" + per_channel_quantization = False + + pipeline = EthosU55PipelineINT[input_t1]( + Sub(), + test_data(), + aten_op, + exir_op, + per_channel_quantization=per_channel_quantization, + use_to_edge_transform_and_lower=True, + run_on_fvp=True, + ) + + pipeline.change_args( + "quantize", + get_symmetric_a16w8_sub_quantizer( + per_channel_quantization=per_channel_quantization + ), + ) + pipeline.run() + + +@common.parametrize("test_data", sub_test_data) +@common.XfailIfNoCorstone320 +def test_sub_tensor_16a8w_u85_INT16(test_data: input_t1): + """Test sub operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" + per_channel_quantization = False + + pipeline = EthosU85PipelineINT[input_t1]( + Sub(), + test_data(), + aten_op, + exir_op, + per_channel_quantization=per_channel_quantization, + use_to_edge_transform_and_lower=True, + run_on_fvp=True, + ) + + pipeline.change_args( + "quantize", + get_symmetric_a16w8_sub_quantizer( + per_channel_quantization=per_channel_quantization + ), + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_sum.py b/backends/arm/test/ops/test_sum.py index 9308315f76d..13c1e029032 100644 --- a/backends/arm/test/ops/test_sum.py +++ b/backends/arm/test/ops/test_sum.py @@ -72,7 +72,6 @@ def test_view_u55_INT_1_0(test_data: Tuple): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -85,7 +84,6 @@ def test_view_u85_INT_1_0(test_data: Tuple): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -94,7 +92,11 @@ def test_view_u85_INT_1_0(test_data: Tuple): @common.SkipIfNoModelConverter def test_sum_dim_intlist_vgf_FP(test_data: input_t1): pipeline = VgfPipeline[input_t1]( - Sum(), test_data(), aten_op, tosa_version="TOSA-1.0+FP" + Sum(), + test_data(), + aten_op, + tosa_version="TOSA-1.0+FP", + run_on_vulkan_runtime=True, ) pipeline.run() @@ -107,6 +109,7 @@ def test_sum_dim_intlist_vgf_INT(test_data: input_t1): test_data(), aten_op, 
tosa_version="TOSA-1.0+INT", + run_on_vulkan_runtime=True, ) pipeline.run() @@ -119,7 +122,7 @@ def test_sum_dim_intlist_vgf_INT(test_data: input_t1): @common.parametrize("test_data", reject_inputs) -def test_view_u55_INT_not_delegated(test_data: Tuple): +def test_view_u55_INT_failure_set(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Sum(), test_data(), diff --git a/backends/arm/test/ops/test_tanh.py b/backends/arm/test/ops/test_tanh.py index f3f4df31d0e..8dc967c01d7 100644 --- a/backends/arm/test/ops/test_tanh.py +++ b/backends/arm/test/ops/test_tanh.py @@ -77,7 +77,6 @@ def test_tanh_u55_INT(test_data: Tuple): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -90,7 +89,6 @@ def test_tanh_u85_INT(test_data: Tuple): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -178,7 +176,6 @@ def test_tanh_16a8w_u55_INT16(test_data: torch.Tensor): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -206,7 +203,6 @@ def test_tanh_16a8w_u85_INT16(test_data: torch.Tensor): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_to_copy.py b/backends/arm/test/ops/test_to_copy.py index 5c01788c805..1fdc4619131 100644 --- a/backends/arm/test/ops/test_to_copy.py +++ b/backends/arm/test/ops/test_to_copy.py @@ -192,20 +192,15 @@ def test_to_vgf_INT(test_data: Tuple): ), } -redundant_xfails_FP = { +redundant_xfails = { "rand_fp16_fp16": "FP16 is not supported", "rand_int8_int8": "Tracing graph with quantized input is not supported.", "rand_int16_int16": "Tracing graph with quantized input is not supported.", } -redundant_xfails_INT = { - "rand_fp16_fp16": "FP16 is not supported", - "rand_int8_int8": "Tracing graph with quantized input is not supported.", -} - @common.parametrize( - "test_data", 
_TO_COPY_TEST_DATA_REDUNDANT_CAST, xfails=redundant_xfails_FP + "test_data", _TO_COPY_TEST_DATA_REDUNDANT_CAST, xfails=redundant_xfails ) def test_to_tosa_FP_REDUNDANT_CAST(test_data: Tuple): test_tensor, new_dtype = test_data() @@ -220,7 +215,7 @@ def test_to_tosa_FP_REDUNDANT_CAST(test_data: Tuple): @common.parametrize( - "test_data", _TO_COPY_TEST_DATA_REDUNDANT_CAST, xfails=redundant_xfails_INT + "test_data", _TO_COPY_TEST_DATA_REDUNDANT_CAST, xfails=redundant_xfails ) def test_to_tosa_INT_REDUNDANT_CAST(test_data: Tuple): test_tensor, new_dtype = test_data() @@ -244,3 +239,32 @@ def test_to_tosa_INT_not_delegated_REDUNDANT_CAST(test_data: Tuple): non_delegated_ops={}, # These are removed outside of the Arm backend so the graph is empty ) pipeline.run() + + +_TO_COPY_DATA_INT_U55_REJECT = { + "rand_bool_int8": lambda: ( + torch.randint(0, 2, (1, 2, 3, 4), dtype=torch.bool), + torch.int8, + ), + "rand_int16_bool": lambda: ( + torch.randint(-1000, 1000, (1, 2, 3, 4), dtype=torch.int16), + torch.bool, + ), + "rand_int32_int8": lambda: ( + torch.randint(-1000, 1000, (1, 2, 3, 4), dtype=torch.int32), + torch.int8, + ), +} + + +@common.parametrize("test_data", _TO_COPY_DATA_INT_U55_REJECT) +def test_to_u55_INT(test_data: Tuple): + test_tensor, new_dtype = test_data() + pipeline = OpNotSupportedPipeline[input_t1]( + Cast(new_dtype), + (test_tensor,), + u55_subset=True, + quantize=True, + non_delegated_ops={}, # These are removed outside of the Arm backend so the graph is empty + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_unary_combos.py b/backends/arm/test/ops/test_unary_combos.py index db442d2d8d0..bfeb9b59e80 100644 --- a/backends/arm/test/ops/test_unary_combos.py +++ b/backends/arm/test/ops/test_unary_combos.py @@ -109,7 +109,10 @@ def test_unary_combos_tosa_INT(model_cls): def test_unary_combos_u55_INT(model_cls): m, inputs, exir = _build(model_cls) p = EthosU55PipelineINT[Tensor1]( - m, inputs, aten_ops=[], exir_ops=exir, run_on_fvp=True + m, + 
inputs, + aten_ops=[], + exir_ops=exir, ) p.run() @@ -119,7 +122,10 @@ def test_unary_combos_u55_INT(model_cls): def test_unary_combos_u85_INT(model_cls): m, inputs, exir = _build(model_cls) p = EthosU85PipelineINT[Tensor1]( - m, inputs, aten_ops=[], exir_ops=exir, run_on_fvp=True + m, + inputs, + aten_ops=[], + exir_ops=exir, ) p.run() diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py index 9da1a352ebb..c76c1236ab3 100644 --- a/backends/arm/test/ops/test_unsqueeze.py +++ b/backends/arm/test/ops/test_unsqueeze.py @@ -65,7 +65,6 @@ def test_unsqueeze_u55_INT(test_tensor: torch.Tensor): (*test_tensor, 0), aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() @@ -78,7 +77,6 @@ def test_unsqueeze_u85_INT(test_tensor: torch.Tensor): (*test_tensor, 0), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_upsample_bilinear2d.py b/backends/arm/test/ops/test_upsample_bilinear2d.py index 95e69bc5204..1edba708f1f 100644 --- a/backends/arm/test/ops/test_upsample_bilinear2d.py +++ b/backends/arm/test/ops/test_upsample_bilinear2d.py @@ -259,7 +259,6 @@ def test_upsample_bilinear2d_vec_U85_INT_Upsample(test_data: input_t1): Upsample(size, scale_factor), (test_data,), aten_op, - run_on_fvp=True, qtol=1, use_to_edge_transform_and_lower=True, ) @@ -279,7 +278,6 @@ def test_upsample_bilinear2d_vec_U85_INT_Interpolate( Interpolate(size, scale_factor), (test_data,), aten_op, - run_on_fvp=True, qtol=1, use_to_edge_transform_and_lower=True, ) @@ -299,7 +297,6 @@ def test_upsample_bilinear2d_vec_U85_INT_UpsamplingBilinear2d( UpsamplingBilinear2d(size, scale_factor), (test_data,), aten_op, - run_on_fvp=True, qtol=1, use_to_edge_transform_and_lower=True, ) diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py index 9567f90c480..9f1c437fc65 100644 --- a/backends/arm/test/ops/test_var.py +++ b/backends/arm/test/ops/test_var.py @@ -194,7 +194,6 @@ def 
test_var_dim_u55_INT_no_dim(test_data: Tuple): (test_data,), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -208,7 +207,6 @@ def test_var_dim_u85_INT_no_dim(test_data: Tuple): (test_data,), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -276,7 +274,6 @@ def test_var_dim_u55_INT(test_data: Tuple): (test_data,), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -290,7 +287,6 @@ def test_var_dim_u85_INT(test_data: Tuple): (test_data,), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -357,7 +353,6 @@ def test_var_dim_u55_INT_correction(test_data: Tuple): (test_data,), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -371,7 +366,6 @@ def test_var_dim_u85_INT_correction(test_data: Tuple): (test_data,), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index ed942c07aa1..3e706ae1cac 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -9,7 +9,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.quantizer.arm_quantizer import ( get_symmetric_a16w8_quantization_config, @@ -180,9 +179,6 @@ def get_symmetric_a16w8_view_quantizer(per_channel_quantization=False): @common.parametrize("test_data", View.needs_transpose_tests) -@pytest.mark.xfail( - reason="missing int16 view ops support; fails at TOSA reference model with Unsupported operation type or rank. 
See: https://github.com/pytorch/executorch/issues/13977" -) def test_view_16a8w_tosa_INT(test_data: Tuple): """Test view operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -209,9 +205,6 @@ def test_view_16a8w_tosa_INT(test_data: Tuple): @common.parametrize("test_data", View.needs_transpose_tests) @common.XfailIfNoCorstone300 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 view operations" -) def test_view_16a8w_u55_INT16(test_data: Tuple): """Test view operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -224,7 +217,6 @@ def test_view_16a8w_u55_INT16(test_data: Tuple): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -238,9 +230,6 @@ def test_view_16a8w_u55_INT16(test_data: Tuple): @common.parametrize("test_data", View.needs_transpose_tests) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 view operations" -) def test_view_16a8w_u85_INT16(test_data: Tuple): """Test view operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -253,7 +242,6 @@ def test_view_16a8w_u85_INT16(test_data: Tuple): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_where.py b/backends/arm/test/ops/test_where.py index ea036d26361..a35a9fc3b7d 100644 --- a/backends/arm/test/ops/test_where.py +++ b/backends/arm/test/ops/test_where.py @@ -139,8 +139,11 @@ def scalar_condition(input: torch.Tensor): test_modules_FP = { **test_modules_common, - "float32_tensor_cond_tuple_dtype": lambda: float32_tensor_cond_tuple_dtype, "float32_tensor_cond_tuple_dtype_bool": lambda: 
float32_tensor_cond_tuple_dtype_bool, +} + +test_modules_FP_unsupported_dtype = { + "float32_tensor_cond_tuple_dtype": lambda: float32_tensor_cond_tuple_dtype, "int32_scalar_cond": lambda: int32_scalar_cond, } @@ -162,6 +165,17 @@ def test_where_self_tosa_FP(test_module): pipeline.run() +@common.parametrize("test_module", test_modules_FP_unsupported_dtype) +def test_where_self_tosa_FP_unsupported_dtype(test_module): + pipeline = OpNotSupportedPipeline[input_t]( + test_module(), + test_module().get_inputs(), + {exir_op: 1}, + n_expected_delegates=1, # condition can be delegated + ) + pipeline.run() + + @common.parametrize("test_module", test_modules_INT) def test_where_self_tosa_INT(test_module): pipeline = TosaPipelineINT[input_t]( @@ -212,7 +226,6 @@ def test_where_self_u85_INT(test_module): test_module().get_inputs(), aten_op, exir_op, - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.run() diff --git a/backends/arm/test/passes/test_insert_rescale_i32_pass.py b/backends/arm/test/passes/test_insert_rescale_i32_pass.py new file mode 100644 index 00000000000..096c90d330d --- /dev/null +++ b/backends/arm/test/passes/test_insert_rescale_i32_pass.py @@ -0,0 +1,77 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes import ( + FoldAndAnnotateQParamsPass, + InsertRescaleInt32Pass, +) +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + + +class NeedsRescaleOps(torch.nn.Module): + """A module containing ops that require INT32 inputs/outputs.""" + + input_t = Tuple[torch.Tensor, torch.Tensor] + + def __init__(self): + super().__init__() + + def forward(self, x, y): + a = torch.maximum(x, y) + b = torch.abs(a) + c = a > b + return c + + def get_inputs(self, dtype) -> input_t: + if dtype == torch.float32: + return (torch.rand(1, 3, 5, 6), torch.rand(1, 3, 5, 6)) + elif dtype == torch.int32: + return ( + torch.randint(3, 5, (3,), dtype=torch.int32), + torch.randint(3, 5, (3,), dtype=torch.int32), + ) + else: + raise ValueError("Not a valid input dtype for model") + + +def test_insert_rescales(): + module = NeedsRescaleOps() + input_t = Tuple[torch.Tensor, torch.Tensor] + ops_not_before = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"} + ops_after = { + # "number of op nodes with i8 output" + "number of i8 node inputs" + "executorch_exir_dialects_backend__ops_tosa_RESCALE_default": 2 + + 5, + } + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(torch.float32), + quantize=True, + ops_not_before_pass=ops_not_before, + ops_after_pass=ops_after, + pass_list=[FoldAndAnnotateQParamsPass, InsertRescaleInt32Pass], + ) + pipeline.pop_stage("run_method_and_compare_outputs") + pipeline.run() + + +def test_dont_insert_rescales(): + module = NeedsRescaleOps() + input_t = Tuple[torch.Tensor, torch.Tensor] + ops_not_before = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"} + # All inputs are already i32. Rescales should not be added. 
+ ops_not_after = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"} + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(torch.int32), + ops_not_before_pass=ops_not_before, + ops_not_after_pass=ops_not_after, + pass_list=[FoldAndAnnotateQParamsPass, InsertRescaleInt32Pass], + ) + pipeline.pop_stage("run_method_and_compare_outputs") + pipeline.run() diff --git a/backends/arm/test/passes/test_rescale_pass.py b/backends/arm/test/passes/test_rescale_pass.py index 3baa03fde65..9774ebd2fcd 100644 --- a/backends/arm/test/passes/test_rescale_pass.py +++ b/backends/arm/test/passes/test_rescale_pass.py @@ -183,7 +183,6 @@ def test_quantized_rescale_u55(test_data: tuple[torch.Tensor, torch.Tensor]): test_data=test_data, aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -199,6 +198,5 @@ def test_quantized_rescale_u85(test_data: tuple[torch.Tensor, torch.Tensor]): test_data=test_data, aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/passes/test_to_tosa_memory_format.py b/backends/arm/test/passes/test_to_tosa_memory_format.py index 1e9b8ffc63d..643a3bf5733 100644 --- a/backends/arm/test/passes/test_to_tosa_memory_format.py +++ b/backends/arm/test/passes/test_to_tosa_memory_format.py @@ -6,7 +6,10 @@ from typing import Tuple import torch -from executorch.backends.arm._passes import ToTosaMemoryFormatPass +from executorch.backends.arm._passes import ( + AnnotateOutputDimOrderPass, + ToTosaMemoryFormatPass, +) from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -177,7 +180,10 @@ def test_to_tosa_memory_format_tosa_INT(module): ops_after_pass=module.ops_after_pass, ops_not_after_pass=module.ops_not_after_pass, pass_list=[RemoveGetItemPass], - passes_with_exported_program=[ToTosaMemoryFormatPass], + passes_with_exported_program=[ + AnnotateOutputDimOrderPass, + ToTosaMemoryFormatPass, + ], ) pipeline.pop_stage( 
"run_method_and_compare_outputs" diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 1b59b186a2e..3d002eff25e 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -13,11 +13,19 @@ from pathlib import Path +from types import NoneType from typing import Any, cast, Dict, List, Literal, Optional, Tuple import numpy as np import torch +from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec +from executorch.backends.arm.constants import ( + NHWC_INVERSE_ORDER, + NHWC_ORDER, + NNHWC_INVERSE_ORDER, + NNHWC_ORDER, +) from executorch.backends.arm.ethosu import EthosUCompileSpec from executorch.backends.arm.test.conftest import is_option_enabled @@ -157,6 +165,36 @@ def get_output_quantization_params( return quant_params +def torch_tensor_to_numpy(tensor: torch.Tensor) -> np.ndarray: + dtype = _torch_to_numpy_dtype_dict[tensor.dtype] + array = tensor.detach().numpy().astype(dtype) + dim_order = tensor.dim_order() + if dim_order == NHWC_ORDER: + a = array.transpose(NHWC_ORDER) + return a + elif dim_order == NNHWC_ORDER: + return array.transpose(NNHWC_ORDER) + else: + return array + + +def numpy_to_torch_tensor(array: np.ndarray, output_node: Node) -> torch.Tensor: + output_tensor = get_first_fake_tensor(output_node) + shape = output_tensor.shape + dim_order = output_tensor.dim_order() + if dim_order == NHWC_ORDER: + shape_with_dim_order = [shape[i] for i in NHWC_ORDER] + tensor = torch.from_numpy(array).reshape(shape_with_dim_order) + return tensor.permute(NHWC_INVERSE_ORDER).to(memory_format=torch.channels_last) + elif dim_order == NNHWC_ORDER: + shape_with_dim_order = [shape[i] for i in NNHWC_ORDER] + tensor = torch.from_numpy(array).reshape(shape_with_dim_order) + return tensor.permute(NNHWC_INVERSE_ORDER).to(memory_format=torch.channels_last) + else: + tensor = torch.from_numpy(array).reshape(shape) + 
return tensor + + class TosaReferenceModelDispatch(TorchFunctionMode): """A context manager for executing call_delegate nodes using the reference model""" @@ -168,7 +206,8 @@ def _tosa_dispatch(self, lowered_backend_module: LoweredBackendModule, inputs): tosa_buffer = lowered_backend_module.processed_bytes compile_spec = TosaCompileSpec.from_list(lowered_backend_module.compile_specs) - return run_tosa_graph(tosa_buffer, compile_spec.tosa_spec, inputs) + output_node = lowered_backend_module.original_module.graph.output_node() + return run_tosa_graph(tosa_buffer, compile_spec.tosa_spec, inputs, output_node) def __exit__(self, exc_type, exc_val, exc_tb): super().__exit__(exc_type, exc_val, exc_tb) @@ -190,6 +229,22 @@ def __torch_function__(self, func, types, args=..., kwargs=None): ) kwargs = kwargs or {} + + # This is a hack since Q/DQ ops does not handle channels last input correctly: the simplest and most robust + # workaround is to simply run them in channels first format and then convert back to channels last. 
+ if func in ( + torch.ops.quantized_decomposed.quantize_per_tensor.out, + torch.ops.quantized_decomposed.dequantize_per_tensor.out, + torch.ops.quantized_decomposed.quantize_per_channel.out, + torch.ops.quantized_decomposed.dequantize_per_channel.out, + ): + + input_dim_order = args[0].dim_order() + if input_dim_order in (NHWC_ORDER, NNHWC_ORDER): + args = [args[0].to(memory_format=torch.contiguous_format), *args[1:]] + res = func(*args, **kwargs) + return res.to(memory_format=torch.channels_last) + return func(*args, **kwargs) @@ -244,14 +299,13 @@ def get_output_from_file( output_np = [] output_node = exported_program.graph_module.graph.output_node() for i, node in enumerate(output_node.args[0]): - output_shape = node.meta["val"].shape output_dtype = node.meta["val"].dtype tosa_ref_output = np.fromfile( os.path.join(intermediate_path, f"{output_base_name}-{i}.bin"), _torch_to_numpy_dtype_dict[output_dtype], ) - output_np.append(torch.from_numpy(tosa_ref_output).reshape(output_shape)) + output_np.append(numpy_to_torch_tensor(tosa_ref_output, node)) return tuple(output_np) @@ -437,11 +491,14 @@ def prep_data_for_save( quant_param: Optional[QuantizationParams] = None, ): if isinstance(data, torch.Tensor): - data_np = np.array(data.detach(), order="C").astype( - _torch_to_numpy_dtype_dict[data.dtype] - ) + data_np = torch_tensor_to_numpy(data) + elif isinstance(data, (int, float, bool, NoneType)): + return np.array(data) else: - data_np = np.array(data) + raise RuntimeError( + f"Input dtype {type(data)} could not be converted to numpy array." 
+ ) + if quant_param is not None: assert quant_param.node_name in input_name, ( f"The quantization params name '{quant_param.node_name}' does not " @@ -455,30 +512,8 @@ def prep_data_for_save( f"{quant_param.dtype}".replace("torch.", "") ) # Use string format of dtype to convert to numpy dtype ) - return data_np - - -def save_npy( - path: str, - data, - input_name: str, - quant_param: Optional[QuantizationParams] = None, -) -> str: - """Serializes and saves 'data' as a .npy file, possibly quantizing it before. - - Parameters: - path: the directory where to save the data. - data: the data to save. - input_name: the name of the file, without file-ending. - quant_param: the parameters to use for quantization. - Returns: - the full file path of the output. - """ - data_np = prep_data_for_save(data, input_name, quant_param) - file_path = os.path.join(path, input_name + ".npy") - np.save(file_path, data_np, allow_pickle=False) - return file_path + return data_np def save_bytes( @@ -691,9 +726,12 @@ def run_tosa_graph( graph: Any, tosa_version: TosaSpecification, inputs: list[torch.Tensor], + output_node: Node, ) -> list[torch.Tensor]: """Runs the TOSA reference model with inputs and returns the result.""" - inputs_np = [input.numpy() for input in inputs] + + # Convert tensors to numpy arrays with correct dim_order + inputs_np = [torch_tensor_to_numpy(input_tensor) for input_tensor in inputs] if isinstance(tosa_version, Tosa_1_00): import tosa_reference_model as reference_model @@ -715,7 +753,13 @@ def run_tosa_graph( status == reference_model.GraphStatus.TOSA_VALID ), "Non-valid TOSA given to reference model." 
- return [torch.from_numpy(output) for output in outputs_np] + # Convert output numpy arrays to tensors with same dim_order as the output nodes + result = [ + numpy_to_torch_tensor(output_array, node) + for output_array, node in zip(outputs_np, output_node.args[0]) + ] + + return result def get_target_board(compile_spec: ArmCompileSpec) -> str | None: diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index f240855cdf4..5fdd1c3d827 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -4,7 +4,7 @@ load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest") load("@bazel_skylib//lib:paths.bzl", "paths") def define_arm_tests(): - # TODO Add more tests + # TODO [fbonly] Add more tests test_files = [] # Passes @@ -22,9 +22,11 @@ def define_arm_tests(): "ops/test_mul.py", "ops/test_slice.py", "ops/test_sigmoid.py", + "ops/test_sub.py", "ops/test_tanh.py", "ops/test_view.py", "ops/test_cos.py", + "ops/test_to_copy.py", ] # Quantization @@ -39,7 +41,7 @@ def define_arm_tests(): "misc/test_bn_relu_folding_qat.py", "misc/test_custom_partition.py", "misc/test_debug_hook.py", - "misc/test_dim_order_guards.py", + # "misc/test_dim_order.py", (TODO - T238390249) "misc/test_outputs_order.py", ] diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh index 53c707cad28..b8e8aee4e3a 100755 --- a/backends/arm/test/test_arm_baremetal.sh +++ b/backends/arm/test/test_arm_baremetal.sh @@ -155,17 +155,18 @@ test_pytest_ethosu_fvp() { # Same as test_pytest but also sometime verify using test_pytest_ops_vkml() { # Same as test_pytest but also sometime verify using VKML runtime - echo "${TEST_SUITE_NAME}: Run pytest with VKML" + echo "${TEST_SUITE_NAME}: Run pytest operator tests with VKML runtime" backends/arm/scripts/build_executorch.sh backends/arm/test/setup_testing_vkml.sh - pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ 
--ignore=backends/arm/test/models + pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ \ + --ignore=backends/arm/test/models -k _vgf_ echo "${TEST_SUITE_NAME}: PASS" } test_pytest_models_vkml() { # Same as test_pytest but also sometime verify VKML runtime - echo "${TEST_SUITE_NAME}: Run pytest with VKML" + echo "${TEST_SUITE_NAME}: Run pytest model tests with VKML runtime" backends/arm/scripts/build_executorch.sh backends/arm/test/setup_testing_vkml.sh @@ -173,7 +174,7 @@ test_pytest_models_vkml() { # Same as test_pytest but also sometime verify VKML # Install model dependencies for pytest source backends/arm/scripts/install_models_for_test.sh - pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models + pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k _vgf_ echo "${TEST_SUITE_NAME}: PASS" } @@ -365,5 +366,20 @@ test_smaller_stories_llama() { echo "${TEST_SUITE_NAME}: PASS" } +test_memory_allocation() { + echo "${TEST_SUITE_NAME}: Test ethos-u memory allocation with run.sh" + + mkdir -p arm_test/test_run + # Ethos-U85 + echo "${TEST_SUITE_NAME}: Test target Ethos-U85" + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=examples/arm/example_modules/add.py &> arm_test/test_run/full.log + python3 backends/arm/test/test_memory_allocator_log.py --log arm_test/test_run/full.log \ + --require "model_pte_program_size" "<= 3000 B" \ + --require "method_allocator_planned" "<= 64 B" \ + --require "method_allocator_loaded" "<= 1024 B" \ + --require "method_allocator_input" "<= 4 B" \ + --require "Total DRAM used" "<= 0.06 KiB" + echo "${TEST_SUITE_NAME}: PASS" +} ${TEST_SUITE} diff --git a/backends/arm/test/test_memory_allocator_log.py b/backends/arm/test/test_memory_allocator_log.py new file mode 100644 index 00000000000..3853b60b7f6 --- /dev/null +++ b/backends/arm/test/test_memory_allocator_log.py @@ -0,0 +1,170 @@ +# Copyright 2025 
Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +""" +Check log files for memory metrics and compare them against thresholds. + +Usage example: + python3 test_memory_allocator_log.py \ + --log path/to/log.txt \ + --require "Total SRAM used" "<= 310 KiB" \ + --require "method_allocator_input" "<= 4 B" +""" + +import argparse +import re +import sys +from typing import List, Optional, Tuple + + +def unit_factor(u: str) -> float: + if not u: + return 1.0 + ul = u.strip().lower() + table = { + "b": 1, + "byte": 1, + "bytes": 1, + "kb": 1000, + "mb": 1000**2, + "gb": 1000**3, + "kib": 1024, + "mib": 1024**2, + "gib": 1024**3, + } + if ul in table: + return float(table[ul]) + return 1.0 + + +def parse_value(text_num: str, text_unit: Optional[str]) -> float: + return float(text_num) * unit_factor(text_unit or "") + + +def parse_cond(cond: str) -> Tuple[str, float, str]: + # Regexp explained. Example of things it will parse: + # "< 310 KiB", ">=10MB", "== 42", "!=3 bytes", "<=0.5 MiB" + + # The regexp explained in detail: + # ^: anchor the match to the start and end of the string (no extra chars allowed). + # \s*: optional whitespace (spaces, tabs, etc.). + # (<=|>=|==|!=|<|>): capturing group 1. One of the comparison operators: <=, >=, ==, !=, <, >. + # \s*: optional whitespace. + # ([0-9]+(?:\.[0-9]+)?): capturing group 2. A number: + # [0-9]+: one or more digits (the integer part). + # (?:\.[0-9]+)?: optional non-capturing group for a fractional part like .25. + # \s*: optional whitespace between number and unit + # ([A-Za-z]+)?: capturing group 3, optional. A unit made of letters only (e.g., B, KB, KiB, MB, MiB). Case# insensitive by class choice. + # \s*: optional trailing whitespace. 
+ m = re.match( + r"^\s*(<=|>=|==|!=|<|>)\s*([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?\s*$", cond + ) + if not m: + raise ValueError(f"Invalid condition: {cond}") + op, num, unit = m.groups() + return op, float(num), (unit or "") + + +def compare(a: float, b: float, op: str) -> bool: + return { + "<": a < b, + "<=": a <= b, + ">": a > b, + ">=": a >= b, + "==": abs(a - b) < 1e-9, + "!=": abs(a - b) >= 1e-9, + }[op] + + +def find_metric_value(line: str, label: str) -> Tuple[Optional[str], Optional[str]]: + # Same regexp as parse_cond() but without the first group of matching comparison operators + # First go, search for the pattern but escape and ignore cases + # The regexp: + # ([0-9]+(?:\.[0-9]+)?) — capturing group 1: a decimal number + # [0-9]+ — one or more digits (integer part) + # (?:\.[0-9]+)? — optional fractional part like .25 (non-capturing) + # \s* — optional whitespace between number and unit + # ([A-Za-z]+)? — capturing group 2 (optional): a unit made only of letters (e.g., B, KB, KiB, MB) + m = re.search( + re.escape(label) + r".*?([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?", + line, + flags=re.IGNORECASE, + ) + if m: + return m.group(1), m.group(2) + # Second go, same regexp as above but not caring about label. 
If + # no number was tied to a label be happy just salvaging it from + # the line + m = re.search(r"([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?", line) + if m: + return m.group(1), m.group(2) + return None, None + + +def first_line_with_label(lines: List[str], label: str) -> Optional[str]: + label_lc = label.lower() + return next((ln for ln in lines if label_lc in ln.lower()), None) + + +def check_requirement(label: str, cond: str, lines: List[str]) -> Optional[str]: + op, thr_num, thr_unit = parse_cond(cond) + matched = first_line_with_label(lines, label) + if matched is None: + return f"{label}: not found in log" + + num_str, unit_str = find_metric_value(matched, label) + if num_str is None: + return f"{label}: value not found on line: {matched.strip()}" + + left_bytes = parse_value(num_str, unit_str) + right_bytes = parse_value(str(thr_num), thr_unit or (unit_str or "")) + ok = compare(left_bytes, right_bytes, op) + + human_left = f"{num_str} {unit_str or 'B'}" + human_right = f"{thr_num:g} {thr_unit or (unit_str or 'B')}" + print( + f"[check] {label}: {human_left} {op} {human_right} -> {'OK' if ok else 'FAIL'}" + ) + + if ok: + return None + return f"{label}: {human_left} not {op} {human_right}" + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--log", required=True, help="Path to log file") + parser.add_argument( + "--require", + action="append", + nargs=2, + metavar=("LABEL", "COND"), + default=[], + help="""Required label and condition consisting + of a number and unit. 
Example: \"Total DRAM + used\" \"<= 0.06 KiB\"""", + ) + args = parser.parse_args() + + with open(args.log, "r", encoding="utf-8", errors="ignore") as f: + lines = f.readlines() + + failures: List[str] = [] + for label, cond in args.require: + msg = check_requirement(label, cond, lines) + if msg: + failures.append(msg) + + if failures: + print("Failures:") + for msg in failures: + print(" - " + msg) + return 1 + + print("All checks passed.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 8bf72827549..0cba8d987c0 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -28,17 +28,11 @@ import torch.fx import torch.utils._pytree as pytree - from executorch.backends.arm._passes.arm_pass_manager import ArmPassManager from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec -from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner -from executorch.backends.arm.quantizer import ( - EthosUQuantizer, - get_symmetric_quantization_config, - TOSAQuantizer, - VgfQuantizer, -) +from executorch.backends.arm.ethosu import EthosUCompileSpec +from executorch.backends.arm.quantizer import get_symmetric_quantization_config from executorch.backends.arm.test.runner_utils import ( dbg_tosa_fb_to_json, get_output_quantization_params, @@ -53,9 +47,13 @@ from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec from executorch.backends.arm.tosa.mapping import extract_tensor_meta -from executorch.backends.arm.tosa.partitioner import TOSAPartitioner -from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner +from executorch.backends.arm.util._factory import ( + create_partitioner, + create_quantizer, + parse_compile_spec, +) +from executorch.backends.arm.vgf import VgfCompileSpec from 
executorch.backends.test.harness.error_statistics import ErrorStatistics from executorch.backends.test.harness.stages import Stage, StageType @@ -83,7 +81,6 @@ _copy_module, _update_exported_program_graph_module, ) - from tabulate import tabulate from torch.export.graph_signature import ExportGraphSignature, InputSpec, OutputSpec @@ -103,12 +100,6 @@ def _dump_lowered_modules_artifact( artifact.exported_program().graph_signature ) - def get_output_format(lowered_module) -> str | None: - for spec in lowered_module.compile_specs: - if spec.key == "output_format": - return spec.value.decode() - return None - for node in graph_module.graph.nodes: if node.op == "get_attr" and node.name.startswith("lowered_module_"): lowered_module = getattr(graph_module, node.name) @@ -116,13 +107,13 @@ def get_output_format(lowered_module) -> str | None: lowered_module, LoweredBackendModule ), f"Attribute {node.name} must be of type LoweredBackendModule." - output_format = get_output_format(lowered_module) - if output_format == "tosa": + compile_spec = parse_compile_spec(lowered_module.compile_specs) + if isinstance(compile_spec, TosaCompileSpec): tosa_fb = lowered_module.processed_bytes to_print = dbg_tosa_fb_to_json(tosa_fb) to_print = pformat(to_print, compact=True, indent=1) output += f"\nTOSA deserialized {node.name}: \n{to_print}\n" - elif output_format == EthosUCompileSpec.get_output_format(): + elif isinstance(compile_spec, EthosUCompileSpec): vela_cmd_stream = lowered_module.processed_bytes output += f"\nVela command stream {node.name}: \n{vela_cmd_stream}\n" else: @@ -284,13 +275,7 @@ def quantize( quantize_stage: Optional[tester.Quantize] = None, ): if quantize_stage is None: - quantizer = None - if isinstance(self.compile_spec, TosaCompileSpec): - quantizer = TOSAQuantizer(self.compile_spec) - elif isinstance(self.compile_spec, EthosUCompileSpec): - quantizer = EthosUQuantizer(self.compile_spec) - elif isinstance(self.compile_spec, VgfCompileSpec): - quantizer = 
VgfQuantizer(self.compile_spec) + quantizer = create_quantizer(self.compile_spec) quantize_stage = tester.Quantize( quantizer, get_symmetric_quantization_config(), @@ -312,14 +297,7 @@ def to_edge( def partition(self, partition_stage: Optional[Partition] = None): if partition_stage is None: - if isinstance(self.compile_spec, TosaCompileSpec): - arm_partitioner = TOSAPartitioner(self.compile_spec) - elif isinstance(self.compile_spec, EthosUCompileSpec): - arm_partitioner = EthosUPartitioner(self.compile_spec) - elif isinstance(self.compile_spec, VgfCompileSpec): - arm_partitioner = VgfPartitioner(self.compile_spec) - else: - raise ValueError("compile spec doesn't target any Arm Partitioner") + arm_partitioner = create_partitioner(self.compile_spec) partition_stage = Partition(arm_partitioner) return super().partition(partition_stage) @@ -329,7 +307,7 @@ def to_edge_transform_and_lower( partitioners: Optional[List[Partitioner]] = None, edge_compile_config: Optional[EdgeCompileConfig] = None, additional_checks: Optional[ - List[Union[DontPartition | DontPartitionModule | DontPartitionName]] + List[DontPartition | DontPartitionModule | DontPartitionName] ] = None, transform_passes: Optional[ Union[Sequence[PassType], Dict[str, Sequence[PassType]]] @@ -343,20 +321,9 @@ def to_edge_transform_and_lower( if to_edge_and_lower_stage is None: if partitioners is None: - if isinstance(self.compile_spec, TosaCompileSpec): - arm_partitioner = TOSAPartitioner( - self.compile_spec, additional_checks - ) - elif isinstance(self.compile_spec, EthosUCompileSpec): - arm_partitioner = EthosUPartitioner( - self.compile_spec, additional_checks - ) - elif isinstance(self.compile_spec, VgfCompileSpec): - arm_partitioner = VgfPartitioner( - self.compile_spec, additional_checks - ) - else: - raise ValueError("compile spec doesn't target any Arm Partitioner") + arm_partitioner = create_partitioner( + self.compile_spec, additional_checks + ) partitioners = [arm_partitioner] 
to_edge_and_lower_stage = ToEdgeTransformAndLower( partitioners, @@ -463,6 +430,10 @@ def run_method_and_compare_outputs( for run_iteration in range(num_runs): reference_input = inputs if inputs else next(self.generate_random_inputs()) + # Avoid issues with inplace operators + test_input = copy.deepcopy(reference_input) + original_input = copy.deepcopy(reference_input) + input_shapes = [ generated_input.shape if hasattr(generated_input, "shape") else (1,) for generated_input in reference_input @@ -477,16 +448,16 @@ def run_method_and_compare_outputs( # Run exported module directly test_outputs, _ = pytree.tree_flatten( self._calculate_reference_output( - exported_program.module(), reference_input + exported_program.module(), test_input ) ) else: # Run lowered model with target test_outputs, _ = pytree.tree_flatten( - test_stage.run_artifact(reference_input) + test_stage.run_artifact(test_input) ) - logger.info(f"\n Input: {reference_input}") + logger.info(f"\n Input: {original_input}") logger.info(f"\n Ref output: {reference_outputs}") logger.info(f"\nTest output: {test_outputs}") @@ -743,22 +714,19 @@ def _get_tosa_operator_distribution( op_list = [] id = 0 while lowered_module := getattr(graph_module, f"lowered_module_{id}", None): - for spec in lowered_module.compile_specs: - if spec.key != "output_format": - continue - if spec.value == b"tosa": - tosa_fb = lowered_module.processed_bytes - tosa_json = dbg_tosa_fb_to_json(tosa_fb) - for region in tosa_json["regions"]: - for block in region["blocks"]: - op_list.extend( - [operator["op"] for operator in block["operators"]] - ) - break - elif spec.value == EthosUCompileSpec.get_output_format().encode(): - return "Can not get operator distribution for Vela command stream." - else: - return f"Unknown output format '{spec.value}'." 
+ compile_spec = parse_compile_spec(lowered_module.compile_specs) + if isinstance(compile_spec, TosaCompileSpec): + tosa_fb = lowered_module.processed_bytes + tosa_json = dbg_tosa_fb_to_json(tosa_fb) + for region in tosa_json["regions"]: + for block in region["blocks"]: + op_list.extend([operator["op"] for operator in block["operators"]]) + elif isinstance(compile_spec, EthosUCompileSpec): + return "Can not get operator distribution for Vela command stream." + elif isinstance(compile_spec, VgfCompileSpec): + return "Can not get operator distribution for VGF." + else: + return f"Unknown output format '{compile_spec.get_output_format()}'." id += 1 if id == 0: return "No delegate with name 'lowered_module_0 found in graph module." diff --git a/backends/arm/test/tester/test_pipeline.py b/backends/arm/test/tester/test_pipeline.py index 123c1af44c3..54a8f08ee50 100644 --- a/backends/arm/test/tester/test_pipeline.py +++ b/backends/arm/test/tester/test_pipeline.py @@ -906,7 +906,7 @@ class VgfPipeline(BasePipelineMaker, Generic[T]): exir_ops: Exir dialect ops expected to be found in the graph after to_edge. if not using use_edge_to_transform_and_lower. - run_on_vulkan_runtime: Set to true to test VGF output on VKML runtime. + run_on_vulkan_runtime: Whether to test VGF output on VKML runtime. vgf_compiler_flags: Optional compiler flags. 
@@ -922,7 +922,7 @@ def __init__( test_data: T, aten_op: str | List[str], exir_op: Optional[str | List[str]] = None, - run_on_vulkan_runtime: bool = False, + run_on_vulkan_runtime: bool = True, vgf_compiler_flags: Optional[str] = "", tosa_version: str = "TOSA-1.0+FP", symmetric_io_quantization: bool = False, @@ -1018,3 +1018,16 @@ def __init__( qtol=qtol, inputs=self.test_data, ) + self.run_on_vulkan_runtime = run_on_vulkan_runtime + + # TODO: Remove once CI fully working + def run(self): + import pytest + + if self.run_on_vulkan_runtime: + try: + super().run() + except FileNotFoundError as e: + pytest.skip(f"VKML executor_runner not found - not built - skip {e}") + else: + super().run() diff --git a/backends/arm/tosa/backend.py b/backends/arm/tosa/backend.py index afae6f8163f..7a7ea2ca377 100644 --- a/backends/arm/tosa/backend.py +++ b/backends/arm/tosa/backend.py @@ -104,10 +104,15 @@ def _preprocess( # noqa: C901 # const data directly. Path created and data written only in debug builds. tosa_graph = ts.TosaSerializer(artifact_path) - assert ( + if not ( tosa_spec.version.major == ts.TOSA_VERSION_MAJOR and tosa_spec.version.minor == ts.TOSA_VERSION_MINOR - ), f"TOSA serializer version ({ts.TOSA_VERSION_MAJOR}.{ts.TOSA_VERSION_MINOR}) doesn't match specification {tosa_spec}" + ): + raise RuntimeError( + f"TOSA serializer version " + f"({ts.TOSA_VERSION_MAJOR}.{ts.TOSA_VERSION_MINOR}) " + f"doesn't match specification {tosa_spec}" + ) # TODO: Fix the need to lazily import this. from executorch.backends.arm._passes import ArmPassManager @@ -201,8 +206,8 @@ def filter_tosa_compile_specs( hardware. 
""" - new_compile_spec = TosaCompileSpec.__new__(TosaCompileSpec) - new_compile_spec._set_compile_specs( - compile_spec.tosa_spec, [], compile_spec.get_intermediate_path() + return ( + TosaCompileSpec(compile_spec.tosa_spec) + .dump_intermediate_artifacts_to(compile_spec.get_intermediate_path()) + .dump_debug_info(compile_spec.tosa_debug_mode) ) - return new_compile_spec diff --git a/backends/arm/tosa/dialect/__init__.py b/backends/arm/tosa/dialect/__init__.py index 136f59beb62..897de70279f 100644 --- a/backends/arm/tosa/dialect/__init__.py +++ b/backends/arm/tosa/dialect/__init__.py @@ -4,7 +4,9 @@ # LICENSE file in the root directory of this source tree. from executorch.backends.arm.tosa.dialect.ops import ( # noqa F401 + matmul, rescale, + resize, table, transpose, ) diff --git a/backends/arm/tosa/dialect/ops/matmul.py b/backends/arm/tosa/dialect/ops/matmul.py new file mode 100644 index 00000000000..1ba3821f674 --- /dev/null +++ b/backends/arm/tosa/dialect/ops/matmul.py @@ -0,0 +1,56 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.backends.arm.tosa.dialect.lib import TosaValueError +from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op + +from executorch.backends.arm.tosa.specification import ( + get_context_spec, + TosaSpecification, +) +from executorch.exir.dialects._ops import ops as exir_ops + + +@register_fake_tosa_op( + "MATMUL(Tensor input1, Tensor input2) -> Tensor", # schema + ( + TosaSpecification.create_from_string("TOSA-1.0+INT"), + ), # target TOSA specifications +) +def MATMUL(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + tosa_spec = get_context_spec() + """Performs matrix multiplication on two input tensors. + Additionally validates TOSA constraints of a MATMUL op. 
+ """ + if x1.dtype != x2.dtype: + raise TosaValueError( + f"Input tensors must have the same dtype, got {x1.dtype} and {x2.dtype}", + op="MATMUL", + ) + if x1.dtype in (torch.int8, torch.int16): + if not tosa_spec.support_integer(): + raise TosaValueError( + f"TOSA spec {tosa_spec} doesn't support integers", op="MATMUL" + ) + else: + dtype = torch.int32 + elif x1.dtype in (torch.float16, torch.float32): + if not tosa_spec.support_float(): + raise TosaValueError( + f"TOSA spec {tosa_spec} doesn't support float", op="MATMUL" + ) + else: + # float16 supports float16 accumulation as well + dtype = torch.float32 + else: + raise TosaValueError( + f"Input tensors must be of type int8, float16 or float32, got {x1.dtype}", + op="MATMUL", + ) + + aten_fake_tensor = exir_ops.edge.aten.bmm.default(x1, x2) + + return torch.empty_like(aten_fake_tensor, dtype=dtype) diff --git a/backends/arm/tosa/dialect/ops/resize.py b/backends/arm/tosa/dialect/ops/resize.py new file mode 100644 index 00000000000..1f976d0f5e0 --- /dev/null +++ b/backends/arm/tosa/dialect/ops/resize.py @@ -0,0 +1,60 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Literal, Optional + +import torch +from executorch.backends.arm.tosa.dialect.lib import TosaValueError +from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op + +from executorch.backends.arm.tosa.specification import ( + get_context_spec, + TosaSpecification, +) +from executorch.exir.dialects._ops import ops as exir_ops + + +# Add kwarg instead? +@register_fake_tosa_op( + "RESIZE(Tensor input, SymInt[]? output_size, bool align_corners, float[]? 
scale_factors, *, str resize_mode) -> Tensor", # schema + ( + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), + ), # target TOSA specifications +) +def RESIZE( + x: torch.Tensor, + output_size: list[int] | None = None, + align_corners: Optional[bool] = False, + scale_factors: list[float] | None = None, + *, + resize_mode: Literal["nearest", "bilinear"], +) -> torch.Tensor: + tosa_spec = get_context_spec() + + if resize_mode not in ("nearest", "bilinear"): + raise TosaValueError(f"Unsupported resize mode {resize_mode} for TOSA RESIZE") + if x.dtype == torch.int8: + if not tosa_spec.support_integer(): + raise TosaValueError( + f"TOSA spec {tosa_spec} doesn't support integers", op="RESIZE" + ) + bilinear = resize_mode == "bilinear" + output_dtype = torch.int32 if bilinear else torch.int8 + elif x.dtype in (torch.float16, torch.float32): + if not tosa_spec.support_float(): + raise TosaValueError( + f"TOSA spec {tosa_spec} doesn't support float", op="RESIZE" + ) + output_dtype = x.dtype + else: + raise TosaValueError(f"Unsupported input dtype {x.dtype} for TOSA RESIZE") + + # Does it matter which one to use for fake tracing? + fake_aten_tensor = exir_ops.edge.aten.upsample_nearest2d.vec( + x, output_size, scale_factors + ) + + return fake_aten_tensor.to(output_dtype) diff --git a/backends/arm/tosa/dialect/ops/transpose.py b/backends/arm/tosa/dialect/ops/transpose.py index 9c5aba05394..8d5bf8bac70 100644 --- a/backends/arm/tosa/dialect/ops/transpose.py +++ b/backends/arm/tosa/dialect/ops/transpose.py @@ -26,9 +26,9 @@ def TRANSPOSE(a, perms): # By utilizing an edge IR passthrough operator we can keep the edge program in # channels-first/contiguous and get the desired behavior in the TOSA lowering. 
- if len(perms) not in (4, 5): + if len(perms) not in (4, 5, 6): raise TosaValueError( - f"Only 4D and 5D tensors are supported, got {len(perms)}: {perms}", + f"Only 4D, 5D and 6D tensors are supported, got {len(perms)}: {perms}", op="TRANSPOSE", ) diff --git a/backends/arm/tosa/mapping.py b/backends/arm/tosa/mapping.py index 60ef98a37c0..64e4ae96e08 100644 --- a/backends/arm/tosa/mapping.py +++ b/backends/arm/tosa/mapping.py @@ -4,13 +4,14 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe +"""Provide PyTorch-to-TOSA mapping helpers. -# -# PyTorch to Tosa mapping - simple mapping functions and multi-type extraction -# of key information. These are used by the initial compile stage which captures -# the standardised TOSA representation. -# +Use these utilities to translate PyTorch dtypes and FX node metadata into +the TOSA serializer types and shapes used during initial compilation. + +""" +from enum import Enum from typing import Any, Optional, Sequence import serializer.tosa_serializer as ts # type: ignore @@ -31,7 +32,36 @@ ) +class TosaSpecialDtype(Enum): + """ + Special TOSA data types that are not natively supported in PyTorch, to be + used in specific scenarios as a value in the key from meta_key(). + """ + + INT48 = ts.DType.INT48 + + def get_tosa_dtype(self) -> ts.TosaDType.DType: + return self.value + + @staticmethod + def meta_key() -> str: + return "tosa_special_dtype" + + def map_dtype(data_type: torch.dtype, tosa_spec: TosaSpecification) -> Any: + """Map a ``torch.dtype`` to a ``ts.DType``. + + Args: + data_type (torch.dtype): PyTorch dtype to convert. + tosa_spec (TosaSpecification): Active spec (reserved for future checks). + + Returns: + Any: Matching ``ts.DType`` enum value. + + Raises: + ValueError: If the dtype is unsupported or unknown. 
+ + """ if data_type in UNSUPPORTED_DTYPES: raise ValueError(f"Unsupported type: {data_type}") @@ -57,7 +87,22 @@ def map_dtype(data_type: torch.dtype, tosa_spec: TosaSpecification) -> Any: # TODO: other types, can be # SymInt, FakeTensor, a List[Union[FakeTensor, SymInt]], or None def extract_tensor_meta(meta, tosa_spec: TosaSpecification): - assert meta.get("val") is not None + """Extract dtype, shape, and dimension order from FX metadata. + + Args: + meta (dict): FX node ``meta`` containing a ``val`` FakeTensor (or tuple). + tosa_spec (TosaSpecification): Active TOSA spec for dtype mapping. + + Returns: + tuple: ``(dtype, shape, dim_order)`` where ``dtype`` is ``ts.DType``, + ``shape`` is ``Tuple[int, ...]``, and ``dim_order`` is ``Tuple[int, ...]``. + + Raises: + ValueError: If ``meta['val']`` is not a ``FakeTensor``. + + """ + if meta.get("val") is None: + raise ValueError("Expected node.meta['val'] to be set to a FakeTensor") val = meta["val"] if type(val) is tuple: # TODO: should use first concrete representation @@ -77,23 +122,72 @@ def extract_tensor_meta(meta, tosa_spec: TosaSpecification): return (dtype, shape, dim_order) -# Class to capture arguments and turn into tensor references for TOSA OPs class TosaArg: + """Capture and normalize TOSA operator arguments. + + Use this to convert FX nodes, sequences, and numeric literals into a + consistent structure suitable for TOSA serialization. + + Attributes: + name (str): Node name when argument is a ``torch.fx.Node``; empty otherwise. + dtype (ts.DType | None): Inferred dtype when available. + shape (tuple[int, ...] | None): Inferred shape when available. + dim_order (tuple[int, ...] | None): Dimension order, defaulting to ``range(len(shape))``. + special (list | None): Captured list when the argument is a sequence. + number (float | int | None): Captured numeric value when given. + tosa_spec (TosaSpecification): Active specification used for mapping. 
+ + """ + def __process_node(self, argument: torch.fx.Node): + """Parse a ``torch.fx.Node`` and populate tensor attributes. + + Args: + argument (torch.fx.Node): FX node to inspect. + + """ self.name: str = argument.name - self.dtype, self.shape, self.dim_order = extract_tensor_meta( + output_dtype, self.shape, self.dim_order = extract_tensor_meta( argument.meta, self.tosa_spec ) + # Handle special case of types not representable in torch (i.e. i48_t) + if special_type := argument.meta.get(TosaSpecialDtype.meta_key(), None): + output_dtype = special_type.get_tosa_dtype() + + self.dtype = output_dtype + def __process_list(self, argument): + """Capture a sequence argument as ``special``. + + Args: + argument (Sequence): Sequence to store. + + """ self.special: list = list(argument) def __process_number(self, argument: float | int): + """Capture a numeric argument as ``number``. + + Args: + argument (float | int): Numeric value. + + """ self.number: float | int = argument def __init__( self, argument: Any, tosa_spec: Optional[TosaSpecification] = None ) -> None: + """Initialize the argument wrapper and populate fields. + + Args: + argument (Any): One of ``torch.fx.Node``, ``Sequence``, ``int``, ``float``, ``torch.dtype``, or ``None``. + tosa_spec (Optional[TosaSpecification]): Active specification; required. + + Raises: + RuntimeError: If ``argument`` is of an unsupported type. + + """ if tosa_spec is None: raise ValueError("tosa_spec is None") elif not isinstance(tosa_spec, TosaSpecification): @@ -127,6 +221,12 @@ def __init__( ) def __repr__(self): + """Return a compact representation of populated attributes. + + Returns: + str: Readable list of set attributes. 
+ + """ attrs = [] if hasattr(self, "name"): if self.name is not None: diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py index 3e512847109..6eb1dcbef72 100644 --- a/backends/arm/tosa/partitioner.py +++ b/backends/arm/tosa/partitioner.py @@ -4,6 +4,15 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe +"""Provide a partitioner for delegating subgraphs to the TOSA backend. + +Implement logic to identify and tag regions of an ``ExportedProgram`` that can +be delegated to the TOSA backend. Use this module to: + +- Partition graphs based on operator support and additional checks. +- Prune trivial no-op partitions that would lower to empty TOSA graphs. +- Tag constant data and report reasons for rejected nodes. +""" import logging from typing import Callable, List, Optional, Sequence, Tuple @@ -34,14 +43,46 @@ def is_noop_clone(node: torch.fx.node.Node) -> bool: + """Return True if the node is a no-op ``dim_order_ops._clone_dim_order``. + + Args: + node (torch.fx.Node): FX node to inspect. + + Returns: + bool: True if the node targets ``dim_order_ops._clone_dim_order.default`` + in the Edge dialect; otherwise, False. + + """ return node.target == exir_ops.edge.dim_order_ops._clone_dim_order.default def is_noop_alias_copy(node: torch.fx.Node) -> bool: + """Return True if the node is a no-op ``aten.alias_copy``. + + Args: + node (torch.fx.Node): FX node to inspect. + + Returns: + bool: True if the node targets ``aten.alias_copy.default``; otherwise, + False. + + """ return node.target == exir_ops.edge.aten.alias_copy.default def is_noop_to_dim_order_copy(node: torch.fx.node.Node) -> bool: + """Return True if node is a no-op ``dim_order_ops._to_dim_order_copy``. + + Consider the op a no-op when the output dtype equals the input's dtype. + + Args: + node (torch.fx.Node): FX node to inspect. + + Returns: + bool: True if it targets ``_to_dim_order_copy.default`` and preserves + dtype; otherwise, False. 
+ + """ if node.target != exir_ops.edge.dim_order_ops._to_dim_order_copy.default: return False else: @@ -49,6 +90,19 @@ def is_noop_to_dim_order_copy(node: torch.fx.node.Node) -> bool: def is_noop_expand(node: torch.fx.node.Node) -> bool: + """Return True if the node is an ``expand_copy`` with all-ones multiples. + + This corresponds to a semantic no-op, since expanding by 1 along every + dimension leaves the tensor unchanged. + + Args: + node (torch.fx.Node): FX node to inspect. + + Returns: + bool: True if the node targets ``aten.expand_copy.default`` and all + computed multiples are 1; otherwise, False. + + """ if node.target != exir_ops.edge.aten.expand_copy.default: return False else: @@ -57,11 +111,30 @@ def is_noop_expand(node: torch.fx.node.Node) -> bool: class TOSAPartitioner(Partitioner): + """Partition an exported program into TOSA-delegable subgraphs. + + Construct this partitioner for compile specs targeting TOSA. The partition + algorithm uses capability checks and optional additional operator-support + rules to tag nodes with a delegation tag per subgraph. + """ + def __init__( self, compile_spec: TosaCompileSpec, additional_checks: Optional[Sequence[OperatorSupportBase]] = None, ) -> None: + """Initialize the TOSAPartitioner. + + Args: + compile_spec (TosaCompileSpec): Parsed compile specifications for + TOSA containing the TOSA spec and original list. + additional_checks (Optional[Sequence[OperatorSupportBase]]): Extra + operator-support checks to apply when partitioning. + + Raises: + RuntimeError: If the provided compile spec does not target TOSA. 
+ + """ self.delegation_spec = DelegationSpec( TOSABackend.__name__, compile_spec.to_list() ) @@ -70,9 +143,22 @@ def __init__( self.tosa_spec = compile_spec.tosa_spec def partition(self, exported_program: ExportedProgram) -> PartitionResult: # noqa - # Run the CapabilityBasedPartitioner to return the largest possible - # subgraphs containing the nodes with the tags + """Partition the program and tag TOSA-compatible subgraphs. + + Run the FX capability-based partitioner to propose subgraphs, then + refine tags by removing boundary-only quantize/dequantize nodes and by + rejecting partitions that would lower to no-ops. Emit a detailed report + of rejected nodes and their reasons. + + Args: + exported_program (ExportedProgram): Program to analyze and + partition. + + Returns: + PartitionResult: The input program with nodes tagged for delegation + and a mapping of partition tags to delegation specs. + """ logger.info("TOSAPartitioner::partition") partition_tags: dict[str, DelegationSpec] = {} @@ -92,6 +178,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: # no partition_list = capability_partitioner.propose_partitions() def reject_partition(reason: str, partition, tag) -> None: + """Remove a proposed partition and record the rejection reason. + + Args: + reason (str): Human-readable explanation for rejection. + partition (object): Proposed partition object from the + capability partitioner. + tag (str): Delegation tag associated with the partition. + + """ for node in partition.nodes: if "delegation_tag" in node.meta: del node.meta["delegation_tag"] @@ -105,6 +200,16 @@ def reject_partition(reason: str, partition, tag) -> None: tag = f"tag{partition.id}" def is_partitioned(node: torch.fx.Node, tag=tag) -> bool: + """Return True if the node currently belongs to the partition ``tag``. + + Args: + node (torch.fx.Node): FX node to check. + tag (str): Delegation tag identifying the partition. 
+ + Returns: + bool: True if the node carries the matching delegation tag. + + """ return ( "delegation_tag" in node.meta and node.meta["delegation_tag"] == tag ) @@ -113,8 +218,8 @@ def is_partitioned(node: torch.fx.Node, tag=tag) -> bool: node.meta["delegation_tag"] = tag partition_tags[tag] = self.delegation_spec - # De-tag outmost q-nodes upwards and dq-nodes downwards. - # De-tag if at least one input/ output is not part of partition. + # De-tag outermost q-nodes upwards and dq-nodes downwards. + # De-tag if at least one input/output is not part of the partition. for node in exported_program.graph_module.graph.nodes: if not is_partitioned(node): continue @@ -175,15 +280,41 @@ def ops_to_not_decompose( self, ep: ExportedProgram, ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]: + """Return operators and a filter that should not be decomposed. + + Provide a base set of ops to preserve as-is and a predicate that keeps + certain activations whole when surrounded by quantize/dequantize ops in + a quantized graph. This helps downstream TOSA lowering and delegation. + + Args: + ep (ExportedProgram): Program used to infer target-specific policy. + + Returns: + Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]: + A list of op overloads to keep intact, and an optional filter + function that returns True when an op should not be decomposed. + + """ ops_to_not_decompose_if_quant_op = [ torch.ops.aten.hardsigmoid.default, torch.ops.aten.hardswish.default, ] def filter_fn(node: torch.fx.Node) -> bool: - # This function filters for operators to not decompose where: - # - It's target is in ops_to_not_decompose_if_quant_op list. - # - All it's inputs/outputs are quantize operators. + """Return True to keep selected ops intact inside quantized regions. 
+ + The predicate holds when the target is in + ``ops_to_not_decompose_if_quant_op`` and all inputs/outputs are + quantize/dequantize ops, indicating a quantized activation that + should not be decomposed. + + Args: + node (torch.fx.Node): FX node to evaluate. + + Returns: + bool: True to keep the op intact; otherwise, False. + + """ dq = torch.ops.quantized_decomposed.dequantize_per_tensor.default q = torch.ops.quantized_decomposed.quantize_per_tensor.default @@ -204,7 +335,7 @@ def filter_fn(node: torch.fx.Node) -> bool: return should_not_decompose - # Be default, do not decompose the operator + # By default, do not decompose the operator return True ops_to_not_decompose = [ diff --git a/backends/arm/tosa/quant_utils.py b/backends/arm/tosa/quant_utils.py index 86e8e5bad8b..562c77e30da 100644 --- a/backends/arm/tosa/quant_utils.py +++ b/backends/arm/tosa/quant_utils.py @@ -20,6 +20,7 @@ from executorch.backends.arm.tosa.mapping import TosaArg from torch.fx import Node + from tosa.RoundingMode import RoundingMode # type: ignore @@ -76,6 +77,59 @@ def insert_rescale_ops_to_int32_maxscale( return [rescaled_lhs, rescaled_rhs], back_scale +def insert_rescale_ops_int16_to_int32_maxscale( + tosa_graph: Any, inputs: list[TosaArg], node: Node, tosa_spec=None +) -> tuple[list[Any], float]: + """For ADD and SUB with int16 inputs, we rescale to int32 using a different common scale(2*max(left scale,right scale)) + compared to all the other cases. We multiply the left and right scales by 1<<12 giving us extra precision + for the computation without overflowing. + + Returns a list of the rescaled nodes and the scale factor used, + needed by insert_rescale_op_to_int16. 
+ """ + + if len(inputs) > 2: + raise ValueError("More than two inputs not supported") + + tensors = inputs.copy() + # Reshape tensor according to TOSA dim order + for tensor in tensors: + dim_order = tensor.dim_order + tensor.shape = [tensor.shape[i] for i in dim_order] + + input_qparams = get_input_qparams(node) + lhs_qparams, rhs_qparams = input_qparams.values() + lhs_scale = lhs_qparams.get_scale_per_tensor() + rhs_scale = rhs_qparams.get_scale_per_tensor() + # Common scale for the two numbers + max_scale_2x = 2 * max(lhs_scale, rhs_scale) + SHIFT_INT16 = 12 + # We are adding two int16 numbers. If the zero point is non-null, the result will be in the range [-131070;131070], therefore we need 18 bits for the result. + # We have a 32-bit accumulator, so we can shift to the left by 12 bits and not overflow. In reality, because we divide by the 2*max(lhs_scale,rhs_scale) + # we are shifting to the left by 11. + lhs_factor = (1 << SHIFT_INT16) * lhs_scale / max_scale_2x + rhs_factor = (1 << SHIFT_INT16) * rhs_scale / max_scale_2x + rescaled_lhs = build_rescale_to_int32( + tosa_graph, + tensors[0], + lhs_qparams.get_zp_per_tensor(), + lhs_factor, + tosa_spec=tosa_spec, + ) + rescaled_rhs = build_rescale_to_int32( + tosa_graph, + tensors[1], + rhs_qparams.get_zp_per_tensor(), + rhs_factor, + tosa_spec=tosa_spec, + ) + out_qparam = get_output_qparams(node)[0] + out_scale = out_qparam.get_scale_per_tensor() + back_scale = max_scale_2x / (out_scale * (1 << SHIFT_INT16)) + + return [rescaled_lhs, rescaled_rhs], back_scale + + def insert_rescale_ops_to_int32( tosa_graph: Any, inputs: list[TosaArg], @@ -245,7 +299,9 @@ def compute_multiplier_and_shift( const_2_power_15_or_31 = 1 << offset shifted_mantissa = round(mantissa * const_2_power_15_or_31) - assert shifted_mantissa <= const_2_power_15_or_31 + assert ( + shifted_mantissa <= const_2_power_15_or_31 + ), f"Mantissa {shifted_mantissa} exceeds limit {const_2_power_15_or_31}" if shifted_mantissa == const_2_power_15_or_31: 
shifted_mantissa = shifted_mantissa // 2 @@ -255,13 +311,19 @@ def compute_multiplier_and_shift( shift = offset - shift # INT32_MAX, 2^31 - 1 - assert shifted_mantissa <= (const_2_power_15_or_31 - 1) + assert shifted_mantissa <= (const_2_power_15_or_31 - 1), ( + f"Mantissa {shifted_mantissa} exceeds signed max " + f"{const_2_power_15_or_31 - 1}" + ) multiplier = shifted_mantissa if shift > 62: multiplier = multiplier >> min(31, shift - 62) shift = 62 + + assert multiplier >= 0, "Multiplier should be non-negative" + assert shift >= 2 and shift <= 62, "Shift should be in range [2, 62]" multipliers.append(multiplier) shifts.append(shift) return multipliers, shifts @@ -313,10 +375,11 @@ def build_rescale( per_channel=False, ): import serializer.tosa_serializer as ts # type: ignore + import tosa.Op as TosaOp # type: ignore - scaleWidth = 32 - is_scale32 = True + scaleWidth = 16 if input_node.dtype == ts.DType.INT48 else 32 + is_scale32 = False if input_node.dtype == ts.DType.INT48 else True multipliers, shifts = compute_multiplier_and_shift(scale, scaleWidth) rescale_inputs = create_const_ops_for_rescale( tosa_fb, diff --git a/backends/arm/tosa/specification.py b/backends/arm/tosa/specification.py index b372cd5a636..3edf27760b5 100644 --- a/backends/arm/tosa/specification.py +++ b/backends/arm/tosa/specification.py @@ -4,12 +4,12 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe +"""Provide TOSA specification parsing and context utilities. -# -# Main implementation of AoT flow to partition and preprocess for Arm target -# backends. Converts via TOSA as an intermediate form supported by AoT and -# JIT compiler flows. -# +Use these helpers to parse and validate TOSA profile/extension strings and to +manage a lowering-time context for the active specification. 
+ +""" import contextvars import re @@ -19,36 +19,39 @@ class TosaSpecification: - """ - This class implements a representation of TOSA specification - (https://www.mlplatform.org/tosa/tosa_spec.html) with a version, a profile - (with extension) and a level (8k). - For 1.00 releases the profile is INT or FP, and the extensions are for - INT: int16, int4, var, cf - FP: bf16, fp8e4m3, fp8e5m2, fft, var, cf + """Represent a TOSA specification. - The TOSA specification is encoded in the string represenatation - TOSA-major.minor.patch+profile[+level][+extensions] + A specification includes a semantic version, one or more profiles, and + optional extensions and levels (for example ``8k``). + The encoded form follows ``TOSA-..+[+][+...]``. + Profiles use uppercase (for example ``INT``, ``FP``); levels and extensions + use lowercase. + + Attributes: + version (Version): Parsed TOSA semantic version. + is_U55_subset (bool): True if the ``u55`` subset is requested. - Profiles are uppercase letters and extensions and level is lowercase. """ version: Version is_U55_subset: bool def support_integer(self) -> bool: - """ - Returns true if any integer operations are supported for the specification. - """ + """Return True if integer operations are supported.""" raise NotImplementedError def support_float(self) -> bool: - """ - Returns true if any float operations are supported for the specification. - """ + """Return True if floating-point operations are supported.""" raise NotImplementedError def __init__(self, version: Version, extras: List[str]): + """Initialize the base specification. + + Args: + version (Version): Parsed TOSA semantic version. + extras (List[str]): Remaining tokens such as profiles, levels, and extensions. 
+ + """ self.version = version self.is_U55_subset = "u55" in extras @@ -57,11 +60,20 @@ def __init__(self, version: Version, extras: List[str]): @staticmethod def create_from_string(repr: str) -> "TosaSpecification": - """ - Creates a TOSA specification class from a string representation: - TOSA-1.00.0+INT+FP+int4+cf - """ + """Create a specification from a standard string format. + + Example: ``TOSA-1.00.0+INT+FP+int4+cf``. + Args: + repr (str): Standard representation string. + + Returns: + TosaSpecification: Parsed specification instance. + + Raises: + ValueError: If the representation is malformed or version is unsupported. + + """ pattern = r"^(TOSA)-([\d.]+)\+(.+)$" match = re.match(pattern, repr) if match: @@ -80,6 +92,18 @@ def create_from_string(repr: str) -> "TosaSpecification": class Tosa_1_00(TosaSpecification): + """Provide TOSA 1.00 profile and extension semantics. + + This variant validates profiles (``INT``, ``FP``), the optional ``8k`` level, + and allowed extensions based on the selected profiles. + + Attributes: + profiles (List[str]): Selected profiles, e.g., ``["INT"]`` or ``["INT", "FP"]``. + level_8k (bool): True if the ``8k`` level is enabled. + extensions (List[str]): Enabled extensions valid for the chosen profiles. + + """ + profiles: List[str] level_8k: bool extensions: List[str] @@ -91,6 +115,16 @@ class Tosa_1_00(TosaSpecification): } def __init__(self, version: Version, extras: List[str]): + """Initialize the 1.00 specification and validate extras. + + Args: + version (Version): Semantic version (major=1, minor=0). + extras (List[str]): Tokens including profiles, level, and extensions. + + Raises: + ValueError: If no/too many profiles are provided or extensions are invalid. 
+ + """ super().__init__(version, extras) # Check that we have at least one profile in the extensions list @@ -129,12 +163,20 @@ def __init__(self, version: Version, extras: List[str]): self.extensions = extras def _get_profiles_string(self) -> str: + """Return the ``+``-joined profile segment (e.g., ``+INT+FP``).""" return "".join(["+" + p for p in self.profiles]) def _get_extensions_string(self) -> str: + """Return the ``+``-joined extensions segment (e.g., ``+int4+cf``).""" return "".join(["+" + e for e in self.extensions]) def __repr__(self): + """Return the standard specification string format. + + Returns: + str: Standard form like ``TOSA-1.00.0+INT+8k+int4``. + + """ extensions = self._get_extensions_string() if self.level_8k: extensions += "+8k" @@ -143,9 +185,24 @@ def __repr__(self): return f"TOSA-{self.version}{self._get_profiles_string()}{extensions}" def __hash__(self) -> int: + """Return a stable hash for use in sets and dict keys. + + Returns: + int: Hash value derived from version and profiles. + + """ return hash(str(self.version) + self._get_profiles_string()) def __eq__(self, other: object) -> bool: + """Return True if another instance represents the same spec. + + Args: + other (object): Object to compare. + + Returns: + bool: True if versions and profiles match. + + """ if isinstance(other, Tosa_1_00): return (self.version == other.version) and ( self._get_profiles_string() == other._get_profiles_string() @@ -153,12 +210,23 @@ def __eq__(self, other: object) -> bool: return False def support_integer(self): + """Return True if the ``INT`` profile is present.""" return "INT" in self.profiles def support_float(self): + """Return True if the ``FP`` profile is present.""" return "FP" in self.profiles def support_extension(self, extension: str) -> bool: + """Return True if an extension is supported and enabled. + + Args: + extension (str): Extension name (for example ``int4``, ``bf16``). 
+ + Returns: + bool: True if the extension is valid for the active profiles and selected. + + """ for p in self.profiles: if extension in self.valid_extensions[p] and extension in self.extensions: return True @@ -167,30 +235,63 @@ def support_extension(self, extension: str) -> bool: class TosaLoweringContext: - """ - A context manager to handle the TOSA specific aspects of the lowering process. - For now it only handles the TOSA specification context, but it can be extended - to include other policies or configurations. + """Manage the TOSA specification context for lowering. + + For now, only the active ``TosaSpecification`` is tracked, but this can be + extended to carry additional lowering policies or configuration. + + Attributes: + tosa_spec_var (contextvars.ContextVar): Context variable storing the active spec. + spec (TosaSpecification): Specification passed to the context manager. + """ # Define a context variable for the spec tosa_spec_var: contextvars.ContextVar = contextvars.ContextVar("tosa_spec") def __init__(self, spec: TosaSpecification): + """Initialize the lowering context with a specification. + + Args: + spec (TosaSpecification): Active specification to put into context. + + """ self.spec = spec def __enter__(self): + """Set the context variable and return self. + + Returns: + TosaLoweringContext: This context manager instance. + + """ # Set the spec in the context variable and store the token for later reset self.token = TosaLoweringContext.tosa_spec_var.set(self.spec) return self def __exit__(self, exc_type, exc_value, traceback): + """Reset the context variable to its previous state. + + Args: + exc_type (type | None): Exception type, if any. + exc_value (BaseException | None): Exception instance, if any. + traceback (TracebackType | None): Traceback, if any. 
+ + """ # Reset the context variable to its previous state TosaLoweringContext.tosa_spec_var.reset(self.token) -# A helper function to retrieve the current spec anywhere in your code def get_context_spec() -> TosaSpecification: + """Get the current ``TosaSpecification`` from the lowering context. + + Returns: + TosaSpecification: Active specification retrieved from the context var. + + Raises: + RuntimeError: If called outside a ``TosaLoweringContext``. + + """ try: return TosaLoweringContext.tosa_spec_var.get() except LookupError: diff --git a/backends/arm/util/_factory.py b/backends/arm/util/_factory.py new file mode 100644 index 00000000000..23d8215fc9b --- /dev/null +++ b/backends/arm/util/_factory.py @@ -0,0 +1,59 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec +from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner +from executorch.backends.arm.quantizer import ( + EthosUQuantizer, + TOSAQuantizer, + VgfQuantizer, +) +from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec +from executorch.backends.arm.tosa.partitioner import TOSAPartitioner +from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner +from executorch.exir.backend.compile_spec_schema import CompileSpec +from torch.fx.passes.operator_support import OperatorSupportBase + + +def parse_compile_spec(compile_specs: list[CompileSpec]) -> ArmCompileSpec: + output_format = None + for spec in compile_specs: + if spec.key == "output_format": + output_format = spec.value.decode() + break + else: + raise ValueError("Compile spec without output format.") + if output_format == TosaCompileSpec.get_output_format(): + return TosaCompileSpec.from_list(compile_specs) + if output_format == EthosUCompileSpec.get_output_format(): + return 
EthosUCompileSpec.from_list(compile_specs) + if output_format == VgfCompileSpec.get_output_format(): + return VgfCompileSpec.from_list(compile_specs) + raise ValueError(f"Unknown output format {output_format}") + + +def create_partitioner( + compile_spec: ArmCompileSpec, + additional_checks: list[OperatorSupportBase] | None = None, +): + if isinstance(compile_spec, TosaCompileSpec): + return TOSAPartitioner(compile_spec, additional_checks) + elif isinstance(compile_spec, EthosUCompileSpec): + return EthosUPartitioner(compile_spec, additional_checks) + elif isinstance(compile_spec, VgfCompileSpec): + return VgfPartitioner(compile_spec, additional_checks) + else: + raise ValueError("compile spec doesn't target any Arm Partitioner") + + +def create_quantizer(compile_spec: ArmCompileSpec): + if isinstance(compile_spec, TosaCompileSpec): + return TOSAQuantizer(compile_spec) + elif isinstance(compile_spec, EthosUCompileSpec): + return EthosUQuantizer(compile_spec) + elif isinstance(compile_spec, VgfCompileSpec): + return VgfQuantizer(compile_spec) + else: + raise ValueError("compile spec doesn't target any Arm Quantizer") diff --git a/backends/arm/util/arm_model_evaluator.py b/backends/arm/util/arm_model_evaluator.py index a3dcbdc5c6f..8c36128cea8 100644 --- a/backends/arm/util/arm_model_evaluator.py +++ b/backends/arm/util/arm_model_evaluator.py @@ -1,11 +1,11 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
# pyre-unsafe +import json import logging import os import random @@ -14,7 +14,7 @@ from collections import defaultdict from pathlib import Path -from typing import Any, Optional, Tuple +from typing import Any, cast, Optional, Tuple import torch from torch.nn.modules import Module @@ -29,7 +29,139 @@ logger.setLevel(logging.INFO) +# ImageNet 224x224 transforms (Resize->CenterCrop->ToTensor->Normalize) +# If future models require different preprocessing, extend this helper accordingly. +def _get_imagenet_224_transforms(): + """Return standard ImageNet 224x224 preprocessing transforms.""" + return transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.484, 0.454, 0.403], std=[0.225, 0.220, 0.220]), + ] + ) + + +def _build_calibration_loader( + dataset: datasets.ImageFolder, max_items: int +) -> DataLoader: + """Return a DataLoader over a deterministic, shuffled subset of size <= max_items. + + Shuffles with seed: ARM_EVAL_CALIB_SEED (int) or default 1337; then selects first k and + sorts indices to keep enumeration order stable while content depends on seed. + """ + k = min(max_items, len(dataset)) + seed_env = os.getenv("ARM_EVAL_CALIB_SEED") + default_seed = 1337 + if seed_env is not None: + try: + seed = int(seed_env) + except ValueError: + logger.warning( + "ARM_EVAL_CALIB_SEED is not an int (%s); using default seed %d", + seed_env, + default_seed, + ) + seed = default_seed + else: + seed = default_seed + rng = random.Random(seed) + indices = list(range(len(dataset))) + rng.shuffle(indices) + selected = sorted(indices[:k]) + return torch.utils.data.DataLoader( + torch.utils.data.Subset(dataset, selected), batch_size=1, shuffle=False + ) + + +def _load_imagenet_folder(directory: str) -> datasets.ImageFolder: + """Shared helper to load an ImageNet-layout folder. + + Raises FileNotFoundError for a missing directory early to aid debugging. 
+ """ + directory_path = Path(directory) + if not directory_path.exists(): + raise FileNotFoundError(f"Directory: {directory} does not exist.") + transform = _get_imagenet_224_transforms() + return datasets.ImageFolder(directory_path, transform=transform) + + class GenericModelEvaluator: + """Base evaluator computing quantization error metrics and optional compression ratio. + + Subclasses can extend: provide calibration (get_calibrator) and override evaluate() + to add domain specific metrics (e.g. top-1 / top-5 accuracy). + """ + + @staticmethod + def evaluate_topk( + model: Module, + dataset: datasets.ImageFolder, + batch_size: int, + topk: int = 5, + log_every: int = 50, + ) -> Tuple[float, float]: + """Evaluate model top-1 / top-k accuracy. + + Args: + model: Torch module (should be in eval() mode prior to call). + dataset: ImageFolder style dataset. + batch_size: Batch size for evaluation. + topk: Maximum k for accuracy (default 5). + log_every: Log running accuracy every N batches. + Returns: + (top1_accuracy, topk_accuracy) + """ + # Some exported / quantized models (torchao PT2E) disallow direct eval()/train(). + # Try to switch to eval mode, but degrade gracefully if unsupported. + try: + model.eval() + except NotImplementedError: + # Attempt to enable train/eval overrides if torchao helper is present. + try: + from torchao.quantization.pt2e.utils import ( # type: ignore + allow_exported_model_train_eval, + ) + + allow_exported_model_train_eval(model) + try: + model.eval() + except Exception: + logger.debug( + "Model eval still not supported after allow_exported_model_train_eval; proceeding without explicit eval()." + ) + except Exception: + logger.debug( + "Model eval() unsupported and torchao allow_exported_model_train_eval not available; proceeding." 
+ ) + loaded_dataset = DataLoader(dataset, batch_size=batch_size, shuffle=False) + top1_correct = 0 + topk_correct = 0 + total = 0 + with torch.inference_mode(): # disable autograd + some backend optimizations + for i, (image, target) in enumerate(loaded_dataset): + prediction = model(image) + topk_indices = torch.topk(prediction, k=topk, dim=1).indices + # target reshaped for broadcasting + target_view = target.view(-1, 1) + top1_correct += (topk_indices[:, :1] == target_view).sum().item() + topk_correct += (topk_indices == target_view).sum().item() + batch_sz = image.size(0) + total += batch_sz + if (i + 1) % log_every == 0 or total == len(dataset): + logger.info( + "Eval progress: %d / %d top1=%.4f top%d=%.4f", + total, + len(dataset), + top1_correct / total, + topk, + topk_correct / total, + ) + top1_accuracy = top1_correct / len(dataset) + topk_accuracy = topk_correct / len(dataset) + return top1_accuracy, topk_accuracy + REQUIRES_CONFIG = False def __init__( @@ -52,12 +184,13 @@ def __init__( self.tosa_output_path = "" def get_model_error(self) -> defaultdict: - """ - Returns a dict containing the following metrics between the outputs of the FP32 and INT8 model: - - Maximum error - - Maximum absolute error - - Maximum percentage error - - Mean absolute error + """Return per-output quantization error statistics. 
+ + Metrics (lists per output tensor): + max_error + max_absolute_error + max_percentage_error (safe-divided; zero fp32 elements -> 0%) + mean_absolute_error """ fp32_outputs, _ = tree_flatten(self.fp32_model(*self.example_input)) int8_outputs, _ = tree_flatten(self.int8_model(*self.example_input)) @@ -66,7 +199,12 @@ def get_model_error(self) -> defaultdict: for fp32_output, int8_output in zip(fp32_outputs, int8_outputs): difference = fp32_output - int8_output - percentage_error = torch.div(difference, fp32_output) * 100 + # Avoid divide by zero: elements where fp32 == 0 produce 0% contribution + percentage_error = torch.where( + fp32_output != 0, + difference / fp32_output * 100, + torch.zeros_like(difference), + ) model_error_dict["max_error"].append(torch.max(difference).item()) model_error_dict["max_absolute_error"].append( torch.max(torch.abs(difference)).item() @@ -131,69 +269,186 @@ def __init__( @staticmethod def __load_dataset(directory: str) -> datasets.ImageFolder: - directory_path = Path(directory) - if not directory_path.exists(): - raise FileNotFoundError(f"Directory: {directory} does not exist.") - - transform = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.484, 0.454, 0.403], std=[0.225, 0.220, 0.220] - ), - ] - ) - return datasets.ImageFolder(directory_path, transform=transform) + return _load_imagenet_folder(directory) @staticmethod def get_calibrator(training_dataset_path: str) -> DataLoader: dataset = MobileNetV2Evaluator.__load_dataset(training_dataset_path) - rand_indices = random.sample(range(len(dataset)), k=1000) + return _build_calibration_loader(dataset, 1000) - # Return a subset of the dataset to be used for calibration - return torch.utils.data.DataLoader( - torch.utils.data.Subset(dataset, rand_indices), - batch_size=1, - shuffle=False, + @classmethod + def from_config( + cls, + model_name: str, + fp32_model: Module, + int8_model: Module, + 
example_input: Tuple[torch.Tensor], + tosa_output_path: str | None, + config: dict[str, Any], + ) -> "MobileNetV2Evaluator": + """Factory constructing evaluator from a config dict. + + Expected keys: batch_size, validation_dataset_path + """ + return cls( + model_name, + fp32_model, + int8_model, + example_input, + tosa_output_path, + batch_size=config["batch_size"], + validation_dataset_path=config["validation_dataset_path"], ) - def __evaluate_mobilenet(self) -> Tuple[float, float]: + def evaluate(self) -> dict[str, Any]: + # Load dataset and compute top-1 / top-5 dataset = MobileNetV2Evaluator.__load_dataset(self.__validation_set_path) - loaded_dataset = DataLoader( - dataset, - batch_size=self.__batch_size, - shuffle=False, + top1_correct, top5_correct = GenericModelEvaluator.evaluate_topk( + self.int8_model, dataset, self.__batch_size, topk=5 ) + output = super().evaluate() - top1_correct = 0 - top5_correct = 0 + output["metrics"]["accuracy"] = {"top-1": top1_correct, "top-5": top5_correct} + return output - for i, (image, target) in enumerate(loaded_dataset): - prediction = self.int8_model(image) - top1_prediction = torch.topk(prediction, k=1, dim=1).indices - top5_prediction = torch.topk(prediction, k=5, dim=1).indices - top1_correct += (top1_prediction == target.view(-1, 1)).sum().item() - top5_correct += (top5_prediction == target.view(-1, 1)).sum().item() +class DeiTTinyEvaluator(GenericModelEvaluator): + REQUIRES_CONFIG = True - logger.info("Iteration: {}".format((i + 1) * self.__batch_size)) - logger.info( - "Top 1: {}".format(top1_correct / ((i + 1) * self.__batch_size)) - ) - logger.info( - "Top 5: {}".format(top5_correct / ((i + 1) * self.__batch_size)) - ) + def __init__( + self, + model_name: str, + fp32_model: Module, + int8_model: Module, + example_input: Tuple[torch.Tensor], + tosa_output_path: str | None, + batch_size: int, + validation_dataset_path: str, + ) -> None: + super().__init__( + model_name, fp32_model, int8_model, example_input, 
tosa_output_path + ) + self.__batch_size = batch_size + self.__validation_set_path = validation_dataset_path - top1_accuracy = top1_correct / len(dataset) - top5_accuracy = top5_correct / len(dataset) + @staticmethod + def __load_dataset(directory: str) -> datasets.ImageFolder: + return _load_imagenet_folder(directory) + + @staticmethod + def get_calibrator(training_dataset_path: str) -> DataLoader: + dataset = DeiTTinyEvaluator.__load_dataset(training_dataset_path) + return _build_calibration_loader(dataset, 1000) + + @classmethod + def from_config( + cls, + model_name: str, + fp32_model: Module, + int8_model: Module, + example_input: Tuple[torch.Tensor], + tosa_output_path: str | None, + config: dict[str, Any], + ) -> "DeiTTinyEvaluator": + """Factory constructing evaluator from a config dict. - return top1_accuracy, top5_accuracy + Expected keys: batch_size, validation_dataset_path + """ + return cls( + model_name, + fp32_model, + int8_model, + example_input, + tosa_output_path, + batch_size=config["batch_size"], + validation_dataset_path=config["validation_dataset_path"], + ) def evaluate(self) -> dict[str, Any]: - top1_correct, top5_correct = self.__evaluate_mobilenet() + # Load dataset and compute top-1 / top-5 + dataset = DeiTTinyEvaluator.__load_dataset(self.__validation_set_path) + top1, top5 = GenericModelEvaluator.evaluate_topk( + self.int8_model, dataset, self.__batch_size, topk=5 + ) output = super().evaluate() - - output["metrics"]["accuracy"] = {"top-1": top1_correct, "top-5": top5_correct} + output["metrics"]["accuracy"] = {"top-1": top1, "top-5": top5} return output + + +evaluators: dict[str, type[GenericModelEvaluator]] = { + "generic": GenericModelEvaluator, + "mv2": MobileNetV2Evaluator, + "deit_tiny": DeiTTinyEvaluator, +} + + +def evaluator_calibration_data( + evaluator_name: str, + evaluator_config: str | None, +): + evaluator = evaluators[evaluator_name] + + if hasattr(evaluator, "get_calibrator"): + assert evaluator_config is not None + + 
config_path = Path(evaluator_config) + with config_path.open() as f: + config = json.load(f) + + if evaluator is MobileNetV2Evaluator: + return evaluator.get_calibrator( + training_dataset_path=config["training_dataset_path"] + ) + if evaluator is DeiTTinyEvaluator: + return evaluator.get_calibrator( + training_dataset_path=config["training_dataset_path"] + ) + else: + raise RuntimeError(f"Unknown evaluator: {evaluator_name}") + + +def evaluate_model( + model_name: str, + intermediates: str, + model_fp32: torch.nn.Module, + model_int8: torch.nn.Module, + example_inputs: Tuple[torch.Tensor], + evaluator_name: str, + evaluator_config: str | None, +) -> None: + evaluator = evaluators[evaluator_name] + + intermediates_path = Path(intermediates) + tosa_paths = list(intermediates_path.glob("*.tosa")) + + if evaluator.REQUIRES_CONFIG: + assert evaluator_config is not None + config_path = Path(evaluator_config) + with config_path.open() as f: + config = json.load(f) + + # Prefer a subclass provided from_config if available. 
+ if hasattr(evaluator, "from_config"): + factory = cast(Any, evaluator.from_config) # type: ignore[attr-defined] + init_evaluator = factory( + model_name, + model_fp32, + model_int8, + example_inputs, + str(tosa_paths[0]), + config, + ) + else: + raise RuntimeError( + f"Evaluator {evaluator_name} requires config but does not implement from_config()" + ) + else: + init_evaluator = evaluator( + model_name, model_fp32, model_int8, example_inputs, str(tosa_paths[0]) + ) + + quant_metrics = init_evaluator.evaluate() + output_json_path = intermediates_path / "quant_metrics.json" + + with output_json_path.open("w") as json_file: + json.dump(quant_metrics, json_file) diff --git a/backends/backends.bzl b/backends/backends.bzl index 5ca30a83b54..42aed059f22 100644 --- a/backends/backends.bzl +++ b/backends/backends.bzl @@ -6,7 +6,6 @@ def get_all_cpu_backend_targets(): """ return [ "//executorch/backends/xnnpack:xnnpack_backend", - "//executorch/backends/fb/qnnpack:qnnpack_backend", ] def get_all_cpu_aot_and_backend_targets(): @@ -18,6 +17,4 @@ def get_all_cpu_aot_and_backend_targets(): return [ "//executorch/backends/xnnpack:xnnpack_preprocess", "//executorch/backends/xnnpack/partition:xnnpack_partitioner", - "//executorch/backends/fb/qnnpack:qnnpack_preprocess", - "//executorch/backends/fb/qnnpack/partition:qnnpack_partitioner", ] + get_all_cpu_backend_targets() diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index 47183bed21d..271b4806614 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -88,8 +88,11 @@ elseif(EXECUTORCH_FUSION_G3_OPT) ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 ) +elseif(EXECUTORCH_VISION_OPT) + set(TARGET_DIR vision) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) else() - set(TARGET_DIR reference) + set(TARGET_DIR generic) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) endif() 
diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 0ec09bf4f9e..94ab6de0e29 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -130,6 +130,7 @@ runtime.python_library( deps = [ "fbcode//caffe2:torch", "fbcode//executorch/exir:scalar_type", + "fbcode//executorch/kernels/quantized:custom_ops_generated_lib", ], ) @@ -143,18 +144,13 @@ executorch_generated_lib( visibility = ["PUBLIC"], deps = [ "//executorch/backends/cadence/generic/kernels:cadence_kernels", - # Individual operator targets instead of combined cadence_generic_ops - "//executorch/backends/cadence/generic/operators:op_add", - "//executorch/backends/cadence/generic/operators:op_embedding", - "//executorch/backends/cadence/generic/operators:op_full", "//executorch/backends/cadence/generic/operators:op_requantize_out", - "//executorch/backends/cadence/generic/operators:op_view_copy", "//executorch/backends/cadence/generic/operators:im2row_out", "//executorch/backends/cadence/generic/operators:dequantize_per_tensor", "//executorch/backends/cadence/generic/operators:quantize_per_tensor", "//executorch/backends/cadence/generic/operators:quantized_add_out", - "//executorch/backends/cadence/generic/operators:quantized_conv_nchw_out", - "//executorch/backends/cadence/generic/operators:quantized_conv_nhwc_out", + "//executorch/backends/cadence/generic/operators:quantized_conv2d_nchw_out", + "//executorch/backends/cadence/generic/operators:quantized_conv2d_nhwc_out", "//executorch/backends/cadence/generic/operators:quantized_fully_connected_out", "//executorch/backends/cadence/generic/operators:quantized_layer_norm", "//executorch/backends/cadence/generic/operators:quantized_linear_out", diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 6c497d5bec4..765ddcd581d 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -24,6 +24,7 @@ from executorch.backends.cadence.aot.quantizer.quantizer 
import ( CadenceDefaultQuantizer, CadenceQuantizer, + CadenceW8A32MixedQuantizer, ) from executorch.backends.cadence.aot.utils import ( get_default_memory_config, @@ -59,6 +60,7 @@ def trace( model: torch.nn.Module, inputs: tuple[object, ...], dump_graphs: bool = False, + quantizer: Optional[CadenceQuantizer] = None, ) -> ExportedProgram: """ Trace the model with export and return an ExportedProgram. @@ -73,6 +75,12 @@ def trace( torch.ops.aten.rms_norm.default, ] + if isinstance(quantizer, CadenceW8A32MixedQuantizer): + ops_to_keep += [ + torch.ops.aten.gru.input, + torch.ops.aten.gru.data, + ] + program = trace_fn( model, inputs, is_qat=False, strict=True, ops_to_keep=ops_to_keep ) @@ -99,7 +107,7 @@ def prepare_pt2( Returns a GraphModule with the prepared model. """ - traced_program = trace(model, inputs, dump_graphs=dump_graphs) + traced_program = trace(model, inputs, dump_graphs=dump_graphs, quantizer=quantizer) prepared_program = prepare_traced_pt2( traced_program, quantizer, dump_graphs=dump_graphs ) @@ -184,7 +192,7 @@ def get_fake_quant_model( # Make the model inference mode by calling model.eval() model.eval() - program = trace(model, inputs, dump_graphs=dump_graphs) + program = trace(model, inputs, dump_graphs=dump_graphs, quantizer=quantizer) if dump_graphs: logging.info("Graph after trace:") diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index 1c626887649..d8024c0245a 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -184,21 +184,81 @@ - arg_meta: null kernel_name: impl::generic::quantize_per_tensor_out +- func: cadence::quantize_per_tensor_asym8s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) 
+ variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::quantize_per_tensor_asym8s_out + +- func: cadence::quantize_per_tensor_asym8u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::quantize_per_tensor_asym8u_out + +- func: cadence::quantize_per_tensor_asym16s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::quantize_per_tensor_asym16s_out + +- func: cadence::quantize_per_tensor_asym16u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::quantize_per_tensor_asym16u_out + +- func: cadence::quantize_per_tensor_asym32s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::quantize_per_tensor_asym32s_out + - func: cadence::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null kernel_name: impl::generic::dequantize_per_tensor_out -- func: cadence::quantized_conv_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) 
+- func: cadence::dequantize_per_tensor_asym8s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::dequantize_per_tensor_asym8s_out + +- func: cadence::dequantize_per_tensor_asym8u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::dequantize_per_tensor_asym8u_out + +- func: cadence::dequantize_per_tensor_asym16s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::dequantize_per_tensor_asym16s_out + +- func: cadence::dequantize_per_tensor_asym16u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::dequantize_per_tensor_asym16u_out + +- func: cadence::dequantize_per_tensor_asym32s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::dequantize_per_tensor_asym32s_out + +- func: cadence::quantized_conv2d_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_out + kernel_name: impl::generic::quantized_conv2d_nchw_out -- func: cadence::quantized_conv_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_out + kernel_name: impl::generic::quantized_conv2d_nhwc_out - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: @@ -289,95 +349,95 @@ - arg_meta: null kernel_name: impl::generic::im2row_per_tensor_out -- func: cadence::quantized_conv_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_per_tensor_out -- func: cadence::quantized_conv_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_per_tensor_out -- func: cadence::quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) kernels: diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index a5f3102d600..3bdbb33d59b 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -284,111 +284,171 @@ - arg_meta: null kernel_name: impl::HiFi::quantize_per_tensor_out +- func: cadence::quantize_per_tensor_asym8s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) 
+ variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantize_per_tensor_asym8s_out + +- func: cadence::quantize_per_tensor_asym8u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantize_per_tensor_asym8u_out + +- func: cadence::quantize_per_tensor_asym16s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantize_per_tensor_asym16s_out + +- func: cadence::quantize_per_tensor_asym16u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantize_per_tensor_asym16u_out + +- func: cadence::quantize_per_tensor_asym32s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantize_per_tensor_asym32s_out + - func: cadence::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null kernel_name: impl::HiFi::dequantize_per_tensor_out -- func: cadence::quantized_conv_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::dequantize_per_tensor_asym8s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!)
out) -> Tensor(a!) + variants: function kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_out + kernel_name: impl::HiFi::dequantize_per_tensor_asym8s_out -- func: cadence::quantized_conv_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::dequantize_per_tensor_asym8u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_out + kernel_name: impl::HiFi::dequantize_per_tensor_asym8u_out -- func: cadence::quantized_conv_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::dequantize_per_tensor_asym16s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_per_tensor_out + kernel_name: impl::HiFi::dequantize_per_tensor_asym16s_out -- func: cadence::quantized_conv_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::dequantize_per_tensor_asym16u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) 
+ variants: function kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_per_tensor_out + kernel_name: impl::HiFi::dequantize_per_tensor_asym16u_out -- func: cadence::quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::dequantize_per_tensor_asym32s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::dequantize_per_tensor_asym32s_out -- func: cadence::quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_out -- func: cadence::quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_out -- func: cadence::quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_per_tensor_out -- func: cadence::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_per_tensor_out -- func: cadence::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out + +- func: cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out + +- func: cadence::quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: @@ -488,3 +548,18 @@ kernels: - arg_meta: null kernel_name: impl::HiFi::quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out + +- func: cadence::quantized_w8a32_linear.out(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale, *, Tensor(a!) output) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_w8a32_linear_out + +- func: cadence::quantized_w8a32_conv.out(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale, *, Tensor(a!) output) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_w8a32_conv_out + +- func: cadence::quantized_w8a32_gru.out(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_i_scale, Tensor bias_hidden, float b_h_scale, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_w8a32_gru_out diff --git a/backends/cadence/aot/functions_vision.yaml b/backends/cadence/aot/functions_vision.yaml new file mode 100644 index 00000000000..8d9cdd16105 --- /dev/null +++ b/backends/cadence/aot/functions_vision.yaml @@ -0,0 +1,265 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This yaml file contains operators that are also defined by the ATen library. +# For lean mode: +# - Codegen'd target `executorch_generated_lib` will be reading all the information +# from this file, including operator schema and kernel metadata. +# - Selective build target `codegen:executorch_defined_ops` now is selecting all the +# operators in this file, by dumping all the op names into `selected_operators.yaml`. +# +# See the README.md file in executorch/kernels/portable for a description of the syntax used +# by this file. + + +# aten ops +- op: _to_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::to_copy_out + +- op: _softmax.out + kernels: + - arg_meta: null + kernel_name: impl::vision::native::_softmax_out + +- op: add.out + kernels: + - arg_meta: null + kernel_name: impl::vision::native::add_out + +- op: bmm.out + kernels: + - arg_meta: null + kernel_name: torch::executor::bmm_out + +- op: cat.out + kernels: + - arg_meta: null + kernel_name: torch::executor::cat_out + +- op: clone.out + kernels: + - arg_meta: null + kernel_name: torch::executor::clone_out + +- op: div.out + kernels: + - arg_meta: null + kernel_name: torch::executor::div_out + +- op: div.out_mode + kernels: + - arg_meta: null + kernel_name: torch::executor::div_out_mode + +- op: embedding.out + kernels: + - arg_meta: null + kernel_name: impl::vision::native::embedding_out + +- op: empty.out + kernels: + - arg_meta: null + kernel_name: torch::executor::empty_out + +- op: expand_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::expand_copy_out + +- op: full.out + kernels: + - 
arg_meta: null + kernel_name: impl::vision::native::full_out + +- op: gelu.out + kernels: + - arg_meta: null + kernel_name: torch::executor::gelu_out + +- op: hardtanh.out + kernels: + - arg_meta: null + kernel_name: torch::executor::hardtanh_out + +- op: max_pool2d_with_indices.out + kernels: + - arg_meta: null + kernel_name: torch::executor::max_pool2d_with_indices_out + +- op: mean.out + kernels: + - arg_meta: null + kernel_name: torch::executor::mean_dim_out + +- op: mul.out + kernels: + - arg_meta: null + kernel_name: torch::executor::mul_out + +- op: mul.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::mul_scalar_out + +- op: permute_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::permute_copy_out + +- op: rsqrt.out + kernels: + - arg_meta: null + kernel_name: torch::executor::rsqrt_out + +- op: sigmoid.out + kernels: + - arg_meta: null + kernel_name: torch::executor::sigmoid_out + +- op: slice_copy.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::slice_copy_Tensor_out + +- op: split_with_sizes_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::split_with_sizes_copy_out + +- op: sub.out + kernels: + - arg_meta: null + kernel_name: torch::executor::sub_out + +- op: view_copy.out + kernels: + - arg_meta: null + kernel_name: impl::vision::native::view_copy_out + +- op: where.self_out + kernels: + - arg_meta: null + kernel_name: torch::executor::where_out + +- op: transpose_copy.int_out + kernels: + - arg_meta: null + kernel_name: torch::executor::transpose_copy_int_out + +- op: eq.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::eq_scalar_out + +- op: logical_not.out + kernels: + - arg_meta: null + kernel_name: torch::executor::logical_not_out + +- op: any.out + kernels: + - arg_meta: null + kernel_name: torch::executor::any_out + +- op: native_group_norm.out + kernels: + - arg_meta: null + kernel_name: torch::executor::native_group_norm_out + +- 
op: sum.IntList_out + kernels: + - arg_meta: null + kernel_name: torch::executor::sum_dim_out + +- op: select_copy.int_out + kernels: + - arg_meta: null + kernel_name: torch::executor::select_copy_int_out + +# custom ops +- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantize_per_tensor_out + +- func: cadence::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::vision::native::dequantize_per_tensor_out + +- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_conv_out + +- func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_layer_norm_out +- func: cadence::quantized_layer_norm.per_tensor_out(Tensor input, float in_scale, int in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_layer_norm_per_tensor_out + +- func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_linear_out + +- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_relu_out + +- func: cadence::quantized_relu.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_relu_per_tensor_out + +- func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_matmul_out + +- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_linear_per_tensor_out + +- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::vision::native::im2row_out + +- func: cadence::im2row.per_tensor_out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, int in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::im2row_per_tensor_out + +- func: cadence::quantized_conv.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_conv_per_tensor_out + +- func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_fully_connected_out + +- func: cadence::quantized_fully_connected.per_tensor_out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, int weight_zero_point, int out_multiplier, int out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_fully_connected_per_tensor_out + +- func: cadence::requantize.out(Tensor input, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, Tensor out_zero_point, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::requantize_out + +- func: cadence::requantize.per_tensor_out(Tensor input, float in_scale, int in_zero_point, float out_scale, int out_zero_point, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::vision::native::requantize_per_tensor_out diff --git a/backends/cadence/aot/graph_builder.py b/backends/cadence/aot/graph_builder.py index 2cfd7900e8e..f609ba55472 100644 --- a/backends/cadence/aot/graph_builder.py +++ b/backends/cadence/aot/graph_builder.py @@ -44,12 +44,12 @@ class GraphBuilder(ExportPass): gm = builder.get_graph_module() """ - def __init__(self) -> None: + def __init__(self, fake_tensor_mode: Optional[FakeTensorMode] = None) -> None: self.exporter = ExportPass() self.tracer: ExportPass.ExportTracer = self.ExportTracer( self, torch.fx.graph.CodeGen() ) - self.fake_tensor_mode = FakeTensorMode( + self.fake_tensor_mode: FakeTensorMode = fake_tensor_mode or FakeTensorMode( allow_fallback_kernels=False, allow_non_fake_inputs=True, ) diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index bd2bf32834d..f827488adfb 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -28,12 +28,78 @@ "quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "quantize_per_tensor_asym8s(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "quantize_per_tensor_asym8s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) + +lib.define( + "quantize_per_tensor_asym8u(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "quantize_per_tensor_asym8u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) 
out) -> Tensor(a!)" +) + +lib.define( + "quantize_per_tensor_asym16s(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "quantize_per_tensor_asym16s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) + +lib.define( + "quantize_per_tensor_asym16u(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "quantize_per_tensor_asym16u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) + +lib.define( + "quantize_per_tensor_asym32s(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "quantize_per_tensor_asym32s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) + lib.define( "dequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" ) lib.define( "dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "dequantize_per_tensor_asym8s(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "dequantize_per_tensor_asym8s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "dequantize_per_tensor_asym8u(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "dequantize_per_tensor_asym8u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) 
out) -> Tensor(a!)" +) +lib.define( + "dequantize_per_tensor_asym16s(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "dequantize_per_tensor_asym16s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "dequantize_per_tensor_asym16u(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "dequantize_per_tensor_asym16u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) + +lib.define( + "dequantize_per_tensor_asym32s(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "dequantize_per_tensor_asym32s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) lib.define( "quantized_layer_norm(Tensor X, Tensor X_scale, Tensor X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point) -> (Tensor Y)" @@ -86,28 +152,28 @@ ) lib.define( - "quantized_conv_nhwc(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, 
Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define( "quantized_matmul(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False) -> (Tensor Z)" @@ -122,100 +188,100 @@ "quantized_matmul_asym8sxasym8s_asym8s.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define( "quantized_matmul_asym8uxasym8u_asym8u(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False) -> (Tensor Z)" @@ -254,7 +320,7 @@ "float out_scale, int out_zero_point) -> (Tensor Z)" ) lib.define( - "quantized_embedding_byte(Tensor weight, Tensor weight_scales, Tensor weight_zero_points, " + "quantized_embedding_byte(Tensor weight, Tensor weight_scales, Tensor? 
weight_zero_points, " "Tensor indices, bool pruned_weights=False) -> (Tensor X)" ) lib.define( @@ -263,7 +329,7 @@ "Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False) -> (Tensor out)" ) lib.define( - "avg_pool2d(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, " + "avg_pool2d(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=[], bool ceil_mode=False, " "bool count_include_pad=True, int? divisor_override=None, Tensor? in_zero_point=None, bool channel_last=False) -> (Tensor out)" ) lib.define( @@ -448,7 +514,7 @@ "int weight_zero_point, int out_multiplier, int out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define( - "quantized_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor weight_zero_points, " + "quantized_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " "Tensor indices, bool pruned_weights=False, *, Tensor(a!) out) -> Tensor(a!)" ) @@ -459,7 +525,7 @@ "Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define( - "avg_pool2d.out(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, " + "avg_pool2d.out(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=[], " "bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, " "Tensor? in_zero_point=None, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)" ) @@ -498,6 +564,29 @@ "_softmax_f32_f32.out(Tensor self, int dim, bool? half_to_float, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "quantized_w8a32_linear(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale) -> Tensor" +) +lib.define( + "quantized_w8a32_linear.out(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale, *, Tensor(a!) 
output) -> Tensor(a!)" +) + +lib.define( + "quantized_w8a32_conv(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale) -> Tensor" +) +lib.define( + "quantized_w8a32_conv.out(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale, *, Tensor(a!) output) -> Tensor(a!)" +) + +lib.define( + "quantized_w8a32_gru(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_i_scale, Tensor bias_hidden, float b_h_scale) -> Tensor" +) + +lib.define( + "quantized_w8a32_gru.out(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_i_scale, Tensor bias_hidden, float b_h_scale, *, Tensor(a!) out) -> Tensor(a!)" +) + + # Custom ops with aten namespace. Need to specify the lib var as FRAGMENT type as aten library is already defined aten_lib = Library("aten", "FRAGMENT") aten_lib.define( @@ -554,6 +643,66 @@ def quantize_per_tensor_meta( return input.new_empty(input.size(), dtype=dtype) +@register_fake("cadence::quantize_per_tensor_asym8s") +def quantize_per_tensor_asym8s_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=dtype) + + +@register_fake("cadence::quantize_per_tensor_asym8u") +def quantize_per_tensor_asym8u_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=dtype) + + +@register_fake("cadence::quantize_per_tensor_asym16s") +def quantize_per_tensor_asym16s_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=dtype) + + +@register_fake("cadence::quantize_per_tensor_asym16u") +def 
quantize_per_tensor_asym16u_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=dtype) + + +@register_fake("cadence::quantize_per_tensor_asym32s") +def quantize_per_tensor_asym32s_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=dtype) + + @register_fake("cadence::dequantize_per_tensor") def dequantize_per_tensor_meta( input: torch.Tensor, @@ -566,6 +715,66 @@ def dequantize_per_tensor_meta( return input.new_empty(input.size(), dtype=torch.float) +@register_fake("cadence::dequantize_per_tensor_asym8s") +def dequantize_per_tensor_asym8s_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=torch.float) + + +@register_fake("cadence::dequantize_per_tensor_asym8u") +def dequantize_per_tensor_asym8u_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=torch.float) + + +@register_fake("cadence::dequantize_per_tensor_asym16s") +def dequantize_per_tensor_asym16s_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=torch.float) + + +@register_fake("cadence::dequantize_per_tensor_asym16u") +def dequantize_per_tensor_asym16u_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=torch.float) + + +@register_fake("cadence::dequantize_per_tensor_asym32s") +def 
dequantize_per_tensor_asym32s_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=torch.float) + + @register_fake("cadence::quantized_add") def quantized_add_meta( X: torch.Tensor, @@ -717,8 +926,8 @@ def quantized_linear_asym8uxasym8u_asym8u_per_tensor_meta( return src.new_empty(out_size, dtype=src.dtype) -@register_fake("cadence::quantized_conv_nhwc") -def quantized_conv_nhwc_meta( +@register_fake("cadence::quantized_conv2d_nhwc") +def quantized_conv2d_nhwc_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -761,8 +970,8 @@ def quantized_conv_nhwc_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw") -def quantized_conv_nchw_meta( +@register_fake("cadence::quantized_conv2d_nchw") +def quantized_conv2d_nchw_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -805,8 +1014,8 @@ def quantized_conv_nchw_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw.per_tensor") -def quantized_conv_nchw_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nchw.per_tensor") +def quantized_conv2d_nchw_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -849,8 +1058,8 @@ def quantized_conv_nchw_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc.per_tensor") -def quantized_conv_nhwc_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nhwc.per_tensor") +def quantized_conv2d_nhwc_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -893,8 +1102,8 @@ def quantized_conv_nhwc_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor") -def 
quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -942,8 +1151,8 @@ def quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -991,8 +1200,8 @@ def quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1040,8 +1249,8 @@ def quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1089,8 +1298,8 @@ def quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor") -def 
quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1138,8 +1347,8 @@ def quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1187,8 +1396,8 @@ def quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1236,8 +1445,8 @@ def quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1285,8 +1494,10 @@ def quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_meta( return input.new_empty(output_size, 
dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake( + "cadence::quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor" +) +def quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1334,8 +1545,10 @@ def quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake( + "cadence::quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor" +) +def quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1383,8 +1596,10 @@ def quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake( + "cadence::quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor" +) +def quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1432,8 +1647,10 @@ def quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake( + "cadence::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor" +) +def quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, 
weight: torch.Tensor, bias: torch.Tensor, @@ -2050,10 +2267,10 @@ def avg_pool2d_meta( kernel_size: Tuple[int], stride: Tuple[int], padding: Tuple[int], - ceil_mode: bool, - count_include_pad: Optional[bool] = True, + ceil_mode: bool = False, + count_include_pad: bool = True, divisor_override: Optional[int] = None, - in_zero_point: Optional[int] = None, + in_zero_point: Optional[torch.Tensor] = None, channel_last: bool = False, ) -> torch.Tensor: # Use torch native meta kernels when operator semantics are similar @@ -2108,6 +2325,28 @@ def transposed_im2row_meta( return input.new_empty(output_size, dtype=input.dtype) +@register_fake("cadence::quantized_embedding_byte") +def quantized_embedding_byte_meta( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: torch.Tensor | None, + indices: torch.Tensor, + pruned_weights: bool = False, +) -> torch.Tensor: + assert not pruned_weights + assert len(weight.shape) == 2 + assert 1 <= len(weight_scales.shape) <= 2 + if len(weight_scales.shape) == 2: + num_groups = weight_scales.shape[-1] + assert weight.shape[1] % num_groups == 0 + + if weight_zero_points is not None: + assert weight_zero_points.shape == weight_scales.shape + + assert 1 <= len(indices.shape) <= 2 + return torch.empty(*indices.shape, weight.shape[1], dtype=torch.float32) + + @register_fake("cadence::where_Scalar") def where_Scalar_meta( condition: torch.Tensor, @@ -2190,8 +2429,8 @@ def roi_align_box_processor_meta( return rois.new_empty((rois.shape[0], 80), dtype=torch.uint8) -@register_fake("cadence::quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake("cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -2226,8 +2465,8 @@ def quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_meta( return 
input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake("cadence::quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -2262,8 +2501,8 @@ def quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake("cadence::quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -2298,8 +2537,8 @@ def quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake("cadence::quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -2368,3 +2607,67 @@ def quantized_softmax_per_tensor_meta( out_zero_point: int, ) -> torch.Tensor: return input.new_empty(input.size(), dtype=input.dtype) + + +@register_fake("cadence::quantized_w8a32_linear") +def quantized_w8a32_linear_meta( + src: torch.Tensor, + weight: torch.Tensor, + w_scale: float, + bias: torch.Tensor, + b_scale: float, +) -> torch.Tensor: + # src comes in shape [leading_dims, in_dim] + # weight comes in shape [in_dim, out_dim] + # output comes in empty with shape [leading_dims, out_dim] + src_shape = list(src.shape) + weight_shape = weight.shape + assert len(weight_shape) == 2 + 
assert src_shape[-1] == weight_shape[-1] + src_shape[-1] = weight_shape[0] + return src.new_empty(src_shape, dtype=src.dtype) + + +@register_fake("cadence::quantized_w8a32_conv") +def quantized_w8a32_conv_meta( + src: torch.Tensor, + weight: torch.Tensor, + w_scale: float, + bias: torch.Tensor, + b_scale: float, +) -> torch.Tensor: + # src comes in shape [batch, in_channel, in_length] + # weight comes in shape [out_ch, in_ch, kernel_dim] + # output comes in empty with shape [batch, out_ch, in_length - kernel_dim + 1] + assert len(src.shape) == 3 + + kernel_size, out_channels, in_channels = weight.shape + assert in_channels == src.shape[-1] + + # Compute the output tensor size + output_size = get_conv1d_output_size( + src.permute(0, 2, 1).shape, + out_channels, + stride=1, + padding=0, + dilation=1, + kernel_size=kernel_size, + channel_last=False, + ) + return src.new_empty(output_size, dtype=src.dtype) + + +@register_fake("cadence::quantized_w8a32_gru") +def quantized_w8a32_gru_meta( + inputs: torch.Tensor, + hidden: torch.Tensor, + weights_inputs: torch.Tensor, + w_i_scale: float, + weights_hidden: torch.Tensor, + w_h_scale: float, + bias_inputs: torch.Tensor, + b_i_scale: float, + bias_hidden: torch.Tensor, + b_h_scale: float, +) -> torch.Tensor: + return inputs.new_empty((2, hidden.shape[-1]), dtype=inputs.dtype) diff --git a/backends/cadence/aot/program_builder.py b/backends/cadence/aot/program_builder.py index 862ba4e977c..46d730b68ff 100644 --- a/backends/cadence/aot/program_builder.py +++ b/backends/cadence/aot/program_builder.py @@ -12,6 +12,7 @@ from torch import Tensor from torch._export.verifier import Verifier from torch._ops import OpOverload +from torch._subclasses.fake_tensor import FakeTensorMode from torch.export import ExportedProgram from torch.export.exported_program import ModuleCallEntry, ModuleCallSignature from torch.export.graph_signature import ( @@ -37,6 +38,7 @@ def __init__( self, mode: Optional[IrMode] = None, 
_core_aten_ops_exception_list: Optional[list[OpOverload]] = None, + fake_tensor_mode: Optional[FakeTensorMode] = None, ) -> None: self.input_specs: list[InputSpec] = [] self.output_specs: list[OutputSpec] = [] @@ -46,7 +48,7 @@ def __init__( self._core_aten_ops_exception_list: list[OpOverload] = ( _core_aten_ops_exception_list or [] ) - super().__init__() + super().__init__(fake_tensor_mode=fake_tensor_mode) def insert_input_spec( self, target: str, input_kind: InputKind, value: Tensor diff --git a/backends/cadence/aot/quantizer/fusion_pass.py b/backends/cadence/aot/quantizer/fusion_pass.py index ed14574a8c8..2fa0f794e3c 100644 --- a/backends/cadence/aot/quantizer/fusion_pass.py +++ b/backends/cadence/aot/quantizer/fusion_pass.py @@ -24,6 +24,9 @@ LayerNormPattern, LinearPattern, MatmulPattern, + MixedW8A32ConvPattern, + MixedW8A32GruPattern, + MixedW8A32LinearPattern, ReluPattern0, ReluPattern1, SoftmaxPattern, @@ -390,6 +393,29 @@ def get_args_and_kwargs_relu( return args, kwargs +def get_args_and_kwargs_mixed_w8a32_linear( + graph_module: GraphModule, + other_inputs: List[fx.Node], + weights_inputs: List[fx.Node], + dequants_weights: List[fx.Node], + bias_inputs: List[fx.Node], + dequants_biases: List[fx.Node], +) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]: + w_scale_ = dequants_weights[0].args[1] + b_scale_ = dequants_biases[0].args[1] + + args = ( + other_inputs[0], + weights_inputs[0], + w_scale_, + bias_inputs[0], + b_scale_, + ) + kwargs = {} + + return args, kwargs + + def get_args_and_kwargs_softmax( graph_module: GraphModule, inputs_inputs: List[fx.Node], @@ -454,6 +480,87 @@ def get_args_and_kwargs_softmax( out_zero_point_tensor, ) kwargs = {} + + return args, kwargs + + +def get_args_and_kwargs_mixed_w8a32_conv( + graph_module: GraphModule, + other_inputs: List[fx.Node], + weights_inputs: List[fx.Node], + dequants_weights: List[fx.Node], + bias_inputs: List[fx.Node], + dequants_biases: List[fx.Node], + op_node: fx.Node, +) -> 
Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]: + # Stride, padding, dilation, groups not supported yet + if len(op_node.args) > 3: + assert op_node.args[3] == [1] # Stride + if len(op_node.args) > 4: + assert op_node.args[4] == [0] # Padding + if len(op_node.args) > 5: + assert op_node.args[5] == [1] # Dilation + if len(op_node.args) > 6: + assert op_node.args[6] == 1 # Groups + + assert len(dequants_weights) == 1 + assert len(dequants_biases) == 1 + W_scale_ = dequants_weights[0].args[1] + B_scale_ = dequants_biases[0].args[1] + + transposed_inputs = graph_module.graph.call_function( + torch.ops.aten.permute.default, + (other_inputs[0], [0, 2, 1]), # NCL -> NLC + ) + transposed_weights = graph_module.graph.call_function( + torch.ops.aten.permute.default, + (weights_inputs[0], [2, 0, 1]), # NCL -> NLC + ) + + args = ( + transposed_inputs, + transposed_weights, + W_scale_, + bias_inputs[0], + B_scale_, + ) + kwargs = {} + + return args, kwargs + + +def get_args_and_kwargs_mixed_w8a32_gru( + graph_module: GraphModule, + other_inputs: List[fx.Node], + weights_inputs: List[fx.Node], + dequants_weights: List[fx.Node], + bias_inputs: List[fx.Node], + dequants_biases: List[fx.Node], + op_node: fx.Node, +) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]: + # Stride, padding, dilation, groups not supported yet + + assert len(dequants_weights) == 2 + assert len(dequants_biases) == 2 + w_i_scale = dequants_weights[0].args[1] + w_h_scale = dequants_weights[1].args[1] + b_i_scale = dequants_biases[0].args[1] + b_h_scale = dequants_biases[1].args[1] + + args = ( + other_inputs[0], + other_inputs[1], + weights_inputs[0], + w_i_scale, + weights_inputs[1], + w_h_scale, + bias_inputs[0], + b_i_scale, + bias_inputs[1], + b_h_scale, + ) + kwargs = {} + return args, kwargs @@ -471,7 +578,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 pattern.partition_types(), ) for fused_partition in fused_partitions: - anchors = pattern.get_anchors(graph_module, 
fused_partition) + anchors, op_node = pattern.get_anchors(graph_module, fused_partition) if not anchors or anchors.empty: continue if any(self.is_fused(p.nodes) for p in fused_partition): @@ -512,13 +619,10 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 bias_inputs = [node.args[0] for node in dequants_biases] other_inputs = [node.args[idx] for node, idx in anchors.others] - # The node is the first index of the list and first of the tuple - anchor_output_node = anchors.output[0][0] - - assert len(anchor_output_node.users) == 1 - quant_node = list(anchor_output_node.users.keys())[0] + assert op_node is not None, "op_node is None" + quant_node = list(op_node.users.keys())[0] - with graph_module.graph.inserting_after(anchor_output_node): + with graph_module.graph.inserting_after(op_node): args = tuple( inputs_inputs + weights_inputs + other_inputs + bias_inputs ) @@ -532,7 +636,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 ) elif isinstance(pattern, CatPattern): args, kwargs = get_args_and_kwargs_cat( - inputs_inputs, other_inputs, anchor_output_node + inputs_inputs, other_inputs, op_node ) elif isinstance(pattern, ConvReluPatterns): # For ConvReLU, we are fusing Conv+ReLU @@ -563,7 +667,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 dequants_weights, bias_inputs, quant_node, - anchor_output_node, + op_node, ) elif isinstance(pattern, LinearPattern): args, kwargs = get_args_and_kwargs_linear( @@ -618,20 +722,57 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 inputs_inputs, dequants_inputs, quant_node, - anchor_output_node, + op_node, ) + elif isinstance(pattern, MixedW8A32LinearPattern): + args, kwargs = get_args_and_kwargs_mixed_w8a32_linear( + graph_module, + other_inputs, + weights_inputs, + dequants_weights, + bias_inputs, + dequants_biases, + ) + elif isinstance(pattern, MixedW8A32ConvPattern): + args, kwargs = 
get_args_and_kwargs_mixed_w8a32_conv( + graph_module, + other_inputs, + weights_inputs, + dequants_weights, + bias_inputs, + dequants_biases, + op_node, + ) + elif isinstance(pattern, MixedW8A32GruPattern): + args, kwargs = get_args_and_kwargs_mixed_w8a32_gru( + graph_module, + other_inputs, + weights_inputs, + dequants_weights, + bias_inputs, + dequants_biases, + op_node, + ) + fused = graph_module.graph.call_function( pattern.replacement_op(), args, kwargs, ) - fused.meta = quant_node.meta - quant_node.replace_all_uses_with(fused) + + if len(anchors.output) > 0: + fused.meta = quant_node.meta + quant_node.replace_all_uses_with(fused) + else: + fused.meta = op_node.meta + op_node.replace_all_uses_with(fused) + if op_node.op == "output": + _ = graph_module.graph.output((fused,)) legalize_graph(graph_module) graph_module.graph.eliminate_dead_code() - # pyre-fixme[7]: Incompatible return type graph_module.recompile() + return PassResult(graph_module, True) @classmethod # pyre-ignore[2]: Parameter `nodes` has no type specified diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index 33b476f5120..2452cfdcfea 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -8,7 +8,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import List, Optional, Tuple, Union +from typing import List, Tuple, Union import torch from executorch.backends.cadence.aot.quantizer.utils import get_bias_qparams @@ -67,7 +67,7 @@ def partition_types(self) -> list[OpOverload]: @abstractmethod def get_anchors( self, gm: torch.fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> Optional[PartitionAnchors]: + ) -> Tuple[PartitionAnchors, fx.Node]: pass @abstractmethod @@ -85,7 +85,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> 
Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... addmm_node = fused_partition[0].nodes[-1] @@ -101,11 +101,14 @@ def get_anchors( qscheme=torch.per_tensor_affine, ) - return PartitionAnchors( - inputs=[(addmm_node, 1)], - weights=[(addmm_node, 2)], - biases=[(addmm_node, 0, bias_qspec)], - output=[(addmm_node,)], + return ( + PartitionAnchors( + inputs=[(addmm_node, 1)], + weights=[(addmm_node, 2)], + biases=[(addmm_node, 0, bias_qspec)], + output=[(addmm_node,)], + ), + addmm_node, ) def replacement_op(self) -> OpOverload: @@ -118,7 +121,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... add_node = fused_partition[0].nodes[-1] @@ -129,15 +132,21 @@ def get_anchors( add_node.args[1], fx.Node ) if not is_tensor_add or len(add_node.kwargs) > 0: - return PartitionAnchors( - empty=True, + return ( + PartitionAnchors( + empty=True, + ), + add_node, ) - return PartitionAnchors( - inputs=[(add_node, 0), (add_node, 1)], - weights=[], - biases=[], - output=[(add_node,)], + return ( + PartitionAnchors( + inputs=[(add_node, 0), (add_node, 1)], + weights=[], + biases=[], + output=[(add_node,)], + ), + add_node, ) def replacement_op(self) -> OpOverload: @@ -150,15 +159,18 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
bmm_node = fused_partition[0].nodes[-1] - return PartitionAnchors( - inputs=[(bmm_node, 0), (bmm_node, 1)], - weights=[], - biases=[], - output=[(bmm_node,)], + return ( + PartitionAnchors( + inputs=[(bmm_node, 0), (bmm_node, 1)], + weights=[], + biases=[], + output=[(bmm_node,)], + ), + bmm_node, ) def replacement_op(self) -> OpOverload: @@ -171,7 +183,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... cat_node = fused_partition[0].nodes[-1] @@ -198,13 +210,16 @@ def get_anchors( ) ) - return PartitionAnchors( - inputs=args, - weights=[], - biases=[], - output=[ - (cat_node, SharedQuantizationSpec((cat_node.args[0][0], cat_node))) - ], + return ( + PartitionAnchors( + inputs=args, + weights=[], + biases=[], + output=[ + (cat_node, SharedQuantizationSpec((cat_node.args[0][0], cat_node))) + ], + ), + cat_node, ) def replacement_op(self) -> OpOverload: @@ -217,7 +232,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
conv1d_node = fused_partition[0].nodes[-1] @@ -238,16 +253,19 @@ def get_anchors( if len(conv1d_node.args) > 2 and conv1d_node.args[2] is not None: bias = [(conv1d_node, 2, bias_qspec)] - return PartitionAnchors( - inputs=[(conv1d_node, 0)], - weights=[(conv1d_node, 1)], - # pyre-fixme[6]: Incompatible parameter type - biases=bias, - output=[(conv1d_node,)], + return ( + PartitionAnchors( + inputs=[(conv1d_node, 0)], + weights=[(conv1d_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(conv1d_node,)], + ), + conv1d_node, ) def replacement_op(self) -> OpOverload: - return torch.ops.cadence.quantized_conv_nchw.default + return torch.ops.cadence.quantized_conv2d_nchw.default class Conv2dPattern(QuantizationPattern): @@ -256,7 +274,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
conv2d_node = fused_partition[0].nodes[-1] @@ -277,16 +295,19 @@ def get_anchors( if len(conv2d_node.args) > 2 and conv2d_node.args[2] is not None: bias = [(conv2d_node, 2, bias_qspec)] - return PartitionAnchors( - inputs=[(conv2d_node, 0)], - weights=[(conv2d_node, 1)], - # pyre-fixme[6]: Incompatible parameter type - biases=bias, - output=[(conv2d_node,)], + return ( + PartitionAnchors( + inputs=[(conv2d_node, 0)], + weights=[(conv2d_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(conv2d_node,)], + ), + conv2d_node, ) def replacement_op(self) -> OpOverload: - return torch.ops.cadence.quantized_conv_nchw.default + return torch.ops.cadence.quantized_conv2d_nchw.default class LayerNormPattern(QuantizationPattern): @@ -295,7 +316,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... layer_norm_node = fused_partition[0].nodes[-1] @@ -311,13 +332,16 @@ def get_anchors( # Weights are used in quantized mode by our kernel, so they are # passed in as others here along with the normalized shape. - return PartitionAnchors( - inputs=[(layer_norm_node, 0)], - weights=[], - biases=[], - # Ordering: normalized_shape, weights, bias - others=others, - output=[(layer_norm_node,)], + return ( + PartitionAnchors( + inputs=[(layer_norm_node, 0)], + weights=[], + biases=[], + # Ordering: normalized_shape, weights, bias + others=others, + output=[(layer_norm_node,)], + ), + layer_norm_node, ) def replacement_op(self) -> OpOverload: @@ -330,7 +354,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
linear_node = fused_partition[0].nodes[-1] @@ -351,12 +375,15 @@ def get_anchors( if len(linear_node.args) > 2: bias = [(linear_node, 2, bias_qspec)] - return PartitionAnchors( - inputs=[(linear_node, 0)], - weights=[(linear_node, 1)], - # pyre-fixme[6]: Incompatible parameter type - biases=bias, - output=[(linear_node,)], + return ( + PartitionAnchors( + inputs=[(linear_node, 0)], + weights=[(linear_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(linear_node,)], + ), + linear_node, ) def replacement_op(self) -> OpOverload: @@ -369,15 +396,18 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... matmul_node = fused_partition[0].nodes[-1] - return PartitionAnchors( - inputs=[(matmul_node, 0), (matmul_node, 1)], - weights=[], - biases=[], - output=[(matmul_node,)], + return ( + PartitionAnchors( + inputs=[(matmul_node, 0), (matmul_node, 1)], + weights=[], + biases=[], + output=[(matmul_node,)], + ), + matmul_node, ) def replacement_op(self) -> OpOverload: @@ -392,15 +422,18 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
relu_node = fused_partition[0].nodes[-1] - return PartitionAnchors( - inputs=[(relu_node, 0)], - weights=[], - biases=[], - output=[(relu_node,)], + return ( + PartitionAnchors( + inputs=[(relu_node, 0)], + weights=[], + biases=[], + output=[(relu_node,)], + ), + relu_node, ) def replacement_op(self) -> OpOverload: @@ -427,7 +460,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # The first node should be conv, the second should be relu # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... conv_node = fused_partition[0].nodes[-1] # Second to last node @@ -451,16 +484,19 @@ def get_anchors( if len(conv_node.args) > 2 and conv_node.args[2] is not None: bias = [(conv_node, 2, bias_qspec)] - return PartitionAnchors( - inputs=[(conv_node, 0)], - weights=[(conv_node, 1)], - # pyre-fixme[6]: Incompatible parameter type - biases=bias, - output=[(relu_node,)], # Output is from the relu node + return ( + PartitionAnchors( + inputs=[(conv_node, 0)], + weights=[(conv_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(relu_node,)], # Output is from the relu node + ), + relu_node, ) def replacement_op(self) -> OpOverload: - return torch.ops.cadence.quantized_conv_nchw.default + return torch.ops.cadence.quantized_conv2d_nchw.default # Conv1d + regular relu op fusion @@ -488,22 +524,197 @@ def partition_types(self) -> List[OpOverload]: class SoftmaxPattern(QuantizationPattern): - def partition_types(self) -> List[OpOverload]: return [torch.ops.aten._softmax.default] def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
softmax_node = fused_partition[0].nodes[-1] - return PartitionAnchors( - inputs=[(softmax_node, 0)], - weights=[], - biases=[], - output=[(softmax_node,)], + return ( + PartitionAnchors( + inputs=[(softmax_node, 0)], + weights=[], + biases=[], + output=[(softmax_node,)], + ), + softmax_node, ) def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_softmax.default + + +class MixedW8A32LinearPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.linear.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> Tuple[PartitionAnchors, fx.Node]: + # pyre-ignore[29] + linear_layer = fused_partition[0].nodes[-1] + + # Bail if the arguments have different shapes than expected + if len(linear_layer.args) != 3 or len(linear_layer.kwargs) > 0: + return ( + PartitionAnchors( + empty=True, + ), + linear_layer, + ) + + input_node = linear_layer.args[0] + input_shape = input_node.meta["tensor_meta"].shape + + # Bail if the weights are not multiple of 4 (SIMD) + if input_shape[-1] % 4 != 0: + return ( + PartitionAnchors( + empty=True, + ), + linear_layer, + ) + # Currenly only supporting vector-matrix multiplication + if len(input_shape) > 0 and input_shape[-2] != 1: + return ( + PartitionAnchors( + empty=True, + ), + linear_layer, + ) + + return ( + PartitionAnchors( + inputs=[], + weights=[(linear_layer, 1)], + biases=[(linear_layer, 2)], + output=[], + others=[(linear_layer, 0)], + ), + linear_layer, + ) + + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_w8a32_linear.default + + +class MixedW8A32ConvPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv1d.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> Tuple[PartitionAnchors, fx.Node]: + # pyre-ignore[29] + conv_layer = fused_partition[0].nodes[-1] + + # Bail if the arguments 
have different shapes than expected + # Stride, padding, dilation and groups are not supported + if len(conv_layer.args) != 3 or len(conv_layer.kwargs) > 0: + return ( + PartitionAnchors( + empty=True, + ), + conv_layer, + ) + + cnn_weights = conv_layer.args[1] + if hasattr(cnn_weights.meta, "tensor_meta"): + cnn_weights_shape = cnn_weights.meta["tensor_meta"].shape + # Bail if the channels are not multiple of 4 (SIMD) + if cnn_weights_shape[0] % 4 != 0: + return ( + PartitionAnchors( + empty=True, + ), + conv_layer, + ) + if cnn_weights_shape[1] % 4 != 0: + return ( + PartitionAnchors( + empty=True, + ), + conv_layer, + ) + # Bail if the kernel size is not 3 + if cnn_weights_shape[2] != 3: + return ( + PartitionAnchors( + empty=True, + ), + conv_layer, + ) + + return ( + PartitionAnchors( + inputs=[], + weights=[(conv_layer, 1)], + biases=[(conv_layer, 2)], + output=[], + others=[(conv_layer, 0)], + ), + conv_layer, + ) + + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_w8a32_conv.default + + +class MixedW8A32GruPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.gru.input] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> Tuple[PartitionAnchors, fx.Node]: + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
+ gru_layer = fused_partition[0].nodes[-1] + if len(gru_layer.kwargs) > 0: + return ( + PartitionAnchors( + empty=True, + ), + gru_layer, + ) + + # Bail if input or states are not multiple of 4 (SIMD) + if gru_layer.args[0].meta["tensor_meta"].shape[-1] % 4 != 0: + return ( + PartitionAnchors( + empty=True, + ), + gru_layer, + ) + if gru_layer.args[1].meta["tensor_meta"].shape[-1] % 4 != 0: + return ( + PartitionAnchors( + empty=True, + ), + gru_layer, + ) + + class Wrapper: # noqa: B903 + def __init__(self, args, meta): + self.args = args + self.meta = meta + + wrapper = Wrapper(tuple(gru_layer.args[2]), gru_layer.meta) + + return ( + PartitionAnchors( + inputs=[], + # pyre-fixme[6]: Expected `List[Tuple[Node, int]]` but got `List[Tuple[Wrapper, int]]`. + weights=[(wrapper, 0), (wrapper, 1)], + # pyre-fixme[6]: Expected `List[Union[Tuple[Node, int], Tuple[Node, int, DerivedQuantizationSpec]]]` but got `List[Tuple[Wrapper, int]]`. + biases=[(wrapper, 2), (wrapper, 3)], + output=[], + others=[(gru_layer, 0), (gru_layer, 1)], + ), + gru_layer, + ) + + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_w8a32_gru.default diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py index ad5f935173e..d4af074c475 100644 --- a/backends/cadence/aot/quantizer/quantizer.py +++ b/backends/cadence/aot/quantizer/quantizer.py @@ -24,6 +24,9 @@ LayerNormPattern, LinearPattern, MatmulPattern, + MixedW8A32ConvPattern, + MixedW8A32GruPattern, + MixedW8A32LinearPattern, QuantizationPattern, ReluPattern0, ReluPattern1, @@ -109,6 +112,13 @@ None, ) +qconfig_A32W8sym = QuantizationConfig( + input_activation=None, + output_activation=None, + weight=wgt_qspec_sym8s, + bias=wgt_qspec_sym8s, +) + class CadenceAtenQuantizer(Quantizer): def __init__( @@ -133,7 +143,7 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: if not no_outside_users(fused_partition): continue - anchors = 
self.pattern.get_anchors(model, fused_partition) + anchors, _ = self.pattern.get_anchors(model, fused_partition) if not anchors or anchors.empty: continue if is_annotated( @@ -302,6 +312,26 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: super().__init__(quantizers) +class CadenceW8A32MixedQuantizer(CadenceQuantizer): + """ + Quantizer for mixed quantization, 8 bit weights and 32 bit activations + TODO: Experimental quantizer, not yet well supported in OSS + """ + + def __init__(self) -> None: + quantizers = [] + quantizers.append( + CadenceAtenQuantizer(MixedW8A32LinearPattern(), qconfig_A32W8sym) + ) + quantizers.append( + CadenceAtenQuantizer(MixedW8A32ConvPattern(), qconfig_A32W8sym) + ) + quantizers.append( + CadenceAtenQuantizer(MixedW8A32GruPattern(), qconfig_A32W8sym) + ) + super().__init__(quantizers) + + class CadenceWithSoftmaxQuantizer(CadenceQuantizer): """ Quantizer including A16 softmax diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py index 2a53c2dde7a..ed9bb438a9e 100644 --- a/backends/cadence/aot/ref_implementations.py +++ b/backends/cadence/aot/ref_implementations.py @@ -6,16 +6,17 @@ # pyre-strict - from typing import Callable import torch +import torch.nn as nn +import torch.nn.functional as F from executorch.exir.scalar_type import ScalarType from torch.library import impl, Library - m = Library("cadence", "IMPL", "CompositeExplicitAutograd") +torch.ops.load_library("//executorch/kernels/quantized:custom_ops_generated_lib") qdtype_map: dict[ScalarType, torch.dtype] = { ScalarType.QINT8: torch.qint8, @@ -38,7 +39,7 @@ def quantize_per_tensor( Args: - input_tensor (Tensor): input tensor - - scale (float): Inverse of quantization scale. Derived from the ratio + - scale (float): Quantization scale. Derived from the ratio between the min/max of the floating-point tensor and the min/max of the quantized range, and then inverted. 
- zero_point (int): The point which represents 0 in the quantized @@ -61,13 +62,16 @@ def quantize_per_tensor( ] if dtype not in supported_quant_types: raise ValueError( - f"Unsupported dtype to quantize to. Supported dtypes must be one of {supported_quant_types}" + f"Unsupported dtype to quantize to {dtype}. Supported dtypes must be one of {supported_quant_types}" ) - quantized = torch.round(input_tensor * scale + zero_point).to(dtype) - return torch.max( - torch.min(quantized, torch.tensor(quant_max)), - torch.tensor(quant_min), + return torch.ops.quantized_decomposed.quantize_per_tensor( + input_tensor, + scale, + zero_point, + quant_min, + quant_max, + dtype, ) @@ -97,7 +101,7 @@ def dequantize_per_tensor( is already provided. - quant_max (int): The largest value in the quantized domain. Unused since scale is already provided. - - dtype (torch.dtype): The type of the output tensor. Must be a floating point type. + - dtype (torch.dtype): The type of the input tensor. """ supported_quant_types = [ torch.int8, @@ -108,23 +112,15 @@ def dequantize_per_tensor( ] if input_tensor.dtype not in supported_quant_types: raise ValueError(f"Input dtype must be one of {supported_quant_types}") - supported_dequant_types = [ - torch.float, - torch.float32, - torch.float16, - torch.bfloat16, - ] - if dtype not in supported_dequant_types: - raise ValueError( - f"Unsupported dtype to dequantize to. Supported dtypes must be one of {supported_dequant_types}" - ) - - # Needed to prevent underflow in cases where the zero_point is larger than - # the quantized value. 
- if not input_tensor.dtype.is_signed: - input_tensor = input_tensor.to(torch.int32) - - return (input_tensor - zero_point).to(dtype) * scale + if input_tensor.dtype != dtype: + raise ValueError("Input dtype must match dtype") + + # Use the reference implementation from torch quantized_decomposed library + # Unlike quantize_per_tensor, dequantize_per_tensor doesn't have a behavior + # difference, since there's no rounding algorithm (just arithmetic). + return torch.ops.quantized_decomposed.dequantize_per_tensor( + input_tensor, scale, zero_point, quant_min, quant_max, dtype + ) @impl(m, "quantized_add.per_tensor") @@ -180,12 +176,10 @@ def quantized_add_per_tensor( dequant_X = X_scale * (X - X_zero_point) dequant_Y = Y_scale * (Y - Y_zero_point) - out_scale_inv = 1 / out_scale - # q_min/q_max are unused args return quantize_per_tensor( dequant_X + dequant_Y, - out_scale_inv, + out_scale, out_zero_point, torch.iinfo(dtype).min, torch.iinfo(dtype).max, @@ -259,8 +253,7 @@ def quantized_linear_common( - out_zero_point (int): The quantized mapping of zero for the output - offset (Tensor): Unused """ - out_scale = -out_multiplier * (1 / (1 << 31)) * (2**out_shift) - out_scale_inv = 1 / out_scale + out_scale = 1.0 / (-out_multiplier * (1 / (1 << 31)) * (2**out_shift)) N, K = weight.shape @@ -271,7 +264,7 @@ def quantized_linear_common( supported_dtypes = [torch.int8, torch.uint8, torch.int32] if dtype not in supported_dtypes: raise ValueError( - f"Unsupported dtype to quantize to. Supported dtypes must be one of {supported_dtypes}" + f"Unsupported dtype to quantize to {dtype}. 
Supported dtypes must be one of {supported_dtypes}" ) out = torch.nn.functional.linear( @@ -281,7 +274,7 @@ def quantized_linear_common( ) return quantize_per_tensor( out, - out_scale_inv, + out_scale, out_zero_point, torch.iinfo(dtype).min, torch.iinfo(dtype).max, @@ -337,8 +330,8 @@ def variant( if out_shift.numel() != 1: raise ValueError("out_shift must be a scalar") - if out_shift.dtype != torch.int64: - raise ValueError("out_shift must be an int64") + if out_shift.dtype != torch.int32: + raise ValueError("out_shift must be an int32") _out_shift = int(out_shift.item()) _out_multiplier = int(out_multiplier[0].item()) @@ -399,6 +392,17 @@ def quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor() -> torch.Tensor: def quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor() -> torch.Tensor: ... +@impl(m, "fully_connected") +def fully_connected( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, +) -> torch.Tensor: + if input_tensor.shape[0] != 1: + raise ValueError("Fully connected linear only supports batch size of 1") + return F.linear(input_tensor, weight, bias) + + @impl(m, "quantized_matmul") def quantized_matmul( X: torch.Tensor, @@ -423,25 +427,27 @@ def quantized_matmul( - out_multiplier (int): The multiplier used to scale the output - out_shift (int): The shift used to scale the output - out_zero_point (int): The quantized mapping of zero for the output - - transposed (bool): Whether to transpose the weight tensor + - transposed (bool): Whether Y is transposed. """ if bias is not None and not torch.all(bias == 0): raise ValueError("bias must be None or all zeros since unused in out variant") - # Looks weird, but quantized linear assumes weights are pre-transposed, - # hence we transpose only if `transposed` is False. 
- if not transposed: - Y = Y.T + if transposed: + Y = Y.transpose(-1, -2) - return quantized_linear_common( - X, - Y, - bias or torch.zeros(1, dtype=torch.int32), - X_zero_point, - Y_zero_point, - out_multiplier, - out_shift, + out_scale = 1.0 / (-out_multiplier * (1 / (1 << 31)) * (2**out_shift)) + + out = torch.matmul( + (X - X_zero_point).float(), + (Y - Y_zero_point).float(), + ) + return quantize_per_tensor( + out, + out_scale, out_zero_point, + torch.iinfo(X.dtype).min, + torch.iinfo(X.dtype).max, + X.dtype, ) @@ -538,7 +544,7 @@ def quantized_layer_norm_per_tensor( ) float_input_tensor = dequantize_per_tensor( - input_tensor, X_scale, X_zero_point, -128, 127, torch.float32 + input_tensor, X_scale, X_zero_point, -128, 127, input_tensor.dtype ) out = torch.nn.functional.layer_norm( float_input_tensor, normalized_shape, weight, bias, eps=eps @@ -546,7 +552,7 @@ def quantized_layer_norm_per_tensor( return quantize_per_tensor( out, - 1 / output_scale, + output_scale, output_zero_point, torch.iinfo(input_tensor.dtype).min, torch.iinfo(input_tensor.dtype).max, @@ -615,7 +621,7 @@ def quantized_conv_per_tensor( return quantize_per_tensor( float_out, - 1.0 / output_scale, + output_scale, output_zero_point, torch.iinfo(input_tensor.dtype).min, torch.iinfo(input_tensor.dtype).max, @@ -623,8 +629,8 @@ def quantized_conv_per_tensor( ) -@impl(m, "quantized_conv_nchw.per_tensor") -def quantized_conv_nchw_per_tensor( +@impl(m, "quantized_conv2d_nchw.per_tensor") +def quantized_conv2d_nchw_per_tensor( input_tensor: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -679,8 +685,8 @@ def quantized_conv_nchw_per_tensor( ) -@impl(m, "quantized_conv_nhwc.per_tensor") -def quantized_conv_nhwc_per_tensor( +@impl(m, "quantized_conv2d_nhwc.per_tensor") +def quantized_conv2d_nhwc_per_tensor( input_tensor: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -800,7 +806,7 @@ def variant( # Call the appropriate base function match layout: case "nchw": - return 
quantized_conv_nchw_per_tensor( + return quantized_conv2d_nchw_per_tensor( input_tensor, weight, bias, @@ -817,7 +823,7 @@ def variant( out_shift, ) case "nhwc": - return quantized_conv_nhwc_per_tensor( + return quantized_conv2d_nhwc_per_tensor( input_tensor, weight, bias, @@ -841,84 +847,248 @@ def variant( return decorator -@impl(m, "quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nchw", torch.int8, torch.int8) -def quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nchw", torch.uint8, torch.uint8) -def quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nhwc", torch.int8, torch.int8) -def quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nhwc", torch.uint8, torch.uint8) -def quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... 
-@impl(m, "quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nchw", torch.int8, torch.int8) -def quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nchw", torch.uint8, torch.uint8) -def quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nhwc", torch.int8, torch.int8) -def quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nhwc", torch.uint8, torch.uint8) -def quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nchw", torch.int8, torch.int8) -def quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor() -> ( + torch.Tensor +): ... 
-@impl(m, "quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nchw", torch.uint8, torch.uint8) -def quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor() -> ( + torch.Tensor +): ... -@impl(m, "quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nhwc", torch.int8, torch.int8) -def quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor() -> ( + torch.Tensor +): ... -@impl(m, "quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nhwc", torch.uint8, torch.uint8) -def quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor() -> ( + torch.Tensor +): ... -@impl(m, "quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nchw", torch.int8, torch.int8, is_1d=True) -def quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nchw", torch.uint8, torch.uint8, is_1d=True) -def quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... 
-@impl(m, "quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nhwc", torch.int8, torch.int8, is_1d=True) -def quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nhwc", torch.uint8, torch.uint8, is_1d=True) -def quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... + + +@impl(m, "convolution") +def convolution( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int, int], + padding: tuple[int, int], + dilation: tuple[int, int], + groups: int, + channel_last: bool = False, +) -> torch.Tensor: + conv_is_1d = len(input_tensor.shape) == 3 + if channel_last: + if conv_is_1d: + input_tensor = input_tensor.movedim(-1, 1).contiguous() + if len(weight.shape) != 3: + raise ValueError("Weight tensor must be 3D if input is 3D") + weight = weight.movedim(-1, 1).contiguous() + else: + input_tensor = input_tensor.movedim(-1, -3) + if len(weight.shape) != 4: + raise ValueError("Weight tensor must be 4D if input is nd > 3") + weight = torch.permute(weight, (0, -1, 1, 2)).contiguous() + + _stride: tuple[int, int] | int = stride + _padding: tuple[int, int] | int = padding + _dilation: tuple[int, int] | int = dilation + + if conv_is_1d: + conv = torch.nn.functional.conv1d + _stride = stride[0] + _padding = padding[0] + _dilation = dilation[0] + else: + conv = torch.nn.functional.conv2d + + conv_out = conv(input_tensor, weight, bias, _stride, _padding, _dilation, groups) + if channel_last: + if conv_is_1d: + conv_out = conv_out.movedim(1, -1).contiguous() + else: + conv_out = conv_out.movedim(-3, 
-1).contiguous() + + return conv_out + + +@impl(m, "transposed_convolution") +def transposed_convolution( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int, int], + padding: tuple[int, int], + dilation: tuple[int, int], + output_padding: tuple[int, int], + groups: int, + channel_last: bool = False, +) -> torch.Tensor: + + conv_is_1d = len(input_tensor.shape) == 3 + if channel_last: + if conv_is_1d: + input_tensor = input_tensor.movedim(-1, 1).contiguous() + if len(weight.shape) != 3: + raise ValueError("Weight tensor must be 3D if input is 3D") + weight = weight.movedim(-1, 1).contiguous() + else: + input_tensor = input_tensor.movedim(-1, -3) + if len(weight.shape) != 4: + raise ValueError("Weight tensor must be 4D if input is nd > 3") + weight = torch.permute(weight, (0, -1, 1, 2)).contiguous() + + _stride: tuple[int, int] | int = stride + _padding: tuple[int, int] | int = padding + _dilation: tuple[int, int] | int = dilation + _output_padding: tuple[int, int] | int = output_padding + if conv_is_1d: + conv = torch.nn.functional.conv_transpose1d + _stride = stride[0] + _padding = padding[0] + _dilation = dilation[0] + _output_padding = output_padding[0] + else: + conv = torch.nn.functional.conv_transpose2d + + conv_out = conv( + input_tensor, + weight, + bias, + _stride, + _padding, + _output_padding, + groups, + _dilation, + ) + if channel_last: + if conv_is_1d: + conv_out = conv_out.movedim(1, -1).contiguous() + else: + conv_out = conv_out.movedim(-3, -1).contiguous() + + return conv_out + + +@impl(m, "avg_pool2d") +def avg_pool2d( + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + stride: tuple[int, int], + padding: tuple[int, int], + ceil_mode: bool = False, + count_include_pad: bool = False, + divisor_override: int | None = None, + in_zero_point: torch.Tensor | None = None, + channel_last: bool = False, +) -> torch.Tensor: + if channel_last: + raise NotImplementedError("Channel last is not yet 
supported for avg_pool2d") + + in_dtype = input_tensor.dtype + pad_h, pad_w = padding + if in_zero_point is not None: + # Avg pool2d does not allow non-0 padding, + # so we manually pad the input + pad_value = in_zero_point.item() + if not count_include_pad: + # To simulate this, just pad with 0s + pad_value = 0 + + input_tensor = torch.nn.functional.pad( + input_tensor, + (pad_w, pad_w, pad_h, pad_h), + mode="constant", + value=pad_value, + ).float() + + padding = (0, 0) + + out = torch.nn.functional.avg_pool2d( + input_tensor, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, + ) + + if in_zero_point is not None: + min_val = torch.iinfo(in_dtype).min + max_val = torch.iinfo(in_dtype).max + out = torch.clamp(torch.round(out), min_val, max_val) + + return out.to(in_dtype) def quantized_relu_common( @@ -942,8 +1112,10 @@ def quantized_relu_common( if X.dtype not in supported_dtypes: raise ValueError(f"X dtype must be one of {supported_dtypes}. Got {X.dtype}") - out_scale = -out_multiplier * (1 / (1 << 31)) * (2**out_shift) - dequantized_X = torch.where(X > X_zero_point, X - X_zero_point, torch.zeros_like(X)) + out_scale = 1.0 / (-out_multiplier * (1 / (1 << 31)) * (2**out_shift)) + dequantized_X = torch.where( + X > X_zero_point, X - X_zero_point, torch.zeros_like(X) + ).to(torch.float32) return quantize_per_tensor( dequantized_X, out_scale, @@ -955,7 +1127,6 @@ def quantized_relu_common( def quantized_relu_variant( - per_tensor: bool, dtype: torch.dtype | None = None, ) -> Callable[[Callable[..., torch.Tensor]], Callable[..., torch.Tensor]]: """Create a quantized relu variant with type checking.""" @@ -963,43 +1134,20 @@ def quantized_relu_variant( def decorator(_: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]: def variant( X: torch.Tensor, - X_zero_point: torch.Tensor | int, + X_zero_point: int, out_zero_point: int, - out_multiplier: torch.Tensor | int, - out_shift: torch.Tensor | int, + out_multiplier: int, + 
out_shift: int, ) -> torch.Tensor: - if per_tensor: - if dtype and X.dtype != dtype: - raise ValueError(f"X dtype must be {dtype}. Got {X.dtype}") - - assert isinstance(out_shift, int) - assert isinstance(out_multiplier, int) - _out_shift = out_shift - _out_multiplier = out_multiplier - else: - assert isinstance(out_multiplier, torch.Tensor) - if out_multiplier.numel() > 1: - raise ValueError("Only scalar out_multiplier is supported") - - assert isinstance(out_shift, torch.Tensor) - if out_shift.numel() > 1: - raise ValueError("Only scalar out_shift is supported") - - assert isinstance(X_zero_point, torch.Tensor) - if X_zero_point.shape != X.shape: - raise ValueError( - f"X_zero_point shape must be {X.shape}. Got {X_zero_point.shape}" - ) - - _out_multiplier = int(out_multiplier.item()) - _out_shift = int(out_shift.item()) + if dtype and X.dtype != dtype: + raise ValueError(f"X dtype must be {dtype}. Got {X.dtype}") return quantized_relu_common( X, X_zero_point, out_zero_point, - _out_multiplier, - _out_shift, + out_multiplier, + out_shift, ) return variant @@ -1007,33 +1155,28 @@ def variant( return decorator -@impl(m, "quantized_relu") -@quantized_relu_variant(False) -def quantized_relu() -> torch.Tensor: ... - - @impl(m, "quantized_relu.per_tensor") -@quantized_relu_variant(True) +@quantized_relu_variant() def quantized_relu_per_tensor() -> torch.Tensor: ... @impl(m, "quantized_relu_asym8s_asym8s.per_tensor") -@quantized_relu_variant(True, torch.int8) +@quantized_relu_variant(torch.int8) def quantized_relu_asym8s_asym8s_per_tensor() -> torch.Tensor: ... @impl(m, "quantized_relu_asym8u_asym8u.per_tensor") -@quantized_relu_variant(True, torch.uint8) +@quantized_relu_variant(torch.uint8) def quantized_relu_asym8u_asym8u_per_tensor() -> torch.Tensor: ... 
-@impl(m, "requantize") -def requantize( +@impl(m, "requantize.per_tensor") +def requantize_per_tensor( input: torch.Tensor, - in_scale: torch.Tensor, - in_zero_point: torch.Tensor, - out_scale: torch.Tensor, - out_zero_point: torch.Tensor, + in_scale: float, + in_zero_point: int, + out_scale: float, + out_zero_point: int, dtype: ScalarType, ) -> torch.Tensor: if dtype in qdtype_map: @@ -1042,11 +1185,6 @@ def requantize( torch.dequantize(input), out_scale, out_zero_point, qdtype_map[dtype] ) - # For in_scale or out_scale other than scalar, it requires quant/dequant - # per channel, but the channel dimension value is missing - if in_scale.numel() > 1 or out_scale.numel() > 1: - raise NotImplementedError("Only scalar scales are supported") - quant_min = torch.iinfo(input.dtype).min quant_max = torch.iinfo(input.dtype).max # pyre-fixme[6]: This dtype is actually the right one. @@ -1056,15 +1194,385 @@ def requantize( return torch.ops.quantized_decomposed.quantize_per_tensor( torch.ops.quantized_decomposed.dequantize_per_tensor( input, - in_scale.flatten()[0], - in_zero_point.flatten()[0], + in_scale, + in_zero_point, quant_min, quant_max, input.dtype, ), - out_scale.flatten()[0], - out_zero_point.flatten()[0], + out_scale, + out_zero_point, out_quant_min, out_quant_max, dtype, ) + + +@impl(m, "rms_norm") +def rms_norm( + X: torch.Tensor, + normalized_shape: tuple[int], + W: torch.Tensor, + eps: float, +) -> torch.Tensor: + return W * nn.RMSNorm(list(normalized_shape), eps=eps, dtype=X.dtype)(X) + + +@impl(m, "where_Scalar") +def where_Scalar( + condition: torch.Tensor, + if_true: float, + if_false: float, +) -> torch.Tensor: + if condition.dtype != torch.bool: + raise ValueError("condition must be a bool tensor") + + return torch.where(condition, if_true, if_false) + + +@impl(m, "rope") +def rope( + input_tensor: torch.Tensor, + sin_tensor: torch.Tensor, + cos_tensor: torch.Tensor, + pos: torch.Tensor | None, +) -> torch.Tensor: + original_shape = input_tensor.shape 
+ + if len(original_shape) not in [4, 5]: + raise ValueError( + f"Input tensor must be 4D or 5D. Got {len(original_shape)}D tensor" + ) + if original_shape[0] != 1: + raise ValueError("Input tensor must have batch size 1") + if len(original_shape) == 5: + input_tensor = input_tensor.view( + input_tensor.shape[0], input_tensor.shape[1], input_tensor.shape[2], -1 + ) + + _, s, h, hd = input_tensor.shape + + if hd % 2: + raise ValueError("Hidden dimension must be divisible by 2") + + if sin_tensor.shape != (s, hd // 2) or cos_tensor.shape != (s, hd // 2): + raise ValueError( + f"sin_tensor and cos_tensor must have shape {s, hd // 2}. Got {sin_tensor.shape} and {cos_tensor.shape}" + ) + + if pos is not None: + if pos.shape != (input_tensor.shape[1],): + raise ValueError( + f"pos must have shape {input_tensor.shape[1]}. Got {pos.shape}" + ) + sin_tensor = sin_tensor[pos] + cos_tensor = cos_tensor[pos] + + sin_tensor = sin_tensor.unsqueeze(1) + cos_tensor = cos_tensor.unsqueeze(1) + + x0, x1 = input_tensor[..., ::2], input_tensor[..., 1::2] + rotated = torch.cat( + [x0 * cos_tensor - x1 * sin_tensor, x0 * sin_tensor + x1 * cos_tensor], dim=-1 + ) + return rotated.view(original_shape) + + +@impl(m, "im2row") +def im2row( + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + dilation: tuple[int, int], + padding: tuple[int, int], + stride: tuple[int, int], + in_zero_point: torch.Tensor, + channel_last: bool = False, +) -> torch.Tensor: + """ + Converts an input tensor into a 2D matrix where each row is a flattened sliding window (patch) + from the input, suitable for use in convolution as a matrix multiplication (im2row). + + Args: + - input_tensor: Input tensor of shape (N, C, H, W) or (N, H, W, C) if channel_last. + - kernel_size: Size of the convolution kernel. + - dilation: Dilation of the convolution kernel. + - padding: Padding to apply to the input. + - stride: Stride of the convolution. 
+ - in_zero_point : Zero point for input quantization (broadcastable to input). + - channel_last: If True, input is in NHWC format, else NCHW. + + Returns: + - Tensor of shape (N, num_patches, patch_size) + """ + if len(input_tensor.shape) == 3: + height_dim = 1 if channel_last else 2 + input_tensor = input_tensor.unsqueeze(height_dim) + + if in_zero_point is not None: + if in_zero_point.numel() != 1 and in_zero_point.shape != ( + input_tensor.shape[0], + ): + raise ValueError( + f"Input zero point must be a scalar or broadcastable to input shape {input_tensor.shape}" + ) + if in_zero_point.dtype != torch.int32: + raise ValueError("Input zero point must be an int32 tensor") + + if channel_last: + input_tensor = input_tensor.movedim(-1, -3).contiguous() # NHWC -> NCHW + + N, C, H, W = input_tensor.shape + kH, kW = kernel_size + dH, dW = dilation + pH, pW = padding + sH, sW = stride + + # Handle padding with zero point values + if in_zero_point is not None and (pH > 0 or pW > 0): + # Expand zero point to (N, 1, 1, 1) for broadcasting + in_zero_point = in_zero_point.expand(N) + + # Pad input with the per-batch zero point values + input_tensor = torch.stack( + [ + torch.nn.functional.pad( + input_tensor[i], + (pW, pW, pH, pH), + mode="constant", + value=in_zero_point[i].item(), + ) + for i in range(len(input_tensor)) + ] + ) + + padding = (0, 0) # Already padded manually + + # Use unfold to extract sliding local blocks + # Unfold: (N, C, H, W) -> (N, C, L, kH, kW), where L = number of sliding windows + # torch.nn.functional.unfold returns (N, C*kH*kW, L) + patches = torch.nn.functional.unfold( + input_tensor.float(), # unfold not implemented for int + kernel_size=(kH, kW), + dilation=(dH, dW), + padding=padding, + stride=(sH, sW), + ).to( + input_tensor.dtype + ) # (N, C*kH*kW, L) + + # Transpose to (N, L, C*kH*kW) + patches = patches.transpose(1, 2).contiguous() + + # Reshape to (N*L, C*kH*kW) + patches = patches.view(N, -1, C * kH * kW) + + # If channel_last, output 
should be in NHWC patch order (but im2row is always row-major) + return patches + + +@impl(m, "im2row.per_tensor") +def im2row_per_tensor( + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + dilation: tuple[int, int], + padding: tuple[int, int], + stride: tuple[int, int], + in_zero_point: int, + channel_last: bool = False, +) -> torch.Tensor: + return im2row( + input_tensor, + kernel_size, + dilation, + padding, + stride, + torch.tensor(in_zero_point, dtype=torch.int32), + channel_last, + ) + + +@impl(m, "transposed_im2row") +def transposed_im2row( + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + dilation: tuple[int, int], + padding: tuple[int, int], + stride: tuple[int, int], + output_padding: tuple[int, int], + in_zero_point: torch.Tensor, + channel_last: bool = False, +) -> torch.Tensor: + """ + Converts input tensor patches into im2row format for transposed convolutions. + This function extracts patches from input in a pattern suitable for transposed convolution. + + Args: + - input_tensor: Input spatial tensor, NCHW or NHWC format (3D or 4D). + - kernel_size: Size of the convolution kernel. + - dilation: Dilation of the convolution kernel. + - padding: Padding to apply to the input. + - stride: Stride of the convolution. + - output_padding: Additional output padding for transposed convolution. + - in_zero_point: Zero point for input quantization (broadcastable to input). + - channel_last: If True, input is in NHWC format, else NCHW. 
+ + Returns: + - 3D tensor of shape (N, output_h * output_w, kernel_h * kernel_w * in_c) + """ + # Handle 1D convolution case by adding height dimension + if len(input_tensor.shape) == 3: + height_dim = 1 if channel_last else 2 + input_tensor = input_tensor.unsqueeze(height_dim) + + if in_zero_point is not None: + if in_zero_point.dtype != torch.int32: + raise ValueError("Input zero point must be an int32 tensor") + + # Move to NCHW for processing if needed + if channel_last: + input_tensor = input_tensor.movedim(-1, -3).contiguous() # NHWC -> NCHW + + N, C, H_in, W_in = input_tensor.shape + + # Output: (N, C*H_in*W_in, H_out, W_out) + H_out = ( + (H_in - 1) * stride[0] + + kernel_size[0] + + output_padding[0] + - 2 * padding[0] + + dilation[0] * (kernel_size[0] - 1) + ) + W_out = ( + (W_in - 1) * stride[1] + + kernel_size[1] + + output_padding[1] + - 2 * padding[1] + + dilation[1] * (kernel_size[1] - 1) + ) + + # For each input pixel, create a channel where the upsampled (transposed conv) patch is placed + # Output: (N, C*H_in*W_in, H_out, W_out) + inp_flat = input_tensor.reshape(N, C * H_in * W_in) + + # Calculate output spatial size + H_out = ( + (H_in - 1) * stride[0] + - 2 * padding[0] + + dilation[0] * (kernel_size[0] - 1) + + output_padding[0] + + 1 + ) + W_out = ( + (W_in - 1) * stride[1] + - 2 * padding[1] + + dilation[1] * (kernel_size[1] - 1) + + output_padding[1] + + 1 + ) + + # Compute the upsampled (top-left) position for each input pixel + h_idx = torch.arange(H_in, device=input_tensor.device) + w_idx = torch.arange(W_in, device=input_tensor.device) + grid_h, grid_w = torch.meshgrid(h_idx, w_idx, indexing="ij") + out_h_idx = grid_h * stride[0] - padding[0] + out_w_idx = grid_w * stride[1] - padding[1] + + # Compute all input pixel positions (flattened) + ch_idx = torch.arange(C * H_in * W_in, device=input_tensor.device) + ij_idx = ch_idx % (H_in * W_in) + i_idx = ij_idx // W_in + j_idx = ij_idx % W_in + + # For each input pixel, compute the output 
positions for the kernel window + kh_idx = torch.arange(kernel_size[0], device=input_tensor.device) + kw_idx = torch.arange(kernel_size[1], device=input_tensor.device) + kh_grid, kw_grid = torch.meshgrid(kh_idx, kw_idx, indexing="ij") + kh_grid = kh_grid.reshape(-1) + kw_grid = kw_grid.reshape(-1) + num_kernel = kernel_size[0] * kernel_size[1] + + # Broadcast to all channels and kernel positions + ch_idx_b = ch_idx.repeat_interleave(num_kernel) + n_kernel = ch_idx.shape[0] * num_kernel + + i_idx_b = i_idx.repeat_interleave(num_kernel) + j_idx_b = j_idx.repeat_interleave(num_kernel) + kh_b = kh_grid.repeat(ch_idx.shape[0]) + kw_b = kw_grid.repeat(ch_idx.shape[0]) + + h_out = out_h_idx[i_idx_b, j_idx_b] + kh_b * dilation[0] + w_out = out_w_idx[i_idx_b, j_idx_b] + kw_b * dilation[1] + + # Mask for valid output positions + valid = (h_out >= 0) & (h_out < H_out) & (w_out >= 0) & (w_out < W_out) + + # Prepare indices for advanced indexing + n_idx = ( + torch.arange(N, device=input_tensor.device) + .view(-1, 1) + .expand(N, n_kernel) + .reshape(-1) + ) + ch_idx_full = ch_idx_b.expand(N, n_kernel).reshape(-1) + h_out_full = h_out.expand(N, n_kernel).reshape(-1) + w_out_full = w_out.expand(N, n_kernel).reshape(-1) + valid_full = valid.expand(N, n_kernel).reshape(-1) + + # Gather input values for each channel + inp_vals = inp_flat[:, ch_idx_b].reshape(-1) + + # Create output tensor + patches = torch.zeros((N, C * H_in * W_in, H_out, W_out), dtype=input_tensor.dtype) + + # If in_zero_point is provided, fill patches with it + if in_zero_point is not None: + if in_zero_point.numel() == 1: + patches.fill_(in_zero_point.item()) + else: + # Broadcast in_zero_point to (N, C, H_in, W_in) + assert in_zero_point.shape == (N,) + in_zero_point = in_zero_point.view(N, 1, 1, 1) + patches = patches + in_zero_point + + # Scatter input values to output positions (only valid positions) + patches[ + n_idx[valid_full], + ch_idx_full[valid_full], + h_out_full[valid_full], + 
w_out_full[valid_full], + ] = inp_vals[valid_full] + + # Optionally, flatten to (N, num_patches, patch_size) if needed + patches = patches.view(N, C * H_in * W_in, -1).transpose(1, 2).contiguous() + return patches + + +@impl(m, "quantized_embedding_byte") +def quantized_embedding_byte( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: torch.Tensor | None, + indices: torch.Tensor, + pruned_weights: bool = False, +) -> torch.Tensor: + if pruned_weights: + raise NotImplementedError("Pruned weights not supported") + + # Cannot use torch.ops.quantized_decomposed.embedding_byte.dtype because + # it doesn't support num_groups == 1 + num_groups = 1 + if len(weight_scales.shape) == 2: + num_groups = weight_scales.shape[1] + + group_size = weight.shape[1] // num_groups + weight = torch.ops.torchao.dequantize_affine.default( + input=weight, + block_size=(1, group_size), + scale=weight_scales, + zero_point=weight_zero_points, + input_dtype=weight.dtype, + quant_min=torch.iinfo(weight.dtype).min, + quant_max=torch.iinfo(weight.dtype).max, + ) + + return weight[indices] diff --git a/backends/cadence/aot/remove_ops.py b/backends/cadence/aot/remove_ops.py index 663c5825e52..263d3a521f3 100644 --- a/backends/cadence/aot/remove_ops.py +++ b/backends/cadence/aot/remove_ops.py @@ -9,7 +9,7 @@ import logging from dataclasses import dataclass, field -from typing import cast, List, Optional, Sequence, Set +from typing import cast, List, Optional, Sequence, Set, Type import torch import torch.fx @@ -926,19 +926,28 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: return super().call(graph_module) +class CommonRemovePasses: + passes: List[Type[ExportPass]] = [ + RemoveCloneOpPass, + RemoveAliasCopyOpPass, + RemoveNopExpandOpPass, + RemoveNopSliceOrViewOpPass, + RemoveNopSelectOpPass, + RemoveToOpsPass, + RemoveZeroSizedCatArgsPass, + RemovePermutesAroundElementwiseOps, + RemoveSqueezeViewBeforeElementwiseOps, + RemoveCatFromSliceCopyPass, + ] + 
+ class CadenceRemoveNops: - passes = [ + passes: List[Type[ExportPass]] = CommonRemovePasses.passes + [ SimplifySliceOpPass, RemoveCloneOpsTransformImported, - RemoveToOpsPass, RemoveNopRequantizeOpPass, - RemoveZeroSizedCatArgsPass, - RemoveNopSliceOrViewOpPass, - RemoveNopExpandOpPass, RemoveZeroSizedConstantPadNd, - RemoveCloneOpPass, RemoveContiguousOpPass, - RemoveAliasCopyOpPass, RemoveNopMulOpPass, RemoveNopAddOpPass, RemoveNopLinalgVectorNormOpPass, diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py index c575be6e7fc..3cfc059e75b 100644 --- a/backends/cadence/aot/replace_ops.py +++ b/backends/cadence/aot/replace_ops.py @@ -43,7 +43,6 @@ from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue -from torch._subclasses import FakeTensor from torch.fx.node import Argument # A map to represent ops that: @@ -90,14 +89,10 @@ def replace_logical_nop_where_with_where( # Get the third arg node and its input logical_not_node = node.args[0] - logical_not_input_tensor = ( - logical_not_node.args[0].to_tensor() - if isinstance(logical_not_node.args[0], ProxyValue) - else logical_not_node.args[0] - ) + logical_not_input_node = logical_not_node.args[0] # If the logical_not input is not a boolean tensor, bail. 
- if logical_not_input_tensor.meta["spec"].dtype != torch.bool: + if logical_not_input_node.meta["val"].dtype != torch.bool: continue # Replace the where op with another one, flipping the inputs and using the boolean @@ -263,7 +258,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Glean the shape of input and output tensor - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() in_shape = in_tensor.shape out_shape = meta["val"].shape # Get the select dimension @@ -295,7 +290,7 @@ def call_operator(self, op, args, kwargs, meta): # Create a zero bias tensor, and insert it as a graph buffer before the # current node - mat2_tensor = mat2.to_tensor() if isinstance(mat2, ProxyValue) else mat2 + mat2_tensor = mat2.to_tensor() bias_size = mat2_tensor.size(1) zero_bias = super().call_operator( exir_ops.edge.aten.full.default, @@ -410,7 +405,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Get the old dim and new dim order - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() old_dims = tuple(range(in_tensor.dim())) new_dims = args[1] @@ -438,11 +433,17 @@ class ReplaceConvolutionOptionalArgsWithConcreteArgsPass(ExportPass): """ def call_operator(self, op, args, kwargs, meta): - if get_edge_overload_packet(op) != exir_ops.edge.aten.convolution: + op_packet = get_edge_overload_packet(op) + if op_packet not in { + exir_ops.edge.cadence.convolution, + exir_ops.edge.cadence.transposed_convolution, + }: return super().call_operator(op, args, kwargs, meta) + is_transposed = op_packet == exir_ops.edge.cadence.transposed_convolution + expected_args = 9 if is_transposed else 8 + assert len(args) == expected_args # Check if the bias is already concrete - assert len(args) == 9 if args[2] is not None: return super().call_operator(op, args, kwargs, 
meta) @@ -482,11 +483,7 @@ def call_operator(self, op, args, kwargs, meta): repeats = args[1] # Glean the shapes of input tensor - in_shape = list( - in_tensor.to_tensor().shape - if isinstance(in_tensor, ProxyValue) - else in_tensor.shape - ) + in_shape = list(in_tensor.to_tensor().shape) # If the size of repeats is more than the dimensionality of the tensor, # the output of repeat will be a higher-dimensional tensor. We reshape @@ -693,43 +690,27 @@ def call_operator(self, op, args, kwargs, meta): # graph operation (in this case a transpose_copy op) to be an explicit # ProxyValue as well. If not, the view op can be done directly on the # tensor. - transposed_weight = ( - super().call_operator( - exir_ops.edge.aten.transpose_copy.int, - ( - weight, - 0, - 1, - ), - kwargs, - meta, - ) - if isinstance(weight, ProxyValue) - else weight.transpose(0, 1) + transposed_weight = super().call_operator( + exir_ops.edge.aten.transpose_copy.int, + ( + weight, + 0, + 1, + ), + kwargs, + meta, ) - flipped_weight = ( - super().call_operator( - exir_ops.edge.aten.flip.default, - ( - transposed_weight, - [-1] if transposed_weight.to_tensor().dim() == 3 else [-1, -2], - ), - kwargs, - meta, - ) - if isinstance(transposed_weight, ProxyValue) - else ( - transposed_weight.flip(-1) - if transposed_weight.dim() == 3 - else transposed_weight.flip(-1, -2) - ) + flipped_weight = super().call_operator( + exir_ops.edge.aten.flip.default, + ( + transposed_weight, + [-1] if transposed_weight.to_tensor().dim() == 3 else [-1, -2], + ), + kwargs, + meta, ) - # From the previous checks, if flipped_weight is a FakeTensor, it has to be - # a constant (if not, it would be a ProxyValue). Mark it as such. - if isinstance(flipped_weight, FakeTensor): - flipped_weight.constant = flipped_weight new_args = ( in_tensor, flipped_weight, @@ -745,16 +726,10 @@ def call_operator(self, op, args, kwargs, meta): # Verify that output_padding is 0. 
assert all( x == 0 for x in output_padding - ), "Cannot handle padded output in convolution" + ), f"Cannot handle padded output in convolution. Got {output_padding=}" - # If the innermost dim of output tensor is 1, then the stride - # should be 1. Note that the first dimension of output tensor is - # channel - new_stride = stride.copy() - out_shape = meta["val"].shape - assert out_shape is not None - for i, e in enumerate(out_shape[2:]): - new_stride[i] = 1 if e == 1 else stride[i] + # Keep the original stride to maintain correct output dimensions + new_stride = stride new_args = ( in_tensor, @@ -787,8 +762,8 @@ class ReplaceTrivialConvWithLinear(ExportPass): trivial_conv_op_to_linear_op: Dict[EdgeOpOverload, EdgeOpOverload] = { exir_ops.edge.cadence.convolution.default: exir_ops.edge.aten.linear.default, - exir_ops.edge.cadence.quantized_conv_nchw.default: exir_ops.edge.cadence.quantized_linear.default, - exir_ops.edge.cadence.quantized_conv_nhwc.default: exir_ops.edge.cadence.quantized_linear.default, + exir_ops.edge.cadence.quantized_conv2d_nchw.default: exir_ops.edge.cadence.quantized_linear.default, + exir_ops.edge.cadence.quantized_conv2d_nhwc.default: exir_ops.edge.cadence.quantized_linear.default, } def call_operator(self, op, args, kwargs, meta): @@ -800,8 +775,8 @@ def call_operator(self, op, args, kwargs, meta): # extra args holding at least the zero point and scale of input, weight, bias, # and output tensor. 
quantized_op = ( - op == exir_ops.edge.cadence.quantized_conv_nchw.default - or op == exir_ops.edge.cadence.quantized_conv_nhwc.default + op == exir_ops.edge.cadence.quantized_conv2d_nchw.default + or op == exir_ops.edge.cadence.quantized_conv2d_nhwc.default ) assert (len(args) == 8 and not quantized_op) or ( len(args) >= 12 and quantized_op @@ -809,15 +784,9 @@ def call_operator(self, op, args, kwargs, meta): (in_tensor, weight, bias, stride, padding, dilation, groups) = args[0:7] # Glean the shapes of input, weight, and output - in_shape = ( - in_tensor.to_tensor().shape - if isinstance(in_tensor, ProxyValue) - else in_tensor.shape - ) + in_shape = in_tensor.to_tensor().shape - weight_shape = ( - weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape - ) + weight_shape = weight.to_tensor().shape out_shape = meta["val"].shape assert None not in {in_shape, weight_shape, out_shape} @@ -839,26 +808,16 @@ def call_operator(self, op, args, kwargs, meta): # Reshape the weight to [out_channels, in_channels * X] K = math.prod(weight_shape[1:]) - # If weight is a ProxyValue, linear_weight needs to be the output of a - # graph operation (in this case a view_copy op) to be an explicit ProxyValue - # as well. If not, the view op can be done directly on the tensor. - linear_weight = ( - super().call_operator( - exir_ops.edge.aten.view_copy.default, - ( - weight, - [weight_shape[0], K], - ), - kwargs, - meta, - ) - if isinstance(weight, ProxyValue) - else weight.contiguous().view(weight_shape[0], K) + # Weight is always a ProxyValue, so we need a view_copy operation + linear_weight = super().call_operator( + exir_ops.edge.aten.view_copy.default, + ( + weight, + [weight_shape[0], K], + ), + kwargs, + meta, ) - # From the previous check, if linear_weight is a FakeTensor, it has to be - # a constant (if not, it would be a ProxyValue). Mark it as such. 
- if isinstance(linear_weight, FakeTensor): - linear_weight.constant = linear_weight # Reshape the input from 3d to 2d tensor in_view = super().call_operator( @@ -881,11 +840,7 @@ def call_operator(self, op, args, kwargs, meta): out_zero_point, ) = args[7:12] # If the multiplier and shift tensors are provided, use them. - if ( - len(args) >= 14 - and isinstance(args[12], ProxyValue) - and isinstance(args[13], ProxyValue) - ): + if len(args) >= 14: out_multiplier = args[12] out_shift = args[13] # If not, compute them. @@ -979,18 +934,18 @@ def call_operator( ) -> ProxyValue: if op not in { exir_ops.edge.cadence.convolution.default, - exir_ops.edge.cadence.quantized_conv_nchw.default, + exir_ops.edge.cadence.quantized_conv2d_nchw.default, }: return super().call_operator(op, args, kwargs, meta) - quantized_op = op == exir_ops.edge.cadence.quantized_conv_nchw.default + quantized_op = op == exir_ops.edge.cadence.quantized_conv2d_nchw.default if not quantized_op and len(args) == 8 and args[-1] is True: # Already in NHWC layout. return super().call_operator(op, args, kwargs, meta) new_op = ( - exir_ops.edge.cadence.quantized_conv_nhwc.default + exir_ops.edge.cadence.quantized_conv2d_nhwc.default if quantized_op else exir_ops.edge.cadence.convolution.default ) @@ -1067,8 +1022,8 @@ class ReplaceConvWithIm2RowAndLinear(ExportPass): # decompose to. 
conv_op_to_linear_op: Dict[EdgeOpOverload, EdgeOpOverload] = { exir_ops.edge.cadence.convolution.default: exir_ops.edge.aten.linear.default, - exir_ops.edge.cadence.quantized_conv_nchw.default: exir_ops.edge.cadence.quantized_linear.default, - exir_ops.edge.cadence.quantized_conv_nhwc.default: exir_ops.edge.cadence.quantized_linear.default, + exir_ops.edge.cadence.quantized_conv2d_nchw.default: exir_ops.edge.cadence.quantized_linear.default, + exir_ops.edge.cadence.quantized_conv2d_nhwc.default: exir_ops.edge.cadence.quantized_linear.default, } def call_operator(self, op, args, kwargs, meta): @@ -1077,8 +1032,8 @@ def call_operator(self, op, args, kwargs, meta): # Get the relevant args from convolution node. quantized_op = ( - op == exir_ops.edge.cadence.quantized_conv_nchw.default - or op == exir_ops.edge.cadence.quantized_conv_nhwc.default + op == exir_ops.edge.cadence.quantized_conv2d_nchw.default + or op == exir_ops.edge.cadence.quantized_conv2d_nhwc.default ) assert (len(args) == 8 and not quantized_op) or ( len(args) >= 12 and quantized_op @@ -1089,9 +1044,7 @@ def call_operator(self, op, args, kwargs, meta): if groups != 1: return super().call_operator(op, args, kwargs, meta) - weight_shape = ( - weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape - ) + weight_shape = weight.to_tensor().shape # If this is a pointwise convolution, im2col will start dominating the # runtime. So we call convolution op for this case. if ( @@ -1110,7 +1063,7 @@ def call_operator(self, op, args, kwargs, meta): # channel_last layout is specified by the channel_last arg of conv # op, which is either the last argument (15th) or implicitely False # if the op is quantized, or the last argument if not. 
- channel_last = op == exir_ops.edge.cadence.quantized_conv_nhwc.default + channel_last = op == exir_ops.edge.cadence.quantized_conv2d_nhwc.default # The weight tensor is [out_channels, in_channels, X] for NCHW layout, # and [out_channels, X, in_channels] for NHWC layout. Here, X is the # kernel_width for conv1d, and X = kernel_height * kernel_width for @@ -1130,8 +1083,6 @@ def call_operator(self, op, args, kwargs, meta): {"dtype": torch.int32}, meta, ) - if isinstance(in_tensor.to_tensor(), FakeTensor) - else get_zero_point(in_tensor.to_tensor()) ) if quantized_op else torch.tensor(0, dtype=torch.int32) @@ -1167,26 +1118,16 @@ def call_operator(self, op, args, kwargs, meta): # Get the product of the >2 dims of the weight K = math.prod(weight_shape[1:]) - # If weight is a ProxyValue, linear_weight needs to be the output of a - # graph operation (in this case a view_copy op) to be an explicit ProxyValue - # as well. If not, the view op can be done directly on the tensor. - linear_weight = ( - super().call_operator( - exir_ops.edge.aten.view_copy.default, - ( - weight, - [weight_shape[0], K], - ), - kwargs, - meta, - ) - if isinstance(weight, ProxyValue) - else weight.contiguous().view(weight_shape[0], K) + # Weight is always a ProxyValue, so we need a view_copy operation + linear_weight = super().call_operator( + exir_ops.edge.aten.view_copy.default, + ( + weight, + [weight_shape[0], K], + ), + kwargs, + meta, ) - # From the previous check, if linear_weight is a FakeTensor, it has to be - # a constant (if not, it would be a ProxyValue). Mark it as such. - if isinstance(linear_weight, FakeTensor): - linear_weight.constant = linear_weight # Create the linear node, which multiplies the 3d input with 2d weight # tensors with bias addition. The outermost dimension of the input is @@ -1200,11 +1141,7 @@ def call_operator(self, op, args, kwargs, meta): out_zero_point, ) = args[7:12] # If the multiplier and shift tensors are provided, use them. 
- if ( - len(args) >= 14 - and isinstance(args[12], ProxyValue) - and isinstance(args[13], ProxyValue) - ): + if len(args) >= 14: out_multiplier = args[12] out_shift = args[13] # If not, compute them. @@ -1292,9 +1229,7 @@ def call_operator(self, op, args, kwargs, meta): # Get the shapes out_shape = meta["val"].shape - weight_shape = ( - weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape - ) + weight_shape = weight.to_tensor().shape assert None not in {weight_shape, out_shape} # Determine if the transposed_convolution is NCHW or NHWC. The NHWC, @@ -1348,26 +1283,16 @@ def call_operator(self, op, args, kwargs, meta): # Reshape the weight to [out_channels, in_channels * X] K = math.prod(weight_shape[1:]) - # If weight is a ProxyValue, linear_weight needs to be the output of a - # graph operation (in this case a view_copy op) to be an explicit ProxyValue - # as well. If not, the view op can be done directly on the tensor. - linear_weight = ( - super().call_operator( - exir_ops.edge.aten.view_copy.default, - ( - weight, - [weight_shape[0], K], - ), - kwargs, - meta, - ) - if isinstance(weight, ProxyValue) - else weight.contiguous().view(weight_shape[0], K) + # Weight is always a ProxyValue, so we need a view_copy operation + linear_weight = super().call_operator( + exir_ops.edge.aten.view_copy.default, + ( + weight, + [weight_shape[0], K], + ), + kwargs, + meta, ) - # From the previous check, if linear_weight is a FakeTensor, it has to be - # a constant (if not, it would be a ProxyValue). Mark it as such. - if isinstance(linear_weight, FakeTensor): - linear_weight.constant = linear_weight # Create the linear node, which multiplies the 3d input with 2d weight # tensors with bias addition. 
The outermost dimension of the input is @@ -1438,7 +1363,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Get the input tensor and shape - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() in_shape = in_tensor.shape # Get the output tensor shape out_shape = meta["val"].shape @@ -1507,7 +1432,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Extract the input tensor - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() leading_dims = math.prod(in_tensor.shape[:-1]) # If the tensor is not a vector, do nothing. if leading_dims != 1: @@ -1573,11 +1498,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator( exir_ops.edge.aten.full.default, ( - ( - args[0].to_tensor().shape - if isinstance(args[0], ProxyValue) - else args[0].shape - ), + args[0].to_tensor().shape, args[1], ), {}, @@ -1618,60 +1539,58 @@ class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass): replaced_scalar_args: dict[ EdgeOpOverloadPacket, tuple[EdgeOpOverload, Sequence[int]] ] = { - exir_ops.edge.cadence.quantized_add: ( + exir_ops.edge.cadence.quantized_add.default: ( exir_ops.edge.cadence.quantized_add.per_tensor, [1, 2, 4, 5], ), - exir_ops.edge.cadence.quantized_conv_nchw: ( - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.default: ( + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, [8, 9, 12, 13], ), - exir_ops.edge.cadence.quantized_conv_nhwc: ( - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.default: ( + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, [8, 9, 12, 13], ), - exir_ops.edge.cadence.quantized_fully_connected: ( + exir_ops.edge.cadence.quantized_fully_connected.default: ( 
exir_ops.edge.cadence.quantized_fully_connected.per_tensor, [4, 5, 6], ), - exir_ops.edge.cadence.quantized_layer_norm: ( + exir_ops.edge.cadence.quantized_layer_norm.default: ( exir_ops.edge.cadence.quantized_layer_norm.per_tensor, [1, 2], ), - exir_ops.edge.cadence.quantized_linear: ( + exir_ops.edge.cadence.quantized_linear.default: ( exir_ops.edge.cadence.quantized_linear.per_tensor, [4, 5, 6], ), - exir_ops.edge.cadence.quantized_relu: ( + exir_ops.edge.cadence.quantized_relu.default: ( exir_ops.edge.cadence.quantized_relu.per_tensor, [1, 3, 4], ), - exir_ops.edge.cadence.im2row: ( + exir_ops.edge.cadence.im2row.default: ( exir_ops.edge.cadence.im2row.per_tensor, [5], ), - exir_ops.edge.cadence.requantize: ( + exir_ops.edge.cadence.requantize.default: ( exir_ops.edge.cadence.requantize.per_tensor, [1, 2, 3, 4], ), } def call_operator(self, op, args, kwargs, meta): - op_edge_overload_packet = get_edge_overload_packet(op) - - if op_edge_overload_packet not in self.replaced_scalar_args: + if op not in self.replaced_scalar_args: return super().call_operator(op, args, kwargs, meta) # Get all the args that need to be replaced. 
- new_op, args_to_be_replaced = self.replaced_scalar_args[op_edge_overload_packet] + new_op, args_to_be_replaced = self.replaced_scalar_args[op] + + if op == new_op: + return super().call_operator(op, args, kwargs, meta) updated_args = list(args) for op_arg_index in args_to_be_replaced: arg = args[op_arg_index] - if not isinstance(arg, ProxyValue): - return super().call_operator(op, args, kwargs, meta) - - if not arg.is_tensor(): + if not isinstance(arg, ProxyValue) or not arg.is_tensor(): return super().call_operator(op, args, kwargs, meta) if not isinstance(arg.node.target, EdgeOpOverload): @@ -1712,7 +1631,7 @@ def call_operator(self, op, args, kwargs, meta): # Determine if the op is avg_pool1d or avg_pool2d avg_pool1d: bool = op == exir_ops.edge.aten.avg_pool1d.default # Get the input tensor - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() # Replace avg_pool2d with custom avg_pool2d, and if the input tensor is # quantized, pass its zero_point tensor as arg to the custom avg_pool2d. @@ -1725,7 +1644,7 @@ def call_operator(self, op, args, kwargs, meta): ceil_mode = args[4] if len(args) >= 5 else False count_include_pad = args[5] if len(args) >= 6 else True divisor_override = args[6] if len(args) >= 7 else None - zero_point = torch.tensor(0, dtype=torch.int32) + zero_point = args[7] if len(args) >= 8 else None # If the op is avg_pool1d, then we need to reshape the 3d input to a 4d # tensor. @@ -2078,7 +1997,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Get the second tensor - Y_tensor = Y_arg.to_tensor() if isinstance(Y_arg, ProxyValue) else Y_arg + Y_tensor = Y_arg.to_tensor() # Concretize the bias zero_bias = super().call_operator( exir_ops.edge.aten.full.default, @@ -2087,19 +2006,14 @@ def call_operator(self, op, args, kwargs, meta): meta, ) - # If the arg was a ProxyValue, insert a transpose node. 
Otherwise we - # can simply transpose the tensor inplace. - if isinstance(Y_arg, ProxyValue): - transpose_args = (Y_arg, -1, -2) - transpose_node = super().call_operator( - exir_ops.edge.aten.transpose_copy.int, - transpose_args, - {}, - meta, - ) - Y_arg_t = transpose_node - else: - Y_arg_t = Y_tensor.transpose(-1, -2) + # Y_arg is always a ProxyValue, so we insert a transpose node + transpose_args = (Y_arg, -1, -2) + Y_arg_t = super().call_operator( + exir_ops.edge.aten.transpose_copy.int, + transpose_args, + {}, + meta, + ) # Construct the new args, and return the transposed matmult op new_args = ( @@ -2194,7 +2108,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Get the input tensor - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() # Permute NCHW to NHWC for computation in_tensor_permuted = in_tensor.permute(0, 2, 3, 1) in_tensor_shape = in_tensor_permuted.shape @@ -2242,6 +2156,52 @@ def call_operator(self, op, args, kwargs, meta): ) +@register_cadence_pass(CadencePassAttribute(opt_level=0)) +class ReplaceTorchQuantizedEmbeddingWithCadenceQuantizedEmbedding(ExportPass): + """ + Replace torch.ops.quantized_decomposed.embedding_byte.dtype with + torch.ops.cadence.quantized_embedding_byte + """ + + def call_operator( + self, + op: torch._ops.OpOverload, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + # Check if the op is the quantized_decomposed.embedding_byte.dtype + if ( + op == exir_ops.edge.quantized_decomposed.embedding_byte.default + or op == exir_ops.edge.quantized_decomposed.embedding_byte.dtype + ): + # Replace with cadence.quantized_embedding_byte + if len(args) < 6: + raise AssertionError( + f"Expected 6 arguments for embedding_byte, got {len(args)}" + ) + embedding = args[0] + scales = args[1] + weight_zero_points = args[2] + indices = args[5] + if op == 
exir_ops.edge.quantized_decomposed.embedding_byte.dtype: + dtype = kwargs.get("dtype", None) + if dtype is not None and dtype != torch.float32: + raise AssertionError( + f"Unsupported output dtype for embedding_byte: {dtype}" + ) + + new_args = (embedding, scales, weight_zero_points, indices, False) + new_kwargs = {} + return super().call_operator( + exir_ops.edge.cadence.quantized_embedding_byte.default, + new_args, + new_kwargs, + meta, + ) + return super().call_operator(op, args, kwargs, meta) + + class CommonReplacePasses: passes = [ ReplaceSqueezeAndUnsqueezeWithViewPass, @@ -2251,6 +2211,10 @@ class CommonReplacePasses: ReplaceRepeatWithCatPass, ReplaceFullLikeWithFullPass, ReplaceAtenConvolutionWithCadenceConvolutionPass, + ReplacePT2QuantWithCadenceQuantPass, + ReplacePT2DequantWithCadenceDequantPass, + ReplacePowWithMulPass, + ReplaceTorchQuantizedEmbeddingWithCadenceQuantizedEmbedding, ] @@ -2296,13 +2260,10 @@ class CadenceReplaceOpsInGraph: ReplaceScalarTensorWithFullPass, ReplaceInfArgInFullWithValuePass, ReplaceLogicalNotBooleanWhereWithWherePass, - ReplacePT2QuantWithCadenceQuantPass, - ReplacePT2DequantWithCadenceDequantPass, ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass, ReplaceAdaptiveAvgPoolWithAtenAvgPoolPass, ReplaceAtenAvgPoolWithCadenceAvgPoolPass, ReplaceWhereWithFullArgsWithWhereScalar, ReplaceAtenApproxGeluWithApproxGeluPass, - ReplacePowWithMulPass, ReplaceMulTensorWithMulAndFullOpsPass, ] diff --git a/backends/cadence/aot/simplify_ops.py b/backends/cadence/aot/simplify_ops.py index bf836f09044..92c14cb0f5d 100644 --- a/backends/cadence/aot/simplify_ops.py +++ b/backends/cadence/aot/simplify_ops.py @@ -19,7 +19,7 @@ from executorch.backends.cadence.aot.utils import rebind from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload -from executorch.exir.pass_base import ExportPass, ProxyValue +from executorch.exir.pass_base import ExportPass 
@register_cadence_pass(CadencePassAttribute(opt_level=0)) @@ -75,7 +75,7 @@ def call_operator(self, op, args, kwargs, meta): slice_scatter = op == exir_ops.edge.aten.slice_scatter.default # Parse the arguments # Extract the tensor to be sliced, and the slicing dimension - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() dim = args[1 + slice_scatter] if len(args) > 1 + slice_scatter else 0 # Make dim non-negative dim = dim if dim >= 0 else dim + in_tensor.dim() diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py index 30b30e085dc..259752f3893 100644 --- a/backends/cadence/aot/tests/test_ref_implementations.py +++ b/backends/cadence/aot/tests/test_ref_implementations.py @@ -36,12 +36,11 @@ def test_quantize_per_tensor( ) -> None: input_tensor = torch.tensor([input_value]) scale = (f_max - f_min) / (q_max - q_min) - inv_scale = 1.0 / scale - zero_point = round(-f_min * inv_scale) + q_min + zero_point = round(-f_min * 1 / scale) + q_min expected_output = torch.tensor([expected_value], dtype=target_dtype) output = torch.ops.cadence.quantize_per_tensor( - input_tensor, inv_scale, zero_point, q_min, q_max, target_dtype + input_tensor, scale, zero_point, q_min, q_max, target_dtype ) self.assertEqual( @@ -85,7 +84,7 @@ def test_dequantize_per_tensor( expected_output = torch.tensor([expected_value], dtype=torch.float32) output = torch.ops.cadence.dequantize_per_tensor( - input_tensor, scale, zero_point, q_min, q_max, torch.float32 + input_tensor, scale, zero_point, q_min, q_max, input_tensor.dtype ) self.assertEqual( @@ -173,9 +172,9 @@ def test_quantized_add( torch.tensor( [1073741824], dtype=torch.int32 ), # out_multiplier (0.5 * 2^31) - torch.tensor([0], dtype=torch.int64), # out_shift + torch.tensor([0], dtype=torch.int32), # out_shift 0, # out_zero_point - torch.tensor([[-2]], dtype=dtype), # expected_output + torch.tensor([[0]], 
dtype=dtype), # expected_output per_tensor, False, False, @@ -198,9 +197,9 @@ def test_quantized_add( torch.tensor( [1073741824], dtype=torch.int32 ), # out_multiplier (0.5 * 2^31) - torch.tensor([0], dtype=torch.int64), # out_shift + torch.tensor([0], dtype=torch.int32), # out_shift 0, # out_zero_point - torch.tensor([[-10, -30]], dtype=dtype), # expected_output + torch.tensor([[-2, -8]], dtype=dtype), # expected_output per_tensor, False, False, @@ -208,6 +207,28 @@ def test_quantized_add( for (per_tensor, dtype) in ( (False, torch.int8), (True, torch.int8), + ) + ], + *[ + ( + torch.Size([1, 3]), # src_shape: 1 sample, 3 input features + torch.Size( + [2, 3] + ), # weight_shape: 2 output features, 3 input features + 0, # in_zero_point + torch.tensor([0, 0, 0], dtype=dtype), # weight_zero_point + torch.tensor( + [1073741824], dtype=torch.int32 + ), # out_multiplier (0.5 * 2^31) + torch.tensor([0], dtype=torch.int32), # out_shift + 0, # out_zero_point + torch.tensor([[0, 0]], dtype=dtype), # expected_output + per_tensor, + False, + False, + ) + for (per_tensor, dtype) in ( + (False, torch.uint8), (True, torch.uint8), ) ], @@ -223,10 +244,10 @@ def test_quantized_add( torch.tensor( [1073741824], dtype=torch.int32 ), # out_multiplier (0.5 * 2^31) - torch.tensor([0], dtype=torch.int64), # out_shift + torch.tensor([0], dtype=torch.int32), # out_shift 0, # out_zero_point torch.tensor( - [[[-2, -8, -14], [-6, -28, -50]]], dtype=dtype + [[[0, -2, -4], [-2, -7, -12]]], dtype=dtype ), # expected_output per_tensor, False, @@ -235,7 +256,6 @@ def test_quantized_add( for (per_tensor, dtype) in ( (False, torch.int8), (True, torch.int8), - (True, torch.uint8), ) ], # Test case 4: Non-zero zero points @@ -250,9 +270,9 @@ def test_quantized_add( torch.tensor( [268435456], dtype=torch.int32 ), # out_multiplier (1.0 * 2^31) - torch.tensor([0], dtype=torch.int64), # out_shift + torch.tensor([0], dtype=torch.int32), # out_shift 1, # out_zero_point - torch.tensor([[-15, 25]], 
dtype=dtype), # expected_output + torch.tensor([[1, 1]], dtype=dtype), # expected_output per_tensor, False, False, @@ -260,7 +280,7 @@ def test_quantized_add( for (per_tensor, dtype) in ( (False, torch.int8), (True, torch.int8), - (True, torch.uint8), + # (True, torch.uint8), ) ], # Test case 5: Non-uniform weight zero points @@ -275,14 +295,14 @@ def test_quantized_add( torch.tensor( [268435456], dtype=torch.int32 ), # out_multiplier (1.0 * 2^31) - torch.tensor([0], dtype=torch.int64), # out_shift + torch.tensor([0], dtype=torch.int32), # out_shift 1, # out_zero_point - torch.tensor([[-23, 17]], dtype=dtype), # expected_output + torch.tensor([[1, 1]], dtype=dtype), # expected_output False, False, False, ) - for dtype in (torch.int8, torch.uint8) + for dtype in (torch.int8,) ], # Test case 6: Non-zero out_shift (shift=1) *[ @@ -297,10 +317,10 @@ def test_quantized_add( [268435456], dtype=torch.int32 ), # out_multiplier (0.125 * 2^31) torch.tensor( - [1], dtype=torch.int64 + [1], dtype=torch.int32 ), # out_shift (shift=1, doubles the scale) 1, # out_zero_point - torch.tensor([[-7, 13]], dtype=dtype), # expected_output + torch.tensor([[1, 2]], dtype=dtype), # expected_output per_tensor, False, False, @@ -319,16 +339,39 @@ def test_quantized_add( [268435456], dtype=torch.int32 ), # out_multiplier (0.125 * 2^31) torch.tensor( - [1], dtype=torch.int64 + [1], dtype=torch.int32 + ), # out_shift (shift=1, doubles the scale) + 1, # out_zero_point + torch.tensor([[1, 2]], dtype=dtype), # expected_output + per_tensor, + matmul, + transposed_matmul, + ) + for (matmul, transposed_matmul) in ((True, False), (True, True)) + for (per_tensor, dtype) in ((True, torch.int8),) + ], + *[ + ( + torch.Size([2, 1, 2]), # src_shape: 1 sample, 2 input features + torch.Size( + [2, 2, 2] + ), # weight_shape: 2 output features, 2 input features + 2, # in_zero_point + torch.tensor([1, 1], dtype=dtype), # weight_zero_point + torch.tensor( + [268435456], dtype=torch.int32 + ), # out_multiplier 
(0.125 * 2^31) + torch.tensor( + [1], dtype=torch.int32 ), # out_shift (shift=1, doubles the scale) 1, # out_zero_point - torch.tensor([[-7, 17]], dtype=dtype), # expected_output + torch.tensor([[[1, 2]], [[0, -1]]], dtype=dtype), # expected_output per_tensor, matmul, transposed_matmul, ) for (matmul, transposed_matmul) in ((True, False), (True, True)) - for (per_tensor, dtype) in ((True, torch.int8), (True, torch.uint8)) + for (per_tensor, dtype) in ((True, torch.int8),) ], ] ) @@ -360,7 +403,7 @@ def test_quantized_linear( .to(expected_output.dtype) ) if matmul and not transposed_matmul: - weight = weight.T + weight = weight.transpose(-1, -2) if per_tensor: weight_zero_point = weight_zero_point[0] @@ -906,9 +949,9 @@ def test_quantized_conv_per_tensor( convs = [ ( - torch.ops.cadence.quantized_conv_nchw.per_tensor + torch.ops.cadence.quantized_conv2d_nchw.per_tensor if memory_format == torch.contiguous_format - else torch.ops.cadence.quantized_conv_nhwc.per_tensor + else torch.ops.cadence.quantized_conv2d_nhwc.per_tensor ) ] @@ -916,30 +959,30 @@ def test_quantized_conv_per_tensor( if input_tensor.dtype == torch.int8 and weight.dtype == torch.int8: if memory_format == torch.contiguous_format: optimized_convs = [ - torch.ops.cadence.quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor, - torch.ops.cadence.quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor, - torch.ops.cadence.quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor, + torch.ops.cadence.quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor, + torch.ops.cadence.quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor, + torch.ops.cadence.quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor, ] else: optimized_convs = [ - torch.ops.cadence.quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor, - torch.ops.cadence.quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor, - torch.ops.cadence.quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor, + 
torch.ops.cadence.quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor, + torch.ops.cadence.quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor, + torch.ops.cadence.quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor, ] elif input_tensor.dtype == torch.uint8 and weight.dtype == torch.uint8: if memory_format == torch.contiguous_format: optimized_convs = [ - torch.ops.cadence.quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor, - torch.ops.cadence.quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor, - torch.ops.cadence.quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor, ] else: optimized_convs = [ - torch.ops.cadence.quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor, - torch.ops.cadence.quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor, - torch.ops.cadence.quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor, ] convs.extend(optimized_convs) @@ -1045,21 +1088,20 @@ def test_quantized_conv_per_tensor( [4, 2, 0, -2], dtype=dtype ), # expected: relu(1,3,5,7) = (1,3,5,7) * (-1.0) + 5 = (4,2,0,-2) ) - for dtype in [torch.int8, torch.uint8] + for dtype in [torch.int8] ], - # Test case 4: Non-per-tensor *[ ( - "non_per_tensor", - torch.tensor([-1, -2, -3, 1, 2, 3], dtype=dtype), # input - torch.tensor([0, 0, 0, 1, 1, 1]), # X_zero_point + "positive_with_shift_unsigned", + torch.tensor([2, 4, 6, 8], dtype=dtype), # input + 1, # X_zero_point 5, # out_zero_point - torch.tensor([1073741824]), # out_multiplier (0.5 * 2^31) - torch.tensor([1]), # out_shift 
(multiply by 2^1 = 2) + 1073741824, # out_multiplier (0.5 * 2^31) + 1, # out_shift (multiply by 2^1 = 2) dtype, # dtype - torch.tensor([5, 5, 5, 5, 4, 3], dtype=dtype), + torch.tensor([4, 2, 0, 0], dtype=dtype), ) - for dtype in [torch.int8] + for dtype in [torch.uint8] ], ] ) @@ -1067,41 +1109,33 @@ def test_quantized_relu( self, name: str, X: torch.Tensor, - X_zero_point: torch.Tensor | int, + X_zero_point: int, out_zero_point: int, - out_multiplier: torch.Tensor | int, - out_shift: torch.Tensor | int, + out_multiplier: int, + out_shift: int, dtype: torch.dtype, expected_output: torch.Tensor, ) -> None: - if isinstance(X_zero_point, int): - assert isinstance(out_multiplier, int) - assert isinstance(out_shift, int) - - match dtype: - case torch.int8: - quantized_relu = ( - torch.ops.cadence.quantized_relu_asym8s_asym8s.per_tensor - ) - case torch.uint8: - quantized_relu = ( - torch.ops.cadence.quantized_relu_asym8u_asym8u.per_tensor - ) - case _: - quantized_relu = torch.ops.cadence.quantized_relu_per_tensor + match dtype: + case torch.int8: + quantized_relu = ( + torch.ops.cadence.quantized_relu_asym8s_asym8s.per_tensor + ) + case torch.uint8: + quantized_relu = ( + torch.ops.cadence.quantized_relu_asym8u_asym8u.per_tensor + ) + case _: + quantized_relu = torch.ops.cadence.quantized_relu_per_tensor - output = quantized_relu( - X, - X_zero_point, - out_zero_point, - out_multiplier, - out_shift, - ) - else: - output = torch.ops.cadence.quantized_relu( - X, X_zero_point, out_zero_point, out_multiplier, out_shift - ) + output = quantized_relu( + X, + X_zero_point, + out_zero_point, + out_multiplier, + out_shift, + ) # Verify output properties self.assertEqual(output.dtype, dtype, f"Output dtype should be {dtype}") @@ -1112,3 +1146,1277 @@ def test_quantized_relu( torch.equal(output, expected_output), f"Output values don't match expected in {name}. 
Got {output}, expected {expected_output}", ) + + def test_where_Scalar(self) -> None: + input_tensor = torch.tensor([1, 2, 3, 4], dtype=torch.int8) + out = torch.ops.cadence.where_Scalar(input_tensor > 2, 1.0, 0.0) + self.assertTrue( + torch.equal(out, torch.tensor([0.0, 0.0, 1.0, 1.0], dtype=torch.float32)) + ) + with self.assertRaises(ValueError) as context: + torch.ops.cadence.where_Scalar(input_tensor, 1.0, 0.0) + + self.assertIn("condition must be a bool tensor", str(context.exception)) + + @expand( + [ + ( + "h1xhd4", + torch.tensor([[[[1.0, 2.0, 3.0, 4.0]]]], dtype=torch.float32), + torch.tensor([[0.0, 0.0]], dtype=torch.float32), + torch.tensor([[1.0, 1.0]], dtype=torch.float32), + torch.tensor([[[[1.0, 3.0, 2.0, 4.0]]]], dtype=torch.float32), + ), + ( + "h2xhd4", + torch.tensor( + [[[[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]]]], + dtype=torch.float32, + ), + torch.tensor([[0.0, 1.0]], dtype=torch.float32), + torch.tensor([[1.0, 0.0]], dtype=torch.float32), + torch.tensor( + [[[[1.0, -4.0, 2.0, 3.0], [5, -8.0, 6.0, 7.0]]]], + dtype=torch.float32, + ), + ), + ( + "s2xh2xhd4", + torch.tensor( + [ + [ + [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]], + [[9.0, 10.0, 11.0, 12.0], [13.0, 14.0, 15.0, 16.0]], + ] + ], + dtype=torch.float32, + ), + torch.tensor([[0.0, 1.0], [0.0, 1.0]], dtype=torch.float32), + torch.tensor([[1.0, 0.0], [1.0, 0.0]], dtype=torch.float32), + torch.tensor( + [ + [ + [[1.0, -4.0, 2.0, 3.0], [5.0, -8.0, 6.0, 7.0]], + [[9.0, -12.0, 10.0, 11.0], [13.0, -16.0, 14.0, 15.0]], + ] + ], + dtype=torch.float32, + ), + ), + ( + "pos_not_none", + torch.tensor( + [ + [ + [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]], + [[9.0, 10.0, 11.0, 12.0], [13.0, 14.0, 15.0, 16.0]], + ] + ], + dtype=torch.float32, + ), + torch.tensor([[1.0, 0.0], [0.0, 1.0]], dtype=torch.float32), + torch.tensor([[0.0, 1.0], [1.0, 0.0]], dtype=torch.float32), + torch.tensor( + [ + [ + [[1.0, -4.0, 2.0, 3.0], [5.0, -8.0, 6.0, 7.0]], + [[-10.0, 11.0, 9.0, 12.0], [-14.0, 15.0, 13.0, 
16.0]], + ] + ], + dtype=torch.float32, + ), + torch.tensor([1, 0]), + ), + ] + ) + def test_rope( + self, + name: str, + input_tensor: torch.Tensor, + sin_tensor: torch.Tensor, + cos_tensor: torch.Tensor, + expected_output: torch.Tensor, + pos: torch.Tensor | None = None, + ) -> None: + output = torch.ops.cadence.rope(input_tensor, sin_tensor, cos_tensor, pos) + + # Verify output properties + self.assertEqual( + output.dtype, + input_tensor.dtype, + f"Output dtype should match input dtype in {name}", + ) + self.assertEqual( + output.shape, + input_tensor.shape, + f"Output shape should match input shape in {name}", + ) + + # Verify output matches expected values + self.assertTrue( + torch.allclose(output, expected_output, rtol=1e-4, atol=1e-4), + f"Output values don't match expected in {name}. Got {output}, expected {expected_output}", + ) + + @expand( + [ + # Test case 1: Basic 2D convolution (NCHW format) + ( + "basic_2d_nchw", + torch.tensor( + [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + torch.tensor( + [[[[1.0, 0.0], [0.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 (identity-like filter) + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + False, # channel_last + torch.tensor( + [[[[5.0]]]], dtype=torch.float32 + ), # expected: 1*1 + 4*1 = 5 + ), + # Test case 2: Basic 2D convolution (NHWC format) + ( + "basic_2d_nhwc", + torch.tensor( + [[[[1.0], [2.0]], [[3.0], [4.0]]]], dtype=torch.float32 + ), # input: 1x2x2x1 (NHWC) + torch.tensor( + [[[[1.0], [0.0]], [[0.0], [1.0]]]], dtype=torch.float32 + ), # weight: 1x2x2x1 (NHWC format) + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + True, # channel_last + torch.tensor( + [[[[5.0]]]], dtype=torch.float32 + ), # expected: 1*1 + 4*1 = 5 + ), + # Test case 3: 2D convolution with stride=2 + ( + "conv2d_stride2", + torch.tensor( + [ + 
[ + [ + [1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0], + [13.0, 14.0, 15.0, 16.0], + ] + ] + ], + dtype=torch.float32, + ), # input: 1x1x4x4 + torch.tensor( + [[[[1.0, 1.0], [1.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 (sum filter) + torch.tensor([0.0], dtype=torch.float32), # bias + (2, 2), # stride=2 + (0, 0), # padding + (1, 1), # dilation + 1, # groups + False, # channel_last + torch.tensor([[[[14.0, 22.0], [46.0, 54.0]]]], dtype=torch.float32), + ), + # Test case 4: 2D convolution with padding=1 + ( + "conv2d_padding1", + torch.tensor( + [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + torch.tensor( + [[[[1.0, 0.0], [0.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (1, 1), # padding=1 + (1, 1), # dilation + 1, # groups + False, # channel_last + torch.tensor( + [[[[1.0, 2.0, 0.0], [3.0, 5.0, 2.0], [0.0, 3.0, 4.0]]]], + dtype=torch.float32, + ), # expected with padding + ), + # Test case 5: 2D convolution with dilation=2 + ( + "conv2d_dilation2", + torch.tensor( + [ + [ + [ + [1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0], + [13.0, 14.0, 15.0, 16.0], + ] + ] + ], + dtype=torch.float32, + ), # input: 1x1x4x4 + torch.tensor( + [[[[1.0, 1.0], [1.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (2, 2), # dilation=2 + 1, # groups + False, # channel_last + torch.tensor([[[[24.0, 28.0], [40.0, 44.0]]]], dtype=torch.float32), + ), + # Test case 6: 2D grouped convolution (groups=2) + ( + "conv2d_groups2", + torch.tensor( + [ + [ + [[1.0, 2.0], [3.0, 4.0]], # first input channel + [[5.0, 6.0], [7.0, 8.0]], # second input channel + ] + ], + dtype=torch.float32, + ), # input: 1x2x2x2 + torch.tensor( + [ + [[[1.0, 1.0], [1.0, 1.0]]], # first group weight + [[[0.5, 0.5], [0.5, 0.5]]], # second group weight + ], + 
dtype=torch.float32, + ), # weight: 2x1x2x2 + torch.tensor([0.0, 1.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 2, # groups=2 + False, # channel_last + torch.tensor([[[[10.0]], [[14.0]]]], dtype=torch.float32), + ), + # Test case 7: 1D convolution (NCL format) + ( + "conv1d_ncl", + torch.tensor( + [[[1.0, 2.0, 3.0, 4.0]]], dtype=torch.float32 + ), # input: 1x1x4 + torch.tensor([[[1.0, 1.0]]], dtype=torch.float32), # weight: 1x1x2 + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride (only stride[1] is used for 1D) + (0, 0), # padding (only padding[1] is used for 1D) + (1, 1), # dilation (only dilation[1] is used for 1D) + 1, # groups + False, # channel_last + torch.tensor( + [[[3.0, 5.0, 7.0]]], dtype=torch.float32 + ), # expected: [1+2, 2+3, 3+4] + ), + # Test case 8: 1D convolution (NLC format) + ( + "conv1d_nlc", + torch.tensor( + [[[1.0], [2.0], [3.0], [4.0]]], dtype=torch.float32 + ), # input: 1x4x1 (NLC) + torch.tensor( + [[[1.0], [1.0]]], dtype=torch.float32 + ), # weight: 1x2x1 (NLC) + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + True, # channel_last + torch.tensor([[[3.0], [5.0], [7.0]]], dtype=torch.float32), + ), + # Test case 9: Multi-channel input and output + ( + "multi_channel", + torch.tensor( + [ + [ + [[1.0, 2.0], [3.0, 4.0]], # first input channel + [[0.5, 1.0], [1.5, 2.0]], # second input channel + ] + ], + dtype=torch.float32, + ), # input: 1x2x2x2 + torch.tensor( + [ + [ # first output channel + [[1.0, 0.0], [0.0, 1.0]], # weights for first input channel + [ + [2.0, 0.0], + [0.0, 2.0], + ], # weights for second input channel + ], + [ # second output channel + [[0.5, 0.5], [0.5, 0.5]], # weights for first input channel + [ + [1.0, 1.0], + [1.0, 1.0], + ], # weights for second input channel + ], + ], + dtype=torch.float32, + ), # weight: 2x2x2x2 + torch.tensor([0.0, 1.0], dtype=torch.float32), # bias + 
(1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + False, # channel_last + torch.tensor([[[[10.0]], [[11.0]]]], dtype=torch.float32), + ), + # Test case 10: Convolution with non-zero bias + ( + "conv2d_with_bias", + torch.tensor( + [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + torch.tensor( + [[[[1.0, 0.0], [0.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 + torch.tensor([10.0], dtype=torch.float32), # bias=10 + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + False, # channel_last + torch.tensor( + [[[[15.0]]]], dtype=torch.float32 + ), # expected: 5 + 10 = 15 + ), + ] + ) + def test_convolution( + self, + name: str, + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int, int], + padding: tuple[int, int], + dilation: tuple[int, int], + groups: int, + channel_last: bool, + expected_output: torch.Tensor, + ) -> None: + output = torch.ops.cadence.convolution( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + channel_last, + ) + + # Verify output properties + self.assertEqual( + output.dtype, + input_tensor.dtype, + f"Output dtype should match input dtype in {name}", + ) + self.assertEqual( + output.shape, + expected_output.shape, + f"Output shape should match expected shape in {name}", + ) + + # Verify output matches expected values + self.assertTrue( + torch.equal(output, expected_output), + f"Output values don't match expected in {name}. 
Got {output}, expected {expected_output}", + ) + + @expand( + [ + # Basic 2D transposed convolution with stride=1 (current test case - corrected name) + ( + "basic_2d_stride1", + torch.tensor( + [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + torch.tensor( + [[[[1.0, 1.0], [1.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + (0, 0), # output_padding + False, # channel_last + torch.tensor( + [[[[1.0, 3.0, 2.0], [4.0, 10.0, 6.0], [3.0, 7.0, 4.0]]]], + dtype=torch.float32, + ), + ), + # 2D transposed convolution with channel_last=True (NHWC format) + ( + "channel_last_nhwc", + torch.tensor( + [[[[1.0], [2.0]], [[3.0], [4.0]]]], dtype=torch.float32 + ), # input: 1x2x2x1 (NHWC) + torch.tensor( + [[[[1.0], [1.0]], [[1.0], [1.0]]]], dtype=torch.float32 + ), # weight: 1x2x2x1 (NHWC) + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + (0, 0), # output_padding + True, # channel_last=True + torch.tensor( + [ + [ + [[1.0], [3.0], [2.0]], + [[4.0], [10.0], [6.0]], + [[3.0], [7.0], [4.0]], + ] + ], + dtype=torch.float32, + ), + ), + # 2D transposed convolution with non-zero bias + ( + "with_bias", + torch.tensor( + [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + torch.tensor( + [[[[1.0, 0.0], [0.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 + torch.tensor([5.0], dtype=torch.float32), # bias=5.0 + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + (0, 0), # output_padding + False, # channel_last + torch.tensor( + [[[[6.0, 7.0, 5.0], [8.0, 10.0, 7.0], [5.0, 8.0, 9.0]]]], + dtype=torch.float32, + ), + ), + # 1D transposed convolution (3D tensor, NLC format) + ( + "conv1d_nlc", + torch.tensor( + [[[1.0], [2.0], [3.0]]], dtype=torch.float32 + ), # input: 1x3x1 (NLC) + torch.tensor( + [[[1.0], 
[0.5]]], dtype=torch.float32 + ), # weight: 1x2x1 (NLC) + torch.tensor([0.0], dtype=torch.float32), # bias + (2, 0), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + (0, 0), # output_padding + True, # channel_last=True + torch.tensor( + [[[1.0], [0.5], [2.0], [1.0], [3.0], [1.5]]], dtype=torch.float32 + ), + ), + ] + ) + def test_transposed_convolution( + self, + name: str, + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int, int], + padding: tuple[int, int], + dilation: tuple[int, int], + groups: int, + output_padding: tuple[int, int], + channel_last: bool, + expected_output: torch.Tensor, + ) -> None: + output = torch.ops.cadence.transposed_convolution( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + output_padding, + groups, + channel_last, + ) + + # Verify output properties + self.assertEqual( + output.dtype, + input_tensor.dtype, + f"Output dtype should match input dtype in {name}", + ) + self.assertEqual( + output.shape, + expected_output.shape, + f"Output shape should match expected shape in {name}", + ) + + # Verify output matches expected values + self.assertTrue( + torch.equal(output, expected_output), + f"Output values don't match expected in {name}. 
Got {output}, expected {expected_output}", + ) + + @expand( + [ + # Basic non-quantized average pooling + ( + "basic_non_quantized", + torch.tensor( + [ + [ + [ + [1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0], + [13.0, 14.0, 15.0, 16.0], + ] + ] + ], + dtype=torch.float32, + ), # input: 1x1x4x4 + (2, 2), # kernel_size + (2, 2), # stride + (0, 0), # padding + False, # ceil_mode + False, # count_include_pad + None, # divisor_override + None, # in_zero_point (non-quantized) + False, # channel_last + torch.tensor( + [[[[3.5, 5.5], [11.5, 13.5]]]], dtype=torch.float32 + ), # expected: average of 2x2 blocks + ), + # Non-quantized with count_include_pad=True and padding + ( + "non_quantized_count_include_pad", + torch.tensor( + [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + (3, 3), # kernel_size (larger than input) + (1, 1), # stride + (1, 1), # padding + False, # ceil_mode + True, # count_include_pad=True + None, # divisor_override + None, # in_zero_point (non-quantized) + False, # channel_last + torch.tensor( + [[[[2.5, 2.5], [2.5, 2.5]]]], + dtype=torch.float32, + ), + ), + # Non-quantized with divisor_override + ( + "non_quantized_divisor_override", + torch.tensor( + [[[[2.0, 4.0], [6.0, 8.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + (2, 2), # kernel_size + (1, 1), # stride + (0, 0), # padding + False, # ceil_mode + False, # count_include_pad + 2, # divisor_override (instead of 4) + None, # in_zero_point (non-quantized) + False, # channel_last + torch.tensor( + [[[[10.0]]]], dtype=torch.float32 + ), # expected: (2+4+6+8)/2 = 10 + ), + # Quantized with non-zero zero_point and padding + ( + "quantized_nonzero_zero_point", + torch.tensor( + [[[[130, 132], [134, 136]]]], dtype=torch.uint8 + ), # input: 1x1x2x2, values around zero_point=128 + (3, 3), # kernel_size + (1, 1), # stride + (1, 1), # padding + False, # ceil_mode + True, # count_include_pad=True + None, # divisor_override + 128, # in_zero_point=128 
(padded areas will have this value) + False, # channel_last + torch.tensor( + [[[[130, 130], [130, 130]]]], dtype=torch.uint8 + ), # expected: averages including padded zero_point values + ), + # Quantized with divisor_override + ( + "quantized_divisor_override", + torch.tensor( + [[[[64, 96], [128, 160]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + (2, 2), # kernel_size + (1, 1), # stride + (0, 0), # padding + False, # ceil_mode + False, # count_include_pad + 2, # divisor_override (instead of 4) + None, # in_zero_point=None + False, # channel_last + torch.tensor( + [[[[224]]]], dtype=torch.float32 + ), # expected: (64+96+128+160)/2 = 224 + ), + # Large values that need clamping + ( + "quantized_clamping_test", + torch.tensor( + [[[[120, 125], [125, 127]]]], dtype=torch.int8 + ), # input: 1x1x2x2, large values for int8 + (2, 2), # kernel_size + (1, 1), # stride + (0, 0), # padding + False, # ceil_mode + False, # count_include_pad + None, # divisor_override + 0, # in_zero_point=0 + False, # channel_last + torch.tensor( + [[[[124]]]], dtype=torch.int8 + ), # expected: (120+125+125+127)/4 = 124.25 -> 124, within int8 range + ), + ] + ) + def test_avg_pool2d( + self, + name: str, + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + stride: tuple[int, int], + padding: tuple[int, int], + ceil_mode: bool, + count_include_pad: bool, + divisor_override: int | None, + in_zero_point: int | None, + channel_last: bool, + expected_output: torch.Tensor, + ) -> None: + output = torch.ops.cadence.avg_pool2d( + input_tensor, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, + in_zero_point if in_zero_point is None else torch.tensor([in_zero_point]), + channel_last, + ) + + # Verify output properties + self.assertEqual( + output.dtype, + input_tensor.dtype, + f"Output dtype should match input dtype in {name}", + ) + self.assertEqual( + output.shape, + expected_output.shape, + f"Output shape should match expected shape in {name}", + 
) + + # Verify output matches expected values + if input_tensor.dtype.is_floating_point: + self.assertTrue( + torch.allclose(output, expected_output, rtol=1e-4, atol=1e-4), + f"Output values don't match expected in {name}. Got {output}, expected {expected_output}", + ) + else: + self.assertTrue( + torch.equal(output, expected_output), + f"Output values don't match expected in {name}. Got {output}, expected {expected_output}", + ) + + @expand( + [ + # Basic 2x2 kernel, stride 1, no padding, NCHW + ( + "nchw_basic_2x2", + torch.tensor( + [[[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]], dtype=torch.float32 + ), # (N=1, C=1, H=3, W=3) + (2, 2), # kernel_size + (1, 1), # dilation + (0, 0), # padding + (1, 1), # stride + None, # in_zero_point + False, # channel_last + False, + torch.tensor( + [ + [[1, 2, 4, 5], [2, 3, 5, 6], [4, 5, 7, 8], [5, 6, 8, 9]], + ], + dtype=torch.float32, + ), + ), + # 2x2 kernel, stride 2, no padding, NCHW + ( + "nchw_stride2", + torch.tensor( + [[[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]], dtype=torch.float32 + ), + (2, 2), + (1, 1), + (0, 0), + (2, 2), + None, + False, + False, + torch.tensor( + [ + [[1, 2, 4, 5]], + ], + dtype=torch.float32, # Only every other patch in each dim + ), + ), + # 2x2 kernel, stride 1, padding 1, NCHW + ( + "nchw_padding1", + torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.float32), # (1,1,2,2) + (2, 2), + (1, 1), + (1, 1), + (1, 1), + None, + False, + False, + torch.tensor( + [ + [ + [0, 0, 0, 1], + [0, 0, 1, 2], + [0, 0, 2, 0], + [0, 1, 0, 3], + [1, 2, 3, 4], + [2, 0, 4, 0], + [0, 3, 0, 0], + [3, 4, 0, 0], + [4, 0, 0, 0], + ], + ], + dtype=torch.float32, + ), + ), + # 2x2 kernel, stride 1, no padding, NHWC + ( + "nhwc_basic_2x2", + torch.tensor( + [[[[1], [2], [3]], [[4], [5], [6]], [[7], [8], [9]]]], + dtype=torch.float32, + ), # (N=1, H=3, W=3, C=1) + (2, 2), + (1, 1), + (0, 0), + (1, 1), + None, + True, + False, + torch.tensor( + [ + [[1, 2, 4, 5], [2, 3, 5, 6], [4, 5, 7, 8], [5, 6, 8, 9]], + ], + dtype=torch.float32, + ), + 
), + # 2x2 kernel, stride 1, no padding, NCHW, in_zero_point=1 + ( + "nchw_in_zero_point_no_padding", + torch.tensor([[[[2, 3, 4], [5, 6, 7], [8, 9, 10]]]], dtype=torch.int8), + (2, 2), + (1, 1), + (0, 0), + (1, 1), + torch.tensor(1, dtype=torch.int32), + False, + False, + torch.tensor( + [ + [[2, 3, 5, 6], [3, 4, 6, 7], [5, 6, 8, 9], [6, 7, 9, 10]], + ], + dtype=torch.int8, + ), + ), + ( + "nchw_in_zero_point_with_padding=1_and_stride=2", + torch.tensor([[[[2, 3, 4], [5, 6, 7], [8, 9, 10]]]], dtype=torch.int8), + (2, 2), + (1, 1), + (1, 1), + (2, 2), + torch.tensor(-1, dtype=torch.int32), + False, + False, + torch.tensor( + [ + [ + [-1, -1, -1, 2], + [-1, -1, 3, 4], + [-1, 5, -1, 8], + [6, 7, 9, 10], + ], + ], + dtype=torch.int8, + ), + ), + # 2x2 kernel, stride 1, no padding, NHWC, in_zero_point=2 + ( + "nhwc_in_zero_point", + torch.tensor( + [[[[3], [4], [5]], [[6], [7], [8]], [[9], [10], [11]]]], + dtype=torch.int8, + ), + (2, 2), + (1, 1), + (0, 0), + (1, 1), + torch.tensor(2, dtype=torch.int32), + True, + False, + torch.tensor( + [ + [[3, 4, 6, 7], [4, 5, 7, 8], [6, 7, 9, 10], [7, 8, 10, 11]], + ], + dtype=torch.int8, + ), + ), + # Multi-channel input, 2x2 kernel, stride 1, no padding, NCHW + ( + "nchw_multi_channel", + torch.tensor( + [ + [ + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], # channel 0 + [[10, 11, 12], [13, 14, 15], [16, 17, 18]], # channel 1 + ] + ], + dtype=torch.float32, + ), # (1,2,3,3) + (2, 2), + (1, 1), + (0, 0), + (1, 1), + None, + False, + False, + torch.tensor( + [ + [ + [1, 2, 4, 5, 10, 11, 13, 14], + [2, 3, 5, 6, 11, 12, 14, 15], + [4, 5, 7, 8, 13, 14, 16, 17], + [5, 6, 8, 9, 14, 15, 17, 18], + ], + ], + dtype=torch.float32, + ), + ), + # Multi-channel input and multi-channel zero-point + ( + "nchw_multi_channel_and_zero_point_no_padding", + torch.tensor([[[1, 2, 3]], [[4, 5, 6]]], dtype=torch.int32), + (1, 2), + (1, 1), + (0, 0), + (1, 1), + torch.tensor([-1, -2], dtype=torch.int32), + False, + False, + torch.tensor([[[1, 2], [2, 3]], [[4, 
5], [5, 6]]], dtype=torch.int32), + ), + ( + "nchw_multi_channel_and_zero_point_with_padding=1_and_stride=(2, 1)", + torch.tensor([[[1, 2, 3]], [[4, 5, 6]]], dtype=torch.int32), + (1, 2), + (1, 1), + (2, 1), + (2, 2), + torch.tensor([-1, -2], dtype=torch.int32), + False, + False, + torch.tensor( + [ + [ + [-1, -1], + [-1, -1], + [-1, 1], + [2, 3], + [-1, -1], + [-1, -1], + ], + [ + [-2, -2], + [-2, -2], + [-2, 4], + [5, 6], + [-2, -2], + [-2, -2], + ], + ], + dtype=torch.int32, + ), + ), + ( + "per_tensor", + torch.tensor( + [[[[3], [4], [5]], [[6], [7], [8]], [[9], [10], [11]]]], + dtype=torch.int8, + ), + (2, 2), + (1, 1), + (0, 0), + (1, 1), + 2, + True, + True, + torch.tensor( + [ + [[3, 4, 6, 7], [4, 5, 7, 8], [6, 7, 9, 10], [7, 8, 10, 11]], + ], + dtype=torch.int8, + ), + ), + ] + ) + def test_im2row( + self, + name: str, + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + dilation: tuple[int, int], + padding: tuple[int, int], + stride: tuple[int, int], + in_zero_point: torch.Tensor | None, + channel_last: bool, + per_tensor: bool, + expected_output: torch.Tensor, + ) -> None: + if per_tensor: + output = torch.ops.cadence.im2row.per_tensor( + input_tensor, + kernel_size, + dilation, + padding, + stride, + in_zero_point, + channel_last, + ) + else: + output = torch.ops.cadence.im2row( + input_tensor, + kernel_size, + dilation, + padding, + stride, + in_zero_point, + channel_last, + ) + self.assertEqual( + output.shape, + expected_output.shape, + f"im2row output shape mismatch in {name}", + ) + self.assertTrue( + torch.equal(output, expected_output), + f"im2row output mismatch in {name}: got {output}, expected {expected_output}", + ) + + @expand( + [ + ( + "basic_2x2", + torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32), + (2, 2), + (1, 1), + (0, 0), + (1, 1), + (0, 0), + None, + False, + torch.tensor( + [ + [ + [1, 0, 0, 0], + [1, 2, 0, 0], + [0, 2, 0, 0], + [1, 0, 3, 0], + [1, 2, 3, 4], + [0, 2, 0, 4], + [0, 0, 3, 0], + [0, 0, 3, 4], + [0, 0, 
0, 4], + ] + ], + dtype=torch.int32, + ), + ), + ( + "basic_2x2_with_zero_point", + torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32), + (2, 2), + (1, 1), + (0, 0), + (1, 1), + (0, 0), + torch.tensor(100, dtype=torch.int32), + False, + torch.tensor( + [ + [ + [1, 100, 100, 100], + [1, 2, 100, 100], + [100, 2, 100, 100], + [1, 100, 3, 100], + [1, 2, 3, 4], + [100, 2, 100, 4], + [100, 100, 3, 100], + [100, 100, 3, 4], + [100, 100, 100, 4], + ] + ], + dtype=torch.int32, + ), + ), + ( + "basic_2x2_with_stride_2", + torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32), + (2, 2), # kernel size + (1, 1), # dilation + (0, 0), # padding + (2, 2), # stride + (0, 0), # output padding + None, + False, + torch.tensor( + [ + [ + [1, 0, 0, 0], + [1, 0, 0, 0], + [0, 2, 0, 0], + [0, 2, 0, 0], + [1, 0, 0, 0], + [1, 0, 0, 0], + [0, 2, 0, 0], + [0, 2, 0, 0], + [0, 0, 3, 0], + [0, 0, 3, 0], + [0, 0, 0, 4], + [0, 0, 0, 4], + [0, 0, 3, 0], + [0, 0, 3, 0], + [0, 0, 0, 4], + [0, 0, 0, 4], + ] + ], + dtype=torch.int32, + ), + ), + ( + "batch2_with_batch2_zero_point", + torch.tensor( + [ + [[[1, 2], [3, 4]]], + [[[5, 6], [7, 8]]], + ], + dtype=torch.int32, + ), # input: (2,1,2,2) + (2, 2), # kernel_size + (1, 1), # dilation + (0, 0), # padding + (1, 1), # stride + (0, 0), # output_padding + torch.tensor([100, 200], dtype=torch.int32), # in_zero_point per batch + False, # channel_last + torch.tensor( + [ + [ + [1, 100, 100, 100], + [1, 2, 100, 100], + [100, 2, 100, 100], + [1, 100, 3, 100], + [1, 2, 3, 4], + [100, 2, 100, 4], + [100, 100, 3, 100], + [100, 100, 3, 4], + [100, 100, 100, 4], + ], + [ + [5, 200, 200, 200], + [5, 6, 200, 200], + [200, 6, 200, 200], + [5, 200, 7, 200], + [5, 6, 7, 8], + [200, 6, 200, 8], + [200, 200, 7, 200], + [200, 200, 7, 8], + [200, 200, 200, 8], + ], + ], + dtype=torch.int32, + ), + ), + ] + ) + def test_transposed_im2row( + self, + name: str, + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + dilation: tuple[int, int], + padding: tuple[int, 
int], + stride: tuple[int, int], + output_padding: tuple[int, int], + in_zero_point: torch.Tensor | int | None, + channel_last: bool, + expected_output: torch.Tensor, + ) -> None: + output = torch.ops.cadence.transposed_im2row( + input_tensor, + kernel_size, + dilation, + padding, + stride, + output_padding, + in_zero_point, + channel_last, + ) + + self.assertEqual( + output.shape, + expected_output.shape, + f"transposed_im2row output shape mismatch in {name}: got {output.shape}, expected {expected_output.shape}", + ) + self.assertTrue( + torch.equal(output, expected_output), + f"transposed_im2row output mismatch in {name}: got {output}, expected {expected_output}", + ) + + @expand( + [ + ( + "1_group", + torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=torch.int8), + torch.tensor([1, 1, 1], dtype=torch.float32), + torch.tensor([0, 0, 0], dtype=torch.int8), + torch.tensor([0, 2, 1], dtype=torch.int64), + torch.tensor( + [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]], + dtype=torch.float32, + ), + ), + ( + "2_groups", + torch.tensor( + [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], dtype=torch.int8 + ), + torch.tensor([[0.5, 1.0], [1.5, 2.0], [2.5, 3.0]], dtype=torch.float32), + torch.tensor([[0, 1], [2, 3], [4, 5]], dtype=torch.int8), + torch.tensor([0, 2, 1], dtype=torch.int64), + torch.tensor( + [ + [0.0, 0.5, 1.0, 2.0], + [10.0, 12.5, 15.0, 18.0], + [3.0, 4.5, 6.0, 8.0], + ], + dtype=torch.float32, + ), + ), + ( + "1_group_none_zero_point", + torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=torch.int8), + torch.tensor([1, 1, 1], dtype=torch.float32), + None, + torch.tensor([0, 2, 1], dtype=torch.int64), + torch.tensor( + [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]], + dtype=torch.float32, + ), + ), + ( + "1_group_batch2", + torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=torch.int8), + torch.tensor([1, 1, 1], dtype=torch.float32), + torch.tensor([0, 0, 0], dtype=torch.int8), + torch.tensor([[0, 2, 1], [1, 0, 2]], dtype=torch.int64), + 
torch.tensor( + [ + [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]], + [[3.0, 4.0, 5.0], [0.0, 1.0, 2.0], [6.0, 7.0, 8.0]], + ], + dtype=torch.float32, + ), + ), + ( + "2_groups_batch2", + torch.tensor( + [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], dtype=torch.int8 + ), + torch.tensor([[0.5, 1.0], [1.5, 2.0], [2.5, 3.0]], dtype=torch.float32), + torch.tensor([[0, 1], [2, 3], [4, 5]], dtype=torch.int8), + torch.tensor([[0, 2, 1], [2, 1, 0]], dtype=torch.int64), + torch.tensor( + [ + [ + [0.0, 0.5, 1.0, 2.0], + [10.0, 12.5, 15.0, 18.0], + [3.0, 4.5, 6.0, 8.0], + ], + [ + [10.0, 12.5, 15.0, 18.0], + [3.0, 4.5, 6.0, 8.0], + [0.0, 0.5, 1.0, 2.0], + ], + ], + dtype=torch.float32, + ), + ), + ( + "1_group_none_zero_point_batch2", + torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=torch.int8), + torch.tensor([1, 1, 1], dtype=torch.float32), + None, + torch.tensor([[0, 2, 1], [1, 0, 2]], dtype=torch.int64), + torch.tensor( + [ + [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]], + [[3.0, 4.0, 5.0], [0.0, 1.0, 2.0], [6.0, 7.0, 8.0]], + ], + dtype=torch.float32, + ), + ), + ] + ) + def test_quantized_embedding_byte( + self, + name: str, + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: torch.Tensor | None, + indices: torch.Tensor, + expected_out: torch.Tensor, + ) -> None: + self.assertTrue( + torch.equal( + torch.ops.cadence.quantized_embedding_byte( + weight, weight_scales, weight_zero_points, indices + ), + expected_out, + ) + ) diff --git a/backends/cadence/aot/tests/test_replace_ops_passes.py b/backends/cadence/aot/tests/test_replace_ops_passes.py index ca5168db2be..e2fbd516757 100644 --- a/backends/cadence/aot/tests/test_replace_ops_passes.py +++ b/backends/cadence/aot/tests/test_replace_ops_passes.py @@ -45,6 +45,7 @@ ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass, ReplaceSplitWithSlicePass, ReplaceSqueezeAndUnsqueezeWithViewPass, + ReplaceTorchQuantizedEmbeddingWithCadenceQuantizedEmbedding, 
ReplaceTransposedConvWithLinearPass, ReplaceTrivialConvWithLinear, ReplaceWhereWithFullArgsWithWhereScalar, @@ -52,9 +53,10 @@ from executorch.backends.cadence.aot.typing_stubs import expand from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass +from executorch.exir.pass_base import ExportPass, ProxyValue from executorch.exir.passes import dead_code_elimination_pass from torch.fx.passes.infra.pass_base import PassResult +from torch.utils import _pytree as pytree class TestReplaceOpsPasses(unittest.TestCase): @@ -345,6 +347,194 @@ def test_replace_functionally_equivalent_op_targets_unsafe_split( count_node(graph_after_passes, exir_ops.edge.aten.unsafe_split.Tensor), 0, x ) + def assertTensorMetadataIsSame( + self, a: Sequence[torch.Tensor], b: Sequence[torch.Tensor] + ) -> None: + for i, (_a, _b) in enumerate(zip(a, b)): + # TODO: actually compare the tensors. + self.assertTrue( + _a.shape == _b.shape, f"Tensor {i}: {_a.shape} != {_b.shape}" + ) + self.assertTrue( + _a.dtype == _b.dtype, f"Tensor {i}: {_a.dtype} != {_b.dtype}" + ) + + @expand( + [ + [(1, 8, 18), 8, 16, 3], + [(1, 8, 18), 8, 16, 5, 2], + # depthwise + bias + [(1, 8, 18), 8, 16, 5, 2, 0, 1, True], + # no bias + [(1, 8, 18), 8, 16, 3, 2, 4, 3, False, False], + # bias + transposed + [(1, 8, 18), 8, 16, 5, 2, 0, 1, False, True], + # Stride of 2 needed. 
+ [(1, 8, 3), 8, 8, 48, 2, 23], + ] + ) + @torch.no_grad() + def test_replace_aten_conv_with_cadence_conv( + self, + shape: Tuple[int, ...], + in_channels: int, + out_channels: int, + kernel: int, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + depthwise: bool = False, + bias_enabled: bool = True, + output_padding: Optional[int] = None, + ) -> None: + groups = in_channels if depthwise else 1 + builder = GraphBuilder() + x_tensor = torch.randn(*shape, dtype=torch.float32) + x = builder.placeholder("x", x_tensor) + weights_tensor = torch.randn( + [out_channels, in_channels // groups, kernel], dtype=torch.float32 + ) + weights = builder.placeholder("weights", weights_tensor) + bias: Optional[ProxyValue] = None + bias_tensor: Optional[torch.Tensor] = None + if bias_enabled: + bias_tensor = torch.randn([out_channels], dtype=torch.float32) + bias = builder.placeholder("bias", bias_tensor) + convolution = builder.call_operator( + op=exir_ops.edge.aten.convolution.default, + args=( + x, + weights, + bias, + [stride], + [padding], + [dilation], + False, + [output_padding] if output_padding else [0], + groups, + ), + ) + builder.output([convolution]) + original_gm = builder.get_graph_module() + + replacement_pass_result = ( + ReplaceAtenConvolutionWithCadenceConvolutionPass().call(original_gm) + ) + self.assertIsNotNone(replacement_pass_result) + graph_after_passes = replacement_pass_result.graph_module + + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.aten.convolution.default), + 0, + ) + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.cadence.convolution.default), + 1, + ) + self.assertEqual( + count_node( + graph_after_passes, exir_ops.edge.cadence.transposed_convolution.default + ), + 0, + ) + + inputs = (x.to_tensor(), weights.to_tensor()) + if bias is not None: + inputs += (bias.to_tensor(),) + self.assertTensorMetadataIsSame( + pytree.tree_flatten(original_gm.forward(*inputs))[0], + 
pytree.tree_flatten(graph_after_passes.forward(*inputs))[0], + ) + + @expand( + [ + [(1, 8, 18), 8, 16, 3], + [(1, 8, 18), 8, 16, 5, 2], + # depthwise + bias + [(1, 8, 18), 8, 16, 5, 2, 0, 1, True, True], + # no bias + [(1, 8, 18), 8, 16, 3, 2, 4, 3, False, False], + # depthwise + no bias + [(1, 8, 18), 8, 16, 3, 1, 0, 1, True, False], + # bias + [(1, 8, 18), 8, 16, 5, 2, 0, 1, False, True], + ] + ) + @torch.no_grad() + def test_replace_aten_transposed_conv_with_cadence_transposed_conv( + self, + shape: Tuple[int, ...], + in_channels: int, + out_channels: int, + kernel: int, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + depthwise: bool = False, + bias_enabled: bool = True, + output_padding: Optional[int] = None, + ) -> None: + groups = in_channels if depthwise else 1 + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(*shape, dtype=torch.float32)) + weights_shape = [in_channels, out_channels // groups, kernel] + weights = builder.placeholder( + "weights", + torch.randn(weights_shape, dtype=torch.float32), + ) + bias = ( + builder.placeholder( + "bias", torch.randn([out_channels], dtype=torch.float32) + ) + if bias_enabled + else None + ) + convolution = builder.call_operator( + op=exir_ops.edge.aten.convolution.default, + args=( + x, + weights, + bias, + [stride], + [padding], + [dilation], + True, + [output_padding] if output_padding else [0], + groups, + ), + ) + builder.output([convolution]) + original_gm = builder.get_graph_module() + + replacement_pass_result = ( + ReplaceAtenConvolutionWithCadenceConvolutionPass().call(original_gm) + ) + self.assertIsNotNone(replacement_pass_result) + graph_after_passes = replacement_pass_result.graph_module + + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.aten.convolution.default), + 0, + ) + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.cadence.convolution.default), + 0, + ) + self.assertEqual( + count_node( + graph_after_passes, 
exir_ops.edge.cadence.transposed_convolution.default + ), + 1, + ) + + inputs = (x.to_tensor(), weights.to_tensor()) + if bias is not None: + inputs += (bias.to_tensor(),) + self.assertTensorMetadataIsSame( + pytree.tree_flatten(original_gm.forward(*inputs))[0], + pytree.tree_flatten(graph_after_passes.forward(*inputs))[0], + ) + @expand( [ [(1, 8, 33), 8, 16, 3], @@ -455,8 +645,6 @@ def test_replace_convolution_optional_args_with_concrete_args( bias_enabled: bool = True, channel_last: bool = False, ) -> None: - transposed = True - output_padding = [0] groups = in_channels if depthwise else 1 builder = GraphBuilder() x = builder.placeholder("x", torch.randn(*shape, dtype=torch.float32)) @@ -477,7 +665,7 @@ def test_replace_convolution_optional_args_with_concrete_args( args=(x, [0, 2, 1]), ) convolution = builder.call_operator( - op=exir_ops.edge.aten.convolution.default, + op=exir_ops.edge.cadence.convolution.default, args=( x, weights, @@ -485,9 +673,8 @@ def test_replace_convolution_optional_args_with_concrete_args( [stride], [padding], [dilation], - transposed, - output_padding, groups, + False, ), ) if channel_last: @@ -504,7 +691,7 @@ def test_replace_convolution_optional_args_with_concrete_args( 1, ) self.assertEqual( - count_node(graph_after_passes, exir_ops.edge.aten.convolution.default), + count_node(graph_after_passes, exir_ops.edge.cadence.convolution.default), 1, ) @@ -1666,7 +1853,7 @@ def create_quantized_convolution_graph_module( out_multiplier, out_shift, ), - op=exir_ops.edge.cadence.quantized_conv_nhwc.default, + op=exir_ops.edge.cadence.quantized_conv2d_nhwc.default, args=args, ) else: @@ -1680,7 +1867,7 @@ def create_quantized_convolution_graph_module( out_multiplier, out_shift, ), - op=exir_ops.edge.cadence.quantized_conv_nchw.default, + op=exir_ops.edge.cadence.quantized_conv2d_nchw.default, args=args, ) @@ -1688,7 +1875,7 @@ def test_quantized_convolution_default_channel_last(self) -> None: # Create a graph with a single convolution node. 
gm = self.create_quantized_convolution_graph_module() self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.default), 1 + count_node(gm, exir_ops.edge.cadence.quantized_conv2d_nchw.default), 1 ) self.assertEqual(count_node(gm, exir_ops.edge.aten.permute_copy.default), 0) @@ -1698,7 +1885,8 @@ def test_quantized_convolution_default_channel_last(self) -> None: # Check that no replacement was made. self.assertEqual( count_node( - gm_after_replacement, exir_ops.edge.cadence.quantized_conv_nhwc.default + gm_after_replacement, + exir_ops.edge.cadence.quantized_conv2d_nhwc.default, ), 1, ) @@ -1714,7 +1902,7 @@ def test_no_transpose_if_already_quantized_conv_channel_last(self) -> None: # Check if graph module is valid by running exportpass on it. gm = ExportPass().call(gm).graph_module self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.default), 1 + count_node(gm, exir_ops.edge.cadence.quantized_conv2d_nhwc.default), 1 ) # Apply replacement pass. @@ -1723,7 +1911,8 @@ def test_no_transpose_if_already_quantized_conv_channel_last(self) -> None: # Check that no replacement was made. 
self.assertEqual( count_node( - gm_after_replacement, exir_ops.edge.cadence.quantized_conv_nhwc.default + gm_after_replacement, + exir_ops.edge.cadence.quantized_conv2d_nhwc.default, ), 1, ) @@ -2081,3 +2270,48 @@ def test_replace_aten_linalg_svd_with_cadence_linalg_svd( count_node(graph_after_passes, exir_ops.edge.cadence.linalg_svd.default), 1, ) + + @expand([("dtype",), ("default",)]) + @torch.no_grad() + def test_replace_quantized_embedding( + self, + name: str, + ) -> None: + embedding = torch.ones(5, 6, dtype=torch.int8) + indices = torch.tensor([0, 2], dtype=torch.int32) + scales = torch.ones(5, 2, dtype=torch.float32) + zero_points = None + + original_gm = single_op_builder( + placeholders=(embedding, scales, indices), + op=( + exir_ops.edge.quantized_decomposed.embedding_byte.dtype + if name == "dtype" + else exir_ops.edge.quantized_decomposed.embedding_byte.default + ), + args=(embedding, scales, zero_points, -128, 127, indices), + kwargs={"dtype": torch.float32} if name == "dtype" else {}, + ) + + p = ReplaceTorchQuantizedEmbeddingWithCadenceQuantizedEmbedding() + graph_after_passes = cast(PassResult, p(original_gm)).graph_module + + self.assertEqual( + count_node( + graph_after_passes, + ( + exir_ops.edge.quantized_decomposed.embedding_byte.dtype + if name == "dtype" + else exir_ops.edge.quantized_decomposed.embedding_byte.default + ), + ), + 0, + ) + + self.assertEqual( + count_node( + graph_after_passes, + exir_ops.edge.cadence.quantized_embedding_byte.default, + ), + 1, + ) diff --git a/backends/cadence/aot/tests/test_type_dispatch_passes.py b/backends/cadence/aot/tests/test_type_dispatch_passes.py index 4ae10ea83dd..870735aad1a 100644 --- a/backends/cadence/aot/tests/test_type_dispatch_passes.py +++ b/backends/cadence/aot/tests/test_type_dispatch_passes.py @@ -199,29 +199,29 @@ def test_dispatch_quantized_matmul( "int8_nchw", torch.int8, (1, 3, 8, 8), # x_shape - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - 
exir_ops.edge.cadence.quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nchw", torch.uint8, (1, 3, 8, 8), # x_shape - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor, ), ( "int8_nhwc", torch.int8, (1, 8, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nhwc", torch.uint8, (1, 8, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor, ), ] ) @@ -256,29 +256,29 @@ def test_dispatch_quantized_conv_2d( "int8_nchw_dilated", torch.int8, (1, 3, 8, 8), # x_shape - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nchw_dilated", torch.uint8, (1, 3, 8, 8), # x_shape - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor, ), ( "int8_nhwc_dilated", torch.int8, (1, 8, 8, 3), # x_shape - 
exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nhwc_dilated", torch.uint8, (1, 8, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor, ), ] ) @@ -313,29 +313,29 @@ def test_dispatch_quantized_conv_2d_dilated( "int8_nchw_1d", torch.int8, (1, 3, 8), # x_shape - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nchw_1d", torch.uint8, (1, 3, 8), # x_shape - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor, ), ( "int8_nhwc_1d", torch.int8, (1, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nhwc_1d", torch.uint8, (1, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor, ), ] 
) @@ -410,32 +410,32 @@ def test_dispatch_quantized_add( torch.int8, (1, 3, 8, 8), # x_shape (3, 1, 3, 3), # w_shape (groups=3, input_channels=3) - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nchw_depthwise", torch.uint8, (1, 3, 8, 8), # x_shape (3, 1, 3, 3), # w_shape (groups=3, input_channels=3) - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor, ), ( "int8_nhwc_depthwise", torch.int8, (1, 8, 8, 3), # x_shape (3, 3, 3, 1), # w_shape (groups=3, input_channels=3) - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nhwc_depthwise", torch.uint8, (1, 8, 8, 3), # x_shape (3, 3, 3, 1), # w_shape (groups=3, input_channels=3) - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor, ), ] ) diff --git a/backends/cadence/aot/type_dispatch.py b/backends/cadence/aot/type_dispatch.py index 958a78a4808..37f753767e9 100644 --- a/backends/cadence/aot/type_dispatch.py +++ b/backends/cadence/aot/type_dispatch.py @@ -27,6 +27,7 @@ class OpConfig: base_name: str type_dispatch_suffixes: dict[tuple[torch.dtype, ...], str] 
weight_arg_idx: Optional[int] = None + is_quant_op: bool = False variant: str = "per_tensor" @@ -62,16 +63,16 @@ class CompileTimeTypeDispatchPass(ExportPass): weight_arg_idx=2, variant="default", ), - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor: OpConfig( - "quantized_conv_nchw", + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor: OpConfig( + "quantized_conv2d_nchw", type_dispatch_suffixes={ (torch.int8, torch.int8): "asym8sxsym8s_asym8s", (torch.uint8, torch.uint8): "asym8uxsym8u_asym8u", }, weight_arg_idx=1, ), - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor: OpConfig( - "quantized_conv_nhwc", + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor: OpConfig( + "quantized_conv2d_nhwc", type_dispatch_suffixes={ (torch.int8, torch.int8): "asym8sxsym8s_asym8s", (torch.uint8, torch.uint8): "asym8uxsym8u_asym8u", @@ -100,6 +101,29 @@ class CompileTimeTypeDispatchPass(ExportPass): }, variant="default", ), + exir_ops.edge.cadence.quantize_per_tensor.default: OpConfig( + "quantize_per_tensor", + type_dispatch_suffixes={ + (torch.int8,): "asym8s", + (torch.uint8,): "asym8u", + (torch.int16,): "asym16s", + (torch.uint16,): "asym16s", + (torch.int32,): "asym32s", + }, + variant="default", + is_quant_op=True, + ), + exir_ops.edge.cadence.dequantize_per_tensor.default: OpConfig( + "dequantize_per_tensor", + type_dispatch_suffixes={ + (torch.int8,): "asym8s", + (torch.uint8,): "asym8u", + (torch.int16,): "asym16s", + (torch.uint16,): "asym16s", + (torch.int32,): "asym32s", + }, + variant="default", + ), } def call_operator( @@ -120,6 +144,8 @@ def call_operator( if config.weight_arg_idx is not None: weight_dtype = args[config.weight_arg_idx].to_tensor().dtype dtype_key = (input_dtype, weight_dtype) + elif config.is_quant_op: + dtype_key = (args[5],) else: dtype_key = (input_dtype,) @@ -132,13 +158,13 @@ def call_operator( typed_op_name = f"{base_name}_{type_suffix}" if op in [ - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - 
exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, ]: groups = args[6] input_channels = ( args[0].to_tensor().shape[1] - if op == exir_ops.edge.cadence.quantized_conv_nchw.per_tensor + if op == exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor else args[0].to_tensor().shape[-1] ) is_depthwise = groups == input_channels @@ -151,9 +177,11 @@ def call_operator( elif is_dilated: typed_op_name = f"{base_name}_dilated_{type_suffix}" elif is_1d and groups == 1: - typed_op_name = ( - f"quantized_conv1d_{base_name.split('_')[-1]}_{type_suffix}" - ) + if "nchw" in base_name: + layout_suffix = "ncl" + else: + layout_suffix = "nlc" + typed_op_name = f"quantized_conv1d_{layout_suffix}_{type_suffix}" typed_op = getattr( getattr(exir_ops.edge.cadence, typed_op_name), config.variant diff --git a/backends/cadence/build_cadence_vision.sh b/backends/cadence/build_cadence_vision.sh new file mode 100755 index 00000000000..7c2c6d68860 --- /dev/null +++ b/backends/cadence/build_cadence_vision.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -euo pipefail + +unset CMAKE_PREFIX_PATH +unset XTENSA_CORE +export XTENSA_CORE=XRC_Vision_130_AO +git submodule sync +git submodule update --init --recursive +./install_requirements.sh +./install_executorch.sh + +rm -rf cmake-out + +STEPWISE_BUILD=false + +if $STEPWISE_BUILD; then + echo "Building ExecuTorch" + CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=OFF \ + -Bcmake-out . + + echo "Building any Cadence-specific binaries on top" + CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DBUCK2="$BUCK" \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DEXECUTORCH_BUILD_PORTABLE_OPS=ON \ + -DEXECUTORCH_BUILD_KERNELS_LLM=OFF \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_VISION_OPT=ON \ + -DHAVE_FNMATCH_H=OFF \ + -Bcmake-out/backends/cadence \ + backends/cadence + cmake --build cmake-out/backends/cadence -j8 +else + echo "Building Cadence toolchain with ExecuTorch packages" + cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" + CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DBUCK2="$BUCK" \ + -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + 
-DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -DEXECUTORCH_BUILD_CADENCE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DEXECUTORCH_BUILD_PORTABLE_OPS=ON \ + -DEXECUTORCH_BUILD_KERNELS_LLM=OFF \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_VISION_OPT=ON \ + -DHAVE_FNMATCH_H=OFF \ + -Bcmake-out + cmake --build cmake-out --target install --config Release -j8 +fi + +echo "Run simple model to verify cmake build" +python3 -m examples.portable.scripts.export --model_name="add" +xt-run --turbo cmake-out/executor_runner --model_path=add.pte diff --git a/backends/cadence/generic/kernels/kernels.cpp b/backends/cadence/generic/kernels/kernels.cpp index 568d8468af9..25e25cfa60a 100644 --- a/backends/cadence/generic/kernels/kernels.cpp +++ b/backends/cadence/generic/kernels/kernels.cpp @@ -73,6 +73,7 @@ typed_quantize_val(int8_t); typed_quantize_val(uint8_t); typed_quantize_val(int16_t); typed_quantize_val(uint16_t); +typed_quantize_val(int32_t); #undef typed_quantize_val #define typed_quantize_vec(dtype) \ @@ -86,6 +87,7 @@ typed_quantize_vec(int8_t); typed_quantize_vec(uint8_t); typed_quantize_vec(int16_t); typed_quantize_vec(uint16_t); +typed_quantize_vec(int32_t); #undef typed_quantize_vec #define typed_dequantize_val(dtype) \ @@ -94,6 +96,7 @@ typed_dequantize_val(int8_t); typed_dequantize_val(uint8_t); typed_dequantize_val(int16_t); typed_dequantize_val(uint16_t); +typed_dequantize_val(int32_t); #undef typed_dequantize_val #define typed_dequantize_vec(dtype) \ @@ -107,6 +110,7 @@ typed_dequantize_vec(int8_t); typed_dequantize_vec(uint8_t); typed_dequantize_vec(int16_t); typed_dequantize_vec(uint16_t); +typed_dequantize_vec(int32_t); #undef typed_dequantize_vec } // namespace kernels diff --git a/backends/cadence/generic/operators/CMakeLists.txt b/backends/cadence/generic/operators/CMakeLists.txt index 
ea5b699f441..63d8902ac89 100644 --- a/backends/cadence/generic/operators/CMakeLists.txt +++ b/backends/cadence/generic/operators/CMakeLists.txt @@ -16,10 +16,6 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) # ATen compliant ops that are needed to run this model. set(_aten_ops__srcs - "${CMAKE_CURRENT_SOURCE_DIR}/op_add.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/op_embedding.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/op_full.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/op_view_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp" @@ -31,10 +27,13 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_add.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_hardtanh.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_max_pool2d_with_indices.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mean.cpp" @@ -58,6 +57,7 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_native_group_norm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sum.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_select_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/select_copy_util.cpp" @@ 
-80,15 +80,15 @@ target_include_directories( add_library( custom_ops "quantized_linear_out.cpp" - "quantized_conv_nchw_out.cpp" - "quantized_conv_nhwc_out.cpp" + "quantized_conv2d_nchw_out.cpp" + "quantized_conv2d_nhwc_out.cpp" "quantized_relu_out.cpp" "quantized_layer_norm.cpp" "quantize_per_tensor.cpp" "quantized_fully_connected_out.cpp" "dequantize_per_tensor.cpp" "quantized_matmul_out.cpp" - "requantize_out.cpp" + "op_requantize_out.cpp" "im2row_out.cpp" ) target_include_directories( diff --git a/backends/cadence/generic/operators/dequantize_per_tensor.cpp b/backends/cadence/generic/operators/dequantize_per_tensor.cpp index 1481981ee0b..ec05272da1b 100644 --- a/backends/cadence/generic/operators/dequantize_per_tensor.cpp +++ b/backends/cadence/generic/operators/dequantize_per_tensor.cpp @@ -18,7 +18,7 @@ using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; using ::impl::generic::kernels::dequantize; -void dequantize_per_tensor_out( +Tensor& dequantize_per_tensor_out( KernelRuntimeContext& context, const Tensor& input, double scale, @@ -44,12 +44,96 @@ void dequantize_per_tensor_out( } else if (input.scalar_type() == ScalarType::Short) { const int16_t* input_data = input.const_data_ptr(); dequantize(out_data, input_data, scale, zero_point, numel); + } else if (input.scalar_type() == ScalarType::Int) { + const int32_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); } else { ET_CHECK_MSG( false, "Unhandled input dtype %hhd", static_cast(input.scalar_type())); } + return out; +} + +Tensor& dequantize_per_tensor_asym8s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int8_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + 
return out; +} + +Tensor& dequantize_per_tensor_asym8u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const uint8_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; +} + +Tensor& dequantize_per_tensor_asym16s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int16_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; +} + +Tensor& dequantize_per_tensor_asym16u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const uint16_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; +} + +Tensor& dequantize_per_tensor_asym32s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int32_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; } } // namespace native diff --git a/backends/cadence/generic/operators/quantize_per_tensor.cpp b/backends/cadence/generic/operators/quantize_per_tensor.cpp index 29b233dab09..8ce70d2b51d 100644 --- a/backends/cadence/generic/operators/quantize_per_tensor.cpp +++ 
b/backends/cadence/generic/operators/quantize_per_tensor.cpp @@ -20,7 +20,7 @@ using ::impl::generic::kernels::quantize; // Quantize the input tensor (PT2 version). Note that quant_ are not // used in any computation. -void quantize_per_tensor_out( +Tensor& quantize_per_tensor_out( KernelRuntimeContext& context, const Tensor& input, double scale, @@ -34,30 +34,110 @@ void quantize_per_tensor_out( if (out.scalar_type() == ScalarType::Byte) { uint8_t* out_data = out.mutable_data_ptr(); - impl::generic::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. / scale, zero_point, numel); } else if (out.scalar_type() == ScalarType::Char) { int8_t* out_data = out.mutable_data_ptr(); - impl::generic::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. / scale, zero_point, numel); } else if ( out.scalar_type() == ScalarType::Bits16 || out.scalar_type() == ScalarType::UInt16) { uint16_t* out_data = out.mutable_data_ptr(); - impl::generic::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. / scale, zero_point, numel); } else if (out.scalar_type() == ScalarType::Short) { int16_t* out_data = out.mutable_data_ptr(); - impl::generic::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Int) { + int32_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. 
/ scale, zero_point, numel); } else { ET_CHECK_MSG( false, "Unhandled input dtype %hhd", static_cast(out.scalar_type())); } + return out; } -} // namespace native -} // namespace generic -} // namespace impl +Tensor& quantize_per_tensor_asym8s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int8_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + return out; +} + +Tensor& quantize_per_tensor_asym8u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + uint8_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + return out; +} + +Tensor& quantize_per_tensor_asym16s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int16_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + return out; +} + +Tensor& quantize_per_tensor_asym16u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + uint16_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. 
/ scale, zero_point, numel); + return out; +} + +Tensor& quantize_per_tensor_asym32s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int32_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + return out; +} + +}; // namespace native +}; // namespace generic +}; // namespace impl diff --git a/backends/cadence/generic/operators/quantized_conv_nchw_out.cpp b/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp similarity index 94% rename from backends/cadence/generic/operators/quantized_conv_nchw_out.cpp rename to backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp index 6eeabcf1d52..fbb01c82e65 100644 --- a/backends/cadence/generic/operators/quantized_conv_nchw_out.cpp +++ b/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp @@ -157,7 +157,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( // bias_scale, since it is a product of the two. The kernel will branch to // quantized::conv1d or quantized::conv2d based on the dimensionality of // activation tensor. 
-void quantized_conv_nchw( +void quantized_conv2d_nchw( const Tensor& input, const Tensor& weight, const Tensor& bias, @@ -228,7 +228,7 @@ void quantized_conv_nchw( #undef typed_quantized_conv2d_nchw } -void quantized_conv_nchw_out( +void quantized_conv2d_nchw_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -248,7 +248,7 @@ void quantized_conv_nchw_out( const float bias_scale_float = bias_scale.const_data_ptr()[0]; const int32_t weight_zero_point_int = weight_zero_point.const_data_ptr()[0]; - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -264,7 +264,7 @@ void quantized_conv_nchw_out( out); } -void quantized_conv_nchw_per_tensor_out( +void quantized_conv2d_nchw_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -282,7 +282,7 @@ void quantized_conv_nchw_per_tensor_out( __ET_UNUSED int64_t out_shift, bool channel_last, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -298,7 +298,7 @@ void quantized_conv_nchw_per_tensor_out( out); } -void quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -315,7 +315,7 @@ void quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -331,7 +331,7 @@ void quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -348,7 +348,7 @@ void quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - 
quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -364,7 +364,7 @@ void quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -381,7 +381,7 @@ void quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -397,7 +397,7 @@ void quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -414,7 +414,7 @@ void quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -430,7 +430,7 @@ void quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -447,7 +447,7 @@ void quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -463,7 +463,7 @@ void quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( 
__ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -480,7 +480,7 @@ void quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -496,7 +496,7 @@ void quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -513,7 +513,7 @@ void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -529,7 +529,7 @@ void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -546,7 +546,7 @@ void quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, diff --git a/backends/cadence/generic/operators/quantized_conv_nhwc_out.cpp b/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp similarity index 94% rename from backends/cadence/generic/operators/quantized_conv_nhwc_out.cpp rename to backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp index d377048b142..eca836dcc94 100644 --- a/backends/cadence/generic/operators/quantized_conv_nhwc_out.cpp +++ b/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp @@ -144,7 +144,7 @@ __attribute__((noinline)) void conv2d_nhwc_core_generic( } } -void quantized_conv_nhwc( 
+void quantized_conv2d_nhwc( const Tensor& input, const Tensor& weight, const Tensor& bias, @@ -215,7 +215,7 @@ void quantized_conv_nhwc( #undef typed_quantized_conv2d_nhwc } -void quantized_conv_nhwc_out( +void quantized_conv2d_nhwc_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -235,7 +235,7 @@ void quantized_conv_nhwc_out( const float bias_scale_float = bias_scale.const_data_ptr()[0]; const int32_t weight_zero_point_int = weight_zero_point.const_data_ptr()[0]; - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -251,7 +251,7 @@ void quantized_conv_nhwc_out( out); } -void quantized_conv_nhwc_per_tensor_out( +void quantized_conv2d_nhwc_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -269,7 +269,7 @@ void quantized_conv_nhwc_per_tensor_out( __ET_UNUSED int64_t out_shift, bool channel_last, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -285,7 +285,7 @@ void quantized_conv_nhwc_per_tensor_out( out); } -void quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -302,7 +302,7 @@ void quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -318,7 +318,7 @@ void quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -335,7 +335,7 @@ void quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + 
quantized_conv2d_nhwc( input, weight, bias, @@ -351,7 +351,7 @@ void quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -368,7 +368,7 @@ void quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -384,7 +384,7 @@ void quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -401,7 +401,7 @@ void quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -417,7 +417,7 @@ void quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -434,7 +434,7 @@ void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -450,7 +450,7 @@ void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, 
@@ -467,7 +467,7 @@ void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -483,7 +483,7 @@ void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -500,7 +500,7 @@ void quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -516,7 +516,7 @@ void quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -533,7 +533,7 @@ void quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, diff --git a/backends/cadence/generic/operators/targets.bzl b/backends/cadence/generic/operators/targets.bzl index 4ff821158bc..fa0f128b229 100644 --- a/backends/cadence/generic/operators/targets.bzl +++ b/backends/cadence/generic/operators/targets.bzl @@ -4,64 +4,6 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def define_common_targets(): # Individual operator targets with optimized dependencies - # Basic operators (need broadcast_util and scalar_utils) - runtime.cxx_library( - name = "op_add", - srcs = ["op_add.cpp"], - platforms = CXX, - deps = [ - 
"//executorch/kernels/portable/cpu/util:broadcast_util", - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - runtime.cxx_library( - name = "op_full", - srcs = ["op_full.cpp"], - platforms = CXX, - deps = [ - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - # Simple operators (only need kernel_includes) - runtime.cxx_library( - name = "op_embedding", - srcs = ["op_embedding.cpp"], - platforms = CXX, - deps = [ - "//executorch/runtime/kernel:kernel_includes", - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - runtime.cxx_library( - name = "op_view_copy", - srcs = ["op_view_copy.cpp"], - platforms = CXX, - deps = [ - "//executorch/runtime/kernel:kernel_includes", - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - # Operators that need the operators.h header and basic runtime runtime.cxx_library( name = "im2row_out", srcs = ["im2row_out.cpp"], @@ -102,6 +44,7 @@ def define_common_targets(): ], visibility = [ "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", ], ) @@ -136,8 +79,8 @@ def define_common_targets(): ) runtime.cxx_library( - name = "quantized_conv_nchw_out", - srcs = ["quantized_conv_nchw_out.cpp"], + name = "quantized_conv2d_nchw_out", + srcs = ["quantized_conv2d_nchw_out.cpp"], exported_headers = ["operators.h", "quantized_ops.h"], platforms = CXX, deps = [ @@ -151,8 +94,8 @@ def define_common_targets(): ) runtime.cxx_library( - name = "quantized_conv_nhwc_out", - srcs = ["quantized_conv_nhwc_out.cpp"], + name = "quantized_conv2d_nhwc_out", + srcs = ["quantized_conv2d_nhwc_out.cpp"], exported_headers = ["operators.h", "quantized_ops.h"], platforms = CXX, deps = 
[ diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp index d9223d7bd18..237c605443f 100644 --- a/backends/cadence/hifi/kernels/kernels.cpp +++ b/backends/cadence/hifi/kernels/kernels.cpp @@ -127,6 +127,7 @@ typed_quantize_val(int8_t); typed_quantize_val(uint8_t); typed_quantize_val(int16_t); typed_quantize_val(uint16_t); +typed_quantize_val(int32_t); #undef typed_quantize_val #define typed_quantize_vec(dtype) \ @@ -150,6 +151,7 @@ typed_dequantize_val(int8_t); typed_dequantize_val(uint8_t); typed_dequantize_val(int16_t); typed_dequantize_val(uint16_t); +typed_dequantize_val(int32_t); #undef typed_dequantize_val #define typed_dequantize_vec(dtype) \ diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 6bd63c6d9f6..26555da9760 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -96,8 +96,8 @@ add_library( "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp" - "op_quantized_conv_nchw_out.cpp" - "op_quantized_conv_nhwc_out.cpp" + "op_quantized_conv2d_nchw_out.cpp" + "op_quantized_conv2d_nhwc_out.cpp" "op_quantized_fully_connected_out" ) target_include_directories( diff --git a/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp index f416082b10f..30ce938e24d 100644 --- a/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp @@ -45,6 +45,9 @@ void dequantize_per_tensor_out( input.scalar_type() == ScalarType::UInt16) { const uint16_t* input_data = input.const_data_ptr(); dequantize(out_data, input_data, scale, zero_point, numel); + } else if (input.scalar_type() == ScalarType::Int) { + const int32_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); } else { ET_CHECK_MSG( 
false, @@ -53,6 +56,66 @@ void dequantize_per_tensor_out( } } +void dequantize_per_tensor_asym8u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const uint8_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); +} + +void dequantize_per_tensor_asym16s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int16_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); +} + +void dequantize_per_tensor_asym16u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const uint16_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); +} + +void dequantize_per_tensor_asym32s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int32_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); +} + } // namespace native } // namespace HiFi } // namespace impl diff --git a/backends/cadence/hifi/operators/op_dequantize_per_tensor_asym8s.cpp b/backends/cadence/hifi/operators/op_dequantize_per_tensor_asym8s.cpp new file mode 100644 index 00000000000..d1099b1a4db --- /dev/null +++ 
b/backends/cadence/hifi/operators/op_dequantize_per_tensor_asym8s.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +namespace impl { +namespace HiFi { +namespace native { + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +void dequantize_per_tensor_asym8s_out( + KernelRuntimeContext& ctx, + const Tensor& input, + double scale, + int64_t zero_point, + __ET_UNUSED int64_t quant_min, + __ET_UNUSED int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + const size_t numel = out.numel(); + const int8_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_asym8s_f32( + out_data, input_data, zero_point, scale, numel); +} + +}; // namespace native +}; // namespace HiFi +}; // namespace impl diff --git a/backends/cadence/hifi/operators/op_quantize_per_tensor.cpp b/backends/cadence/hifi/operators/op_quantize_per_tensor.cpp index b2f47619f05..579a4533057 100644 --- a/backends/cadence/hifi/operators/op_quantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/op_quantize_per_tensor.cpp @@ -19,10 +19,13 @@ namespace impl { namespace HiFi { namespace native { + namespace { + using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; +using ::impl::HiFi::kernels::quantize; // Add checks for dtype quant min/max bounds. template @@ -92,22 +95,22 @@ void quantize_per_tensor_out( const size_t numel = out.numel(); if (out.scalar_type() == ScalarType::Byte) { uint8_t* out_data = out.mutable_data_ptr(); - impl::HiFi::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. 
/ scale, zero_point, numel); } else if (out.scalar_type() == ScalarType::Char) { int8_t* out_data = out.mutable_data_ptr(); xa_nn_elm_quantize_f32_asym8s( out_data, input_data, scale, zero_point, numel); } else if (out.scalar_type() == ScalarType::Short) { int16_t* out_data = out.mutable_data_ptr(); - impl::HiFi::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. / scale, zero_point, numel); } else if ( out.scalar_type() == ScalarType::Bits16 || out.scalar_type() == ScalarType::UInt16) { uint16_t* out_data = out.mutable_data_ptr(); - impl::HiFi::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Int) { + int32_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); } else { ET_KERNEL_CHECK_MSG( ctx, @@ -119,6 +122,66 @@ void quantize_per_tensor_out( } } -} // namespace native -} // namespace HiFi -} // namespace impl +void quantize_per_tensor_asym8u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + uint8_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); +} + +void quantize_per_tensor_asym16s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int16_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. 
/ scale, zero_point, numel); +} + +void quantize_per_tensor_asym16u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + uint16_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); +} + +void quantize_per_tensor_asym32s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int32_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); +} + +}; // namespace native +}; // namespace HiFi +}; // namespace impl diff --git a/backends/cadence/hifi/operators/op_quantize_per_tensor_asym8s.cpp b/backends/cadence/hifi/operators/op_quantize_per_tensor_asym8s.cpp new file mode 100644 index 00000000000..552b6acf150 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantize_per_tensor_asym8s.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include + +namespace impl { +namespace HiFi { +namespace native { + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +void quantize_per_tensor_asym8s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int8_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_asym8s(out_data, input_data, scale, zero_point, numel); +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 96% rename from backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp index 566325e0f10..b5ab0cdbaa2 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NCHW 1D convolution for int8 x int8 -> int8 -void xa_opt_quantized_conv1d_nchw_asym8sxsym8s_asym8s( +void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -144,7 +144,7 @@ void xa_opt_quantized_conv1d_nchw_asym8sxsym8s_asym8s( } } -void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& 
weight, @@ -161,7 +161,7 @@ void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv1d_nchw_asym8sxsym8s_asym8s( + xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 96% rename from backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp index de5f76b0fff..60e700f563b 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NCHW 1D convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv1d_nchw_asym8uxsym8u_asym8u( +void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -144,7 +144,7 @@ void xa_opt_quantized_conv1d_nchw_asym8uxsym8u_asym8u( } } -void quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -161,7 +161,7 @@ void quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv1d_nchw_asym8uxsym8u_asym8u( + xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp 
b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 95% rename from backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp index b549ad13307..c9a3d2b58de 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NHWC 1D convolution for int8 x int8 -> int8 -void xa_opt_quantized_conv1d_nhwc_asym8sxsym8s_asym8s( +void xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -93,7 +93,7 @@ void xa_opt_quantized_conv1d_nhwc_asym8sxsym8s_asym8s( } } -void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -110,7 +110,7 @@ void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv1d_nhwc_asym8sxsym8s_asym8s( + xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 95% rename from backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp index f5dbb083522..2d7a4cba509 100644 --- 
a/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NHWC 1D convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv1d_nhwc_asym8uxsym8u_asym8u( +void xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -93,7 +93,7 @@ void xa_opt_quantized_conv1d_nhwc_asym8uxsym8u_asym8u( } } -void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -110,7 +110,7 @@ void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv1d_nhwc_asym8uxsym8u_asym8u( + xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 97% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp index e4074829cf0..e2584485686 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NCHW convolution for int8 x int8 -> int8 -void xa_opt_quantized_conv_nchw_asym8sxsym8s_asym8s( +void xa_opt_quantized_conv2d_nchw_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, const Tensor& input, 
const Tensor& weight, @@ -207,7 +207,7 @@ void xa_opt_quantized_conv_nchw_asym8sxsym8s_asym8s( } } -void quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -224,7 +224,7 @@ void quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nchw_asym8sxsym8s_asym8s( + xa_opt_quantized_conv2d_nchw_asym8sxsym8s_asym8s( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 97% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp index 201b5d7da16..8444fef6bd1 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NCHW convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv_nchw_asym8uxsym8u_asym8u( +void xa_opt_quantized_conv2d_nchw_asym8uxsym8u_asym8u( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -207,7 +207,7 @@ void xa_opt_quantized_conv_nchw_asym8uxsym8u_asym8u( } } -void quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -224,7 +224,7 @@ void quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, 
Tensor& out) { - xa_opt_quantized_conv_nchw_asym8uxsym8u_asym8u( + xa_opt_quantized_conv2d_nchw_asym8uxsym8u_asym8u( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 96% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp index a0e47104e18..787984e52db 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Specialized depthwise NCHW convolution for int8 x int8 -> int8 -void xa_opt_quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s( +void xa_opt_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -162,7 +162,7 @@ void xa_opt_quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s( kNnlibMaxDim); } -void quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -179,7 +179,7 @@ void quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s( + xa_opt_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp 
b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 96% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp index 03274413f65..219eaf44ad7 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Specialized depthwise NCHW convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u( +void xa_opt_quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -162,7 +162,7 @@ void xa_opt_quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u( kNnlibMaxDim); } -void quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -179,7 +179,7 @@ void quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u( + xa_opt_quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to 
backends/cadence/hifi/operators/op_quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp index 34c861faed5..fc279f2bbdf 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -122,7 +122,7 @@ __attribute__((noinline)) void conv2d_nchw_dilated_asym8sxsym8s_asym8s_core( } } -void quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp index 6393554e18f..08ca4657c75 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -123,7 +123,7 @@ __attribute__((noinline)) void conv2d_nchw_dilated_asym8uxsym8u_asym8u_core( } } -void quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_out.cpp rename to 
backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp index 604f881ab96..984747d9316 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp @@ -156,7 +156,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( } } -void xa_opt_quantized_conv_nchw( +void xa_opt_quantized_conv2d_nchw( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -444,7 +444,7 @@ void xa_opt_quantized_conv_nchw( // bias_scale, since it is a product of the two. The kernel will branch to // quantized::conv1d or quantized::conv2d based on the dimensionality of // activation tensor. -void quantized_conv_nchw( +void quantized_conv2d_nchw( const Tensor& input, const Tensor& weight, const Tensor& bias, @@ -515,7 +515,7 @@ void quantized_conv_nchw( #undef typed_quantized_conv2d_nchw } -void quantized_conv_nchw_out( +void quantized_conv2d_nchw_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -546,7 +546,7 @@ void quantized_conv_nchw_out( optimized = 0; if (optimized) { - xa_opt_quantized_conv_nchw( + xa_opt_quantized_conv2d_nchw( ctx, input, weight, @@ -562,7 +562,7 @@ void quantized_conv_nchw_out( output_zero_point, out); } else { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -579,7 +579,7 @@ void quantized_conv_nchw_out( } } -void quantized_conv_nchw_per_tensor_out( +void quantized_conv2d_nchw_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -606,7 +606,7 @@ void quantized_conv_nchw_per_tensor_out( optimized = 0; if (optimized) { - xa_opt_quantized_conv_nchw( + xa_opt_quantized_conv2d_nchw( ctx, input, weight, @@ -622,7 +622,7 @@ void quantized_conv_nchw_per_tensor_out( output_zero_point, out); } else { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, diff --git 
a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 96% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp index 3f62c82bfcd..9bd7e641144 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NHWC convolution for int8 x int8 -> int8 -void xa_opt_quantized_conv_nhwc_asym8sxsym8s_asym8s( +void xa_opt_quantized_conv2d_nhwc_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -150,7 +150,7 @@ void xa_opt_quantized_conv_nhwc_asym8sxsym8s_asym8s( } } -void quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -167,7 +167,7 @@ void quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nhwc_asym8sxsym8s_asym8s( + xa_opt_quantized_conv2d_nhwc_asym8sxsym8s_asym8s( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 96% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp index 32267591cf3..433cbf76fce 
100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NHWC convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv_nhwc_asym8uxsym8u_asym8u( +void xa_opt_quantized_conv2d_nhwc_asym8uxsym8u_asym8u( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -150,7 +150,7 @@ void xa_opt_quantized_conv_nhwc_asym8uxsym8u_asym8u( } } -void quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -167,7 +167,7 @@ void quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nhwc_asym8uxsym8u_asym8u( + xa_opt_quantized_conv2d_nhwc_asym8uxsym8u_asym8u( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 95% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp index c232f7e5ef2..384ebbb4f48 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Specialized depthwise NHWC convolution for int8 x int8 -> int8 -void xa_opt_quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s( +void 
xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -132,7 +132,7 @@ void xa_opt_quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s( } } -void quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -149,7 +149,7 @@ void quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s( + xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 95% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp index 5ef102c31d1..07df1a416d7 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Specialized depthwise NHWC convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u( +void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -132,7 +132,7 @@ void xa_opt_quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u( } } -void quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out( +void 
quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -149,7 +149,7 @@ void quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u( + xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp index 35a1cbda0f9..91965594a5d 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -122,7 +122,7 @@ __attribute__((noinline)) void conv2d_nhwc_dilated_asym8sxsym8s_asym8s_core( } } -void quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp index 
62b5008ab7e..14dc31a719f 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -122,7 +122,7 @@ __attribute__((noinline)) void conv2d_nhwc_dilated_asym8uxsym8u_asym8u_core( } } -void quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp index 5aa087c4b75..a5d503853c4 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp @@ -147,7 +147,7 @@ __attribute__((noinline)) void conv2d_nhwc_core_generic( } } -void xa_opt_quantized_conv_nhwc( +void xa_opt_quantized_conv2d_nhwc( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -350,7 +350,7 @@ void xa_opt_quantized_conv_nhwc( } } -void quantized_conv_nhwc( +void quantized_conv2d_nhwc( const Tensor& input, const Tensor& weight, const Tensor& bias, @@ -421,7 +421,7 @@ void quantized_conv_nhwc( #undef typed_quantized_conv2d_nhwc } -void quantized_conv_nhwc_out( +void quantized_conv2d_nhwc_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -452,7 +452,7 @@ void quantized_conv_nhwc_out( optimized = 0; if (optimized) { - xa_opt_quantized_conv_nhwc( + xa_opt_quantized_conv2d_nhwc( ctx, input, weight, @@ -468,7 +468,7 @@ void quantized_conv_nhwc_out( output_zero_point, out); } else { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, 
weight, bias, @@ -485,7 +485,7 @@ void quantized_conv_nhwc_out( } } -void quantized_conv_nhwc_per_tensor_out( +void quantized_conv2d_nhwc_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -512,7 +512,7 @@ void quantized_conv_nhwc_per_tensor_out( optimized = 0; if (optimized) { - xa_opt_quantized_conv_nhwc( + xa_opt_quantized_conv2d_nhwc( ctx, input, weight, @@ -528,7 +528,7 @@ void quantized_conv_nhwc_per_tensor_out( output_zero_point, out); } else { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, diff --git a/backends/cadence/hifi/operators/operators.h b/backends/cadence/hifi/operators/operators.h index 11b93f4a89c..f7f5194d91a 100644 --- a/backends/cadence/hifi/operators/operators.h +++ b/backends/cadence/hifi/operators/operators.h @@ -83,7 +83,7 @@ void quantized_linear_per_tensor_out( const ::executorch::aten::optional<::executorch::aten::Tensor>& offset, ::executorch::aten::Tensor& out); -void quantized_conv_nhwc_out( +void quantized_conv2d_nhwc_out( ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& weight, @@ -101,7 +101,7 @@ void quantized_conv_nhwc_out( const ::executorch::aten::Tensor& out_shift, ::executorch::aten::Tensor& out); -void quantized_conv_nchw_out( +void quantized_conv2d_nchw_out( ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& weight, @@ -119,7 +119,7 @@ void quantized_conv_nchw_out( const ::executorch::aten::Tensor& out_shift, ::executorch::aten::Tensor& out); -void quantized_conv_nchw_per_tensor_out( +void quantized_conv2d_nchw_per_tensor_out( ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& weight, @@ -137,7 +137,7 @@ void quantized_conv_nchw_per_tensor_out( int64_t out_shift, ::executorch::aten::Tensor& out); -void 
quantized_conv_nhwc_per_tensor_out( +void quantized_conv2d_nhwc_per_tensor_out( ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& weight, diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index fa263d4017c..1f9814c4a4e 100644 --- a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -44,6 +44,7 @@ OPERATORS = [ "cat", "clamp", "dequantize_per_tensor", + "dequantize_per_tensor_asym8s", "div", "embedding", "eq", @@ -63,24 +64,24 @@ OPERATORS = [ "ne", "permute_copy", "pow", - "quantized_conv_nchw_out", - "quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv_nhwc_out", - "quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv2d_nchw_out", + "quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out", + 
"quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv2d_nhwc_out", + "quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out", "quantized_fully_connected_out", "quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor_out", "quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out", @@ -95,6 +96,7 @@ OPERATORS = [ "quantized_relu_asym8s_asym8s_per_tensor_out", "quantized_relu_asym8u_asym8u_per_tensor_out", "quantize_per_tensor", + "quantize_per_tensor_asym8s", "remainder", "rsqrt", "select_copy", diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS index 9c65c469280..65a578f4751 100644 --- a/backends/cadence/runtime/TARGETS +++ b/backends/cadence/runtime/TARGETS @@ -21,6 +21,7 @@ runtime.python_library( "//executorch/devtools/bundled_program/serialize:lib", "//executorch/devtools:lib", "//executorch/exir:lib", + ":etdump", ], ) diff --git a/backends/cadence/runtime/etdump.py b/backends/cadence/runtime/etdump.py new file mode 100644 index 00000000000..4ef5d28285a --- /dev/null +++ b/backends/cadence/runtime/etdump.py @@ -0,0 +1,173 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import logging +import os +from typing import cast, Optional, Tuple + +import torch +from executorch.devtools import Inspector +from executorch.devtools.inspector import Event, EventBlock, PerfData +from executorch.devtools.inspector._inspector_utils import TimeScale +from tabulate import tabulate + + +class CadenceETDump: + def __init__(self, output_dir: str) -> None: + self.tensor_dump_dir: str = os.path.join(output_dir, "tensors") + self.etdump_path: str = os.path.join(output_dir, "etdump.etdp") + self.etrecord_path: Optional[str] = os.path.join(output_dir, "etrecord.bin") + self.debug_buffer_path: Optional[str] = os.path.join( + output_dir, "debug_output.bin" + ) + + if not os.path.exists(self.etdump_path): + raise RuntimeError(f"{self.etdump_path} does not exist") + # pyre-ignore[6]: os.path.exists expects str, but got Optional[str] + if not os.path.exists(self.etrecord_path): + logging.warning( + "ETRecord not found, intermediate tensors will not be dumped" + ) + self.etrecord_path = None + # pyre-ignore[6]: os.path.exists expects str, but got Optional[str] + if not os.path.exists(self.debug_buffer_path): + logging.warning( + "Debug buffer not found, intermediate tensors will not be dumped" + ) + self.debug_buffer_path = None + + self.et_inspector: Inspector = Inspector( + etdump_path=self.etdump_path, + debug_buffer_path=self.debug_buffer_path, + etrecord=self.etrecord_path, + source_time_scale=TimeScale.CYCLES, + target_time_scale=TimeScale.CYCLES, + ) + + def get_outputs(self, log_to_stdout: bool = False) -> Tuple[torch.Tensor]: + output = [ + event_block.run_output + for event_block in self.et_inspector.event_blocks + if event_block.name == "Execute" + ] + logging.debug(f"[CadenceETDump] output: {output}") + return output[0] + + def get_execute_event_block(self) -> EventBlock: + exec_blocks = [ + eb for 
eb in self.et_inspector.event_blocks if eb.name == "Execute" + ] + return exec_blocks[0] + + def should_include_event(self, event: Event) -> bool: + # exclude duplicate events + if event.name in ("OPERATOR_CALL", "Method::execute"): + return False + + # exclude custom multi-zion events + if event.name.startswith("DELEGATE_ZION"): + return False + + return True + + def print_summary( + self, + bundled_prog_size: Optional[int] = None, + external_link: Optional[str] = None, + ) -> None: + """ + Print performance summary with optional program size and external link. + + Args: + bundled_prog_size: Size of the bundled program in bytes (optional) + external_link: External analytics/monitoring link (optional, e.g., Scuba link for Meta internal use) + """ + block = self.get_execute_event_block() + op_events = [e for e in block.events if self.should_include_event(e)] + op_time_sum = sum([cast(PerfData, e.perf_data).avg for e in op_events]) + + overall_event = [ev for ev in block.events if ev.name == "Method::execute"] + if not len(overall_event) == 1: + logging.warning( + f"Expected one 'Method::execute' event, found {len(overall_event)}" + ) + + total_cycles = cast(PerfData, overall_event[0].perf_data).avg + op_cycles = op_time_sum + + # Build table data and headers dynamically based on what's provided + table_data = [ + "{:,.0f}".format(total_cycles), + "{:,.0f}".format(op_cycles), + "{:,.0f}".format(total_cycles - op_cycles), + "{:.2%}".format((total_cycles - op_cycles) / total_cycles), + ] + headers = [ + "Total Cycles", + "Cycles in Ops", + "Other Cycles", + "Framework Tax (%)", + ] + + # Add optional fields if provided + if bundled_prog_size is not None: + table_data.append("{:,.0f}".format(bundled_prog_size)) + headers.append("Bundled Program Size (bytes)") + + if external_link is not None: + table_data.append(external_link) + headers.append("External Link") + + logging.info( + "Performance Summary:\n%s", + tabulate( + [table_data], + headers=headers, + 
tablefmt="outline", + ), + ) + + def print_event_block(self) -> None: + logging.info("Profiled events:") + if logging.getLogger().level <= logging.INFO: + self.et_inspector.print_data_tabular() + + def dump_intermediate_tensors(self) -> None: + if self.etrecord_path is None: + logging.info("[CadenceETDump] Intermediate tensors not available") + return + + logging.info( + f"[CadenceETDump] Dumping intermediate tensors to {self.tensor_dump_dir}" + ) + os.makedirs(self.tensor_dump_dir, exist_ok=True) + exec_blocks = [ + eb for eb in self.et_inspector.event_blocks if eb.name == "Execute" + ] + if len(exec_blocks) > 1: + logging.warning( + f'Found {len(exec_blocks)} "Execute" blocks, using the first one and ignoring the rest.' + ) + block = exec_blocks[0] + + # OPERATOR_CALL events are duplicates that contain framework tax data. We don't need them + op_events = [e for e in block.events if e.name != "OPERATOR_CALL"] + torch.set_printoptions(profile="full") + + for event in op_events: + instr_id = event._instruction_id + if not event.debug_data: + logging.debug( + f"Missing intermediate tensor data for {event.name} ({instr_id=})" + ) + continue + + with open(f"{self.tensor_dump_dir}/{instr_id}.txt", "w") as f: + for dd in event.debug_data: + f.write(f"{str(dd)}\n\n") + torch.set_printoptions(profile="default") diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/runtime/runtime.py index 4d1c876bcdb..a7d35fbd0c9 100644 --- a/backends/cadence/runtime/runtime.py +++ b/backends/cadence/runtime/runtime.py @@ -9,9 +9,8 @@ import logging import numbers -import os import tempfile -from typing import Any, Optional, Sequence, Tuple, Union +from typing import Any, Optional, Sequence, Union import executorch.exir.schema as et_schema @@ -19,8 +18,8 @@ import torch from executorch.backends.cadence.runtime import utils +from executorch.backends.cadence.runtime.etdump import CadenceETDump from executorch.backends.cadence.runtime.executor import Executor -from 
executorch.devtools import Inspector from executorch.exir import ExecutorchProgramManager from executorch.exir._serialize._program import deserialize_pte_binary from executorch.exir.schema import DataLocation @@ -30,90 +29,6 @@ from torch.utils._pytree import TreeSpec -class CadenceETDump: - def __init__(self, output_dir: str) -> None: - self.tensor_dump_dir: str = os.path.join(output_dir, "tensors") - self.etdump_path: str = os.path.join(output_dir, "etdump.etdp") - self.etrecord_path: Optional[str] = os.path.join(output_dir, "etrecord.bin") - self.debug_buffer_path: Optional[str] = os.path.join( - output_dir, "debug_output.bin" - ) - - if not os.path.exists(self.etdump_path): - raise RuntimeError(f"{self.etdump_path} does not exist") - # pyre-ignore[6]: os.path.exists expects str, but got Optional[str] - if not os.path.exists(self.etrecord_path): - logging.warning( - "ETRecord not found, intermediate tensors will not be dumped" - ) - self.etrecord_path = None - # pyre-ignore[6]: os.path.exists expects str, but got Optional[str] - if not os.path.exists(self.debug_buffer_path): - logging.warning( - "Debug buffer not found, intermediate tensors will not be dumped" - ) - self.debug_buffer_path = None - - self.et_inspector: Inspector = Inspector( - etdump_path=self.etdump_path, - debug_buffer_path=self.debug_buffer_path, - etrecord=self.etrecord_path, - ) - - def get_outputs(self, log_to_stdout: bool = False) -> Tuple[torch.Tensor]: - output = [ - event_block.run_output - for event_block in self.et_inspector.event_blocks - if event_block.name == "Execute" - ] - logging.debug(f"[ETdump] output: {output}") - return output[0] - - def print_event_block(self) -> None: - logging.debug("[ETdump] data tabular:") - if logging.getLogger().level <= logging.DEBUG: - self.et_inspector.print_data_tabular() - - def print_event_data(self) -> None: - logging.debug("[ETdump] event data ") - for event_block in self.et_inspector.event_blocks: - for event in event_block.events: - 
logging.debug(event) - - def dump_intermediate_tensors(self) -> None: - if self.etrecord_path is None: - logging.info("[ETdump] Intermediate tensors not available") - return - - logging.info(f"[ETdump] Dumping intermediate tensors to {self.tensor_dump_dir}") - os.makedirs(self.tensor_dump_dir, exist_ok=True) - exec_blocks = [ - eb for eb in self.et_inspector.event_blocks if eb.name == "Execute" - ] - if len(exec_blocks) > 1: - logging.warning( - f'Found {len(exec_blocks)} "Execute" blocks, using the first one and ignoring the rest.' - ) - block = exec_blocks[0] - - # OPERATOR_CALL events are duplicates that contain framework tax data. We don't need them - op_events = [e for e in block.events if e.name != "OPERATOR_CALL"] - torch.set_printoptions(profile="full") - - for event in op_events: - instr_id = event._instruction_id - if not event.debug_data: - logging.debug( - f"Missing intermediate tensor data for {event.name} ({instr_id=})" - ) - continue - - with open(f"{self.tensor_dump_dir}/{instr_id}.txt", "w") as f: - for dd in event.debug_data: - f.write(f"{str(dd)}\n\n") - torch.set_printoptions(profile="default") - - def get_op_names(program: et_schema.Program, execution_plan_id: int = 0) -> set[str]: """ Get the list of operators from a Program @@ -162,6 +77,9 @@ def run( etdump = CadenceETDump(output_dir=working_dir) outputs = etdump.get_outputs() + # Print performance summary + etdump.print_summary() + assert isinstance(out_spec, TreeSpec) outputs = torch.utils._pytree.tree_unflatten(outputs, out_spec) diff --git a/backends/cadence/runtime/targets.bzl b/backends/cadence/runtime/targets.bzl index dabe42ad824..09a116764c2 100644 --- a/backends/cadence/runtime/targets.bzl +++ b/backends/cadence/runtime/targets.bzl @@ -13,3 +13,17 @@ def define_common_targets(): "//executorch/runtime/platform:platform", ], ) + + runtime.python_library( + name = "etdump", + srcs = ["etdump.py"], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS" + ], + 
deps = [ + "fbcode//executorch/devtools:lib", + "fbcode//executorch/devtools/inspector:inspector_utils", + "fbsource//third-party/pypi/tabulate:tabulate", + ], + ) diff --git a/backends/cadence/utils/facto_util.py b/backends/cadence/utils/facto_util.py index 5b204e99fcb..e49cf412c19 100644 --- a/backends/cadence/utils/facto_util.py +++ b/backends/cadence/utils/facto_util.py @@ -22,9 +22,95 @@ MAX_CASES = 50 +# Global cache to store generated shapes per tensor to ensure consistency +_shape_cache: dict[str, list[int]] = {} + + def apply_tensor_contraints(op_name: str, index: int) -> list[object]: - # Constraint to limit tensor size product to < 4000 - max_size_constraint = cp.Size.Le(lambda deps, r, d: max(1, int((3999) ** (1 / r)))) + # Constraint to limit tensor size to < 4000 bytes with fully randomized shapes + import random + + def get_dtype_bytes(dtype: torch.dtype) -> int: + """Get the number of bytes per element for a given dtype""" + dtype_bytes = { + torch.int8: 1, + torch.uint8: 1, + torch.int16: 2, + torch.uint16: 2, + torch.int32: 4, + torch.float32: 4, + torch.int64: 8, + torch.float64: 8, + torch.bool: 1, + torch.float: 4, # alias for float32 + torch.int: 4, # alias for int32 + torch.long: 8, # alias for int64 + } + return dtype_bytes.get(dtype, 4) # Default to 4 bytes if dtype not found + + def generate_random_shape_with_byte_limit( + rank: int, dtype: torch.dtype, max_bytes: int = 3999, seed_base: int = 42 + ) -> list[int]: + """Generate a random shape with given rank ensuring total byte size < max_bytes""" + random.seed(seed_base + rank) + + bytes_per_element = get_dtype_bytes(dtype) + max_elements = max_bytes // bytes_per_element + + # Start with all dimensions as 1 + shape = [1] * rank + remaining_elements = ( + max_elements - 1 + ) # Leave room since we start with product=1 + + # Randomly distribute the remaining capacity across dimensions + for i in range(rank): + if remaining_elements <= 1: + break + + # Calculate maximum size this dimension 
can have without exceeding limit + current_product = 1 + for j in range(rank): + if j != i: + current_product *= shape[j] + + max_size_for_dim = min( + remaining_elements // current_product, 50 + ) # Cap at 50 + if max_size_for_dim > shape[i]: + # Randomly choose a size between current and max + new_size = random.randint(shape[i], max_size_for_dim) + shape[i] = new_size + remaining_elements = max_elements // (current_product * new_size) + remaining_elements = max(1, remaining_elements) + + # Final random shuffle of the dimensions to make it more random + random.shuffle(shape) + return shape + + def random_size_constraint(deps: object, r: int, d: int) -> int: + """Generate random sizes ensuring total byte size < 4000 bytes""" + # Use conservative approach: assume worst case is 4 bytes per element (float32/int32) + # This ensures we never exceed 4000 bytes regardless of actual dtype + worst_case_dtype = torch.float32 # 4 bytes per element + + # Create a unique key for this tensor configuration + cache_key = f"{r}_{d}_conservative" + + if cache_key not in _shape_cache: + # Generate a new random shape for this rank using worst-case byte estimation + shape = generate_random_shape_with_byte_limit( + r, worst_case_dtype, max_bytes=3999, seed_base=42 + r * 10 + d + ) + _shape_cache[cache_key] = shape + + # Return the size for dimension d, ensuring we don't go out of bounds + cached_shape = _shape_cache[cache_key] + return cached_shape[d] if d < len(cached_shape) else 1 + + max_size_constraint = cp.Size.Le( + lambda deps, r, d: random_size_constraint(deps, r, d) + ) tensor_constraints = ( [ @@ -81,7 +167,7 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: cp.Size.Ge(lambda deps, r, d: 1), max_size_constraint, ] - else: + elif index == 1: # input tensor(a) tensor_constraints = [ cp.Dtype.In( lambda deps: [ @@ -99,6 +185,25 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: cp.Size.Ge(lambda deps, r, d: 1), max_size_constraint, ] 
+ else: # input tensor(b) + tensor_constraints = [ + cp.Dtype.In( + lambda deps: [ + torch.int8, + torch.int16, + torch.uint8, + torch.uint16, + torch.int32, + torch.float32, + ] + ), + cp.Dtype.Eq(lambda deps: deps[1].dtype), + cp.Value.Ge(lambda deps, dtype, struct: -(2**4)), + cp.Value.Le(lambda deps, dtype, struct: 2**4), + cp.Rank.Ge(lambda deps: 1), + cp.Size.Ge(lambda deps, r, d: 1), + max_size_constraint, + ] case "embedding.default": tensor_constraints = [ cp.Dtype.In(lambda deps: [torch.float, torch.int]), @@ -117,6 +222,34 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: cp.Value.Le(lambda deps, dtype, struct: 2), ] ) + case "transpose_copy.int": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32, torch.int32]), + ] + ) + case "permute_copy.default": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32, torch.int8, torch.uint8]), + cp.Rank.Le( + lambda deps: 5 + ), # xa_nn_transpose only supports up to 5D + cp.Rank.Ge(lambda deps: 1), # Must have at least 1 dimension + ] + ) + case "sqrt.default": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32, torch.int32]), + ] + ) + case "clamp.default": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32, torch.int32]), + ] + ) case "rsqrt.default": tensor_constraints.extend( [ @@ -127,6 +260,12 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: cp.Value.Le(lambda deps, dtype, struct: 2**2), ] ) + case "relu.default": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32]), + ] + ) case "mean.dim": tensor_constraints.extend( [ @@ -136,10 +275,17 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: case "exp.default": tensor_constraints.extend( [ + cp.Dtype.In(lambda deps: [torch.float32]), cp.Value.Ge(lambda deps, dtype, struct: -(2**2)), cp.Value.Le(lambda deps, dtype, struct: 2**2), ] ) + case "tanh.default": + 
tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32]), + ] + ) case "slice_copy.Tensor": tensor_constraints.extend( [ @@ -148,6 +294,34 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: cp.Value.Le(lambda deps, dtype, struct: 2), ] ) + case "div.Scalar" | "add.Tensor" | "mul.Tensor" | "sub.Tensor": + tensor_constraints.extend( + [ + cp.Dtype.In( + lambda deps: [ + torch.int32, + torch.int64, + torch.float32, + ] + ), + ] + ) + case "split_copy.Tensor": + tensor_constraints.extend( + [ + cp.Dtype.In( + lambda deps: [ + torch.int32, + torch.int64, + torch.float32, + ] + ), + cp.Value.Ge(lambda deps, dtype, struct: 1), + cp.Value.Le(lambda deps, dtype, struct: 2**3), + cp.Rank.Le(lambda deps: 3), + cp.Size.Le(lambda deps, r, d: 2**2), + ] + ) case "constant_pad_nd.default": tensor_constraints.extend( [ @@ -178,6 +352,12 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: cp.Rank.Le(lambda deps: 2**2), ] ) + case "pow.Tensor_Scalar": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32, torch.int32]), + ] + ) case "div.Tensor_mode" | "minimum.default": if index == 0: tensor_constraints = [ diff --git a/backends/cadence/vision/kernels/CMakeLists.txt b/backends/cadence/vision/kernels/CMakeLists.txt new file mode 100644 index 00000000000..fa7b2b5203b --- /dev/null +++ b/backends/cadence/vision/kernels/CMakeLists.txt @@ -0,0 +1,30 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# lint_cmake: -linelength +add_library( + cadence_kernels + kernels.cpp + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/api/tensor_transposef.c + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/api/vsoftmaxf.c + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/tables/expf_tbl.c + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/tables/nanf_tbl.c + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/tables/inff_tbl.c +) + +# Let files say "include ". +set(_common_include_directories + ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 +) + +target_include_directories( + cadence_kernels + PUBLIC . ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/include + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/include_private + ${_common_include_directories} +) + +target_link_libraries(cadence_kernels PRIVATE idma) diff --git a/backends/cadence/vision/kernels/kernels.cpp b/backends/cadence/vision/kernels/kernels.cpp new file mode 100644 index 00000000000..70c811df741 --- /dev/null +++ b/backends/cadence/vision/kernels/kernels.cpp @@ -0,0 +1,198 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +namespace impl { +namespace vision { +namespace kernels { + +void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size) { + Result temp_mem_res = ctx.allocate_temp(size); + return temp_mem_res.ok() ? 
temp_mem_res.get() : nullptr; +} + +// Quantize a fp32 value to an int8_t/uint8_t value +template +T quantize(const float x, float scale, int32_t zero_point) { + constexpr float min_val = std::numeric_limits::min(); + constexpr float max_val = std::numeric_limits::max(); + float tmp = roundf(x * scale + zero_point); + return std::max(std::min(tmp, max_val), min_val); +} + +// Quantize an fp32 array to an int8_t/uint8_t array +template +void quantize( + T* __restrict__ y, + const float* __restrict__ x, + float inv_scale, + int32_t zero_point, + size_t size) { + for (size_t i = 0; i < size; ++i) { + y[i] = quantize(x[i], inv_scale, zero_point); + } +} + +// Dequantize an int8_t/uint8_t value to an fp32 value +template +float dequantize(const T x, float scale, int32_t zero_point) { + return scale * (x - zero_point); +} + +// Dequantize an int8_t/uint8_t/int16_t array to an fp32 array +template +void dequantize( + float* __restrict__ y, + const T* __restrict__ x, + float scale, + int32_t zero_point, + size_t size) { + for (size_t i = 0; i < size; ++i) { + y[i] = dequantize(x[i], scale, zero_point); + } +} + +// Requantize the int8_t/uint8_t in value to a uint8_t/int8_t out value. +// The scale and zero_point for requantization are in the args. +template +OT requantize( + const IT in, + float in_scale, + int32_t in_zero_point, + float inv_out_scale, + int32_t out_zero_point) { + float dequant = dequantize(in, in_scale, in_zero_point); + return quantize(dequant, inv_out_scale, out_zero_point); +} + +// Requantize the int8_t/uint8_t in array to a uint8_t/int8_t out array. +// The scale and zero_point for requantization are in the args. 
+template +void requantize( + OT* __restrict__ out, + const IT* __restrict__ in, + float in_scale, + int32_t in_zero_point, + float inv_out_scale, + int32_t out_zero_point, + size_t size) { + for (size_t i = 0; i < size; ++i) { + out[i] = requantize( + in[i], in_scale, in_zero_point, inv_out_scale, out_zero_point); + } +} + +// explicit template instantiation + +#define typed_quantize_val(dtype) \ + template dtype quantize(const float x, float inv_scale, int32_t zero_point); +typed_quantize_val(int8_t); +typed_quantize_val(uint8_t); +typed_quantize_val(int16_t); +typed_quantize_val(uint16_t); +typed_quantize_val(int32_t); +#undef typed_quantize_val + +#define typed_quantize_vec(dtype) \ + template void quantize( \ + dtype* __restrict__ y, \ + const float* __restrict__ x, \ + float inv_scale, \ + int32_t zero_point, \ + size_t size); +typed_quantize_vec(int8_t); +typed_quantize_vec(uint8_t); +typed_quantize_vec(int16_t); +typed_quantize_vec(uint16_t); +typed_quantize_vec(int32_t); +#undef typed_quantize_vec + +#define typed_dequantize_val(dtype) \ + template float dequantize(const dtype x, float scale, int32_t zero_point); +typed_dequantize_val(int8_t); +typed_dequantize_val(uint8_t); +typed_dequantize_val(int16_t); +typed_dequantize_val(uint16_t); +typed_dequantize_val(int32_t); +#undef typed_dequantize_val + +#define typed_dequantize_vec(dtype) \ + template void dequantize( \ + float* __restrict__ y, \ + const dtype* __restrict__ x, \ + float scale, \ + int32_t zero_point, \ + size_t size); +typed_dequantize_vec(int8_t); +typed_dequantize_vec(uint8_t); +typed_dequantize_vec(int16_t); +typed_dequantize_vec(uint16_t); +typed_dequantize_vec(int32_t); +#undef typed_dequantize_vec + +#define typed_requantize_val(itype, otype) \ + template otype requantize( \ + const itype in, \ + float in_scale, \ + int32_t in_zero_point, \ + float inv_out_scale, \ + int32_t out_zero_point); +typed_requantize_val(int8_t, int8_t); +typed_requantize_val(int8_t, uint8_t); 
+typed_requantize_val(int8_t, int16_t); +typed_requantize_val(int8_t, uint16_t); +typed_requantize_val(uint8_t, int8_t); +typed_requantize_val(uint8_t, uint8_t); +typed_requantize_val(uint8_t, int16_t); +typed_requantize_val(uint8_t, uint16_t); +typed_requantize_val(int16_t, int8_t); +typed_requantize_val(int16_t, uint8_t); +typed_requantize_val(int16_t, int16_t); +typed_requantize_val(int16_t, uint16_t); +typed_requantize_val(uint16_t, int8_t); +typed_requantize_val(uint16_t, uint8_t); +typed_requantize_val(uint16_t, int16_t); +typed_requantize_val(uint16_t, uint16_t); +#undef typed_requantize_val + +#define typed_requantize_vec(itype, otype) \ + template void requantize( \ + otype* __restrict__ out, \ + const itype* __restrict__ in, \ + float in_scale, \ + int32_t in_zero_point, \ + float inv_out_scale, \ + int32_t out_zero_point, \ + size_t size); +typed_requantize_vec(int8_t, int8_t); +typed_requantize_vec(int8_t, uint8_t); +typed_requantize_vec(int8_t, int16_t); +typed_requantize_vec(int8_t, uint16_t); +typed_requantize_vec(uint8_t, int8_t); +typed_requantize_vec(uint8_t, uint8_t); +typed_requantize_vec(uint8_t, int16_t); +typed_requantize_vec(uint8_t, uint16_t); +typed_requantize_vec(int16_t, int8_t); +typed_requantize_vec(int16_t, uint8_t); +typed_requantize_vec(int16_t, int16_t); +typed_requantize_vec(int16_t, uint16_t); +typed_requantize_vec(uint16_t, int8_t); +typed_requantize_vec(uint16_t, uint8_t); +typed_requantize_vec(uint16_t, int16_t); +typed_requantize_vec(uint16_t, uint16_t); +#undef typed_requantize_vec + +}; // namespace kernels +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/kernels/kernels.h b/backends/cadence/vision/kernels/kernels.h new file mode 100644 index 00000000000..e86a36515ec --- /dev/null +++ b/backends/cadence/vision/kernels/kernels.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include "inttypes.h" +#include "stddef.h" + +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::Result; + +namespace impl { +namespace vision { +namespace kernels { + +void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size); + +template +T quantize(const float x, float scale, int32_t zero_point); + +template +float dequantize(const T x, float scale, int32_t zero_point); + +template +void quantize( + T* __restrict__ y, + const float* __restrict__ x, + float scale, + int32_t zero_point, + size_t size); + +// Dequantize an int8_t/uint8_t/int16_t array to an fp32 array +template +void dequantize( + float* __restrict__ y, + const T* __restrict__ x, + float scale, + int32_t zero_point, + size_t size); + +template +OT requantize( + const IT in, + float in_scale, + int32_t in_zero_point, + float inv_out_scale, + int32_t out_zero_point); + +template +void requantize( + OT* __restrict__ out, + const IT* __restrict__ in, + float in_scale, + int32_t in_zero_point, + float inv_out_scale, + int32_t out_zero_point, + size_t size); + +}; // namespace kernels +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/kernels/targets.bzl b/backends/cadence/vision/kernels/targets.bzl new file mode 100644 index 00000000000..02136c872b3 --- /dev/null +++ b/backends/cadence/vision/kernels/targets.bzl @@ -0,0 +1,25 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "cadence_kernels", + srcs = ["kernels.cpp"], + exported_headers = [ + "kernels.h", + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + platforms = CXX, + compatible_with = select({ + "DEFAULT": [], + 
"ovr_config//cpu:xtensa": ["ovr_config//cpu:xtensa"], + }), + define_static_target = True, + deps = [ + "//executorch/backends/cadence/vision/third-party:vision-nnlib", + "//executorch/runtime/kernel:kernel_includes", + ], + ) diff --git a/backends/cadence/vision/operators/CMakeLists.txt b/backends/cadence/vision/operators/CMakeLists.txt new file mode 100644 index 00000000000..76b784681be --- /dev/null +++ b/backends/cadence/vision/operators/CMakeLists.txt @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +cmake_minimum_required(VERSION 3.19) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) + +if(NOT PYTHON_EXECUTABLE) + resolve_python_executable() +endif() + +# ATen compliant ops that are needed to run this model. 
+set(_aten_ops__srcs + "${CMAKE_CURRENT_SOURCE_DIR}/op_add.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_embedding.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_full.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_view_copy.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_softmax.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_hardtanh.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_max_pool2d_with_indices.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mean.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_rsqrt.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" + 
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_expand_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_transpose_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_eq.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_logical_not.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_any.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_native_group_norm.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sum.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_select_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/delinearize_index.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/select_copy_util.cpp" +) +add_library(aten_ops_cadence ${_aten_ops__srcs}) +target_link_libraries(aten_ops_cadence PUBLIC executorch) +target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) + +# Let files say "include ". +set(_common_include_directories + ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 +) + +target_include_directories( + aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) + +# Custom ops that are needed to run the test model. +add_library( + custom_ops + "op_quantized_linear_out.cpp" + "op_quantized_conv_out.cpp" + "op_quantized_relu_out.cpp" + "op_quantized_layer_norm.cpp" + "op_quantize_per_tensor.cpp" + "op_quantized_fully_connected_out.cpp" + "op_dequantize_per_tensor.cpp" + "op_quantized_matmul_out.cpp" + "op_requantize_out.cpp" + "op_im2row_out.cpp" +) +target_include_directories( + custom_ops PUBLIC ${ROOT_DIR}/.. 
${CMAKE_BINARY_DIR} + ${_common_include_directories} +) + +target_link_libraries(custom_ops PUBLIC executorch) +target_link_libraries(custom_ops PRIVATE cadence_kernels) + +# Generate C++ bindings to register kernels into both PyTorch (for AOT) and +# Executorch (for runtime). Here select all ops in functions_vision.yaml +gen_selected_ops( + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML + "${CMAKE_CURRENT_LIST_DIR}/../../aot/functions_vision.yaml" "" "" +) +generate_bindings_for_kernels( + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML FUNCTIONS_YAML + ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions_vision.yaml +) +message("Generated cadence x86 files ${gen_command_sources}") + +gen_operators_lib( + LIB_NAME "cadence_ops_lib" KERNEL_LIBS custom_ops DEPS aten_ops_cadence +) diff --git a/backends/cadence/generic/operators/op_add.cpp b/backends/cadence/vision/operators/op_add.cpp similarity index 72% rename from backends/cadence/generic/operators/op_add.cpp rename to backends/cadence/vision/operators/op_add.cpp index 89b67467605..81014143275 100644 --- a/backends/cadence/generic/operators/op_add.cpp +++ b/backends/cadence/vision/operators/op_add.cpp @@ -11,8 +11,18 @@ #include #include -namespace torch { -namespace executor { +using executorch::aten::Scalar; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::canCast; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::Error; +using torch::executor::native::utils::extract_scalar; + +namespace impl { +namespace vision { namespace native { Tensor& add_out( @@ -23,6 +33,8 @@ Tensor& add_out( Tensor& out) { (void)ctx; + using namespace torch::executor::native::utils; + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type); @@ -39,7 +51,9 @@ Tensor& add_out( using CTYPE_IN = float; using CTYPE_OUT 
= float; CTYPE_IN alpha_val; - ET_EXTRACT_SCALAR(alpha, alpha_val); + ET_CHECK_MSG( + extract_scalar(alpha, &alpha_val), + "Could not be extracted: wrong type or out of range"); apply_binary_elementwise_fn( [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) { @@ -57,5 +71,5 @@ Tensor& add_out( } } // namespace native -} // namespace executor -} // namespace torch +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp new file mode 100644 index 00000000000..daffecda1bf --- /dev/null +++ b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; + +void dequantize_per_tensor_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + + if (input.scalar_type() == ScalarType::Byte) { + const uint8_t* input_data = input.const_data_ptr(); + kernels::dequantize( + out_data, input_data, scale, zero_point, numel); + } else if (input.scalar_type() == ScalarType::Char) { + const int8_t* input_data = input.const_data_ptr(); + kernels::dequantize(out_data, input_data, scale, zero_point, numel); + } else if ( + input.scalar_type() == ScalarType::Bits16 || + input.scalar_type() == ScalarType::UInt16) { + const uint16_t* input_data = input.const_data_ptr(); + kernels::dequantize( + out_data, input_data, scale, 
zero_point, numel); + } else if (input.scalar_type() == ScalarType::Short) { + const int16_t* input_data = input.const_data_ptr(); + kernels::dequantize( + out_data, input_data, scale, zero_point, numel); + } else if (input.scalar_type() == ScalarType::Int) { + const int32_t* input_data = input.const_data_ptr(); + kernels::dequantize( + out_data, input_data, scale, zero_point, numel); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(input.scalar_type())); + } +} + +}; // namespace native +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/generic/operators/op_embedding.cpp b/backends/cadence/vision/operators/op_embedding.cpp similarity index 92% rename from backends/cadence/generic/operators/op_embedding.cpp rename to backends/cadence/vision/operators/op_embedding.cpp index ce28789a156..5273cb083e8 100644 --- a/backends/cadence/generic/operators/op_embedding.cpp +++ b/backends/cadence/vision/operators/op_embedding.cpp @@ -8,13 +8,13 @@ #include -namespace torch { -namespace executor { -namespace native { - using executorch::aten::Tensor; using executorch::runtime::KernelRuntimeContext; +namespace impl { +namespace vision { +namespace native { + void embedding_out( KernelRuntimeContext& ctx, const Tensor& weight, @@ -37,5 +37,5 @@ void embedding_out( } } // namespace native -} // namespace executor -} // namespace torch +} // namespace vision +} // namespace impl diff --git a/backends/cadence/generic/operators/op_full.cpp b/backends/cadence/vision/operators/op_full.cpp similarity index 68% rename from backends/cadence/generic/operators/op_full.cpp rename to backends/cadence/vision/operators/op_full.cpp index 21d5fc56299..afc29718a2b 100644 --- a/backends/cadence/generic/operators/op_full.cpp +++ b/backends/cadence/vision/operators/op_full.cpp @@ -9,12 +9,18 @@ #include #include -namespace torch { -namespace executor { -namespace native { - +using executorch::aten::IntArrayRef; +using executorch::aten::Scalar; 
using executorch::aten::ScalarType; using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; +using torch::executor::Error; +using torch::executor::native::utils::extract_scalar; +using torch::executor::native::utils::get_scalar_dtype; + +namespace impl { +namespace vision { +namespace native { Tensor& full_out( KernelRuntimeContext& ctx, @@ -23,7 +29,7 @@ Tensor& full_out( Tensor& out) { (void)ctx; - ScalarType val_type = utils::get_scalar_dtype(fill_value); + ScalarType val_type = get_scalar_dtype(fill_value); ScalarType out_type = out.scalar_type(); Error err = resize_tensor(out, sizes); @@ -31,7 +37,9 @@ Tensor& full_out( ET_SWITCH_REAL_TYPES_AND(Bool, val_type, ctx, "full", CTYPE_VAL, [&] { CTYPE_VAL val; - ET_EXTRACT_SCALAR(fill_value, val); + ET_CHECK_MSG( + extract_scalar(fill_value, &val), + "Could not be extracted: wrong type or out of range"); ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "full", CTYPE_OUT, [&] { CTYPE_OUT val_casted = static_cast(val); @@ -46,5 +54,5 @@ Tensor& full_out( } } // namespace native -} // namespace executor -} // namespace torch +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_im2row_out.cpp b/backends/cadence/vision/operators/op_im2row_out.cpp new file mode 100644 index 00000000000..501f8ce5376 --- /dev/null +++ b/backends/cadence/vision/operators/op_im2row_out.cpp @@ -0,0 +1,298 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +namespace impl { +namespace vision { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +template +__attribute__((always_inline)) void im2row_( + const T* __restrict__ data_im, + const int32_t in_zero_point, + /* input parameters*/ + const int32_t channels, + const int32_t height, + const int32_t width, + /* output parameters */ + const int32_t out_height, + const int32_t out_width, + /* convolution parameters */ + const int32_t kernel_h, + const int32_t kernel_w, + const int32_t pad_h, + const int32_t pad_w, + const int32_t stride_h, + const int32_t stride_w, + const int32_t dilation_h, + const int32_t dilation_w, + T* __restrict__ data_col, + bool channels_last) { + // Consider convolving the input image of dimensions channels * height * width + // (or height * width * channels for NHWC layout) with a filter of dimensions + // channels * kernels_h * kernels_w. Assume that this convolution will produce + // an output of dimensions out_height x out_width. For each point in the output, + // im2row takes the data from the input that is used in the computation of + // that output point, and flattens it into a vector of size channels_col = + // channels * kernel_h * kernel_w. The output of im2row will therefore be a 2D + // array of size (out_height * out_width) x channels_col + const int32_t channels_col = channels * kernel_h * kernel_w; + + // If the layout is NHWC, we can copy 'channels' worth of contiguous data + // points when performing im2row. + if (channels_last) { + // Iterate over the output domain + for (int _h = 0; _h < out_height; ++_h) { + for (int _w = 0; _w < out_width; ++_w) { + int32_t i_col = _h * out_width + _w; + // Each point in the output domain is the result of applying a filter of + // size kernel_h x kernel_w x channels on the input.
But since channels + // is contiguous, we will not explicitly have a loop for it. + for (int _kh = 0; _kh < kernel_h; ++_kh) { + int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h; + for (int _kw = 0; _kw < kernel_w; ++_kw) { + int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w; + + // h_im and w_im are the actual height and width coordinates of the + // input tensor from where we need to copy 'channels' points. + const T* __restrict__ slice_im = + data_im + (h_im * width + w_im) * channels; + T* __restrict__ slice_col = data_col + i_col * channels_col + + (_kh * kernel_w + _kw) * channels; + // If the coordinates were within the input domain, we copy + // 'channels' contiguous values. Otherwise we will fill the output + // with 0's. + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + std::memcpy(slice_col, slice_im, channels * sizeof(T)); + } else { + std::fill_n(slice_col, channels, T(in_zero_point)); + } + } + } + } + } + } else { + // Iterate over the output domain + for (int _h = 0; _h < out_height; ++_h) { + for (int _w = 0; _w < out_width; ++_w) { + int32_t i_col = _h * out_width + _w; + + // Each point in the output domain is the result of applying a filter + // of size channels * kernel_h x kernel_w on the input + for (int _c = 0; _c < channels; ++_c) { + for (int _kh = 0; _kh < kernel_h; ++_kh) { + for (int _kw = 0; _kw < kernel_w; ++_kw) { + // c_col is the linearized access in the channels_col vector. + int32_t c_col = (_c * kernel_h + _kh) * kernel_w + _kw; + // h_im and w_im are the actual height and width coordinates of + // the input tensor that we need to copy to the output. + int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h; + int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w; + // If the current data access is within the input tensor, copy the + // value + data_col[i_col * channels_col + c_col] = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + ? 
data_im[(_c * height + h_im) * width + w_im] + : static_cast(in_zero_point); + } + } + } + } + } + } +} + +void im2row_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef dilation, + IntArrayRef padding, + IntArrayRef stride, + const Tensor& in_zero_point, + bool channel_last, + Tensor& out) { + // Compute the input tensor's dims + bool unit_height = input.dim() == 3; + const int32_t batch_size = input.size(0); + const int32_t in_c = + channel_last ? input.size(3 - unit_height) : input.size(1); + const int32_t in_h = + unit_height ? 1 : (channel_last ? input.size(1) : input.size(2)); + const int32_t in_w = + channel_last ? input.size(2 - unit_height) : input.size(3 - unit_height); + + // Get the kernel parameters + int32_t kernel_h = kernel_size[0]; + int32_t kernel_w = kernel_size[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t pad_h = padding[0]; + int32_t pad_w = padding[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + + // If we were to apply a convolution on the input tensor, compute the output + // height and width. + int32_t out_h = + (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; + int32_t out_w = + (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; + + ET_DCHECK_MSG( + (out_h * out_w) == out.size(1), "dimension mismatch for output"); + ET_DCHECK_MSG( + (kernel_h * kernel_w * in_c) == out.size(2), + "dimension mismatch for output"); + + // Check if the input is per-tensor quantized or per-channel quantized. The + // zero point for each batch could differ for per-channel quantized input. 
+ bool per_tensor_quantized = in_zero_point.numel() == 1; + +#define typed_im2row(dtype, ctype) \ + case ScalarType::dtype: { \ + const ctype* __restrict__ in_data = input.const_data_ptr(); \ + ctype* __restrict__ out_data = out.mutable_data_ptr(); \ + const int32_t* __restrict__ zero_point = \ + in_zero_point.const_data_ptr(); \ + int32_t in_plane = in_c * in_h * in_w; \ + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; \ + for (size_t n = 0; n < batch_size; ++n) { \ + im2row_( \ + &in_data[n * in_plane], \ + per_tensor_quantized ? zero_point[0] : zero_point[n], \ + in_c, \ + in_h, \ + in_w, \ + out_h, \ + out_w, \ + kernel_h, \ + kernel_w, \ + pad_h, \ + pad_w, \ + stride_h, \ + stride_w, \ + dilation_h, \ + dilation_w, \ + &out_data[n * out_plane], \ + channel_last); \ + } \ + break; \ + } + + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row(Float, float); + typed_im2row(Byte, uint8_t); + typed_im2row(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row not implemented for dtype %s", + torch::executor::toString(dtype)); + } +#undef typed_im2row +} + +void im2row_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef dilation, + IntArrayRef padding, + IntArrayRef stride, + int64_t in_zero_point, + bool channel_last, + Tensor& out) { + // Compute the input tensor's dims + bool unit_height = input.dim() == 3; + const int32_t batch_size = input.size(0); + const int32_t in_c = + channel_last ? input.size(3 - unit_height) : input.size(1); + const int32_t in_h = + unit_height ? 1 : (channel_last ? input.size(1) : input.size(2)); + const int32_t in_w = + channel_last ? 
input.size(2 - unit_height) : input.size(3 - unit_height); + + // Get the kernel parameters + int32_t kernel_h = kernel_size[0]; + int32_t kernel_w = kernel_size[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t pad_h = padding[0]; + int32_t pad_w = padding[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + + // If we were to apply a convolution on the input tensor, compute the output + // height and width. + int32_t out_h = + (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; + int32_t out_w = + (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; + + ET_DCHECK_MSG( + (out_h * out_w) == out.size(1), "dimension mismatch for output"); + ET_DCHECK_MSG( + (kernel_h * kernel_w * in_c) == out.size(2), + "dimension mismatch for output"); + +#define typed_im2row_per_tensor(dtype, ctype) \ + case ScalarType::dtype: { \ + const ctype* __restrict__ in_data = input.const_data_ptr(); \ + ctype* __restrict__ out_data = out.mutable_data_ptr(); \ + int32_t in_plane = in_c * in_h * in_w; \ + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; \ + for (size_t n = 0; n < batch_size; ++n) { \ + im2row_( \ + &in_data[n * in_plane], \ + in_zero_point, \ + in_c, \ + in_h, \ + in_w, \ + out_h, \ + out_w, \ + kernel_h, \ + kernel_w, \ + pad_h, \ + pad_w, \ + stride_h, \ + stride_w, \ + dilation_h, \ + dilation_w, \ + &out_data[n * out_plane], \ + channel_last); \ + } \ + break; \ + } + + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row_per_tensor(Float, float); + typed_im2row_per_tensor(Byte, uint8_t); + typed_im2row_per_tensor(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row.per_tensor not implemented for dtype %s", + torch::executor::toString(dtype)); + } +#undef typed_im2row_per_tensor +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp 
b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp new file mode 100644 index 00000000000..cd72d2de2b5 --- /dev/null +++ b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; + +// Quantize the input tensor (PT2 version). Note that quant_ are not +// used in any computation. +void quantize_per_tensor_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + + if (out.scalar_type() == ScalarType::Byte) { + uint8_t* out_data = out.mutable_data_ptr(); + kernels::quantize( + out_data, input_data, 1. / scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Char) { + int8_t* out_data = out.mutable_data_ptr(); + kernels::quantize( + out_data, input_data, 1. / scale, zero_point, numel); + } else if ( + out.scalar_type() == ScalarType::Bits16 || + out.scalar_type() == ScalarType::UInt16) { + uint16_t* out_data = out.mutable_data_ptr(); + kernels::quantize( + out_data, input_data, 1. / scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Short) { + int16_t* out_data = out.mutable_data_ptr(); + kernels::quantize( + out_data, input_data, 1. / scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Int) { + int32_t* out_data = out.mutable_data_ptr(); + kernels::quantize( + out_data, input_data, 1. 
/ scale, zero_point, numel); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(out.scalar_type())); + } +} + +}; // namespace native +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_conv_out.cpp b/backends/cadence/vision/operators/op_quantized_conv_out.cpp new file mode 100644 index 00000000000..1e1e6c8cdc7 --- /dev/null +++ b/backends/cadence/vision/operators/op_quantized_conv_out.cpp @@ -0,0 +1,608 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +// This implements a generic 2d conv kernel that operates on raw pointers. +// The version handles both quantized and fp32 convolutions. 
+// The input is of shape [n x c x h x w] +// The weight is of shape [oc x wc x wh x ww], where wc == c +// The output is of shape [n x oc x oh x ow] +// The bias is of shape [oc] +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv2d_nchw_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t c, + int32_t h, + int32_t w, + int32_t oc, + int32_t wc, + int32_t wh, + int32_t ww, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. / out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * c * h * w; + OT* out_batch = p_out + _n * oc * oh * ow; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + OT* out_plane = out_batch + _oc * oh * ow; + const WT* weight_batch = p_weight + _oc * wc * wh * ww; + // We compute one output channel at a time. 
The computation can be + // thought of as a stencil computation: we iterate over an input of size + // icpg x h x w, with a stencil of size icpg x wh x ww, to compute an + // output channel of size 1 x oh x ow. + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to the + // output channel being computed) with the corresponding weight + // channel. + // If the padding is 0, and dilation is 1, then we can remove the + // unnecessary checks, and simplify the code so that it can be + // vectorized by Tensilica compiler. + if (zero_pad_unit_dilation) { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + int ioff = (_h + _wh) * w + (_w + _ww); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } else { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1) < w)) { + int ioff = + (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? 
weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_plane[_oh * ow + _ow] = + kernels::quantize(val, inv_out_scale, out_zero_point); + } else { + out_plane[_oh * ow + _ow] = acc; + } + } + } + } + } + } +} + +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv2d_nhwc_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t h, + int32_t w, + int32_t c, + int32_t oc, + int32_t wh, + int32_t ww, + int32_t wc, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. 
/ out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * h * w * c; + OT* out_batch = p_out + _n * oh * ow * oc; + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + OT* out_line = out_batch + (_oh * ow + _ow) * oc; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + const WT* weight_batch = p_weight + _oc * wh * ww * wc; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of + // size h x w x icpg, with a stencil of size wh x ww x icpg, to + // compute an output channel of size oh x ow x 1. + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to + // the output channel being computed) with the corresponding + // weight channel. 
If the padding is 0, and dilation is 1, then + // we can remove the unnecessary checks, and simplify the code + // so that it can be vectorized by Tensilica compiler.x`` + if (zero_pad_unit_dilation) { + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + const IT* in_line = + in_batch + (_h + _wh) * w * c + (_w + _ww) * c; + const WT* weight_line = + weight_batch + _wh * ww * wc + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } else { + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1 < w))) { + const IT* in_line = in_batch + + (_h + d0 * _wh - p0) * w * c + (_w + d1 * _ww - p1) * c; + const WT* weight_line = + weight_batch + _wh * ww * wc + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_line[_oc] = + kernels::quantize(val, inv_out_scale, out_zero_point); + } else { + out_line[_oc] = acc; + } + } + } + } + } + } +} + +// The quantized convolution kernel. in_scale and weight_scale are implicit in +// bias_scale, since it is a product of the two. The kernel will branch to +// quantized::conv1d or quantized::conv2d based on the dimensionality of +// activation tensor. 
+void quantized_conv_nchw( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + bool conv1d = input.dim() == 3; + // input = [n, c, h, w] + const int n = input.size(0); + const int c = input.size(1); + const int h = conv1d ? 1 : input.size(2); + const int w = conv1d ? input.size(2) : input.size(3); + // weight = [oc, wc, wh, ww] + const int oc = weight.size(0); + const int wc = weight.size(1); + const int wh = conv1d ? 1 : weight.size(2); + const int ww = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oc, oh, ow] + const int oh = conv1d ? 1 : out.size(2); + const int ow = conv1d ? out.size(2) : out.size(3); + +#define typed_quantized_conv2d_nchw(ctype, dtype) \ + case ScalarType::dtype: { \ + conv2d_nchw_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + c, \ + h, \ + w, \ + oc, \ + wc, \ + wh, \ + ww, \ + oh, \ + ow, \ + stride[0], \ + stride[1], \ + padding[0], \ + padding[1], \ + dilation[0], \ + dilation[1], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv2d_nchw); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_conv2d_nchw +} + +void quantized_conv_nhwc( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { 
+ bool conv1d = input.dim() == 3; + // input = [n, h, w, c] + const int n = input.size(0); + const int h = conv1d ? 1 : input.size(1); + const int w = conv1d ? input.size(1) : input.size(2); + const int c = conv1d ? input.size(2) : input.size(3); + // weight = [oc, wh, ww, wc] + const int oc = weight.size(0); + const int wh = conv1d ? 1 : weight.size(1); + const int ww = conv1d ? weight.size(1) : weight.size(2); + const int wc = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oh, ow, oc] + const int oh = conv1d ? 1 : out.size(1); + const int ow = conv1d ? out.size(1) : out.size(2); + +#define typed_quantized_conv2d_nhwc(ctype, dtype) \ + case ScalarType::dtype: { \ + conv2d_nhwc_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + h, \ + w, \ + c, \ + oc, \ + wh, \ + ww, \ + wc, \ + oh, \ + ow, \ + stride[0], \ + stride[1], \ + padding[0], \ + padding[1], \ + dilation[0], \ + dilation[1], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv2d_nhwc); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_conv2d_nhwc +} + +void quantized_conv_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED const Tensor& out_multiplier, + __ET_UNUSED const Tensor& out_shift, + bool channel_last, + Tensor& out) { + const float bias_scale_float = bias_scale.const_data_ptr()[0]; + const int32_t weight_zero_point_int = + 
weight_zero_point.const_data_ptr()[0]; + if (channel_last) { + quantized_conv_nhwc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point_int, + bias_scale_float, + output_scale, + output_zero_point, + out); + } else { + quantized_conv_nchw( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point_int, + bias_scale_float, + output_scale, + output_zero_point, + out); + } +} + +void quantized_conv_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + bool channel_last, + Tensor& out) { + if (channel_last) { + quantized_conv_nhwc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } else { + quantized_conv_nchw( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } +} + +void quantized_conv2d_nchw_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + int64_t out_multiplier, + int64_t out_shift, + Tensor& out) { + quantized_conv_per_tensor_out( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + 
out_multiplier, + out_shift, + false, // channel_last = false for NCHW + out); +} + +void quantized_conv2d_nhwc_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + int64_t out_multiplier, + int64_t out_shift, + Tensor& out) { + quantized_conv_per_tensor_out( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + true, // channel_last = true for NHWC + out); +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp b/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp new file mode 100644 index 00000000000..29aa8906414 --- /dev/null +++ b/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using ::executorch::aten::optional; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +void quantized_fully_connected_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& in, + const Tensor& weight, + const Tensor& bias, + int64_t in_zero_point, + const Tensor& weight_zero_point_t, + const Tensor& out_multiplier, + const Tensor& out_shift, + int64_t out_zero_point, + __ET_UNUSED const optional& offset, + Tensor& out) { +#define typed_quantized_linear(ctype, dtype) \ + case ScalarType::dtype: { \ + quantized_linear_( \ + in, \ + weight, \ + bias, \ + in_zero_point, \ + weight_zero_point_t, \ + out_multiplier, \ + out_shift, \ + out_zero_point, \ + out); \ + break; \ + } + + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_linear); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } +#undef typed_quantized_linear +} + +void quantized_fully_connected_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& in, + const Tensor& weight, + const Tensor& bias, + int64_t in_zero_point, + int64_t weight_zero_point, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + __ET_UNUSED const optional& offset, + Tensor& out) { +#define typed_quantized_linear(ctype, dtype) \ + case ScalarType::dtype: { \ + quantized_linear_per_tensor_( \ + in, \ + weight, \ + bias, \ + in_zero_point, \ + weight_zero_point, \ + out_multiplier, \ + out_shift, \ + out_zero_point, \ + out); \ + break; \ + } + + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_linear); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } +#undef typed_quantized_linear +} + +}; // namespace native 
+}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_layer_norm.cpp b/backends/cadence/vision/operators/op_quantized_layer_norm.cpp new file mode 100644 index 00000000000..a9685eddedb --- /dev/null +++ b/backends/cadence/vision/operators/op_quantized_layer_norm.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::getLeadingDims; +using ::executorch::runtime::KernelRuntimeContext; + +namespace impl { +namespace vision { +namespace native { + +// Compute quantized layer_norm. The current implementation assumes that the +// input is per-tensor quantized. +template +void quantized_layer_norm_per_tensor_( + const Tensor& input, + double input_scale, + int64_t input_zero_point, + const Tensor& weight, + const Tensor& bias, + double eps, + double output_scale, + int64_t output_zero_point, + Tensor& out) { + // Get the raw pointers to input, output, weight, and bias + const T* __restrict__ in_data = input.const_data_ptr(); + T* __restrict__ out_data = out.mutable_data_ptr(); + const float* __restrict__ weight_data = weight.const_data_ptr(); + const float* __restrict__ bias_data = bias.const_data_ptr(); + + float output_inv_scale = 1.0f / output_scale; + + size_t last_dim = input.size(input.dim() - 1); + size_t leading_dims = getLeadingDims(input, input.dim() - 1); + + // Visualize the input tensor as a set of 1d vectors, and compute the + // layer_norm for each vector. + for (size_t i = 0; i < leading_dims; ++i) { + const T* x = in_data + i * last_dim; + T* y = out_data + i * last_dim; + + // compute sum and squared sum. 
The fp32 sum can be approximated as: + // (X_1 - in_zero_point) * in_scale + (X_2 - in_zero_point) * in_scale + ... + // (X_N - in_zero_point) * in_scale. + int32_t sum = 0; + int32_t sq_sum = last_dim * input_zero_point * input_zero_point; + for (size_t j = 0; j < last_dim; ++j) { + int32_t val = x[j]; + sum += val; + sq_sum += val * val; + } + sq_sum -= (2 * sum * input_zero_point); + sum -= (last_dim * input_zero_point); + + float mean = (input_scale * sum) / last_dim; + float variance = + (sq_sum * input_scale * input_scale) / last_dim - mean * mean; + float inv_std = 1.0f / std::sqrt(variance + eps); + + // y = (x - mean) / std * kGamma + kBeta + for (int j = 0; j < last_dim; ++j) { + // y[j] = (x[j] - mean) / std * kGamma + kBeta; + // Since X is quantized, we dequantize it, compute fp32 result, and + // quantize the result to an int8/uint8 value. + float val = kernels::dequantize(x[j], input_scale, input_zero_point); + + val = (val - mean) * inv_std * weight_data[j] + bias_data[j]; + y[j] = kernels::quantize(val, output_inv_scale, output_zero_point); + } + } +} + +// Compute quantized layer_norm. The current implementation assumes that the +// input is per-tensor quantized. +template +void quantized_layer_norm_( + const Tensor& input, + const Tensor& in_scale, + const Tensor& in_zero_point, + const Tensor& weight, + const Tensor& bias, + double eps, + double output_scale, + int64_t output_zero_point, + Tensor& out) { + // Extract the zero point and scale for input tensor. 
+ float input_scale = in_scale.const_data_ptr()[0]; + int64_t input_zero_point = in_zero_point.const_data_ptr()[0]; + + // Call other overload + quantized_layer_norm_per_tensor_( + input, + input_scale, + input_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); +} + +void quantized_layer_norm_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& in_scale, + const Tensor& in_zero_point, + __ET_UNUSED const executorch::aten::IntArrayRef normalized_shape, + const Tensor& weight, + const Tensor& bias, + double eps, + double output_scale, + int64_t output_zero_point, + Tensor& out) { + if (input.scalar_type() == executorch::aten::ScalarType::Byte) { + quantized_layer_norm_( + input, + in_scale, + in_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); + } else if (input.scalar_type() == executorch::aten::ScalarType::Char) { + quantized_layer_norm_( + input, + in_scale, + in_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(input.scalar_type())); + } +} + +void quantized_layer_norm_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + double in_scale, + int64_t in_zero_point, + __ET_UNUSED const executorch::aten::IntArrayRef normalized_shape, + const Tensor& weight, + const Tensor& bias, + double eps, + double output_scale, + int64_t output_zero_point, + Tensor& out) { + if (input.scalar_type() == executorch::aten::ScalarType::Byte) { + quantized_layer_norm_per_tensor_( + input, + in_scale, + in_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); + } else if (input.scalar_type() == executorch::aten::ScalarType::Char) { + quantized_layer_norm_per_tensor_( + input, + in_scale, + in_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); + } else { + ET_CHECK_MSG( + false, + "Unhandled 
input dtype %hhd", + static_cast(input.scalar_type())); + } +} + +}; // namespace native +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_linear_out.cpp b/backends/cadence/vision/operators/op_quantized_linear_out.cpp new file mode 100644 index 00000000000..b6b7cdd17bc --- /dev/null +++ b/backends/cadence/vision/operators/op_quantized_linear_out.cpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using executorch::aten::Tensor; +using executorch::runtime::getLeadingDims; +using executorch::runtime::KernelRuntimeContext; + +template +void inline _typed_quantized_linear( + const Tensor& src, + const Tensor& weight, + const Tensor& bias, + int64_t src_zero_point, + const Tensor& weight_zero_point_t, + const Tensor& out_multiplier, + const Tensor& out_shift, + int64_t out_zero_point, + Tensor& out) { + const T* __restrict__ src_data = src.const_data_ptr(); + const T* __restrict__ weight_data = weight.const_data_ptr(); + const int32_t* __restrict__ bias_data = bias.const_data_ptr(); + T* __restrict__ out_data = out.mutable_data_ptr(); + + int32_t weight_zero_point = weight_zero_point_t.const_data_ptr()[0]; + + // input comes in shape [batch_size, in_dim] + // weight comes in shape [out_dim, in_dim] + // output comes in empty with shape [batch_size, out_dim] + // Perform matrix multiply (M x N) x (N x P) => M x P + const auto M = weight.size(0); // = out_dim + const auto N = weight.size(1); // = in_dim + + // Given an N-dimensional input [d0, d1, d2, ..., d_{N-2}, d_{N-1}], the + // leading dimensions is d0 * d1 * ... 
* d_{N-2} + const auto leading_dims = getLeadingDims(src, src.dim() - 1); + + ET_CHECK_MSG( + out_multiplier.numel() == 1, "out_multiplier should have one element"); + ET_CHECK_MSG( + out_shift.numel() == 1, "out_multiplier should have one element"); + + const int32_t* __restrict__ out_multiplier_data = + out_multiplier.const_data_ptr(); + const int32_t* __restrict__ out_shift_data = + out_shift.const_data_ptr(); + + // Compute the out_scale from out_multiplier and out_shift + const float out_scale = + -out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]); + + for (int i = 0; i < leading_dims; ++i) { + for (int j = 0; j < M; ++j) { + float sum = bias_data[j]; + for (int k = 0; k < N; ++k) { + sum += (src_data[i * N + k] - src_zero_point) * + (weight_data[j * N + k] - weight_zero_point); + } + out_data[i * M + j] = + kernels::quantize(sum, out_scale, out_zero_point); + } + } +} + +void quantized_linear_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& src, + const Tensor& weight, + const Tensor& bias, + int64_t src_zero_point, + const Tensor& weight_zero_point_t, + const Tensor& out_multiplier, + const Tensor& out_shift, + int64_t out_zero_point, + __ET_UNUSED const executorch::aten::optional& offset, + Tensor& out) { + // TODO: refactor to use switch case as quantized_linear_per_tensor_out + if (out.scalar_type() == executorch::aten::ScalarType::Byte) { + _typed_quantized_linear( + src, + weight, + bias, + src_zero_point, + weight_zero_point_t, + out_multiplier, + out_shift, + out_zero_point, + out); + } else if (out.scalar_type() == executorch::aten::ScalarType::Char) { + _typed_quantized_linear( + src, + weight, + bias, + src_zero_point, + weight_zero_point_t, + out_multiplier, + out_shift, + out_zero_point, + out); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(src.scalar_type())); + } +} + +void quantized_linear_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& src, + const 
Tensor& weight, + const Tensor& bias, + const int64_t src_zero_point, + const int64_t weight_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + const int64_t out_zero_point, + __ET_UNUSED const executorch::aten::optional& offset, + Tensor& out) { +#define typed_quantized_linear_per_tensor(ctype, dtype) \ + case executorch::aten::ScalarType::dtype: { \ + quantized_linear_per_tensor_( \ + src, \ + weight, \ + bias, \ + src_zero_point, \ + weight_zero_point, \ + out_multiplier, \ + out_shift, \ + out_zero_point, \ + out); \ + break; \ + } + + executorch::aten::ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_linear_per_tensor); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", executorch::runtime::toString(dtype)); + } +#undef typed_quantized_linear_per_tensor +} + +}; // namespace native +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_matmul_out.cpp b/backends/cadence/vision/operators/op_quantized_matmul_out.cpp new file mode 100644 index 00000000000..54a303288c3 --- /dev/null +++ b/backends/cadence/vision/operators/op_quantized_matmul_out.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using executorch::aten::Tensor; +using executorch::runtime::getLeadingDims; +using executorch::runtime::KernelRuntimeContext; + +// The quantized matmul. The quantized matmul accumulates in a wider register, +// whose type is TA. 
+template < + typename TZ, + typename TA = float, + bool transposed = false, + typename TX = TZ, + typename TY = TZ> +__attribute__((noinline)) void qmatmul( + TZ* __restrict__ Z, + int32_t Z_multiplier, + int32_t Z_shift, + int32_t Z_zero_point, + const TX* __restrict__ X, + int32_t X_zero_point, + const TY* __restrict__ y, + int32_t Y_zero_point, + size_t m, + size_t n, + size_t p) { + // Compute the Z_scale from Z_multiplier and Z_shift + const float Z_scale = -Z_multiplier * 1.0 / (1 << 31) * pow(2, Z_shift); + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < p; ++j) { + TA sum = 0; + for (size_t k = 0; k < n; ++k) { + if (transposed) { + sum += (X[i * n + k] - X_zero_point) * (y[j * n + k] - Y_zero_point); + } else { + sum += (X[i * n + k] - X_zero_point) * (y[k * p + j] - Y_zero_point); + } + } + Z[i * p + j] = kernels::quantize(sum, Z_scale, Z_zero_point); + } + } +} + +template +void inline _typed_quantized_matmul( + const Tensor& X, + int64_t X_zero_point, + const Tensor& Y, + int64_t Y_zero_point, + const executorch::aten::optional& bias, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + bool transposed, + Tensor& out) { + size_t batch_size = getLeadingDims(X, X.dim() - 2); + size_t leading_dim = X.size(X.dim() - 2); + size_t out_dim = Y.size(Y.dim() - 1 - transposed); + size_t in_dim = X.size(X.dim() - 1); + + T* __restrict__ out_data = out.mutable_data_ptr(); + const T* __restrict__ X_data = X.const_data_ptr(); + const T* __restrict__ Y_data = Y.const_data_ptr(); + for (size_t i = 0; i < batch_size; ++i) { + const T* x = X_data + i * leading_dim * in_dim; + const T* y = Y_data + i * in_dim * out_dim; + T* z = out_data + i * leading_dim * out_dim; + if (transposed) { + qmatmul( + z, + static_cast(out_multiplier), + static_cast(out_shift), + static_cast(out_zero_point), + x, + static_cast(X_zero_point), + y, + static_cast(Y_zero_point), + leading_dim, + in_dim, + out_dim); + } else { + qmatmul( + z, + 
static_cast(out_multiplier), + static_cast(out_shift), + static_cast(out_zero_point), + x, + static_cast(X_zero_point), + y, + static_cast(Y_zero_point), + leading_dim, + in_dim, + out_dim); + } + } +} + +void quantized_matmul_out( + KernelRuntimeContext& ctx, + const Tensor& X, + int64_t X_zero_point, + const Tensor& Y, + int64_t Y_zero_point, + const executorch::aten::optional& bias, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + bool transposed, + Tensor& out) { + if (out.scalar_type() == executorch::aten::ScalarType::Byte) { + _typed_quantized_matmul( + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + out); + } else if (out.scalar_type() == executorch::aten::ScalarType::Char) { + _typed_quantized_matmul( + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + out); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(X.scalar_type())); + } +} + +}; // namespace native +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_relu_out.cpp b/backends/cadence/vision/operators/op_quantized_relu_out.cpp new file mode 100644 index 00000000000..45b9e09b1dd --- /dev/null +++ b/backends/cadence/vision/operators/op_quantized_relu_out.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; + +template +void quantized_relu_( + const Tensor& input, + const Tensor& in_zero_point, + const int64_t out_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& output) { + T q_zero_point = in_zero_point.const_data_ptr()[0]; + const T* __restrict__ in = input.const_data_ptr(); + T* __restrict__ out = output.mutable_data_ptr(); + + const int32_t* __restrict__ out_multiplier_data = + out_multiplier.const_data_ptr(); + const int32_t* __restrict__ out_shift_data = + out_shift.const_data_ptr(); + + // Compute the out_scale from out_multiplier and out_shift + const float out_scale = + -out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]); + + for (size_t i = 0, e = input.numel(); i < e; ++i) { + const T temp = in[i] > q_zero_point ? (in[i] - q_zero_point) : 0; + out[i] = kernels::quantize(temp, out_scale, out_zero_point); + } +} + +void quantized_relu_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& in_zero_point, + const int64_t out_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& output) { + if (input.scalar_type() == executorch::aten::ScalarType::Byte) { + quantized_relu_( + input, + in_zero_point, + out_zero_point, + out_multiplier, + out_shift, + output); + } else if (input.scalar_type() == executorch::aten::ScalarType::Char) { + quantized_relu_( + input, + in_zero_point, + out_zero_point, + out_multiplier, + out_shift, + output); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(input.scalar_type())); + } +} + +template +void quantized_relu_per_tensor_out_( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const int64_t in_zero_point, + const int64_t out_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + Tensor& 
output) { + const T* __restrict__ in = input.const_data_ptr(); + T* __restrict__ out = output.mutable_data_ptr(); + + // Compute the out_scale from out_multiplier and out_shift + const float out_scale = -out_multiplier * 1.0 / (1 << 31) * pow(2, out_shift); + + for (size_t i = 0, e = input.numel(); i < e; ++i) { + const float temp = in[i] > in_zero_point ? (in[i] - in_zero_point) : 0; + out[i] = kernels::quantize(temp, out_scale, out_zero_point); + } +} + +void quantized_relu_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const int64_t in_zero_point, + const int64_t out_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + Tensor& output) { +#define typed_quantized_relu(ctype, dtype) \ + case executorch::aten::ScalarType::dtype: { \ + quantized_relu_per_tensor_out_( \ + ctx, \ + input, \ + in_zero_point, \ + out_zero_point, \ + out_multiplier, \ + out_shift, \ + output); \ + break; \ + } + + executorch::aten::ScalarType dtype = input.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_relu) + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_relu +} + +}; // namespace native +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/operators/op_requantize_out.cpp b/backends/cadence/vision/operators/op_requantize_out.cpp new file mode 100644 index 00000000000..ef538bf4045 --- /dev/null +++ b/backends/cadence/vision/operators/op_requantize_out.cpp @@ -0,0 +1,266 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; + +// Requantize the int8_t/uint8_t input tensor to a uint8_t/int8_t out tensor. +// The scale and zero_point for requantization are in the args. +Tensor& requantize_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& in_scale_t, + const Tensor& in_zero_point_t, + const Tensor& out_scale_t, + const Tensor& out_zero_point_t, + const ScalarType out_dtype, + Tensor& out) { + ET_KERNEL_CHECK_MSG( + ctx, + in_scale_t.scalar_type() == ScalarType::Float, + InvalidArgument, + out, + "In scale is not a float: %s", + torch::executor::toString(in_scale_t.scalar_type())); + float in_scale = in_scale_t.const_data_ptr()[0]; + + ET_KERNEL_CHECK_MSG( + ctx, + in_zero_point_t.scalar_type() == ScalarType::Int, + InvalidArgument, + out, + "In zero point is not an int: %s", + torch::executor::toString(in_zero_point_t.scalar_type())); + int32_t in_zero_point = in_zero_point_t.const_data_ptr()[0]; + + ET_KERNEL_CHECK_MSG( + ctx, + out_scale_t.scalar_type() == ScalarType::Float, + InvalidArgument, + out, + "Out scale is not a float: %s", + torch::executor::toString(out_scale_t.scalar_type())); + float out_scale = out_scale_t.const_data_ptr()[0]; + + ET_KERNEL_CHECK_MSG( + ctx, + out_zero_point_t.scalar_type() == ScalarType::Int, + InvalidArgument, + out, + "Out zero point is not an int: %s", + torch::executor::toString(out_zero_point_t.scalar_type())); + int32_t out_zero_point = out_zero_point_t.const_data_ptr()[0]; + + ET_KERNEL_CHECK_MSG( + ctx, + out.scalar_type() == out_dtype, + InvalidArgument, + out, + "Out tensor dtype (%s) does not match the passed in out dtype (%s)", + torch::executor::toString(out.scalar_type()), + torch::executor::toString(out_dtype)); + + const size_t numel = out.numel(); + ScalarType in_dtype = input.scalar_type(); + + // Assert 
that the output tensor's dtype is same as out_dtype. + ET_KERNEL_CHECK_MSG( + ctx, + out_dtype == out.scalar_type(), + InvalidArgument, + out, + "Out dtype %s does not match requant dtype %s", + torch::executor::toString(out.scalar_type()), + torch::executor::toString(out_dtype)); + +#define typed_requantize(ctype, dtype) \ + const ctype* input_data = input.const_data_ptr(); \ + dtype* out_data = out.mutable_data_ptr(); \ + kernels::requantize( \ + out_data, \ + input_data, \ + in_scale, \ + in_zero_point, \ + 1.0 / out_scale, \ + out_zero_point, \ + numel); + +#define typed_requantize_in(ctype) \ + switch (out_dtype) { \ + case ScalarType::Byte: { \ + typed_requantize(ctype, uint8_t); \ + break; \ + } \ + case ScalarType::Char: { \ + typed_requantize(ctype, int8_t); \ + break; \ + } \ + case ScalarType::UInt16: { \ + typed_requantize(ctype, uint16_t); \ + break; \ + } \ + case ScalarType::Short: { \ + typed_requantize(ctype, int16_t); \ + break; \ + } \ + default: \ + ET_KERNEL_CHECK_MSG( \ + ctx, \ + false, \ + InvalidArgument, \ + out, \ + "Unhandled output dtype %s", \ + torch::executor::toString(out_dtype)); \ + } + + switch (in_dtype) { + case ScalarType::Byte: { + typed_requantize_in(uint8_t); + break; + } + case ScalarType::Char: { + typed_requantize_in(int8_t); + break; + } + case ScalarType::UInt16: { + typed_requantize_in(uint16_t); + break; + } + case ScalarType::Short: { + typed_requantize_in(int16_t); + break; + } + default: + ET_KERNEL_CHECK_MSG( + ctx, + false, + InvalidArgument, + out, + "Unhandled input dtype %s", + torch::executor::toString(in_dtype)); + } +#undef typed_requantize_in +#undef typed_requantize + return out; +} + +// Requantize the int8_t/uint8_t input tensor to a uint8_t/int8_t out tensor. +// The scale and zero_point for requantization are in the args. 
+Tensor& requantize_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + double in_scale, + int64_t in_zero_point, + double out_scale, + int64_t out_zero_point, + const ScalarType out_dtype, + Tensor& out) { + ET_KERNEL_CHECK_MSG( + ctx, + out.scalar_type() == out_dtype, + InvalidArgument, + out, + "Out tensor dtype (%s) does not match the passed in out dtype (%s)", + torch::executor::toString(out.scalar_type()), + torch::executor::toString(out_dtype)); + + const size_t numel = out.numel(); + ScalarType in_dtype = input.scalar_type(); + + // Assert that the output tensor's dtype is same as out_dtype. + ET_KERNEL_CHECK_MSG( + ctx, + out_dtype == out.scalar_type(), + InvalidArgument, + out, + "Out dtype %s does not match requant dtype %s", + torch::executor::toString(out.scalar_type()), + torch::executor::toString(out_dtype)); + +#define typed_requantize(ctype, dtype) \ + const ctype* input_data = input.const_data_ptr(); \ + dtype* out_data = out.mutable_data_ptr(); \ + kernels::requantize( \ + out_data, \ + input_data, \ + static_cast(in_scale), \ + static_cast(in_zero_point), \ + 1.0 / static_cast(out_scale), \ + static_cast(out_zero_point), \ + numel); + +#define typed_requantize_in(ctype) \ + switch (out_dtype) { \ + case ScalarType::Byte: { \ + typed_requantize(ctype, uint8_t); \ + break; \ + } \ + case ScalarType::Char: { \ + typed_requantize(ctype, int8_t); \ + break; \ + } \ + case ScalarType::UInt16: { \ + typed_requantize(ctype, uint16_t); \ + break; \ + } \ + case ScalarType::Short: { \ + typed_requantize(ctype, int16_t); \ + break; \ + } \ + default: \ + ET_KERNEL_CHECK_MSG( \ + ctx, \ + false, \ + InvalidArgument, \ + out, \ + "Unhandled output dtype %s", \ + torch::executor::toString(out_dtype)); \ + } + + switch (in_dtype) { + case ScalarType::Byte: { + typed_requantize_in(uint8_t); + break; + } + case ScalarType::Char: { + typed_requantize_in(int8_t); + break; + } + case ScalarType::UInt16: { + typed_requantize_in(uint16_t); + break; 
+ } + case ScalarType::Short: { + typed_requantize_in(int16_t); + break; + } + default: + ET_KERNEL_CHECK_MSG( + ctx, + false, + InvalidArgument, + out, + "Unhandled input dtype %s", + torch::executor::toString(in_dtype)); + } +#undef typed_requantize_in +#undef typed_requantize + return out; +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_softmax.cpp b/backends/cadence/vision/operators/op_softmax.cpp new file mode 100644 index 00000000000..58ca33c6a0b --- /dev/null +++ b/backends/cadence/vision/operators/op_softmax.cpp @@ -0,0 +1,303 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; +using torch::executor::Error; + +namespace impl { +namespace vision { +namespace native { + +Tensor& _softmax_out( + KernelRuntimeContext& ctx, + const Tensor& in, + int64_t dim, + bool half_to_float, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + torch::executor::check_softmax_args(in, dim, half_to_float, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); + + // Adjust for negative dim + dim = dim < 0 ? 
dim + executorch::runtime::nonzero_dim(in) : dim; + + const executorch::aten::optional& dim_t = dim; + const size_t d = ET_NORMALIZE_IX(dim_t.value(), in.dim()); + const size_t size = in.size(d); + + size_t stride = 1, outer_size = 1; + + size_t outer_stride = 1; + + constexpr auto name = "_softmax.out"; + constexpr int MaxDim = 5; + + bool optimized = true; + bool ping_pong_process = false; + bool ping_process_pong = false; + + if ((d == in.dim() - 1)) { + if (size <= IDMA_BUFF_SIZE / 4 && in.dim() != 1) { + ping_pong_process = true; + } else if (size <= IDMA_BUFF_SIZE / 2) { + ping_process_pong = true; + } + } + + if (out.scalar_type() != ScalarType::Float) + optimized = false; + + if (in.dim() > MaxDim) + optimized = false; + + if (optimized) { + const float* ptr_inp = (float*)in.const_data_ptr(); + float* out_data = (float*)out.mutable_data_ptr(); + + /* Channel 0*/ + idma_init(0, 0, MAX_BLOCK_16, 8, TICK_CYCLES_1, 0, NULL); + idma_init_loop(0, descbuf[0], IDMA_2D_DESC, 1, NULL, NULL); + + /* Channel 1*/ + idma_init(1, 0, MAX_BLOCK_16, 8, TICK_CYCLES_1, 0, NULL); + idma_init_loop(1, descbuf[1], IDMA_2D_DESC, 1, NULL, NULL); + + if (ping_pong_process) { + for (int i = 0; i < in.dim(); i++) { + if (i != d) + outer_size *= in.size(i); + } + + outer_stride = size; + stride = size; + + int pp_swap = 0; + + float32_t* ptr_out = out_data; + float32_t* ptr_in = (float32_t*)ptr_inp; + + idma_copy_2d_desc( + 0, inpData[pp_swap], ptr_in, 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0); + pp_swap = 1; + + for (int i = 0; i < (outer_size - 1); i++) { + IDMA_HW_WAIT_ALL(0); + ptr_in += outer_stride; + idma_copy_2d_desc( + 0, + inpData[pp_swap], + ptr_in, + 4 * stride, + DESC_IDMA_PRIOR_H, + 1, + 0, + 0); + pp_swap = pp_swap ^ 1; + + /* PROCESS CALL */ + vsoftmaxf(outData[pp_swap], inpData[pp_swap], stride); + + IDMA_HW_WAIT_ALL(1); + idma_copy_2d_desc( + 1, + ptr_out, + outData[pp_swap], + 4 * stride, + DESC_IDMA_PRIOR_H, + 1, + 0, + 0); + ptr_out += outer_stride; + } + + 
IDMA_HW_WAIT_ALL(0); + pp_swap = pp_swap ^ 1; + + /* PROCESS CALL */ + vsoftmaxf(outData[pp_swap], inpData[pp_swap], stride); + + IDMA_HW_WAIT_ALL(1); + idma_copy_2d_desc( + 1, ptr_out, outData[pp_swap], 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0); + + IDMA_HW_WAIT_ALL(1); + + return out; + } else if (ping_process_pong) { + for (int i = 0; i < in.dim(); i++) { + if (i != d) + outer_size *= in.size(i); + } + + outer_stride = size; + stride = size; + + float32_t* ptr_out = out_data; + float32_t* ptr_in = (float32_t*)ptr_inp; + + for (int i = 0; i < outer_size; i++) { + idma_copy_2d_desc( + 0, data_dram0, ptr_in, 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0); + IDMA_HW_WAIT_ALL(0); + + vsoftmaxf(data_dram1, data_dram0, stride); + + idma_copy_2d_desc( + 1, ptr_out, data_dram1, 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0); + IDMA_HW_WAIT_ALL(1); + + ptr_in += outer_stride; + ptr_out += outer_stride; + } + + return out; + } else { + int num_inp_dims = in.dim(); + int num_out_dims = num_inp_dims; + + int ptr_inp_shape[MaxDim]; + int ptr_out_shape[MaxDim]; + int ptr_permute_vec[MaxDim]; + + for (int i = 0; i < num_inp_dims; i++) + ptr_inp_shape[i] = in.size(i); + + for (int i = 0; i < num_inp_dims; i++) { + if (i == d) + ptr_permute_vec[i] = num_inp_dims - 1; + else if (i == (num_inp_dims - 1)) + ptr_permute_vec[num_inp_dims - 1] = d; + else + ptr_permute_vec[i] = i; + + ptr_out_shape[i] = ptr_inp_shape[ptr_permute_vec[i]]; + + if (i != d) + outer_size = outer_size * ptr_inp_shape[i]; + } + + outer_stride = size; + + float* ptr_out = (float*)kernels::allocate_temp_memory( + ctx, out.numel() * sizeof(float)); + + ET_KERNEL_CHECK(ctx, ptr_out != nullptr, MemoryAllocationFailed, out); + + float* ptr_out1 = (float*)kernels::allocate_temp_memory( + ctx, out.numel() * sizeof(float)); + + ET_KERNEL_CHECK(ctx, ptr_out1 != nullptr, MemoryAllocationFailed, out); + + tensor_transposef( + ptr_out, + ptr_out_shape, + ptr_inp, + ptr_inp_shape, + ptr_permute_vec, + num_out_dims, + num_inp_dims); + + 
for (size_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { + size_t outer = outer_idx * outer_stride; + for (size_t inner_idx = 0; inner_idx < stride; ++inner_idx) { + size_t base = outer + inner_idx; + + float* ptr_in_data = &ptr_out[base]; + float* ptr_out_data = &ptr_out1[base]; + + vsoftmaxf(ptr_out_data, ptr_in_data, size); + } + } + + tensor_transposef( + out_data, + ptr_inp_shape, + ptr_out1, + ptr_out_shape, + ptr_permute_vec, + num_out_dims, + num_inp_dims); + + return out; + } + } + + ET_SWITCH_FLOATHBF16_TYPES( + in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + torch::executor::apply_over_dim( + [in_data, out_data]( + const size_t size, const size_t stride, const size_t base) { + // calculate max in softmax dim. During softmax computation each + // value is subtracted by the maximum in value before calling exp + // to preserve numerical stability. + const CTYPE max_in = torch::executor::apply_unary_reduce_fn( + [](const CTYPE val_in, CTYPE val_accum) { + return std::max(val_in, val_accum); + }, + in_data + base, + size, + stride); + + const CTYPE temp_sum = + torch::executor::apply_unary_map_reduce_fn( + [max_in](const CTYPE val_in) { + return std::exp(val_in - max_in); + }, + [](const CTYPE mapped_in, CTYPE val_accum) { + return val_accum + mapped_in; + }, + in_data + base, + size, + stride); + + torch::executor::apply_unary_map_fn( + [max_in, temp_sum](const CTYPE val_in) { + return std::exp(val_in - max_in) / temp_sum; + }, + in_data + base, + out_data + base, + size, + stride); + }, + in, + dim); + }); + + return out; +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/generic/operators/op_view_copy.cpp b/backends/cadence/vision/operators/op_view_copy.cpp similarity index 80% rename from backends/cadence/generic/operators/op_view_copy.cpp rename to 
backends/cadence/vision/operators/op_view_copy.cpp index 162e9ee201b..6d4d3a8a5e0 100644 --- a/backends/cadence/generic/operators/op_view_copy.cpp +++ b/backends/cadence/vision/operators/op_view_copy.cpp @@ -8,10 +8,12 @@ #include -namespace torch { -namespace executor { +namespace impl { +namespace vision { namespace native { +using executorch::aten::IntArrayRef; +using ::executorch::aten::IntArrayRef; using executorch::aten::Tensor; using executorch::runtime::KernelRuntimeContext; @@ -25,5 +27,5 @@ Tensor& view_copy_out( } } // namespace native -} // namespace executor -} // namespace torch +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/operators.h b/backends/cadence/vision/operators/operators.h new file mode 100644 index 00000000000..36c4486bf85 --- /dev/null +++ b/backends/cadence/vision/operators/operators.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using ::executorch::runtime::getLeadingDims; + +#define ET_FORALL_CADENCE_QUANTIZED_TYPES(_) \ + _(uint8_t, Byte) \ + _(int8_t, Char) + +inline __attribute__((always_inline)) void linear_( + const ::executorch::aten::Tensor& input, + const ::executorch::aten::Tensor& weight, + const ::executorch::aten::optional<::executorch::aten::Tensor>& bias, + ::executorch::aten::Tensor& output) { + const float* __restrict__ input_data = input.const_data_ptr(); + const float* __restrict__ weight_data = weight.const_data_ptr(); + const float* __restrict__ bias_data = bias.value().const_data_ptr(); + float* __restrict__ output_data = output.mutable_data_ptr(); + + // input comes in shape [batch_size, in_dim] + // weight comes in shape [out_dim, in_dim] + // output comes in empty with shape [batch_size, out_dim] + // Perform matrix multiply (M x N) x (N x P) => M x P + int64_t M = weight.size(0); // = out_dim + int64_t N = weight.size(1); // = in_dim + + // Given an N-dimensional input [d0, d1, d2, ..., d_{N-2}, d_{N-1}], the + // leading dimensions is d0 * d1 * ... * d_{N-2} + int64_t leading_dims = getLeadingDims(input, input.dim() - 1); + + for (int i = 0; i < leading_dims; ++i) { + for (int j = 0; j < M; ++j) { + float sum = bias_data[j]; + for (int k = 0; k < N; ++k) { + sum += input_data[i * N + k] * weight_data[j * N + k]; + } + output_data[i * M + j] = sum; + } + } +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/quantized_ops.h b/backends/cadence/vision/operators/quantized_ops.h new file mode 100644 index 00000000000..a7251724c53 --- /dev/null +++ b/backends/cadence/vision/operators/quantized_ops.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +template +inline __attribute__((always_inline)) void quantized_linear_per_tensor_( + const ::executorch::aten::Tensor& src, + const ::executorch::aten::Tensor& weight, + const ::executorch::aten::Tensor& bias, + const int64_t src_zero_point, + const int64_t weight_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + const int64_t out_zero_point, + ::executorch::aten::Tensor& out) { + // input comes in shape [leading_dims, in_dim] + // weight comes in shape [out_dim, in_dim] + // output comes in empty with shape [leading_dims, out_dim] + // Perform matrix multiply (M x N) x (N x P)' => M x P + const int64_t leading_dims = + executorch::runtime::getLeadingDims(src, src.dim() - 1); + const int64_t out_dim = weight.size(0); // = out_dim + const int64_t in_dim = weight.size(1); // = in_dim + + const T* __restrict__ in_data = src.const_data_ptr(); + const T* __restrict__ weight_data = weight.const_data_ptr(); + const int32_t* __restrict__ bias_data = bias.const_data_ptr(); + T* __restrict__ out_data = out.mutable_data_ptr(); + + // Compute the requant_scale from out_multiplier and out_shift + const float requant_scale = + -out_multiplier * 1.0 / (1 << 31) * pow(2, out_shift); + + for (size_t i = 0; i < leading_dims; ++i) { + for (size_t j = 0; j < out_dim; ++j) { + int32_t sum = bias_data[j]; + for (size_t k = 0; k < in_dim; ++k) { + int32_t x = (int32_t)in_data[i * in_dim + k] - src_zero_point; + int32_t w = + (int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point; + sum += x * w; + } + out_data[i * out_dim + j] = impl::vision::kernels::quantize( + sum, requant_scale, out_zero_point); + } + } +} + +template +inline __attribute__((always_inline)) void quantized_linear_per_tensor_( + const ::executorch::aten::Tensor& src, + const ::executorch::aten::Tensor& 
weight, + const ::executorch::aten::Tensor& bias, + int64_t src_zero_point, + const ::executorch::aten::Tensor& weight_zero_point_t, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + ::executorch::aten::Tensor& out) { + // Get the zero_point of weight. + int32_t weight_zero_point = weight_zero_point_t.const_data_ptr()[0]; + quantized_linear_per_tensor_( + src, + weight, + bias, + src_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + out); +} + +template +inline __attribute__((always_inline)) void quantized_linear_per_channel_( + const ::executorch::aten::Tensor& src, + const ::executorch::aten::Tensor& weight, + const ::executorch::aten::Tensor& bias, + int64_t src_zero_point, + int64_t weight_zero_point, + const ::executorch::aten::Tensor& out_multiplier, + const ::executorch::aten::Tensor& out_shift, + int64_t out_zero_point, + ::executorch::aten::Tensor& out) { + // input comes in shape [leading_dims, in_dim] + // weight comes in shape [out_dim, in_dim] + // output comes in empty with shape [leading_dims, out_dim] + // Perform matrix multiply (M x N) x (N x P)' => M x P + int64_t leading_dims = + executorch::runtime::getLeadingDims(src, src.dim() - 1); + const int64_t out_dim = weight.size(0); // = out_dim + const int64_t in_dim = weight.size(1); // = in_dim + + const T* __restrict__ in_data = src.const_data_ptr(); + const T* __restrict__ weight_data = weight.const_data_ptr(); + const int32_t* __restrict__ bias_data = bias.const_data_ptr(); + T* __restrict__ out_data = out.mutable_data_ptr(); + const int32_t* __restrict__ out_multiplier_data = + out_multiplier.const_data_ptr(); + const int32_t* __restrict__ out_shift_data = + out_shift.const_data_ptr(); + + for (size_t i = 0; i < leading_dims; ++i) { + for (size_t j = 0; j < out_dim; ++j) { + int32_t sum = bias_data[j]; + for (size_t k = 0; k < in_dim; ++k) { + int32_t x = (int32_t)in_data[i * in_dim + k] - src_zero_point; + int32_t w = + 
(int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point; + sum += x * w; + } + // Compute the out_scale from out_multiplier and out_shift + const float out_scale = + -out_multiplier_data[j] * 1.0 / (1 << 31) * pow(2, out_shift_data[j]); + out_data[i * out_dim + j] = + impl::vision::kernels::quantize(sum, out_scale, out_zero_point); + } + } +} + +template +inline __attribute__((always_inline)) void quantized_linear_( + const ::executorch::aten::Tensor& src, + const ::executorch::aten::Tensor& weight, + const ::executorch::aten::Tensor& bias, + int64_t src_zero_point, + int64_t weight_zero_point, + const ::executorch::aten::Tensor& out_multiplier, + const ::executorch::aten::Tensor& out_shift, + int64_t out_zero_point, + ::executorch::aten::Tensor& out) { + if (out_multiplier.numel() == 1) { + // Use per-tensor quantization kernel. + const int32_t* __restrict__ out_multiplier_data = + out_multiplier.const_data_ptr(); + const int32_t* __restrict__ out_shift_data = + out_shift.const_data_ptr(); + quantized_linear_per_tensor_( + src, + weight, + bias, + src_zero_point, + weight_zero_point, + out_multiplier_data[0], + out_shift_data[0], + out_zero_point, + out); + return; + } + + // Use per-channel quantization kernel. + quantized_linear_per_channel_( + src, + weight, + bias, + src_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + out); +} + +template +inline __attribute__((always_inline)) void quantized_linear_( + const ::executorch::aten::Tensor& src, + const ::executorch::aten::Tensor& weight, + const ::executorch::aten::Tensor& bias, + int64_t src_zero_point, + const ::executorch::aten::Tensor& weight_zero_point_t, + const ::executorch::aten::Tensor& out_multiplier, + const ::executorch::aten::Tensor& out_shift, + int64_t out_zero_point, + ::executorch::aten::Tensor& out) { + // Get the zero_point of weight. 
+ int32_t weight_zero_point = weight_zero_point_t.const_data_ptr()[0]; + quantized_linear_( + src, + weight, + bias, + src_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + out); +} diff --git a/backends/cadence/vision/operators/targets.bzl b/backends/cadence/vision/operators/targets.bzl new file mode 100644 index 00000000000..2dd47e12bd2 --- /dev/null +++ b/backends/cadence/vision/operators/targets.bzl @@ -0,0 +1,83 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + + +def define_operator(name: str, deps: list[str] | None = None) -> None: + op_name = "op_{}".format(name) + + # Deps used by all operators. + common_deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/vision/kernels:cadence_kernels", + "//executorch/kernels/portable/cpu/util:dtype_util", + "//executorch/kernels/portable/cpu/util:elementwise_util", + "//executorch/kernels/portable/cpu/pattern:bitwise_op", + "//executorch/backends/cadence/vision/third-party:vision-nnlib", + "//executorch/kernels/portable/cpu/pattern:comparison_op" + ] + if deps == None: + deps = [] + + # Determine which headers to export based on operator name + exported_headers = ["operators.h"] + + # Add quantized_ops.h header for quantized operators + quantized_ops = [ + "quantized_fully_connected_out", + "quantized_matmul_out", + "quantized_layer_norm", + "quantized_relu_out", + "quantized_conv_out", + "quantized_linear_out", + "quantize_per_tensor", + "dequantize_per_tensor", + "requantize_out" + ] + + if name in quantized_ops: + exported_headers.append("quantized_ops.h") + + runtime.cxx_library( + name = op_name, + srcs = [op_name + ".cpp"], + platforms = CXX, + visibility = [ + 
"//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + compatible_with = ["ovr_config//cpu:xtensa"], + deps = deps + common_deps, + exported_headers = exported_headers, + ) + +OPERATORS = [ + "add", + "full", + "quantized_fully_connected_out", + "quantized_matmul_out", + "requantize_out", + "dequantize_per_tensor", + "im2row_out", + "quantized_layer_norm", + "quantized_relu_out", + "softmax", + "embedding", + "quantized_conv_out", + "quantized_linear_out", + "quantize_per_tensor", + "view_copy" +] + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + # Define build targets for all operators registered in the tables above. + for op in OPERATORS: + define_operator(op) diff --git a/backends/cadence/vision/third-party/dummy.c b/backends/cadence/vision/third-party/dummy.c new file mode 100644 index 00000000000..52fb7c18c38 --- /dev/null +++ b/backends/cadence/vision/third-party/dummy.c @@ -0,0 +1,17 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* Dummy source file for non-Xtensa builds + * This file is used when building the vision-nnlib library on platforms + * other than Xtensa, providing empty stubs for compatibility. + * The actual function implementations are provided as stubs via DISCARD_FUN + * in headers when COMPILER_XTENSA is not defined. + */ + +// This file intentionally contains no function definitions and no includes. +// When COMPILER_XTENSA is not defined, all functions are stubbed out +// using the DISCARD_FUN macro in the header files. 
diff --git a/backends/cadence/vision/third-party/include/api.h b/backends/cadence/vision/third-party/include/api.h new file mode 100644 index 00000000000..efb80c3d76d --- /dev/null +++ b/backends/cadence/vision/third-party/include/api.h @@ -0,0 +1,83 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. 
*/ +/* ------------------------------------------------------------------------ */ +/* + * API + */ + +#ifndef __API_H__ +#define __API_H__ + +#include "dtypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*------------------------------------------------------------------------- +Softmax + +Description: The function computes the softmax (normalized exponential +function) of input data. 16-bit fixed-point functions accept inputs in +Q3.12 and form outputs in Q7.8 format. + +vsoftmax 16-bit +vsoftmax_fp16 IEEE-754 Std. half precision floating-point. +vsoftmaxf IEEE-754 Std. single precision floating-point. + +Accuracy: +2 LSB for fixed point API +2 ULP for floating point API +NOTE: Accuracy of function may depend on amount of data and their +distribution. Given accuracy is achieved for N=2 for any pair of +data from input domain. + + +Parameters: +Input: +x[N] input data, Q3.12 floating point +N Length of input/output data vectors +Output: +y[N] result, Q7.8 or floating point + +Restrictions: +x,y aligned on 2*BBE_SIMD_WIDTH-bytes boundary (vsoftmax) +x,y Must not overlap +N multiple of BBE_SIMD_WIDTH (vsoftmax) +-------------------------------------------------------------------------*/ +void vsoftmaxf(float32_t *y, const float32_t *x, int N); + +void tensor_transposef(float32_t *restrict ptr_out + ,const int *const ptr_out_shape + ,const float32_t *restrict ptr_inp + ,const int *const ptr_inp_shape + ,const int *restrict ptr_permute_vec + ,int num_out_dims + ,int num_inp_dims); + +#ifdef __cplusplus +}; +#endif + +#endif /* __API_H__ */ diff --git a/backends/cadence/vision/third-party/include/dtypes.h b/backends/cadence/vision/third-party/include/dtypes.h new file mode 100644 index 00000000000..c12bbf23ac2 --- /dev/null +++ b/backends/cadence/vision/third-party/include/dtypes.h @@ -0,0 +1,380 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. 
*/ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. 
*/ +/* ------------------------------------------------------------------------ */ +/* + * Cross-platform data type definitions and utility macros + */ + +#ifndef __DTYPES_H__ +#define __DTYPES_H__ + +#include + +#ifndef COMPILER_ANSI +/* ---------------------------------------------------------- + Compilers autodetection + ----------------------------------------------------------*/ +#define ___UNKNOWN_COMPILER_YET +#ifdef ___UNKNOWN_COMPILER_YET +#ifdef _MSC_VER + +#ifdef _ARM_ +#define COMPILER_CEARM9E /* Microsoft Visual C++,ARM9E */ +#else +#define COMPILER_MSVC /* Microsoft Visual C++ */ +#endif + +#undef ___UNKNOWN_COMPILER_YET +#endif +#endif + +#ifdef ___UNKNOWN_COMPILER_YET +#ifdef _TMS320C6X +#if defined(_TMS320C6400) +#define COMPILER_C64 +#undef ___UNKNOWN_COMPILER_YET +#endif +#if defined(_TMS320C6400_PLUS) +#define COMPILER_C64PLUS +#undef ___UNKNOWN_COMPILER_YET +#endif +#endif +#endif + +#ifdef ___UNKNOWN_COMPILER_YET +#ifdef __TMS320C55X__ +#define COMPILER_C55 +#undef ___UNKNOWN_COMPILER_YET +#endif +#endif + +#ifdef ___UNKNOWN_COMPILER_YET +#ifdef __ADSPBLACKFIN__ +#define COMPILER_ADSP_BLACKFIN +#undef ___UNKNOWN_COMPILER_YET +#endif +#endif + +#ifdef ___UNKNOWN_COMPILER_YET +#ifdef __XCC__ +#define COMPILER_XTENSA +#undef ___UNKNOWN_COMPILER_YET +#endif +#endif + +#ifdef ___UNKNOWN_COMPILER_YET +#ifdef __GNUC__ +#ifdef __arm__ +#ifndef COMPILER_GNU_ARM +#endif +#define COMPILER_GNUARM /* GNU C/C++ compiler*/ +#else +/* GNU GCC x86 compiler */ +#ifndef COMPILER_GNU +#endif +#define COMPILER_GNU /* GNU C/C++ */ +#endif +#undef ___UNKNOWN_COMPILER_YET +#endif +#endif + +#ifdef ___UNKNOWN_COMPILER_YET +#error Unknown compiler +#endif + +#endif /* #ifndef COMPILER_ANSI */ + +/* ---------------------------------------------------------- + Language-dependent definitions + ----------------------------------------------------------*/ +#ifdef __cplusplus + +#undef extern_C +#define extern_C extern "C" + +#else + +#undef extern_C +#define extern_C + 
+#ifndef false +#define false 0 +#endif +#ifndef true +#define true 1 +#endif + +#endif + +/* Assertion support */ +#if !defined(_ASSERT) +#include <assert.h> +#if defined(_DEBUG) /*&& defined(COMPILER_MSVC)*/ +#define ASSERT(x) \ + { assert(x); } +#else + +/*#undef ASSERT*/ +#ifndef ASSERT +#define ASSERT(_ignore) ((void)0) +#endif + +#endif /* _DEBUG */ +#else /* ASSERT*/ +#define ASSERT(exp) \ + { \ + extern void ExternalAssertHandler(void *, void *, unsigned); \ + (void)((exp) || (ExternalAssertHandler(#exp, __FILE__, __LINE__), 0)); \ + } +#endif /* ASSERT */ + +/*** Inline methods definition ***/ +#undef inline_ +#if (defined COMPILER_MSVC) || (defined COMPILER_CEARM9E) +#define inline_ __inline +#elif defined(COMPILER_ADSP_BLACKFIN) +#define inline_ inline +#elif defined(COMPILER_ANSI) +#define inline_ +#elif (defined COMPILER_GNU) || (defined COMPILER_GNUARM) || \ + (defined COMPILER_ARM) +#define inline_ static inline +#else +#define inline_ static inline +#endif + +#ifndef MAX_INT16 +#define MAX_INT16 ((int16_t)0x7FFF) +#endif +#ifndef MIN_INT16 +#define MIN_INT16 ((int16_t)0x8000) +#endif +#ifndef MAX_INT32 +#define MAX_INT32 ((int32_t)0x7FFFFFFFL) +#endif +#ifndef MIN_INT32 +#define MIN_INT32 ((int32_t)0x80000000L) +#endif +#ifndef MIN_INT64 +#define MIN_INT64 ((int64_t)0x8000000000000000LL) +#endif +#ifndef MAX_INT64 +#define MAX_INT64 ((int64_t)0x7fffffffffffffffLL) +#endif + +/* size of variables in bytes */ +#ifdef COMPILER_C55 +#define SIZEOF_BYTE(x) (sizeof(x) << 1) +#else +#define SIZEOF_BYTE(x) sizeof(x) +#endif + +/*--------------------------------------- + special keywords definition + restrict keyword means that the memory + is addressed exclusively via + this pointer + onchip keyword means that the memory + is on-chip and can not be + accessed via external bus +---------------------------------------*/ +#if defined(COMPILER_C55) +#define NASSERT _nassert +#elif defined(COMPILER_C64) +#define onchip +#define NASSERT _nassert +#elif 
defined(COMPILER_ADSP_BLACKFIN) +#define onchip +#define NASSERT(x) __builtin_assert(x) +#elif defined(COMPILER_GNUARM) +#define onchip +#define NASSERT(x) \ + { (void)__builtin_expect((x) != 0, 1); } +#define restrict __restrict +#elif defined(COMPILER_GNU) +#define onchip +#define NASSERT(x) \ + { \ + (void)__builtin_expect((x) != 0, 1); \ + ASSERT(x); \ + } +#define restrict __restrict +#elif defined(COMPILER_CEARM9E) +#define onchip +#define NASSERT(x) +#define restrict +#elif defined(COMPILER_XTENSA) +#ifndef restrict +#define restrict __restrict +#endif +#define onchip +#define NASSERT(x) \ + { \ + (void)__builtin_expect((x) != 0, 1); \ + ASSERT(x); \ + } +#else +#define restrict +#define onchip +#define NASSERT ASSERT +#endif +#if defined(COMPILER_ADSP_BLACKFIN) +#define NASSERT_ALIGN(addr, align) __builtin_aligned(addr, align) +#else +#define NASSERT_ALIGN(addr, align) NASSERT(((uintptr_t)(addr)) % (align) == 0) +#endif +#define NASSERT_ALIGN2(addr) NASSERT_ALIGN(addr, 2) +#define NASSERT_ALIGN4(addr) NASSERT_ALIGN(addr, 4) +#define NASSERT_ALIGN8(addr) NASSERT_ALIGN(addr, 8) +#define NASSERT_ALIGN16(addr) NASSERT_ALIGN(addr, 16) +#define NASSERT_ALIGN32(addr) NASSERT_ALIGN(addr, 32) +#define NASSERT_ALIGN64(addr) NASSERT_ALIGN(addr, 64) +#define NASSERT_ALIGN128(addr) NASSERT_ALIGN(addr, 128) +/* ---------------------------------------------------------- + Common types + ----------------------------------------------------------*/ +#if defined(COMPILER_GNU) | defined(COMPILER_GNUARM) | defined(COMPILER_XTENSA) +/* + typedef signed char int8_t; + typedef unsigned char uint8_t; +*/ +#include +#elif defined(COMPILER_C64) +#include +#elif defined(COMPILER_C55) +#include +typedef signed char int8_t; +typedef unsigned char uint8_t; +#elif defined(COMPILER_ADSP_BLACKFIN) +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef unsigned long uint32_t; +typedef unsigned short uint16_t; +typedef long int32_t; +typedef short int16_t; +typedef long long 
int64_t; +typedef unsigned long long uint64_t; +typedef uint32_t uintptr_t; +#else +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef unsigned long uint32_t; +typedef unsigned short uint16_t; +typedef long int32_t; +typedef short int16_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#endif + +#if defined(COMPILER_CEARM9E) +typedef uint32_t uintptr_t; +#endif + +#if defined(COMPILER_ARM) +typedef uint32_t uintptr_t; +#endif + +typedef int16_t float16_t; +typedef float float32_t; +typedef double float64_t; +typedef int16_t fract16; +typedef int32_t fract32; + +typedef union tag_complex_fract16 { + struct { + int16_t re, im; + } s; + uint32_t a; /* just for 32-bit alignment */ +} complex_fract16; + +typedef union tag_complex_fract32 { + struct { + int32_t re, im; + } s; + uint64_t a; /* just for 64-bit alignment */ +} complex_fract32; + +#if defined(COMPILER_MSVC) +#if 0 +/* Note: Visual Studio does not support C99 compatible complex types yet */ +typedef union tag_complex_float { + struct { + float32_t re, im; + } s; + uint64_t a; /* just for 64-bit alignment */ +} complex_float; +typedef union tag_complex_double { + struct { + float64_t re, im; + } s; + uint64_t a[2]; /* only 64-bit alignment under Visual Studio :(( */ +} complex_double; + +inline_ float32_t crealf(complex_float x) { return x.s.re; } +inline_ float32_t cimagf(complex_float x) { return x.s.im; } +inline_ float64_t creal(complex_double x) { return x.s.re; } +inline_ float64_t cimag(complex_double x) { return x.s.im; } +#else +#include <complex.h> +#define complex_float _Fcomplex +#define complex_double _Dcomplex +#endif + +#else +/* C99 compatible type */ +#include <complex.h> +#define complex_float __complex__ float +#define complex_double __complex__ double +#endif + +/* complex half-precision datatype */ +typedef union tag_complex_float16 { + struct { + float16_t re, im; + } s; + uint32_t a; /* just for 32-bit alignment */ +} complex_float16; + +inline_ float16_t crealh(complex_float16 
x) { return x.s.re; } +inline_ float16_t cimagh(complex_float16 x) { return x.s.im; } +/* union data type for writing float32_t/float64_t constants in a bitexact + * form */ +union ufloat32uint32 { + uint32_t u; + float32_t f; +}; +union ufloat64uint64 { + uint64_t u; + float64_t f; +}; +union ufloat16uint16 { + uint16_t u; + float16_t f; +}; + +#if defined(__RENAMING__) +#include "__renaming__.h" +#endif + +#endif /* __DTYPE_H__ */ diff --git a/backends/cadence/vision/third-party/include_private/common.h b/backends/cadence/vision/third-party/include_private/common.h new file mode 100644 index 00000000000..4fc07d8b4d1 --- /dev/null +++ b/backends/cadence/vision/third-party/include_private/common.h @@ -0,0 +1,199 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. 
and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ + +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#if defined COMPILER_XTENSA +#include +#include +#include +#include +#include +#include +#if XCHAL_HAVE_IDMA +#ifndef IDMA_USE_MULTICHANNEL + #define IDMA_USE_MULTICHANNEL 1 +#endif +#include +#endif +#define IVP_SIMD_WIDTH XCHAL_IVPN_SIMD_WIDTH + +#include "xtensa/config/core-isa.h" +#include "xtensa/tie/xt_ivpn.h" +#if XCHAL_HAVE_IDMA +#include "xtensa/idma.h" +#endif + +#ifdef _MSC_VER +#define ALIGN(x) _declspec(align(x)) +#else +#define ALIGN(x) __attribute__((aligned(x))) +#endif + +#ifdef COMPILER_XTENSA +#define ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline)) +#define ATTRIBUTE_NEVER_INLINE __attribute__((noinline)) +#define ATTRIBUTE_UNUSED __attribute__((unused)) +#else +#define ATTRIBUTE_ALWAYS_INLINE +#define ATTRIBUTE_NEVER_INLINE +#define ATTRIBUTE_UNUSED +#endif + +/* 'restrict' qualifier, is applied to pointers only under clang compiler */ +#ifdef __clang__ +#define restrict_clang restrict +#else +#define restrict_clang +#endif + +// Performance measurement macros +#define XTPERF_PRINTF(...) printf(__VA_ARGS__) +#define TIME_DECL(test) long start_time_##test, end_time_##test; +#define TIME_START(test) { start_time_##test = 0; XT_WSR_CCOUNT(0); } +#define TIME_END(test) { end_time_##test = XT_RSR_CCOUNT(); } +#define TIME_DISPLAY(test, opcnt, opname) { long long cycles_##test = end_time_##test - start_time_##test; \ + XTPERF_PRINTF("PERF_LOG : %s : %d : %s : %lld : cycles : %.2f : %s/cycle : %.2f : cycles/%s\n", \ + #test, opcnt, opname, cycles_##test, cycles_##test == 0 ? 0 : (double)(opcnt)/cycles_##test, \ + opname, cycles_##test == 0 ? 
0 : 1/((double)(opcnt)/cycles_##test), opname); } + +//----------------------------------------------------- +// log2(BBE_SIMD_WIDTH) +//----------------------------------------------------- +#define LOG2_IVP_SIMD_WIDTH 5 +#define ALIGN_SIMD ALIGN(64) +#define ALIGN_2SIMD ALIGN(128) + +#define LOG2_SIMD_N_2 (LOG2_IVP_SIMD_WIDTH - 1) +#define LOG2_SIMD_2N (LOG2_IVP_SIMD_WIDTH + 1) +//----------------------------------------------------- +// some C++ support +//----------------------------------------------------- + +// special XCC type casting of pointers +#ifdef __cplusplus +#define castxcc(type_, ptr) (ptr) +#else +#define castxcc(type_, ptr) (type_ *)(ptr) +#endif + +//----------------------------------------------------- +// C99 pragma wrapper +//----------------------------------------------------- + +#ifdef COMPILER_XTENSA +#define __Pragma(a) _Pragma(a) +#else +#define __Pragma(a) +#endif + +//----------------------------------------------------- +// Conditionalization support +//----------------------------------------------------- +/* place DISCARD_FUN(retval_type,name) instead of function definition for + functions to be discarded from the executable THIS WORKS only for external + library functions declared as extern "C" and not supported for internal + references without "C" qualifier! 
+*/ +#ifdef COMPILER_MSVC +#pragma section("$DISCARDED_FUNCTIONS", execute, discard) +#pragma section("$$$$$$$$$$", execute, discard) +#define DISCARD_FUN(retval_type, name, arglist) \ + __pragma(alloc_text("$DISCARDED_FUNCTIONS", name)) \ + __pragma(section("$DISCARDED_FUNCTIONS", execute, discard)) \ + __pragma(warning(push)) __pragma(warning(disable : 4026 4716)) \ + retval_type name arglist {} \ + __pragma(warning(pop)) +#endif + +#if defined(COMPILER_XTENSA) || defined(COMPILER_GNU) +#define DISCARD_FUN(retval_type, name, arglist) \ + __asm__(".type " #name ", @object\n\t.global " #name \ + "\n\t.align 4\n\t" #name ":\n\t.long 0x49438B96,0x4D73F192\n\t"); +#endif + +/*------ LIST OF DEFINES DEPENDING ON ISA OPTIONS ------*/ + +/* Single-precision Extended Vector Floating-point option */ +#if ((XCHAL_HAVE_VISION_SP_VFPU)) +#define HAVE_SPX_VFPU 1 +#else +#define HAVE_SPX_VFPU 0 +#endif + +/* all vector single precision/Extended vector floating point instructions */ +#if ((XCHAL_HAVE_VISION_SP_VFPU)) +#define HAVE_SPX_VFPU 1 +#define HAVE_VFPU 1 +#else +#define HAVE_SPX_VFPU 0 +#define HAVE_VFPU 0 +#endif + +/* all scalar single precision floating point instructions */ +#if ((XCHAL_HAVE_VISION_SP_VFPU) || (XCHAL_HAVE_FP)) +#define HAVE_FPU 1 +#else +#define HAVE_FPU 0 +#endif + +#else +#define HAVE_VFPU 0 +#define HAVE_FPU 0 +#endif + +/* detect if half precision FPU is present in a core */ +#if ((XCHAL_HAVE_VISION_HP_VFPU)) +#define HAVE_HPFPU 1 +#include +#else +#define HAVE_HPFPU 0 +#endif + +/* detect if double precision FPU is present in a core */ +#if ((XCHAL_HAVE_VISION_DP_VFPU)) +#define HAVE_DPFPU 1 +#include +#else +#define HAVE_DPFPU 0 +#endif + +/* + 32x32 multiplier +*/ +#if defined(BBE_MULN_2X32) +#define HAVE_32X32 1 +#else +#define HAVE_32X32 0 +#endif + +#ifdef __cplusplus +#define externC extern "C" +#else +#define externC extern +#endif + +#endif // __COMMON_H__ diff --git a/backends/cadence/vision/third-party/include_private/expf_tbl.h 
b/backends/cadence/vision/third-party/include_private/expf_tbl.h new file mode 100644 index 00000000000..702164aba11 --- /dev/null +++ b/backends/cadence/vision/third-party/include_private/expf_tbl.h @@ -0,0 +1,53 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ + +/* + tables for expf(x) approximation +*/ +#ifndef __EXPF_TBL_H__ +#define __EXPF_TBL_H__ + +/* Portable data types. 
*/ +#include "dtypes.h" +#include "common.h" + +/* + polynomial coefficients for 2^x in range 0...1 + + derived by MATLAB code: + order=6; + x=(0:pow2(1,-16):1); + y=2.^x; + p=polyfit(x,y,6); + p(order+1)=1; + p(order)=p(order)-(sum(p)-2); +*/ +externC const int32_t expftbl_Q30[8]; +externC const union ufloat32uint32 + expfminmax[2]; /* minimum and maximum arguments of expf() input */ +externC const int32_t invln2_Q30; /* 1/ln(2), Q30 */ +externC const union ufloat32uint32 expftblf[7]; +externC const union ufloat32uint32 log2_e[2]; +#endif /* __EXPF_TBL_H__ */ diff --git a/backends/cadence/vision/third-party/include_private/idma_init.h b/backends/cadence/vision/third-party/include_private/idma_init.h new file mode 100644 index 00000000000..841a39cf891 --- /dev/null +++ b/backends/cadence/vision/third-party/include_private/idma_init.h @@ -0,0 +1,36 @@ +#ifndef __IDMA__INIT_H__ +#define __IDMA__INIT_H__ + +#include "../include/dtypes.h" +#include "common.h" + +#define IDMA_BUFF_SIZE \ + 16384 // 16 kb DRAM storage. 
Assume 4 buffers (2 input and 2 output) + +#ifndef PLACE_IN_DRAM0 +#define PLACE_IN_DRAM0 \ + __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram0.data"))) +#endif + +#ifndef PLACE_IN_DRAM1 +#define PLACE_IN_DRAM1 \ + __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram1.data"))) +#endif + +float32_t data_dram0[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM0; +float32_t data_dram1[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM1; + +float32_t* inpData[2] = {&data_dram0[0], &data_dram1[0]}; +float32_t* outData[2] = { + &data_dram0[IDMA_BUFF_SIZE / 4], + &data_dram1[IDMA_BUFF_SIZE / 4]}; + +IDMA_BUFFER_DEFINE(buffer_idma_ch0, 1, IDMA_2D_DESC); +IDMA_BUFFER_DEFINE(buffer_idma_ch1, 1, IDMA_2D_DESC); + +idma_buffer_t* descbuf[] = { + buffer_idma_ch0, + buffer_idma_ch1, +}; + +#endif // __IDMA__INIT_H__ diff --git a/backends/cadence/vision/third-party/include_private/inff_tbl.h b/backends/cadence/vision/third-party/include_private/inff_tbl.h new file mode 100644 index 00000000000..1326e92a3c1 --- /dev/null +++ b/backends/cadence/vision/third-party/include_private/inff_tbl.h @@ -0,0 +1,39 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. 
www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ + +/* + Infinities for single precision routines +*/ +#ifndef __INFF_TBL_H__ +#define __INFF_TBL_H__ + +#include "dtypes.h" +#include "common.h" + +externC const union ufloat32uint32 minusInff; /* -Inf */ +externC const union ufloat32uint32 plusInff; /* +Inf */ +externC const union ufloat32uint32 realmaxf; /* maximum floating point number */ +externC const union ufloat32uint32 realminf; /* minimum floating point number */ +#endif /* __INFF_TBL_H__ */ diff --git a/backends/cadence/vision/third-party/include_private/nanf_tbl.h b/backends/cadence/vision/third-party/include_private/nanf_tbl.h new file mode 100644 index 00000000000..4881b99f070 --- /dev/null +++ b/backends/cadence/vision/third-party/include_private/nanf_tbl.h @@ -0,0 +1,42 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. 
Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +/* + NaN values for single precision routines +*/ + +#ifndef __NANF_TBL_H__ +#define __NANF_TBL_H__ + +/* Portable data types. */ +#include "dtypes.h" +/* Common utility macros. */ +#include "common.h" + +extern const union ufloat32uint32 sNaNf; /* Signalling NaN */ +extern const union ufloat32uint32 qNaNf; /* Quiet NaN */ +extern const union ufloat32uint32 minus_sNaNf; /* Negative Signalling NaN */ +extern const union ufloat32uint32 minus_qNaNf; /* Negative Quiet NaN */ + +#endif /* __NANF_TBL_H__ */ diff --git a/backends/cadence/vision/third-party/library/api/tensor_transposef.c b/backends/cadence/vision/third-party/library/api/tensor_transposef.c new file mode 100644 index 00000000000..e6865033740 --- /dev/null +++ b/backends/cadence/vision/third-party/library/api/tensor_transposef.c @@ -0,0 +1,167 @@ +#include "api.h" +#include "common.h" + +/* + * Currently only supports upto 5D input tensors. 
+ * 1/2/3/4 D input tensors will be scaled up to 5D. + * For example, 2x3 -> 1x1x1x2x3. + */ + +void tensor_transposef(float32_t *restrict ptr_out + ,const int *const ptr_out_shape + ,const float32_t *restrict ptr_inp + ,const int *const ptr_inp_shape + ,const int *restrict ptr_permute_vec + ,int num_out_dims + ,int num_inp_dims) +{ + + /* Shift all dim with 1 in the outer part */ + int eff_output_shape[5]; + int eff_permute_vec[5]; + + for (int i = 0; i < num_out_dims; i++){ + eff_output_shape[i] = ptr_out_shape[i]; + eff_permute_vec[i] = ptr_permute_vec[i]; + } + + int one_i = num_out_dims - 1, non_one_i = num_out_dims - 1; + while (one_i > 0 && non_one_i >= 0){ + while (one_i > 0 && eff_output_shape[one_i] != 1){ + one_i--; + } + non_one_i = one_i; + while (non_one_i >= 0 && eff_output_shape[non_one_i]==1){ + non_one_i--; + } + if (one_i > 0 && non_one_i >= 0){ + int temp; + /*swap output_shape*/ + { + temp = eff_output_shape[one_i]; + eff_output_shape[one_i] = eff_output_shape[non_one_i]; + eff_output_shape[non_one_i] = temp; + } + /*swap permute_vec*/ + { + temp = eff_permute_vec[one_i]; + eff_permute_vec[one_i] = eff_permute_vec[non_one_i]; + eff_permute_vec[non_one_i] = temp; + } + } + } + + /* Promoting lesser dim tensors to 5D tensors. + * Also updating the permute_vec and shapes as needed for optimization */ + int ptr_5D_inp_shape[5] = {1, 1, 1, 1, 1}; + int ptr_5D_out_shape[5] = {1, 1, 1, 1, 1}; + int ptr_5D_permute_vec[5] = {0, 1, 2, 3, 4}; + + /* Check if any inner inp dimension is same in the output */ + int last_dim_same = 1, last_n_same_dim = 0; + int itr = num_inp_dims - 1; + while(itr >= 0){ + last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim; + last_dim_same = (eff_permute_vec[itr] == itr) ? 
last_dim_same & 1 : last_dim_same & 0; + itr--; + } + + int dims_added = 5 - num_inp_dims; + itr = num_inp_dims - 1; + int same_count = last_n_same_dim; + int count = 4; + while(itr >= 0){ + ptr_5D_inp_shape[count] = (same_count > 0) ? ptr_5D_inp_shape[count] * ptr_inp_shape[itr] : ptr_inp_shape[itr]; + ptr_5D_out_shape[count] = (same_count > 0) ? ptr_5D_out_shape[count] * eff_output_shape[itr] : eff_output_shape[itr]; + same_count--; + itr--; + count = (same_count > 0) ? count : count - 1; + } + + itr = num_inp_dims - 1; + same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0; + count = 4; + while(itr >= 0){ + ptr_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added; + same_count--; + itr--; + count--; + } + + int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4; + int inp_dim1, inp_dim2, inp_dim3, inp_dim4; + int inp_stride[5]; + + out_dim0 = ptr_5D_out_shape[0]; + out_dim1 = ptr_5D_out_shape[1]; + out_dim2 = ptr_5D_out_shape[2]; + out_dim3 = ptr_5D_out_shape[3]; + out_dim4 = ptr_5D_out_shape[4]; + + inp_dim1 = ptr_5D_inp_shape[1]; + inp_dim2 = ptr_5D_inp_shape[2]; + inp_dim3 = ptr_5D_inp_shape[3]; + inp_dim4 = ptr_5D_inp_shape[4]; + + inp_stride[0] = inp_dim1 * inp_dim2 * inp_dim3 * inp_dim4; + inp_stride[1] = inp_dim2 * inp_dim3 * inp_dim4; + inp_stride[2] = inp_dim3 * inp_dim4; + inp_stride[3] = inp_dim4; + inp_stride[4] = 1; + + if (last_n_same_dim){ + int itr0, itr1, itr2, itr3, itr4; + float32_t *ptr_inp0 = (float32_t *)ptr_inp; + for (itr0 = 0; itr0 < out_dim0; itr0++){ + float32_t *ptr_inp1 = ptr_inp0 + (itr0 * inp_stride[ptr_5D_permute_vec[0]]); +#pragma loop_count min=1 + for (itr1 = 0; itr1 < out_dim1; itr1++){ + float32_t *ptr_inp2 = ptr_inp1 + (itr1 * inp_stride[ptr_5D_permute_vec[1]]); +#pragma loop_count min=1 + for (itr2 = 0; itr2 < out_dim2; itr2++){ + float32_t *ptr_inp3 = ptr_inp2 + (itr2 * 
inp_stride[ptr_5D_permute_vec[2]]); +#pragma loop_count min=1 + for (itr3 = 0; itr3 < out_dim3; itr3++, ptr_out += out_dim4){ + float32_t *ptr_inp4 = ptr_inp3 + (itr3 * inp_stride[ptr_5D_permute_vec[3]]); + xb_vecN_2xf32 *restrict pae_i = (xb_vecN_2xf32 *)(ptr_inp4); + xb_vecN_2xf32 *restrict pae_o = (xb_vecN_2xf32 *)(ptr_out); + valign a_inp = IVP_LAN_2XF32_PP(pae_i); + valign a_out = IVP_ZALIGN(); + xb_vecN_2xf32 d0; + for(itr4 = 0; itr4 < (out_dim4 >> (LOG2_IVP_SIMD_WIDTH - 1)); itr4++){ + IVP_LAN_2XF32_IP(d0, a_inp, pae_i); + IVP_SAN_2XF32_IP(d0, a_out, pae_o); + } + IVP_SAPOSN_2XF32_FP(a_out, pae_o); + float32_t *restrict puae_i = (float32_t *)(pae_i); + float32_t *restrict puae_o = (float32_t *)(pae_o); +#pragma loop_count max = 17 + for(itr4 = 0; itr4 < (out_dim4 & (IVP_SIMD_WIDTH / 2 - 1)); itr4++){ + puae_o[itr4] = puae_i[itr4]; + } + } + } + } + } + } + else{ + int itr0, itr1, itr2, itr3, itr4; + float32_t *ptr_inp0 = (float32_t *)ptr_inp; + for(itr0 = 0; itr0 < out_dim0; itr0++){ + float32_t *ptr_inp1 = ptr_inp0 + (itr0 * inp_stride[ptr_5D_permute_vec[0]]); + for(itr1 = 0; itr1 < out_dim1; itr1++){ + float32_t *ptr_inp2 = ptr_inp1 + (itr1 * inp_stride[ptr_5D_permute_vec[1]]); + for(itr2 = 0; itr2 < out_dim2; itr2++){ + float32_t *ptr_inp3 = ptr_inp2 + (itr2 * inp_stride[ptr_5D_permute_vec[2]]); + for(itr3 = 0; itr3 < out_dim3; itr3++){ + float32_t *ptr_inp4 = ptr_inp3 + (itr3 * inp_stride[ptr_5D_permute_vec[3]]); + for(itr4 = 0; itr4 < out_dim4; itr4++){ + *ptr_out++ = *ptr_inp4; + ptr_inp4 = ptr_inp4 + inp_stride[ptr_5D_permute_vec[4]]; + } + } + } + } + } + } +} diff --git a/backends/cadence/vision/third-party/library/api/vsoftmaxf.c b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c new file mode 100644 index 00000000000..27487c75d6c --- /dev/null +++ b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c @@ -0,0 +1,241 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by 
Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +/* + NatureDSP_Baseband library. Vector Mathematics. + Softmax, floating-point data +*/ +#include "api.h" +#include "common.h" +#include "expf_tbl.h" +#include "inff_tbl.h" +#include "nanf_tbl.h" + +/*------------------------------------------------------------------------- +Softmax + +Description: The function computes the softmax (normalized exponential +function) of input data. 16-bit fixed-point functions accept inputs in +Q3.12 and form outputs in Q7.8 format. + +vsoftmax 16-bit +vsoftmax_fp16 IEEE-754 Std. half precision floating-point. 
+vsoftmaxf IEEE-754 Std. single precision floating-point. + +Accuracy: +2 LSB for fixed point API +2 ULP for floating point API +NOTE: Accuracy of function may depend on amount of data and their +distribution. Given accuracy is achieved for N=2 for any pair of +data from input domain. + + +Parameters: +Input +: +x[N] input data, Q3.12 floating point +N Length of input/output data vectors +Output: +y[N] result, Q7.8 or floating point + +Restrictions: +x,y Must not overlap +-------------------------------------------------------------------------*/ + +#define IVP_ADDSN_2X32(b_, c_) \ + ({ \ + xb_vecN_2x32v a_; \ + xb_vecN_2x64w tmp_a_; \ + tmp_a_ = IVP_MULN_2X32(b_, 1); \ + IVP_MULAN_2X32(tmp_a_, c_, 1); \ + a_ = IVP_PACKVRN_2X64W(tmp_a_, 0); \ + a_; \ + }) + +#if !HAVE_VFPU +DISCARD_FUN(void, vsoftmaxf, (float32_t * y, const float32_t* x, int N)) +#else +void vsoftmaxf(float32_t* y, const float32_t* x, int N) { +#if !defined(IVP_MULN_2X32) +#else + const int* pTbl = (const int*)expftbl_Q30; +#endif + const xb_vecN_2xf32* restrict pX; + xb_vecN_2xf32* restrict pY; + xb_vecN_2xf32 norm, ysum, xmax; + int n; + valign al_X, al_R, al_Y; + if (N < 0) + return; + xmax = minusInff.f; + pX = (const xb_vecN_2xf32*)x; + al_X = IVP_LAN_2XF32_PP(pX); + al_Y = IVP_ZALIGN(); + for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH - 1)); n++) { + xb_vecN_2xf32 x; + IVP_LAN_2XF32_IP(x, al_X, pX); + xmax = IVP_MAXNUMN_2XF32(xmax, x); + } + if (N & (IVP_SIMD_WIDTH / 2 - 1)) { + xb_vecN_2xf32 x; + IVP_LAVN_2XF32_XP( + x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + IVP_MAXNUMN_2XF32T( + xmax, xmax, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1)))); + } + + xmax = IVP_REPN_2XF32(IVP_RMAXNUMN_2XF32(xmax), 0); + __Pragma("no_reorder"); + ysum = 0.f; + pX = (const xb_vecN_2xf32*)x; + pY = (xb_vecN_2xf32*)y; + al_X = IVP_LAN_2XF32_PP(pX); + { + vboolN_2 bnan; + bnan = IVP_LTRN_2I(0); + for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH - 1)); n++) { + xb_vecN_2xf32 x; + IVP_LAN_2XF32_IP(x, 
al_X, pX); + x = IVP_SUBN_2XF32(x, xmax); + bnan |= IVP_UNN_2XF32(x, x); + { + xb_vecN_2xf32 gf, zout; + xb_vecN_2x32v xin_i, fr, exp, t; + xb_vecN_2x32v y, y1, y2, c1, c2, f2; + xb_vecN_2x64w w; + xin_i = IVP_TRUNCN_2XF32(x, 24); + /* Multiply by 1/ln2, extract the integer and fractional (Q32) + * components. */ + /* Q54 <- Q24*Q30 */ + w = IVP_MULN_2X32(xin_i, invln2_Q30); + exp = IVP_PACKVRNRN_2X64W(w, 54); + fr = IVP_SRLN_2X32(IVP_PACKVRNRN_2X64W(w, 22), 1); + /* polynomial for 2^x */ + f2 = IVP_PACKVRN_2X64W(IVP_MULN_2X32(fr, fr), 31); + y1 = IVP_LSRN_2X32_I(pTbl, 0 * sizeof(int32_t)); + y2 = IVP_LSRN_2X32_I(pTbl, 1 * sizeof(int32_t)); + c1 = IVP_LSRN_2X32_I(pTbl, 2 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y1), 31); + y1 = IVP_ADDSN_2X32(c1, t); + c2 = IVP_LSRN_2X32_I(pTbl, 3 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y2), 31); + y2 = IVP_ADDSN_2X32(c2, t); + c1 = IVP_LSRN_2X32_I(pTbl, 4 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y1), 31); + y1 = IVP_ADDSN_2X32(c1, t); + c2 = IVP_LSRN_2X32_I(pTbl, 5 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y2), 31); + y2 = IVP_ADDSN_2X32(c2, t); + c1 = IVP_LSRN_2X32_I(pTbl, 6 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y1), 31); + y1 = IVP_ADDSN_2X32(c1, t); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(fr, y2), 31); + y = IVP_ADDSN_2X32(y1, t); + /* scale result to original exponent ignoring very low items */ + gf = IVP_FLOATN_2X32(y, 30); + exp = IVP_SLLIN_2X32(IVP_MAXN_2X32(IVP_ADDN_2X32(127, exp), 0), 23); + zout = IVP_MULN_2XF32(gf, IVP_MOVN_2XF32_FROMN_2X32(exp)); + x = zout; + } + ysum = IVP_ADDN_2XF32(ysum, x); + IVP_SAN_2XF32_IP(x, al_Y, pY); + } + if (N & (IVP_SIMD_WIDTH / 2 - 1)) { + xb_vecN_2xf32 x; + IVP_LAVN_2XF32_XP( + x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + x = IVP_SUBN_2XF32(x, xmax); + bnan |= IVP_UNN_2XF32(x, x); + { + xb_vecN_2xf32 gf, zout; + xb_vecN_2x32v xin_i, fr, exp, t; + xb_vecN_2x32v 
y, y1, y2, c1, c2, f2; + xb_vecN_2x64w w; + xin_i = IVP_TRUNCN_2XF32(x, 24); + /* Multiply by 1/ln2, extract the integer and fractional (Q32) + * components. */ + /* Q54 <- Q24*Q30 */ + w = IVP_MULN_2X32(xin_i, invln2_Q30); + exp = IVP_PACKVRNRN_2X64W(w, 54); + fr = IVP_SRLN_2X32(IVP_PACKVRNRN_2X64W(w, 22), 1); + /* polynomial for 2^x */ + f2 = IVP_PACKVRN_2X64W(IVP_MULN_2X32(fr, fr), 31); + y1 = IVP_LSRN_2X32_I(pTbl, 0 * sizeof(int32_t)); + y2 = IVP_LSRN_2X32_I(pTbl, 1 * sizeof(int32_t)); + c1 = IVP_LSRN_2X32_I(pTbl, 2 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y1), 31); + y1 = IVP_ADDSN_2X32(c1, t); + c2 = IVP_LSRN_2X32_I(pTbl, 3 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y2), 31); + y2 = IVP_ADDSN_2X32(c2, t); + c1 = IVP_LSRN_2X32_I(pTbl, 4 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y1), 31); + y1 = IVP_ADDSN_2X32(c1, t); + c2 = IVP_LSRN_2X32_I(pTbl, 5 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y2), 31); + y2 = IVP_ADDSN_2X32(c2, t); + c1 = IVP_LSRN_2X32_I(pTbl, 6 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y1), 31); + y1 = IVP_ADDSN_2X32(c1, t); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(fr, y2), 31); + y = IVP_ADDSN_2X32(y1, t); + /* scale result to original exponent ignoring very low items */ + gf = IVP_FLOATN_2X32(y, 30); + exp = IVP_SLLIN_2X32(IVP_MAXN_2X32(IVP_ADDN_2X32(127, exp), 0), 23); + zout = IVP_MULN_2XF32(gf, IVP_MOVN_2XF32_FROMN_2X32(exp)); + x = zout; + } + IVP_ADDN_2XF32T( + ysum, ysum, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1)))); + IVP_SAVN_2XF32_XP( + x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + } + IVP_SAPOSN_2XF32_FP(al_Y, pY); + ysum = IVP_MOVN_2XF32T(qNaNf.f, ysum, bnan); + } + norm = XT_RECIP_S(IVP_RADDN_2XF32(ysum)); + __Pragma("no_reorder"); + pX = (const xb_vecN_2xf32*)y; + pY = (xb_vecN_2xf32*)y; + + al_R = IVP_LAN_2XF32_PP(pX); + + for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH - 1)); n++) { + xb_vecN_2xf32 x; + 
IVP_LAN_2XF32_IP(x, al_R, pX); + x = IVP_MULN_2XF32(x, norm); + IVP_SAN_2XF32_IP(x, al_Y, pY); + } + if (N & (IVP_SIMD_WIDTH / 2 - 1)) { + xb_vecN_2xf32 x; + IVP_LAVN_2XF32_XP( + x, al_R, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + x = IVP_MULN_2XF32(x, norm); + IVP_SAVN_2XF32_XP( + x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + } + IVP_SAPOSN_2XF32_FP(al_Y, pY); + +} /* vsoftmaxf() */ +#endif diff --git a/backends/cadence/vision/third-party/library/tables/expf_tbl.c b/backends/cadence/vision/third-party/library/tables/expf_tbl.c new file mode 100644 index 00000000000..f1c6f3d44ae --- /dev/null +++ b/backends/cadence/vision/third-party/library/tables/expf_tbl.c @@ -0,0 +1,85 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. 
and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ + +/* + tables for expf(x) approximation +*/ +/* Portable data types. */ +#include "expf_tbl.h" +#include "dtypes.h" + +/* + polynomial coefficients for 2^x in range 0...1 + + derived by MATLAB code: + order=6; + x=(0:pow2(1,-16):1); + y=2.^x; + p=polyfit(x,y,6); + p(order+1)=1; + p(order)=p(order)-(sum(p)-2); +*/ +const int32_t ALIGN_2SIMD expftbl_Q30[8] = { + 234841, + 1329551, + 10400465, + 59570027, + 257946177, + 744260763, + 1073741824, + 0 /* Padding to allow for vector loads */ +}; + +const union ufloat32uint32 ALIGN_2SIMD + expfminmax[2] = /* minimum and maximum arguments of expf() input */ + { + {0xc2ce8ed0}, /*-1.0327893066e+002f */ + {0x42b17218} /* 8.8722839355e+001f */ +}; + +const int32_t invln2_Q30 = 1549082005L; /* 1/ln(2), Q30 */ + +const union ufloat32uint32 ALIGN_2SIMD log2_e[2] = { + {0x3fb8aa3b}, /* 1.4426950216 */ + {0x32a57060} /* 1.9259629891e-008 */ +}; + +/* +order=6; +x=(0:pow2(1,-16):1); +y=2.^x; +p=polyfit(x,y,order); +p(order+1)=1; +p(order)=p(order)-(sum(p)-2); +num2hex(single(p)); +*/ +const union ufloat32uint32 ALIGN_2SIMD expftblf[] = { + {0x39655635}, + {0x3aa24c7a}, + {0x3c1eb2d1}, + {0x3d633ddb}, + {0x3e75ff24}, + {0x3f317212}, + {0x3f800000}}; diff --git a/backends/cadence/vision/third-party/library/tables/inff_tbl.c b/backends/cadence/vision/third-party/library/tables/inff_tbl.c new file mode 100644 index 00000000000..8464ee9f549 --- /dev/null +++ b/backends/cadence/vision/third-party/library/tables/inff_tbl.c @@ -0,0 +1,38 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. 
*/ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. 
*/ +/* ------------------------------------------------------------------------ */ + +/* + infinities for single precision routines +*/ + +#include "inff_tbl.h" +#include "dtypes.h" + +const union ufloat32uint32 minusInff = {0xff800000}; /* -Inf */ +const union ufloat32uint32 plusInff = {0x7f800000}; /* +Inf */ +const union ufloat32uint32 realmaxf = { + 0x7f7fffff}; /* maximum floating point number */ +const union ufloat32uint32 realminf = { + 0x00800000}; /* minimum floating point number */ diff --git a/backends/cadence/vision/third-party/library/tables/nanf_tbl.c b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c new file mode 100644 index 00000000000..f165234fce4 --- /dev/null +++ b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c @@ -0,0 +1,38 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. 
under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +/* + NaN values for single precision routines +*/ + +/* Portable data types. */ +/* NaN values for single precision routines. */ +#include "nanf_tbl.h" +#include "dtypes.h" + +const union ufloat32uint32 sNaNf = {0x7f800001}; /* Signalling NaN */ +const union ufloat32uint32 qNaNf = {0x7fc00000}; /* Quiet NaN */ +const union ufloat32uint32 minus_sNaNf = { + 0xff800001}; /* Negative Signalling NaN */ +const union ufloat32uint32 minus_qNaNf = {0xffc00000}; /* Negative Quiet NaN */ diff --git a/backends/cadence/vision/third-party/targets.bzl b/backends/cadence/vision/third-party/targets.bzl new file mode 100644 index 00000000000..26a097010d5 --- /dev/null +++ b/backends/cadence/vision/third-party/targets.bzl @@ -0,0 +1,38 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//arvr/tools/build_defs:oxx.bzl", "oxx_binary", "oxx_static_library") + + +def define_common_targets(): + runtime.cxx_library( + name = "vision-nnlib", + srcs = select({ + "DEFAULT": ["dummy.c"], # Use dummy file for non-Xtensa builds + "ovr_config//cpu:xtensa": glob(["library/**/*.c"]), + }), + exported_headers = glob([ + "include/*.h", + "include_private/*.h" + ]), + header_namespace = "", + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + platforms = CXX, + compatible_with = select({ + "DEFAULT": [], + "ovr_config//cpu:xtensa": ["ovr_config//cpu:xtensa"], + }), + compiler_flags = select({ + "DEFAULT": ["-UCOMPILER_XTENSA"], # Ensure COMPILER_XTENSA is not defined for non-Xtensa builds + "ovr_config//cpu:xtensa": [ + "-DCOMPILER_XTENSA", + "-Ixplat/executorch/backends/cadence/vision/third-party/include", + "-Ixplat/executorch/backends/cadence/vision/third-party/include_private", + ], + }), + define_static_target = True, + ) diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt index 1567b8b5e1c..a728584e49c 100644 --- a/backends/cortex_m/CMakeLists.txt +++ b/backends/cortex_m/CMakeLists.txt @@ -12,7 +12,7 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -# Source root directory for executorch. +# Source root directory for executorch if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
endif() @@ -21,71 +21,76 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) include(FetchContent) -# CMSIS-NN version to download +# CMSIS-NN configuration with dynamic path detection set(CMSIS_NN_VERSION - "v4.1.0" + "v7.0.0" CACHE STRING "CMSIS-NN version to download" ) - -# Declare CMSIS-NN as a FetchContent project -FetchContent_Declare( - cmsis_nn - GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git - GIT_TAG ${CMSIS_NN_VERSION} +set(CMSIS_NN_LOCAL_PATH + "" + CACHE PATH "Path to existing local CMSIS-NN installation" ) -# Download and make CMSIS-NN available -FetchContent_MakeAvailable(cmsis_nn) - -# Print paths for debugging -message(STATUS "CMSIS-NN source dir: ${cmsis_nn_SOURCE_DIR}") -message(STATUS "CMSIS-NN binary dir: ${cmsis_nn_BINARY_DIR}") +# Try to find existing / local CMSIS-NN installation. This is useful for +# debugging and testing with local changes. This is not common, as the CMSIS-NN +# library is downloaded via FetchContent in the default/regular case. 
+if(CMSIS_NN_LOCAL_PATH AND EXISTS "${CMSIS_NN_LOCAL_PATH}") + message(STATUS "Using CMSIS-NN from specified path: ${CMSIS_NN_LOCAL_PATH}") + add_subdirectory(${CMSIS_NN_LOCAL_PATH} _deps/cmsis_nn-build) +else() + # Use FetchContent with automatic fallback + message(STATUS "Using CMSIS-NN via FetchContent") + + FetchContent_Declare( + cmsis_nn + GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git + GIT_TAG ${CMSIS_NN_VERSION} + GIT_SHALLOW TRUE + ) + + FetchContent_MakeAvailable(cmsis_nn) +endif() # Cortex-M ops kernel sources set(_cortex_m_kernels__srcs ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_linear.cpp ) -# Generate C++ bindings to register kernels into Executorch (for runtime) +# Generate C++ bindings to register kernels into Executorch set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml) gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}") - generate_bindings_for_kernels( LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}" ) -message("Generated files ${gen_command_sources}") -# Build a library for cortex_m_kernels +# Build library for cortex_m_kernels add_library(cortex_m_kernels ${_cortex_m_kernels__srcs}) -target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options}) -# Include directories for cortex_m_kernels -target_include_directories( +# Use PRIVATE for implementation dependencies to avoid INTERFACE pollution +target_link_libraries( cortex_m_kernels - PRIVATE ${EXECUTORCH_ROOT}/.. 
- ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 - ${cmsis_nn_SOURCE_DIR}/Include + PRIVATE cmsis-nn + PRIVATE executorch ) -# Link directly to the CMSIS-NN static library file -target_link_libraries( - cortex_m_kernels PUBLIC ${cmsis_nn_BINARY_DIR}/libcmsis-nn.a executorch +# Include directories for cortex_m_kernels +target_include_directories( + cortex_m_kernels PRIVATE ${EXECUTORCH_ROOT}/.. + ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 ) -# Add dependency to ensure CMSIS-NN builds before we try to link. Use the actual -# CMSIS-NN target name (usually 'cmsis-nn') -add_dependencies(cortex_m_kernels cmsis-nn) - # cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime gen_operators_lib( LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch ) install( - TARGETS cortex_m_kernels cortex_m_ops_lib + TARGETS cortex_m_kernels cortex_m_ops_lib cmsis-nn EXPORT ExecuTorchTargets - DESTINATION lib - PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/ops/ + DESTINATION ${CMAKE_INSTALL_LIBDIR} + PUBLIC_HEADER + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/backends/cortex_m/ops/ ) diff --git a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h new file mode 100644 index 00000000000..4b9fdaebdf7 --- /dev/null +++ b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include "cortex_m_ops_common.h" +extern "C" { +#include "arm_nnfunctions.h" +} + +namespace cortex_m { +namespace native { + +// During AOT phase, quantized_linear_fusion_pass allocates total buffer +// and passes in as 'Tensor'. 
(Total buffer = 8-byte header + x bytes) +// ┌─────────────────┬─────────────────────────────────────┐ +// │ KernelSum Header│ CMSIS Workspace │ +// │ (8 bytes) │ (x bytes) │ +// └─────────────────┴─────────────────────────────────────┘ +// │ │ +// │ └─> Passed to CMSIS API +// │ +// └─> State for kernel sum + +// C++ Runtime: +// ┌─────────────────┬─────────────────────────────────────┐ +// │ KernelSum Header│ CMSIS Workspace │ +// │ (8 bytes) │ (x bytes) │ +// └─────────────────┴─────────────────────────────────────┘ +// ^ ^ +// │ │ +// scratch_ptr cmsis_workspace_ptr +// │ │ +// ▼ ▼ +// arm_vector_sum_s8() writes kernel sums (with bias if avail): +// [sum₀+bias₀][sum₁+bias₁][sum₂+bias₂]...[sum_{n-1}+bias_{n-1}] +// (n * 4-byte int32_t values = x bytes) +// +// - n = out_features (number of output features) +// - x = n * 4 bytes (total CMSIS buffer size) +// - Total buffer = 8 + x bytes + +class CMSISScratchBufferContext final { + public: + CMSISScratchBufferContext( + Tensor& scratch_buffer, + const Tensor& weights, + const Tensor& weight_zero_point, + const torch::executor::optional& bias) + : scratch_ptr_(scratch_buffer.mutable_data_ptr()), + total_size_(scratch_buffer.size(0)), + base_ptr_(reinterpret_cast(scratch_ptr_)), + in_features_(weights.size(1)), + out_features_(weights.size(0)), + is_per_channel_(weight_zero_point.numel() > 1), + weight_data_offset_(calculate_offset(weights.const_data_ptr())), + weight_zp_data_offset_( + calculate_offset(weight_zero_point.const_data_ptr())), + bias_data_offset_( + bias.has_value() + ? 
calculate_offset(bias.value().const_data_ptr()) + : 0), + header_(reinterpret_cast(scratch_ptr_)), + cmsis_workspace_ptr_(scratch_ptr_ + KERNEL_SUM_HEADER_SIZE) { + cmsis_nn_dims filter_dims = {in_features_, 1, 1, out_features_}; + validate_size(filter_dims); + } + + cmsis_nn_context get_cmsis_ctx() const { + cmsis_nn_context ctx; + ET_CHECK_MSG( + reinterpret_cast(cmsis_workspace_ptr_) % 4 == 0, + "CMSIS workspace not 4-byte aligned"); + ctx.buf = cmsis_workspace_ptr_; + ctx.size = get_cmsis_workspace_size(); + return ctx; + } + + bool is_kernel_sum_updated() const { + return header_->updated; + } + + void compute_kernel_sums_if_needed() { + if (!header_->updated) { + arm_vector_sum_s8( + reinterpret_cast(cmsis_workspace_ptr_), + in_features_, + out_features_, + get_weight_data(), + get_weight_zp_data()[0], + 0, + get_bias_data()); + header_->updated = true; + ET_LOG( + Info, + "Computed kernel sums. [required_bytes : %d]", + header_->required_size); + } + } + + const int8_t* get_weight_data() const { + return reinterpret_cast(base_ptr_ + weight_data_offset_); + } + + const int32_t* get_weight_zp_data() const { + return reinterpret_cast(base_ptr_ + weight_zp_data_offset_); + } + + const int32_t* get_bias_data() const { + return bias_data_offset_ == 0 + ? 
nullptr + : reinterpret_cast(base_ptr_ + bias_data_offset_); + } + + bool is_per_channel_quant() const { + return is_per_channel_; + } + int32_t get_in_features() const { + return in_features_; + } + int32_t get_out_features() const { + return out_features_; + } + + private: + static constexpr size_t KERNEL_SUM_HEADER_SIZE = 8; + + // Header for kernel sum computation state only + struct KernelSumHeader { + bool updated = false; + int32_t required_size = 0; + }; + static_assert( + sizeof(KernelSumHeader) == KERNEL_SUM_HEADER_SIZE, + "KernelSumHeader must be exactly 8 bytes"); + + int8_t* scratch_ptr_; + size_t total_size_; + uint8_t* base_ptr_; + + // Context members + const int32_t in_features_; + const int32_t out_features_; + const bool is_per_channel_; + const uint32_t weight_data_offset_; + const uint32_t weight_zp_data_offset_; + const uint32_t bias_data_offset_; + + KernelSumHeader* header_; + int8_t* cmsis_workspace_ptr_; + + uint32_t calculate_offset(const void* ptr) const { + if (ptr == nullptr) + return 0; + + const uint8_t* ptr_bytes = reinterpret_cast(ptr); + ET_CHECK_MSG(ptr_bytes >= base_ptr_, "Pointer is before base address"); + + const std::ptrdiff_t offset = ptr_bytes - base_ptr_; + ET_CHECK_MSG( + offset >= 0 && offset <= UINT32_MAX, "Offset out of valid range"); + return static_cast(offset); + } + + size_t get_cmsis_workspace_size() const { + return total_size_ - KERNEL_SUM_HEADER_SIZE; + } + + void validate_size(const cmsis_nn_dims& filter_dims) const { + header_->required_size = + arm_fully_connected_s8_get_buffer_size(&filter_dims); + + ET_CHECK_MSG( + get_cmsis_workspace_size() >= + static_cast(header_->required_size), + "Scratch buffer size %zu insufficient for required size %d", + get_cmsis_workspace_size(), + header_->required_size); + } +}; + +} // namespace native +} // namespace cortex_m diff --git a/backends/cortex_m/ops/cortex_m_ops_common.h b/backends/cortex_m/ops/cortex_m_ops_common.h index 5ef2d9d4bf9..eaa7027e46c 100644 --- 
a/backends/cortex_m/ops/cortex_m_ops_common.h +++ b/backends/cortex_m/ops/cortex_m_ops_common.h @@ -22,6 +22,10 @@ using ScalarType = executorch::aten::ScalarType; using Scalar = torch::executor::Scalar; using Error = executorch::runtime::Error; +// From arm_nn_math_types.h +#define ARM_NN_Q31_MAX ((int32_t)(0x7FFFFFFFL)) +#define ARM_NN_Q31_MIN ((int32_t)(0x80000000L)) + // Basic tensor type / layout validation and dimension order checking inline void validate_cmsis_nn_tensor_requirements( const Tensor& input1, @@ -32,16 +36,19 @@ inline void validate_cmsis_nn_tensor_requirements( // Basic dtype validation ET_CHECK_MSG( input1.scalar_type() == expected_dtype, - "Input1 dtype must be %hhd", - expected_dtype); + "Input1 dtype must be %hhd, got %hhd", + expected_dtype, + input1.scalar_type()); ET_CHECK_MSG( input2.scalar_type() == expected_dtype, - "Input2 dtype must be %hhd", - expected_dtype); + "Input2 dtype must be %hhd, got %hhd", + expected_dtype, + input2.scalar_type()); ET_CHECK_MSG( output.scalar_type() == expected_dtype, - "Output dtype must be %hhd", - expected_dtype); + "Output dtype must be %hhd, got %hhd", + expected_dtype, + output.scalar_type()); // Dim order consistency ET_CHECK_MSG( @@ -114,6 +121,33 @@ inline void validate_quantization_params( "Single quant Output"); } +// Refer to CMSIS-NN 'arm_nn_requantize' implementation for details: +// https://github.com/ARM-software/CMSIS-NN/blob/main/Include/arm_nnsupportfunctions.h#L1625 +// multiplier: Range {ARM_NN_Q31_MIN + 1, Q32_MAX} +// shift : Range {-31, 30} +inline bool validate_per_channel_quant_params( + const int32_t* multipliers, + const int32_t* shifts, + int num_channels) { + for (int i = 0; i < num_channels; ++i) { + // Multiplier: {ARM_NN_Q31_MIN + 1, ARM_NN_Q31_MAX} + if (multipliers[i] <= ARM_NN_Q31_MIN || multipliers[i] > ARM_NN_Q31_MAX) { + ET_LOG( + Error, + "weight_multiplier[%d] out of CMSIS-NN range: %d", + i, + multipliers[i]); + return false; + } + // Shift: {-31, 30} for 
arm_nn_requantize + if (shifts[i] < -31 || shifts[i] > 30) { + ET_LOG(Error, "weight_shift[%d] out of range: %d", i, shifts[i]); + return false; + } + } + return true; +} + inline Error resize_to_broadcast_target_size( const Tensor& input1, const Tensor& input2, diff --git a/backends/cortex_m/ops/op_quantized_linear.cpp b/backends/cortex_m/ops/op_quantized_linear.cpp new file mode 100644 index 00000000000..d1ccb6d0d45 --- /dev/null +++ b/backends/cortex_m/ops/op_quantized_linear.cpp @@ -0,0 +1,171 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "cmsis_scratch_buffer_context.h" +#include "cortex_m_ops_common.h" + +extern "C" { +#include "arm_nnfunctions.h" +} + +namespace cortex_m { +namespace native { +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; + +Tensor& quantized_linear_out( + KernelRuntimeContext& context, + const Tensor& input, + const Scalar& input_zero_point, + const Scalar& input_multiplier, + const Scalar& input_shift, + const Tensor& weights, + const Tensor& weight_zero_point, + const Tensor& weight_multiplier, + const Tensor& weight_shift, + const torch::executor::optional& bias, + const Tensor& bias_multiplier, + const Tensor& bias_shift, + const Tensor& scratch_buffer, + const Scalar& output_zero_point, + const Scalar& in_features, + const Scalar& out_features, + Tensor& out) { + ET_LOG(Info, "quantized_linear_out: called"); + validate_cmsis_nn_tensor_requirements(input, weights, out); + + ET_CHECK_MSG( + scratch_buffer.scalar_type() == ScalarType::Char, + "Scratch buffer must be int8"); + + const int32_t batch_size = input.size(0); + const int32_t in_feat = static_cast(in_features.to()); + const int32_t out_feat = static_cast(out_features.to()); + const int32_t input_zp = static_cast(input_zero_point.to()); + const int32_t output_zp = + 
static_cast(output_zero_point.to()); + const bool is_per_channel = (weight_zero_point.numel() > 1); + + const int8_t* input_data = input.const_data_ptr(); + const int8_t* weight_data = weights.const_data_ptr(); + const int32_t* bias_data = + bias.has_value() ? bias.value().const_data_ptr() : nullptr; + int8_t* output_data = out.mutable_data_ptr(); + const int32_t* weight_zp_data = weight_zero_point.const_data_ptr(); + const int32_t* weight_mult_data = weight_multiplier.const_data_ptr(); + const int32_t* weight_shift_data = weight_shift.const_data_ptr(); + + if (!validate_per_channel_quant_params( + weight_mult_data, weight_shift_data, out_feat)) { + context.fail(Error::InvalidArgument); + return out; + } + + // Initialize scratch buffer context (validates early) + CMSISScratchBufferContext scratch_ctx( + const_cast(scratch_buffer), weights, weight_zero_point, bias); + + scratch_ctx.compute_kernel_sums_if_needed(); + cmsis_nn_context ctx = scratch_ctx.get_cmsis_ctx(); + + // Setup CMSIS-NN parameters + cmsis_nn_fc_params fc_params; + fc_params.input_offset = -input_zp; + fc_params.output_offset = output_zp; + fc_params.activation.min = std::numeric_limits::min(); + fc_params.activation.max = std::numeric_limits::max(); + + cmsis_nn_dims input_dims = {1, 1, 1, in_feat}; + cmsis_nn_dims filter_dims = {in_feat, 1, 1, out_feat}; + cmsis_nn_dims bias_dims = {1, 1, 1, out_feat}; + cmsis_nn_dims output_dims = {1, 1, 1, out_feat}; + + arm_cmsis_nn_status status; + for (int32_t b = 0; b < batch_size; b++) { + const int8_t* batch_input = input_data + b * in_feat; + int8_t* batch_output = output_data + b * out_feat; + + ET_CHECK_MSG( + batch_input != nullptr && weight_data != nullptr, + "Null input pointers"); + ET_CHECK_MSG(in_feat > 0 && out_feat > 0, "Invalid dimensions"); + + if (is_per_channel) { + cmsis_nn_per_channel_quant_params per_channel_quant_params; + per_channel_quant_params.multiplier = + const_cast(weight_mult_data); + per_channel_quant_params.shift = 
const_cast(weight_shift_data); + + status = arm_fully_connected_per_channel_s8( + &ctx, + &fc_params, + &per_channel_quant_params, + &input_dims, + batch_input, + &filter_dims, + weight_data, + &bias_dims, + bias_data, + &output_dims, + batch_output); + } else { + fc_params.filter_offset = -weight_zp_data[0]; + cmsis_nn_per_tensor_quant_params per_tensor_quant_params; + per_tensor_quant_params.multiplier = weight_mult_data[0]; + per_tensor_quant_params.shift = weight_shift_data[0]; + + status = arm_fully_connected_s8( + &ctx, + &fc_params, + &per_tensor_quant_params, + &input_dims, + batch_input, + &filter_dims, + weight_data, + &bias_dims, + bias_data, + &output_dims, + batch_output); + } + + if (status != ARM_CMSIS_NN_SUCCESS) { + ET_LOG( + Error, + "quantized_linear_out: CMSIS-NN failed with status [%d]", + status); + context.fail(Error::Internal); + return out; + } + } + return out; +} + +// Functional variant (stub, not used at runtime) +Tensor quantized_linear( + KernelRuntimeContext& context, + const Tensor& input, + const Scalar& input_zero_point, + const Scalar& input_multiplier, + const Scalar& input_shift, + const Tensor& weights, + const Tensor& weight_zero_point, + const Tensor& weight_multiplier, + const Tensor& weight_shift, + const torch::executor::optional& bias, + const Tensor& bias_multiplier, + const Tensor& bias_shift, + const Tensor& scratch_buffer, + const Scalar& output_zero_point, + const Scalar& in_features, + const Scalar& out_features) { + ET_LOG(Info, "quantized_linear: called"); + assert(false); + return const_cast(input); +} + +} // namespace native +} // namespace cortex_m diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py index 926dcd85e4b..d642531e950 100644 --- a/backends/cortex_m/ops/operators.py +++ b/backends/cortex_m/ops/operators.py @@ -223,3 +223,216 @@ def quantized_add_out_impl( out.copy_(result_quantized) return out + + +# 
=================================================================== +# QUANTIZED LINEAR OPERATION DEFINITION +# =================================================================== + + +def _check_per_tensor_or_per_channel(param: torch.Tensor, out_channels: int, name: str): + assert param.numel() in [ + 1, + out_channels, + ], f"{name} must be per-tensor (1) or per-channel ({out_channels}), got {param.numel()}" + + +lib.define( + "quantized_linear.out(" + "Tensor input, Scalar input_zero_point, Scalar input_multiplier, Scalar input_shift, " + "Tensor weights, " + "Tensor weight_zero_point, Tensor weight_multiplier, Tensor weight_shift, " + "Tensor? bias, Tensor bias_multiplier, Tensor bias_shift, " + "Tensor scratch_buffer, Scalar output_zero_point, Scalar in_features, Scalar out_features, " + "*, Tensor(a!) out) -> Tensor(a!)" +) + +# Define functional variant (non-out version) +lib.define( + "quantized_linear(" + "Tensor input, Scalar input_zero_point, Scalar input_multiplier, Scalar input_shift, " + "Tensor weights, " + "Tensor weight_zero_point, Tensor weight_multiplier, Tensor weight_shift, " + "Tensor? 
bias, Tensor bias_multiplier, Tensor bias_shift, " + "Tensor scratch_buffer, Scalar output_zero_point, Scalar in_features, Scalar out_features" + ") -> Tensor" +) + + +# Fake meta function for shape inference (out variant) +@register_fake("cortex_m::quantized_linear.out") +def quantized_linear_out_meta( + input: torch.Tensor, + input_zero_point: int, + input_multiplier: int, + input_shift: int, + weights: torch.Tensor, + weight_zero_point: torch.Tensor, + weight_multiplier: torch.Tensor, + weight_shift: torch.Tensor, + bias: torch.Tensor, + bias_multiplier: torch.Tensor, + bias_shift: torch.Tensor, + scratch_buffer: torch.Tensor, + output_zero_point: int, + in_features: int, + out_features: int, + out: torch.Tensor, +) -> torch.Tensor: + # Validate dimensions + batch_size = input.shape[0] + out_channels = weights.shape[0] + + # Validate weight quantization parameters dimensions + _check_per_tensor_or_per_channel( + weight_zero_point, out_channels, "weight_zero_point" + ) + _check_per_tensor_or_per_channel( + weight_multiplier, out_channels, "weight_multiplier" + ) + _check_per_tensor_or_per_channel(weight_shift, out_channels, "weight_shift") + + # Validate output shape + expected_shape = (batch_size, out_channels) + assert ( + out.shape == expected_shape + ), f"Output shape {out.shape} must be {expected_shape}" + + return out + + +# Fake meta function for shape inference (functional variant) +@register_fake("cortex_m::quantized_linear") +def quantized_linear_meta( + input: torch.Tensor, + input_zero_point: int, + input_multiplier: int, + input_shift: int, + weights: torch.Tensor, + weight_zero_point: torch.Tensor, + weight_multiplier: torch.Tensor, + weight_shift: torch.Tensor, + bias: torch.Tensor, + bias_multiplier: torch.Tensor, + bias_shift: torch.Tensor, + scratch_buffer: torch.Tensor, + output_zero_point: int, + in_features: int, + out_features: int, +) -> torch.Tensor: + # Validate dimensions (same as out variant) + batch_size = input.shape[0] + out_channels 
= weights.shape[0] + + # Validate weight quantization parameters dimensions + _check_per_tensor_or_per_channel( + weight_zero_point, out_channels, "weight_zero_point" + ) + _check_per_tensor_or_per_channel( + weight_multiplier, out_channels, "weight_multiplier" + ) + _check_per_tensor_or_per_channel(weight_shift, out_channels, "weight_shift") + + # Calculate output shape for functional variant + output_shape = (batch_size, out_channels) + return torch.empty(output_shape, dtype=input.dtype, device=input.device) + + +@impl(lib, "quantized_linear.out", "CompositeExplicitAutograd") +def quantized_linear_out_impl( + input: torch.Tensor, + input_zero_point: int, + input_multiplier: int, + input_shift: int, + weights: torch.Tensor, + weight_zero_point: torch.Tensor, + weight_multiplier: torch.Tensor, + weight_shift: torch.Tensor, + bias: torch.Tensor, + bias_multiplier: torch.Tensor, + bias_shift: torch.Tensor, + scratch_buffer: torch.Tensor, + output_zero_point: int, + in_features: int, + out_features: int, + *, + out: torch.Tensor, +) -> torch.Tensor: + """ + Fallback implementation for meta/testing + Note: This won't be called at runtime, only during compilation + """ + + # Per-channel dequantization + input_scale = input_multiplier * (2.0 ** (-input_shift)) + input_fp = (input.float() - input_zero_point) * input_scale + if weight_zero_point.numel() == 1: + # Per-tensor + weight_scale = weight_multiplier.item() * (2.0 ** (-weight_shift.item())) + weights_fp = (weights.float() - weight_zero_point.item()) * weight_scale + else: + # Per-channel + weight_scales = weight_multiplier.float() * (2.0 ** (-weight_shift.float())) + weights_fp = ( + weights.float() - weight_zero_point.float().unsqueeze(1) + ) * weight_scales.unsqueeze(1) + bias_fp = None + if bias is not None: + bias_scales = bias_multiplier.float() * (2.0 ** (-bias_shift.float())) + bias_fp = bias.float() * bias_scales + + result_fp = torch.nn.functional.linear(input_fp, weights_fp, bias_fp) + else: + result_fp = 
torch.nn.functional.linear(input_fp, weights_fp) + result_quantized = torch.clamp( + torch.round(result_fp + output_zero_point), -128, 127 + ).to(torch.int8) + out.copy_(result_quantized) + return out + + +# Functional variant implementation +@impl(lib, "quantized_linear", "CompositeExplicitAutograd") +def quantized_linear_impl( + input: torch.Tensor, + input_zero_point: int, + input_multiplier: int, + input_shift: int, + weights: torch.Tensor, + weight_zero_point: torch.Tensor, + weight_multiplier: torch.Tensor, + weight_shift: torch.Tensor, + bias: torch.Tensor, + bias_multiplier: torch.Tensor, + bias_shift: torch.Tensor, + scratch_buffer: torch.Tensor, + output_zero_point: int, + in_features: int, + out_features: int, +) -> torch.Tensor: + """ + Functional variant - creates output tensor and calls out variant + """ + # Create output tensor + batch_size = input.shape[0] + output = torch.empty( + (batch_size, out_features), dtype=torch.int8, device=input.device + ) + return quantized_linear_out_impl( + input, + input_zero_point, + input_multiplier, + input_shift, + weights, + weight_zero_point, + weight_multiplier, + weight_shift, + bias, + bias_multiplier, + bias_shift, + scratch_buffer, + output_zero_point, + in_features, + out_features, + out=output, + ) diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml index f2615a1f525..b41c0c68fa5 100644 --- a/backends/cortex_m/ops/operators.yaml +++ b/backends/cortex_m/ops/operators.yaml @@ -27,3 +27,15 @@ kernels: - arg_meta: null kernel_name: cortex_m::quantized_add_out + +- func: cortex_m::quantized_linear(Tensor input, Scalar input_zero_point, Scalar input_multiplier, Scalar input_shift, Tensor weights, Tensor weight_zero_point, Tensor weight_multiplier, Tensor weight_shift, Tensor? 
bias, Tensor bias_multiplier, Tensor bias_shift, Tensor scratch_buffer, Scalar output_zero_point, Scalar in_features, Scalar out_features) -> Tensor + variants: function + kernels: + - arg_meta: null + kernel_name: cortex_m::quantized_linear + +- func: cortex_m::quantized_linear.out(Tensor input, Scalar input_zero_point, Scalar input_multiplier, Scalar input_shift, Tensor weights, Tensor weight_zero_point, Tensor weight_multiplier, Tensor weight_shift, Tensor? bias, Tensor bias_multiplier, Tensor bias_shift, Tensor scratch_buffer, Scalar output_zero_point, Scalar in_features, Scalar out_features, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: cortex_m::quantized_linear_out diff --git a/backends/cortex_m/passes/passes_utils.py b/backends/cortex_m/passes/passes_utils.py index 3f6e05fc4de..7155f997bf4 100644 --- a/backends/cortex_m/passes/passes_utils.py +++ b/backends/cortex_m/passes/passes_utils.py @@ -8,6 +8,10 @@ import torch +from executorch.exir.dialects._ops import ops as exir_ops + +from torch.fx import Node + def dequantize_per_tensor_cmsis( qtensor: torch.Tensor, zero_point: int, multiplier: int, shift: int @@ -92,3 +96,58 @@ def quantize_multiplier_aot(scale: float) -> tuple[int, int]: def cleanup_erased_nodes(graph_module: torch.fx.GraphModule): # Placeholder for any additional cleanup if needed pass + + +def transfer_metadata( + new_node: Node, source_node: Node, pass_name: str = "QuantizedPass" +) -> None: + """Transfer metadata with proper provenance tracking.""" + if hasattr(source_node, "meta") and source_node.meta: + new_node.meta = source_node.meta.copy() + if "from_node" in new_node.meta: + from_node_list = new_node.meta.get("from_node", []).copy() + from_node_list.append( + {"source": source_node.name, "pass": pass_name, "op": "fuse"} + ) + new_node.meta["from_node"] = from_node_list + for field in ["tensor_meta", "stack_trace"]: + if field in source_node.meta: + new_node.meta[field] = 
source_node.meta[field] + + +def is_dequant_node(node: Node) -> bool: + """Check if node is a dequantize operation.""" + dequant_targets = { + exir_ops.edge.cortex_m.dequantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, + } + return node.op == "call_function" and node.target in dequant_targets + + +def is_quant_node(node: Node) -> bool: + """Check if node is a quantize operation.""" + quant_targets = { + exir_ops.edge.cortex_m.quantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + } + return node.op == "call_function" and node.target in quant_targets + + +def cleanup_nodes(nodes_to_erase, graph): + """Clean up marked nodes from graph.""" + failed_nodes = [] + + for node in reversed(nodes_to_erase): + if node in graph.nodes and len(node.users) == 0: + try: + graph.erase_node(node) + except Exception as e: + print(f"Warning: Failed to erase node {node}: {e}") + failed_nodes.append(node) + continue + + if failed_nodes: + print(f"Warning: {len(failed_nodes)} nodes could not be erased") + + return failed_nodes diff --git a/backends/cortex_m/passes/quantized_linear_fusion_pass.py b/backends/cortex_m/passes/quantized_linear_fusion_pass.py new file mode 100644 index 00000000000..11a49beb2f4 --- /dev/null +++ b/backends/cortex_m/passes/quantized_linear_fusion_pass.py @@ -0,0 +1,646 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +from typing import Optional + +import executorch.backends.cortex_m.ops.operators # noqa +import torch +import torch.fx + +from executorch.backends.cortex_m.passes.passes_utils import ( + cleanup_nodes, + is_dequant_node, + quantize_multiplier_aot, + transfer_metadata, +) + +from executorch.backends.transforms.utils import create_mutable_buffer, get_param_tensor + +from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass +from executorch.exir import ExportedProgram +from executorch.exir.dialects._ops import ops as exir_ops +from torch.fx import Node +from torch.fx.passes.infra.pass_manager import PassResult + +logger = logging.getLogger("quantized_linear_fusion_pass") +logger.setLevel(logging.INFO) + + +class QuantizedLinearFusionPass(XNNPACKPass): + """ + Cortex-M backend pass that fuses quantized linear-like patterns. + Fuses: dequantize -> [linear/addmm/fc_ops] -> quantize + Into: cortex_m.quantized_linear.default with direct parameters. + """ + + SUPPORTED_OPS_MAPPING = { + exir_ops.edge.aten.addmm.default: exir_ops.edge.cortex_m.quantized_linear.default, + exir_ops.edge.aten.mm.default: exir_ops.edge.cortex_m.quantized_linear.default, + } + + requires_exported_program = True + + def __init__(self, exported_program: ExportedProgram): + super().__init__(exported_program) + self.nodes_to_erase = [] + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + logger.info("Starting QuantizedLinearFusionPass") + assert id(self._exported_program.graph_module.graph) == id( + graph_module.graph + ), "QuantizedLinearFusionPass requires same graph instance" + + try: + fusion_count = self._fuse_quantized_linear_patterns(graph_module) + if fusion_count > 0: + graph_module.graph.eliminate_dead_code() + graph_module.graph.lint() + graph_module.recompile() + logger.info(f"Linear fusion completed: {fusion_count} patterns fused") + return PassResult(graph_module, fusion_count > 0) + except Exception as e: + logger.error(f"Error in 
QuantizedLinearFusionPass: {e}") + raise e + + def _extract_linear_pattern(self, quantize_node: Node): + if not quantize_node.args: + return None + fc_node = quantize_node.args[0] + if not ( + fc_node.op == "call_function" + and fc_node.target in self.SUPPORTED_OPS_MAPPING + ): + return None + + op_name = str(fc_node.target).split(".")[-1] + + if "addmm" in str(fc_node.target): + input_dq_node = fc_node.args[1] + else: + input_dq_node = fc_node.args[0] + if not is_dequant_node(input_dq_node): + logger.info("input_dq_node is not a dequant node") + return None + weight_dq_node, bias_dq_node = self._extract_weight_bias_from_fc_op(fc_node) + if not weight_dq_node: + logger.info("No weight, bias dequantize node found") + return None + return ( + quantize_node, + fc_node, + input_dq_node, + weight_dq_node, + bias_dq_node, + op_name, + ) + + def _extract_weight_bias_from_fc_op(self, fc_node: Node): + """Generic extraction for FC-like operations.""" + + if "addmm" in str(fc_node.target): + if len(fc_node.args) >= 3: + bias_arg = fc_node.args[0] + weight_arg = fc_node.args[2] + weight_dq_node = self._trace_to_dequantize(weight_arg) + logger.info( + f"weight_arg: {weight_arg}, traced weight_dq_node: {weight_dq_node}" + ) + + if weight_dq_node is None: + logger.info("No weight dequantize node found ") + + # For bias, try to trace to dequantize but allow None (no-bias case) + bias_dq_node = self._trace_to_dequantize(bias_arg) + if bias_dq_node is None: + logger.info("No bias dequantize node found - likely no-bias linear") + return weight_dq_node, bias_dq_node + elif any(op in str(fc_node.target) for op in ["linear", "mm"]): + if len(fc_node.args) >= 2: + weight_arg = fc_node.args[1] + bias_arg = fc_node.args[2] if len(fc_node.args) > 2 else None + weight_dq_node = self._trace_to_dequantize(weight_arg) + bias_dq_node = self._trace_to_dequantize(bias_arg) if bias_arg else None + return weight_dq_node, bias_dq_node + return None, None + + def 
_extract_input_quantization_parameters( + self, input_dq_node: Node + ) -> Optional[dict]: + """Extract input quantization parameters from dequantize node.""" + try: + # Find the quantize operation that produces the int8 tensor + input_quantize_node = None + if hasattr(input_dq_node, "args") and input_dq_node.args: + quantize_candidate = input_dq_node.args[0] + if getattr( + quantize_candidate, "op", None + ) == "call_function" and "quantize" in str( + getattr(quantize_candidate, "target", "") + ): + input_quantize_node = quantize_candidate + + if not input_quantize_node: + logger.error("Could not find quantize node for input!") + return None + + # Extract input quantization parameters + input_scale = self._extract_param_value(input_dq_node.args[1]) + input_zero_point = int(self._extract_param_value(input_dq_node.args[2])) + input_multiplier, input_shift = quantize_multiplier_aot(input_scale) + + return { + "input_scale": input_scale, + "input_zero_point": input_zero_point, + "input_multiplier": input_multiplier, + "input_shift": input_shift, + "input_tensor": input_quantize_node, + } + except Exception as e: + logger.error(f"Failed to extract input quantization parameters: {e}") + return None + + def _extract_output_quantization_parameters( + self, quantize_node: Node + ) -> Optional[dict]: + """Extract output quantization parameters from quantize node.""" + try: + output_scale = self._extract_param_value(quantize_node.args[1]) + output_zero_point = int(self._extract_param_value(quantize_node.args[2])) + + return { + "output_scale": output_scale, + "output_zero_point": output_zero_point, + } + except Exception as e: + logger.error(f"Failed to extract output quantization parameters: {e}") + return None + + def _create_constant_parameter_buffer( + self, graph, quantize_node: Node, data: torch.Tensor, name: str + ): + """Create a parameter buffer""" + buffer_name = f"{name}_{id(quantize_node)}" + + setattr(graph.owning_module, buffer_name, data) + + # Create a 
get_attr node + with graph.inserting_before(quantize_node): + buffer_node = graph.create_node( + op="get_attr", target=buffer_name, name=buffer_name + ) + + # Set metadata + buffer_node.meta["val"] = data + + return buffer_node + + def _extract_weight_parameters(self, weight_dq_node: Node) -> Optional[dict]: + try: + weight_tensor = weight_dq_node.args[0] + weight_scale = weight_dq_node.args[1] + weight_zero_point = ( + weight_dq_node.args[2] if len(weight_dq_node.args) > 2 else None + ) + + weight_scale_data = self._extract_param_value(weight_scale) + weight_zp_data = ( + self._extract_param_value(weight_zero_point) + if weight_zero_point + else None + ) + + # Get actual tensor data to determine output features + weight_tensor_data = get_param_tensor(self._exported_program, weight_tensor) + out_features = weight_tensor_data.shape[0] + + # Handle both per-tensor and per-channel + if ( + isinstance(weight_scale_data, torch.Tensor) + and weight_scale_data.numel() > 1 + ): + # Per-channel: ensure we have the right number of elements + assert ( + weight_scale_data.numel() == out_features + ), f"Scale size {weight_scale_data.numel()} != out_features {out_features}" + + multipliers = [] + shifts = [] + for scale in weight_scale_data: + mult, shift = quantize_multiplier_aot(scale.item()) + multipliers.append(mult) + shifts.append(shift) + + weight_multiplier = torch.tensor(multipliers, dtype=torch.int32) + weight_shift = torch.tensor(shifts, dtype=torch.int32) + weight_zp_tensor = ( + weight_zp_data.int() + if weight_zp_data is not None + else torch.zeros(out_features, dtype=torch.int32) + ) + else: + # Per-tensor: create tensors with correct size for output features + scale_val = ( + weight_scale_data.item() + if isinstance(weight_scale_data, torch.Tensor) + else weight_scale_data + ) + mult, shift = quantize_multiplier_aot(scale_val) + + # Create tensors sized for out_features (not single element) + weight_multiplier = torch.full((out_features,), mult, 
dtype=torch.int32) + weight_shift = torch.full((out_features,), shift, dtype=torch.int32) + weight_zp_tensor = torch.full( + (out_features,), + weight_zp_data if weight_zp_data else 0, + dtype=torch.int32, + ) + + # Validate multipliers + for i, mult in enumerate(weight_multiplier): + if mult < (1 << 30) or mult > ((1 << 31) - 1): + logger.error( + f"Invalid multiplier[{i}]: {mult}, scale was: {weight_scale_data}" + ) + return None + + return { + "weight_tensor": weight_tensor, + "weight_zero_point_data": weight_zp_tensor, + "weight_multiplier_data": weight_multiplier, + "weight_shift_data": weight_shift, + } + except Exception as e: + logger.error(f"Failed to extract weight parameters: {e}") + return None + + def _extract_bias_parameters(self, bias_dq_node: Optional[Node]) -> Optional[dict]: + """ + Extract bias parameters for quantized linear fusion. + Handles both dequantized bias nodes and constant bias tensors. + Returns a dict with bias_tensor, bias_multiplier, and bias_shift. + """ + if not bias_dq_node: + # No bias present + return None + try: + # Case 1: Bias is a dequantize node + if hasattr(bias_dq_node, "op") and is_dequant_node(bias_dq_node): + bias_tensor = bias_dq_node.args[0] + bias_scale = bias_dq_node.args[1] + + bias_scale_data = self._extract_param_value(bias_scale) + + if ( + isinstance(bias_scale_data, torch.Tensor) + and bias_scale_data.numel() > 1 + ): + # Per-channel bias + bias_multipliers = [] + bias_shifts = [] + for scale_val in bias_scale_data.tolist(): + mult, shift = quantize_multiplier_aot(scale_val) + bias_multipliers.append(mult) + bias_shifts.append(shift) + return { + "bias_tensor": bias_tensor, + "bias_multiplier": bias_multipliers, + "bias_shift": bias_shifts, + } + else: + # Per-tensor bias + bias_scale_val = ( + bias_scale_data.item() + if isinstance(bias_scale_data, torch.Tensor) + else bias_scale_data + ) + bias_multiplier, bias_shift = quantize_multiplier_aot( + bias_scale_val + ) + return { + "bias_tensor": bias_tensor, 
+ "bias_multiplier": bias_multiplier, + "bias_shift": bias_shift, + } + else: + # Case 2: Bias is a constant tensor (not dequantized) + # This can happen if bias is not quantized in the model + bias_tensor = bias_dq_node + # Use default multiplier/shift for unquantized bias + bias_multiplier = 1 + bias_shift = 0 + return { + "bias_tensor": bias_tensor, + "bias_multiplier": bias_multiplier, + "bias_shift": bias_shift, + } + except Exception as e: + logger.error(f"Failed to extract bias parameters: {e}") + return None + + def _prepare_bias_tensors( + self, bias_params: Optional[dict], out_features: int + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Prepare bias multiplier and shift tensors for kernel call. + Returns (bias_multiplier_tensor, bias_shift_tensor) both sized [out_features]. + """ + if bias_params: + bias_multiplier = bias_params["bias_multiplier"] + bias_shift = bias_params["bias_shift"] + + # Convert to tensors of the right size + if isinstance(bias_multiplier, int): + bias_multiplier_tensor = torch.full( + [out_features], bias_multiplier, dtype=torch.int32 + ) + elif isinstance(bias_multiplier, list): + assert ( + len(bias_multiplier) == out_features + ), f"Bias multiplier size {len(bias_multiplier)} != out_features {out_features}" + bias_multiplier_tensor = torch.tensor( + bias_multiplier, dtype=torch.int32 + ) + elif isinstance(bias_multiplier, torch.Tensor): + assert ( + bias_multiplier.numel() == out_features + ), f"Bias multiplier size {bias_multiplier.numel()} != out_features {out_features}" + bias_multiplier_tensor = bias_multiplier + else: + raise TypeError( + f"Unsupported bias_multiplier type: {type(bias_multiplier)}" + ) + + if isinstance(bias_shift, int): + bias_shift_tensor = torch.full( + [out_features], bias_shift, dtype=torch.int32 + ) + elif isinstance(bias_shift, list): + assert ( + len(bias_shift) == out_features + ), f"Bias shift size {len(bias_shift)} != out_features {out_features}" + bias_shift_tensor = torch.tensor(bias_shift, 
dtype=torch.int32) + elif isinstance(bias_shift, torch.Tensor): + assert ( + bias_shift.numel() == out_features + ), f"Bias shift size {bias_shift.numel()} != out_features {out_features}" + bias_shift_tensor = bias_shift + else: + raise TypeError(f"Unsupported bias_shift type: {type(bias_shift)}") + + return bias_multiplier_tensor, bias_shift_tensor + else: + # No bias: return zero tensors of correct shape + return ( + torch.zeros([out_features], dtype=torch.int32), + torch.zeros([out_features], dtype=torch.int32), + ) + + def _extract_param_value(self, node_or_value): + """ + Extract a scalar value from a Node or a direct float/int. + """ + if isinstance(node_or_value, (float, int)): + return node_or_value + # If it's a tensor, get its scalar value if possible + if isinstance(node_or_value, torch.Tensor): + return node_or_value.item() if node_or_value.numel() == 1 else node_or_value + # If it's a Node, use get_param_tensor + if hasattr(node_or_value, "op"): + tensor = get_param_tensor(self._exported_program, node_or_value) + return tensor.item() if tensor.numel() == 1 else tensor + raise TypeError(f"Unsupported parameter type: {type(node_or_value)}") + + def _calculate_cmsis_scratch_size(self, weight_tensor) -> int: + """Calculate CMSIS-NN scratch buffer size for quantized linear operations. + + Source: CMSIS-NN arm_fully_connected_s8_get_buffer_size() returns filter_dims->w * sizeof(int32_t). + This buffer stores pre-computed kernel sums (weight row sums) - one int32_t per output feature. + Same buffer size applies to both per-tensor and per-channel quantization paths since both use + identical kernel sum optimization in the underlying matrix multiplication. 
+ """ + try: + print(f"weight_tensor type: {type(weight_tensor)}, value: {weight_tensor}") + weight_shape = get_param_tensor(self._exported_program, weight_tensor).shape + out_features = weight_shape[0] # filter_dims->w in CMSIS terms + + # CMSIS-NN implementation expects the following size + cmsis_buffer_size = out_features * 4 # sizeof(int32_t) + return cmsis_buffer_size + except Exception as e: + logger.error(f"Failed to calculate CMSIS scratch size: {e}") + return 2048 # Fallback + + def _create_scratch_buffer(self, graph, quantize_node: Node, weight_tensor): + cmsis_scratch = self._calculate_cmsis_scratch_size(weight_tensor) + + kernel_sum_header = 8 # sizeof(KernelSumHeader) + total_size = kernel_sum_header + cmsis_scratch + + logger.info( + f"Kernel sum header: {kernel_sum_header}, CMSIS buffer: {cmsis_scratch}, total: {total_size}" + ) + + return create_mutable_buffer( + self._exported_program, + name=f"b_cmsis_linear_scratch_{id(quantize_node)}", + data=torch.zeros((total_size,), dtype=torch.int8), + ) + + def _create_fused_node( + self, + graph, + quantize_node: Node, + quant_params: dict, + weight_params: dict, + bias_params: Optional[dict], + quantized_target, + ) -> Node: + """Generic fused node creation for any FC-like operation.""" + # Extract all parameters + input_tensor = quant_params["input_tensor"] + input_zp = quant_params["input_zero_point"] + input_multiplier = quant_params["input_multiplier"] + input_shift = quant_params["input_shift"] + weight_tensor = weight_params["weight_tensor"] + + weight_zp_node = self._create_constant_parameter_buffer( + graph, quantize_node, weight_params["weight_zero_point_data"], "weight_zp" + ) + weight_mult_node = self._create_constant_parameter_buffer( + graph, quantize_node, weight_params["weight_multiplier_data"], "weight_mult" + ) + weight_shift_node = self._create_constant_parameter_buffer( + graph, quantize_node, weight_params["weight_shift_data"], "weight_shift" + ) + # Get dimensions + weight_shape = 
get_param_tensor(self._exported_program, weight_tensor).shape + assert ( + len(weight_shape) == 2 + ), f"Weight tensor must be 2D, got shape {weight_shape}" + in_features = weight_shape[1] + out_features = weight_shape[0] + + # Handle bias + bias_tensor = bias_params["bias_tensor"] if bias_params else None + bias_multiplier, bias_shift = self._prepare_bias_tensors( + bias_params, out_features + ) + output_zp = quant_params["output_zero_point"] + + scratch_buffer = self._create_scratch_buffer( + graph, quantize_node, weight_tensor + ) + + with graph.inserting_after(quantize_node): + fused = graph.create_node( + "call_function", + target=quantized_target, + args=( + input_tensor, + input_zp, + input_multiplier, + input_shift, + weight_tensor, + weight_zp_node, + weight_mult_node, + weight_shift_node, + bias_tensor, + bias_multiplier, + bias_shift, + scratch_buffer, + output_zp, + in_features, + out_features, + ), + kwargs={}, + ) + + transfer_metadata(fused, quantize_node, "QuantizedLinearFusionPass") + return fused + + def _mark_for_cleanup(self, nodes): + for node in nodes: + if node is not None: + self.nodes_to_erase.append(node) + + def _cleanup_nodes(self, graph): + cleanup_nodes(self.nodes_to_erase, graph) + self.nodes_to_erase.clear() + + def _extract_linear_pattern_with_validation(self, quantize_node: Node): + pattern_info = self._extract_linear_pattern(quantize_node) + if not pattern_info: + return None + # Optionally add more validation here if needed + return pattern_info + + def _trace_to_dequantize(self, node: Optional[Node], max_depth=3) -> Optional[Node]: + """Trace through transformations to find dequantize node.""" + current_node = node + depth = 0 + while current_node and depth < max_depth: + if is_dequant_node(current_node): + return current_node + if current_node.op == "call_function" and current_node.target in { + exir_ops.edge.aten.permute_copy.default, + exir_ops.edge.aten.view_copy.default, + }: + if current_node.args: + current_node = 
current_node.args[0] + depth += 1 + continue + break + return None + + def _fuse_quantized_linear_patterns( + self, graph_module: torch.fx.GraphModule + ) -> int: + fusion_count = 0 + graph = graph_module.graph + for node in list(graph.nodes): + if not ( + node.op == "call_function" and "quantize_per_tensor" in str(node.target) + ): + continue + pattern_info = self._extract_linear_pattern_with_validation(node) + if not pattern_info: + continue + + ( + quantize_node, + fc_node, + input_dq_node, + weight_dq_node, + bias_dq_node, + op_name, + ) = pattern_info + + # Get quantized target for this FC operation + quantized_target = self.SUPPORTED_OPS_MAPPING.get(fc_node.target) + if not quantized_target: + logger.warning(f"No quantized target found for {fc_node.target}") + continue + + logger.info(f"✅ Found complete cortex_m Q/DQ + {op_name} pattern!") + + try: + input_params = self._extract_input_quantization_parameters( + input_dq_node + ) + if not input_params: + logger.error( + "Quantization parameter extraction failed for node: %s", node + ) + return None + output_params = self._extract_output_quantization_parameters( + quantize_node + ) + if not output_params: + logger.error( + "Output quantization parameter extraction failed for node: %s", + node, + ) + return None + quant_params = {**input_params, **output_params} + logger.info(f"Quantization parameters: {quant_params}") + + weight_params = self._extract_weight_parameters(weight_dq_node) + if not weight_params: + continue + bias_params = self._extract_bias_parameters(bias_dq_node) + if bias_dq_node and not bias_params: + continue + fused_node = self._create_fused_node( + graph, + quantize_node, + quant_params, + weight_params, + bias_params, + quantized_target, + ) + logger.info(f"Created fused {op_name} node: {fused_node}") + + quantize_node.replace_all_uses_with(fused_node) + self._mark_for_cleanup( + [ + quantize_node, + fc_node, + input_dq_node, + weight_dq_node, + bias_dq_node, + ] + ) + fusion_count += 1 + 
logger.info(f"✅ Successfully fused {op_name} operation {fusion_count}") + except Exception as e: + logger.error( + f"Failed to fuse {op_name} pattern for {fc_node.name}: {e}" + ) + continue + self._cleanup_nodes(graph) + return fusion_count diff --git a/backends/cortex_m/passes/quantized_op_fusion_pass.py b/backends/cortex_m/passes/quantized_op_fusion_pass.py index ca6d8b97795..eebf6866d83 100644 --- a/backends/cortex_m/passes/quantized_op_fusion_pass.py +++ b/backends/cortex_m/passes/quantized_op_fusion_pass.py @@ -36,7 +36,7 @@ class QuantizedOpFusionPass(ExportPass): # Generic operation mapping SUPPORTED_OPS_MAPPING = { exir_ops.edge.aten.add.Tensor: exir_ops.edge.cortex_m.quantized_add.default, - # Future ops to be added here: + # Future binary ops to be added here: } def __init__(self): diff --git a/backends/cortex_m/test/build_test_runner.sh b/backends/cortex_m/test/build_test_runner.sh new file mode 100755 index 00000000000..cc28ac5484a --- /dev/null +++ b/backends/cortex_m/test/build_test_runner.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# TODO: More separation from the regular arm executor runner and testing. + +set -eu + +# Always rebuild executorch in case the cortex-m kernels has been updated. 
+script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")") +et_root_dir=$(realpath "${script_dir}/../../..") +build_executorch="${et_root_dir}/backends/arm/scripts/build_executorch.sh" +${build_executorch} + +# Build executor runner with all portable ops selected and semi hosting +build_dir="${et_root_dir}/arm_test" +build_executor_runner="${et_root_dir}/backends/arm/scripts/build_executor_runner.sh" +build_root_test_dir="${et_root_dir}/arm_test/arm_semihosting_executor_runner_corstone-300" + +${build_executor_runner} --pte=semihosting --target=ethos-u55-128 --output="${build_root_test_dir}" diff --git a/backends/cortex_m/test/ops/__init__.py b/backends/cortex_m/test/ops/__init__.py new file mode 100644 index 00000000000..c8d1c683da3 --- /dev/null +++ b/backends/cortex_m/test/ops/__init__.py @@ -0,0 +1,4 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/backends/cortex_m/test/ops/test_add.py b/backends/cortex_m/test/ops/test_add.py new file mode 100644 index 00000000000..b7b0ffcbfbc --- /dev/null +++ b/backends/cortex_m/test/ops/test_add.py @@ -0,0 +1,179 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import torch +from executorch.backends.arm.test.common import parametrize +from executorch.backends.cortex_m.test.tester import ( + CortexMTester, + McuTestCase, + ramp_tensor, +) +from executorch.backends.test.suite.operators.test_add import Model, ModelAlpha + + +class CortexMSelfAdd(torch.nn.Module): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + def forward(self, x): + return x + x + + +class CortexMScalarAdd(Model): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +class CortexMTensorAdd(Model): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 3, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2, + 
"executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +class CortexMAlphaAdd(ModelAlpha): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 3, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +test_cases = { + "self_scalar": McuTestCase( + CortexMSelfAdd(), + (10.0,), + ), + "self_rank_1": McuTestCase( + CortexMSelfAdd(), + (torch.linspace(-5, 5, 10),), + ), + "self_rank_2_pos": McuTestCase( + CortexMSelfAdd(), + (ramp_tensor(0, 1000, (10, 1)),), + ), + "self_rank_3_neg": McuTestCase( + CortexMSelfAdd(), + (ramp_tensor(-100, 0, (2, 2, 2)),), + ), + "self_rank_4_small": McuTestCase( + CortexMSelfAdd(), + (ramp_tensor(-0.1, 0.1, (2, 2, 2, 2)),), + ), + "self_rank_5": McuTestCase( + CortexMSelfAdd(), + (ramp_tensor(-5, 5, (2, 2, 2, 2, 2)),), + ), + "scalar_scalar": McuTestCase( + CortexMScalarAdd(), + (-0.5, 1.0), + ), + "tensor_scalar": McuTestCase( + CortexMScalarAdd(), + (torch.ones(2, 2), 1.0), + ), + "scalar_tensor": McuTestCase( + CortexMScalarAdd(), + (1000.0, torch.ones(2, 2)), + ), + "broadcast_1": McuTestCase( + CortexMTensorAdd(), + (torch.ones(1), torch.ones(2, 2, 2, 2)), + ), + "broadcast_2": McuTestCase( + CortexMTensorAdd(), + (torch.ones((2, 1, 1, 1)), torch.ones(1)), + ), + "broadcast_3": McuTestCase( + CortexMTensorAdd(), + ( + ramp_tensor(-2, 2, (2, 1, 2, 1)), + ramp_tensor(-5, 5, (1, 2, 1, 2)), + ), + ), + "alpha": McuTestCase( + CortexMAlphaAdd(0.5), + ( + ramp_tensor(-10, 10, (4, 5)), + ramp_tensor(-20, 20, (4, 5)), + ), + ), +} + + +dialect_xfails = { + "self_scalar": 
("'float' object has no attribute 'fake_mode'", AttributeError), + "self_rank_1": ("Output 0 does not match reference output", AssertionError), + "self_rank_2_pos": ("Output 0 does not match reference output", AssertionError), + "self_rank_3_neg": ("Output 0 does not match reference output", AssertionError), + "self_rank_4_small": ("Output 0 does not match reference output", AssertionError), + "self_rank_5": ("Output 0 does not match reference output", AssertionError), + "scalar_scalar": ("'float' object has no attribute 'fake_mode'", AttributeError), + "broadcast_3": ("Output 0 does not match reference output", AssertionError), + "alpha": ("Expecting kwargs for aten op IR to be empty", AssertionError), +} + + +@parametrize("test_case", test_cases, xfails=dialect_xfails) +def test_dialect_add(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_dialect( + test_case.model.ops_before_transforms, test_case.model.ops_after_transforms + ) + + +implementation_xfails = { + "self_scalar": ("'float' object has no attribute 'fake_mode'", AttributeError), + "self_rank_1": ("Output 0 does not match reference output", AssertionError), + "self_rank_2_pos": ("Output 0 does not match reference output", AssertionError), + "self_rank_3_neg": ("Output 0 does not match reference output", AssertionError), + "self_rank_4_small": ("Output 0 does not match reference output", AssertionError), + "self_rank_5": ("Output 0 does not match reference output", AssertionError), + "scalar_scalar": ("'float' object has no attribute 'fake_mode'", AttributeError), + "tensor_scalar": ("Output 0 does not match reference output", AssertionError), + "scalar_tensor": ("Output 0 does not match reference output", AssertionError), + "broadcast_1": ("Output 0 does not match reference output", AssertionError), + "broadcast_2": ("Output 0 does not match reference output", AssertionError), + "broadcast_3": ("Output 0 does not match reference output", AssertionError), + 
"alpha": ("Expecting kwargs for aten op IR to be empty", AssertionError), +} + + +@parametrize("test_case", test_cases, xfails=implementation_xfails) +def test_implementation_add(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_implementation() diff --git a/backends/cortex_m/test/ops/test_linear.py b/backends/cortex_m/test/ops/test_linear.py new file mode 100644 index 00000000000..a1275352fcf --- /dev/null +++ b/backends/cortex_m/test/ops/test_linear.py @@ -0,0 +1,211 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import torch +from executorch.backends.arm.test.common import parametrize +from executorch.backends.cortex_m.test.tester import ( + CortexMTester, + McuTestCase, + ramp_tensor, +) + + +class CortexMMm(torch.nn.Module): + def forward(self, x, y): + return torch.mm(x, y) + + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_mm_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +class CortexMBmm(torch.nn.Module): + def forward(self, x, y): + return torch.bmm(x, y) + + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_bmm_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + 
"executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +class CortexMAddmm(torch.nn.Module): + def forward(self, x, y, z, alpha=None, beta=None): + return torch.addmm(beta, x, alpha, y, z) + + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_addmm_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +class CortexMAt(CortexMMm): + def forward(self, x, y): + return x @ y + + +class CortexMMatmul(CortexMMm): + def forward(self, x, y): + return torch.matmul(x, y) + + +class CortexMLinear(CortexMMatmul): + def __init__(self, *args, **kwargs): + super().__init__() + self.linear = torch.nn.Linear(*args, bias=False) + + def forward(self, x): + return self.linear(x) + + +class CortexMLinearBias(CortexMAddmm): + def __init__(self, *args, **kwargs): + super().__init__() + self.linear = torch.nn.Linear(*args, bias=True) + + def forward(self, x): + return self.linear(x) + + +test_cases = { + "mm": McuTestCase( + model=CortexMMm(), + example_inputs=( + ramp_tensor(0, 10, (1, 16)), + ramp_tensor(0, 10, (16, 16)), + ), + ), + "bmm": McuTestCase( + model=CortexMBmm(), + example_inputs=( + ramp_tensor(0, 10, (1, 16, 16)), + ramp_tensor(0, 10, (1, 16, 16)), + ), + ), + "addmm": McuTestCase( + model=CortexMAddmm(), + example_inputs=( + ramp_tensor(0, 10, (1, 16)), + ramp_tensor(0, 10, (16, 16)), + ramp_tensor(0, 10, (16, 16)), + 2, + 4, + ), + ), + 
"addmm_scalars": McuTestCase( + model=CortexMAddmm(), + example_inputs=( + ramp_tensor(0, 10, (1, 16)), + ramp_tensor(0, 10, (16, 16)), + ramp_tensor(0, 10, (16, 16)), + ), + ), + "@-operator": McuTestCase( + model=CortexMAt(), + example_inputs=( + ramp_tensor(0, 10, (1, 16)), + ramp_tensor(0, 10, (16, 16)), + ), + ), + "matmul": McuTestCase( + model=CortexMMatmul(), + example_inputs=( + ramp_tensor(0, 10, (1, 16)), + ramp_tensor(0, 10, (16, 16)), + ), + ), + "linear_rank1": McuTestCase( + model=CortexMLinear(2, 3), + example_inputs=(ramp_tensor(-1, 1, (2,)),), + ), + "linear_rank2_pos": McuTestCase( + model=CortexMLinear(8, 3), + example_inputs=(ramp_tensor(0, 10, (2, 8)),), + ), + "linear_rank3_neg": McuTestCase( + model=CortexMLinear(5, 3), + example_inputs=(ramp_tensor(-40, 0, (4, 2, 5)),), + ), + "linear_rank4": McuTestCase( + model=CortexMLinear(16, 32), + example_inputs=(ramp_tensor(-100, 100, (2, 1, 2, 16)),), + ), + "linear_rank5": McuTestCase( + model=CortexMLinear(4, 3), + example_inputs=(ramp_tensor(-2, 2, (5, 2, 1, 2, 4)),), + ), + "linear_bias": McuTestCase( + model=CortexMLinearBias(61, 37), + example_inputs=(ramp_tensor(0, 10, (8, 61)),), + ), +} + +dialect_xfails = { + "mm": ("torch.mm ops are currently not quantized", RuntimeError), + "bmm": ("torch.bmm ops are currently not quantized", RuntimeError), + "addmm": ("torch.addmm ops are currently not quantized", RuntimeError), + "addmm_scalars": ("torch.addmm ops are currently not quantized", RuntimeError), + "matmul": ("torch.matmul ops are currently not quantized", RuntimeError), + "@-operator": ("@ ops are currently not quantized", RuntimeError), + "linear_rank1": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_rank2_pos": ("name 'int32' is not defined", NameError), + "linear_rank3_neg": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_rank4": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_rank5": ("Only rank 2 linear ops are 
fused currently", RuntimeError), + "linear_bias": ("name 'int32' is not defined", NameError), +} + + +@parametrize("test_case", test_cases, dialect_xfails) +def test_dialect_linear(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_dialect( + test_case.model.ops_before_transforms, test_case.model.ops_after_transforms + ) + + +implementation_xfails = { + "mm": ("torch.mm ops are currently not quantized", RuntimeError), + "bmm": ("torch.bmm ops are currently not quantized", RuntimeError), + "addmm": ("torch.addmm ops are currently not quantized", RuntimeError), + "addmm_scalars": ("torch.addmm ops are currently not quantized", RuntimeError), + "matmul": ("torch.matmul ops are currently not quantized", RuntimeError), + "@-operator": ("@ ops are currently not quantized", RuntimeError), + "linear_rank1": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_rank2_pos": ("Output 0 does not match reference output.", AssertionError), + "linear_rank3_neg": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_rank4": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_rank5": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_bias": ("Output 0 does not match reference output.", AssertionError), +} + + +@parametrize("test_case", test_cases, implementation_xfails) +def test_implementation_linear(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_implementation() diff --git a/backends/cortex_m/test/ops/test_mul.py b/backends/cortex_m/test/ops/test_mul.py new file mode 100644 index 00000000000..a2f13760bf0 --- /dev/null +++ b/backends/cortex_m/test/ops/test_mul.py @@ -0,0 +1,131 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import pytest +import torch +from executorch.backends.arm.test.common import parametrize +from executorch.backends.cortex_m.test.tester import ( + CortexMTester, + McuTestCase, + ramp_tensor, +) +from executorch.backends.test.suite.operators.test_mul import Model + + +class CortexMSelfMul(torch.nn.Module): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_mul_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + def forward(self, x): + return x * x + + +class CortexMScalarMul(Model): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_mul_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +class CortexMTensorMul(Model): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 3, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_mul_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2, + 
"executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +test_cases = { + "self_scalar": McuTestCase( + CortexMSelfMul(), + (10.0,), + ), + "self_rank_1": McuTestCase( + CortexMSelfMul(), + (ramp_tensor(-5, 5, (10,)),), + ), + "self_rank_2_pos": McuTestCase( + CortexMSelfMul(), + (ramp_tensor(0, 1000, (10, 1)),), + ), + "self_rank_3_neg": McuTestCase( + CortexMSelfMul(), + (ramp_tensor(-100, 0, (2, 2, 2)),), + ), + "self_rank_4_small": McuTestCase( + CortexMSelfMul(), + (ramp_tensor(-0.1, 0.1, (2, 2, 2, 2)),), + ), + "self_rank_5": McuTestCase( + CortexMSelfMul(), + (ramp_tensor(-5, 5, (2, 2, 2, 2, 2)),), + ), + "scalar_scalar": McuTestCase( + CortexMScalarMul(), + (-0.5, 1.0), + ), + "tensor_scalar": McuTestCase( + CortexMScalarMul(), + (torch.ones(2, 2), 1.0), + ), + "scalar_tensor": McuTestCase( + CortexMScalarMul(), + (1000.0, torch.ones(2, 2)), + ), + "broadcast_1": McuTestCase( + CortexMTensorMul(), + (torch.ones(1), torch.ones(2, 2, 2, 2)), + ), + "broadcast_2": McuTestCase( + CortexMTensorMul(), + (torch.ones((2, 1, 1, 1)), torch.ones(1)), + ), + "broadcast_3": McuTestCase( + CortexMTensorMul(), + ( + ramp_tensor(-2, 2, (2, 1, 2, 1)), + ramp_tensor(-5, 5, (1, 2, 1, 2)), + ), + ), +} + + +@pytest.mark.skip(reason="Not implemented yet") +@parametrize("test_case", test_cases) +def test_dialect_mul(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_dialect( + test_case.model.ops_before_transforms, test_case.model.ops_after_transforms + ) + + +@pytest.mark.skip(reason="Not implemented yet") +@parametrize("test_case", test_cases) +def test_implementation_mul(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_implementation() diff --git a/backends/cortex_m/test/test_quantize_op_fusion_pass.py b/backends/cortex_m/test/test_quantize_op_fusion_pass.py index 1595b0cfbc3..20f2ecfe656 100644 --- a/backends/cortex_m/test/test_quantize_op_fusion_pass.py 
+++ b/backends/cortex_m/test/test_quantize_op_fusion_pass.py @@ -313,7 +313,7 @@ def forward(self, x, y): # Apply passes transformed_program = self._apply_passes(edge_program) - # Generate ExecutorTorch program + # Generate ExecuTorch program executorch_program = transformed_program.to_executorch() # Verify the program contains the expected fused operator diff --git a/backends/cortex_m/test/tester.py b/backends/cortex_m/test/tester.py new file mode 100644 index 00000000000..c492d3c8443 --- /dev/null +++ b/backends/cortex_m/test/tester.py @@ -0,0 +1,114 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +from dataclasses import dataclass +from typing import Any + +import torch +from executorch.backends.arm.test.common import get_u55_compile_spec +from executorch.backends.arm.test.tester.arm_tester import Serialize +from executorch.backends.cortex_m.passes.quantized_linear_fusion_pass import ( + QuantizedLinearFusionPass, +) +from executorch.backends.cortex_m.passes.quantized_op_fusion_pass import ( + QuantizedOpFusionPass, +) + +from executorch.backends.cortex_m.passes.replace_quant_nodes_pass import ( + ReplaceQuantNodesPass, +) +from executorch.backends.test.harness import Tester as TesterBase +from executorch.backends.test.harness.stages import ( + Export, + Quantize, + RunPasses, + StageType, + ToEdgeTransformAndLower, + ToExecutorch, +) +from executorch.backends.xnnpack._passes import XNNPACKPassManager + +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( + get_symmetric_quantization_config, + XNNPACKQuantizer, +) + + +class CortexMQuantize(Quantize): + def __init__(self): + quantizer = XNNPACKQuantizer() + config = get_symmetric_quantization_config() + super().__init__(quantizer, config) + + +class CortexMRunPasses(RunPasses): + def __init__(self): + super().__init__( + XNNPACKPassManager, + 
pass_list=[ + ReplaceQuantNodesPass, + QuantizedLinearFusionPass, + QuantizedOpFusionPass, + ], + ) + + +class CortexMSerialize(Serialize): + def __init__(self): + compile_spec = get_u55_compile_spec() + super().__init__(compile_spec, 1024) + + +cortex_m_stage_classes = { + StageType.EXPORT: Export, + StageType.QUANTIZE: CortexMQuantize, + StageType.RUN_PASSES: CortexMRunPasses, + StageType.SERIALIZE: Serialize, + StageType.TO_EDGE_TRANSFORM_AND_LOWER: ToEdgeTransformAndLower, + StageType.TO_EXECUTORCH: ToExecutorch, + StageType.SERIALIZE: CortexMSerialize, +} + + +class CortexMTester(TesterBase): + def __init__(self, module, example_inputs): + super().__init__(module, example_inputs, cortex_m_stage_classes) + + def test_dialect(self, ops_before_transforms, ops_after_transforms, qtol=0): + """ + Test the python dialect op implementation. + """ + self.quantize() + self.export() + self.to_edge_transform_and_lower() + self.check_count(ops_before_transforms) + self.run_passes() + self.check_count(ops_after_transforms) + self.run_method_and_compare_outputs(inputs=self.example_inputs, qtol=qtol) + + def test_implementation(self, qtol=0): + """ + Test the optimized op implementation in simulation + """ + self.quantize() + self.export() + self.to_edge_transform_and_lower() + self.run_passes() + self.to_executorch() + self.serialize() + self.run_method_and_compare_outputs(inputs=self.example_inputs, qtol=qtol) + + +@dataclass +class McuTestCase: + model: torch.nn.Module + example_inputs: tuple[Any] + + +def ramp_tensor(start: int, end: int, shape: tuple[int]) -> torch.Tensor: + return torch.linspace(start, end, steps=torch.prod(torch.tensor(shape))).reshape( + shape + ) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt new file mode 100644 index 00000000000..221291442ec --- /dev/null +++ b/backends/cuda/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Build AOTI CUDA backend for runtime. +# +# ### Editing this file ### +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# +cmake_minimum_required(VERSION 3.29) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +find_package(CUDAToolkit REQUIRED) + +# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +find_package_torch() + +# CUDA-specific AOTI functionality +set(_aoti_cuda_sources + runtime/cuda_backend.cpp runtime/shims/memory.cpp + runtime/shims/tensor_attribute.cpp runtime/guard.cpp + runtime/shims/cuda_guard.cpp +) +add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +target_include_directories( + aoti_cuda + PUBLIC ${CUDAToolkit_INCLUDE_DIRS} + $ + $ + # PyTorch AOTI headers from ExecutorTorch's torch detection + ${TORCH_INCLUDE_DIRS} +) +target_compile_options( + aoti_cuda PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> +) +# Ensure symbols are exported properly +target_link_options( + aoti_cuda PUBLIC $<$>:-Wl,--export-dynamic> +) + +# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries +target_link_libraries( + aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS} +) +# If you need other CUDA libraries, link them similarly: +# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...) 
+executorch_target_link_options_shared_lib(aoti_cuda) + +if(BUILD_TESTING) + # Add runtime + add_executable(voxtral_runner tests/voxtral_runner.cpp) + target_link_libraries( + voxtral_runner PUBLIC aoti_cuda extension_module_static + extension_flat_tensor portable_ops_lib + ) +endif() + +install( + TARGETS aoti_cuda + EXPORT ExecuTorchTargets + DESTINATION lib +) diff --git a/backends/cuda/TARGETS b/backends/cuda/TARGETS new file mode 100644 index 00000000000..fe57f7f1b63 --- /dev/null +++ b/backends/cuda/TARGETS @@ -0,0 +1,35 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.python_library( + name = "cuda_backend", + srcs = [ + "cuda_backend.py", + "replace_slice_copy_with_slice.py", + ], + visibility = [ + "//executorch/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir/_serialize:lib", + "//executorch/exir/backend:backend_details", + "//executorch/exir/backend:compile_spec_schema", + ], +) + +runtime.python_library( + name = "cuda_partitioner", + srcs = [ + "cuda_partitioner.py", + ], + visibility = [ + "//executorch/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir/backend:partitioner", + "//executorch/exir/backend:utils", + ], +) diff --git a/backends/cuda/__init__.py b/backends/cuda/__init__.py new file mode 100644 index 00000000000..2e41cd717f6 --- /dev/null +++ b/backends/cuda/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py new file mode 100644 index 00000000000..ef98de29f23 --- /dev/null +++ b/backends/cuda/cuda_backend.py @@ -0,0 +1,207 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import contextlib +import os +import typing +from enum import Enum + +from typing import Any, Dict, final, List, Optional, Set + +import torch +from executorch.backends.cuda.replace_slice_copy_with_slice import ( + ReplaceSliceCopyWithSlicePass, +) +from executorch.exir._serialize._named_data_store import NamedDataStore +from executorch.exir._warnings import experimental +from executorch.exir.backend.backend_details import ( + BackendDetails, + ExportedProgram, + PreprocessResult, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec +from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu +from torch.export.passes import move_to_device_pass +from torch.nn.attention import SDPBackend + +# exist fallback operators in et namespace; +supported_fallback_kernels: Dict[str, Any] = {} + +# required fallback kernels but not supported +missing_fallback_kernels: Set[str] = set() + + +class COMPILE_SPEC_KEYS(Enum): + METHOD_NAME = "method_name" + + +# context manager for non-fallback guarantee +# it will raise exception when generating fallback kernels during aoti compile +@contextlib.contextmanager +def collect_unsupported_fallback_kernels(): + original_generate_c_shim_extern_kernel_call = ( + CppWrapperCpu.generate_c_shim_extern_kernel_call + ) + original_generate_fallback_kernel_with_runtime_lookup_aot = ( + CppWrapperCpu.generate_fallback_kernel_with_runtime_lookup_aot + ) + + def generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels( + self, + kernel: str, + args: list[str], + device: str, + *, + debug_args: Optional[list[str]] = None, + ): + if kernel not in supported_fallback_kernels: + missing_fallback_kernels.add(kernel) + + original_generate_c_shim_extern_kernel_call( + self, kernel, args, device, debug_args=debug_args + ) + + def 
generate_fallback_kernel_with_runtime_lookup_aot_and_collect_unsupported_kernels( + self, + op_overload, + raw_args, + output_args, + raw_outputs, + ): + # Extract kernel name for collection + kernel_name = getattr(op_overload, "_name", str(op_overload)) + if kernel_name not in supported_fallback_kernels: + missing_fallback_kernels.add(kernel_name) + + original_generate_fallback_kernel_with_runtime_lookup_aot( + self, op_overload, raw_args, output_args, raw_outputs + ) + + CppWrapperCpu.generate_c_shim_extern_kernel_call = ( + generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels + ) + CppWrapperCpu.generate_fallback_kernel_with_runtime_lookup_aot = ( + generate_fallback_kernel_with_runtime_lookup_aot_and_collect_unsupported_kernels + ) + try: + yield + finally: + CppWrapperCpu.generate_c_shim_extern_kernel_call = ( + original_generate_c_shim_extern_kernel_call + ) + CppWrapperCpu.generate_fallback_kernel_with_runtime_lookup_aot = ( + original_generate_fallback_kernel_with_runtime_lookup_aot + ) + + +@final +@experimental( + "This API and all of cuda backend related functionality are experimental." +) +class CudaBackend(BackendDetails): + """ + CudaBackend is a backend that compiles a model to run on CUDA devices. It uses the AOTInductor compiler to generate + optimized CUDA kernels for the model's operators with libtorch-free. The compiled model can be executed on CUDA devices + using the Executorch runtime. 
+ """ + + @staticmethod + def preprocess( + edge_program: ExportedProgram, + compile_specs: List[CompileSpec], + ) -> PreprocessResult: + # Move the edge_program from CPU to CUDA for aoti compile + cuda_edge_program = move_to_device_pass(edge_program, "cuda") + + # replace slice_copy with slice + ReplaceSliceCopyWithSlicePass()(cuda_edge_program.graph_module) + + edge_program_module = cuda_edge_program.module() + + # Grab all input placeholders from the graph + user_input_names = cuda_edge_program.graph_signature.user_inputs + user_input_placeholders = [] + for node in cuda_edge_program.graph.nodes: + if node.op == "placeholder" and node.name in user_input_names: + user_input_placeholders.append(node.meta["val"]) + + options: dict[str, typing.Any] = { + # Better model precision + "emulate_precision_casts": True, + # Embed CUDA kernel binaries directly into the compiled shared object + "aot_inductor.embed_kernel_binary": True, + # Do not link against the full PyTorch/libtorch library + "aot_inductor.link_libtorch": False, + # Package model constants and other generated files directly in the shared object (.so) file + "aot_inductor.package_constants_in_so": True, + # Enable maximum automatic tuning for optimal performance + "max_autotune": True, + # Use TRITON for GEMM (General Matrix Multiply) operations tuning only to avoid using operators in libtorch + "max_autotune_gemm_backends": "TRITON", + # Use TRITON backend for convolution operations tuning only to avoid using operators in libtorch + "max_autotune_conv_backends": "TRITON", + } + + with collect_unsupported_fallback_kernels(), torch.nn.attention.sdpa_kernel( + [ + SDPBackend.MATH # pyre-ignore[16]: Module `torch.nn.attention` has no attribute `SDPBackend`. 
+ ] + ), torch.no_grad(): + # torch._logging.set_logs(post_grad_graphs=True) + so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options) # type: ignore[arg-type] + if len(missing_fallback_kernels) > 0: + formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels)) + raise RuntimeError( + f"Missing fallback kernels ({len(missing_fallback_kernels)} total):\n - {formatted_kernels}\n" + "Please add them to the AOTI backend." + ) + + # pyre-ignorep[6]: Incompatible parameter type + with open(so_path, "rb") as f: + so_data = f.read() + + named_data_store = NamedDataStore() + method_name = CudaBackend.method_name_from_compile_specs(compile_specs) + named_data_store.add_named_data( + method_name + "_so_blob", so_data, 1, "aoti_cuda_blob" + ) + + # Clean up the generated so file; it has been packaged into the NamdeDataStore + # pyre-ignorep[6]: Incompatible parameter type + os.remove(so_path) + + return PreprocessResult( + processed_bytes=b"", + debug_handle_map={}, + data_store_output=named_data_store.get_named_data_store_output(), + ) + + @staticmethod + def generate_method_name_compile_spec( + method_name: str, + ) -> CompileSpec: + """ + Returns the compile spec representing the model compute precision, for additional details + please refer to the documentation for ``coremltools.precision``. + """ + return CompileSpec( + COMPILE_SPEC_KEYS.METHOD_NAME.value, + method_name.encode("utf-8"), + ) + + @staticmethod + def method_name_from_compile_specs( + compile_specs: List[CompileSpec], + ) -> str: + """ + Returns the method name from the compile specs. 
+ """ + for spec in compile_specs: + if spec.key == COMPILE_SPEC_KEYS.METHOD_NAME.value: + return spec.value.decode("utf-8") + raise RuntimeError( + f"Could not find method name in compile specs: {compile_specs}" + ) diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py new file mode 100644 index 00000000000..64df7b7dcb2 --- /dev/null +++ b/backends/cuda/cuda_partitioner.py @@ -0,0 +1,77 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Callable, Dict, final, List, Optional, Tuple + +import torch +from executorch.backends.cuda.cuda_backend import CudaBackend # usort: skip +from executorch.exir._warnings import experimental +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.backend.partitioner import ( + DelegationSpec, + Partitioner, + PartitionResult, +) +from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer +from torch.export.exported_program import ExportedProgram + + +@final +@experimental( + "This API and all of cuda backend related functionality are experimental." +) +class CudaPartitioner(Partitioner): + """ + CUDA partitioner for AOTInductor backend integration. + + This partitioner creates a single partition containing all operators from the input graph. + It skips core ATen decomposition, allowing the CUDA backend to handle decomposition using + AOTInductor's CUDA-specific decomposition table. + + Only operators that cannot be handled by the aoti-cuda library will be excluded from + the partition and fall back to ExecuTorch's default or custom handling. 
+ """
+
+ def __init__(self, compile_spec: List[CompileSpec]) -> None:
+ self.delegation_spec = DelegationSpec(CudaBackend.__name__, compile_spec)
+
+ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
+ """
+ Fully delegate the graph to AOTInductor by tagging all nodes as a single partition.
+ """
+
+ partition_tags: Dict[str, DelegationSpec] = {}
+ tag = "tag0"
+
+ for node in exported_program.graph.nodes:
+ if node.op != "call_function":
+ continue
+ node.meta["delegation_tag"] = tag
+
+ partition_tags[tag] = self.delegation_spec
+
+ tag_constant_data(exported_program)
+ tag_mutated_buffer(exported_program)
+
+ return PartitionResult(
+ tagged_exported_program=exported_program, partition_tags=partition_tags
+ )
+
+ def ops_to_not_decompose(
+ self, ep: ExportedProgram
+ ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+ """
+ Return a list of operations that should not be decomposed and let the AOT compiler handle them.
+ Currently we skip ATen decomposition for all ops, and let the cuda backend handle them.
+ """
+ do_not_decompose = set()
+
+ for node in ep.graph.nodes:
+ if node.op == "call_function" and isinstance(
+ node.target, torch._ops.OpOverload
+ ):
+ do_not_decompose.add(node.target)
+ return list(do_not_decompose), None
diff --git a/backends/cuda/replace_slice_copy_with_slice.py b/backends/cuda/replace_slice_copy_with_slice.py
new file mode 100644
index 00000000000..4f16759af35
--- /dev/null
+++ b/backends/cuda/replace_slice_copy_with_slice.py
@@ -0,0 +1,118 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+ +# pyre-strict + +from typing import Dict, Iterable, Tuple + +import torch +from executorch.exir.dialects._ops import ops +from executorch.exir.dialects.edge._ops import EdgeOpOverload +from executorch.exir.pass_base import ExportPass, PassResult +from torch import fx + + +_SLICE_COPY_TARGETS: Tuple[torch._ops.OpOverload | EdgeOpOverload] = ( + torch.ops.aten.slice_copy.Tensor, + ops.edge.aten.slice_copy.Tensor, +) + +_SLICE_TARGETS: Dict[ + torch._ops.OpOverload | EdgeOpOverload, torch._ops.OpOverload | EdgeOpOverload +] = { + torch.ops.aten.slice_copy.Tensor: torch.ops.aten.slice.Tensor, + ops.edge.aten.slice_copy.Tensor: ops.edge.aten.slice.Tensor, +} + + +class ReplaceSliceCopyWithSlicePass(ExportPass): + """Replace non-mutated ``slice_copy`` results with ``slice`` views.""" + + def call(self, graph_module: fx.GraphModule) -> PassResult: + graph_changed = False + + for node in graph_module.graph.nodes: + if node.op != "call_function" or node.target not in _SLICE_COPY_TARGETS: + continue + + if self._has_blocking_user(node, node.users.keys()): + continue + + node.target = _SLICE_TARGETS[node.target] + graph_changed = True + + if graph_changed: + graph_module.graph.lint() + graph_module.recompile() + + return PassResult(graph_module, graph_changed) + + def _has_blocking_user(self, node: fx.Node, users: Iterable[fx.Node]) -> bool: + for user in users: + if self._is_mutating_user(node, user) or self._is_view_user(node, user): + return True + return False + + def _is_mutating_user(self, node: fx.Node, user: fx.Node) -> bool: + if user.op == "call_method": + # Treat in-place tensor methods conservatively as mutations only when the + # method name ends with ``_`` which is the PyTorch convention for mutation. 
+ return isinstance(user.target, str) and user.target.endswith("_") + + if user.op != "call_function": + return False + + target = user.target + if not hasattr(target, "_schema"): + return False + + schema = target._schema # pyre-ignore[16] + # Positional arguments + for index, arg in enumerate(user.args): + if arg is node and self._argument_mutates(schema, index): + return True + + # Keyword arguments + for name, arg in user.kwargs.items(): + if arg is node and self._argument_mutates(schema, name): + return True + + return False + + def _is_view_user(self, node: fx.Node, user: fx.Node) -> bool: + if user.op == "call_method": + # Treat tensor methods conservatively and assume they may be view-producing. + return True + + if user.op != "call_function": + return False + + target = user.target + if getattr(target, "is_view", False): + for arg in user.args: + if arg is node: + return True + for arg in user.kwargs.values(): + if arg is node: + return True + + return False + + def _argument_mutates( + self, schema: torch._C.FunctionSchema, key: int | str + ) -> bool: + arguments = schema.arguments + if isinstance(key, int): + if key >= len(arguments): + return False + argument = arguments[key] + else: + argument = next((arg for arg in arguments if arg.name == key), None) + if argument is None: + return False + + alias_info = argument.alias_info + return bool(alias_info and alias_info.is_write) diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS new file mode 100644 index 00000000000..54412269287 --- /dev/null +++ b/backends/cuda/runtime/TARGETS @@ -0,0 +1,58 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.cxx_library( + name = "runtime_shims", + srcs = [ + "guard.cpp", + "shims/cuda_guard.cpp", + "shims/memory.cpp", + "shims/tensor_attribute.cpp", + ], + headers = [ + "guard.h", + "shims/cuda_guard.h", + "shims/memory.h", + "shims/tensor_attribute.h", + "utils.h", + ], + # @lint-ignore 
BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + # Constructor needed for backend registration. + compiler_flags = ["-Wno-global-constructors"], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "//executorch/backends/aoti:common_shims", + "//executorch/extension/tensor:tensor", + "//executorch/runtime/core:core", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/platform:platform", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], +) + +runtime.cxx_library( + name = "cuda_backend", + srcs = [ + "cuda_backend.cpp", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + # Constructor needed for backend registration. + compiler_flags = ["-Wno-global-constructors"], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + ":runtime_shims", + "//executorch/backends/aoti:aoti_common", + "//executorch/runtime/backend:interface", + "//executorch/runtime/core/exec_aten/util:tensor_util", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], +) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp new file mode 100644 index 00000000000..805c54ff55c --- /dev/null +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -0,0 +1,351 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// Include our shim layer headers +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +#define LOAD_SYMBOL(name, handle) \ + do { \ + name = reinterpret_cast(dlsym(handle, #name)); \ + ET_CHECK_OR_RETURN_ERROR( \ + name != nullptr, AccessFailed, "Failed to load " #name); \ + } while (0) + +using namespace std; +using namespace aoti; + +using executorch::aten::ScalarType; +using executorch::runtime::ArrayRef; +using executorch::runtime::Backend; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::BackendInitContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::DelegateHandle; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::etensor::Tensor; + +class ET_EXPERIMENTAL CudaBackend final + : public ::executorch::runtime::BackendInterface { + private: + Error register_shared_library_functions(void* so_handle) const { + LOAD_SYMBOL(AOTInductorModelContainerCreateWithDevice, so_handle); + LOAD_SYMBOL(AOTInductorModelContainerDelete, so_handle); + LOAD_SYMBOL(AOTInductorModelContainerGetNumInputs, so_handle); + LOAD_SYMBOL(AOTInductorModelContainerGetNumOutputs, so_handle); + LOAD_SYMBOL(AOTInductorModelContainerRun, so_handle); + + return Error::Ok; + } + + public: + bool is_available() const override { + return 1; + } + + // Once per loaded binary blob + Result init( + BackendInitContext& context, + FreeableBuffer* processed, // This will be a empty buffer + ArrayRef compile_specs // This will be my empty list + ) const override { + std::string method_name; + for (const CompileSpec& spec : compile_specs) { + if 
(std::strcmp(spec.key, "method_name") == 0) { + method_name.assign( + static_cast(spec.value.buffer), + spec.value.nbytes); // no nullptr guarantee, so pass size + break; + } + } + + std::string so_blob_key = + method_name.empty() ? "so_blob" : method_name + "_so_blob"; + + const NamedDataMap* named_data_map = context.get_named_data_map(); + auto aoti_cuda_buffer = named_data_map->get_data(so_blob_key.c_str()); + ET_CHECK_OR_RETURN_ERROR( + aoti_cuda_buffer.ok(), + Internal, + "Failed to get data for key %s: 0x%x", + so_blob_key.c_str(), + static_cast(aoti_cuda_buffer.error())); + + // Generate dynamic temporary file path + filesystem::path temp_dir = filesystem::temp_directory_path(); + filesystem::path so_path = + temp_dir / (so_blob_key + to_string(getpid()) + ".so"); + + // Create a temporary file + ofstream outfile(so_path.c_str(), ios::binary); + + // Write the ELF buffer to the temporary file + ET_LOG( + Info, + "Writing %zu bytes to %s", + aoti_cuda_buffer->size(), + so_path.c_str()); + + outfile.write( + static_cast(aoti_cuda_buffer->data()), + aoti_cuda_buffer->size()); + + ET_CHECK_OR_RETURN_ERROR( + outfile, AccessFailed, "Failed to write to file %s", so_path.c_str()); + + // Finish writing the file to disk + outfile.close(); + + // Load the ELF using dlopen + void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL); + ET_CHECK_OR_RETURN_ERROR( + so_handle != nullptr, + AccessFailed, + "Failed to load shared library: %s", + dlerror()); + + processed->Free(); + + // Register all shared library functions + ET_CHECK_OK_OR_RETURN_ERROR(register_shared_library_functions(so_handle)); + + AOTInductorModelContainerHandle container_handle = nullptr; + + ET_CHECK_OK_OR_RETURN_ERROR(AOTInductorModelContainerCreateWithDevice( + &container_handle, 1, "cuda", nullptr)); + + ET_LOG(Info, "container_handle = %p", container_handle); + + AOTIDelegateHandle* handle = new AOTIDelegateHandle(); + handle->so_handle = so_handle; + handle->so_path = so_path.string(); 
+ handle->container_handle = container_handle; + + // Create a CUDA stream for asynchronous execution + cudaStream_t cuda_stream; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream)); + handle->cuda_stream = static_cast(cuda_stream); + + return (DelegateHandle*)handle; // Return the handle post-processing + } + + // Once per execution + Error execute( + BackendExecutionContext& context, + DelegateHandle* handle_, + Span args) const override { + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + // Need to re-register all the symbols from the so_handle hosted by this + // CudaBackend instance. The reason is that these symbols are + // static/singleton across the whole process. When we share multiple methods + // (meaning multiple so_handle) in the same process, we need to re-register + // the symbols from the so_handle that is being used in this execution. + ET_CHECK_OK_OR_RETURN_ERROR( + register_shared_library_functions(handle->so_handle)); + + size_t n_inputs; + AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs); + + size_t n_outputs; + AOTInductorModelContainerGetNumOutputs( + handle->container_handle, &n_outputs); + + ET_CHECK_OR_RETURN_ERROR( + n_inputs + n_outputs == args.size(), + InvalidArgument, + "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. 
Exit.", + n_inputs, + n_outputs, + args.size()) + + // NOTE: ExecuTorch tensors are always on CPU/host memory + // We need to create GPU copies for CUDA kernel execution + std::vector gpu_inputs( + n_inputs); // GPU copies for kernel execution + std::vector gpu_outputs( + n_outputs); // GPU tensors for kernel output + + // Process input tensors: ExecuTorch provides CPU tensors, create GPU + // copies + for (int i = 0; i < n_inputs; i++) { + // Get tensor dimensions and properties from ExecuTorch CPU tensor + auto cpu_tensor = &(args[i]->toTensor()); + auto sizes = cpu_tensor->sizes(); + auto scalar_type = cpu_tensor->scalar_type(); + + // Create GPU tensor with same shape + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_input_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 1, // device_type = cuda + 0, // device_index = 0 + &gpu_input_handle); + + ET_CHECK_OR_RETURN_ERROR( + create_err == Error::Ok, + Internal, + "Failed to create GPU tensor for input %d", + i); + + gpu_inputs[i] = gpu_input_handle; + + // Copy data from CPU to GPU + ET_CHECK_OR_RETURN_ERROR( + aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, + Internal, + "Failed to copy input %d from CPU to GPU", + i); + } + // Process output tensors: create GPU counterparts for ExecuTorch CPU + // tensors + for (int i = 0; i < n_outputs; i++) { + // Get output tensor dimensions from ExecuTorch CPU tensor + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + auto sizes = cpu_output_tensor->sizes(); + auto scalar_type = cpu_output_tensor->scalar_type(); + + // Create GPU tensor with same shape for kernel output + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_output_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 1, 
// device_type = cuda + 0, // device_index = 0 + &gpu_output_handle); + + ET_CHECK_OR_RETURN_ERROR( + create_err == Error::Ok, + Internal, + "Failed to create GPU tensor for output %d", + i); + + gpu_outputs[i] = gpu_output_handle; + } + // Run AOTI container with GPU tensors + AOTIRuntimeError error = AOTInductorModelContainerRun( + handle->container_handle, + gpu_inputs.data(), // Use GPU input tensors + n_inputs, + gpu_outputs.data(), // Use GPU output tensors + n_outputs, + handle->cuda_stream, // Pass the actual CUDA stream + nullptr); // proxy_executor_handle can remain nullptr + + ET_CHECK_OR_RETURN_ERROR( + error == Error::Ok, + Internal, + "AOTInductorModelContainerRun failed with error code %d", + error); + + // Copy GPU output results back to CPU output tensors + for (int i = 0; i < n_outputs; i++) { + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + // For DYNAMIC_BOUND tensors we try to resize + ET_CHECK_OK_OR_RETURN_ERROR( + resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), + "Error resizing tensor at output index %d", + i); + ET_CHECK_OK_OR_RETURN_ERROR( + aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), + "Failed to copy GPU output %d back to CPU", + i); + } + + return Error::Ok; + } + + void destroy(DelegateHandle* handle_) const override { + if (handle_ == nullptr) { + return; + } + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + // Destroy the CUDA stream if it exists + if (handle->cuda_stream != nullptr) { + cudaStream_t cuda_stream = static_cast(handle->cuda_stream); + cudaError_t stream_err = cudaStreamDestroy(cuda_stream); + ET_CHECK_OR_LOG_ERROR( + stream_err == cudaSuccess, + "Failed to destroy CUDA stream: %s", + cudaGetErrorString(stream_err)); + handle->cuda_stream = nullptr; + } + + // NOTE: AOTInductorModelContainerDelete does not work correctly with + // multiple .so files. 
Deleting one container frees shared resources, + // which causes segmentation faults when attempting to delete other + // containers. As a workaround, we skip explicit container deletion + // and defer cleanup to the OS. + // TODO(gasoonjia): Find a proper solution for safe container deletion. + // AOTInductorModelContainerDelete(handle->container_handle); + + // Now close the shared library + if (handle->so_handle != nullptr) { + dlclose(handle->so_handle); + } + + // Remove the temporary shared library file + if (!handle->so_path.empty()) { + std::error_code remove_error; + std::filesystem::remove(handle->so_path, remove_error); + ET_CHECK_OR_LOG_ERROR( + !remove_error, + "Failed to remove temporary shared library %s: %s", + handle->so_path.c_str(), + remove_error.message().c_str()); + } + + delete handle; + clear_all_tensors(); + } +}; + +} // namespace executorch::backends::cuda + +namespace executorch::backends { +namespace { +auto cls = cuda::CudaBackend(); +executorch::runtime::Backend backend{"CudaBackend", &cls}; +static executorch::runtime::Error success_with_compiler = + register_backend(backend); +} // namespace +} // namespace executorch::backends diff --git a/backends/cuda/runtime/guard.cpp b/backends/cuda/runtime/guard.cpp new file mode 100644 index 00000000000..674cc6387b3 --- /dev/null +++ b/backends/cuda/runtime/guard.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace executorch::backends::cuda { + +namespace { +// Thread-local stream storage (private to this file) +thread_local std::unordered_map current_streams_; +} // namespace + +Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index) { + if (device_index == -1) { + // Get current device if not specified + int current_device; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(¤t_device)); + device_index = current_device; + } + + current_streams_[device_index] = stream; + return Error::Ok; +} + +Result getCurrentCUDAStream(DeviceIndex device_index) { + if (device_index == -1) { + int current_device; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(¤t_device)); + device_index = current_device; + } + + auto it = current_streams_.find(device_index); + if (it != current_streams_.end()) { + return it->second; + } + + cudaStream_t stream; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&stream)); + setCurrentCUDAStream(stream, device_index); + return stream; +} + +CUDAGuard::CUDAGuard(CUDAGuard&& other) noexcept + : original_device_index_(other.original_device_index_), + current_device_index_(other.current_device_index_) { + // Mark the moved-from object as "already restored" so its destructor doesn't + // try to restore the device + other.original_device_index_ = other.current_device_index_; +} + +CUDAGuard::~CUDAGuard() { + if (original_device_index_ != current_device_index_) { + cudaError_t err = cudaSetDevice(original_device_index_); + if (err != cudaSuccess) { + ET_LOG( + Error, + "~CUDAGuard: Failed to restore device to %d: %s", + original_device_index_, + cudaGetErrorString(err)); + } + } +} + +Error CUDAGuard::set_index(DeviceIndex device_index) { + int orig_index = -1; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&orig_index)); + + original_device_index_ = orig_index; + current_device_index_ = device_index; + + if (current_device_index_ != original_device_index_) { + 
ET_CUDA_CHECK_OR_RETURN_ERROR(cudaSetDevice(current_device_index_)); + } + + return Error::Ok; +} + +Result CUDAGuard::create(DeviceIndex device_index) { + CUDAGuard guard; // Fixed: Removed () to create a variable, not a function + ET_CHECK_OK_OR_RETURN_ERROR(guard.set_index(device_index)); + return guard; +} + +CUDAStreamGuard::CUDAStreamGuard(CUDAStreamGuard&& other) noexcept + : device_guard_(std::move(other.device_guard_)), + original_stream_(other.original_stream_), + current_stream_(other.current_stream_), + device_index_(other.device_index_) { + // Mark the moved-from object as "already restored" so its destructor doesn't + // try to restore the stream + other.original_stream_ = other.current_stream_; +} + +CUDAStreamGuard::~CUDAStreamGuard() { + // Restore the original stream unless this object was moved-from. + // After a move, original_stream_ == current_stream_, which indicates + // the moved-from object should not restore. + // Note: nullptr is a valid stream value (represents the default stream), + // so we must restore even if original_stream_ is nullptr. 
+ if (original_stream_ != current_stream_) { + Error err = setCurrentCUDAStream(original_stream_, device_index_); + if (err != Error::Ok) { + ET_LOG( + Error, + "~CUDAStreamGuard: Failed to restore stream for device %d", + device_index_); + } + } +} + +Error CUDAStreamGuard::set_stream( + cudaStream_t stream, + DeviceIndex device_index) { + auto result = getCurrentCUDAStream(device_index); + if (!result.ok()) { + ET_LOG(Error, "Failed to get current stream for device %d", device_index); + return result.error(); + } + + original_stream_ = result.get(); + current_stream_ = stream; + device_index_ = device_index; + + ET_CHECK_OK_OR_RETURN_ERROR(setCurrentCUDAStream(stream, device_index)); + + return Error::Ok; +} + +Result CUDAStreamGuard::create( + cudaStream_t stream, + DeviceIndex device_index) { + auto guard_result = CUDAGuard::create(device_index); + ET_CHECK_OK_OR_RETURN_ERROR(guard_result.error()); + + CUDAStreamGuard stream_guard(std::move(guard_result.get())); + ET_CHECK_OK_OR_RETURN_ERROR(stream_guard.set_stream(stream, device_index)); + + return stream_guard; +} + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/guard.h b/backends/cuda/runtime/guard.h new file mode 100644 index 00000000000..3f187000f90 --- /dev/null +++ b/backends/cuda/runtime/guard.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +using executorch::runtime::Error; +using executorch::runtime::Result; + +// Type alias for device index +using DeviceIndex = int32_t; + +/** + * Set the current CUDA stream for the specified device. 
+ * + * @param stream The CUDA stream to set as current + * @param device_index The device index (-1 to use current device) + * @return Error code indicating success or failure + */ +Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index = -1); + +/** + * Get the current CUDA stream for the specified device. + * If no stream has been set, creates a new stream and sets it as current. + * + * @param device_index The device index (-1 to use current device) + * @return Result containing the current stream on success, or an error code on + * failure + */ +Result getCurrentCUDAStream(DeviceIndex device_index = -1); + +/** + * RAII guard that sets the current CUDA device and restores it on destruction. + * This ensures that the device is properly restored even if an exception + * occurs. + * + */ +class CUDAGuard { + private: + /** + * Private constructor - use create() factory method instead. + */ + explicit CUDAGuard() + : original_device_index_(-1), current_device_index_(-1) {} + + public: + /** + * Factory method to create a CUDAGuard. + * + * @param device_index The device index to set as current + * @return Result containing the guard on success, or an error code on failure + */ + static Result create(DeviceIndex device_index); + + // Copy is not allowed + CUDAGuard(const CUDAGuard&) = delete; + CUDAGuard& operator=(const CUDAGuard&) = delete; + + // Move constructor and assignment + CUDAGuard(CUDAGuard&& other) noexcept; + CUDAGuard& operator=(CUDAGuard&& other) = delete; + + /** + * Destructor that restores the original device if necessary. + */ + ~CUDAGuard(); + + /** + * Sets the CUDA device to the given device index. + * + * @param device_index The device index to set as current + * @return Error code indicating success or failure + */ + Error set_index(DeviceIndex device_index); + + /** + * Get the original device index before the guard was created. 
+ * + * @return The original device index + */ + DeviceIndex original_device() const { + return original_device_index_; + } + + /** + * Get the current device index. + * + * @return The current device index + */ + DeviceIndex current_device() const { + return current_device_index_; + } + + private: + /// The original device before this guard was created + DeviceIndex original_device_index_; + /// The current device managed by this guard + DeviceIndex current_device_index_; +}; + +/** + * RAII guard that sets the current CUDA device and stream, restoring both on + * destruction. This is useful for temporarily switching to a different device + * and stream. + * + */ +class CUDAStreamGuard { + private: + // Private constructor that takes a CUDAGuard + explicit CUDAStreamGuard(CUDAGuard&& guard) + : device_guard_(std::move(guard)), + original_stream_(nullptr), + current_stream_(nullptr), + device_index_(-1) {} + + public: + /** + * Factory method to create a CUDAStreamGuard. + * + * @param stream The CUDA stream to set as current + * @param device_index The device index for the stream + * @return Result containing the guard on success, or an error code on failure + */ + static Result create( + cudaStream_t stream, + DeviceIndex device_index); + + // Copy is not allowed + CUDAStreamGuard(const CUDAStreamGuard&) = delete; + CUDAStreamGuard& operator=(const CUDAStreamGuard&) = delete; + + // Move constructor and assignment + CUDAStreamGuard(CUDAStreamGuard&& other) noexcept; + CUDAStreamGuard& operator=(CUDAStreamGuard&& other) noexcept = delete; + + /** + * Destructor that restores the original stream and device. + */ + ~CUDAStreamGuard(); + + /** + * Sets the CUDA stream to the given stream on the specified device. 
+ * + * @param stream The CUDA stream to set as current + * @param device_index The device index for the stream + * @return Error code indicating success or failure + */ + Error set_stream(cudaStream_t stream, DeviceIndex device_index); + + /** + * Get the current guarded stream. + * + * @return The current stream + */ + cudaStream_t stream() const { + return current_stream_; + } + + /** + * Get the device index being guarded. + * + * @return The device index + */ + DeviceIndex device_index() const { + return device_index_; + } + + private: + /// The device guard that handles device switching + CUDAGuard device_guard_; + /// The original stream that was current before this guard + cudaStream_t original_stream_ = nullptr; + /// The current stream being guarded + cudaStream_t current_stream_ = nullptr; + /// The device index for this stream guard + DeviceIndex device_index_; +}; + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/cuda_guard.cpp b/backends/cuda/runtime/shims/cuda_guard.cpp new file mode 100644 index 00000000000..bb07acc7ffa --- /dev/null +++ b/backends/cuda/runtime/shims/cuda_guard.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace executorch::backends::cuda { + +extern "C" { + +AOTITorchError aoti_torch_create_cuda_guard( + int32_t device_index, + CUDAGuardHandle* ret_guard) { + ET_CHECK_OR_RETURN_ERROR( + ret_guard != nullptr, + InvalidArgument, + "aoti_torch_create_cuda_guard failed: ret_guard is null"); + + auto result = CUDAGuard::create(device_index); + if (!result.ok()) { + return result.error(); + } + *ret_guard = new CUDAGuard(std::move(result.get())); + return Error::Ok; +} + +AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard) { + ET_CHECK_OR_RETURN_ERROR( + guard != nullptr, + InvalidArgument, + "aoti_torch_delete_cuda_guard failed: guard is null"); + + delete guard; + return Error::Ok; +} + +AOTITorchError aoti_torch_cuda_guard_set_index( + CUDAGuardHandle guard, + int32_t device_index) { + ET_CHECK_OR_RETURN_ERROR( + guard != nullptr, + InvalidArgument, + "aoti_torch_cuda_guard_set_index failed: guard is null"); + + ET_CHECK_OK_OR_RETURN_ERROR(guard->set_index(device_index)); + return Error::Ok; +} + +AOTITorchError aoti_torch_create_cuda_stream_guard( + void* stream, + int32_t device_index, + CUDAStreamGuardHandle* ret_guard) { + ET_CHECK_OR_RETURN_ERROR( + ret_guard != nullptr, + InvalidArgument, + "aoti_torch_create_cuda_stream_guard failed: ret_guard is null"); + + ET_CHECK_OR_RETURN_ERROR( + stream != nullptr, + InvalidArgument, + "aoti_torch_create_cuda_stream_guard failed: stream is null"); + + auto result = + CUDAStreamGuard::create(static_cast(stream), device_index); + if (!result.ok()) { + return result.error(); + } + *ret_guard = new CUDAStreamGuard(std::move(result.get())); + return Error::Ok; +} + +AOTITorchError aoti_torch_delete_cuda_stream_guard( + CUDAStreamGuardHandle guard) { + ET_CHECK_OR_RETURN_ERROR( + guard != nullptr, + InvalidArgument, + "aoti_torch_delete_cuda_stream_guard failed: guard is null"); + + delete guard; + return Error::Ok; +} + +AOTITorchError aoti_torch_get_current_cuda_stream( + int32_t 
device_index, + void** ret_stream) { + ET_CHECK_OR_RETURN_ERROR( + ret_stream != nullptr, + InvalidArgument, + "aoti_torch_get_current_cuda_stream failed: ret_stream is null"); + + auto result = getCurrentCUDAStream(device_index); + if (!result.ok()) { + return result.error(); + } + *ret_stream = static_cast(result.get()); + return Error::Ok; +} + +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h new file mode 100644 index 00000000000..f930f3df643 --- /dev/null +++ b/backends/cuda/runtime/shims/cuda_guard.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +using executorch::backends::aoti::AOTITorchError; + +extern "C" { + +// Handle types for CUDA guards +using CUDAGuardHandle = CUDAGuard*; +using CUDAStreamGuardHandle = CUDAStreamGuard*; + +/** + * Creates a CUDA device guard that sets the current device and restores it + * upon destruction. + * + * @param device_index The device index to set as current + * @param ret_guard Output parameter for the created guard handle (must not be + * null) + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_create_cuda_guard( + int32_t device_index, + CUDAGuardHandle* ret_guard); + +/** + * Deletes a CUDA device guard and frees its associated resources. + * + * @param guard Handle to the guard to be deleted + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); + +/** + * Sets the CUDA device to a new index for an existing guard. 
+ * + * @param guard Handle to the guard + * @param device_index The device index to set as current + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_cuda_guard_set_index( + CUDAGuardHandle guard, + int32_t device_index); + +/** + * Creates a CUDA stream guard that sets the current device and stream, + * restoring both upon destruction. + * + * @param stream The CUDA stream to set as current + * @param device_index The device index for the stream + * @param ret_guard Output parameter for the created guard handle (must not be + * null) + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_create_cuda_stream_guard( + void* stream, + int32_t device_index, + CUDAStreamGuardHandle* ret_guard); + +/** + * Deletes a CUDA stream guard and frees its associated resources. + * + * @param guard Handle to the stream guard to be deleted + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); + +/** + * Gets the current CUDA stream for a specified device. + * + * @param device_index The device index (-1 to use current device) + * @param ret_stream Output parameter for the current stream (must not be null) + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_get_current_cuda_stream( + int32_t device_index, + void** ret_stream); + +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp new file mode 100644 index 00000000000..6fe315ba8ee --- /dev/null +++ b/backends/cuda/runtime/shims/memory.cpp @@ -0,0 +1,663 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include // For posix_memalign +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +using executorch::aten::SizesType; +using executorch::aten::StridesType; +using executorch::backends::aoti::aoti_torch_get_device_index; +using executorch::backends::aoti::aoti_torch_get_dtype; +using executorch::backends::aoti::aoti_torch_get_sizes; +using executorch::backends::aoti::aoti_torch_get_strides; +using executorch::backends::aoti::dtype_to_element_size; +using executorch::backends::aoti::dtype_to_scalar_type; +using executorch::backends::aoti::validate_storage_offset; + +// Global storage for tensors and their metadata +std::unordered_set> tensors; + +// Reference counting for memory addresses +// Maps memory address to number of tensors using it +// Special value: NOT_OWN (-1) means tensor never owns the memory +constexpr int32_t NOT_OWN = -1; +std::unordered_map memory_to_n_tensor; + +namespace { + +// Calculate linear offset from strides and indices +int64_t calculate_linear_offset( + const int64_t* indices, + const int64_t* strides, + int64_t ndim) { + int64_t offset = 0; + for (int64_t i = 0; i < ndim; ++i) { + offset += indices[i] * strides[i]; + } + return offset; +} + +// Convert linear index to multi-dimensional indices based on sizes +void linear_to_indices( + int64_t linear_idx, + const int64_t* sizes, + int64_t ndim, + int64_t* indices) { + for (int64_t i = ndim - 1; i >= 0; --i) { + indices[i] = linear_idx % sizes[i]; + linear_idx /= sizes[i]; + } +} + +// Generic pointwise copy function that handles arbitrary strides +template +AOTITorchError pointwise_copy_generic( + T* dst_data, + const T* src_data, + const int64_t* dst_sizes, + const int64_t* dst_strides, + const int64_t* src_sizes, + const int64_t* 
src_strides, + int64_t dst_ndim, + int64_t src_ndim, + int64_t total_elements) { + std::vector dst_indices(dst_ndim); + std::vector src_indices(src_ndim); + + for (int64_t linear_idx = 0; linear_idx < total_elements; ++linear_idx) { + // Convert linear index to multi-dimensional indices for both tensors + linear_to_indices(linear_idx, dst_sizes, dst_ndim, dst_indices.data()); + linear_to_indices(linear_idx, src_sizes, src_ndim, src_indices.data()); + + // Calculate offsets for both source and destination + int64_t src_offset = + calculate_linear_offset(src_indices.data(), src_strides, src_ndim); + int64_t dst_offset = + calculate_linear_offset(dst_indices.data(), dst_strides, dst_ndim); + + // Copy element + dst_data[dst_offset] = src_data[src_offset]; + } + + return Error::Ok; +} + +} // anonymous namespace + +extern "C" { + +AOTITorchError aoti_torch_create_tensor_from_blob_v2( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor, + int32_t layout, + const uint8_t* opaque_metadata, + int64_t opaque_metadata_size) { + // TODO(gasoonjia): verify given data is on the target device + (void)device_type; + (void)opaque_metadata; + (void)layout; + (void)opaque_metadata_size; + + // Validate input parameters first + ET_CHECK_OR_RETURN_ERROR( + data != nullptr, + InvalidArgument, + "aoti_torch_create_tensor_from_blob_v2 failed: data pointer is null"); + + ET_CHECK_OR_RETURN_ERROR( + !(sizes_ptr == nullptr && ndim > 0), + InvalidArgument, + "aoti_torch_create_tensor_from_blob_v2 failed: sizes_ptr is null"); + + ET_CHECK_OR_RETURN_ERROR( + ret_new_tensor != nullptr, + InvalidArgument, + "aoti_torch_create_tensor_from_blob_v2 failed: ret_new_tensor is null"); + + // Check that device_index is always 0 + ET_CHECK_OR_RETURN_ERROR( + device_index == 0, + InvalidArgument, + "device_index must be 0, got: %d", + device_index); + + 
// Validate dtype using SupportedDTypes from utils.h + ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); + + // Storage offset must be 0 since from_blob cannot handle different offsets + ET_CHECK_OK_OR_RETURN_ERROR(validate_storage_offset(storage_offset)); + + // Convert sizes to the format expected by ExecutorTorch using SizesType + std::vector sizes = + convert_sizes_to_vector(ndim, sizes_ptr); + + // Convert strides using the common helper function with StridesType + std::vector strides = + convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); + + // Create ExecutorTorch tensor that wraps the existing memory + // Note: We're NOT copying the data, just wrapping it + auto tensor = executorch::extension::from_blob( + data, // existing memory (don't copy!) + sizes, // tensor dimensions + strides, // tensor strides (allows different strides) + dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType + ); + + ET_CHECK_OR_RETURN_ERROR( + tensor != nullptr, InvalidArgument, "Failed to create tensor from blob"); + + // Store the tensor so it doesn't get destroyed + tensors.insert(tensor); + + *ret_new_tensor = tensor.get(); + + // Check if this memory address is already being tracked + auto memory_it = memory_to_n_tensor.find(data); + ET_CHECK_OR_RETURN_ERROR( + memory_it == memory_to_n_tensor.end(), + InvalidArgument, + "Memory address %p is already being tracked by another tensor", + data); + + // Mark this memory as NOT_OWN since tensor created from blob never owns + // memory + memory_to_n_tensor[data] = NOT_OWN; + + return Error::Ok; +} + +AOTITorchError aoti_torch_empty_strided( + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor) { + // Check that device_index is always 0 + ET_CHECK_OR_RETURN_ERROR( + device_index == 0, + InvalidArgument, + "device_index must be 0, got: %d", + device_index); + + // This requires us to reserve CUDA memory and 
put it into a ETensor + void* ptr; + int64_t numel = 1; + for (int64_t i = 0; i < ndim; i++) { + numel *= sizes_ptr[i]; + } + + ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); + + size_t element_size = dtype_to_element_size(dtype); + ET_CHECK_OR_RETURN_ERROR( + element_size != 0, + InvalidArgument, + "Invalid element size for dtype: %d", + dtype); + int64_t nbytes = numel * element_size; + + if (device_type == static_cast(SupportedDevices::CUDA)) { + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaMallocAsync(&ptr, static_cast(nbytes), cudaStreamDefault)); + } else if (device_type == static_cast(SupportedDevices::CPU)) { + // Ensure 16-byte alignment for CPU memory to match CUDA requirements + int result = posix_memalign(&ptr, 16, nbytes); + ET_CHECK_OR_RETURN_ERROR( + result == 0, + MemoryAllocationFailed, + "Failed to allocate aligned CPU memory"); + ET_CHECK_OR_RETURN_ERROR( + ptr != nullptr, + MemoryAllocationFailed, + "Failed to call posix_memalign"); + } else { + ET_CHECK_OR_RETURN_ERROR( + false, + NotImplemented, + "Need to implement empty_strided for non-CUDA non-CPU device type %d", + device_type); + } + + // ETensor sizes + auto sizes = convert_sizes_to_vector(ndim, sizes_ptr); + + // ETensor strides + auto strides = convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); + + // ETensor creation with dynamic shape support for edge cases + auto tensor = executorch::extension::from_blob( + ptr, sizes, strides, dtype_to_scalar_type(dtype)); + + // Store the tensor so it doesn't get destroyed + tensors.insert(tensor); + *ret_new_tensor = tensor.get(); + + // This tensor owns the memory it allocated, set reference count to 1 + memory_to_n_tensor[ptr] = 1; + + return Error::Ok; +} + +void clear_all_tensors() { + // Use aoti_torch_delete_tensor_object to properly delete each tensor + // Note: We need to collect tensor pointers first since deletion modifies the + // set + std::vector tensor_ptrs; + tensor_ptrs.reserve(tensors.size()); + for (const auto& tensor_shared 
: tensors) { + tensor_ptrs.push_back(tensor_shared.get()); + } + + // Now delete each tensor - this will modify the global tensors set + for (Tensor* tensor_ptr : tensor_ptrs) { + aoti_torch_delete_tensor_object(tensor_ptr); + } + + // tensors set should now be empty, but ensure it's cleared + tensors.clear(); + + ET_LOG(Info, "Cleared all tensors"); +} + +AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) { + // Handle null tensor pointer + ET_CHECK_OR_RETURN_ERROR( + tensor != nullptr, InvalidArgument, "Cannot delete null tensor"); + + // Check if tensor exists in our tracking + bool found_in_tensors = false; + for (auto it = tensors.begin(); it != tensors.end(); ++it) { + if (it->get() == tensor) { + found_in_tensors = true; + break; + } + } + + // If tensor not found in our tracking, it's invalid + ET_CHECK_OR_RETURN_ERROR( + found_in_tensors, InvalidArgument, "Didn't find tensor %p", tensor); + + // Find and delete the tensor + for (auto it = tensors.begin(); it != tensors.end(); ++it) { + if (it->get() == tensor) { + // Get the tensor before erasing + auto tensor_ptr = *it; + void* data_ptr = tensor_ptr->mutable_data_ptr(); + + // Find the reference count for this memory address + auto memory_it = memory_to_n_tensor.find(data_ptr); + if (memory_it != memory_to_n_tensor.end()) { + int32_t ref_count = memory_it->second; + + if (ref_count == NOT_OWN) { + // Tensor never owned the memory, skip freeing + // Just remove tensor from tracking + tensors.erase(it); + return Error::Ok; + } else if (ref_count == 1) { + // Only current tensor using this memory, free it + // Determine if it's GPU memory + cudaPointerAttributes attributes{}; + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaPointerGetAttributes(&attributes, data_ptr)); + + if (attributes.type == cudaMemoryTypeDevice) { + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaFreeAsync(data_ptr, cudaStreamDefault)); + } else { + ET_CHECK_OR_RETURN_ERROR( + attributes.type != cudaMemoryTypeManaged, + Internal, + "Expected 
host memory but got managed!") + // This is CPU memory - free immediately + free(data_ptr); + data_ptr = nullptr; + } + + // Remove from memory tracking + memory_to_n_tensor.erase(memory_it); + } else if (ref_count > 1) { + // Other tensors still using this memory, just decrement count + memory_to_n_tensor[data_ptr] = ref_count - 1; + } + } else { + ET_CHECK_OR_RETURN_ERROR( + false, + Internal, + "Internal error: memory not found during deletion"); + } + + // Remove tensor from set (this will call the destructor if it's the last + // reference) + tensors.erase(it); + return Error::Ok; + } + } + + // This should never be reached since we found it above + ET_CHECK_OR_RETURN_ERROR( + false, Internal, "Internal error: tensor not found after validation"); +} + +AOTITorchError +aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking) { + (void)non_blocking; + + // Check for null pointers first + ET_CHECK_OR_RETURN_ERROR( + self != nullptr, + InvalidArgument, + "aoti_torch_copy_ failed: self tensor is null"); + + ET_CHECK_OR_RETURN_ERROR( + src != nullptr, + InvalidArgument, + "aoti_torch_copy_ failed: src tensor is null"); + + // Get dtype information and validate compatibility + int32_t self_dtype, src_dtype; + aoti_torch_get_dtype(self, &self_dtype); + aoti_torch_get_dtype(src, &src_dtype); + + ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(self_dtype)); + + ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(src_dtype)); + + // Check dtype compatibility - both tensors must have the same dtype + ET_CHECK_OR_RETURN_ERROR( + self_dtype == src_dtype, + InvalidArgument, + "dtype mismatch. self.dtype=%d, src.dtype=%d. aoti_torch_copy_ requires same dtypes", + self_dtype, + src_dtype); + + // Check total number of elements compatibility (PyTorch copy_ behavior) + int64_t self_numel = self->numel(); + int64_t src_numel = src->numel(); + + ET_CHECK_OR_RETURN_ERROR( + self_numel == src_numel, + InvalidArgument, + "numel mismatch. 
self.numel()=%ld, src.numel()=%ld", + self_numel, + src_numel); + + // Get tensor metadata + int64_t* self_strides; + int64_t* src_strides; + aoti_torch_get_strides(self, &self_strides); + aoti_torch_get_strides(src, &src_strides); + + int64_t* self_sizes; + int64_t* src_sizes; + aoti_torch_get_sizes(self, &self_sizes); + aoti_torch_get_sizes(src, &src_sizes); + + // Determine device locations + cudaPointerAttributes srcAttributes{}; + cudaPointerAttributes dstAttributes{}; + + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaPointerGetAttributes(&srcAttributes, src->data_ptr())); + + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaPointerGetAttributes(&dstAttributes, self->data_ptr())); + + bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice; + bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice; + + // Check if tensors have the same schema (sizes, strides, dtype) for fast path + bool same_schema = true; + for (int i = 0; i < self->dim(); i++) { + if (self_strides[i] != src_strides[i]) { + same_schema = false; + break; + } + } + + size_t total_bytes = src->nbytes(); + int64_t total_elements = self->numel(); + + if (same_schema) { + // Fast path: Direct memory copy since layouts match exactly + if (srcIsDevice && dstIsDevice) { + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + total_bytes, + cudaMemcpyDeviceToDevice)); + } else if (srcIsDevice && !dstIsDevice) { + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + total_bytes, + cudaMemcpyDeviceToHost)); + } else if (!srcIsDevice && dstIsDevice) { + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + total_bytes, + cudaMemcpyHostToDevice)); + } else { + std::memcpy(self->mutable_data_ptr(), src->data_ptr(), total_bytes); + } + } else { + // Fallback path: Pointwise copy with stride-aware indexing + // This handles arbitrary tensor layouts and strides + + size_t element_size = 
dtype_to_element_size(self_dtype); + ET_CHECK_OR_RETURN_ERROR( + element_size != 0, + InvalidArgument, + "Invalid element size for dtype: %d", + self_dtype); + + // Allocate temporary host memory for GPU tensors + float* src_host_data = nullptr; + float* dst_host_data = nullptr; + bool need_free_src = false; + bool need_free_dst = false; + + if (srcIsDevice) { + src_host_data = + static_cast(malloc(total_elements * sizeof(float))); + ET_CHECK_OR_RETURN_ERROR( + src_host_data != nullptr, + MemoryAllocationFailed, + "Failed to allocate memory for src_host_data"); + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( + src_host_data, src->data_ptr(), total_bytes, cudaMemcpyDeviceToHost)); + need_free_src = true; + } else { + src_host_data = static_cast(src->data_ptr()); + } + + if (dstIsDevice) { + dst_host_data = + static_cast(malloc(total_elements * sizeof(float))); + if (dst_host_data == nullptr) { + if (need_free_src) { + free(src_host_data); + } + ET_CHECK_OR_RETURN_ERROR( + false, + MemoryAllocationFailed, + "Failed to allocate memory for dst_host_data"); + } + need_free_dst = true; + } else { + dst_host_data = static_cast(self->mutable_data_ptr()); + } + + // Perform pointwise copy with stride calculation + AOTITorchError copy_err = pointwise_copy_generic( + dst_host_data, + src_host_data, + self_sizes, + self_strides, + src_sizes, + src_strides, + self->dim(), + src->dim(), + total_elements); + + if (copy_err != Error::Ok) { + // Clean up temporary buffers before returning + if (need_free_src) { + free(src_host_data); + } + if (need_free_dst) { + free(dst_host_data); + } + return copy_err; + } + + // Copy result back to device if needed + if (dstIsDevice) { + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( + self->mutable_data_ptr(), + dst_host_data, + total_bytes, + cudaMemcpyHostToDevice)); + } + + // Clean up temporary buffers + if (need_free_src) { + free(src_host_data); + } + if (need_free_dst) { + free(dst_host_data); + } + } + + return Error::Ok; +} + +AOTITorchError 
aoti_torch__reinterpret_tensor( + Tensor* self, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + Tensor** ret_new_tensor) { + // Validate input parameters first + ET_CHECK_OR_RETURN_ERROR( + self != nullptr, + InvalidArgument, + "aoti_torch__reinterpret_tensor failed: self tensor is null"); + + ET_CHECK_OR_RETURN_ERROR( + !(sizes_ptr == nullptr && ndim > 0), + InvalidArgument, + "aoti_torch__reinterpret_tensor failed: sizes_ptr is null"); + + ET_CHECK_OR_RETURN_ERROR( + ret_new_tensor != nullptr, + InvalidArgument, + "aoti_torch__reinterpret_tensor failed: ret_new_tensor is null"); + + // Check if storage_offset is not 0 - return error if not + ET_CHECK_OK_OR_RETURN_ERROR(validate_storage_offset(storage_offset)); + + // Get the device info from the source tensor to perform device_index + // validation + int32_t device_type = 0; + int32_t device_index = 0; + ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_type(self, &device_type)); + + ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_index(self, &device_index)); + + // Ensure device_index is always 0 + ET_CHECK_OR_RETURN_ERROR( + device_index == 0, + InvalidArgument, + "device_index must be 0, got: %d", + device_index); + + // Get the dtype from the source tensor + int32_t dtype = 0; + ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(self, &dtype)); + + // Validate dtype using SupportedDTypes + ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); + + // Get the original data pointer from the source tensor + void* data_ptr = self->mutable_data_ptr(); + ET_CHECK_OR_RETURN_ERROR( + data_ptr != nullptr, + InvalidArgument, + "Source tensor has null data pointer"); + + // Check if the given memory is in the map, if not return error + auto memory_it = memory_to_n_tensor.find(data_ptr); + ET_CHECK_OR_RETURN_ERROR( + memory_it != memory_to_n_tensor.end(), + InvalidArgument, + "Memory address %p is not being tracked by reference counting system", + data_ptr); + + // 
Convert sizes using utility function from utils.h + std::vector sizes = convert_sizes_to_vector(ndim, sizes_ptr); + + // Convert strides using utility function from utils.h + std::vector strides = + convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); + + // Create new tensor view that reinterprets the same memory with different + // shape/strides This creates a view, not a copy - the data pointer is shared + std::shared_ptr tensor = executorch::extension::from_blob( + data_ptr, // Reuse the same memory from source tensor + sizes, // New sizes with explicit SizesType + strides, // New strides with explicit StridesType + dtype_to_scalar_type(dtype) // Convert dtype with explicit type casting + ); + + ET_CHECK_OR_RETURN_ERROR( + tensor != nullptr, + InvalidArgument, + "Failed to create reinterpreted tensor view"); + + // Store the tensor so it doesn't get destroyed + tensors.insert(tensor); + + *ret_new_tensor = tensor.get(); + + // Increment the reference count for this memory address only if it is owned + // by tensor + memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN + ? NOT_OWN + : memory_to_n_tensor[data_ptr] + 1; + + return Error::Ok; +} + +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h new file mode 100644 index 00000000000..7a8d4c3609b --- /dev/null +++ b/backends/cuda/runtime/shims/memory.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace executorch::backends::cuda { + +using executorch::backends::aoti::AOTITorchError; +using executorch::backends::aoti::Tensor; + +extern "C" { + +/** + * Creates a tensor object from an existing memory blob without copying the + * data. 
The tensor will wrap the provided memory and will not take ownership of + * it. When the tensor is deleted, the original memory will remain valid and + * must be freed by the caller. + * + * @param data Pointer to the memory blob to wrap (must not be null) + * @param ndim Number of dimensions in the tensor + * @param sizes_ptr Pointer to array of dimension sizes (using SizesType) + * @param strides_ptr Pointer to array of strides for each dimension (using + * StridesType, can be null for contiguous) + * @param storage_offset Storage offset (must be 0 for current implementation) + * @param dtype Data type identifier (supports FLOAT32 and BFLOAT16 from + * SupportedDTypes) + * @param device_type Device type (CPU=0, CUDA=1 from SupportedDevices) + * @param device_index Device index (must be 0 for current implementation) + * @param ret_new_tensor Output parameter for the created tensor (must not be + * null) + * @param layout Tensor layout identifier (0=strided) + * @param opaque_metadata Optional metadata pointer (can be null) + * @param opaque_metadata_size Size of opaque metadata in bytes + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_create_tensor_from_blob_v2( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor, + int32_t layout, + const uint8_t* opaque_metadata, + int64_t opaque_metadata_size); + +/** + * Creates an uninitialized tensor with specified dimensions, strides, and + * dtyper on either CPU or CUDA device. 
+ * + * @param ndim Number of dimensions in the tensor + * @param sizes_ptr Pointer to array of dimension sizes + * @param strides_ptr Pointer to array of strides for each dimension + * @param dtype Data type identifier (matches PyTorch scalar types) + * @param device_type Device type (0=CPU, 1=CUDA) + * @param device_index Device index (must be 0 for current implementation) + * @param ret_new_tensor Output parameter for the created tensor + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_empty_strided( + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor); + +/** + * Deletes a tensor object and frees its associated memory. + * + * @param tensor Pointer to the tensor object to be deleted + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); + +/** + * Creates a tensor view that reinterprets the same underlying memory with + * different shape and strides without copying data. + * + * Note that the new tensor will not have the ownership of the underlying + * memory. 
+ * + * @param self Input tensor whose memory will be reinterpreted + * @param ndim Number of dimensions for the new tensor view + * @param sizes_ptr Array of sizes for each dimension + * @param strides_ptr Array of strides for each dimension (or nullptr for + * contiguous) + * @param storage_offset Storage offset (must be 0) + * @param ret_new_tensor Output pointer to store the new tensor view + * + * @return Error::Ok on success, appropriate error code on failure + */ +AOTITorchError aoti_torch__reinterpret_tensor( + Tensor* self, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + Tensor** ret_new_tensor); + +/** + * Copies data from source tensor to destination tensor. + * + * This function implements copy function for tensors living in CUDA AOTI + * backend. It supports copying between tensors with different shapes (as long + * as they have the same total number of elements) and different memory + * layouts/strides. + * + * Note that currently this function does not support copying between tensors + * with different dtypes. 
+ * + * @param self Destination tensor (data will be overwritten) + * @param src Source tensor (data will be copied from this tensor) + * @param non_blocking Whether the copy should be non-blocking (currently + * ignored) + * + * @return Error::Ok on success, appropriate error code on failure: + * - Error::InvalidArgument: null pointers, dtype mismatch, numel + * mismatch + * - Error::MemoryAllocationFailed: failed to allocate temporary memory + * - Error::Internal: CUDA operation failures + */ +AOTITorchError +aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); + +// Function to clear all tensors from internal storage +void clear_all_tensors(); +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tensor_attribute.cpp b/backends/cuda/runtime/shims/tensor_attribute.cpp new file mode 100644 index 00000000000..1a14c79f9f2 --- /dev/null +++ b/backends/cuda/runtime/shims/tensor_attribute.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace executorch::backends::cuda { + +extern "C" { + +// Device type functions for tensor attributes +AOTITorchError aoti_torch_get_device_type( + Tensor* tensor, + int32_t* ret_device_type) { + // All tensors in aoti-cuda delegate are on CUDA + *ret_device_type = aoti_torch_device_type_cuda(); + return Error::Ok; +} + +// Device type constants +int32_t aoti_torch_device_type_cuda() { + // Let's say cuda is 1 for ET as well + return 1; +} + +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h new file mode 100644 index 00000000000..15a4e397d24 --- /dev/null +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace executorch::backends::cuda { + +// Common using declarations for ExecutorTorch types +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + +extern "C" { + +// Common AOTI type aliases +using AOTITorchError = Error; + +// Device type functions for tensor attributes +AOTITorchError aoti_torch_get_device_type( + Tensor* tensor, + int32_t* ret_device_type); + +// Device type constants +int32_t aoti_torch_device_type_cuda(); + +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tests/TARGETS b/backends/cuda/runtime/shims/tests/TARGETS new file mode 100644 index 00000000000..9ff3e83a8bd --- /dev/null +++ b/backends/cuda/runtime/shims/tests/TARGETS @@ -0,0 +1,6 @@ +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git 
a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl new file mode 100644 index 00000000000..70f27b86bec --- /dev/null +++ b/backends/cuda/runtime/shims/tests/targets.bzl @@ -0,0 +1,35 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") +load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils") + +def cuda_shim_cpp_unittest(name): + cpp_unittest( + name = "test_" + name, + srcs = [ + "test_" + name + ".cpp", + ], + deps = [ + "//executorch/backends/aoti:common_shims", + "//executorch/backends/cuda/runtime:runtime_shims", + "//executorch/extension/tensor:tensor", + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + "//executorch/runtime/core/exec_aten:lib", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], + ) + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + cuda_shim_cpp_unittest("aoti_torch_empty_strided") + cuda_shim_cpp_unittest("aoti_torch_delete_tensor_object") + cuda_shim_cpp_unittest("aoti_torch_create_tensor_from_blob_v2") + cuda_shim_cpp_unittest("aoti_torch__reinterpret_tensor") + cuda_shim_cpp_unittest("aoti_torch_copy_") + cuda_shim_cpp_unittest("aoti_torch_cuda_guard") diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp new file mode 100644 index 00000000000..e18bf142b5c --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp @@ -0,0 +1,810 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::aoti; +using namespace executorch::backends::cuda; +using namespace executorch::runtime; +using executorch::runtime::etensor::Tensor; + +// Test fixture for aoti_torch__reinterpret_tensor tests +class AOTITorchReinterpretTensorTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize ExecuTorch Platform Abstraction Layer + et_pal_init(); + + // Check if CUDA is available + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + + // Clear any remaining tensors from previous tests + clear_all_tensors(); + } + + void TearDown() override { + // Clean up metadata + cleanup_tensor_metadata(); + + // Clear the global tensor storage using the provided function + clear_all_tensors(); + } + + // Helper to calculate number of elements from sizes + int64_t calculate_numel(const std::vector& sizes) { + int64_t numel = 1; + for (int64_t size : sizes) { + numel *= size; + } + return numel; + } + + // Helper to calculate contiguous strides from sizes + std::vector calculate_contiguous_strides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + return strides; + } + + // Helper to create a source tensor using empty_strided (which allocates new + // memory) + Tensor* create_source_tensor( + const std::vector& sizes, + int32_t dtype = 6, // float32 + 
int32_t device_type = 1, // CUDA + int32_t device_index = 0) { + std::vector strides = calculate_contiguous_strides(sizes); + + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + dtype, + device_type, + device_index, + &tensor); + + if (error != Error::Ok) { + return nullptr; + } + + return tensor; + } + + private: + std::vector cuda_memory_buffers_; + std::vector cpu_memory_buffers_; +}; + +// Test basic functionality: reinterpret tensor with different shapes +TEST_F(AOTITorchReinterpretTensorTest, BasicReinterpretation) { + // Create a source tensor with shape [12] (1D with 12 elements) + std::vector source_sizes = {12}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + // Store the original data pointer + void* original_data_ptr = source_tensor->mutable_data_ptr(); + ASSERT_NE(original_data_ptr, nullptr); + + // Reinterpret as [3, 4] (2D with same number of elements) + std::vector new_sizes = {3, 4}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check that the reinterpreted tensor has the new shape + EXPECT_EQ(reinterpreted_tensor->dim(), 2); + EXPECT_EQ(reinterpreted_tensor->size(0), 3); + EXPECT_EQ(reinterpreted_tensor->size(1), 4); + + // CRITICAL: Check that the reinterpreted tensor uses the SAME memory + void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); + EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) + << "Reinterpreted tensor should use the same memory as the source tensor"; + + // Write data through the original tensor and verify it's visible through the + // reinterpreted tensor + 
std::vector test_data = { + 1.0f, + 2.0f, + 3.0f, + 4.0f, + 5.0f, + 6.0f, + 7.0f, + 8.0f, + 9.0f, + 10.0f, + 11.0f, + 12.0f}; + cudaError_t cuda_err = cudaMemcpy( + original_data_ptr, + test_data.data(), + test_data.size() * sizeof(float), + cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess); + + // Read back through the reinterpreted tensor + std::vector readback_data(12); + cuda_err = cudaMemcpy( + readback_data.data(), + reinterpreted_data_ptr, + readback_data.size() * sizeof(float), + cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + + // Verify the data matches + for (size_t i = 0; i < test_data.size(); i++) { + EXPECT_EQ(readback_data[i], test_data[i]) + << "Data should be the same through both tensors at index " << i; + } +} + +// Test reinterpreting with different strides +TEST_F(AOTITorchReinterpretTensorTest, ReinterpretWithCustomStrides) { + // Create a source tensor with shape [2, 6] (contiguous) + std::vector source_sizes = {2, 6}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + void* original_data_ptr = source_tensor->mutable_data_ptr(); + ASSERT_NE(original_data_ptr, nullptr); + + // Reinterpret as [3, 4] with custom strides (still valid for the same memory) + std::vector new_sizes = {3, 4}; + std::vector new_strides = {4, 1}; // Row-major strides for [3, 4] + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check shape + EXPECT_EQ(reinterpreted_tensor->dim(), 2); + EXPECT_EQ(reinterpreted_tensor->size(0), 3); + EXPECT_EQ(reinterpreted_tensor->size(1), 4); + + // CRITICAL: Check that the reinterpreted tensor uses the SAME memory + void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); + 
EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) + << "Reinterpreted tensor should use the same memory as the source tensor"; + + // Verify strides were set correctly + int64_t* tensor_strides; + error = aoti_torch_get_strides(reinterpreted_tensor, &tensor_strides); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(tensor_strides[0], 4); + EXPECT_EQ(tensor_strides[1], 1); +} + +// Test error cases: null input tensor +TEST_F(AOTITorchReinterpretTensorTest, NullInputTensor) { + std::vector new_sizes = {2, 3}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + nullptr, // null input tensor + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test error cases: null sizes pointer +TEST_F(AOTITorchReinterpretTensorTest, NullSizesPointer) { + std::vector source_sizes = {6}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + std::vector new_strides = {2, 1}; + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + 2, // ndim > 0 + nullptr, // null sizes pointer + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test error cases: null return tensor pointer +TEST_F(AOTITorchReinterpretTensorTest, NullReturnTensorPointer) { + std::vector source_sizes = {6}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + std::vector new_sizes = {2, 3}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + nullptr); // null return tensor pointer + + EXPECT_EQ(error, 
Error::InvalidArgument); +} + +// Test error cases: non-zero storage offset (should fail) +TEST_F(AOTITorchReinterpretTensorTest, NonZeroStorageOffset) { + std::vector source_sizes = {6}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + std::vector new_sizes = {2, 3}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 1, // non-zero storage_offset (should fail) + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test reinterpreting CPU tensor +TEST_F(AOTITorchReinterpretTensorTest, ReinterpretCPUTensor) { + // Create a CPU tensor with shape [8] + std::vector source_sizes = {8}; + Tensor* source_tensor = create_source_tensor( + source_sizes, + 6, // float32 + 0, // CPU device + 0); + ASSERT_NE(source_tensor, nullptr); + + void* original_data_ptr = source_tensor->mutable_data_ptr(); + ASSERT_NE(original_data_ptr, nullptr); + + // Reinterpret as [2, 4] + std::vector new_sizes = {2, 4}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check that the reinterpreted tensor uses the SAME memory + void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); + EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) + << "Reinterpreted CPU tensor should use the same memory as the source tensor"; + + // Test direct memory access for CPU tensors + float* original_float_ptr = reinterpret_cast(original_data_ptr); + float* reinterpreted_float_ptr = + 
reinterpret_cast(reinterpreted_data_ptr); + + // Write through original and read through reinterpreted + original_float_ptr[0] = 42.0f; + EXPECT_EQ(reinterpreted_float_ptr[0], 42.0f) + << "Changes through original tensor should be visible through reinterpreted tensor"; +} + +// Test that deleting source tensor doesn't affect reinterpreted tensor (they +// share memory) +TEST_F(AOTITorchReinterpretTensorTest, DeletionBehavior) { + std::vector source_sizes = {6}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + void* shared_data_ptr = source_tensor->mutable_data_ptr(); + + // Reinterpret as [2, 3] + std::vector new_sizes = {2, 3}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Verify they share the same memory + EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), shared_data_ptr); + + // Delete the source tensor (which owns the memory) + error = aoti_torch_delete_tensor_object(source_tensor); + EXPECT_EQ(error, Error::Ok); + + // The reinterpreted tensor should still be valid but the memory might be + // freed Since the source tensor owned the memory, the reinterpreted tensor + // becomes invalid This is expected behavior - the user needs to manage the + // lifecycle properly + + // Clean up the reinterpreted tensor + error = aoti_torch_delete_tensor_object(reinterpreted_tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test scalar tensor reinterpretation +TEST_F(AOTITorchReinterpretTensorTest, ReinterpretScalarTensor) { + // Create a scalar tensor (0D) + std::vector source_sizes = {}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + void* original_data_ptr = 
source_tensor->mutable_data_ptr(); + + // Try to reinterpret scalar as [1] (1D with 1 element) + std::vector new_sizes = {1}; + std::vector new_strides = {1}; + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check that the reinterpreted tensor uses the SAME memory + EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), original_data_ptr); + + // Check new shape + EXPECT_EQ(reinterpreted_tensor->dim(), 1); + EXPECT_EQ(reinterpreted_tensor->size(0), 1); +} + +// Test reinterpreting tensor with zero-sized dimension +// TODO: This test is disabled because zero-sized tensors have complex stride +// validation requirements that need further investigation +TEST_F(AOTITorchReinterpretTensorTest, DISABLED_ReinterpretZeroSizedTensor) { + // Create a tensor with shape [0, 5] (zero elements) + std::vector source_sizes = {0, 5}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + void* original_data_ptr = source_tensor->mutable_data_ptr(); + + // Reinterpret as [5, 0] (still zero elements) + std::vector new_sizes = {5, 0}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check that the reinterpreted tensor uses the SAME memory + EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), original_data_ptr); + + // Check new shape + EXPECT_EQ(reinterpreted_tensor->dim(), 2); + EXPECT_EQ(reinterpreted_tensor->size(0), 5); + EXPECT_EQ(reinterpreted_tensor->size(1), 0); +} + +// Test with nullptr strides 
(should use contiguous strides) +TEST_F(AOTITorchReinterpretTensorTest, NullStridesPointer) { + std::vector source_sizes = {12}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + void* original_data_ptr = source_tensor->mutable_data_ptr(); + + // Reinterpret as [3, 4] with null strides (should calculate contiguous + // strides) + std::vector new_sizes = {3, 4}; + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + nullptr, // null strides - should calculate contiguous strides + 0, + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check that the reinterpreted tensor uses the SAME memory + EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), original_data_ptr); + + // Check that contiguous strides were calculated correctly + int64_t* tensor_strides; + error = aoti_torch_get_strides(reinterpreted_tensor, &tensor_strides); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(tensor_strides[0], 4); // stride for dimension 0 should be 4 + EXPECT_EQ(tensor_strides[1], 1); // stride for dimension 1 should be 1 +} + +// Test bf16 tensor reinterpretation +TEST_F(AOTITorchReinterpretTensorTest, ReinterpretBF16Tensor) { + // Create a bf16 source tensor with shape [6] + std::vector source_sizes = {6}; + Tensor* source_tensor = create_source_tensor( + source_sizes, + static_cast( + SupportedDTypes::BFLOAT16), // bf16 dtype from SupportedDTypes + static_cast( + SupportedDevices::CUDA), // CUDA device from SupportedDevices + 0); // device_index must be 0 + ASSERT_NE(source_tensor, nullptr); + + void* original_data_ptr = source_tensor->mutable_data_ptr(); + ASSERT_NE(original_data_ptr, nullptr); + + // Verify the tensor is actually bf16 + int32_t actual_dtype = 0; + AOTITorchError dtype_check_error = + aoti_torch_get_dtype(source_tensor, &actual_dtype); + EXPECT_EQ(dtype_check_error, 
Error::Ok); + EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Source tensor should have bfloat16 dtype"; + + // Reinterpret as [2, 3] (same number of elements) + std::vector new_sizes = {2, 3}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check that the reinterpreted tensor has the new shape + EXPECT_EQ(reinterpreted_tensor->dim(), 2); + EXPECT_EQ(reinterpreted_tensor->size(0), 2); + EXPECT_EQ(reinterpreted_tensor->size(1), 3); + + // Verify the dtype is preserved as bf16 + int32_t reinterpreted_dtype = 0; + dtype_check_error = + aoti_torch_get_dtype(reinterpreted_tensor, &reinterpreted_dtype); + EXPECT_EQ(dtype_check_error, Error::Ok); + EXPECT_EQ( + reinterpreted_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Reinterpreted tensor should preserve bfloat16 dtype"; + + // CRITICAL: Check that the reinterpreted tensor uses the SAME memory + void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); + EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) + << "Reinterpreted tensor should use the same memory as the source tensor"; + + // Test memory sharing by writing data through the original tensor + // and verifying it's visible through the reinterpreted tensor + // Note: bf16 has 2 bytes per element + std::vector test_data_bf16 = { + 0x3F80, 0x4000, 0x4040, 0x4080, 0x40A0, 0x40C0}; // bf16 values + cudaError_t cuda_err = cudaMemcpy( + original_data_ptr, + test_data_bf16.data(), + test_data_bf16.size() * sizeof(uint16_t), + cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess); + + // Read back through the reinterpreted tensor + std::vector readback_data_bf16(6); + cuda_err = cudaMemcpy( + 
readback_data_bf16.data(), + reinterpreted_data_ptr, + readback_data_bf16.size() * sizeof(uint16_t), + cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + + // Verify the data matches + for (size_t i = 0; i < test_data_bf16.size(); i++) { + EXPECT_EQ(readback_data_bf16[i], test_data_bf16[i]) + << "BF16 data should be the same through both tensors at index " << i; + } +} + +// Test reference counting behavior - memory not in map should fail +TEST_F(AOTITorchReinterpretTensorTest, MemoryNotInMapShouldFail) { + // Create a tensor directly without using our allocation functions + // This should NOT be in the reference counting map + void* external_memory; + ASSERT_EQ( + cudaMallocManaged(&external_memory, 12 * sizeof(float)), cudaSuccess); + + // Create a tensor by manually wrapping this memory without going through our + // APIs + std::vector sizes = {12}; + std::vector strides = calculate_contiguous_strides(sizes); + + // Create the tensor directly using ExecutorTorch extension + auto tensor_shared = executorch::extension::from_blob( + external_memory, + convert_sizes_to_vector(sizes.size(), sizes.data()), + convert_strides_to_vector(sizes.size(), sizes.data(), strides.data()), + executorch::runtime::etensor::ScalarType::Float); + + ASSERT_TRUE(tensor_shared); + Tensor* external_tensor = tensor_shared.get(); + + // Try to reinterpret this tensor - should fail because memory is not in map + std::vector new_sizes = {3, 4}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + external_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + // Should fail because memory is not being tracked by reference counting + // system + EXPECT_EQ(error, Error::InvalidArgument); + + // Clean up the external memory + ASSERT_EQ(cudaFree(external_memory), cudaSuccess); +} + +// Test reference counting 
behavior - creating view increments reference count +TEST_F(AOTITorchReinterpretTensorTest, ViewCreationIncrementsReferenceCount) { + // Create a source tensor that owns memory (reference count = 1) + std::vector source_sizes = {12}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + void* shared_data_ptr = source_tensor->mutable_data_ptr(); + ASSERT_NE(shared_data_ptr, nullptr); + + // Create first view - should increment reference count to 2 + std::vector view1_sizes = {3, 4}; + std::vector view1_strides = + calculate_contiguous_strides(view1_sizes); + + Tensor* view1_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + view1_sizes.size(), + view1_sizes.data(), + view1_strides.data(), + 0, + &view1_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(view1_tensor, nullptr); + EXPECT_EQ(view1_tensor->mutable_data_ptr(), shared_data_ptr); + + // Create second view - should increment reference count to 3 + std::vector view2_sizes = {2, 6}; + std::vector view2_strides = + calculate_contiguous_strides(view2_sizes); + + Tensor* view2_tensor; + error = aoti_torch__reinterpret_tensor( + source_tensor, + view2_sizes.size(), + view2_sizes.data(), + view2_strides.data(), + 0, + &view2_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(view2_tensor, nullptr); + EXPECT_EQ(view2_tensor->mutable_data_ptr(), shared_data_ptr); + + // Now delete the source tensor - memory should NOT be freed (reference count + // = 2) + error = aoti_torch_delete_tensor_object(source_tensor); + EXPECT_EQ(error, Error::Ok); + + // Both views should still be valid - test by accessing memory + float test_value = 42.0f; + cudaError_t cuda_err = cudaMemcpy( + shared_data_ptr, &test_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess); + + float readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, + view1_tensor->mutable_data_ptr(), + sizeof(float), + cudaMemcpyDeviceToHost); 
+ EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_EQ(readback_value, test_value); + + // Delete first view - memory should still NOT be freed (reference count = 1) + error = aoti_torch_delete_tensor_object(view1_tensor); + EXPECT_EQ(error, Error::Ok); + + // Second view should still be valid + readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, + view2_tensor->mutable_data_ptr(), + sizeof(float), + cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_EQ(readback_value, test_value); + + // Delete second view - NOW memory should be freed (reference count = 0) + error = aoti_torch_delete_tensor_object(view2_tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test reference counting behavior with NOT_OWN memory (from blob) - should +// SUCCEED and keep NOT_OWN +TEST_F(AOTITorchReinterpretTensorTest, ViewOfNotOwnMemoryKeepsNotOwnStatus) { + // Allocate external memory + void* external_memory; + cudaError_t cuda_err = + cudaMallocManaged(&external_memory, 12 * sizeof(float)); + ASSERT_EQ(cuda_err, cudaSuccess); + + // Create tensor from blob (which marks memory as NOT_OWN) + std::vector blob_sizes = {12}; + std::vector blob_strides = calculate_contiguous_strides(blob_sizes); + + Tensor* blob_tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + external_memory, + blob_sizes.size(), + blob_sizes.data(), + blob_strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device_index + &blob_tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(blob_tensor, nullptr); + + // Create view of NOT_OWN memory - should SUCCEED and keep NOT_OWN status + std::vector view_sizes = {3, 4}; + std::vector view_strides = calculate_contiguous_strides(view_sizes); + + Tensor* view_tensor; + error = aoti_torch__reinterpret_tensor( + blob_tensor, + view_sizes.size(), + view_sizes.data(), + view_strides.data(), 
+ 0, + &view_tensor); + + // Should succeed - NOT_OWN memory can be reinterpreted but stays NOT_OWN + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->mutable_data_ptr(), external_memory); + + // Verify both tensors share the same memory + EXPECT_EQ(blob_tensor->mutable_data_ptr(), view_tensor->mutable_data_ptr()); + + // Test memory sharing by writing data through one tensor and reading through + // the other + float test_value = 42.0f; + cuda_err = cudaMemcpy( + external_memory, &test_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess); + + float readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, + view_tensor->mutable_data_ptr(), + sizeof(float), + cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_EQ(readback_value, test_value); + + // Delete the blob tensor - external memory should NOT be freed (NOT_OWN + // behavior) + error = aoti_torch_delete_tensor_object(blob_tensor); + EXPECT_EQ(error, Error::Ok); + + // View tensor should still be valid - test by accessing memory + readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, + view_tensor->mutable_data_ptr(), + sizeof(float), + cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_EQ(readback_value, test_value); + + // Delete view tensor - external memory should still NOT be freed (NOT_OWN + // behavior) + error = aoti_torch_delete_tensor_object(view_tensor); + EXPECT_EQ(error, Error::Ok); + + // External memory should still be accessible (proves neither tensor freed it) + readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, external_memory, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_EQ(readback_value, test_value); + + // Clean up external memory manually (as expected for NOT_OWN memory) + ASSERT_EQ(cudaFree(external_memory), cudaSuccess); +} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp 
b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp new file mode 100644 index 00000000000..9fca0f92cf8 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp @@ -0,0 +1,398 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::cuda; +using namespace executorch::backends::aoti; +using namespace executorch::runtime; + +// Test fixture for aoti_torch_copy_ tests +class AOTITorchCopyTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize ExecuTorch Platform Abstraction Layer + et_pal_init(); + + // Check if CUDA is available + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + + // Clear any remaining tensors from previous tests + clear_all_tensors(); + } + + void TearDown() override { + // Clean up metadata + cleanup_tensor_metadata(); + + // Clear the global tensor storage using the provided function + clear_all_tensors(); + } + + // Helper to create test tensors with specific data + Tensor* create_test_tensor_with_data( + const std::vector& sizes, + const std::vector& data, + const std::vector& strides = {}, + int32_t dtype = static_cast(SupportedDTypes::FLOAT32), + int32_t device_type = static_cast(SupportedDevices::CUDA), + int32_t device_index = 0) { + Tensor* tensor; + + const int64_t* strides_ptr = strides.empty() ? 
nullptr : strides.data(); + + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides_ptr, + dtype, + device_type, + device_index, + &tensor); + + if (error != Error::Ok || tensor == nullptr) { + return nullptr; + } + + // Fill tensor with data + size_t total_bytes = data.size() * sizeof(float); + if (device_type == static_cast(SupportedDevices::CUDA)) { + cudaError_t memcpy_err = cudaMemcpy( + tensor->mutable_data_ptr(), + data.data(), + total_bytes, + cudaMemcpyHostToDevice); + // Note: Error is checked but we don't fail the function + // This allows tests to proceed and handle errors as needed + (void)memcpy_err; // Suppress unused variable warning + } else { // CPU + std::memcpy(tensor->mutable_data_ptr(), data.data(), total_bytes); + } + + return tensor; + } + + // Helper to get data from tensor + std::vector get_tensor_data(Tensor* tensor) { + if (!tensor) { + return {}; + } + + size_t num_elements = tensor->numel(); + std::vector data(num_elements); + + // Determine if this is a CUDA tensor + cudaPointerAttributes attributes{}; + cudaError_t err = cudaPointerGetAttributes(&attributes, tensor->data_ptr()); + bool is_device = + (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice); + + if (is_device) { + cudaError_t memcpy_err = cudaMemcpy( + data.data(), + tensor->data_ptr(), + num_elements * sizeof(float), + cudaMemcpyDeviceToHost); + // Note: Error is checked but we don't fail the function + // This allows tests to proceed and handle errors as needed + (void)memcpy_err; // Suppress unused variable warning + } else { + std::memcpy( + data.data(), tensor->data_ptr(), num_elements * sizeof(float)); + } + + return data; + } + + // Helper to verify two tensors have same data + bool tensors_equal(Tensor* a, Tensor* b, float tolerance = 1e-6f) { + if (!a || !b) { + return false; + } + if (a->numel() != b->numel()) { + return false; + } + + auto data_a = get_tensor_data(a); + auto data_b = get_tensor_data(b); + + for 
(size_t i = 0; i < data_a.size(); ++i) { + if (std::abs(data_a[i] - data_b[i]) > tolerance) { + return false; + } + } + return true; + } +}; + +// Test basic copy functionality - same schema (fast path) +TEST_F(AOTITorchCopyTest, BasicCopySameSchema) { + // Create source tensor with test data + std::vector sizes = {2, 3}; + std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + Tensor* src = create_test_tensor_with_data(sizes, src_data); + EXPECT_NE(src, nullptr); + + // Create destination tensor with same schema + Tensor* dst = + create_test_tensor_with_data(sizes, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + EXPECT_NE(dst, nullptr); + + // Perform copy + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); + + // Verify copy was successful + EXPECT_TRUE(tensors_equal(dst, src)); +} + +// Test copy with different strides (pointwise fallback) +TEST_F(AOTITorchCopyTest, CopyDifferentStrides) { + // Create source tensor (2x3) with contiguous layout + std::vector src_sizes = {2, 3}; + std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + Tensor* src = create_test_tensor_with_data(src_sizes, src_data); + EXPECT_NE(src, nullptr); + + // Create destination tensor with transposed strides + std::vector dst_strides = {1, 2}; // Column-major layout + Tensor* dst = create_test_tensor_with_data( + src_sizes, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, dst_strides); + EXPECT_NE(dst, nullptr); + + // Perform copy - this should use pointwise fallback + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); + + // Verify the copy worked correctly by checking specific elements + auto dst_data = get_tensor_data(dst); + auto src_data_check = get_tensor_data(src); + + // For transposed layout, the data should be rearranged + EXPECT_EQ(dst_data.size(), 6); + EXPECT_EQ(src_data_check.size(), 6); +} + +// Test copy between CPU and CUDA tensors +TEST_F(AOTITorchCopyTest, CopyCPUToCUDA) { + std::vector sizes = {2, 2}; 
+ std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; + + // Create CPU tensor + Tensor* cpu_tensor = create_test_tensor_with_data( + sizes, + data, + {}, + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CPU)); // CPU + EXPECT_NE(cpu_tensor, nullptr); + + // Create CUDA tensor + Tensor* cuda_tensor = create_test_tensor_with_data( + sizes, + {0.0f, 0.0f, 0.0f, 0.0f}, + {}, + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA)); // CUDA + EXPECT_NE(cuda_tensor, nullptr); + + // Copy from CPU to CUDA + AOTITorchError error = aoti_torch_copy_(cuda_tensor, cpu_tensor, 0); + EXPECT_EQ(error, Error::Ok); + + // Verify copy + EXPECT_TRUE(tensors_equal(cuda_tensor, cpu_tensor)); +} + +// Test copy between CUDA and CPU tensors +TEST_F(AOTITorchCopyTest, CopyCUDAToCPU) { + std::vector sizes = {2, 2}; + std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; + + // Create CUDA tensor + Tensor* cuda_tensor = create_test_tensor_with_data( + sizes, + data, + {}, + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA)); // CUDA + EXPECT_NE(cuda_tensor, nullptr); + + // Create CPU tensor + Tensor* cpu_tensor = create_test_tensor_with_data( + sizes, + {0.0f, 0.0f, 0.0f, 0.0f}, + {}, + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CPU)); // CPU + EXPECT_NE(cpu_tensor, nullptr); + + // Copy from CUDA to CPU + AOTITorchError error = aoti_torch_copy_(cpu_tensor, cuda_tensor, 0); + EXPECT_EQ(error, Error::Ok); + + // Verify copy + EXPECT_TRUE(tensors_equal(cpu_tensor, cuda_tensor)); +} + +// Test copy with bf16 dtype support +TEST_F(AOTITorchCopyTest, CopyBf16Tensors) { + // Test that bf16 tensors can be created and copied + std::vector sizes = {2, 3}; + std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + // Note: We create float32 data but the tensor will be created with bf16 dtype + // This simulates creating bf16 tensors + Tensor* src = create_test_tensor_with_data( + sizes, + src_data, + {}, 
// default strides + static_cast(SupportedDTypes::BFLOAT16), // bf16 dtype + static_cast(SupportedDevices::CUDA), // CUDA device + 0 // device_index = 0 + ); + EXPECT_NE(src, nullptr); + + // Create destination tensor with bf16 dtype + std::vector dst_init(6, 0.0f); + Tensor* dst = create_test_tensor_with_data( + sizes, + dst_init, + {}, // default strides + static_cast(SupportedDTypes::BFLOAT16), // bf16 dtype + static_cast(SupportedDevices::CUDA), // CUDA device + 0 // device_index = 0 + ); + EXPECT_NE(dst, nullptr); + + // Perform copy between bf16 tensors + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); + + // Verify that both tensors have the expected dtype + int32_t src_dtype, dst_dtype; + aoti_torch_get_dtype(src, &src_dtype); + aoti_torch_get_dtype(dst, &dst_dtype); + + EXPECT_EQ(src_dtype, static_cast(SupportedDTypes::BFLOAT16)); + EXPECT_EQ(dst_dtype, static_cast(SupportedDTypes::BFLOAT16)); + + // Verify copy was successful by checking numel matches + EXPECT_EQ(src->numel(), dst->numel()); + EXPECT_EQ(src->numel(), 6); +} + +// Test copy between different dtypes should fail +TEST_F(AOTITorchCopyTest, CopyDTypeMismatchError) { + std::vector sizes = {2, 2}; + std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; + + // Create float32 tensor + Tensor* float32_tensor = create_test_tensor_with_data( + sizes, + data, + {}, // default strides + static_cast(SupportedDTypes::FLOAT32), // float32 dtype + static_cast(SupportedDevices::CUDA), // CUDA device + 0 // device_index = 0 + ); + EXPECT_NE(float32_tensor, nullptr); + + // Create bf16 tensor + Tensor* bf16_tensor = create_test_tensor_with_data( + sizes, + {0.0f, 0.0f, 0.0f, 0.0f}, + {}, // default strides + static_cast(SupportedDTypes::BFLOAT16), // bf16 dtype + static_cast(SupportedDevices::CUDA), // CUDA device + 0 // device_index = 0 + ); + EXPECT_NE(bf16_tensor, nullptr); + + // Attempting to copy between different dtypes should fail + AOTITorchError error = 
aoti_torch_copy_(bf16_tensor, float32_tensor, 0); + EXPECT_EQ(error, Error::InvalidArgument); + + // Reverse direction should also fail + error = aoti_torch_copy_(float32_tensor, bf16_tensor, 0); + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test error conditions +TEST_F(AOTITorchCopyTest, ErrorHandling) { + std::vector sizes = {2, 3}; + std::vector data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + Tensor* valid_tensor = create_test_tensor_with_data(sizes, data); + EXPECT_NE(valid_tensor, nullptr); + + // Test null pointers + AOTITorchError error = aoti_torch_copy_(nullptr, valid_tensor, 0); + EXPECT_NE(error, Error::Ok); + + error = aoti_torch_copy_(valid_tensor, nullptr, 0); + EXPECT_NE(error, Error::Ok); + + // Test numel mismatch (different total number of elements) + std::vector different_numel_sizes = { + 2, 3, 4}; // 24 elements vs 6 elements + std::vector different_data(24, 1.0f); + Tensor* different_numel = + create_test_tensor_with_data(different_numel_sizes, different_data); + EXPECT_NE(different_numel, nullptr); + + error = aoti_torch_copy_(valid_tensor, different_numel, 0); + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test copy from 1D to 3D with same total elements +TEST_F(AOTITorchCopyTest, Copy1DTo3DSameNumel) { + // Source tensor: 8 elements in 1D + std::vector src_sizes = {8}; + std::vector src_data = { + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + + Tensor* src = create_test_tensor_with_data(src_sizes, src_data); + EXPECT_NE(src, nullptr); + + // Destination tensor: 2x2x2 = 8 elements (different shape, same total) + std::vector dst_sizes = {2, 2, 2}; + std::vector dst_init(8, 0.0f); + Tensor* dst = create_test_tensor_with_data(dst_sizes, dst_init); + EXPECT_NE(dst, nullptr); + + // This should work - same total number of elements + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); + + // Verify the data was copied correctly + auto dst_data = get_tensor_data(dst); + EXPECT_EQ(dst_data.size(), 
8); + + // Check some specific elements to verify correct copying + EXPECT_FLOAT_EQ(dst_data[0], 1.0f); + EXPECT_FLOAT_EQ(dst_data[7], 8.0f); +} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp new file mode 100644 index 00000000000..d9b785a5a78 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp @@ -0,0 +1,754 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::aoti; +using namespace executorch::backends::cuda; +using namespace executorch::runtime; +using executorch::runtime::etensor::Tensor; + +// Test fixture for aoti_torch_create_tensor_from_blob_v2 tests +class AOTITorchCreateTensorFromBlobV2Test : public ::testing::Test { + protected: + void SetUp() override { + // Initialize ExecuTorch Platform Abstraction Layer + et_pal_init(); + + // Check if CUDA is available + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + + // Clear any remaining tensors from previous tests + clear_all_tensors(); + } + + void TearDown() override { + // Clean up metadata + cleanup_tensor_metadata(); + + // Clear the global tensor storage using the provided function + clear_all_tensors(); + + // Clean up any allocated memory buffers + for (void* ptr : cuda_memory_buffers_) { + if (ptr) { + cudaError_t cuda_err = cudaFree(ptr); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Failed to 
free CUDA memory: " << cudaGetErrorString(cuda_err); + } + } + cuda_memory_buffers_.clear(); + + for (void* ptr : cpu_memory_buffers_) { + if (ptr) { + free(ptr); + } + } + cpu_memory_buffers_.clear(); + } + + // Helper to allocate CUDA memory and track it for cleanup + void* allocate_cuda_memory(size_t bytes) { + void* ptr; + cudaError_t err = cudaMallocManaged(&ptr, bytes); + if (err == cudaSuccess) { + cuda_memory_buffers_.push_back(ptr); + return ptr; + } + return nullptr; + } + + // Helper to allocate CPU memory and track it for cleanup + void* allocate_cpu_memory(size_t bytes) { + void* ptr; + int result = posix_memalign(&ptr, 16, bytes); // 16-byte aligned + if (result == 0 && ptr != nullptr) { + cpu_memory_buffers_.push_back(ptr); + return ptr; + } + return nullptr; + } + + // Helper to calculate number of elements from sizes + int64_t calculate_numel(const std::vector& sizes) { + int64_t numel = 1; + for (int64_t size : sizes) { + numel *= size; + } + return numel; + } + + // Helper to calculate contiguous strides from sizes + std::vector calculate_contiguous_strides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + + strides[sizes.size() - 1] = 1; + // Use int64_t and check for underflow to avoid unsigned integer wraparound + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + return strides; + } + + private: + std::vector cuda_memory_buffers_; + std::vector cpu_memory_buffers_; +}; + +// Test basic functionality with CUDA memory +TEST_F(AOTITorchCreateTensorFromBlobV2Test, BasicFunctionalityCUDA) { + // Test 1D tensor + std::vector sizes_1d = {5}; + std::vector strides_1d = calculate_contiguous_strides(sizes_1d); + + // Allocate CUDA memory + size_t bytes = calculate_numel(sizes_1d) * sizeof(float); + void* cuda_data = allocate_cuda_memory(bytes); + ASSERT_NE(cuda_data, nullptr); + + Tensor* tensor_1d; + AOTITorchError error = 
aoti_torch_create_tensor_from_blob_v2( + cuda_data, + sizes_1d.size(), + sizes_1d.data(), + strides_1d.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_1d, + 0, // layout (strided) + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_1d, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor_1d->dim(), 1); + EXPECT_EQ(tensor_1d->size(0), 5); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor_1d->mutable_data_ptr(); + EXPECT_EQ(tensor_data, cuda_data); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor_1d); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) For CUDA memory, check that we can still access it (synchronously) + // after tensor deletion + float pattern_value = 42.0f; + cudaError_t cuda_err = cudaMemcpy( + cuda_data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to write to original CUDA memory after tensor deletion"; + + float readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, cuda_data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to read from original CUDA memory after tensor deletion"; + EXPECT_EQ(readback_value, pattern_value) + << "Original CUDA memory should still contain our test pattern"; +} + +// Test basic functionality with CPU memory +TEST_F(AOTITorchCreateTensorFromBlobV2Test, BasicFunctionalityCPU) { + // Test 2D tensor + std::vector sizes_2d = {3, 4}; + std::vector strides_2d = calculate_contiguous_strides(sizes_2d); + + // Allocate CPU memory + size_t bytes = calculate_numel(sizes_2d) * sizeof(float); + void* cpu_data = allocate_cpu_memory(bytes); + ASSERT_NE(cpu_data, nullptr); + + 
Tensor* tensor_2d; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + cpu_data, + sizes_2d.size(), + sizes_2d.data(), + strides_2d.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CPU), + 0, // device index + &tensor_2d, + 0, // layout (strided) + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_2d, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor_2d->dim(), 2); + EXPECT_EQ(tensor_2d->size(0), 3); + EXPECT_EQ(tensor_2d->size(1), 4); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor_2d->mutable_data_ptr(); + EXPECT_EQ(tensor_data, cpu_data); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor_2d); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) For CPU memory, directly write and read to verify accessibility + float* float_ptr = reinterpret_cast(cpu_data); + float pattern_value = 42.0f; + *float_ptr = pattern_value; + EXPECT_EQ(*float_ptr, pattern_value) + << "Original CPU memory should still be accessible after tensor deletion"; +} + +// Test with invalid dtype +TEST_F(AOTITorchCreateTensorFromBlobV2Test, InvalidDtype) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + 999, // invalid dtype + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test with non-zero storage offset 
(should fail since from_blob cannot handle +// offsets) +TEST_F(AOTITorchCreateTensorFromBlobV2Test, NonZeroStorageOffset) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 1, // non-zero storage_offset (should fail since from_blob cannot handle + // offsets) + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test with custom strides (using stride parameter but still contiguous) +TEST_F(AOTITorchCreateTensorFromBlobV2Test, CustomContiguousStrides) { + std::vector sizes = {2, 3}; + // Use the correct contiguous strides but pass them explicitly + std::vector contiguous_strides = {3, 1}; // Proper contiguous strides + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + contiguous_strides.data(), // Explicitly pass contiguous strides + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor->mutable_data_ptr(); + EXPECT_EQ(tensor_data, data); + + // 
Verify strides were properly set (we can check via aoti_torch_get_strides) + int64_t* tensor_strides; + error = aoti_torch_get_strides(tensor, &tensor_strides); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(tensor_strides[0], 3); + EXPECT_EQ(tensor_strides[1], 1); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) + float pattern_value = 42.0f; + cudaError_t cuda_err = + cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to write to original CUDA memory after tensor deletion"; + + float readback_value = 0.0f; + cuda_err = + cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to read from original CUDA memory after tensor deletion"; + EXPECT_EQ(readback_value, pattern_value) + << "Original CUDA memory should still contain our test pattern"; +} + +// Test with null data pointer +TEST_F(AOTITorchCreateTensorFromBlobV2Test, NullDataPointer) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + nullptr, // null data pointer + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test scalar tensor (0D) +TEST_F(AOTITorchCreateTensorFromBlobV2Test, ScalarTensor) { + std::vector sizes = {}; // 0D tensor + std::vector strides = {}; // Empty strides for scalar + + size_t bytes = sizeof(float); // Single element + void* data = allocate_cuda_memory(bytes); + 
ASSERT_NE(data, nullptr); + + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 0); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor->mutable_data_ptr(); + EXPECT_EQ(tensor_data, data); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) + float pattern_value = 42.0f; + cudaError_t cuda_err = + cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to write to original CUDA memory after tensor deletion"; + + float readback_value = 0.0f; + cuda_err = + cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to read from original CUDA memory after tensor deletion"; + EXPECT_EQ(readback_value, pattern_value) + << "Original CUDA memory should still contain our test pattern"; +} + +// Test zero-sized tensor +TEST_F(AOTITorchCreateTensorFromBlobV2Test, ZeroSizedTensor) { + std::vector sizes = {0, 5}; // Zero elements + std::vector strides = calculate_contiguous_strides(sizes); + + // Even for zero-sized tensor, we need some memory allocated + size_t bytes = sizeof(float); // Minimum allocation + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + 
strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 0); + EXPECT_EQ(tensor->size(1), 5); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor->mutable_data_ptr(); + EXPECT_EQ(tensor_data, data); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) + float pattern_value = 42.0f; + cudaError_t cuda_err = + cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to write to original CUDA memory after tensor deletion"; + + float readback_value = 0.0f; + cuda_err = + cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to read from original CUDA memory after tensor deletion"; + EXPECT_EQ(readback_value, pattern_value) + << "Original CUDA memory should still contain our test pattern"; +} + +// Test multi-dimensional tensors +TEST_F(AOTITorchCreateTensorFromBlobV2Test, MultiDimensionalTensors) { + // Test 3D tensor + std::vector sizes_3d = {2, 3, 4}; + std::vector strides_3d = calculate_contiguous_strides(sizes_3d); + + size_t bytes_3d = calculate_numel(sizes_3d) * sizeof(float); + void* data_3d = allocate_cuda_memory(bytes_3d); + ASSERT_NE(data_3d, nullptr); + + Tensor* tensor_3d; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data_3d, + sizes_3d.size(), + sizes_3d.data(), + strides_3d.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + 
static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_3d, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_3d, nullptr); + EXPECT_EQ(tensor_3d->dim(), 3); + EXPECT_EQ(tensor_3d->size(0), 2); + EXPECT_EQ(tensor_3d->size(1), 3); + EXPECT_EQ(tensor_3d->size(2), 4); + + // Test 4D tensor + std::vector sizes_4d = {2, 3, 4, 5}; + std::vector strides_4d = calculate_contiguous_strides(sizes_4d); + + size_t bytes_4d = calculate_numel(sizes_4d) * sizeof(float); + void* data_4d = allocate_cuda_memory(bytes_4d); + ASSERT_NE(data_4d, nullptr); + + Tensor* tensor_4d; + error = aoti_torch_create_tensor_from_blob_v2( + data_4d, + sizes_4d.size(), + sizes_4d.data(), + strides_4d.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_4d, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_4d, nullptr); + EXPECT_EQ(tensor_4d->dim(), 4); + EXPECT_EQ(tensor_4d->size(0), 2); + EXPECT_EQ(tensor_4d->size(1), 3); + EXPECT_EQ(tensor_4d->size(2), 4); + EXPECT_EQ(tensor_4d->size(3), 5); +} + +// Test tensor data pointer consistency +TEST_F(AOTITorchCreateTensorFromBlobV2Test, DataPointerConsistency) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* original_data = allocate_cuda_memory(bytes); + ASSERT_NE(original_data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + original_data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, 
nullptr); + + // Check that the tensor uses the same data pointer + void* tensor_data = tensor->mutable_data_ptr(); + EXPECT_EQ(tensor_data, original_data); +} + +// Test creating multiple tensors from different blobs +TEST_F(AOTITorchCreateTensorFromBlobV2Test, MultipleTensorsFromBlobs) { + const int num_tensors = 5; + std::vector tensors; + std::vector data_ptrs; + + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {i + 1, i + 2}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + data_ptrs.push_back(data); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + tensors.push_back(tensor); + + // Verify dimensions + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), i + 1); + EXPECT_EQ(tensor->size(1), i + 2); + + // Verify the tensor uses the correct data pointer + EXPECT_EQ(tensor->mutable_data_ptr(), data); + } + + // Verify all tensors have different data pointers + for (int i = 0; i < num_tensors; i++) { + EXPECT_EQ(tensors[i]->mutable_data_ptr(), data_ptrs[i]); + for (int j = i + 1; j < num_tensors; j++) { + EXPECT_NE(tensors[i]->mutable_data_ptr(), tensors[j]->mutable_data_ptr()); + } + } +} + +// Test deletion of tensor created from blob (should not free the original +// memory) +TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeletionDoesNotFreeOriginalMemory) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = 
allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // The original memory should still be valid (we'll free it in teardown) + // We can't easily test if the memory is still valid without risking crashes, + // but the test should pass without issues if memory management is correct +} + +// Test with opaque metadata +TEST_F(AOTITorchCreateTensorFromBlobV2Test, WithOpaqueMetadata) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + // Create some opaque metadata + std::vector metadata = {0x01, 0x02, 0x03, 0x04}; + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + metadata.data(), // opaque_metadata + metadata.size()); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); +} + +// Test stress test with many small tensors from blobs +TEST_F(AOTITorchCreateTensorFromBlobV2Test, StressTestManySmallTensors) { + const int num_tensors = 50; // 
Reduced for reasonable test time + std::vector tensors; + + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {1, 1}; // Minimal size + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + if (data == nullptr) { + // Skip if we run out of memory + continue; + } + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + if (error == Error::Ok && tensor != nullptr) { + tensors.push_back(tensor); + + // Verify the tensor uses the correct data pointer + EXPECT_EQ(tensor->mutable_data_ptr(), data); + } + } + + // Delete all created tensors + for (Tensor* tensor : tensors) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + } +} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_guard.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_guard.cpp new file mode 100644 index 00000000000..7527965cdb8 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_guard.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::aoti; +using namespace executorch::backends::cuda; +using namespace executorch::runtime; + +// TODO(gasoonjia): Multiple device tests were not included due to test +// environment limitations. Will be added in the future. 
+class AOTITorchCUDAGuardTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); + + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess); + } + + void TearDown() override { + if (cudaGetDeviceCount(&original_device_) == cudaSuccess) { + ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess); + } + } + + int original_device_ = 0; +}; + +TEST_F(AOTITorchCUDAGuardTest, CreateAndDeleteCUDAGuard) { + CUDAGuardHandle guard = nullptr; + AOTITorchError error = aoti_torch_create_cuda_guard(0, &guard); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(guard, nullptr); + + int current_device = -1; + ASSERT_EQ(cudaGetDevice(¤t_device), cudaSuccess); + EXPECT_EQ(current_device, 0); + + error = aoti_torch_delete_cuda_guard(guard); + EXPECT_EQ(error, Error::Ok); +} + +TEST_F(AOTITorchCUDAGuardTest, CreateCUDAGuardNullReturnPointer) { + AOTITorchError error = aoti_torch_create_cuda_guard(0, nullptr); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCUDAGuardTest, DeleteCUDAGuardNullHandle) { + AOTITorchError error = aoti_torch_delete_cuda_guard(nullptr); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCUDAGuardTest, CUDAGuardSetIndexNullHandle) { + AOTITorchError error = aoti_torch_cuda_guard_set_index(nullptr, 0); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCUDAGuardTest, CUDAGuardSetIndexInvalidDevice) { + CUDAGuardHandle guard = nullptr; + AOTITorchError error = aoti_torch_create_cuda_guard(0, &guard); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(guard, nullptr); + + error = aoti_torch_cuda_guard_set_index(guard, 999); + EXPECT_NE(error, Error::Ok); + + error = aoti_torch_delete_cuda_guard(guard); + EXPECT_EQ(error, Error::Ok); +} + +TEST_F(AOTITorchCUDAGuardTest, CreateAndDeleteCUDAStreamGuard) { + 
cudaStream_t stream; + ASSERT_EQ(cudaStreamCreate(&stream), cudaSuccess); + + CUDAStreamGuardHandle guard = nullptr; + AOTITorchError error = aoti_torch_create_cuda_stream_guard(stream, 0, &guard); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(guard, nullptr); + + error = aoti_torch_delete_cuda_stream_guard(guard); + EXPECT_EQ(error, Error::Ok); + + ASSERT_EQ(cudaStreamDestroy(stream), cudaSuccess); +} + +TEST_F(AOTITorchCUDAGuardTest, CreateCUDAStreamGuardNullReturnPointer) { + cudaStream_t stream; + ASSERT_EQ(cudaStreamCreate(&stream), cudaSuccess); + + AOTITorchError error = + aoti_torch_create_cuda_stream_guard(stream, 0, nullptr); + EXPECT_EQ(error, Error::InvalidArgument); + + ASSERT_EQ(cudaStreamDestroy(stream), cudaSuccess); +} + +TEST_F(AOTITorchCUDAGuardTest, CreateCUDAStreamGuardNullStream) { + CUDAStreamGuardHandle guard = nullptr; + AOTITorchError error = + aoti_torch_create_cuda_stream_guard(nullptr, 0, &guard); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCUDAGuardTest, DeleteCUDAStreamGuardNullHandle) { + AOTITorchError error = aoti_torch_delete_cuda_stream_guard(nullptr); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCUDAGuardTest, GetCurrentCUDAStream) { + void* ret_stream = nullptr; + AOTITorchError error = aoti_torch_get_current_cuda_stream(0, &ret_stream); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(ret_stream, nullptr); +} + +TEST_F(AOTITorchCUDAGuardTest, GetCurrentCUDAStreamNullReturnPointer) { + AOTITorchError error = aoti_torch_get_current_cuda_stream(0, nullptr); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCUDAGuardTest, StreamGuardWithSameDevice) { + ASSERT_EQ(cudaSetDevice(0), cudaSuccess); + + cudaStream_t stream1, stream2; + ASSERT_EQ(cudaStreamCreate(&stream1), cudaSuccess); + ASSERT_EQ(cudaStreamCreate(&stream2), cudaSuccess); + + CUDAStreamGuardHandle guard1 = nullptr; + AOTITorchError error = + aoti_torch_create_cuda_stream_guard(stream1, 0, &guard1); + 
EXPECT_EQ(error, Error::Ok); + + void* ret_stream = nullptr; + error = aoti_torch_get_current_cuda_stream(0, &ret_stream); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(static_cast(ret_stream), stream1); + + CUDAStreamGuardHandle guard2 = nullptr; + error = aoti_torch_create_cuda_stream_guard(stream2, 0, &guard2); + EXPECT_EQ(error, Error::Ok); + + ret_stream = nullptr; + error = aoti_torch_get_current_cuda_stream(0, &ret_stream); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(static_cast(ret_stream), stream2); + + error = aoti_torch_delete_cuda_stream_guard(guard2); + EXPECT_EQ(error, Error::Ok); + + ret_stream = nullptr; + error = aoti_torch_get_current_cuda_stream(0, &ret_stream); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(static_cast(ret_stream), stream1); + + error = aoti_torch_delete_cuda_stream_guard(guard1); + EXPECT_EQ(error, Error::Ok); + + ASSERT_EQ(cudaStreamDestroy(stream1), cudaSuccess); + ASSERT_EQ(cudaStreamDestroy(stream2), cudaSuccess); +} + +TEST_F(AOTITorchCUDAGuardTest, GetCurrentStreamAfterSetStream) { + cudaStream_t new_stream; + ASSERT_EQ(cudaStreamCreate(&new_stream), cudaSuccess); + + CUDAStreamGuardHandle guard = nullptr; + AOTITorchError error = + aoti_torch_create_cuda_stream_guard(new_stream, 0, &guard); + EXPECT_EQ(error, Error::Ok); + + void* ret_stream = nullptr; + error = aoti_torch_get_current_cuda_stream(0, &ret_stream); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(static_cast(ret_stream), new_stream); + + error = aoti_torch_delete_cuda_stream_guard(guard); + EXPECT_EQ(error, Error::Ok); + + ASSERT_EQ(cudaStreamDestroy(new_stream), cudaSuccess); +} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp new file mode 100644 index 00000000000..10c8d8c1a31 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp @@ -0,0 +1,454 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::aoti; +using namespace executorch::backends::cuda; +using namespace executorch::runtime; +using executorch::runtime::etensor::Tensor; + +// Test fixture for aoti_torch_delete_tensor_object tests +class AOTITorchDeleteTensorObjectTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize ExecuTorch Platform Abstraction Layer + et_pal_init(); + + // Check if CUDA is available + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + + // Clear any remaining tensors from previous tests + clear_all_tensors(); + } + + void TearDown() override { + // Clean up metadata + cleanup_tensor_metadata(); + + // Clear the global tensor storage using the provided function + clear_all_tensors(); + } + + // Helper to create test tensors + Tensor* create_test_tensor( + const std::vector& sizes, + const std::vector& strides = {}, + int32_t dtype = 6, // float32 + int32_t device_type = 1, // CUDA + int32_t device_index = 0) { + Tensor* tensor; + + const int64_t* strides_ptr = strides.empty() ? nullptr : strides.data(); + + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides_ptr, + dtype, + device_type, + device_index, + &tensor); + + return (error == Error::Ok) ? 
tensor : nullptr; + } +}; + +// Test basic deletion of CUDA tensor +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteCudaTensorBasic) { + // Create a CUDA tensor + std::vector sizes = {2, 3}; + Tensor* tensor = create_test_tensor(sizes, {}, 6, 1, 0); // CUDA device + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties before deletion + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test basic deletion of CPU tensor +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteCpuTensorBasic) { + // Create a CPU tensor + std::vector sizes = {3, 4}; + Tensor* tensor = create_test_tensor(sizes, {}, 6, 0, 0); // CPU device + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties before deletion + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of null tensor pointer +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteNullTensor) { + AOTITorchError error = aoti_torch_delete_tensor_object(nullptr); + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test deletion of tensor not in tracking system +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteUntrackedTensor) { + // Create a tensor and then clear the tracking system + std::vector sizes = {2, 3}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Clear the tracking system (simulating an untracked tensor) + clear_all_tensors(); + + // Try to delete the tensor - should fail + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test deletion of multiple tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteMultipleTensors) { + // Create multiple tensors + std::vector 
tensors; + + for (int i = 1; i <= 5; i++) { + std::vector sizes = {i, i + 1}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + tensors.push_back(tensor); + } + + // Delete all tensors + for (Tensor* tensor : tensors) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + } +} + +// Test deletion of zero-sized tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteZeroSizedTensor) { + // Create a zero-sized tensor + std::vector sizes = {0, 5}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 0); + EXPECT_EQ(tensor->size(1), 5); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of scalar (0D) tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteScalarTensor) { + // Create a scalar tensor + std::vector sizes = {}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 0); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of large multi-dimensional tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteLargeTensor) { + // Create a large multi-dimensional tensor + std::vector sizes = {10, 20, 30}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 3); + EXPECT_EQ(tensor->size(0), 10); + EXPECT_EQ(tensor->size(1), 20); + EXPECT_EQ(tensor->size(2), 30); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of tensors with custom strides +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteTensorWithCustomStrides) { + // Create tensor with custom 
strides + std::vector sizes = {3, 4}; + std::vector strides = {4, 1}; // Row-major strides + Tensor* tensor = create_test_tensor(sizes, strides); + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion after accessing tensor data +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteAfterDataAccess) { + // Create a tensor + std::vector sizes = {2, 3}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Access tensor data (this should not prevent deletion) + void* data_ptr = tensor->mutable_data_ptr(); + EXPECT_NE(data_ptr, nullptr); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test double deletion (should fail on second attempt) +TEST_F(AOTITorchDeleteTensorObjectTest, DoubleDeletion) { + // Create a tensor + std::vector sizes = {2, 3}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // First deletion should succeed + AOTITorchError error1 = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error1, Error::Ok); + + // Second deletion should fail (tensor no longer tracked) + AOTITorchError error2 = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error2, Error::InvalidArgument); +} + +// Test deletion of tensors on both CUDA and CPU devices +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteMixedDeviceTensors) { + // Create CUDA tensor + std::vector sizes = {2, 3}; + Tensor* cuda_tensor = create_test_tensor(sizes, {}, 6, 1, 0); + ASSERT_NE(cuda_tensor, nullptr); + + // Create CPU tensor + Tensor* cpu_tensor = create_test_tensor(sizes, {}, 6, 0, 0); + ASSERT_NE(cpu_tensor, nullptr); + + // Delete both tensors + AOTITorchError cuda_error = 
aoti_torch_delete_tensor_object(cuda_tensor); + EXPECT_EQ(cuda_error, Error::Ok); + + AOTITorchError cpu_error = aoti_torch_delete_tensor_object(cpu_tensor); + EXPECT_EQ(cpu_error, Error::Ok); +} + +// Test memory consistency after deletion +TEST_F(AOTITorchDeleteTensorObjectTest, MemoryConsistencyAfterDeletion) { + // Create multiple tensors + std::vector tensors; + const int num_tensors = 10; + + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {i + 1, i + 2}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + tensors.push_back(tensor); + } + + // Delete every other tensor + for (int i = 0; i < num_tensors; i += 2) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensors[i]); + EXPECT_EQ(error, Error::Ok); + } + + // Delete remaining tensors + for (int i = 1; i < num_tensors; i += 2) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensors[i]); + EXPECT_EQ(error, Error::Ok); + } +} + +// Test stress deletion with many small tensors +TEST_F(AOTITorchDeleteTensorObjectTest, StressDeletionManySmallTensors) { + const int num_tensors = 100; + std::vector tensors; + + // Create many small tensors + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {1, 1}; // Minimal size + Tensor* tensor = create_test_tensor(sizes); + if (tensor != nullptr) { + tensors.push_back(tensor); + } + } + + // Delete all created tensors + for (Tensor* tensor : tensors) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + } +} + +// Test CUDA synchronization during deletion +TEST_F(AOTITorchDeleteTensorObjectTest, CudaSynchronizationDuringDeletion) { + // Create a larger CUDA tensor to ensure memory allocation + std::vector sizes = {100, 100}; + Tensor* tensor = create_test_tensor(sizes, {}, 6, 1, 0); // CUDA device + ASSERT_NE(tensor, nullptr); + + // Delete the tensor (should handle synchronization internally) + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); 
+ EXPECT_EQ(error, Error::Ok); + + // Verify CUDA state is still good + cudaError_t cuda_error = cudaGetLastError(); + EXPECT_EQ(cuda_error, cudaSuccess); +} + +// Test specific deletion of bfloat16 tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteBFloat16Tensor) { + // Test 1D bfloat16 tensor deletion + std::vector sizes_1d = {10}; + Tensor* tensor_bf16_1d = create_test_tensor( + sizes_1d, + {}, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_1d, nullptr); + + // Verify it's bfloat16 before deletion + int32_t actual_dtype; + EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_1d, &actual_dtype), Error::Ok); + EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Expected bfloat16 dtype (" + << static_cast(SupportedDTypes::BFLOAT16) << "), got " + << actual_dtype; + + // Verify element size (bfloat16 should be 2 bytes per element) + EXPECT_EQ(tensor_bf16_1d->element_size(), 2); + + // Delete the bfloat16 tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor_bf16_1d); + EXPECT_EQ(error, Error::Ok); + + // Test 2D bfloat16 tensor deletion with custom strides + std::vector sizes_2d = {4, 6}; + std::vector strides_2d = {6, 1}; // Row-major strides + Tensor* tensor_bf16_2d = create_test_tensor( + sizes_2d, + strides_2d, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_2d, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor_bf16_2d->dim(), 2); + EXPECT_EQ(tensor_bf16_2d->size(0), 4); + EXPECT_EQ(tensor_bf16_2d->size(1), 6); + EXPECT_EQ(tensor_bf16_2d->element_size(), 2); + + // Verify it's bfloat16 + int32_t dtype_2d; + EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_2d, &dtype_2d), Error::Ok); + EXPECT_EQ(dtype_2d, static_cast(SupportedDTypes::BFLOAT16)); + + // Delete the 2D bfloat16 tensor + error = aoti_torch_delete_tensor_object(tensor_bf16_2d); + EXPECT_EQ(error, Error::Ok); + + // Test 3D bfloat16 tensor deletion + std::vector sizes_3d = {2, 3, 
4}; + Tensor* tensor_bf16_3d = create_test_tensor( + sizes_3d, + {}, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_3d, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor_bf16_3d->dim(), 3); + EXPECT_EQ(tensor_bf16_3d->size(0), 2); + EXPECT_EQ(tensor_bf16_3d->size(1), 3); + EXPECT_EQ(tensor_bf16_3d->size(2), 4); + EXPECT_EQ(tensor_bf16_3d->element_size(), 2); + + // Verify memory size (2 * 3 * 4 * 2 bytes = 48 bytes) + size_t expected_memory = 2 * 3 * 4 * 2; + size_t actual_memory = + tensor_bf16_3d->numel() * tensor_bf16_3d->element_size(); + EXPECT_EQ(actual_memory, expected_memory); + + // Delete the 3D bfloat16 tensor + error = aoti_torch_delete_tensor_object(tensor_bf16_3d); + EXPECT_EQ(error, Error::Ok); + + // Test bfloat16 scalar tensor (0D) deletion + std::vector scalar_sizes = {}; + Tensor* tensor_bf16_scalar = create_test_tensor( + scalar_sizes, + {}, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_scalar, nullptr); + + // Verify scalar tensor properties + EXPECT_EQ(tensor_bf16_scalar->dim(), 0); + EXPECT_EQ(tensor_bf16_scalar->numel(), 1); + EXPECT_EQ(tensor_bf16_scalar->element_size(), 2); + + // Delete the scalar bfloat16 tensor + error = aoti_torch_delete_tensor_object(tensor_bf16_scalar); + EXPECT_EQ(error, Error::Ok); + + // Test zero-element bfloat16 tensor deletion + std::vector zero_sizes = {0, 5}; + Tensor* tensor_bf16_zero = create_test_tensor( + zero_sizes, + {}, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_zero, nullptr); + + // Verify zero-element tensor properties + EXPECT_EQ(tensor_bf16_zero->dim(), 2); + EXPECT_EQ(tensor_bf16_zero->size(0), 0); + EXPECT_EQ(tensor_bf16_zero->size(1), 5); + EXPECT_EQ(tensor_bf16_zero->numel(), 0); + EXPECT_EQ(tensor_bf16_zero->element_size(), 2); + + // Delete the zero-element bfloat16 tensor + error = aoti_torch_delete_tensor_object(tensor_bf16_zero); + 
EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of mixed dtype tensors (float32 and bfloat16) diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp new file mode 100644 index 00000000000..da65129f18a --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp @@ -0,0 +1,588 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::cuda; +using namespace executorch::backends::aoti; +using namespace executorch::runtime; +using executorch::runtime::etensor::Tensor; + +// Test fixture for aoti_torch_empty_strided tests +class AOTITorchEmptyStridedTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize ExecuTorch Platform Abstraction Layer + et_pal_init(); + + // Check if CUDA is available + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + + // Clear any remaining tensors from previous tests + clear_all_tensors(); + } + + void TearDown() override { + // Clean up metadata + cleanup_tensor_metadata(); + + // Clear the global tensor storage using the provided function + clear_all_tensors(); + } + + // Helper to create test tensors + Tensor* create_tracked_tensor( + const std::vector& sizes, + const std::vector& strides = {}, + int32_t dtype = static_cast(SupportedDTypes::FLOAT32), + int32_t device_type = static_cast(SupportedDevices::CUDA), + int32_t device_index = 0) { + Tensor* 
tensor; + + const int64_t* strides_ptr = strides.empty() ? nullptr : strides.data(); + + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides_ptr, + dtype, + device_type, + device_index, + &tensor); + + return (error == Error::Ok) ? tensor : nullptr; + } +}; + +// Test aoti_torch_empty_strided basic functionality +TEST_F(AOTITorchEmptyStridedTest, BasicFunctionality) { + // Test 1D tensor + std::vector sizes_1d = {5}; + Tensor* tensor_1d; + AOTITorchError error = aoti_torch_empty_strided( + sizes_1d.size(), + sizes_1d.data(), + nullptr, // Let function compute strides + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_1d); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_1d, nullptr); + + // CRITICAL: Verify the tensor is actually float32 + int32_t actual_dtype; + EXPECT_EQ(aoti_torch_get_dtype(tensor_1d, &actual_dtype), Error::Ok); + EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::FLOAT32)) + << "Expected float32 dtype (" + << static_cast(SupportedDTypes::FLOAT32) << "), got " + << actual_dtype; + + // Verify element size (float32 should be 4 bytes per element) + size_t element_size = tensor_1d->element_size(); + EXPECT_EQ(element_size, 4) + << "Expected float32 element size to be 4 bytes, got " << element_size; + + // Verify total number of elements and memory usage + int64_t expected_numel = 5; // 5 elements + EXPECT_EQ(tensor_1d->numel(), expected_numel) + << "Expected " << expected_numel << " elements, got " + << tensor_1d->numel(); + + // Verify total memory size (numel * element_size) + size_t expected_memory_size = expected_numel * 4; // 5 * 4 = 20 bytes + size_t actual_memory_size = tensor_1d->numel() * tensor_1d->element_size(); + EXPECT_EQ(actual_memory_size, expected_memory_size) + << "Expected " << expected_memory_size << " bytes, got " + << actual_memory_size; + + // Check tensor properties + EXPECT_EQ(tensor_1d->dim(), 1); + 
EXPECT_EQ(tensor_1d->size(0), 5); + + // Test 2D tensor with explicit strides + std::vector sizes_2d = {3, 4}; + std::vector strides_2d = {4, 1}; + Tensor* tensor_2d; + error = aoti_torch_empty_strided( + sizes_2d.size(), + sizes_2d.data(), + strides_2d.data(), + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_2d); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_2d, nullptr); + + // Verify 2D tensor is also float32 + int32_t dtype_2d; + EXPECT_EQ(aoti_torch_get_dtype(tensor_2d, &dtype_2d), Error::Ok); + EXPECT_EQ(dtype_2d, static_cast(SupportedDTypes::FLOAT32)) + << "Expected float32 dtype (" + << static_cast(SupportedDTypes::FLOAT32) << "), got " + << dtype_2d; + + // Verify element size for 2D tensor + EXPECT_EQ(tensor_2d->element_size(), 4); + + // Check tensor properties + EXPECT_EQ(tensor_2d->dim(), 2); + EXPECT_EQ(tensor_2d->size(0), 3); + EXPECT_EQ(tensor_2d->size(1), 4); + + // Verify memory size for 2D tensor + int64_t expected_numel_2d = 3 * 4; // 12 elements + size_t expected_memory_2d = expected_numel_2d * 4; // 12 * 4 = 48 bytes + EXPECT_EQ(tensor_2d->numel() * tensor_2d->element_size(), expected_memory_2d); +} + +// Test aoti_torch_empty_strided with CPU device +TEST_F(AOTITorchEmptyStridedTest, CPUDevice) { + std::vector sizes = {2, 3}; + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, // Let function compute strides + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CPU), + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); +} + +// Test aoti_torch_empty_strided with invalid dtype +TEST_F(AOTITorchEmptyStridedTest, InvalidDtype) { + std::vector sizes = {2, 3}; + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + 
sizes.size(), + sizes.data(), + nullptr, + 999, // invalid dtype + 1, // CUDA device + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test aoti_torch_empty_strided with unsupported device +TEST_F(AOTITorchEmptyStridedTest, UnsupportedDevice) { + std::vector sizes = {2, 3}; + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 6, // float32 + 2, // unsupported device type + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::NotImplemented); +} + +// Test aoti_torch_empty_strided with zero-sized tensor +TEST_F(AOTITorchEmptyStridedTest, ZeroSized) { + std::vector sizes = {0, 5}; + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 6, // float32 + 1, // CUDA device + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 0); + EXPECT_EQ(tensor->size(1), 5); +} + +// Test aoti_torch_empty_strided scalar tensor (0D) +TEST_F(AOTITorchEmptyStridedTest, Scalar) { + std::vector sizes = {}; + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 6, // float32 + 1, // CUDA device + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 0); +} + +// Test aoti_torch_empty_strided with large tensor +TEST_F(AOTITorchEmptyStridedTest, LargeTensor) { + std::vector sizes = {100, 200, 50}; + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 6, // float32 + 1, // CUDA device + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 3); + EXPECT_EQ(tensor->size(0), 100); + 
EXPECT_EQ(tensor->size(1), 200); + EXPECT_EQ(tensor->size(2), 50); +} + +// Test error handling with memory allocation failures +TEST_F(AOTITorchEmptyStridedTest, MemoryAllocationStress) { + // Try to create a very large tensor that might cause allocation failure + // (This test may pass or fail depending on available memory) + std::vector huge_sizes = {10000, 10000, 100}; // ~38GB for float32 + Tensor* tensor; + + AOTITorchError error = aoti_torch_empty_strided( + huge_sizes.size(), + huge_sizes.data(), + nullptr, + 6, // float32 + 1, // CUDA device + 0, // device index + &tensor); + + // Either succeed or fail with memory allocation error + if (error == Error::Ok) { + EXPECT_NE(tensor, nullptr); + } else { + EXPECT_EQ(error, Error::MemoryAllocationFailed); + } +} + +// Test aoti_torch_empty_strided with bfloat16 dtype +TEST_F(AOTITorchEmptyStridedTest, BFloat16Tensor) { + // Test creating bfloat16 tensor on CUDA + std::vector sizes = {2, 3, 4}; + Tensor* tensor_bf16; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, // Let function compute strides + static_cast(SupportedDTypes::BFLOAT16), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_bf16); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_bf16, nullptr); + + // CRITICAL: Verify the tensor is actually bfloat16 + int32_t actual_dtype; + EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16, &actual_dtype), Error::Ok); + EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Expected bfloat16 dtype (" + << static_cast(SupportedDTypes::BFLOAT16) << "), got " + << actual_dtype; + + // Verify element size (bfloat16 should be 2 bytes per element) + size_t element_size = tensor_bf16->element_size(); + EXPECT_EQ(element_size, 2) + << "Expected bfloat16 element size to be 2 bytes, got " << element_size; + + // Verify total number of elements and memory usage + int64_t expected_numel = 2 * 3 * 4; // 24 elements + EXPECT_EQ(tensor_bf16->numel(), 
expected_numel) + << "Expected " << expected_numel << " elements, got " + << tensor_bf16->numel(); + + // Verify total memory size (numel * element_size) + size_t expected_memory_size = expected_numel * 2; // 24 * 2 = 48 bytes + size_t actual_memory_size = + tensor_bf16->numel() * tensor_bf16->element_size(); + EXPECT_EQ(actual_memory_size, expected_memory_size) + << "Expected " << expected_memory_size << " bytes, got " + << actual_memory_size; + + // Check tensor properties + EXPECT_EQ(tensor_bf16->dim(), 3); + EXPECT_EQ(tensor_bf16->size(0), 2); + EXPECT_EQ(tensor_bf16->size(1), 3); + EXPECT_EQ(tensor_bf16->size(2), 4); + + // Verify we can get tensor metadata + int64_t* sizes_ptr; + int64_t* strides_ptr; + EXPECT_EQ(aoti_torch_get_sizes(tensor_bf16, &sizes_ptr), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor_bf16, &strides_ptr), Error::Ok); + + // Check sizes match + EXPECT_EQ(sizes_ptr[0], 2); + EXPECT_EQ(sizes_ptr[1], 3); + EXPECT_EQ(sizes_ptr[2], 4); + + // Check that strides are computed correctly (row-major order) + EXPECT_EQ(strides_ptr[0], 12); // 3 * 4 + EXPECT_EQ(strides_ptr[1], 4); // 4 + EXPECT_EQ(strides_ptr[2], 1); // 1 + + // Test bfloat16 tensor with custom strides + std::vector sizes_2d = {3, 2}; + std::vector strides_2d = {2, 1}; // Row-major strides + Tensor* tensor_bf16_custom; + error = aoti_torch_empty_strided( + sizes_2d.size(), + sizes_2d.data(), + strides_2d.data(), + static_cast(SupportedDTypes::BFLOAT16), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_bf16_custom); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_bf16_custom, nullptr); + + // Verify custom stride tensor is also bfloat16 + int32_t custom_dtype; + EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_custom, &custom_dtype), Error::Ok); + EXPECT_EQ(custom_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Expected bfloat16 dtype (" + << static_cast(SupportedDTypes::BFLOAT16) << "), got " + << custom_dtype; + + // Verify element size for custom 
stride tensor + EXPECT_EQ(tensor_bf16_custom->element_size(), 2); + + // Check tensor properties + EXPECT_EQ(tensor_bf16_custom->dim(), 2); + EXPECT_EQ(tensor_bf16_custom->size(0), 3); + EXPECT_EQ(tensor_bf16_custom->size(1), 2); + + // Verify memory size for custom stride tensor + int64_t custom_expected_numel = 3 * 2; // 6 elements + size_t custom_expected_memory = custom_expected_numel * 2; // 6 * 2 = 12 bytes + EXPECT_EQ( + tensor_bf16_custom->numel() * tensor_bf16_custom->element_size(), + custom_expected_memory); + + // Check custom strides + int64_t* custom_strides_ptr; + EXPECT_EQ( + aoti_torch_get_strides(tensor_bf16_custom, &custom_strides_ptr), + Error::Ok); + EXPECT_EQ(custom_strides_ptr[0], 2); + EXPECT_EQ(custom_strides_ptr[1], 1); + + // Test bfloat16 scalar tensor (0D) + std::vector scalar_sizes = {}; + Tensor* tensor_bf16_scalar; + error = aoti_torch_empty_strided( + scalar_sizes.size(), + scalar_sizes.data(), + nullptr, + static_cast(SupportedDTypes::BFLOAT16), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_bf16_scalar); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_bf16_scalar, nullptr); + EXPECT_EQ(tensor_bf16_scalar->dim(), 0); + + // Verify scalar tensor is also bfloat16 + int32_t scalar_dtype; + EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_scalar, &scalar_dtype), Error::Ok); + EXPECT_EQ(scalar_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Expected bfloat16 dtype (" + << static_cast(SupportedDTypes::BFLOAT16) << "), got " + << scalar_dtype; + + // Verify scalar tensor properties + EXPECT_EQ(tensor_bf16_scalar->element_size(), 2); + EXPECT_EQ(tensor_bf16_scalar->numel(), 1); // Scalar tensor has 1 element + EXPECT_EQ( + tensor_bf16_scalar->numel() * tensor_bf16_scalar->element_size(), + 2); // 1 * 2 = 2 bytes +} + +// Test custom strides functionality +TEST_F(AOTITorchEmptyStridedTest, CustomStrides) { + // Create tensor with valid custom strides (contiguous layout) + std::vector sizes = {2, 3}; + std::vector 
strides = {3, 1}; // Standard row-major strides + + Tensor* tensor = create_tracked_tensor(sizes, strides); + EXPECT_NE(tensor, nullptr); + + // Verify the tensor was created correctly + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + + // Check strides through AOTI interface + int64_t* strides_ptr; + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); + EXPECT_EQ(strides_ptr[0], 3); + EXPECT_EQ(strides_ptr[1], 1); + + // Test another valid stride pattern - transpose-like + std::vector sizes_2 = {3, 2}; + std::vector strides_2 = {1, 3}; // Column-major strides + + Tensor* tensor_2 = create_tracked_tensor(sizes_2, strides_2); + EXPECT_NE(tensor_2, nullptr); + + // Verify the tensor properties + EXPECT_EQ(tensor_2->dim(), 2); + EXPECT_EQ(tensor_2->size(0), 3); + EXPECT_EQ(tensor_2->size(1), 2); + + // Check strides + int64_t* strides_ptr_2; + EXPECT_EQ(aoti_torch_get_strides(tensor_2, &strides_ptr_2), Error::Ok); + EXPECT_EQ(strides_ptr_2[0], 1); + EXPECT_EQ(strides_ptr_2[1], 3); +} + +// Test edge case: zero-element tensor with non-zero dimensions +TEST_F(AOTITorchEmptyStridedTest, ZeroElementTensor) { + std::vector sizes = {2, 0, 3}; // Total elements = 0 + Tensor* tensor = create_tracked_tensor(sizes); + EXPECT_NE(tensor, nullptr); + + // Verify the tensor properties + EXPECT_EQ(tensor->dim(), 3); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 0); + EXPECT_EQ(tensor->size(2), 3); + + // Should be able to get metadata + int64_t* sizes_ptr; + int64_t* strides_ptr; + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); + + EXPECT_EQ(sizes_ptr[0], 2); + EXPECT_EQ(sizes_ptr[1], 0); + EXPECT_EQ(sizes_ptr[2], 3); +} + +// Test different data types (only float32 is currently supported) +TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) { + std::vector sizes = {2, 3}; + + // Test float32 (dtype 6) - currently 
the only supported type + Tensor* tensor_float32; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 6, // float32 + 1, // CUDA device + 0, // device index + &tensor_float32); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_float32, nullptr); + + // Test unsupported data types should return error + Tensor* tensor_int32; + error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 3, // int32 - unsupported + 1, // CUDA device + 0, // device index + &tensor_int32); + + EXPECT_EQ(error, Error::InvalidArgument); // Should fail for unsupported dtype + + // Test another unsupported data type + Tensor* tensor_float64; + error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 7, // float64 - unsupported + 1, // CUDA device + 0, // device index + &tensor_float64); + + EXPECT_EQ(error, Error::InvalidArgument); // Should fail for unsupported dtype +} + +// Test multi-dimensional tensors with various shapes +TEST_F(AOTITorchEmptyStridedTest, MultiDimensionalTensors) { + // Test 3D tensor + std::vector sizes_3d = {2, 3, 4}; + Tensor* tensor_3d = create_tracked_tensor(sizes_3d); + EXPECT_NE(tensor_3d, nullptr); + EXPECT_EQ(tensor_3d->dim(), 3); + EXPECT_EQ(tensor_3d->size(0), 2); + EXPECT_EQ(tensor_3d->size(1), 3); + EXPECT_EQ(tensor_3d->size(2), 4); + + // Test 4D tensor + std::vector sizes_4d = {2, 3, 4, 5}; + Tensor* tensor_4d = create_tracked_tensor(sizes_4d); + EXPECT_NE(tensor_4d, nullptr); + EXPECT_EQ(tensor_4d->dim(), 4); + EXPECT_EQ(tensor_4d->size(0), 2); + EXPECT_EQ(tensor_4d->size(1), 3); + EXPECT_EQ(tensor_4d->size(2), 4); + EXPECT_EQ(tensor_4d->size(3), 5); + + // Test 5D tensor + std::vector sizes_5d = {1, 2, 3, 4, 5}; + Tensor* tensor_5d = create_tracked_tensor(sizes_5d); + EXPECT_NE(tensor_5d, nullptr); + EXPECT_EQ(tensor_5d->dim(), 5); + EXPECT_EQ(tensor_5d->size(0), 1); + EXPECT_EQ(tensor_5d->size(1), 2); + EXPECT_EQ(tensor_5d->size(2), 3); + 
EXPECT_EQ(tensor_5d->size(3), 4); + EXPECT_EQ(tensor_5d->size(4), 5); +} diff --git a/backends/cuda/runtime/tests/TARGETS b/backends/cuda/runtime/tests/TARGETS new file mode 100644 index 00000000000..9ff3e83a8bd --- /dev/null +++ b/backends/cuda/runtime/tests/TARGETS @@ -0,0 +1,6 @@ +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/backends/cuda/runtime/tests/targets.bzl b/backends/cuda/runtime/tests/targets.bzl new file mode 100644 index 00000000000..37e8d876526 --- /dev/null +++ b/backends/cuda/runtime/tests/targets.bzl @@ -0,0 +1,27 @@ +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") + +def cuda_runtime_cpp_unittest(name): + cpp_unittest( + name = "test_" + name, + srcs = [ + "test_" + name + ".cpp", + ], + deps = [ + "//executorch/backends/cuda/runtime:runtime_shims", + "//executorch/runtime/core:core", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/platform:platform", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], + ) + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + cuda_runtime_cpp_unittest("cuda_guard") + cuda_runtime_cpp_unittest("cuda_stream_guard") diff --git a/backends/cuda/runtime/tests/test_cuda_guard.cpp b/backends/cuda/runtime/tests/test_cuda_guard.cpp new file mode 100644 index 00000000000..a364ae98484 --- /dev/null +++ b/backends/cuda/runtime/tests/test_cuda_guard.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include + +using namespace executorch::backends::cuda; +using namespace executorch::runtime; + +// TODO(gasoonjia): Multiple device tests were not included due to test +// environment limitations. These tests should be added in the future when +// multi-GPU test environments are available, + +class CUDAGuardTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); + + int device_count = 0; + cudaError_t error = cudaGetDeviceCount(&device_count); + if (error != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available or no CUDA devices found"; + } + device_count_ = device_count; + + ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess); + } + + void TearDown() override { + if (device_count_ > 0) { + ASSERT_EQ(cudaSetDevice(original_device_), cudaSuccess); + } + } + + int device_count_ = 0; + int original_device_ = 0; +}; + +TEST_F(CUDAGuardTest, BasicDeviceSwitching) { + int current_device; + ASSERT_EQ(cudaGetDevice(¤t_device), cudaSuccess); + + { + auto guard_result = CUDAGuard::create(0); + ASSERT_TRUE(guard_result.ok()); + CUDAGuard guard = std::move(guard_result.get()); + + int device_after_guard; + ASSERT_EQ(cudaGetDevice(&device_after_guard), cudaSuccess); + EXPECT_EQ(device_after_guard, 0); + EXPECT_EQ(guard.current_device(), 0); + EXPECT_EQ(guard.original_device(), current_device); + } + + int device_after_destruction; + ASSERT_EQ(cudaGetDevice(&device_after_destruction), cudaSuccess); + EXPECT_EQ(device_after_destruction, current_device); +} + +TEST_F(CUDAGuardTest, SameDeviceNoSwitching) { + ASSERT_EQ(cudaSetDevice(0), cudaSuccess); + + { + auto guard_result = CUDAGuard::create(0); + ASSERT_TRUE(guard_result.ok()); + CUDAGuard guard = std::move(guard_result.get()); + + int current_device; + ASSERT_EQ(cudaGetDevice(¤t_device), cudaSuccess); + EXPECT_EQ(current_device, 0); + EXPECT_EQ(guard.current_device(), 0); + EXPECT_EQ(guard.original_device(), 0); + } + + int 
final_device; + ASSERT_EQ(cudaGetDevice(&final_device), cudaSuccess); + EXPECT_EQ(final_device, 0); +} + +TEST_F(CUDAGuardTest, InvalidDeviceIndex) { + auto guard_result = CUDAGuard::create(999); + EXPECT_FALSE(guard_result.ok()); +} + +TEST_F(CUDAGuardTest, NegativeDeviceIndex) { + auto guard_result = CUDAGuard::create(-2); + EXPECT_FALSE(guard_result.ok()); +} + +TEST_F(CUDAGuardTest, CopyConstructorDeleted) { + static_assert( + !std::is_copy_constructible_v, + "CUDAGuard should not be copy constructible"); +} + +TEST_F(CUDAGuardTest, CopyAssignmentDeleted) { + static_assert( + !std::is_copy_assignable_v, + "CUDAGuard should not be copy assignable"); +} + +TEST_F(CUDAGuardTest, MoveAssignmentDeleted) { + static_assert( + !std::is_move_assignable_v, + "CUDAGuard should not be move assignable"); +} diff --git a/backends/cuda/runtime/tests/test_cuda_stream_guard.cpp b/backends/cuda/runtime/tests/test_cuda_stream_guard.cpp new file mode 100644 index 00000000000..68a050a69be --- /dev/null +++ b/backends/cuda/runtime/tests/test_cuda_stream_guard.cpp @@ -0,0 +1,264 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +using namespace executorch::backends::cuda; +using namespace executorch::runtime; + +// TODO(gasoonjia): Multiple device tests were not included due to test +// environment limitations. 
These tests should be added in the future when +// multi-GPU test environments are available, + +class CUDAStreamGuardTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); + + int device_count = 0; + cudaError_t error = cudaGetDeviceCount(&device_count); + if (error != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available or no CUDA devices found"; + } + device_count_ = device_count; + + ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess); + + ASSERT_EQ(cudaStreamCreate(&test_stream1_), cudaSuccess); + ASSERT_EQ(cudaStreamCreate(&test_stream2_), cudaSuccess); + } + + void TearDown() override { + if (test_stream1_) { + ASSERT_EQ(cudaStreamDestroy(test_stream1_), cudaSuccess); + } + if (test_stream2_) { + ASSERT_EQ(cudaStreamDestroy(test_stream2_), cudaSuccess); + } + + if (device_count_ > 0) { + ASSERT_EQ(cudaSetDevice(original_device_), cudaSuccess); + } + } + + int device_count_ = 0; + int original_device_ = 0; + cudaStream_t test_stream1_ = nullptr; + cudaStream_t test_stream2_ = nullptr; +}; + +TEST_F(CUDAStreamGuardTest, BasicStreamSwitching) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + EXPECT_EQ(guard.stream(), test_stream1_); + EXPECT_EQ(guard.device_index(), 0); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); + EXPECT_EQ(current_stream_result.get(), test_stream1_); + + int current_device; + ASSERT_EQ(cudaGetDevice(¤t_device), cudaSuccess); + EXPECT_EQ(current_device, 0); +} + +TEST_F(CUDAStreamGuardTest, StreamSwitchingOnSameDevice) { + Error err = setCurrentCUDAStream(test_stream1_, 0); + ASSERT_EQ(err, Error::Ok); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); + EXPECT_EQ(current_stream_result.get(), test_stream1_); + + { + auto guard_result = 
CUDAStreamGuard::create(test_stream2_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + auto new_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(new_stream_result.ok()); + EXPECT_EQ(new_stream_result.get(), test_stream2_); + EXPECT_EQ(guard.stream(), test_stream2_); + } + + auto restored_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(restored_stream_result.ok()); + EXPECT_EQ(restored_stream_result.get(), test_stream1_); +} + +TEST_F(CUDAStreamGuardTest, NestedStreamGuards) { + cudaStream_t initial_stream; + ASSERT_EQ(cudaStreamCreate(&initial_stream), cudaSuccess); + + Error err = setCurrentCUDAStream(initial_stream, 0); + ASSERT_EQ(err, Error::Ok); + + { + auto guard1_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard1_result.ok()); + CUDAStreamGuard guard1 = std::move(guard1_result.get()); + + auto stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result.ok()); + EXPECT_EQ(stream_result.get(), test_stream1_); + + { + auto guard2_result = CUDAStreamGuard::create(test_stream2_, 0); + ASSERT_TRUE(guard2_result.ok()); + CUDAStreamGuard guard2 = std::move(guard2_result.get()); + + auto stream_result2 = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result2.ok()); + EXPECT_EQ(stream_result2.get(), test_stream2_); + } + + auto stream_result3 = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result3.ok()); + EXPECT_EQ(stream_result3.get(), test_stream1_); + } + + auto final_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(final_stream_result.ok()); + EXPECT_EQ(final_stream_result.get(), initial_stream); + + ASSERT_EQ(cudaStreamDestroy(initial_stream), cudaSuccess); +} + +TEST_F(CUDAStreamGuardTest, SameStreamNoChange) { + Error err = setCurrentCUDAStream(test_stream1_, 0); + ASSERT_EQ(err, Error::Ok); + + { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + auto 
stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result.ok()); + EXPECT_EQ(stream_result.get(), test_stream1_); + EXPECT_EQ(guard.stream(), test_stream1_); + } + + auto final_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(final_stream_result.ok()); + EXPECT_EQ(final_stream_result.get(), test_stream1_); +} + +TEST_F(CUDAStreamGuardTest, StreamAccessor) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + EXPECT_EQ(guard.stream(), test_stream1_); + EXPECT_EQ(guard.device_index(), 0); +} + +TEST_F(CUDAStreamGuardTest, SetStreamMethod) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + EXPECT_EQ(guard.stream(), test_stream1_); + + Error err = guard.set_stream(test_stream2_, 0); + EXPECT_EQ(err, Error::Ok); + + EXPECT_EQ(guard.stream(), test_stream2_); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); + EXPECT_EQ(current_stream_result.get(), test_stream2_); +} + +TEST_F(CUDAStreamGuardTest, MoveConstructor) { + auto guard1_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard1_result.ok()); + CUDAStreamGuard guard1 = std::move(guard1_result.get()); + + EXPECT_EQ(guard1.stream(), test_stream1_); + EXPECT_EQ(guard1.device_index(), 0); + + CUDAStreamGuard guard2 = std::move(guard1); + + EXPECT_EQ(guard2.stream(), test_stream1_); + EXPECT_EQ(guard2.device_index(), 0); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); + EXPECT_EQ(current_stream_result.get(), test_stream1_); +} + +TEST_F(CUDAStreamGuardTest, MoveConstructorRestoresOnlyOnce) { + cudaStream_t initial_stream; + ASSERT_EQ(cudaStreamCreate(&initial_stream), cudaSuccess); + + Error err = setCurrentCUDAStream(initial_stream, 0); + ASSERT_EQ(err, Error::Ok); + + 
{ + auto guard1_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard1_result.ok()); + CUDAStreamGuard guard1 = std::move(guard1_result.get()); + + { CUDAStreamGuard guard2 = std::move(guard1); } + + auto stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result.ok()); + EXPECT_EQ(stream_result.get(), initial_stream); + } + + auto final_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(final_stream_result.ok()); + EXPECT_EQ(final_stream_result.get(), initial_stream); + + ASSERT_EQ(cudaStreamDestroy(initial_stream), cudaSuccess); +} + +TEST_F(CUDAStreamGuardTest, InvalidDeviceIndex) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 999); + EXPECT_FALSE(guard_result.ok()); +} + +TEST_F(CUDAStreamGuardTest, NegativeDeviceIndex) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, -2); + EXPECT_FALSE(guard_result.ok()); +} + +TEST_F(CUDAStreamGuardTest, CopyConstructorDeleted) { + static_assert( + !std::is_copy_constructible_v, + "CUDAStreamGuard should not be copy constructible"); +} + +TEST_F(CUDAStreamGuardTest, CopyAssignmentDeleted) { + static_assert( + !std::is_copy_assignable_v, + "CUDAStreamGuard should not be copy assignable"); +} + +TEST_F(CUDAStreamGuardTest, MoveAssignmentDeleted) { + static_assert( + !std::is_move_assignable_v, + "CUDAStreamGuard should not be move assignable"); +} + +TEST_F(CUDAStreamGuardTest, NullStreamPointer) { + auto guard_result = CUDAStreamGuard::create(nullptr, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + EXPECT_EQ(guard.stream(), nullptr); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); +} diff --git a/backends/cuda/runtime/utils.h b/backends/cuda/runtime/utils.h new file mode 100644 index 00000000000..2d805724090 --- /dev/null +++ b/backends/cuda/runtime/utils.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +// CUDA error checking macro +#define ET_CUDA_CHECK_OR_RETURN_ERROR(EXPR) \ + do { \ + const cudaError_t err = EXPR; \ + if (err == cudaSuccess) { \ + break; \ + } \ + ET_LOG( \ + Error, \ + "%s:%d CUDA error: %s", \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(err)); \ + return Error::Internal; \ + } while (0) + +// Kernel launch check macro +#define ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR() \ + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetLastError()) + +namespace executorch::backends::cuda { + +// Enum for supported data types in et-cuda backend +enum class SupportedDTypes : int32_t { + INT64 = 4, // PyTorch's int64 dtype code + FLOAT32 = 6, // PyTorch's float32 dtype code + BFLOAT16 = 15, // PyTorch's bfloat16 dtype code +}; + +// Enum for supported device types in et-cuda backend +enum class SupportedDevices : int32_t { + CPU = 0, // CPU device + CUDA = 1, // CUDA device +}; + +// Utility function to convert sizes pointer to vector +inline std::vector convert_sizes_to_vector( + int64_t ndim, + const int64_t* sizes_ptr) { + std::vector sizes(ndim); + for (int i = 0; i < ndim; i++) { + sizes[i] = static_cast(sizes_ptr[i]); + } + return sizes; +} + +// Utility function to convert strides pointer to vector or calculate from sizes +inline std::vector convert_strides_to_vector( + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr) { + std::vector strides(ndim); + + if (strides_ptr != nullptr) { + // Use provided strides. it is ok if provided strides here is not contiguous + // strides since it will be used internally in CUDA delegate. 
+ for (int64_t i = 0; i < ndim; i++) { + strides[i] = static_cast(strides_ptr[i]); + } + } else { + // Calculate strides from sizes using ExecutorTorch's algorithm + if (ndim > 0) { + strides[ndim - 1] = static_cast( + 1); // Last dimension has stride 1 + for (int64_t i = ndim - 2; i >= 0; i--) { + if (sizes_ptr[i + 1] == 0) { + strides[i] = strides[i + 1]; // Copy stride when size is 0 + } else { + strides[i] = static_cast( + static_cast(strides[i + 1]) * sizes_ptr[i + 1]); + } + } + } + } + return strides; +} + +extern "C" { +using executorch::runtime::Error; +// Common AOTI type aliases +using AOTITorchError = Error; + +// Helper function to check if a dtype is supported in ET CUDA backend +inline bool is_dtype_supported_in_et_cuda(int32_t dtype) { + switch (dtype) { + case static_cast(SupportedDTypes::INT64): + case static_cast(SupportedDTypes::FLOAT32): + case static_cast(SupportedDTypes::BFLOAT16): + return true; + default: + return false; + } +} + +// Dtype validation utility function +inline AOTITorchError validate_dtype(int32_t dtype) { + ET_CHECK_OR_RETURN_ERROR( + is_dtype_supported_in_et_cuda(dtype), + InvalidArgument, + "Unsupported dtype: %d. 
Supported dtypes: %d (int64), %d (float32), %d (bfloat16)", + dtype, + static_cast(SupportedDTypes::INT64), + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDTypes::BFLOAT16)); + + return Error::Ok; +} +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/tests/TARGETS b/backends/cuda/tests/TARGETS new file mode 100644 index 00000000000..12718c04388 --- /dev/null +++ b/backends/cuda/tests/TARGETS @@ -0,0 +1,41 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") +load("@fbcode_macros//build_defs:python_unittest_remote_gpu.bzl", "python_unittest_remote_gpu") + +oncall("executorch") + +python_unittest_remote_gpu( + name = "test_cuda_export", + srcs = [ + "test_cuda_export.py", + ], + visibility = [ + "//executorch/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/cuda:cuda_backend", + "//executorch/backends/cuda:cuda_partitioner", + "//executorch/exir:lib", + "//executorch/exir/backend:backend_api", + "//executorch/exir/backend:compile_spec_schema", + ], + keep_gpu_sections = True, +) + +python_unittest( + name = "test_cuda_partitioner", + srcs = [ + "test_cuda_partitioner.py", + ], + visibility = [ + "//executorch/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/cuda:cuda_partitioner", + "//executorch/backends/cuda:cuda_backend", + "//executorch/exir:lib", + "//executorch/exir/backend:compile_spec_schema", + ], +) diff --git a/backends/cuda/tests/__init__.py b/backends/cuda/tests/__init__.py new file mode 100644 index 00000000000..2e41cd717f6 --- /dev/null +++ b/backends/cuda/tests/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
diff --git a/backends/cuda/tests/test_cuda_export.py b/backends/cuda/tests/test_cuda_export.py new file mode 100644 index 00000000000..d794a4f042c --- /dev/null +++ b/backends/cuda/tests/test_cuda_export.py @@ -0,0 +1,253 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from typing import Tuple + +import torch +from executorch.backends.cuda.cuda_backend import CudaBackend +from executorch.backends.cuda.cuda_partitioner import CudaPartitioner +from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower +from torch.export import export + + +class TestCudaExport(unittest.TestCase): + """Test CUDA export functionality for various operations using to_edge_transform_and_lower.""" + + def setUp(self): + """Set up test environment.""" + # Skip tests if CUDA is not available + if not torch.cuda.is_available(): + self.skipTest("CUDA is not available") + + def _export_to_cuda_with_lower( + self, module: torch.nn.Module, inputs: Tuple[torch.Tensor, ...] 
+ ) -> None: + """Helper method to export a module to CUDA backend using to_edge_transform_and_lower.""" + # Export the model + exported_program = export(module, inputs, strict=True) + + # Create partitioner and compile specs + partitioner = CudaPartitioner( + [CudaBackend.generate_method_name_compile_spec("forward")] + ) + + # Use to_edge_transform_and_lower for complete pipeline + edge_program_manager = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + compile_config=EdgeCompileConfig( + _check_ir_validity=False, + ), + ) + + # Verify that the pipeline succeeded + self.assertIsNotNone(edge_program_manager) + self.assertTrue(hasattr(edge_program_manager, "exported_program")) + + # Verify that the final exported program contains delegated calls + exported_program = edge_program_manager.exported_program() + has_delegate_call = False + for node in exported_program.graph.nodes: + if node.op == "call_function" and "executorch_call_delegate" in str( + node.target + ): + has_delegate_call = True + break + + self.assertTrue( + has_delegate_call, "No delegate calls found in final exported program" + ) + + return edge_program_manager + + def test_simple_add(self): + """Test CUDA export for simple element-wise addition.""" + + class AddModule(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + return x + y + + module = AddModule() + module.eval() + inputs = (torch.randn(3, 4), torch.randn(3, 4)) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone(edge_program_manager, "Simple add operation export failed") + + def test_conv2d(self): + """Test CUDA export for 2D convolution.""" + + class Conv2dModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, kernel_size=3, padding=1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.conv(x) + + module = Conv2dModule() + module.eval() + 
inputs = (torch.randn(1, 3, 32, 32),) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone(edge_program_manager, "Conv2d operation export failed") + + def test_linear(self): + """Test CUDA export for linear layer.""" + + class LinearModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(128, 64) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.linear(x) + + module = LinearModule() + module.eval() + inputs = (torch.randn(8, 128),) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone(edge_program_manager, "Linear operation export failed") + + def test_resnet_block(self): + """Test CUDA export for a ResNet-style block.""" + + class ResNetBlock(torch.nn.Module): + def __init__(self, in_channels: int, out_channels: int, stride: int = 1): + super().__init__() + self.conv1 = torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + ) + # Use eval mode to avoid batch norm mutations during export + self.bn1 = torch.nn.BatchNorm2d(out_channels) + self.relu = torch.nn.ReLU(inplace=True) + self.conv2 = torch.nn.Conv2d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.bn2 = torch.nn.BatchNorm2d(out_channels) + + # Shortcut connection + self.shortcut = torch.nn.Sequential() + if stride != 1 or in_channels != out_channels: + self.shortcut = torch.nn.Sequential( + torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + ), + torch.nn.BatchNorm2d(out_channels), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + identity = self.shortcut(x) + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += identity + out = self.relu(out) + + return out + + module = ResNetBlock(64, 64) + 
# Set module to eval mode to avoid batch norm running statistics mutations + module.eval() + inputs = (torch.randn(1, 64, 32, 32),) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone(edge_program_manager, "ResNet block export failed") + + def test_multi_operation_module(self): + """Test CUDA export for a module with multiple operations.""" + + class MultiOpModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 32, kernel_size=3, padding=1) + self.relu = torch.nn.ReLU() + self.pool = torch.nn.AdaptiveAvgPool2d((1, 1)) + self.linear = torch.nn.Linear(32, 10) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x) + x = self.relu(x) + x = self.pool(x) + x = x.view(x.size(0), -1) + x = self.linear(x) + return x + + module = MultiOpModule() + module.eval() + inputs = (torch.randn(2, 3, 16, 16),) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone( + edge_program_manager, "Multi-operation module export failed" + ) + + def test_activation_functions(self): + """Test CUDA export for various activation functions.""" + + class ActivationModule(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Test multiple activation functions + x1 = torch.relu(x) + x2 = torch.sigmoid(x) + x3 = torch.tanh(x) + return x1 + x2 + x3 + + module = ActivationModule() + module.eval() + inputs = (torch.randn(4, 8),) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone(edge_program_manager, "Activation functions export failed") + + def test_mathematical_operations(self): + """Test CUDA export for mathematical operations.""" + + class MathOpsModule(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + # Test various mathematical operations + add_result = x + y + mul_result = x * y + sub_result 
= x - y + div_result = x / (y + 1e-8) # Add epsilon to avoid division by zero + return add_result + mul_result + sub_result + div_result + + module = MathOpsModule() + module.eval() + inputs = (torch.randn(4, 4), torch.randn(4, 4)) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone( + edge_program_manager, "Mathematical operations export failed" + ) diff --git a/backends/cuda/tests/test_cuda_partitioner.py b/backends/cuda/tests/test_cuda_partitioner.py new file mode 100644 index 00000000000..cb4a2def1f8 --- /dev/null +++ b/backends/cuda/tests/test_cuda_partitioner.py @@ -0,0 +1,141 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from typing import Tuple + +import torch +from executorch.backends.cuda.cuda_partitioner import CudaPartitioner +from executorch.exir.backend.partitioner import PartitionResult +from torch.export import export + + +class TestCudaPartitioner(unittest.TestCase): + """ + Test CUDA partitioner functionality. + + After CUDA partitioning, there should be exactly one partitioned graph that contains + all operators from the input graph. This means all operators should be tagged with + the same delegation tag, indicating they will all be executed by the CUDA backend. + """ + + def _get_partition_result( + self, module: torch.nn.Module, inputs: Tuple[torch.Tensor, ...] 
+ ) -> PartitionResult: + """Helper method to get partition result for a given module.""" + # Export the model + exported_program = export(module, inputs, strict=True) + + # Create partitioner and compile specs + partitioner = CudaPartitioner([]) + + # Get partition result + partition_result = partitioner.partition(exported_program) + + # Verify partition result structure + self.assertIsNotNone(partition_result) + self.assertTrue(hasattr(partition_result, "tagged_exported_program")) + self.assertTrue(hasattr(partition_result, "partition_tags")) + + return partition_result + + def _check_fully_partitioned(self, partition_result: PartitionResult) -> bool: + """Check if the graph is fully partitioned (all operators have the same tag).""" + tagged_nodes = [] + untagged_ops = [] + + for node in partition_result.tagged_exported_program.graph.nodes: + if node.op == "call_function": + if hasattr(node, "meta") and "delegation_tag" in node.meta: + tagged_nodes.append(node) + else: + untagged_ops.append(node) + + # Check if we have any tagged nodes + if not tagged_nodes: + return False + + # Check if all tagged nodes have the same tag + first_tag = tagged_nodes[0].meta["delegation_tag"] + all_same_tag = all( + node.meta.get("delegation_tag") == first_tag for node in tagged_nodes + ) + + # Should have no untagged operations for full partitioning + fully_partitioned = len(untagged_ops) == 0 and all_same_tag + + return fully_partitioned + + def test_simple_add_partition(self): + """ + Test that CUDA partitioner creates exactly one partition containing all operators. + Simple element-wise addition should result in a single graph with all ops tagged identically. 
+ """ + + class AddModule(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + return x + y + + module = AddModule() + inputs = (torch.randn(3, 4), torch.randn(3, 4)) + + partition_result = self._get_partition_result(module, inputs) + fully_partitioned = self._check_fully_partitioned(partition_result) + + self.assertTrue( + fully_partitioned, + "Graph should be fully partitioned with all operators having the same tag", + ) + + def test_conv2d_partition(self): + """ + Test that CUDA partitioner creates exactly one partition containing all operators. + Conv2D operation should result in a single graph with all ops tagged identically. + """ + + class Conv2dModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, kernel_size=3, padding=1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.conv(x) + + module = Conv2dModule() + inputs = (torch.randn(1, 3, 32, 32),) + + partition_result = self._get_partition_result(module, inputs) + fully_partitioned = self._check_fully_partitioned(partition_result) + + self.assertTrue( + fully_partitioned, + "Graph should be fully partitioned with all operators having the same tag", + ) + + def test_linear_partition(self): + """ + Test that CUDA partitioner creates exactly one partition containing all operators. + Linear layer operation should result in a single graph with all ops tagged identically. 
+ """ + + class LinearModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(128, 64) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.linear(x) + + module = LinearModule() + inputs = (torch.randn(8, 128),) + + partition_result = self._get_partition_result(module, inputs) + fully_partitioned = self._check_fully_partitioned(partition_result) + + self.assertTrue( + fully_partitioned, + "Graph should be fully partitioned with all operators having the same tag", + ) diff --git a/backends/cuda/tests/voxtral_runner.cpp b/backends/cuda/tests/voxtral_runner.cpp new file mode 100644 index 00000000000..feed458e1f5 --- /dev/null +++ b/backends/cuda/tests/voxtral_runner.cpp @@ -0,0 +1,264 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::extension::make_tensor_ptr; +using executorch::extension::TensorPtr; +using executorch::extension::module::Module; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::Result; +using Clock = std::chrono::steady_clock; +using DurationMs = std::chrono::duration; + +std::vector to_sizes( + std::initializer_list dims) { + return std::vector(dims.begin(), dims.end()); +} + +std::string format_shape(const Tensor& tensor) { + std::ostringstream oss; + oss << "["; + const auto& sizes = tensor.sizes(); + for (size_t i = 0; i < sizes.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << sizes[i]; + } + oss << "]"; + return oss.str(); +} + +void print_tensor_summary(const std::string& label, const Tensor& tensor) { + std::cout << " " << label + << ": dtype=" << executorch::runtime::toString(tensor.scalar_type()) + << ", shape=" << format_shape(tensor) + << ", numel=" << tensor.numel() << std::endl; +} + +TensorPtr create_audio_input() { + const auto 
sizes = to_sizes({3, 128, 3000}); + const size_t numel = 3ull * 128ull * 3000ull; + std::vector data(numel, 0.5f); + return make_tensor_ptr( + sizes, std::move(data), {}, {}, ScalarType::BFloat16); +} + +TensorPtr create_token_ids_input() { + const auto sizes = to_sizes({1, 1138}); + std::vector data(static_cast(1) * 1138, 0); + return make_tensor_ptr(sizes, std::move(data)); +} + +TensorPtr create_positions_input() { + const auto sizes = to_sizes({1138}); + std::vector data(static_cast(1138), 0); + return make_tensor_ptr(sizes, std::move(data)); +} + +TensorPtr create_fallback_text_embedding() { + const auto sizes = to_sizes({1, 1138, 3072}); + const size_t numel = 1ull * 1138ull * 3072ull; + std::vector data(numel, 0.0f); + return make_tensor_ptr( + sizes, std::move(data), {}, {}, ScalarType::BFloat16); +} + +struct MethodTiming { + double load_ms{0.0}; + double run_ms{0.0}; +}; + +} // namespace + +int main(int argc, char** argv) { + if (argc != 3) { + std::cerr << "Usage: " << argv[0] + << " " + << std::endl; + return 1; + } + + const std::string program_path = argv[1]; + const std::string data_map_path = argv[2]; + + try { + Module module(program_path, data_map_path); + + const auto program_load_start = Clock::now(); + const Error program_load_error = module.load(); + const auto program_load_end = Clock::now(); + if (program_load_error != Error::Ok) { + std::cerr << "Failed to load ExecuTorch program: error code " + << static_cast(program_load_error) << std::endl; + return 1; + } + const DurationMs program_load_latency = + program_load_end - program_load_start; + + MethodTiming audio_timing; + MethodTiming token_timing; + MethodTiming text_timing; + + auto measure_method_load = + [&](const std::string& name) -> std::pair { + const auto start = Clock::now(); + const Error err = module.load_method(name); + const auto end = Clock::now(); + return {err, DurationMs(end - start).count()}; + }; + + // audio_encoder + { + const auto [err, load_ms] = 
measure_method_load("audio_encoder"); + if (err != Error::Ok) { + std::cerr << "Failed to load method audio_encoder: error code " + << static_cast(err) << std::endl; + return 1; + } + audio_timing.load_ms = load_ms; + + const TensorPtr audio_input = create_audio_input(); + std::vector inputs; + std::vector owned_inputs; + owned_inputs.emplace_back(audio_input); + inputs.emplace_back(*audio_input); + + const auto run_start = Clock::now(); + Result> output_result = + module.execute("audio_encoder", inputs); + const auto run_end = Clock::now(); + audio_timing.run_ms = DurationMs(run_end - run_start).count(); + + if (output_result.error() != Error::Ok) { + std::cerr << "audio_encoder execution failed: error code " + << static_cast(output_result.error()) << std::endl; + return 1; + } + + const auto& outputs = output_result.get(); + if (!outputs.empty() && outputs[0].isTensor()) { + print_tensor_summary("audio_encoder output", outputs[0].toTensor()); + } + } + + EValue token_output; + bool token_executed = false; + + // token_embedding + { + const auto [err, load_ms] = measure_method_load("token_embedding"); + if (err != Error::Ok) { + std::cerr << "Failed to load method token_embedding: error code " + << static_cast(err) << std::endl; + return 1; + } + token_timing.load_ms = load_ms; + + const TensorPtr token_ids = create_token_ids_input(); + std::vector inputs; + std::vector owned_inputs; + owned_inputs.emplace_back(token_ids); + inputs.emplace_back(*token_ids); + + const auto run_start = Clock::now(); + auto token_output_result = module.execute("token_embedding", inputs); + const auto run_end = Clock::now(); + token_timing.run_ms = DurationMs(run_end - run_start).count(); + + if (token_output_result.error() != Error::Ok) { + std::cerr << "token_embedding execution failed: error code " + << static_cast(token_output_result.error()) << std::endl; + return 1; + } + + token_executed = true; + const auto& outputs = token_output_result.get(); + if (!outputs.empty() && 
outputs[0].isTensor()) { + print_tensor_summary("token_embedding output", outputs[0].toTensor()); + token_output = outputs[0]; + } + } + + // text_decoder + { + const auto [err, load_ms] = measure_method_load("text_decoder"); + if (err != Error::Ok) { + std::cerr << "Failed to load method text_decoder: error code " + << static_cast(err) << std::endl; + return 1; + } + text_timing.load_ms = load_ms; + + std::vector inputs; + std::vector owned_inputs; + if (token_executed) { + if (token_output.isTensor()) { + inputs.emplace_back(token_output); + } + } + + if (inputs.empty()) { + auto fallback_embedding = create_fallback_text_embedding(); + owned_inputs.emplace_back(fallback_embedding); + inputs.emplace_back(*fallback_embedding); + } + + auto positions = create_positions_input(); + owned_inputs.emplace_back(positions); + inputs.emplace_back(*positions); + + const auto run_start = Clock::now(); + Result> output_result = + module.execute("text_decoder", inputs); + const auto run_end = Clock::now(); + text_timing.run_ms = DurationMs(run_end - run_start).count(); + + if (output_result.error() != Error::Ok) { + std::cerr << "text_decoder execution failed: error code " + << static_cast(output_result.error()) << std::endl; + return 1; + } + + const auto& outputs = output_result.get(); + if (!outputs.empty() && outputs[0].isTensor()) { + print_tensor_summary("text_decoder output", outputs[0].toTensor()); + } + } + + std::cout << std::fixed << std::setprecision(3); + std::cout << "Program load latency (ms): " << program_load_latency.count() + << std::endl; + + std::cout << "Method load latency (ms):" << std::endl; + std::cout << " audio_encoder: " << audio_timing.load_ms << std::endl; + std::cout << " token_embedding: " << token_timing.load_ms << std::endl; + std::cout << " text_decoder: " << text_timing.load_ms << std::endl; + + std::cout << "Run latency (ms):" << std::endl; + std::cout << " audio_encoder: " << audio_timing.run_ms << std::endl; + std::cout << " 
token_embedding: " << token_timing.run_ms << std::endl; + std::cout << "  text_decoder: " << text_timing.run_ms << std::endl; + + return 0; + } catch (const std::exception& ex) { + std::cerr << "Unhandled exception: " << ex.what() << std::endl; + return 1; + } +} diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index ed9b37e1998..10c28be0053 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -46,5 +46,5 @@ executorch_target_link_options_shared_lib(neuron_backend) install( TARGETS neuron_backend EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} ) diff --git a/backends/mediatek/README.md b/backends/mediatek/README.md index e8a535b3fde..6ff751f8408 100644 --- a/backends/mediatek/README.md +++ b/backends/mediatek/README.md @@ -28,7 +28,7 @@ To get started with MediaTek's ExecuTorch libraries, download the [NeuroPilot Ex - **`mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl`**: This library preprocesses the model into a MediaTek representation. -- **`mtk_neuron-8.2.19-py3-none-linux_x86_64.whl`**: This library converts the model to binaries. +- **`mtk_neuron-8.2.23-py3-none-linux_x86_64.whl`**: This library converts the model to binaries. Additionally, make sure to copy `NeuronAdapter.h` to the following directory: `backends/mediatek/runtime/include/api/`. 
@@ -45,7 +45,7 @@ Follow the steps below to setup your build environment: ``` - Install the two .whl downloaded from NeuroPilot Portal ```bash - pip3 install mtk_neuron-8.2.19-py3-none-linux_x86_64.whl + pip3 install mtk_neuron-8.2.23-py3-none-linux_x86_64.whl pip3 install mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ``` diff --git a/backends/mediatek/scripts/mtk_build.sh b/backends/mediatek/scripts/mtk_build.sh index 599f754d7bc..d42e5f7e10a 100755 --- a/backends/mediatek/scripts/mtk_build.sh +++ b/backends/mediatek/scripts/mtk_build.sh @@ -30,6 +30,7 @@ cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_NEURON=ON \ -B"${build_dir}" diff --git a/backends/nxp/CMakeLists.txt b/backends/nxp/CMakeLists.txt index 43fcaa24d19..bfc4c046be6 100644 --- a/backends/nxp/CMakeLists.txt +++ b/backends/nxp/CMakeLists.txt @@ -17,5 +17,5 @@ target_include_directories( install( TARGETS executorch_delegate_neutron EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} ) diff --git a/backends/nxp/README.md b/backends/nxp/README.md index 10eb1290a8b..de41cdd282e 100644 --- a/backends/nxp/README.md +++ b/backends/nxp/README.md @@ -15,7 +15,8 @@ networks, as well as the ability to adapt and scale to new model architectures, to AI workloads. ML application development with the eIQ Neutron NPU is fully supported by the [eIQ machine learning software development environment](https://www.nxp.com/design/design-center/software/eiq-ml-development-environment/eiq-toolkit-for-end-to-end-model-development-and-deployment:EIQ-TOOLKIT). The eIQ AI SW Stack provides a streamlined development experience for developers and end-users of NXP products. 
-eIQ extensions connect broader AI ecosystems to the edge, such as the NVIDIA TAO extension, which enables developers to bring AI models trained and fine-tuned with TAO to NXP-powered edge devices. +eIQ extensions connect broader AI ecosystems to the edge, such as the NVIDIA TAO extension, which enables developers +to bring AI models trained and fine-tuned with TAO to NXP-powered edge devices. ## Supported NXP platforms @@ -35,37 +36,28 @@ improvements. NXP and the ExecuTorch community is actively developing this codeb ## Neutron Backend implementation and SW architecture Neutron Backend uses the eIQ Neutron Converter as ML compiler to compile the delegated subgraph to Neutron microcode. -The Neutron Converter accepts the ML model in LiteRT format, for the **eIQ Neutron N3** class therefore the Neutron Backend uses the LiteRT flatbuffers format as IR between the ExecuTorch and Neutron Converter ML compiler. - -The Neutron Backend in its early prototype phase, is based on existing NXP products, such as -onnx2tflite, known from the NXP's eIQ Toolkit. -The **onnx2tflite** is a converter from the ONNX format to LiteRT (formerly known as TFLite). -It consists of 3 stages: -* ONNX Model Parsing -* Tensor Format Inference, to identify tensors using channel-first layer -* ONNX to LiteRT Conversion -* Optimization Passes, which operate on top of the LiteRT format -* LiteRT Serialization - -Due to the similarities between ONNX to LiteRT and Edge to LiteRT conversion, the Neutron Backend's -currently leverages the Tensor format Inference and LiteRT Optimizer. -This shall be considered as temporary solution, intended to be replaced with: -* Dim Order (https://github.com/pytorch/executorch/issues/4873) -* Corresponding ExecuTorch/ATen passes - -before reaching higher maturity status by the end of 2025. 
+The Neutron Converter accepts the ML model in LiteRT format, for the **eIQ Neutron N3** class therefore the Neutron Backend +uses the LiteRT flatbuffers format as IR between the ExecuTorch and Neutron Converter ML compiler. ## Layout -The current code base is as follows: * `backend/ir/` - TFLite/LiteRT based IR to represent the Edge Subgraph, taken from onnx2tflite code base and extended to support Edge Dialect to LiteRT conversion. * `backend/ir/converter` - Neutron Backends conversion from Edge (ATen) Dialect to LiteRT, TFLite. The subfolder `node_conveters` is structured as single module for each Edge operator. - * `backend/ir/lib` - automatically generated handlers from LiteRT flatbuffers schema + * `backend/ir/lib` - automatically generated handlers from LiteRT flatbuffers schema. * `backend/ir/tflite_generator` and `backend/ir/tflite_optimizer` handle the serialization of the in-memory built subgraph for delegation into LiteRT/TFLite flatbuffers representation. Code taken from the onnx2tflite tool. -* `quantizer` - Neutron Backends quantizer implementation. +* `edge_passes` - Various passes operating on Edge dialect level. +* `quantizer` - Neutron Backend quantizer implementation. +* `runtime` - Neutron Backend runtime implementation. For running compiled models on device. +* `tests/` - Unit tests for Neutron backend. + * `tests/converter/node_converter` - Operator level unit tests. + +* `examples/nxp/` - Example models and scripts for running them. + +## Examples +Please see this [README.md](https://github.com/pytorch/executorch/blob/main/examples/nxp/README.md). 
## Help & Improvements If you have problems or questions or have suggestions for ways to make diff --git a/backends/nxp/aten_passes/fuse_linear_and_add_pass.py b/backends/nxp/aten_passes/fuse_linear_and_add_pass.py new file mode 100644 index 00000000000..20a32c1bcac --- /dev/null +++ b/backends/nxp/aten_passes/fuse_linear_and_add_pass.py @@ -0,0 +1,204 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional + +import torch + +from executorch.backends.nxp.backend.edge_helper import ( + try_get_tensor_constant_from_node, +) +from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix +from torch.export.unflatten import _assign_attr, _AttrKind +from torch.fx import GraphModule, Node +from torch.fx.passes.infra.pass_base import PassBase, PassResult + + +class FuseLinearAndAddPass(PassBase): + """Replace a sequence of `linear` and `add` nodes in the following pattern by a single `linear` node when possible. + │ + ┌──────▼──────┐ + │ aten.linear │ + └──────┬──────┘ │ + │ replace with ┌──────▼──────┐ + ┌─────▼────┐ ───────────► │ aten.linear │ + │ aten.add │ └──────┬──────┘ + └─────┬────┘ + ▼ + """ + + def _fuse_with_existing_bias( + self, + linear_node: Node, + other_add_input: Node, + graph_module: GraphModule, + alpha: float, + ) -> bool: + """Fuse the `linear` and `add` nodes provided the `linear` already has a bias. + The fusion can only be done if both the "biases" have static data, which can be added together to get a + single bias. + + :return: True, if the nodes were successfully merged. False, otherwise. + """ + + linear_bias = linear_node.args[2] + if other_add_input.meta["val"].shape != linear_bias.meta["val"].shape: + # The biases cannot be added together due to their different shapes. + # Shape broadcasting is not applicable, as the only allowed `linear` bias shape is 1D ([output_features]). 
+ return False + + bias_data = [ + try_get_tensor_constant_from_node(graph_module, linear_bias), + try_get_tensor_constant_from_node(graph_module, other_add_input), + ] + if any(data is None for data in bias_data): + return ( + False # Fusion is not possible because at least 1 bias is not static. + ) + + # Add the bias data together, to obtain the combined bias. Take the `alpha` attribute into account. + combined_bias = bias_data[0] + bias_data[1] * alpha + + # Create a new node containing the combined bias data. + combined_bias_name = get_new_attr_name_with_prefix( + linear_bias.name + "combined" + )(graph_module) + _assign_attr( + torch.nn.Parameter(combined_bias), + graph_module, + combined_bias_name, + _AttrKind.PARAMETER, + ) + with graph_module.graph.inserting_before(linear_node): + new_bias_node = graph_module.graph.get_attr(combined_bias_name) + + # Use the combined bias as the new bias for the `Linear`. + linear_node.args = ( + linear_node.args[:2] + (new_bias_node,) + linear_node.args[3:] + ) + return True + + def _fuse_without_existing_bias( + self, + linear_node: Node, + other_add_input: Node, + graph_module: GraphModule, + alpha: float, + ) -> bool: + """Fuse the `linear` and `add` provided the `linear` does not already have a bias. + + :return: True, if the nodes were successfully merged. False, otherwise. + """ + + # The weights have shape (out_features, in_features). + output_features = linear_node.args[1].meta["val"].shape[0] + new_bias_shape = other_add_input.meta["val"].shape + if list(new_bias_shape) != [output_features]: + return False # The `Add` is adding a tensor with shape that is not supported for the `Linear` bias. + + bias_data = try_get_tensor_constant_from_node(graph_module, other_add_input) + + if bias_data is None: + return False # Neutron doesn't support a dynamic bias, so fusion would be counterproductive. 
+ + # It is possible that the `linear` comes before the `other_add_input` in the graph, so it cannot use it as an + # input directly. If the nodes are ordered as [linear, ..., other_add_input, ... add] (which is valid), using + # `other_add_input` directly as an input to `Linear` would not follow topological order. + # Rearranging the nodes is not trivial, as the graph could be complex (ultimately, the + # `other_add_input` could even originate from the `Linear` node...). + # Since the `other_add_input` has static data, we can create a new node with the data just before the `Linear` + # to ensure topological order. + # Regardless of the node ordering, the `add.Tensor` attribute `alpha` multiplies the second `add` input. If + # `alpha != 1`, we would have to insert a `mul` operator if we wanted to keep the original parameter node. + # Therefore, it is better to create a new static parameter node for the multiplied data in this case as well. + nodes = list(graph_module.graph.nodes) + if nodes.index(linear_node) < nodes.index(other_add_input) or alpha != 1.0: + # Problematic order, or required multiplication. + + # Handle the `aten.add.Tensor` attribute `alpha`. + bias_data *= alpha + + # Create a unique name. + new_bias_name = get_new_attr_name_with_prefix(linear_node.name + "_bias")( + graph_module + ) + _assign_attr(bias_data, graph_module, new_bias_name, _AttrKind.PARAMETER) + with graph_module.graph.inserting_before(linear_node): + new_bias_node = graph_module.graph.get_attr(new_bias_name) + + # Use the added tensor as the new `Linear` bias. + linear_node.args = ( + linear_node.args[:2] + (new_bias_node,) + linear_node.args[2:] + ) + return True + + else: + # Use the `other_add_input` directly as the new bias. 
+ linear_node.args = ( + linear_node.args[:2] + (other_add_input,) + linear_node.args[2:] + ) + return True + + def call(self, graph_module: GraphModule) -> Optional[PassResult]: + def _is_applicable_linear_node(node_: Node): + is_linear = ( + node_.op == "call_function" + and node_.target == torch.ops.aten.linear.default + ) + has_single_user = len(node.users) == 1 + + return is_linear and has_single_user + + def _is_add(node_: Node): + return ( + node_.op == "call_function" + and node_.target == torch.ops.aten.add.Tensor + ) + + made_changes = False + for node in graph_module.graph.nodes: + if not _is_applicable_linear_node( + linear_node := node + ): # Also ensures a single user. + continue + + if not _is_add(add_node := list(linear_node.users.keys())[0]): + continue # Not the `Linear` -> `Add` case. + + if len(add_node.args) != 2: + continue # Unexpected case. + + # The `aten.add.Tensor` carries out the expression `out = input[0] + alpha × input[1]`. + # https://docs.pytorch.org/docs/stable/generated/torch.add.html + alpha = add_node.kwargs.get("alpha", 1.0) + if add_node.args[0] == linear_node: + other_add_input = add_node.args[1] + + else: + # The fusion is not implemented. The `other_add_input` would have to be divided by `alpha` before the + # fusion, and a `mul` operator would have to be added after the `linear` to multiply its output by + # `alpha`. + continue + + if len(linear_node.args) > 2: + if not self._fuse_with_existing_bias( + linear_node, other_add_input, graph_module, alpha + ): + continue # The nodes could not be fused. + + else: + # The `Linear` doesn't have a bias yet. + if not self._fuse_without_existing_bias( + linear_node, other_add_input, graph_module, alpha + ): + continue # The nodes could not be fused. + + # Use the output of the `Linear` instead of the `Add`, and remove the now unused `Add` node. 
+ add_node.replace_all_uses_with(linear_node) + graph_module.graph.erase_node(add_node) + + made_changes = True + + return PassResult(graph_module, made_changes) diff --git a/backends/nxp/aten_passes/neutron_aten_pass_manager.py b/backends/nxp/aten_passes/neutron_aten_pass_manager.py index f6e3c374b19..407ebf5da61 100644 --- a/backends/nxp/aten_passes/neutron_aten_pass_manager.py +++ b/backends/nxp/aten_passes/neutron_aten_pass_manager.py @@ -13,6 +13,9 @@ from executorch.backends.nxp.aten_passes.fuse_batch_norm_with_linear_pass import ( FuseBatchNormWithLinearPass, ) +from executorch.backends.nxp.aten_passes.fuse_linear_and_add_pass import ( + FuseLinearAndAddPass, +) from executorch.backends.nxp.aten_passes.remove_nodes_with_known_outputs import ( RemoveNodesWithKnownOutputs, ) @@ -38,6 +41,7 @@ def __init__(self, passes: list[PassType] = None): SplitGroupConvolution(), SplitGRUBasedOnNumLayers(), RemoveNodesWithKnownOutputs(), + FuseLinearAndAddPass(), ] super().__init__(passes) diff --git a/backends/nxp/backend/edge_helper.py b/backends/nxp/backend/edge_helper.py index 061295ead79..60b367c0f39 100644 --- a/backends/nxp/backend/edge_helper.py +++ b/backends/nxp/backend/edge_helper.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py index ddbbf5b2e3a..fcfb9787715 100644 --- a/backends/nxp/backend/edge_program_converter.py +++ b/backends/nxp/backend/edge_program_converter.py @@ -18,6 +18,7 @@ from torch.fx import Node from torch.nn.parameter import Parameter from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import * # noqa F403 +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.backends.nxp.backend.node_format_inference import ( NodeFormat, NodeFormatInference, @@ -33,6 +34,7 @@ exir_ops.edge.aten.avg_pool2d.default: AvgPool2dConverter, # noqa F405 exir_ops.edge.aten.cat.default: CatConverter, # noqa F405 exir_ops.edge.aten.clone.default: CloneConverter, # noqa F405 + exir_ops.edge.dim_order_ops._clone_dim_order.default: CloneConverter, # noqa F405 exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter, # noqa F405 exir_ops.edge.aten.convolution.default: ConvolutionConverter, # noqa F405 exir_ops.edge.aten.hardtanh.default: HardTanhConverter, # noqa F405 @@ -42,6 +44,7 @@ exir_ops.edge.aten.permute_copy.default: PermuteCopyConverter, # noqa F405 exir_ops.edge.aten.relu.default: ReLUConverter, # noqa F405 exir_ops.edge.aten._softmax.default: SoftmaxConverter, # noqa F405 + exir_ops.edge.aten.sub.Tensor: SubTensorConverter, # noqa F405 exir_ops.edge.aten.tanh.default: TanhConverter, # noqa F405 exir_ops.edge.aten.view_copy.default: ViewCopyConverter, # noqa F405 exir_ops.edge.aten.sigmoid.default: SigmoidConverter, # noqa F405 @@ -54,12 +57,14 @@ class EdgeProgramToIRConverter: """ _default_conversion_config = ConversionConfig() + _default_target_spec = NeutronTargetSpec("imxrt700", "SDK_25_09") _default_delegation_options = CustomDelegationOptions() def convert_program( self, edge_program: ExportedProgram, - conversion_config=_default_conversion_config, + conversion_config: ConversionConfig = 
_default_conversion_config, + neutron_target_spec: NeutronTargetSpec = _default_target_spec, custom_delegation_options: CustomDelegationOptions = _default_delegation_options, ) -> (bytes, dict): """ @@ -67,6 +72,7 @@ def convert_program( :param edge_program: Converter ExportedProgram. :param conversion_config: ConversionConfig instance. + :param neutron_target_spec: Object for querying the target platform to retrieve its properties. :param custom_delegation_options: Custom user options which affect node delegation. :return: TFLite flatbuffers as bytes. """ @@ -76,6 +82,7 @@ def convert_program( cc = self.build_conversion_context( parameters_mapping, node_formats, + neutron_target_spec, conversion_config, custom_delegation_options, ) @@ -134,6 +141,7 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex qdq_related_functions = [ exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, ] @@ -172,11 +180,12 @@ def map_inputs_to_parameters(edge_program: ExportedProgram) -> dict[str, Paramet def build_conversion_context( parameters_mapping: dict, node_formats: dict[Node, NodeFormat], + neutron_target_spec: NeutronTargetSpec, conversion_config: ConversionConfig = _default_conversion_config, custom_delegation_options: CustomDelegationOptions = _default_delegation_options, ) -> ConversionContext: tflite_builder = AtenModelBuilderDirector( - 3, "TFLite from EdgeProgram", conversion_config + 3, "TFLite from EdgeProgram", neutron_target_spec, conversion_config ) # Add "sentinel" buffer (defined in schema.fbs) @@ -203,7 +212,8 @@ def _convert_qdq_cluster_q_dq_nodes( :param conversion_context: ConversionContext instance. 
""" qdq_q_ops_converters = { - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: QDQDequantizeConverter, # noqa F405 + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: QDQPerTensorDequantizeConverter, # noqa F405 + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: QDQPerChannelDequantizeConverter, # noqa F405 exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: QDQQuantizeConverter, # noqa F405 } diff --git a/backends/nxp/backend/ir/conversion_config.py b/backends/nxp/backend/ir/conversion_config.py index 4ac88eb467c..622735e881f 100644 --- a/backends/nxp/backend/ir/conversion_config.py +++ b/backends/nxp/backend/ir/conversion_config.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -14,7 +14,6 @@ def __init__(self, args: dict | None = None): :param args: Optional dictionary with conversion arguments. Unknown arguments are ignored. """ self.keep_io_format: bool = False - self.skip_shape_inference: bool = False self.allow_inputs_stripping: bool = True self.qdq_aware_conversion: bool = True self.symbolic_dimensions_mapping: dict[str, int] | None = None @@ -46,15 +45,6 @@ def __repr__(self): return "ConversionConfig[" + ", ".join(attrs) + "]" -class SkipShapeInferenceConfig(ConversionConfig): - - def __init__(self): - """ - Conversion config shortcut with disabled shape inference. 
- """ - super().__init__({"skip_shape_inference": True}) - - class QDQAwareConfig(ConversionConfig): def __init__(self): diff --git a/backends/nxp/backend/ir/converter/builder/model_builder.py b/backends/nxp/backend/ir/converter/builder/model_builder.py index 4f036854138..643a6231d15 100755 --- a/backends/nxp/backend/ir/converter/builder/model_builder.py +++ b/backends/nxp/backend/ir/converter/builder/model_builder.py @@ -1,6 +1,6 @@ # # Copyright 2023 Martin Pavella -# Copyright 2023-2024 NXP +# Copyright 2023-2025 NXP # # License: MIT # See the LICENSE_MIT for more details. @@ -48,6 +48,7 @@ FlexTranspose, ) from executorch.backends.nxp.backend.ir.tflite_optimizer import optimizer +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec class ModelBuilder: @@ -74,17 +75,21 @@ class ModelBuilder: _zeros_tensor_map: Dict # Mapping 'string' shapes to 'tflT.Tensor' objects - _default_conversion_config = ConversionConfig() + neutron_target_spec: NeutronTargetSpec conversion_config: ConversionConfig + _default_conversion_config = ConversionConfig() + def __init__( self, model_version: int, model_description: str, + neutron_target_spec: NeutronTargetSpec, conversion_config: ConversionConfig = _default_conversion_config, ) -> None: self._tfl_model = tflite_model.Model(model_version, model_description) + self.neutron_target_spec = neutron_target_spec self.conversion_config = conversion_config self.op_code_type_index_map = {} @@ -471,31 +476,7 @@ def finish(self) -> tflite_model.Model: return self._tfl_model - def _assign_tensor_and_buffer_indices( # noqa C901 - self, allow_inputs_stripping: bool - ): - """Correctly initialize all references via indices in all tensors and buffers.""" - - # Assign each buffer its index - for i, buffer in enumerate(self.get_buffers().vector): - buffer.tmp_index = i - - # Assign each tensor its index and its buffer index - for i, tensor in enumerate(self.get_tensors().vector): - if tensor.tmp_null_tensor: - # Using 
-1 as the index to the 'tensors' vector is way of telling the TFLite inference engine, that - # this tensor should not be used. - # https://github.com/tensorflow/tensorflow/blob/05404d959119d41a8ffb8a75c6f232cfd8540d45/tensorflow/lite/kernels/kernel_util.cc#L79-L98 - tensor.tmp_index = -1 - else: - tensor.tmp_index = i - - tensor.buffer = tensor.tmp_buffer.tmp_index - - # TODO Remove inputs and outputs that are not in the tensors collection - - # Assign 'Outputs' and 'Inputs' their tensor indices - outputs = self.get_sub_graph().outputs + def _assign_io_tensor_indices(self, inputs, outputs, allow_inputs_stripping: bool): for tensor in outputs.tmp_outputs: try: outputs.append(tensor.tmp_index) @@ -505,7 +486,6 @@ def _assign_tensor_and_buffer_indices( # noqa C901 f"The tensor '{tensor.name}' is among the model outputs, but does NOT appear in the graph!", ) - inputs = self.get_sub_graph().inputs for tensor in inputs.tmp_inputs: try: inputs.append(tensor.tmp_index) @@ -520,14 +500,46 @@ def _assign_tensor_and_buffer_indices( # noqa C901 f"The tensor '{tensor.name}' is among the model inputs, but does NOT appear in the graph!", ) - # Assign each operator its inputs and outputs indices - for operator in self.get_sub_graph().operators.vector: + def _assign_operators_io_tensor_indices(self, operators): + for operator in operators.vector: for inputTensor in operator.tmp_inputs: operator.inputs.append(inputTensor.tmp_index) for outputTensor in operator.tmp_outputs: operator.outputs.append(outputTensor.tmp_index) + def _assign_tensor_and_buffer_indices(self, allow_inputs_stripping: bool): + """Correctly initialize all references via indices in all tensors and buffers.""" + + # Assign each buffer its index + for i, buffer in enumerate(self.get_buffers().vector): + buffer.tmp_index = i + + # Assign each tensor its index and its buffer index + for i, tensor in enumerate(self.get_tensors().vector): + if tensor.tmp_null_tensor: + # Using -1 as the index to the 'tensors' vector is 
way of telling the TFLite inference engine, that + # this tensor should not be used. + # https://github.com/tensorflow/tensorflow/blob/05404d959119d41a8ffb8a75c6f232cfd8540d45/tensorflow/lite/kernels/kernel_util.cc#L79-L98 + tensor.tmp_index = -1 + else: + tensor.tmp_index = i + + tensor.buffer = tensor.tmp_buffer.tmp_index + + # TODO Remove inputs and outputs that are not in the tensors collection + + subgraph = self.get_sub_graph() + + # Assign 'Outputs' and 'Inputs' their tensor indices + self._assign_io_tensor_indices( + inputs=subgraph.inputs, + outputs=subgraph.outputs, + allow_inputs_stripping=allow_inputs_stripping, + ) + # Assign each operator its inputs and outputs indices + self._assign_operators_io_tensor_indices(operators=subgraph.operators) + def _build_operator_code( self, op_type: BuiltinOperator, version, custom_code: str = None ): @@ -795,29 +807,8 @@ def _remove_tensor_with_name(self, name): def append_new_tensor(self, t_tensor: tflite_model.Tensor, overwrite: bool = False): """Append the TFLite tensor 't_tensor' to the 'SubGraph.tensors' and register it.""" - - if t_tensor.name in self._tensor_name_map.keys(): - """Tensor has already been added. Sometimes however, ONNX models - will have tensors in their 'inputs' or 'outputs', which don't - belong there and are in fact static. I this case we need to - overwrite the existing tensors.""" - - if overwrite: - self._remove_tensor_with_name(t_tensor.name) - - # If the tenor previously appeared in ONNX 'inputs' or 'outputs', - # the old version MUST be removed from there. 
- self._remove_input_with_name(t_tensor.name) - self._remove_output_with_name(t_tensor.name) - - self.get_tensors().append(t_tensor) - self._tensor_name_map[t_tensor.name] = t_tensor - else: - logger.w(f"Tensor '{t_tensor.name}' is already in the tensors!") - - else: - self._tensor_name_map[t_tensor.name] = t_tensor - self.get_tensors().append(t_tensor) + self._tensor_name_map[t_tensor.name] = t_tensor + self.get_tensors().append(t_tensor) def append_new_buffer(self, buffer: tflite_model.Buffer): """Append the 'buffer' to the 'model.buffers'.""" @@ -1515,7 +1506,7 @@ def prepare_dynamic_tensor_for_correct_broadcasting_with_channels_first_tensors( # Prepend a partial identity, to keep leading dimensions unchanged. revert_perm = list(range(rank_diff)) + list(revert_perm) - # Now add a permutation to convert the extended ONNX shape to a TFLite shape + # Now add a permutation to convert the extended ExecuTorch shape to a TFLite shape to_tflite_perm = ( translator.create_channels_first_to_channels_last_permutation( output_rank @@ -1579,20 +1570,20 @@ def prepare_static_tensor_for_correct_broadcasting_with_channels_first_tensors( original_shape = translator.dims_to_channels_first( shape - ) # Same shape as in the ONNX model + ) # Same shape as in the ExecuTorch model # Prepend 1s to the shape - extended_onnx_shape = [1] * rank_diff + original_shape + extended_executorch_shape = [1] * rank_diff + original_shape # Convert the full shape to TFLite format - tflite_shape = translator.dims_to_channels_last(extended_onnx_shape) + tflite_shape = translator.dims_to_channels_last(extended_executorch_shape) tensor.shape = tflite_model.Shape(tflite_shape) # Statically transpose the data data = translator.convert_data_to_channels_first( data - ) # To the same shape as in the ONNX model - data = data.reshape(extended_onnx_shape) # Extend with leading 1s + ) # To the same shape as in the ExecuTorch model + data = data.reshape(extended_executorch_shape) # Extend with leading 1s 
tensor.tmp_buffer.data = translator.convert_data_to_channels_last( data ) # Convert to TFLite format @@ -1600,16 +1591,16 @@ def prepare_static_tensor_for_correct_broadcasting_with_channels_first_tensors( assert tflite_shape == list(tensor.tmp_buffer.data.shape) else: - # The tensor is the same as in the ONNX model. + # The tensor is the same as in the ExecuTorch model. - extended_onnx_shape = [1] * rank_diff + shape + extended_executorch_shape = [1] * rank_diff + shape # Convert the full shape to TFLite format - tflite_shape = translator.dims_to_channels_last(extended_onnx_shape) + tflite_shape = translator.dims_to_channels_last(extended_executorch_shape) tensor.shape = tflite_model.Shape(tflite_shape) # Statically transpose the data - data = data.reshape(extended_onnx_shape) # Extend with leading 1s + data = data.reshape(extended_executorch_shape) # Extend with leading 1s tensor.tmp_buffer.data = translator.convert_data_to_channels_last( data ) # Convert to TFLite format diff --git a/backends/nxp/backend/ir/converter/conversion/common.py b/backends/nxp/backend/ir/converter/conversion/common.py index 8230e39a7fa..318fe66dfbd 100755 --- a/backends/nxp/backend/ir/converter/conversion/common.py +++ b/backends/nxp/backend/ir/converter/conversion/common.py @@ -1,6 +1,6 @@ # # Copyright 2023 Martin Pavella -# Copyright 2023-2024 NXP +# Copyright 2023-2025 NXP # # License: MIT # See the LICENSE_MIT for more details. @@ -12,7 +12,7 @@ 'conversion/builtin/' directory. 
""" -from typing import Any, List, MutableSequence, Optional +from typing import List, MutableSequence, Optional import executorch.backends.nxp.backend.ir.logger as logger from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model @@ -22,28 +22,8 @@ max_pool_2d_options, transpose_conv_options, ) -from torch.fx import Node - - -def exactly_one_is_none(obj1: Optional, obj2: Optional) -> bool: - """Determine if exactly 1 of the arguments is None, or not.""" - return (obj1 is None and obj2 is not None) or (obj1 is not None and obj2 is None) - - -def contains_duplicates(list_to_check: List[Any]) -> bool: - """Determine if given list has duplicate elements or not.""" - return len(list_to_check) != len(set(list_to_check)) - - -def clamp(val: int, start: int, end: int) -> int: - """Clamp an int value between start and end (inclusive) and return it.""" - if val < start: - return start - - elif val > end: - return end - return val +from torch.fx import Node def try_get_input(t_op: tflite_model.Operator, idx: int) -> tflite_model.Tensor | None: @@ -62,11 +42,6 @@ def try_get_input(t_op: tflite_model.Operator, idx: int) -> tflite_model.Tensor tensor = t_op.tmp_inputs[idx] - if tensor.name == "": - # ONNX allows the name "" for optional tensors. It indicates that the tensor should be ignored, and a default - # value should be used. Just like if the tensor was omitted altogether. - return None - return tensor @@ -101,7 +76,7 @@ def assign_2d_strides(options: StridedOptions, strides: Optional[List[int]]): If 'strides' is None, assign 1s. :param options: TFLite AveragePool2D, Conv2D, MaxPool2D or TransposeConv options object. - :param strides: An optional list of ONNX strides attribute. + :param strides: An optional list of ExecuTorch strides attribute. 
""" if strides is None: @@ -115,8 +90,8 @@ def assign_2d_strides(options: StridedOptions, strides: Optional[List[int]]): else: logger.e( - logger.Code.INVALID_ONNX_OPERATOR_ATTRIBUTE, - f"ONNX operator has invalid 'strides' attribute! ('{strides}')", + logger.Code.INVALID_OPERATOR_ATTRIBUTE, + f"ExecuTorch operator has invalid 'strides' attribute! ('{strides}')", ) @@ -188,32 +163,6 @@ def node_uses_shape_broadcasting(node: Node) -> bool: ) -def uses_multiple_input_types(t_op: tflite_model.Operator) -> bool: - """Determine if the input tensors of given TFLite operator use different data types or not. - - :param t_op: TFLite operator with 'tmp_inputs' initialized. - :return: True, if any two input tensors have a different data type. - False, if all input tensors use the same data type. - """ - - if t_op.tmp_inputs is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "common.uses_multiple_input_types(): 'tmp_inputs' are None!", - ) - - if len(t_op.tmp_inputs) == 0: - logger.e( - logger.Code.INTERNAL_ERROR, - "common.uses_multiple_input_types(): Operator has no inputs!", - ) - - first_input_type = t_op.tmp_inputs[0].type - return any( - input_tensor.type != first_input_type for input_tensor in t_op.tmp_inputs[1:] - ) - - class OpsList: """ Holder of TFLite operator (middle_op) that can be prefixed (pre_ops) of suffixed (post_ops) diff --git a/backends/nxp/backend/ir/converter/conversion/translator.py b/backends/nxp/backend/ir/converter/conversion/translator.py index 4f327c6ac80..1fe195843c0 100755 --- a/backends/nxp/backend/ir/converter/conversion/translator.py +++ b/backends/nxp/backend/ir/converter/conversion/translator.py @@ -1,6 +1,5 @@ -# # Copyright 2023 Martin Pavella -# Copyright 2023-2024 NXP +# Copyright 2023-2025 NXP # # License: MIT # See the LICENSE_MIT for more details. @@ -9,10 +8,10 @@ translator Module contains functions for context-free conversion of various -things from ONNX to TFLite. +things from ExecuTorch to NeutronIR. 
""" -from typing import Any, Collection, List, Optional, Sequence, Tuple +from typing import Any, Collection, List, Optional, Sequence import executorch.backends.nxp.backend.ir.lib.tflite.Padding as tflPadding import executorch.backends.nxp.backend.ir.logger as logger @@ -21,16 +20,12 @@ import numpy as np import torch from executorch.backends.nxp.backend.ir.lib.tflite.TensorType import TensorType -from executorch.backends.nxp.backend.ir.tensor_formatting import TensorFormat -from executorch.backends.nxp.backend.ir.tflite_generator.meta.types import ( - TensorFlowDataType, -) def permute_static_tensor(tensor: tflite_model.Tensor, perm: list[int]): - """Take a static TFLite tensor and permute its shape and data according to the permutation in 'perm'. + """Take a static NeutronIR tensor and permute its shape and data according to the permutation in 'perm'. - :param tensor: Static TFLite tensor to permute. + :param tensor: Static NeutronIR tensor to permute. :param perm: Permutation to apply to the tensor. """ @@ -53,7 +48,7 @@ def permute_static_tensor(tensor: tflite_model.Tensor, perm: list[int]): def get_tflite_tensor_shape_with_explicit_padding( tflite_shape: List[int], explicit_padding: List[List[int]] ) -> List[int]: - """Get the resulting shape of a tensor with shape 'tflite_shape' (in TFLite format), after 'explicit_padding' is + """Get the resulting shape of a tensor with shape 'tflite_shape' (in NeutronIR format), after 'explicit_padding' is applied to it. 
""" @@ -62,7 +57,7 @@ def get_tflite_tensor_shape_with_explicit_padding( ): logger.e( logger.Code.INTERNAL_ERROR, - f"Cannot apply padding '{explicit_padding}' to TFLite shape '{tflite_shape}'!", + f"Cannot apply padding '{explicit_padding}' to NeutronIR shape '{tflite_shape}'!", ) total_padding = [ @@ -90,24 +85,9 @@ def get_tflite_tensor_shape_with_explicit_padding( return padded_shape -def convert_tensor_format_to_tflite(tensor_format: TensorFormat) -> TensorFormat: - """Convert the format of a tensor from ONNX to TFLite. - :return: The tensor_format converted to TFLite. - """ - if tensor_format is TensorFormat.CHANNELS_FIRST: - return TensorFormat.CHANNELS_LAST - - elif tensor_format not in (TensorFormat.FORMATLESS, TensorFormat.NONE): - logger.d( - f"translator.convert_tensor_format(): Got unexpected format '{tensor_format}'." - ) - - return tensor_format - - def dims_to_channels_first(channels_last_dimensions: List[int]) -> List[int]: - """Convert a list of ints which represent dimensions in the channels last (TFLite) format to the channels first - (ONNX) format. + """Convert a list of ints which represent dimensions in the channels last (NeutronIR) format to the channels first + (ExecuTorch) format. """ assert len(channels_last_dimensions) > 0, "Dimensions list is empty!" @@ -122,8 +102,8 @@ def dims_to_channels_first(channels_last_dimensions: List[int]) -> List[int]: def dims_to_channels_last(channels_first_dimensions: List[int]) -> List[int]: - """Convert a list of ints which represent dimensions in the channels first (ONNX) format to the channels last - (TFLite) format. + """Convert a list of ints which represent dimensions in the channels first (ExecuTorch) format to the channels last + (NeutronIR) format. """ assert len(channels_first_dimensions) > 0, "Dimensions list is empty!" 
@@ -171,7 +151,7 @@ def _same_upper_equals_same_lower( o_strides: Optional[List[int]] = None, o_dilations: Optional[List[int]] = None, ) -> bool: - """Determine if in a given particular setting, the values of the ONNX `auto_pads` attribute SAME_UPPER and + """Determine if in a given particular setting, the values of the ExecuTorch `auto_pads` attribute SAME_UPPER and SAME_LOWER represent the exact same padding. """ @@ -193,7 +173,7 @@ def _tflite_padding_compute_output_size( """ Calculates the output shape of the tensor with particular setting as tflite would. Implementation corresponds to tensorflow/lite/kernels/padding.h:ComputeOutSize() - :param padding: TFLite Padding value - 'Same' or 'Valid' + :param padding: NeutronIR Padding value - 'Same' or 'Valid' :param tflite_spatial_input_shape: input tensor shape :param tflite_kernel_shape: convolution kernel shape :param strides: strides (default is 1) @@ -229,7 +209,7 @@ def tflite_compute_padding_with_offset( dilations: Optional[List[int]] = None, ) -> (List[int], List[int]): """ - Calculate padding and offset for each dimension for particular convolution setting as TFLite. + Calculate padding and offset for each dimension for particular convolution setting as NeutronIR. Implementation corresponds to tensorflow/lite/kernels/padding.h:ComputePaddingWithOffset() :param tflite_input_shape: tensorflow lite input shape :param tflite_kernel_shape: tensorflow lite kernel shape @@ -272,14 +252,14 @@ def _is_same_padding( o_strides: Optional[List[int]] = None, o_dilations: Optional[List[int]] = None, ) -> bool: - """Determine if given ONNX 'pads' padding can be represented exactly with the TFLite 'SAME' padding type. - - :param o_pads: ONNX 'pads' attribute. - :param tflite_input_shape: The shape of the main input of the operator in TFLite format. - :param tflite_output_shape: The shape of the main output of the operator in TFLite format. - :param o_kernel_shape: ONNX 'kernel_shape' attribute. 
- :param o_strides: ONNX 'strides' attribute. Can be omitted. - :param o_dilations: ONNX 'dilations' attribute. Can be omitted. + """Determine if given ExecuTorch 'pads' padding can be represented exactly with the NeutronIR 'SAME' padding type. + + :param o_pads: ExecuTorch 'pads' attribute. + :param tflite_input_shape: The shape of the main input of the operator in NeutronIR format. + :param tflite_output_shape: The shape of the main output of the operator in NeutronIR format. + :param o_kernel_shape: ExecuTorch 'kernel_shape' attribute. + :param o_strides: ExecuTorch 'strides' attribute. Can be omitted. + :param o_dilations: ExecuTorch 'dilations' attribute. Can be omitted. """ if len(tflite_input_shape) == 0 or len(tflite_output_shape) == 0: @@ -289,7 +269,7 @@ def _is_same_padding( f"'{tflite_input_shape}' and output shape '{tflite_output_shape}'.", ) - # Calculate if the output shape corresponds to Same padding setting in TFLite + # Calculate if the output shape corresponds to Same padding setting in NeutronIR tflite_spatial_input_shape = tflite_input_shape[1:-1] tmp_spatial_output_shape = _tflite_padding_compute_output_size( tflPadding.Padding.SAME, @@ -302,10 +282,10 @@ def _is_same_padding( return False # For every dimension, the padding is added to the start and end of the dimension. - # TFLite padding 'SAME' tries to split it evenly, but in case of odd padding, 'SAME' adds the excess 1 at the end. - # TFLite represents this in the offset. The offset is added to the end of particular dimension, + # NeutronIR padding 'SAME' tries to split it evenly, but in case of odd padding, 'SAME' adds the excess 1 at the end. + # NeutronIR represents this in the offset. The offset is added to the end of particular dimension, # i.e. bottom for H dim, right for W dim and so on. - # ONNX represents this in 'pads' as [x1_begin, x2_begin,... , x1_end, x2_end,...]. + # ExecuTorch represents this in 'pads' as [x1_begin, x2_begin,... , x1_end, x2_end,...]. 
padding, offset = tflite_compute_padding_with_offset( tflite_input_shape, o_kernel_shape, tflite_output_shape, o_strides, o_dilations ) @@ -319,30 +299,6 @@ def _is_same_padding( return True -def permutations_are_inverse( - permutation1: Sequence[int], permutation2: Sequence[int] -) -> bool: - """Determine if given Transpose permutations are inverse of each other. - i.e. when applied back to back, there will be no effect. - - Example: - 0 3 1 2 - 0 2 3 1 - """ - - if len(permutation1) != len(permutation2): - logger.e( - logger.Code.INTERNAL_ERROR, - "translator.permutations_are_inverse(): permutations have different size!", - ) - - for i, perm2 in enumerate(permutation2): - if i != permutation1[perm2]: - return False - - return True - - def combine_permutations( permutation1: Sequence[int], permutation2: Sequence[int] ) -> List[int]: @@ -375,31 +331,35 @@ def shape_from_numpy(numpy_array): return tflite_model.Shape(dims) -def onnx_explicit_padding_to_tflite(onnx_pads: list[int]) -> list[list[int]]: - """Convert the attribute or input 'pads' of the ONNX 'Pad' operator to the 'paddings' input of the TFLite 'Pad' +def executorch_explicit_padding_to_tflite( + executorch_pads: list[int], +) -> list[list[int]]: + """Convert the attribute or input 'pads' of the ExecuTorch 'Pad' operator to the 'paddings' input of the NeutronIR 'Pad' class of operators. This function does NOT take tensor formats into consideration. 
""" - start_padding = onnx_pads[ - : len(onnx_pads) // 2 + start_padding = executorch_pads[ + : len(executorch_pads) // 2 ] # Padding at the start of each dimension - end_padding = onnx_pads[ - len(onnx_pads) // 2 : + end_padding = executorch_pads[ + len(executorch_pads) // 2 : ] # Padding at the end of each dimension return list(zip(start_padding, end_padding)) -def onnx_pads_to_tflite_explicit_padding(onnx_pads: List[int]) -> List[List[int]]: - """Convert an ONNX attribute 'pads' of operators such as Conv, MaxPool or AveragePool, to a list of ints which is - compatible with the TFLite 'Pad' operator. +def executorch_pads_to_tflite_explicit_padding( + executorch_pads: List[int], +) -> List[List[int]]: + """Convert an ExecuTorch attribute 'pads' of operators such as Conv, MaxPool or AveragePool, to a list of ints which is + compatible with the NeutronIR 'Pad' operator. """ - tflite_padding = onnx_explicit_padding_to_tflite(onnx_pads) + tflite_padding = executorch_explicit_padding_to_tflite(executorch_pads) - # TFLite also allows padding to the 'batch' and 'channels'. ONNX does not + # NeutronIR also allows padding to the 'batch' and 'channels'. ExecuTorch does not tflite_padding.insert(0, [0, 0]) tflite_padding.append([0, 0]) @@ -413,15 +373,15 @@ def _get_explicit_tflite_padding_for_same_lower( o_strides: Optional[List[int]] = None, o_dilations: Optional[List[int]] = None, ) -> List[List[int]]: - """Get the TFLite explicit padding required to represent ONNX 'SAME_LOWER' auto_pad for a particular setting. + """Get the NeutronIR explicit padding required to represent ExecuTorch 'SAME_LOWER' auto_pad for a particular setting. - :param tflite_input_shape: TFLite (NHWC) shape of the input tensor of the operator. - :param tflite_output_shape: TFLite (NHWC) shape of the output tensor of the operator. - :param o_kernel_shape: ONNX 'kernel_shape' attribute. - :param o_strides: Optional ONNX 'o_strides' attribute. - :param o_dilations: Optional ONNX 'o_dilations' attribute. 
+ :param tflite_input_shape: NeutronIR (NHWC) shape of the input tensor of the operator. + :param tflite_output_shape: NeutronIR (NHWC) shape of the output tensor of the operator. + :param o_kernel_shape: ExecuTorch 'kernel_shape' attribute. + :param o_strides: Optional ExecuTorch 'o_strides' attribute. + :param o_dilations: Optional ExecuTorch 'o_dilations' attribute. - :return: A TFLite style explicit padding, compatible with the TFLite 'Pad' operator. + :return: A NeutronIR style explicit padding, compatible with the NeutronIR 'Pad' operator. """ padding, offset = tflite_compute_padding_with_offset( @@ -433,102 +393,15 @@ def _get_explicit_tflite_padding_for_same_lower( ] # In case of odd padding, the excess is added at the start end_padding = padding - onnx_explicit_padding = start_padding + end_padding - - # Return explicit ONNX padding converted to TFLite padding - return onnx_pads_to_tflite_explicit_padding(onnx_explicit_padding) - - -def convert_padding( - o_auto_pad: str, - o_pads: List[int], - tflite_input_shape: List[int], - tflite_output_shape: List[int], - o_kernel_shape: List[int], - o_strides: Optional[List[int]], - o_dilations: Optional[List[int]] = None, -) -> Tuple[tflPadding.Padding, Optional[List[List[int]]]]: - """Convert ONNX operator attributes 'pads' and 'auto_pad' to TFLite. - - :param o_auto_pad: ONNX operator attribute 'auto_pad' - :param o_pads: ONNX operator attribute 'pads' - :param tflite_input_shape: The shape of the main input tensor in the TFLite format. - :param tflite_output_shape: The shape of the main output tensor in the TFLite format. - :param o_kernel_shape: ONNX operator attribute 'kernel_shape' - :param o_strides: ONNX operator attribute 'strides' - :param o_dilations: ONNX operator attribute 'dilations' - - :return: A tuple. - The first element is the converted TFLite padding. - The second is None, if conversion is finished. 
Or it is a list of ints representing the explicit - padding in TFLite format (compatible with the 'Pad' operator), which needs to be provided by a - 'Pad' operator. Caller must add this operator using model_builder! - """ - - if o_auto_pad == "SAME_UPPER": - return tflPadding.Padding.SAME, None - - elif o_auto_pad == "SAME_LOWER": - if _same_upper_equals_same_lower( - tflite_input_shape, - tflite_output_shape, - o_kernel_shape, - o_strides, - o_dilations, - ): - return tflPadding.Padding.SAME, None - - else: - logger.d( - "'SAME_LOWER' auto_pad cannot be exactly represented in TFLite as padding 'SAME' or 'VALID'. " - "Inserting an extra 'Pad' operator." - ) - tflite_explicit_padding = _get_explicit_tflite_padding_for_same_lower( - tflite_input_shape, - tflite_output_shape, - o_kernel_shape, - o_strides, - o_dilations, - ) - return tflPadding.Padding.VALID, tflite_explicit_padding - - elif o_auto_pad == "VALID": - return tflPadding.Padding.VALID, None - - # auto_pad is NOTSET -> use explicit padding - elif o_pads is None or all(val == 0 for val in o_pads): - # No padding in any direction - return tflPadding.Padding.VALID, None - - elif _is_same_padding( - o_pads, - tflite_input_shape, - tflite_output_shape, - o_kernel_shape, - o_strides, - o_dilations, - ): - # Explicit padding can be represented with TFLite 'SAME' padding. - return tflPadding.Padding.SAME, None - - else: - # 'pads' cannot be converted directly. Return 'VALID' and the required explicit padding and caller must - # implement conversion by adding a 'Pad' operator. - - logger.d( - "Explicit ONNX 'pads' cannot be represented directly as 'SAME' or 'VALID'. " - "Inserting an extra 'Pad' operator." - ) - - # ONNX 'pads' uses different format than TFLite 'Pad' operator. Convert the explicit padding. 
- tflite_explicit_padding = onnx_pads_to_tflite_explicit_padding(o_pads) + executorch_explicit_padding = start_padding + end_padding - return tflPadding.Padding.VALID, tflite_explicit_padding + # Return explicit ExecuTorch padding converted to NeutronIR padding + return executorch_pads_to_tflite_explicit_padding(executorch_explicit_padding) def convert_data_to_channels_first(array: np.ndarray) -> np.ndarray: - """Convert a numpy array representing the data of a tensor from the channels last format (TFLite), to channels - first format (ONNX). + """Convert a numpy array representing the data of a tensor from the channels last format (NeutronIR), to channels + first format (ExecuTorch). :param array: Numpy array holding the tensor's data. :return: The transformed data. @@ -543,8 +416,8 @@ def convert_data_to_channels_first(array: np.ndarray) -> np.ndarray: def convert_data_to_channels_last(array: np.ndarray) -> np.ndarray: - """Convert a numpy array representing the data of a tensor from the channels first format (ONNX), to channels last - format (TFLite). + """Convert a numpy array representing the data of a tensor from the channels first format (ExecuTorch), to channels last + format (NeutronIR). :param array: Numpy array holding the tensor's data. :return: The transformed data. 
@@ -558,17 +431,6 @@ def convert_data_to_channels_last(array: np.ndarray) -> np.ndarray: return np.moveaxis(array, 1, -1) # Move the second axis (C), to the end -def channels_first_shape_to_channels_last( - channels_first_shape: tflite_model.Shape, -) -> tflite_model.Shape: - """Create a channels last version of a channels first 'tflite_model.Shape' object.""" - - dims = channels_first_shape.vector.copy() - dims = dims_to_channels_last(dims) - - return tflite_model.Shape(dims) - - def channels_last_shape_to_channels_first( nhwc_shape: tflite_model.Shape, ) -> tflite_model.Shape: @@ -580,23 +442,13 @@ def channels_last_shape_to_channels_first( return tflite_model.Shape(dims) -def convert_onnx_dimensions_to_tflite_shape(o_dims: List[int]) -> tflite_model.Shape: - """Convert list of ints representing the shape of an ONNX channels first Tensor to a TFLite 'Shape' object.""" - - dims = list(o_dims) # Copy just in case - - dims = dims_to_channels_last(dims) - - return tflite_model.Shape(dims) - - def create_channels_last_to_channels_first_permutation( rank: int, return_list: bool = False ) -> np.ndarray | list[int]: """Return a numpy array with data that describes the permutation, which would change a tensor from the channels - last (TFLite) format to the channels first (ONNX) format. + last (NeutronIR) format to the channels first (ExecuTorch) format. - This permutation is compatible with the TFLite `Transpose` operator. + This permutation is compatible with the NeutronIR `Transpose` operator. :param rank: The rank of the required permutation. :param return_list: If True, the function returns a list of ints. If False, a numpy array is returned. @@ -615,9 +467,9 @@ def create_channels_first_to_channels_last_permutation( rank: int, return_list: bool = False ) -> np.ndarray | list[int]: """Return a numpy array with data that describes the permutation, which would change a tensor from the channels - first (ONNX) format to the channels last (TFLite) format. 
+ first (ExecuTorch) format to the channels last (NeutronIR) format. - This permutation is compatible with the TFLite `Transpose` operator. + This permutation is compatible with the NeutronIR `Transpose` operator. :param rank: The rank of the required permutation. :param return_list: If True, the function returns a list of ints. If False, a numpy array is returned. @@ -632,35 +484,8 @@ def create_channels_first_to_channels_last_permutation( return np.asarray(perm, np.int32) -def create_axis_to_last_perm(axis, num_dims): - """Create a numpy array representing the transpose permutations needed, to - make the 'axis' dimension, the last dimension. - """ - - dims = list(range(num_dims)) - - if axis == num_dims - 1: - return dims - elif axis >= num_dims or axis < 0: - logger.e( - logger.Code.INTERNAL_ERROR, - f"translator.create_axis_to_last_perm({axis},{num_dims}). Inputs don't make sense!", - ) - - # Remember axis dimension - axis_dim = dims[axis] - - # Move dimensions after 'axis' to the left - dims[axis:-1] = dims[axis + 1 : -1] - - # Add axis dimension to the end - dims.append(axis_dim) - - return np.asarray(dims, np.int32) - - def apply_permutation_to(target: List[Any], permutation: Collection[int]) -> List: - """Permute a list according to a permutation. Uses the same permutation format as the TFLite Transpose operator. + """Permute a list according to a permutation. Uses the same permutation format as the NeutronIR Transpose operator. :param target: A list of any types, to permute. Must be same size as the permutation. :param permutation: The permutation to apply to the target. @@ -678,7 +503,7 @@ def apply_permutation_to(target: List[Any], permutation: Collection[int]) -> Lis def create_inverse_permutation(permutation: List[int]) -> List[int]: """Create and return a permutation, that is the inverse of the given 'permutation' parameter. - Uses the same permutation format as the TFLite Transpose operator. 
+ Uses the same permutation format as the NeutronIR Transpose operator. :param permutation: The permutation to create the inverse of. :return: Inverse permutation. @@ -694,38 +519,8 @@ def create_inverse_permutation(permutation: List[int]) -> List[int]: return [permutation.index(perm) for perm in range(len(permutation))] -def get_max_value_for_type(dtype: np.dtype) -> any: - """Return the maximum possible value for given numpy type.""" - if dtype.kind in ("i", "u"): - return np.iinfo(dtype).max - - elif dtype.kind == "f": - return np.finfo(dtype).max - - else: - logger.e( - logger.Code.INTERNAL_ERROR, - f"translator.get_max_value_for_type(): unexpected type {dtype.name}.", - ) - - -def get_min_value_for_type(dtype: np.dtype) -> any: - """Return the minimum possible value for given numpy type.""" - if dtype.kind in ("i", "u"): - return np.iinfo(dtype).min - - elif dtype.kind == "f": - return np.finfo(dtype).min - - else: - logger.e( - logger.Code.INTERNAL_ERROR, - f"translator.get_min_value_for_type(): unexpected type {dtype.name}.", - ) - - def convert_data_type(torch_type: torch.TensorType) -> TensorType: - """Convert Torch DataType to TFLite TensorType""" + """Convert Torch DataType to NeutronIR TensorType""" if torch_type == torch.float32: return TensorType.FLOAT32 @@ -753,7 +548,7 @@ def convert_data_type(torch_type: torch.TensorType) -> TensorType: def torch_type_to_numpy_type(torch_type: torch.TensorType) -> np.ScalarType: - """Convert Torch DataType to TFLite TensorType""" + """Convert Torch DataType to NeutronIR TensorType""" if torch_type == torch.float32: return np.dtype(np.float32) @@ -778,10 +573,10 @@ def torch_type_to_numpy_type(torch_type: torch.TensorType) -> np.ScalarType: def numpy_type_to_tf_lite(numpy_type: np.dtype) -> TensorType: # noqa C901 - """Convert the numpy data type to a corresponding TFLite 'TensorType'. + """Convert the numpy data type to a corresponding NeutronIR 'TensorType'. :param numpy_type: Numpy dtype to convert. 
- :return: Corresponding TFLite TensorType. + :return: Corresponding NeutronIR TensorType. """ numpy_type = numpy_type.type @@ -835,12 +630,12 @@ def numpy_type_to_tf_lite(numpy_type: np.dtype) -> TensorType: # noqa C901 else: logger.e( logger.Code.CONVERSION_IMPOSSIBLE, - f"Cannot convert numpy data type '{numpy_type}' to TFLite.", + f"Cannot convert numpy data type '{numpy_type}' to NeutronIR.", ) def tf_lite_type_to_numpy(tfl_type: TensorType) -> np.ScalarType: # noqa C901 - """Convert TFLite TensorType to numpy dtype""" + """Convert NeutronIR TensorType to numpy dtype""" if tfl_type == TensorType.FLOAT32: return np.dtype(np.float32) @@ -890,72 +685,5 @@ def tf_lite_type_to_numpy(tfl_type: TensorType) -> np.ScalarType: # noqa C901 else: logger.e( logger.Code.CONVERSION_IMPOSSIBLE, - f"Cannot convert TFLite type '{tfl_type}' to numpy dtype.", + f"Cannot convert NeutronIR type '{tfl_type}' to numpy dtype.", ) - - -def tflite_type_to_tensor_flow_data_type(tfl_type: TensorType) -> TensorFlowDataType: - """Convert TFLite TensorType to the internal type of TensorFlow.""" - match tfl_type: - case TensorType.FLOAT16: - # There seems to be no counterpart in the TF DataType. 
- logger.e( - logger.Code.INTERNAL_ERROR, - "tflite_type_to_tensor_flow_data_type(): float16.", - ) - case TensorType.FLOAT32: - return TensorFlowDataType.DT_FLOAT.value - case TensorType.FLOAT64: - return TensorFlowDataType.DT_DOUBLE.value - - case TensorType.INT4: - return TensorFlowDataType.DT_INT4.value - case TensorType.INT8: - return TensorFlowDataType.DT_INT8.value - case TensorType.INT16: - return TensorFlowDataType.DT_INT16.value - case TensorType.INT32: - return TensorFlowDataType.DT_INT32.value - case TensorType.INT64: - return TensorFlowDataType.DT_INT64.value - - case TensorType.UINT8: - return TensorFlowDataType.DT_UINT8.value - case TensorType.UINT16: - return TensorFlowDataType.DT_UINT16.value - case TensorType.UINT32: - return TensorFlowDataType.DT_UINT32.value - case TensorType.UINT64: - return TensorFlowDataType.DT_UINT64.value - - case TensorType.COMPLEX64: - return TensorFlowDataType.DT_COMPLEX64.value - case TensorType.COMPLEX128: - return TensorFlowDataType.DT_COMPLEX128.value - - case TensorType.STRING: - return TensorFlowDataType.DT_STRING.value - - case TensorType.BOOL: - return TensorFlowDataType.DT_BOOL.value - - case TensorType.RESOURCE: - return TensorFlowDataType.DT_RESOURCE.value - case TensorType.VARIANT: - return TensorFlowDataType.DT_VARIANT.value - - case _: - # All TFLite types are covered. Must be an invalid type. - logger.e( - logger.Code.INTERNAL_ERROR, - f"tflite_type_to_tensor_flow_data_type(): invalid TFLite type `{tfl_type}`.", - ) - - -def infer_kernel_shape(weight_tensor: tflite_model.Tensor) -> list[int]: - """Returns the kernel shape inferred from the weight tensor. - - Weight tensors shape expected in TFlite Format, where the 0th index is output channels count, last is input channels - count. 
- """ - return weight_tensor.shape.vector[1:-1] diff --git a/backends/nxp/backend/ir/converter/node_converter.py b/backends/nxp/backend/ir/converter/node_converter.py index ed624aaa411..36266486aac 100755 --- a/backends/nxp/backend/ir/converter/node_converter.py +++ b/backends/nxp/backend/ir/converter/node_converter.py @@ -4,7 +4,6 @@ # LICENSE file in the root directory of this source tree. from abc import ABC, abstractmethod -from enum import Enum import torch @@ -16,8 +15,10 @@ AtenModelBuilderDirector, ) from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.exir.dialects._ops import ops as exir_ops from torch.fx import Node +from torch.fx.passes.infra.partitioner import Partition from torch.nn import Parameter @@ -37,15 +38,8 @@ def _is_dequant_node(node: torch.fx.Node) -> bool: ] -class Target(Enum): - IGNORE = "ignore" # No target platform. Any target specific restrictions will be ignored. - - RT700 = "imxrt700" - IMX95 = "imx95" - - @classmethod - def values(cls) -> list[str]: - return [elt.value for elt in cls] +def is_not_qdq_node(node: torch.fx.Node) -> bool: + return not (_is_quant_node(node) or _is_dequant_node(node)) class NodeConverter(ABC): @@ -89,7 +83,7 @@ def _is_supported_in_IR( @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: @@ -98,33 +92,50 @@ def _is_supported_on_target( can be used by operators with no target specific requirements. :param node: The node (edge operator) to check. - :param target: Value of the `Target` enum representing the target platform to check for. + :param neutron_target_spec: Object for querying the target platform to retrieve its properties. 
:param parameters_mapping: Dictionary mapping tensor names to their static data (if they have it). :param custom_delegation_options: Custom options which affect delegation. """ - return target == Target.RT700 + return True @classmethod def is_supported( cls, node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: """Check if the given `node` is supported in the IR and on the given `target` platform. :param node: torch.Node to check. - :param target: Value of the `Target` enum representing the target platform to check for. + :param neutron_target_spec: Object for querying the target platform to retrieve its properties. :param parameters_mapping: Dict mapping tensor names to their data. :param custom_delegation_options: Custom user options which affect node delegation. """ return cls._is_supported_in_IR( node, parameters_mapping, custom_delegation_options ) and cls._is_supported_on_target( - node, target, parameters_mapping, custom_delegation_options + node, neutron_target_spec, parameters_mapping, custom_delegation_options ) + @classmethod + def supports_partitioning_result( + cls, + node: Node, + partition_list: list[Partition], + custom_delegation_options: CustomDelegationOptions, + ): + """Check if the given `node` supports the assigned partitioning, which is stored the `partition_list`. Child + classes can overwrite this method in case they have delegation restrictions based on the context defined by + the partitioning result. + + :param node: torch.Node to check. + :param partition_list: List of proposed partitions. + :param custom_delegation_options: Custom user options which affect node delegation. 
+ """ + return True + @staticmethod def _has_shared_q_params_if_quantized(node: Node) -> bool: """Check if node has shared quantization parameters if it's quantized.""" diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py index d1674e16a9f..3cf70f46b8d 100755 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py @@ -41,7 +41,8 @@ PermuteCopyConverter, ) from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.qdq_dequantize_converter import ( - QDQDequantizeConverter, + QDQPerChannelDequantizeConverter, + QDQPerTensorDequantizeConverter, ) from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.qdq_quantize_converter import ( QDQQuantizeConverter, @@ -55,6 +56,9 @@ from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.softmax_converter import ( SoftmaxConverter, ) +from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.sub_tensor_converter import ( + SubTensorConverter, +) from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.tanh_converter import ( TanhConverter, ) @@ -70,7 +74,8 @@ "PermuteCopyConverter", "SoftmaxConverter", "ViewCopyConverter", - "QDQDequantizeConverter", + "QDQPerTensorDequantizeConverter", + "QDQPerChannelDequantizeConverter", "QDQQuantizeConverter", "ConstantPadNDConverter", "ReLUConverter", @@ -78,6 +83,7 @@ "MaxPool2dConverter", "AvgPool2dConverter", "AddTensorConverter", + "SubTensorConverter", "CloneConverter", "AbsConverter", "AdaptiveAvgPool2dConverter", diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py index c74baa61f67..cd5aa2ead81 100644 --- 
a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py @@ -9,11 +9,11 @@ from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, NodeConverter, - Target, ) from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import ( add_options, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -22,20 +22,15 @@ class AddTensorConverter(NodeConverter): @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - match target: - case Target.RT700: - if node_uses_shape_broadcasting(node): - # Shape broadcasting may require the addition of `Transpose` ops during conversion. - return False - - return True + if node_uses_shape_broadcasting(node): + # Shape broadcasting may require the addition of `Transpose` ops during conversion. 
+ return False - case _: - return False + return True @staticmethod def _is_supported_in_IR( diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py index 4f7f00fe5ba..22ca258cd4f 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py @@ -13,11 +13,11 @@ _is_dequant_node, _is_quant_node, NodeConverter, - Target, ) from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.concatenation_options import ( Concatenation, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -72,51 +72,52 @@ def _all_io_shares_quantization_parameters(node: Node) -> bool: @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: if custom_delegation_options.force_delegate_cat: return True - match target: - case Target.RT700: - dim = CatConverter._get_normalized_dim(node) - - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1491 - if dim == 0: - return False - - # Neutron requires the channels to be a multiple of `8`. The channels could either be the second or the - # last dimension, depending on the formats of the node. The format, however, cannot be determined - # during conversion, as it depends on what other nodes are delegated. - input_channels = [ - # The second dimension is the channels in PyTorch. If the inputs/output are not channels first, it - # will still be the channels in the IR. - _get_shape(input_)[1] - for input_ in node.all_input_nodes - ] + [ - # If the inputs/outputs are channels first, the last dimension will be the channels. 
- _get_shape(input_)[-1] - for input_ in node.all_input_nodes - ] - if any((input_channel % 8) != 0 for input_channel in input_channels): - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1492 - return False - - output_channels = [_get_shape(node)[1], _get_shape(node)[-1]] - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1493 - if any((out_c % 8) != 0 for out_c in output_channels): - return False - - if len(node.all_input_nodes) < 2: # Not supported on Neutron - # TODO Try to skip the operator if this case is realistic. - return False - - return True - - case _: - return False + dim = CatConverter._get_normalized_dim(node) + + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1491 + if dim == 0: + return False + + # Neutron requires the channels to be a multiple of numMacs. The channels could either be the second or the + # last dimension, depending on the formats of the node. The format, however, cannot be determined + # during conversion, as it depends on what other nodes are delegated. + input_channels = [ + # The second dimension is the channels in PyTorch. If the inputs/output are not channels first, it + # will still be the channels in the IR. + _get_shape(input_)[1] + for input_ in node.all_input_nodes + ] + [ + # If the inputs/outputs are channels first, the last dimension will be the channels. + _get_shape(input_)[-1] + for input_ in node.all_input_nodes + ] + if any( + (input_channel % neutron_target_spec.get_num_macs()) != 0 + for input_channel in input_channels + ): + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1492 + return False + + output_channels = [_get_shape(node)[1], _get_shape(node)[-1]] + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1493 + if any( + (out_c % neutron_target_spec.get_num_macs()) != 0 + for out_c in output_channels + ): + return False + + if len(node.all_input_nodes) < 2: # Not supported on Neutron + # TODO Try to skip the operator if this case is realistic. 
+ return False + + return True @staticmethod def _is_supported_in_IR( diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py index 1d370ab8c48..17b2cee9874 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py @@ -20,6 +20,11 @@ def _has_supported_memory_format(node: Node) -> bool: class CloneConverter(NodeConverter): + """ + This converter is responsible for converting both edge operators: + - aten.clone.default + - dim_order_ops._clone_dim_order.default + """ @staticmethod def _is_supported_in_IR( diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py index f58df1a88d9..499541aa58c 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py @@ -17,7 +17,6 @@ from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, NodeConverter, - Target, ) from executorch.backends.nxp.backend.ir.converter.quantization_utils import ( quantize_int8, @@ -27,6 +26,7 @@ pad_options, pad_v2_options, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -35,22 +35,16 @@ class ConstantPadNDConverter(NodeConverter): @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - match target: - case Target.RT700: - # TODO: Consider different tensor formats 
(dim-order) - paddings = node.args[1] - if len(paddings) > 4 and paddings[4:6] != [0, 0]: - # Attempt to Pad channels dimension, which is not supported on Neutron. - return False - - return True - - case _: - return False + paddings = node.args[1] + if len(paddings) > 4 and paddings[4:6] != [0, 0]: + # Attempt to Pad channels dimension, which is not supported on Neutron. + return False + + return True @staticmethod def _is_supported_in_IR( diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py index 0f3a4b9bb5a..f32b5a65cac 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py @@ -25,7 +25,6 @@ from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, NodeConverter, - Target, ) from executorch.backends.nxp.backend.ir.converter.node_converters.shared import ( conv_utils, @@ -45,6 +44,7 @@ depthwise_conv_2d_options, reshape_options, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -53,45 +53,38 @@ class ConvolutionConverter(NodeConverter): @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - match target: - case Target.RT700: - activations = node.args[0] - weights = node.args[1] - groups = node.args[8] - - if activations.meta["val"].shape[0] != 1: - # Only batch size 1 is supported on neutron. - return False - - if groups == 1: # Regular convolution. - pass - elif conv_utils.group_conv_convertible_as_depthwise( - node, groups - ): # Depthwise convolution. 
- # Only supported if the weights are static, because TFLite `DepthwiseConv2D` uses permuted - # weights. In case the weights are dynamic, a Transpose operator would have to be added, which - # is not supported on Neutron. - if not node_is_effectively_static_tensor( - weights, parameters_mapping - ): - return False - elif conv_utils.group_conv_convertible_into_multiple_convolutions( - node, groups - ): # Separable conv. This should never be reached, as the node should have been decomposed into - # multiple parallel convolutions by the `SplitGroupConvolution` pre-processing pass. - logging.warning("Group convolution was not decomposed.") - return False - else: # Unexpected case (should never happen). - return False - - return True - - case _: + activations = node.args[0] + weights = node.args[1] + groups = node.args[8] + + if activations.meta["val"].shape[0] != 1: + # Only batch size 1 is supported on neutron. + return False + + if groups == 1: # Regular convolution. + pass + elif conv_utils.group_conv_convertible_as_depthwise( + node, groups + ): # Depthwise convolution. + # Only supported if the weights are static, because TFLite `DepthwiseConv2D` uses permuted + # weights. In case the weights are dynamic, a Transpose operator would have to be added, which + # is not supported on Neutron. + if not node_is_effectively_static_tensor(weights, parameters_mapping): return False + elif conv_utils.group_conv_convertible_into_multiple_convolutions( + node, groups + ): # Separable conv. This should never be reached, as the node should have been decomposed into + # multiple parallel convolutions by the `SplitGroupConvolution` pre-processing pass. + logging.warning("Group convolution was not decomposed.") + return False + else: # Unexpected case (should never happen). 
+ return False + + return True @staticmethod def _is_supported_in_IR( @@ -238,7 +231,7 @@ def _convert_1d_conv( def _convert_unpadded_2D( self, t_op: tflite_model.Operator, conv_params: ConvParameters ) -> conv_utils.ConvConversionResult: - """Convert the `aten.convolution` into TFLite. The `padding` and `builtin_options` must be converter by the + """Convert the `aten.convolution` into TFLite. The `padding` and `builtin_options` must be converted by the caller. """ common.assign_2d_strides(t_op.builtin_options, conv_params.stride) @@ -321,6 +314,10 @@ def _convert_2d_conv( t_op.tmp_inputs[1] = self.builder.create_transposed_tensor( weight_tensor, perm ) + + if t_op.tmp_inputs[1].quantization is not None: + # Model is quantized + t_op.tmp_inputs[1].quantization.quantized_dimension = 3 else: raise NotImplementedError("Dynamic Depthwise Conv weights.") diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py index f03c403876f..c1dd7b600be 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py @@ -12,7 +12,6 @@ from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, NodeConverter, - Target, ) from executorch.backends.nxp.backend.ir.converter.node_converters.shared.reduce_utils import ( convert_axes_from_attribute, @@ -20,6 +19,7 @@ from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import ( mean_options, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -28,34 +28,20 @@ class MeanDimConverter(NodeConverter): @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: 
dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - match target: - case Target.RT700: - # TODO: Consider different tensor formats (dim-order) - dim = node.args[1] - keepdim = node.args[2] if len(node.args) >= 3 else False - rank = len(node.args[0].meta["val"].shape) - dim = [MeanDimConverter._to_neg_dim(d, rank) for d in dim] - - # Only last 2 dimensions (H, W) and keepdim=True with rank=4 are supported on Neutron. - if rank != 4 or dim not in [[-1, -2], [-2, -1]] or not keepdim: - return False - - return True - - case _: - return False + dim = node.args[1] + keepdim = node.args[2] if len(node.args) >= 3 else False + rank = len(node.args[0].meta["val"].shape) + dim = [d - rank if d > 0 else d for d in dim] - @staticmethod - def _to_pos_dim(d, rank): - return d + rank if d < 0 else d + # Only last 2 dimensions (H, W) and keepdim=True with rank=4 are supported on Neutron. + if rank != 4 or dim not in [[-1, -2], [-2, -1]] or not keepdim: + return False - @staticmethod - def _to_neg_dim(d, rank): - return d - rank if d > 0 else d + return True @staticmethod def _is_supported_in_IR( @@ -75,6 +61,10 @@ def _is_supported_in_IR( return True + @staticmethod + def _to_pos_dim(d: int, rank: int): + return d + rank if d < 0 else d + @staticmethod def _normalize_and_to_channel_last_dim(dim: list[int], rank: int) -> list[int]: # convert negative index to positive diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/qdq_dequantize_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/qdq_dequantize_converter.py index c6ea7f90042..1d7c6b44627 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/qdq_dequantize_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/qdq_dequantize_converter.py @@ -2,6 +2,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from abc import ABC, abstractmethod import numpy as np @@ -19,7 +20,15 @@ from torch.nn import Parameter -class QDQDequantizeConverter(NodeConverter): +class QDQDequantizeConverterBase(NodeConverter, ABC): + + @abstractmethod + def get_zero_point(self, node: Node) -> np.ndarray: + pass + + @abstractmethod + def get_scale(self, node: Node) -> np.ndarray: + pass @staticmethod def _is_supported_in_IR( @@ -27,7 +36,7 @@ def _is_supported_in_IR( parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - zero_point_type = torch_type_to_numpy_type(node.args[5]) + zero_point_type = torch_type_to_numpy_type(node.args[-1]) if "cluster" not in node.meta or zero_point_type not in [np.int8, np.int32]: return False @@ -39,10 +48,8 @@ def convert(self, node: Node): from_tensor = self.builder.tensor_for_name(node.name) to_tensor = self.builder.tensor_for_name(node.args[0].name) - zero_point_type = torch_type_to_numpy_type(node.args[5]) - - scale = np.array(node.args[1], dtype=np.float32) - zero_point = np.array(node.args[2], dtype=zero_point_type) + scale = self.get_scale(node) + zero_point = self.get_zero_point(node) if self.context.parameters_mapping.get(node.args[0].name, None) is None: # Convert dequantize as identity op (Transpose that will be removed) because @@ -63,3 +70,22 @@ def convert(self, node: Node): # Change type so we pass check tensor similarity check when redirecting from_tensor.type = to_tensor.type self.builder.redirect_tensor(from_tensor, to_tensor) + + +class QDQPerTensorDequantizeConverter(QDQDequantizeConverterBase): + + def get_zero_point(self, node: Node) -> np.ndarray: + zero_point_type = torch_type_to_numpy_type(node.args[5]) + return np.array(node.args[2], dtype=zero_point_type) + + def get_scale(self, node: Node) -> np.ndarray: + return np.array(node.args[1], dtype=np.float32) + + +class QDQPerChannelDequantizeConverter(QDQDequantizeConverterBase): + + def get_zero_point(self, node: Node) -> np.ndarray: 
+ return self.context.parameters_mapping[node.args[2].name].numpy() + + def get_scale(self, node: Node) -> np.ndarray: + return self.context.parameters_mapping[node.args[1].name].numpy() diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py index aa74c78ca24..5e4404d8476 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py @@ -7,13 +7,11 @@ CustomDelegationOptions, ) from executorch.backends.nxp.backend.edge_helper import input_rank -from executorch.backends.nxp.backend.ir.converter.node_converter import ( - NodeConverter, - Target, -) +from executorch.backends.nxp.backend.ir.converter.node_converter import NodeConverter from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import ( softmax_options, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -22,18 +20,11 @@ class SoftmaxConverter(NodeConverter): @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - match target: - case Target.RT700: - # The eIQ Neutron NPU runtime software has a known issue with the SoftMax operation. - # As long as the issue is present, return False for the i.MX RT700 target also. 
- return False - - case _: - return False + return False @staticmethod def _is_supported_in_IR( diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py new file mode 100644 index 00000000000..e9522c87114 --- /dev/null +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py @@ -0,0 +1,59 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.backends.nxp.backend.ir.converter.conversion.common import ( + node_uses_shape_broadcasting, +) +from executorch.backends.nxp.backend.ir.converter.node_converter import ( + CustomDelegationOptions, + NodeConverter, +) +from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import ( + sub_options, +) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec +from torch.fx import Node +from torch.nn import Parameter + + +class SubTensorConverter(NodeConverter): + @staticmethod + def _is_supported_on_target( + node: Node, + neutron_target_spec: NeutronTargetSpec, + parameters_mapping: dict[str, Parameter], + custom_delegation_options: CustomDelegationOptions, + ) -> bool: + if node_uses_shape_broadcasting(node): + # Shape broadcasting may require the addition of `Transpose` ops during conversion. + return False + + return True + + @staticmethod + def _is_supported_in_IR( + node: Node, + parameters_mapping: dict[str, Parameter], + custom_delegation_options: CustomDelegationOptions, + ) -> bool: + if len(node.args) != 2: + return False + + # The `alpha` attribute can be represented by adding an extra `Mul` operator. + # However, this is not implemented as `alpha` is rarely used. 
+ if hasattr(node.kwargs, "alpha"): + return False + + return True + + # sub.Tensor Node format: (Tensor self, Tensor other, *, Scalar alpha=1) + def convert(self, node: Node): + """Convert 'sub_tensor' operator to NeutronIR 'Sub'.""" + self.assert_convertible(node) + + t_op = self._create_tflite_op_with_io_tensors(node) + + t_op.builtin_options = sub_options.Sub() + self.builder.append_operators([t_op]) diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/view_copy_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/view_copy_converter.py index 95a42d5d078..22eff3ebb5f 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/view_copy_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/view_copy_converter.py @@ -14,6 +14,7 @@ from executorch.backends.nxp.backend.ir.converter.conversion.common import OpsList from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, + is_not_qdq_node, NodeConverter, ) from executorch.backends.nxp.backend.ir.converter.node_converters.shared.reshape_transposition import ( @@ -23,6 +24,7 @@ reshape_options, ) from torch.fx import Node +from torch.fx.passes.infra.partitioner import Partition from torch.nn import Parameter @@ -45,6 +47,27 @@ def _is_supported_in_IR( return True + @classmethod + def supports_partitioning_result( + cls, + node: Node, + partition_list: list[Partition], + custom_delegation_options: CustomDelegationOptions, + ): + view_copy_partitions = [ + partition for partition in partition_list if node in partition.nodes + ] + assert len(view_copy_partitions) == 1 + non_q_dq_partition_nodes = list( + filter(is_not_qdq_node, view_copy_partitions[0].nodes) + ) + + if len(non_q_dq_partition_nodes) == 1: + # The `view_copy` cannot be the only node in a partition. 
+ return False + + return True + @staticmethod def _safe_compute_flat_size(shape: list[int | str]) -> int: """Compute the flat size of a tensor with given shape. Strings and negative dimensions are treated as '1'. diff --git a/backends/nxp/backend/ir/converter/node_converters/shared/recurrent_utils.py b/backends/nxp/backend/ir/converter/node_converters/shared/recurrent_utils.py index 50b9aef6d18..52b895d60cd 100755 --- a/backends/nxp/backend/ir/converter/node_converters/shared/recurrent_utils.py +++ b/backends/nxp/backend/ir/converter/node_converters/shared/recurrent_utils.py @@ -1,19 +1,12 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from executorch.backends.nxp.backend.ir import logger from executorch.backends.nxp.backend.ir.converter.builder import model_builder from executorch.backends.nxp.backend.ir.converter.conversion import translator -from executorch.backends.nxp.backend.ir.converter.conversion.common import ( - OpsList, - try_get_input, -) +from executorch.backends.nxp.backend.ir.converter.conversion.common import OpsList from executorch.backends.nxp.backend.ir.converter.tensor_utils import tensor_has_data -from executorch.backends.nxp.backend.ir.lib.tflite.ActivationFunctionType import ( - ActivationFunctionType, -) from executorch.backends.nxp.backend.ir.tensor_formatting import TensorFormat from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model @@ -25,12 +18,12 @@ def ensure_correct_tensor_formatting( or RNN operator. The LSTM/RNN may be using channels last tensors, because of the surrounding operators. LSTM/RNN requires its own - format, however I think the input tensors should be marked as 'FORMATLESS', because the main inputs of TFLite - and ONNX version of the operators have the same shape. 
+ format, however I think the input tensors should be marked as 'FORMATLESS', because the main inputs of the + NeutronIR and the ExecuTorch version of the operators have the same shape. I believe that the cleanest and most robust way to solve this, is to mark LSTM/RNN as an operator which can change the formats of its tensors, and solve any format related issues in this module. - :param t_op: TFLite operator with inputs and outputs corresponding to the ONNX LSTM/RNN operator. + :param t_op: NeutronIR operator with inputs and outputs corresponding to the ExecuTorch LSTM/RNN operator. :param builder: ModelBuilder object. :param ops: OpsList object, with operators to add to the model. May already contain some operators. """ @@ -69,44 +62,3 @@ def ensure_correct_tensor_formatting( ops.post_ops.append(transpose) t_op.tmp_outputs[idx].tensor_format = TensorFormat.FORMATLESS - - -def get_activation_function_for_name( - name: str, op_type: str = "LSTM" -) -> ActivationFunctionType: - get_activation_function_for_name.map = { - "Tanh": ActivationFunctionType.TANH, - "Relu": ActivationFunctionType.RELU, - } - - if act_fun := get_activation_function_for_name.map.get(name, None): - return act_fun - - # Couldn't find a corresponding activation function - logger.e( - logger.Code.CONVERSION_IMPOSSIBLE, - f"Conversion of ONNX {op_type} with activation function '{name}' is not possible.", - ) - - -def check_sequence_lens( - t_op: tflite_model.Operator, seq_length: int, op_type: str = "LSTM" -): - """Check if the 'sequence_lens' operand of ONNX LSTM/RNN has an effect. If it does, exit with error. - - :param t_op: TFLite operator with inputs and outputs corresponding to the ONNX operator. - :param seq_length: The first dimension of the main LSTM input. - :param op_type: Operator type of 't_op'. Used only for printing a specific error message. - """ - if sequence_lens := try_get_input(t_op, 4): - # 'sequence_lens' allows each sequence to have a different length. 
As far as I can tell, TFLite doesn't support - # this. - if (not tensor_has_data(sequence_lens)) or any( - elt != seq_length for elt in sequence_lens.tmp_buffer.data - ): - # The 'sequence_lens' is either dynamic, or static with at least one value different from 'seq_length'. - # Conversion most likely impossible. - logger.e( - logger.Code.CONVERSION_IMPOSSIBLE, - f"Conversion of ONNX {op_type} with 'sequence_lens' input is not possible.", - ) diff --git a/backends/nxp/backend/ir/converter/node_converters/shared/reduce_utils.py b/backends/nxp/backend/ir/converter/node_converters/shared/reduce_utils.py index 1dca3acea74..da92e359f1e 100755 --- a/backends/nxp/backend/ir/converter/node_converters/shared/reduce_utils.py +++ b/backends/nxp/backend/ir/converter/node_converters/shared/reduce_utils.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. import numpy as np + from executorch.backends.nxp.backend.ir.converter.builder.model_builder import ( ModelBuilder, ) @@ -16,7 +17,7 @@ def convert_axes_from_attribute( t_op: tflite_model.Operator, builder: ModelBuilder, axes: list[int] | None ): - """Create an `axes` tensor and assign it as an input to the `t_op`, which is expected to represent an ONNX + """Create an `axes` tensor and assign it as an input to the `t_op`, which is expected to represent an ExecuTorch reduction operator. 
""" x = t_op.tmp_inputs[0] @@ -52,15 +53,15 @@ def ensure_reduce_transposition(builder, ops: OpsList): output_format = output_tensor.tensor_format if input_format.is_channels_last() and output_format.is_channels_last(): - to_onnx_perm = translator.create_channels_last_to_channels_first_permutation( - input_rank + to_executorch_perm = ( + translator.create_channels_last_to_channels_first_permutation(input_rank) ) to_tflite_perm = translator.create_channels_first_to_channels_last_permutation( output_rank, return_list=True ) transpose_before = builder.create_transpose_operator_before( - t_op, 0, to_onnx_perm + t_op, 0, to_executorch_perm ) transpose_before.tmp_outputs[0].tensor_format = TensorFormat.CHANNELS_FIRST ops.add_pre(transpose_before) @@ -72,7 +73,7 @@ def ensure_reduce_transposition(builder, ops: OpsList): ops.post_ops.insert(0, transpose_after) elif input_format.is_channels_last() and not output_format.is_channels_last(): - # The dimensions of the tensor lose their meaning! Insert a transpose op, to change input to match ONNX. + # The dimensions of the tensor lose their meaning! Insert a transpose op, to change input to match ExecuTorch. permutation = list( translator.create_channels_last_to_channels_first_permutation(input_rank) @@ -83,9 +84,9 @@ def ensure_reduce_transposition(builder, ops: OpsList): ops.add_pre(transpose) elif not input_format.is_channels_last() and output_format.is_channels_last(): - # The ReduceX introduces format to the tensor - # The ONNX ReduceX outputs a 'channels first' tensor. This has to stay the same, and then a Transpose operator - # must be added, to change the tensor to 'channels last'. + # The reduction operator introduces format to the tensor. + # The ExecuTorch reduction operator outputs a 'channels first' tensor. This has to stay the same, and then a + # Transpose operator must be added, to change the tensor to 'channels last'. 
permutation = list( translator.create_channels_first_to_channels_last_permutation(output_rank) diff --git a/backends/nxp/backend/ir/converter/node_converters/shared/reshape_transposition.py b/backends/nxp/backend/ir/converter/node_converters/shared/reshape_transposition.py index 0e55c27684b..55056614684 100755 --- a/backends/nxp/backend/ir/converter/node_converters/shared/reshape_transposition.py +++ b/backends/nxp/backend/ir/converter/node_converters/shared/reshape_transposition.py @@ -1,4 +1,4 @@ -# Copyright 2023 NXP +# Copyright 2023-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -158,7 +158,7 @@ def ensure_reshape_transposition(builder, ops: OpsList) -> list[int]: new_shape = output_tensor.shape.vector if input_format.is_channels_last() and not output_format.is_channels_last(): - # The dimensions of the tensor lose their meaning! Insert a transpose op, to change input to match ONNX. + # The dimensions of the tensor lose their meaning! Insert a transpose op, to change input to match ExecuTorch. permutation = list( translator.create_channels_last_to_channels_first_permutation(input_rank) @@ -170,7 +170,7 @@ def ensure_reshape_transposition(builder, ops: OpsList) -> list[int]: elif not input_format.is_channels_last() and output_format.is_channels_last(): # The Reshape introduces format to the tensor (2D -> 4D for example) - # The ONNX Reshape outputs a 'channels first' tensor. This has to stay the same, and then a Transpose operator + # The `view_copy` outputs a 'channels first' tensor. This has to stay the same, and then a Transpose operator # must be added, to change the tensor to 'channels last'. 
permutation = list( diff --git a/backends/nxp/backend/ir/converter/quantization_utils.py b/backends/nxp/backend/ir/converter/quantization_utils.py index d9e7674d953..11de4eec13c 100755 --- a/backends/nxp/backend/ir/converter/quantization_utils.py +++ b/backends/nxp/backend/ir/converter/quantization_utils.py @@ -1,111 +1,19 @@ -# Copyright 2023 NXP +# Copyright 2023-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import copy -from typing import Iterable, List, Optional - -import executorch.backends.nxp.backend.ir.converter.builder.model_builder as model_builder +from typing import List import numpy as np + from executorch.backends.nxp.backend.ir import logger as logger -from executorch.backends.nxp.backend.ir.converter.conversion.translator import ( - tf_lite_type_to_numpy, -) -from executorch.backends.nxp.backend.ir.lib.tflite import TensorType as tflTensorType -from executorch.backends.nxp.backend.ir.lib.tflite.TensorType import TensorType from executorch.backends.nxp.backend.ir.tflite_generator import ( tflite_model as tflite_model, ) -def quantization_is_equal( - x_scale: np.ndarray, - x_zp: np.ndarray, - x_type: TensorType, - y_scale: np.ndarray, - y_zp: np.ndarray, - y_type: TensorType, -) -> bool: - """Determine if provided quantization parameters of tensors 'x' and 'y' are the same. - - :param x_scale: Scale of the 'x' tensor. - :param x_zp: Zero point of the 'x' tensor. - :param x_type: TFLite data type of the 'x' tensor. - :param y_scale: Scale of the 'y' tensor. - :param y_zp: Zero point of the 'y' tensor. - :param y_type: TFLite data type of the 'y' tensor. - :return: True, if the quantization parameters are equal. 
- """ - if x_type != y_type: - return False - - if not (x_scale.size == x_zp.size == y_scale.size == y_zp.size): - return False - - x_scale, x_zp = quantization_params_to_lists(x_scale, x_zp) - y_scale, y_zp = quantization_params_to_lists(y_scale, y_zp) - - return all( - x_s == y_s and x_z == y_z - for x_s, y_s, x_z, y_z in zip(x_scale, y_scale, x_zp, y_zp) - ) - - -def quantization_params_to_lists( - scale: np.ndarray, zero_point: np.ndarray -) -> (List[float], List[int]): - if (scale is None) or (zero_point is None): - logger.e( - logger.Code.INTERNAL_ERROR, - "Missing zero_point and/or scale quantization params when converting to list!", - ) - - if (scale.size == 1) and (zero_point.size == 1): - # Per tensor quantization - scale = [scale.item()] - zero_point = [zero_point.item()] - elif (scale.size != 1) and (zero_point.size != 1): - # Per channel quantization - scale = scale.tolist() - zero_point = zero_point.tolist() - else: - logger.e( - logger.Code.CONVERSION_IMPOSSIBLE, - "TFLite doesn't support combination of per-channel and per-tensor quantization params.", - ) - - return scale, zero_point - - -def is_quantization_valid(scale, zero_point): - return scale.size == zero_point.size - - -def is_per_tensor_quantized(scale, zero_point): - return (scale.size == 1) and (zero_point.size == 1) - - -def is_per_channel_quantized(scale, zero_point): - return is_quantization_valid(scale, zero_point) and not is_per_tensor_quantized( - scale, zero_point - ) - - -def get_symmetric_zero_point_for_type(tensor_type: TensorType): - match tensor_type: - case TensorType.INT8: - return 0 - case TensorType.UINT8: - return 128 - case _: - logger.e( - logger.Code.INTERNAL_ERROR, - f"Attempt to get zero point definition for type: {tensor_type}", - ) - - def _validate_or_set_quant_params( tensor: tflite_model.Tensor, quant: tflite_model.Quantization ) -> bool: @@ -130,7 +38,7 @@ def propagate_quantization( """ Propagates quantization parameters from from_tensor to to_tensor. 
If to_tensor already has the params set checks the consistency. - :raises: logger.Error - INVALID_ONNX_MODEL + :raises: logger.Error - INVALID_INPUT_MODEL """ if ( @@ -147,7 +55,7 @@ def propagate_quantization( # noinspection PyTypeChecker if not _validate_or_set_quant_params(to_tensor, from_tensor.quantization): logger.e( - logger.Code.INVALID_ONNX_MODEL, + logger.Code.INVALID_INPUT_MODEL, f'Mismatched quantization parameters between tensors "{from_tensor.name}" and "{to_tensor.name}"', ) @@ -161,16 +69,16 @@ def set_quantization_parameters_to_tensor( """Create a TFLite QuantizationParameters object, initialize it from given parameters and add it to the 'tflite_tensor'. :param tflite_tensor: The TFLite tensor in the model, to add the quantization to. - :param scale: The data of the tensor, which is an input of a quantized ONNX operator and represents the + :param scale: The data of the tensor, which is an input of a quantized ExecuTorch operator and represents the quantization scale. - :param zero_point: The data of the tensor, which is an input of a quantized ONNX operator and represents the + :param zero_point: The data of the tensor, which is an input of a quantized ExecuTorch operator and represents the quantization zero point. :param quantized_dimension: The quantized dimension attribute of TFLite QuantizationParameters. """ if (scale is None) or (zero_point is None): logger.e( logger.Code.NOT_IMPLEMENTED, - "Conversion of ONNX quantized operators is only supported when " + "Conversion of ExecuTorch quantized operators is only supported when " "the quantization parameters are static!", ) @@ -184,8 +92,8 @@ def set_quantization_parameters_to_tensor( if scale.size != zero_point.size: logger.e( - logger.Code.INVALID_ONNX_MODEL, - f"The per channel quantization parameters of ONNX tensor " + logger.Code.INVALID_INPUT_MODEL, + f"The per channel quantization parameters of ExecuTorch tensor " f"'{tflite_tensor.name}' are of different sizes! 
('{scale.size}'" f" != '{zero_point.size}')", ) @@ -193,8 +101,8 @@ def set_quantization_parameters_to_tensor( quantized_dimension_size = tflite_tensor.shape.get(quantized_dimension) if scale.size != quantized_dimension_size: logger.e( - logger.Code.INVALID_ONNX_MODEL, - f"The ONNX per channel quantization parameter vectors do not " + logger.Code.INVALID_INPUT_MODEL, + f"The ExecuTorch per channel quantization parameter vectors do not " f"match the size of the quantized dimension! ('{scale.size}' != " f"'{quantized_dimension_size}')", ) @@ -205,8 +113,8 @@ def set_quantization_parameters_to_tensor( else: # Combination of per tensor and per channel quantization parameters logger.e( - logger.Code.INVALID_ONNX_MODEL, - f"ONNX tensor '{tflite_tensor.name}' uses a combination of per " + logger.Code.INVALID_INPUT_MODEL, + f"ExecuTorch node '{tflite_tensor.name}' uses a combination of per " f"tensor and per channel quantization parameters. Conversion to " f"TFLite is not possible!", ) @@ -218,33 +126,12 @@ def set_quantization_parameters_to_tensor( ) if not _validate_or_set_quant_params(tflite_tensor, quant): logger.e( - logger.Code.INVALID_ONNX_MODEL, + logger.Code.INVALID_INPUT_MODEL, f'Mismatched quantization parameters between tensors: "{tflite_tensor.name}" already ' f"has the quantization params set", ) -def calculate_uint_to_int_re_quantization_zero_point( - data_type_byte_size: int, old_zero_point: Iterable[int] -) -> np.ndarray: - """ - Calculate the new zero points, after a quantized tensor with an unsigned int data type is re-quantized to - a signed type. - :param data_type_byte_size: Size of the data type that is used, in Bytes. For example 1 for INT8. - :param old_zero_point: The zero point quantisation parameter, of the original data, before re-quantization. - :return: The new zero point quantisation parameter, after re-quantization. 
- """ - data_type_bit_size = 8 * data_type_byte_size - zero_point_shift = 2 ** (data_type_bit_size - 1) - return np.asarray(np.subtract(np.array(old_zero_point, np.int32), zero_point_shift)) - - -def _re_quantize_uint8_to_int8(tensor_data: np.ndarray) -> np.ndarray: - """Re-quantize static uint8 data to int8.""" - int16_data = np.asarray(tensor_data, np.int16) - return np.array(int16_data - 128, np.int8) - - def quantize_int8( data: np.ndarray, scale: List[float], zero_point: List[int] ) -> np.ndarray: @@ -252,20 +139,6 @@ def quantize_int8( return np.clip(new_data, -128, 127).astype(np.int8) -def quantize_uint8( - data: np.ndarray, scale: List[float], zero_point: List[int] -) -> np.ndarray: - new_data = np.add(np.round(np.divide(data, scale)), zero_point) - return np.clip(new_data, 0, 255).astype(np.uint8) - - -def quantize_int32( - data: np.ndarray, scale: List[float], zero_point: List[int] -) -> np.ndarray: - new_data = np.add(np.round(np.divide(data, scale)), zero_point) - return np.clip(new_data, -2_147_483_648, 2_147_483_648).astype(np.int32) - - def dequantize( data: np.ndarray, scale: List[float], zero_point: List[int] ) -> np.ndarray: @@ -274,211 +147,3 @@ def dequantize( scale, dtype=np.float32, ) - - -def re_quantize_static_tensor( - builder: "model_builder.ModelBuilder", - tflite_tensor: tflite_model.Tensor, - to_type: tflTensorType.TensorType, - new_scale: Optional[List[float]] = None, - new_zero_point: Optional[List[int]] = None, -) -> tflite_model.Tensor: - """Create a new TFLite Tensor with new quantization parameters, type and data. - - :param builder: A ModelBuilder instance. - :param tflite_tensor: TFLite tensor to re-quantize. - :param to_type: The TFLite TensorType, that the tensor will be re-quantized to. - :param new_scale: New scale quantization parameter. Used only when re-quantizing to the same type. - :param new_zero_point: New zero point quantization parameter. Used only when re-quantizing to the same type. 
- :return: A new re-quantized tensor. - """ - if tflite_tensor.quantization is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "translator.re_quantize_static_tensor(): Got tensor without quantization!", - ) - - if tflite_tensor.tmp_buffer.data is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "translator.re_quantize_static_tensor(): Got tensor without static data!", - ) - - new_dtype = tf_lite_type_to_numpy(to_type) - re_quantized_tensor = builder.duplicate_tensor(tflite_tensor) - tensor_data = re_quantized_tensor.tmp_buffer.data - - if tensor_data.dtype == np.uint8 and new_dtype == np.int8: # INT8 -> UINT8 - re_quantized_tensor.tmp_buffer.data = _re_quantize_uint8_to_int8(tensor_data) - re_quantized_tensor.type = tflTensorType.TensorType.INT8 - calculated_zero_point = calculate_uint_to_int_re_quantization_zero_point( - 1, re_quantized_tensor.quantization.zero_point.vector - ) - re_quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - list(calculated_zero_point) - ) - - elif tensor_data.dtype == np.int32 and new_dtype == np.int8: # INT32 -> INT8 - if new_zero_point is None or new_scale is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "Missing new zero_point or new scale when re-quantizing tensor.", - ) - - old_zp = re_quantized_tensor.quantization.zero_point.vector - old_scale = re_quantized_tensor.quantization.scale.vector - float_data = dequantize(tensor_data, old_scale, old_zp) - int8_data = quantize_int8(float_data, new_scale, new_zero_point) - - re_quantized_tensor.tmp_buffer.data = int8_data - re_quantized_tensor.type = tflTensorType.TensorType.INT8 - re_quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - list(new_zero_point) - ) - re_quantized_tensor.quantization.scale = tflite_model.Scale(list(new_scale)) - - elif tensor_data.dtype == np.int32 and new_dtype == np.uint8: # INT32 -> UINT8 - if new_zero_point is None or new_scale is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "Missing new zero_point or new scale 
when re-quantizing tensor.", - ) - - old_zp = re_quantized_tensor.quantization.zero_point.vector - old_scale = re_quantized_tensor.quantization.scale.vector - float_data = dequantize(tensor_data, old_scale, old_zp) - uint8_data = quantize_uint8(float_data, new_scale, new_zero_point) - - re_quantized_tensor.tmp_buffer.data = uint8_data - re_quantized_tensor.type = tflTensorType.TensorType.UINT8 - re_quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - list(new_zero_point) - ) - re_quantized_tensor.quantization.scale = tflite_model.Scale(list(new_scale)) - - elif tensor_data.dtype == np.int8 and new_dtype == np.int8: # INT8 -> INT8 - # Re-quantizing int8 tensor data with different quantization parameters - if new_zero_point is None or new_scale is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "Missing new zero_point or new scale when re-quantizing tensor.", - ) - - zero_point_data = re_quantized_tensor.quantization.zero_point.vector - scale_data = re_quantized_tensor.quantization.scale.vector - new_tensor_data = dequantize(tensor_data, scale_data, zero_point_data) - - re_quantized_tensor.tmp_buffer.data = quantize_int8( - new_tensor_data, new_scale, new_zero_point - ) - re_quantized_tensor.quantization.scale = tflite_model.Scale(new_scale) - re_quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - new_zero_point - ) - - elif tensor_data.dtype == np.int32 and new_dtype == np.int32: # INT32 -> INT32 - if new_zero_point is None or new_scale is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "Missing new zero_point or new scale when re-quantizing tensor.", - ) - - old_zp = re_quantized_tensor.quantization.zero_point.vector - old_scale = re_quantized_tensor.quantization.scale.vector - float_data = dequantize(tensor_data, old_scale, old_zp) - int32_data = quantize_int32(float_data, new_scale, new_zero_point) - - re_quantized_tensor.tmp_buffer.data = int32_data - re_quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - 
list(new_zero_point) - ) - re_quantized_tensor.quantization.scale = tflite_model.Scale(list(new_scale)) - - else: - logger.e( - logger.Code.NOT_IMPLEMENTED, - f"Re-quantization of static tensors from type '{tensor_data.dtype}' " - f"to type '{to_type}' is not yet implemented!", - ) - - return re_quantized_tensor - - -def quantize_static_float_tensor( - builder: "model_builder.ModelBuilder", - tflite_tensor: tflite_model.Tensor, - to_type: tflTensorType.TensorType, - scale: List[float], - zero_point: List[int], - quantized_dimension: int = 0, -) -> tflite_model.Tensor: - """Quantize tensor 'tflite_tensor' with passed quantization params. - - :param builder: A ModelBuilder instance. - :param tflite_tensor: TFLite tensor to quantize. - :param to_type: The TFLite TensorType, that the tensor will be quantized to. - :param scale: Scale quantization parameter. - :param zero_point: Zero point quantization parameter. - :param quantized_dimension: Quantized dimension. - """ - if tflite_tensor.quantization is not None: - logger.e(logger.Code.INTERNAL_ERROR, "Got tensor with quantization!") - - if tflite_tensor.tmp_buffer.data is None: - logger.e(logger.Code.INTERNAL_ERROR, "Got tensor without static data!") - - quantized_tensor = builder.duplicate_tensor(tflite_tensor) - tensor_data = quantized_tensor.tmp_buffer.data - - if zero_point is None or scale is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "Missing new zero_point or new scale when quantizing tensor.", - ) - - new_dtype = tf_lite_type_to_numpy(to_type) - - if tensor_data.dtype == np.float32 and new_dtype == np.int8: - int8_data = quantize_int8(tensor_data, scale, zero_point) - - quantized_tensor.tmp_buffer.data = int8_data - quantized_tensor.type = tflTensorType.TensorType.INT8 - quantized_tensor.quantization = tflite_model.Quantization() - quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - list(zero_point) - ) - quantized_tensor.quantization.scale = tflite_model.Scale(list(scale)) - 
quantized_tensor.quantization.quantized_dimension = quantized_dimension - - elif tensor_data.dtype == np.float32 and new_dtype == np.uint8: - uint8_data = quantize_uint8(tensor_data, scale, zero_point) - - quantized_tensor.tmp_buffer.data = uint8_data - quantized_tensor.type = tflTensorType.TensorType.UINT8 - quantized_tensor.quantization = tflite_model.Quantization() - quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - list(zero_point) - ) - quantized_tensor.quantization.scale = tflite_model.Scale(list(scale)) - quantized_tensor.quantization.quantized_dimension = quantized_dimension - - elif tensor_data.dtype == np.float32 and new_dtype == np.int32: - int32_data = quantize_int32(tensor_data, scale, zero_point) - - quantized_tensor.tmp_buffer.data = int32_data - quantized_tensor.type = tflTensorType.TensorType.INT32 - quantized_tensor.quantization = tflite_model.Quantization() - quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - list(zero_point) - ) - quantized_tensor.quantization.scale = tflite_model.Scale(list(scale)) - quantized_tensor.quantization.quantized_dimension = quantized_dimension - - else: - logger.e( - logger.Code.NOT_IMPLEMENTED, - f"Quantization of static tensors from type '{tensor_data.dtype}' " - f"to type '{to_type}' is not yet implemented!", - ) - - return quantized_tensor diff --git a/backends/nxp/backend/ir/logger.py b/backends/nxp/backend/ir/logger.py index ce8da2a31df..8019fb4d780 100644 --- a/backends/nxp/backend/ir/logger.py +++ b/backends/nxp/backend/ir/logger.py @@ -1,6 +1,6 @@ # # Copyright 2023 Martin Pavella -# Copyright 2023 NXP +# Copyright 2023-2025 NXP # # License: MIT # See the LICENSE_MIT for more details. @@ -85,18 +85,18 @@ class Code(Enum): PREPROCESSING_ERROR = 4 UNSUPPORTED_OPERATOR = 21 - UNSUPPORTED_ONNX_TYPE = 22 + # Code 22 was removed. 
UNSUPPORTED_OPERATOR_ATTRIBUTES = 23 NOT_IMPLEMENTED = 24 INVALID_TYPE = 31 INVALID_TENSOR_SHAPE = 32 - INVALID_ONNX_OPERATOR = 33 - INVALID_ONNX_OPERATOR_ATTRIBUTE = 34 - INVALID_ONNX_MODEL = 35 + # Code 33 was removed. + INVALID_OPERATOR_ATTRIBUTE = 34 + INVALID_INPUT_MODEL = 35 CONVERSION_IMPOSSIBLE = 41 - SHAPE_INFERENCE_ERROR = 42 + # Code 42 was removed. IO_PRESERVATION_ERROR = 43 INVALID_INPUT = 51 @@ -142,8 +142,6 @@ class BasicLoggingContext(LoggingContext): """ GLOBAL = LoggingContext("global") - SHAPE_INFERENCE = LoggingContext("shape_inference") - ONNX_PARSER = LoggingContext("onnx_parser") OPERATOR_CONVERSION = LoggingContext("operator_conversion") TFLITE_GENERATOR = LoggingContext("tflite_generator") QDQ_QUANTIZER = LoggingContext("qdq_quantizer") @@ -151,7 +149,7 @@ class BasicLoggingContext(LoggingContext): class NodeLoggingContext(LoggingContext): """ - ONNX node specific context. Logs reported within this context are related to node with index 'node_id'. + ExecuTorch node specific context. Logs reported within this context are related to node with index 'node_id'. """ def __init__(self, node_id): @@ -213,7 +211,7 @@ def _get_node_error(self, node_id: int, dict_item: str) -> Code | str | None: Return first error log item that belong to node with id 'node_id'. If no error is present None is returned instead. - :param node_id: ONNX node id. + :param node_id: ExecuTorch node id. :param dict_item: Dictionary item to return from `log` :return: Error code or None if there's no error related to node. """ @@ -230,7 +228,7 @@ def get_node_error_code(self, node_id: int) -> Code | None: Return first error code that belong to node with id 'node_id'. If no error is present None is returned instead. - :param node_id: ONNX node id. + :param node_id: ExecuTorch node id. :return: Error code or None if there's no error related to node. 
""" @@ -241,7 +239,7 @@ def get_node_error_message(self, node_id: int) -> str | None: Return first error message that belong to node with id 'node_id'. If no error is present None is returned instead. - :param node_id: ONNX node id + :param node_id: ExecuTorch node id :return: Error message or None if there is no error related to node. """ @@ -256,7 +254,7 @@ class loggingContext: Context manager used to nest logging contexts. Usage: with loggingContext(BasicLoggingContext.GLOBAL): - with loggingContext(BasicLoggingContext.ONNX_PARSER): + with loggingContext(BasicLoggingContext.OPERATOR_CONVERSION): logger.i("My log") # this log is automatically assigned to both parent contexts """ diff --git a/backends/nxp/backend/ir/tensor_formatting.py b/backends/nxp/backend/ir/tensor_formatting.py index aab22c3c368..db24576e81f 100644 --- a/backends/nxp/backend/ir/tensor_formatting.py +++ b/backends/nxp/backend/ir/tensor_formatting.py @@ -1,6 +1,5 @@ -# # Copyright 2023 Martin Pavella -# Copyright 2023-2024 NXP +# Copyright 2023-2025 NXP # # License: MIT # See the LICENSE_MIT for more details. @@ -26,7 +25,7 @@ class TensorFormat(Enum): TRANSPOSE_CONV_2D_WEIGHT_FORMAT = 13 # No special format (matrices, vectors, shapes etc.). All tensors with the FORMATLESS format MUST have EXACTLY - # the same shape and data in the TFLite model and in the ONNX model. + # the same shape and data in the NeutronIR model and in the ExecuTorch model. FORMATLESS = 20 NONE = 30 # Format has not been identified diff --git a/backends/nxp/backend/ir/tflite_generator/tflite_model.py b/backends/nxp/backend/ir/tflite_generator/tflite_model.py index a9384861178..76a50a2e177 100755 --- a/backends/nxp/backend/ir/tflite_generator/tflite_model.py +++ b/backends/nxp/backend/ir/tflite_generator/tflite_model.py @@ -1,6 +1,5 @@ -# # Copyright 2023 Martin Pavella -# Copyright 2023-2024 NXP +# Copyright 2023-2025 NXP # # License: MIT # See the LICENSE_MIT for more details. 
@@ -272,8 +271,7 @@ def is_per_tensor(self) -> bool: return False def gen_tflite(self, builder: fb.Builder): - # Sometimes 1D per-tensor quantized tensors can have quantized_dimension != 0 - # (residue from badly defined ONNX models). This would cause TFLite inference to crash. + # Sometimes 1D per-tensor quantized tensors can have quantized_dimension != 0. if not self.is_per_channel(): self.quantized_dimension = 0 @@ -513,7 +511,7 @@ class Operator(meta.TFLiteObject): tmp_outputs: List[Tensor] tmp_version: int # OperatorConverter uses this to assign the corresponding operator code with correct version. - # If `True`, this is an extra operator added during conversion. It was not present in the original ONNX model. + # If `True`, this is an extra operator added during conversion. It was not present in the original input model. tmp_added_extra: bool def __init__( diff --git a/backends/nxp/backend/ir/tflite_optimizer/operator_rules.py b/backends/nxp/backend/ir/tflite_optimizer/operator_rules.py index 253dc9c69a1..e861eff0d18 100755 --- a/backends/nxp/backend/ir/tflite_optimizer/operator_rules.py +++ b/backends/nxp/backend/ir/tflite_optimizer/operator_rules.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -100,23 +100,3 @@ def __call__( operator_is_type(preceding_op, self.single_preceding_op_type, builder) for preceding_op in preceding_ops ) - - -@dataclass -class WasNotInTheOriginalONNXModel(OpRule): - """Assures that this operator wasn't created by converting an ONNX operator from the original model, but instead - was added extra in order to convert a different operator. - - This rule is currently only satisfied for operators added by ModelBuilder methods `create_..._before()` and - `create_..._after()`. 
- """ - - def __call__( - self, - op: tflite_model.Operator, - tensor_map: NameToTensorMap, - input_to_ops_map: InputTensorToOpsMap, - output_to_op_map: OutputTensorToOpMap, - builder: "model_builder.ModelBuilder", - ) -> bool: - return op.tmp_added_extra diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/combine_hard_sigmoid_and_mul_to_hard_swish.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/combine_hard_sigmoid_and_mul_to_hard_swish.py deleted file mode 100755 index dddabfe87f1..00000000000 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/combine_hard_sigmoid_and_mul_to_hard_swish.py +++ /dev/null @@ -1,256 +0,0 @@ -# Copyright 2024 NXP -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from executorch.backends.nxp.backend.ir.lib.tflite.BuiltinOperator import ( - BuiltinOperator, -) -from executorch.backends.nxp.backend.ir.lib.tflite.TensorType import TensorType -from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model -from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.hard_swish_options import ( - HardSwish, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.base_optimization import ( - BaseOptimization, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.pattern_matcher import ( - OneOf, - Op, - PatternMatcher, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.tensor_rules import ( - RuleOr, - TensorHasNConsumers, - TensorHasStaticValue, - TensorHasType, - TensorsAreQuantized, - TensorsHaveOneConsumer, - TensorsHaveType, -) - - -class CombineHardSigmoidAndMulIntoHardSwish(BaseOptimization): - - def __call__(self) -> bool: - made_changes = self._combine_float_variant() - made_changes |= self._combine_quantized_variant() - - return made_changes - - def _combine_float_variant(self) -> bool: - """Fuse some operators in the following pattern. 
The ops `Mul`, `Add` `Minimum` and `Relu` compute the - `HardSigmoid` operation, as there is no `HardSigmoid` operator in TFLite. - - ┌─────┴─────┐ `x` - ┌──▼──┐ │ - 1/6 ──► Mul │ │ - └──┬──┘ │ - ┌──▼──┐ │ - 1/2 ──► Add │ │ │ - └──┬──┘ │ ┌─────▼─────┐ - ┌────▼────┐ │ ─────► │ HardSwish │ - 1 ──► Minimum │ │ └─────┬─────┘ - └────┬────┘ │ - ┌──▼───┐ │ - │ Relu │ │ - └──┬───┘ │ - └───┐ ┌───┘ - ┌▼───▼┐ - │ Mul │ - └──┬──┘ - """ - - matcher = PatternMatcher( - self._builder, - [ - Op(["Mul"], ["x", "alpha"], ["mul_o"]), - OneOf( - [ - Op(["Add"], ["mul_o", "beta"], ["add_o"]), - Op(["Add"], ["beta", "mul_o"], ["add_o"]), - ] - ), - OneOf( - [ - Op(["Minimum"], ["add_o", "one"], ["min_o"]), - Op(["Minimum"], ["one", "add_o"], ["min_o"]), - ] - ), - Op(["Relu"], ["min_o"], ["relu_o"]), - OneOf( - [ - Op(["Mul"], ["x", "relu_o"], ["y"]), - Op(["Mul"], ["relu_o", "x"], ["y"]), - ] - ), - ], - [ - TensorHasNConsumers("x", 2), - TensorsHaveOneConsumer(["mul_o", "add_o", "min_o", "relu_o"]), - TensorHasStaticValue("alpha", 1 / 6), - TensorHasStaticValue("beta", 0.5), - TensorHasStaticValue("one", 1), - # `HardSwishConverter` and `HardSigmoidConverter` both only support float32. - TensorHasType("x", TensorType.FLOAT32), - ], - ) - - # The mapped operator (value) will be inserted into the model later, at the position of the `key` operator. 
- to_add: dict[tflite_model.Operator, tflite_model.Operator] = {} - to_remove = [] - for pattern_ops, tensor_map, _, _ in matcher.match_patterns(): - x, y = tensor_map["x"], tensor_map["y"] - hard_swish = tflite_model.Operator( - builtin_options=HardSwish(), - opcode_index=self._builder.op_code_index_for_op_type( - BuiltinOperator.HARD_SWISH - ), - ) - hard_swish.tmp_inputs = [x] - hard_swish.tmp_outputs = [y] - - to_add[pattern_ops[0]] = hard_swish - - to_remove.extend(pattern_ops) - - ops = self._builder.get_operators() - for k, v in to_add.items(): - idx = ops.index(k) - ops.insert(idx, v) - - for op in to_remove: - ops.remove(op) - - return len(to_remove) != 0 - - def _combine_quantized_variant(self) -> bool: - """Fuse some operators in the following pattern. The ops `Mul`, `Add` `Minimum` and `Relu` compute the - `HardSigmoid` operation, as there is no `HardSigmoid` operator in TFLite. - - The following pattern arises from using the `onnx2quant` on a model with `HardSwish`. The quantizer always - runs a pre-processing step which splits the ONNX `HardSwish` into `HardSigmoid` and `Mul`. It seems like it - cannot be turned off. Therefore, we cannot add QDQ quantization of `HardSwish`. But since `HardSigmoid` - gets converted to multiple TFLite operators, we also cannot really add QDQ quantization for that operator. - This means that `HardSwish` will never get fully quantized by the `onnx2quant`, and the following pattern - will be created. - We can, however, convert the entire pattern into a quantized `HardSwish` using this optimization. 
- - │ (u)int8 `x` - ┌─────▼──────┐ - │ Dequantize │ - └─────┬──────┘ - ┌─────┴─────┐ float32 - ┌──▼──┐ │ - 1/6 ──► Mul │ │ - └──┬──┘ │ - ┌──▼──┐ │ - 1/2 ──► Add │ │ - └──┬──┘ │ - ┌────▼────┐ │ - 1 ──► Minimum │ │ │ (u)int8 `x` - └────┬────┘ │ ┌─────▼─────┐ - ┌──▼───┐ │ ─────► │ HardSwish │ - │ Relu │ │ └─────┬─────┘ - └──┬───┘ │ │ (u)int8 `y` - ┌────▼─────┐ │ - │ Quantize │ │ - └────┬─────┘ │ - ┌─────▼──────┐ │ - │ Dequantize │ │ - └─────┬──────┘ │ - └───┐ ┌───┘ - ┌▼───▼┐ - │ Mul │ - └──┬──┘ - │ float32 - ┌────▼─────┐ - │ Quantize │ - └────┬─────┘ - │ (u)int8 `y` - """ - matcher = PatternMatcher( - self._builder, - [ - Op(["Dequantize"], ["x"], ["deq1_o"]), - OneOf( - [ - Op(["Mul"], ["deq1_o", "alpha"], ["mul1_o"]), - Op(["Mul"], ["alpha", "deq1_o"], ["mul1_o"]), - ] - ), - OneOf( - [ - Op(["Add"], ["mul1_o", "beta"], ["add_o"]), - Op(["Add"], ["beta", "mul1_o"], ["add_o"]), - ] - ), - OneOf( - [ - Op(["Minimum"], ["add_o", "one"], ["min_o"]), - Op(["Minimum"], ["one", "add_o"], ["min_o"]), - ] - ), - Op(["Relu"], ["min_o"], ["relu_o"]), - Op(["Quantize"], ["relu_o"], ["quant1_o"]), - Op(["Dequantize"], ["quant1_o"], ["deq2_o"]), - OneOf( - [ - Op(["Mul"], ["deq1_o", "deq2_o"], ["mul2_o"]), - Op(["Mul"], ["deq2_o", "deq1_o"], ["mul2_o"]), - ] - ), - Op(["Quantize"], ["mul2_o"], ["y"]), - ], - [ - TensorHasNConsumers("deq1_o", 2), - TensorsHaveOneConsumer( - [ - "mul1_o", - "add_o", - "min_o", - "relu_o", - "quant1_o", - "deq2_o", - "mul2_o", - ] - ), - TensorHasStaticValue("alpha", 1 / 6), - TensorHasStaticValue("beta", 0.5), - TensorHasStaticValue("one", 1), - TensorHasType("deq1_o", TensorType.FLOAT32), - TensorsAreQuantized(["x", "y"]), - RuleOr( - TensorsHaveType(["x", "y"], TensorType.INT8), - TensorsHaveType(["x", "y"], TensorType.UINT8), - ), - ], - ) - - # The mapped operator (value) will be inserted into the model later, at the position of the `key` operator. 
- to_add: dict[tflite_model.Operator, tflite_model.Operator] = {} - to_remove = [] - for pattern_ops, tensor_map, _, _ in matcher.match_patterns(): - x, y = tensor_map["x"], tensor_map["y"] - hard_swish = tflite_model.Operator( - builtin_options=HardSwish(), - opcode_index=self._builder.op_code_index_for_op_type( - BuiltinOperator.HARD_SWISH - ), - ) - hard_swish.tmp_inputs = [x] - hard_swish.tmp_outputs = [y] - - to_add[pattern_ops[0]] = hard_swish - - to_remove.extend(pattern_ops) - - ops = self._builder.get_operators() - for k, v in to_add.items(): - idx = ops.index(k) - ops.insert(idx, v) - - for op in to_remove: - ops.remove(op) - - return len(to_remove) != 0 diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/fuse_fully_connected_and_add_operators.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/fuse_fully_connected_and_add_operators.py deleted file mode 100755 index b6fd5849551..00000000000 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/fuse_fully_connected_and_add_operators.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2024 NXP -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -from executorch.backends.nxp.backend.ir.lib.tflite.TensorType import TensorType -from executorch.backends.nxp.backend.ir.tflite_optimizer.operator_rules import ( - NoFusedActivationFunction, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.base_optimization import ( - BaseOptimization, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.pattern_matcher import ( - OneOf, - Op, - PatternMatcher, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.tensor_rules import ( - RuleAnd, - RuleIf, - RuleOr, - TensorDimensionsMatch, - TensorHasDimensionOfSize, - TensorHasOneConsumer, - TensorHasRank, - TensorHasType, - TensorIsQuantized, -) - - -class FuseFullyConnectedAndAddOperators(BaseOptimization): - - def __call__(self) -> bool: - """ - FullyConnected -> Add sequence can handle more complicated shapes than just FullyConnected with bias - (due to shape broadcasting). - The bias can have shape [N] or [1, N], where N is the first dimension of the FC weights tensor. - It could also have shape [1, ..., 1, N], but then the TFLite FullyConnected removes the leading ones, - even if 'keep_num_dims' is True. In ONNX, the output tensor has the leading ones, - In this case, a Reshape would have to be added, so we do not perform the fusion. - - # https://github.com/tensorflow/tensorflow/blob/v2.15.0/tensorflow/lite/kernels/fully_connected.cc#L398 - """ - matcher = PatternMatcher( - self._builder, - [ - # Require exactly 2 inputs. 
- Op( - ["FullyConnected"], ["x", "w"], ["y"], [NoFusedActivationFunction()] - ), - OneOf([Op(["Add"], ["y", "b"]), Op(["Add"], ["b", "y"])]), - ], - [ - TensorHasOneConsumer("y"), - TensorHasRank("w", 2), - RuleOr( - TensorHasRank("b", 1), - RuleAnd(TensorHasRank("b", 2), TensorHasDimensionOfSize("b", 0, 1)), - ), - TensorDimensionsMatch("w", 0, "b", -1), - RuleIf(TensorIsQuantized("x"), TensorHasType("b", TensorType.INT32)), - ], - ) - - to_remove = [] - for (fc, add), tensor_map, _, _ in matcher.match_patterns(): - b = tensor_map["b"] - fc.tmp_inputs.append(b) - - # Remove the 'Add' operator. - fc.tmp_outputs[0] = add.tmp_outputs[0] - fc.builtin_options.fused_activation_function = ( - add.builtin_options.fused_activation_function - ) - to_remove.append(add) - - for op in to_remove: - self._builder.get_operators().remove(op) - - return len(to_remove) != 0 diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/permute_fully_connected_weights_after_reshape.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/permute_fully_connected_weights_after_reshape.py index 42eefc1ab56..ef76fad90de 100755 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/permute_fully_connected_weights_after_reshape.py +++ b/backends/nxp/backend/ir/tflite_optimizer/optimizations/permute_fully_connected_weights_after_reshape.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -50,7 +50,7 @@ def __call__(self) -> bool: How it works: - The original model doesn't have the `Transpose`. It just has `Reshape` into `MatMul` (or `Gemm`...). - The `Transpose` is added, because the `Reshape` has a channels last input, which was originally - channels first (in the ONNX model), and so the 2D output of the `Reshape` would have the same data. 
+ channels first (in the ExecuTorch model), and so the 2D output of the `Reshape` would have the same data. but at different locations. The `Transpose` makes the input channels first, which ensures correct output of the `Reshape`. - In the scenario in the graph above, it is possible to omit the `Transpose`, which causes the `Reshape` @@ -85,12 +85,12 @@ def __call__(self) -> bool: for (transpose, reshape, fc), tensor_map, _, _ in matcher.match_patterns(): # Make sure the `Transpose` is applying the expected permutation. y = tensor_map["y"] - to_onnx_perm = ( + to_executorch_perm = ( translator.create_channels_last_to_channels_first_permutation( y.shape.len() ) ) - if not np.allclose(to_onnx_perm, tensor_map["perm"].tmp_buffer.data): + if not np.allclose(to_executorch_perm, tensor_map["perm"].tmp_buffer.data): continue # The `Transpose` has an unexpected permutation. w = tensor_map["w"] diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py index dc9ad9999b4..0be46efcaa8 100755 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py +++ b/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py index d4a097ca76d..69b75b72cdd 100755 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py +++ b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py @@ -11,15 +11,9 @@ from executorch.backends.nxp.backend.ir import logger from executorch.backends.nxp.backend.ir.conversion_config import ConversionConfig -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.combine_hard_sigmoid_and_mul_to_hard_swish import ( - CombineHardSigmoidAndMulIntoHardSwish, -) from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.fuse_activation_functions import ( FuseActivationFunctions, ) -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.fuse_fully_connected_and_add_operators import ( - FuseFullyConnectedAndAddOperators, -) from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.move_relu_before_concat import ( MoveActivationBeforeConcatenation, ) @@ -34,7 +28,6 @@ class Optimization(Enum): FUSE_ACTIVATION_FUNCTIONS = 1 - FUSE_FULLY_CONNECTED_AND_ADD = 2 FUSE_TRANSPOSE_OPERATORS = 5 REMOVE_IDENTITY_TRANSPOSE_OPERATORS = 6 @@ -42,7 +35,6 @@ class Optimization(Enum): PERMUTE_FULLY_CONNECTED_WEIGHTS_AFTER_RESHAPE = 12 MOVE_ACTIVATION_BEFORE_CONCAT = 15 - COMBINE_HARD_SIGMOID_AND_MUL_INTO_HARD_SWISH = 16 class Optimizer: @@ -75,9 +67,6 @@ def __init__( Optimization.FUSE_ACTIVATION_FUNCTIONS: FuseActivationFunctions( builder, conversion_config ), - Optimization.FUSE_FULLY_CONNECTED_AND_ADD: FuseFullyConnectedAndAddOperators( - builder, conversion_config - ), Optimization.FUSE_TRANSPOSE_OPERATORS: FuseTransposeOperators( builder, conversion_config ), @@ -90,9 +79,6 @@ def __init__( Optimization.MOVE_ACTIVATION_BEFORE_CONCAT: MoveActivationBeforeConcatenation( builder, conversion_config ), - Optimization.COMBINE_HARD_SIGMOID_AND_MUL_INTO_HARD_SWISH: CombineHardSigmoidAndMulIntoHardSwish( - 
builder, conversion_config - ), } def optimize( diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py index 2bc4380f89b..a6884a9ee24 100644 --- a/backends/nxp/backend/neutron_converter_manager.py +++ b/backends/nxp/backend/neutron_converter_manager.py @@ -7,8 +7,6 @@ import multiprocessing import pkgutil -from executorch.backends.nxp.backend.ir.converter.node_converter import Target - def convert_unsafe(neutron_converter, tflite_model, cctx, queue): """ @@ -27,16 +25,7 @@ class NeutronConverterManager: contains NeutronGraph nodes. """ - _supported_target_names = [Target.RT700.value] - - def convert( - self, tflite_model: bytes, target: str, neutron_converter_flavor: str - ) -> bytes: - # Neutron converter crashes if we provide invalid target -> verify. - if target not in self._supported_target_names: - raise RuntimeError( - f"Target '{target}' is not supported by NeutronConverterManager." - ) + def __init__(self, neutron_converter_flavor: str = "SDK_25_09"): neutron_converter_modules = [ module.name @@ -57,13 +46,34 @@ def convert( f"not found. Install 'neutron_converter_[flavor]' Python package." ) - neutron_converter = importlib.import_module( + self.neutron_converter = importlib.import_module( f"{requested_module_name}.neutron_converter" ) + self.neutron_library_utils = importlib.import_module( + f"{requested_module_name}.neutron_library_utils" + ) + + def get_converter(self): + return self.neutron_converter + + def get_library_utils(self): + return self.neutron_library_utils + + def verify_target(self, target: str): + if not self.neutron_library_utils.isNeutronTarget(target): + valid_targets = [ + target.name for target in self.neutron_library_utils.getNeutronTargets() + ] + raise ValueError( + f"Target `{target}` is not a valid target. Must be one of `{valid_targets}`." 
+ ) + + def convert(self, tflite_model: bytes, target: str) -> bytes: + # Neutron converter crashes if we provide invalid target -> verify. + self.verify_target(target) - cctx = neutron_converter.CompilationContext() - cctx.targetOpts = neutron_converter.getNeutronTarget(target) - # New switch since Neutron Converter SDK_25.06 + cctx = self.neutron_converter.CompilationContext() + cctx.targetOpts = self.neutron_converter.getNeutronTarget(target) cctx.compilationOpts.minNumOpsPerGraph = 1 logger = multiprocessing.log_to_stderr() @@ -71,7 +81,8 @@ def convert( queue = multiprocessing.Manager().Queue() process = multiprocessing.Process( - target=convert_unsafe, args=(neutron_converter, tflite_model, cctx, queue) + target=convert_unsafe, + args=(self.neutron_converter, tflite_model, cctx, queue), ) process.start() process.join() # waits until the subprocess is complete diff --git a/backends/nxp/backend/neutron_target_spec.py b/backends/nxp/backend/neutron_target_spec.py new file mode 100644 index 00000000000..44399982e29 --- /dev/null +++ b/backends/nxp/backend/neutron_target_spec.py @@ -0,0 +1,64 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Target Spec for the NXP Neutron NPU + +from enum import Enum + +from executorch.backends.nxp.backend.neutron_converter_manager import ( + NeutronConverterManager, +) + + +class NeutronHWVersion(Enum): + N1 = 1 + N3 = 2 + + +class NeutronTargetSpec: + """ + The functionality for probing the properties of Neutron Target. 
+ """ + + def __init__(self, target: str, neutron_converter_flavor: str): + + converter_manager = NeutronConverterManager(neutron_converter_flavor) + converter_manager.verify_target(target) + neutron_converter = converter_manager.get_converter() + self.neutron_target = neutron_converter.getNeutronTarget(target) + + if self.is_subsystem(): + raise ValueError( + f"Target `{target}` is not a neutron-C target. Only MCU targets are supported at the moment." + ) + + if self.get_hw_version() != NeutronHWVersion.N3: + raise ValueError( + f"Target `{target}` contains unsupported HW version. Only N3/N3+ targets are supported at the moment." + ) + + # Target name. + def get_name(self) -> str: + return self.neutron_target.name + + # Whether the target has subsystem (Neutron-S) or not (Neutron-C). + def is_subsystem(self) -> bool: + return self.neutron_target.subsystem + + # Number of compute units. + def get_num_units(self) -> int: + return self.neutron_target.numUnits + + # Number of compute pipelines. + def get_num_pipes(self) -> int: + return self.neutron_target.numPipes + + # Number of compute MACs. + def get_num_macs(self) -> int: + return self.neutron_target.numMacs + + # Neutron compute block hardware version. 
+ def get_hw_version(self) -> NeutronHWVersion: + return NeutronHWVersion(self.neutron_target.hwVersion) diff --git a/backends/nxp/edge_passes/neutron_edge_pass_manager.py b/backends/nxp/edge_passes/neutron_edge_pass_manager.py index ec46070ac31..5ce23138720 100644 --- a/backends/nxp/edge_passes/neutron_edge_pass_manager.py +++ b/backends/nxp/edge_passes/neutron_edge_pass_manager.py @@ -10,6 +10,10 @@ MoveTrailingAuxiliaryOperatorIntoSeparateQDQClusterPass, ) from executorch.backends.nxp.edge_passes.neutron_edge_pass import NeutronEdgePass + +from executorch.backends.nxp.edge_passes.remove_io_quant_ops_pass import ( + RemoveIOQuantOpsPass, +) from executorch.exir import EdgeProgramManager from executorch.exir.program._program import ( _get_updated_graph_signature, @@ -24,7 +28,9 @@ class NeutronEdgePassManager(PassManager): - def __init__(self, passes: list[NeutronEdgePass] = None): + def __init__( + self, passes: list[NeutronEdgePass] = None, remove_io_quant_ops: bool = False + ): passes: list[NeutronEdgePass] = passes or [ MoveLeadingAuxiliaryOperatorIntoSeparateQDQClusterPass(), MoveTrailingAuxiliaryOperatorIntoSeparateQDQClusterPass(), @@ -35,6 +41,8 @@ def __init__(self, passes: list[NeutronEdgePass] = None): steps=10, # Empirical value. At most 10 cycles of passes will be run. ) + self.remove_io_quant_ops = remove_io_quant_ops + def _transform_graph_module(self, module: nn.Module) -> PassResult: """Apply the passes to a single graph module.""" pass_result: PassResult = super().__call__(module) @@ -78,12 +86,17 @@ def __call__(self, epm: EdgeProgramManager) -> EdgeProgramManager: new_programs[name] = new_program - if len(new_programs) == 0: - # No passes were run, return the old EdgeProgramManager. - return epm + result = epm - else: - # Return a new EdgeProgramManager with the updated programs. - return EdgeProgramManager( + if len(new_programs) > 0: + # Use a new EdgeProgramManager with the updated programs if any update was performed. 
+ result = EdgeProgramManager( new_programs, copy.deepcopy(epm._config_methods), epm.compile_config ) + + if self.remove_io_quant_ops: + result = result.transform( + [RemoveIOQuantOpsPass(edge_program_manager=result)] + ) + + return result diff --git a/backends/nxp/neutron_partitioner.py b/backends/nxp/neutron_partitioner.py index 5bcdee0f8b6..965ad41309b 100644 --- a/backends/nxp/neutron_partitioner.py +++ b/backends/nxp/neutron_partitioner.py @@ -8,7 +8,7 @@ import logging import operator from dataclasses import dataclass -from typing import Dict, final, List, Mapping +from typing import final, Mapping import torch @@ -18,12 +18,13 @@ from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) -from executorch.backends.nxp.backend.ir.converter.node_converter import Target from torch.export.exported_program import ExportedProgram -from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner +from torch.fx import Graph +from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner, Partition from torch.fx.passes.operator_support import OperatorSupportBase from torch.nn import Parameter from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import * # noqa F403 +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.backends.nxp.nxp_backend import NeutronBackend from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.backend.partitioner import ( @@ -34,6 +35,9 @@ from executorch.exir.backend.utils import tag_constant_data from executorch.exir.dialects._ops import ops as exir_ops +NXP_DO_NOT_DELEGATE = "NXP_DO_NOT_DELEGATE" +NXP_DELEGATION_TAG = "delegation_tag" + class QDQClusterRecognizer: """ @@ -60,7 +64,7 @@ class QDQCluster: """ compute_node: torch.fx.Node - ops: List[torch.fx.Node] + ops: list[torch.fx.Node] QUANTIZE_OPERATORS = [ exir_ops.edge.quantized_decomposed.quantize_per_channel.default, @@ 
-93,7 +97,7 @@ def is_dequant_node(node: torch.fx.Node) -> bool: def is_auxiliary_node(node: torch.fx.Node) -> bool: return node.target in QDQClusterRecognizer.AUXILIARY_OPS - def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> List[torch.fx.Node]: + def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> list[torch.fx.Node]: """ Return the list of nodes representing the input part of the QDQ cluster of the node `node`. Those are various dequantization nodes (see DEQUANTIZE_OPERATORS) optionally followed by auxiliary @@ -121,7 +125,7 @@ def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> List[torch.fx.Node] logging.debug(f"Dequant Cluster for {node} is: {qdq_cluster}") return qdq_cluster - def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> List[torch.fx.Node]: + def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> list[torch.fx.Node]: """ Returns the list of nodes representing the output part of the QDQ cluster of the `node`. Those are various quantize nodes (see QUANTIZE_OPERATORS) preceded by auxiliary nodes. @@ -151,7 +155,7 @@ def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> List[torch.fx.Node logging.debug(f"Quant Cluster for {node} is {qdq_cluster}") return qdq_cluster - def get_qdq_cluster(self, node: torch.fx.Node) -> List[torch.fx.Node]: + def get_qdq_cluster(self, node: torch.fx.Node) -> list[torch.fx.Node]: """ Returns the QDQ cluster of the operator, if quantized. If operator is not quantized, returns empty list. 
""" @@ -163,7 +167,7 @@ def get_qdq_cluster(self, node: torch.fx.Node) -> List[torch.fx.Node]: else: return [] - def tag_nodes(self, nodes: List[torch.fx.Node], cluster_name: str) -> None: + def tag_nodes(self, nodes: list[torch.fx.Node], cluster_name: str) -> None: """ Tags a node and its related dequant and quant nodes with a specified cluster name """ @@ -171,7 +175,7 @@ def tag_nodes(self, nodes: List[torch.fx.Node], cluster_name: str) -> None: logging.info(f"Tagging node {node} as {cluster_name}") node.meta["cluster"] = cluster_name - def tag_qdq_clusters(self, nodes: List[torch.fx.Node]): + def tag_qdq_clusters(self, nodes: list[torch.fx.Node]): """ Identifies QDQ clusters and tag them based on compute operation inside. """ @@ -197,6 +201,7 @@ def tag_qdq_clusters(self, nodes: List[torch.fx.Node]): exir_ops.edge.aten.avg_pool2d.default: AvgPool2dConverter, # noqa F405 exir_ops.edge.aten.cat.default: CatConverter, # noqa F405 exir_ops.edge.aten.clone.default: CloneConverter, # noqa F405 + exir_ops.edge.dim_order_ops._clone_dim_order.default: CloneConverter, # noqa F405 exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter, # noqa F405 exir_ops.edge.aten.convolution.default: ConvolutionConverter, # noqa F405 exir_ops.edge.aten.hardtanh.default: HardTanhConverter, # noqa F405 @@ -206,6 +211,7 @@ def tag_qdq_clusters(self, nodes: List[torch.fx.Node]): exir_ops.edge.aten.mm.default: MMConverter, # noqa F405 exir_ops.edge.aten.relu.default: ReLUConverter, # noqa F405 exir_ops.edge.aten._softmax.default: SoftmaxConverter, # noqa F405 + exir_ops.edge.aten.sub.Tensor: SubTensorConverter, # noqa F405 exir_ops.edge.aten.tanh.default: TanhConverter, # noqa F405 exir_ops.edge.aten.view_copy.default: ViewCopyConverter, # noqa F405 exir_ops.edge.aten.sigmoid.default: SigmoidConverter, # noqa F405 @@ -216,14 +222,14 @@ class NeutronSupportedOperators(OperatorSupportBase): def __init__( self, - qdq_clusters: Dict[str, QDQClusterRecognizer.QDQCluster], - target: 
Target, - operators_not_to_delegate: List[str], + qdq_clusters: dict[str, QDQClusterRecognizer.QDQCluster], + neutron_target_spec: NeutronTargetSpec, + operators_not_to_delegate: list[str], parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ): self.qdq_clusters = qdq_clusters - self.target = target + self.neutron_target_spec = neutron_target_spec self.operators_not_to_delegate = operators_not_to_delegate self.parameters_mapping = parameters_mapping self.custom_delegation_options = custom_delegation_options @@ -246,6 +252,11 @@ def _is_node_supported_compute(self, node: torch.fx.node.Node) -> bool: """ Operator checking function for compute nodes. """ + + if hasattr(node, "meta") and node.meta.get(NXP_DO_NOT_DELEGATE, False): + # The delegation of this node has been prohibited. + return False + if not self.is_node_delegatable(node): return False @@ -260,7 +271,7 @@ def _is_node_supported_compute(self, node: torch.fx.node.Node) -> bool: # TODO: `view_copy` node should be delegated only if it's not the only operator in the cluster. 
node_converter.is_supported( node, - self.target, + self.neutron_target_spec, self.parameters_mapping, self.custom_delegation_options, ) @@ -296,35 +307,58 @@ def is_node_supported( class NeutronPartitioner(Partitioner): def __init__( self, - compile_spec: List[CompileSpec], + compile_spec: list[CompileSpec], custom_delegation_options: CustomDelegationOptions | None = None, ) -> None: self.delegation_spec = DelegationSpec(NeutronBackend.__name__, compile_spec) self.custom_delegation_options = ( custom_delegation_options or CustomDelegationOptions() ) + target = self.delegation_spec[1][2].value.decode() + converter_flavor = self.delegation_spec[1][3].value.decode() + self.neutron_target_spec = NeutronTargetSpec(target, converter_flavor) + + def validate_partitioning_result( + self, + graph: Graph, + partition_list: list[Partition], + custom_delegation_options: CustomDelegationOptions, + ) -> bool: + all_delegated_nodes = { + node for partition in partition_list for node in partition.nodes + } + partitioning_valid = True + for node in graph.nodes: + if ( + node in all_delegated_nodes + and hasattr(node, "target") + and node.target in supported_ops + ): + if not supported_ops[node.target].supports_partitioning_result( + node, partition_list, custom_delegation_options + ): + # This node is not supported within its partition. Exclude it from delegation in the future. 
+ partitioning_valid = False + node.meta[NXP_DO_NOT_DELEGATE] = True + + return partitioning_valid def partition(self, exported_program: ExportedProgram) -> PartitionResult: # Run the CapabilityBasedPartitioner to return the largest possible # subgraphs containing the nodes with the tags logging.info("NeutronPartitioner::partition") partition_tags = {} + partition_list = [] graph_module = exported_program.graph_module nodes = list(graph_module.graph.nodes) qdq_cluster_recognizer = QDQClusterRecognizer() qdq_cluster_recognizer.tag_qdq_clusters(nodes) + graph_module.recompile() - target = None - operators_not_to_delegate = "" - for spec in self.delegation_spec.compile_specs: - if spec.key == "target": - target = Target(spec.value.decode()) - if spec.key == "operators_not_to_delegate": - operators_not_to_delegate = spec.value.decode().split(",") - assert target is not None + operators_not_to_delegate = self.delegation_spec[1][4].value.decode().split(",") logging.info(f"Operators not to delegate: {operators_not_to_delegate}") parameters_mapping = EdgeProgramToIRConverter.map_inputs_to_parameters( @@ -334,7 +368,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: exported_program.graph_module, NeutronSupportedOperators( qdq_cluster_recognizer.cluster_map, - target, + self.neutron_target_spec, operators_not_to_delegate, parameters_mapping, self.custom_delegation_options, @@ -342,11 +376,24 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: allows_single_node_partition=True, ) - partition_list = capability_partitioner.propose_partitions() + iteration_limit = len(exported_program.graph.nodes) + for _ in range(iteration_limit): + # Run the partitioning. + partition_list = capability_partitioner.propose_partitions() + + # Check if the nodes support the partitioning result. Mark the problematic nodes with `NXP_DO_NOT_DELEGATE`. 
+ partitioning_valid = self.validate_partitioning_result( + exported_program.graph, partition_list, self.custom_delegation_options + ) + if partitioning_valid: + # The result of the partitioning is fine + break + + # Mark the partitions in the node `meta` attribute. for partition in partition_list: for node in partition.nodes: delegation_tag = f"tag{partition.id}" - node.meta["delegation_tag"] = delegation_tag + node.meta[NXP_DELEGATION_TAG] = delegation_tag partition_tags[delegation_tag] = self.delegation_spec tag_constant_data(exported_program) diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py index c801eefec81..44e9a19d9f2 100644 --- a/backends/nxp/nxp_backend.py +++ b/backends/nxp/nxp_backend.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -18,11 +18,11 @@ from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) -from executorch.backends.nxp.backend.ir.converter.node_converter import Target from executorch.backends.nxp.backend.ir.tensor_formatting import TensorFormat from executorch.backends.nxp.backend.neutron_converter_manager import ( NeutronConverterManager, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.backends.nxp.neutron_node_extraction import ( extract_artifacts_from_neutron_node, NeutronNodeArtifacts, @@ -36,9 +36,9 @@ class NeutronCompileSpecBuilder: + config: NeutronTargetSpec def __init__(self): - self.config: Target = None self.compile_spec: List[CompileSpec] = [] self.compiler_flags = [] self.output_format = None @@ -64,18 +64,13 @@ def neutron_compile_spec( Args: config: Neutron accelerator configuration, e.g. "imxrt700" neutron_converter_flavor: Flavor of the neutron-converter module to use. Neutron-converter module named " - "'neutron_converter_SDK_25_06' has flavor 'SDK_25_06'. 
+ "'neutron_converter_SDK_25_09' has flavor 'SDK_25_09'. extra_flags: Extra flags for the Neutron compiler operators_not_to_delegate: List of operators that should not be delegated """ - try: - self.config = Target(config) - except ValueError: - raise ValueError( - f"Config `{config}` is not a valid target. Must be one of `{Target.values()}`." - ) self.neutron_converter_flavor = neutron_converter_flavor + self.config = NeutronTargetSpec(config, neutron_converter_flavor) assert ( self.output_format is None @@ -101,7 +96,7 @@ def build(self): self.compile_spec += [ CompileSpec("output_format", "tflite".encode()), CompileSpec("compile_flags", " ".join(self.compiler_flags).encode()), - CompileSpec("target", self.config.value.encode()), + CompileSpec("target", self.config.get_name().encode()), CompileSpec( "neutron_converter_flavor", self.neutron_converter_flavor.encode() ), @@ -187,10 +182,11 @@ def preprocess( # noqa C901 # Convert the edge program to TFLite. tflite_model, io_formats = EdgeProgramToIRConverter().convert_program( edge_program, + neutron_target_spec=NeutronTargetSpec(target, neutron_converter_flavor), ) - neutron_model = NeutronConverterManager().convert( - tflite_model, target, neutron_converter_flavor + neutron_model = NeutronConverterManager(neutron_converter_flavor).convert( + tflite_model, target ) # Dump the tflite file if logging level is enabled diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py index d3f84144aa3..2681e221869 100644 --- a/backends/nxp/quantizer/neutron_quantizer.py +++ b/backends/nxp/quantizer/neutron_quantizer.py @@ -4,8 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import List, Optional, Tuple, Union - import torch from executorch.backends.nxp.aten_passes.neutron_aten_pass_manager import ( @@ -27,6 +25,8 @@ LinearPattern, MaxPoolPattern, MeanDimPattern, + MmPattern, + NodeArgsIdx, PadPattern, PermutePattern, QuantizationPattern, @@ -36,6 +36,7 @@ SharedSpecPattern, SigmoidPattern, SoftMaxPattern, + SubTensorPattern, TanhInPlacePattern, TanhPattern, ViewPattern, @@ -106,13 +107,13 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: ) def annotate_inputs( - inputs: Union[ - List[Tuple[fx.Node, int]], - List[Tuple[fx.Node, int, DerivedQuantizationSpec],], - ], - spec: Optional[QuantizationSpec], + inputs: ( + list[tuple[fx.Node, NodeArgsIdx]] + | list[tuple[fx.Node, NodeArgsIdx, DerivedQuantizationSpec]] + ), + spec: QuantizationSpec | None, ) -> None: - for node, idx, *custom_spec in inputs: + for node, args_idx, *custom_spec in inputs: # pyre-ignore[16]: no attribute annotation = node.meta.get( Q_ANNOTATION_KEY, @@ -120,10 +121,10 @@ def annotate_inputs( ) arg = ( # pyre-ignore[16]: no attribute - node.args[idx] - if isinstance(idx, int) + node.args[args_idx.idx] + if args_idx.inner_idx is None # pyre-ignore[16]: no attribute - else node.args[idx[0]][idx[1]] + else node.args[args_idx.idx][args_idx.inner_idx] ) annotation.input_qspec_map[arg] = ( custom_spec[0] if custom_spec else spec @@ -131,32 +132,18 @@ def annotate_inputs( # pyre-ignore[16]: no attribute node.meta[Q_ANNOTATION_KEY] = annotation - def annotate_weights_or_biases( - weights_or_biases: List[Tuple[fx.Node, int]], - spec: Optional[QuantizationSpec], - ) -> None: - for node, idx, *custom_spec in weights_or_biases: - annotation = node.meta.get( - Q_ANNOTATION_KEY, - QuantizationAnnotation(_annotated=True), - ) - annotation.input_qspec_map[node.args[idx]] = ( - custom_spec[0] if custom_spec else spec - ) - node.meta[Q_ANNOTATION_KEY] = annotation - # pyre-ignore[6]: incompatible parameter type annotate_inputs(anchors.inputs, 
input_act_qspec) - annotate_weights_or_biases(anchors.weights, weight_qspec) + annotate_inputs(anchors.weights, weight_qspec) # pyre-ignore[6]: incompatible parameter type - annotate_weights_or_biases(anchors.biases, bias_qspec) + annotate_inputs(anchors.biases, bias_qspec) return model def validate(self, model: fx.GraphModule) -> None: pass @classmethod - def get_supported_operators(cls) -> List[OperatorConfig]: + def get_supported_operators(cls) -> list[OperatorConfig]: return [] @@ -195,12 +182,7 @@ def get_supported_operators(cls) -> List[OperatorConfig]: class NeutronQuantizer(ComposableQuantizer): def __init__(self): - static_qconfig = QuantizationConfig( - act_qspec, - act_qspec, - wgt_qspec, - None, - ) + static_qconfig = QuantizationConfig(act_qspec, act_qspec, wgt_qspec, None) static_fc_qconfig = QuantizationConfig(act_qspec, act_qspec, wgt_fc_qspec, None) super().__init__( [ @@ -219,6 +201,7 @@ def __init__(self): NeutronAtenQuantizer(LinearPattern(), static_fc_qconfig), NeutronAtenQuantizer(MaxPoolPattern(), static_qconfig), NeutronAtenQuantizer(MeanDimPattern(), static_qconfig), + NeutronAtenQuantizer(MmPattern(), static_qconfig), NeutronAtenQuantizer(PadPattern(), static_qconfig), NeutronAtenQuantizer(PermutePattern(), static_qconfig), NeutronAtenQuantizer(ReluPattern(), static_qconfig), @@ -226,6 +209,7 @@ def __init__(self): NeutronAtenQuantizer(ReshapePattern(), static_qconfig), NeutronAtenQuantizer(SigmoidPattern(), static_qconfig), NeutronAtenQuantizer(SoftMaxPattern(), static_qconfig), + NeutronAtenQuantizer(SubTensorPattern(), static_qconfig), NeutronAtenQuantizer(TanhPattern(), static_qconfig), NeutronAtenQuantizer(TanhInPlacePattern(), static_qconfig), NeutronAtenQuantizer(ViewPattern(), static_qconfig), diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py index 651f995d570..9588ce24c9e 100644 --- a/backends/nxp/quantizer/patterns.py +++ b/backends/nxp/quantizer/patterns.py @@ -7,26 +7,43 @@ from abc import 
ABC, abstractmethod from dataclasses import dataclass, field -from typing import List, Optional, Tuple, Type, Union import torch from executorch.backends.nxp.quantizer.utils import get_bias_qparams from torch import fx from torch._ops import OpOverload +from torchao.quantization.pt2e import PerChannelMinMaxObserver from torchao.quantization.pt2e.quantizer import ( DerivedQuantizationSpec, FixedQParamsQuantizationSpec, + QuantizationSpec, SharedQuantizationSpec, ) from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY +@dataclass +class NodeArgsIdx: + """ + Specifies indexes to args paramater of Node in node input annotation. + + + Attributes: + idx (int): Index to Node's args paramater (list). Selects an input Node or a list of Nodes at the index. + inner_idx (int): If specified, index to a list pointed by 'idx' attribute. Selects an input Node at the index. + Default: None. + """ + + idx: int + inner_idx: int = None + + @dataclass class PartitionAnchors: """ - All fields except output are lists of (node, args_index) pair, where node is from - the given partition and node.args[args_index] is an input to the partition. Assumes + All fields except output are lists of (node, node_args_idx) or (node, node_args_idx, quantization_spec) tuples, + where node is from the given partition and node.args[node_args_idx] is an input to the partition. Assumes a single output. Quantizer uses inputs, weights and biases for quantization annotation. 
The others @@ -35,25 +52,23 @@ class PartitionAnchors: """ # Inputs can share quantization parameters - inputs: List[ - Union[ - Tuple[fx.Node, Union[int, Tuple[int, int]]], - Tuple[ - fx.Node, - Union[int, Tuple[int, int]], - SharedQuantizationSpec, - ], - ] + inputs: list[ + tuple[fx.Node, NodeArgsIdx] + | tuple[fx.Node, NodeArgsIdx, SharedQuantizationSpec], ] = field(default_factory=list) - weights: List[Tuple[fx.Node, int]] = field(default_factory=list) - biases: List[ - Union[Tuple[fx.Node, int], Tuple[fx.Node, int, DerivedQuantizationSpec]] + weights: list[ + tuple[fx.Node, NodeArgsIdx] | tuple[fx.Node, NodeArgsIdx, QuantizationSpec], + ] = field(default_factory=list) + biases: list[ + tuple[fx.Node, NodeArgsIdx] + | tuple[fx.Node, NodeArgsIdx, DerivedQuantizationSpec], + ] = field(default_factory=list) + others: list[tuple[fx.Node, NodeArgsIdx]] = field(default_factory=list) + literals: list[tuple[fx.Node, NodeArgsIdx]] = field(default_factory=list) + output: list[ + tuple[fx.Node] + | tuple[fx.Node, FixedQParamsQuantizationSpec | SharedQuantizationSpec], ] = field(default_factory=list) - others: List[Tuple[fx.Node, int]] = field(default_factory=list) - literals: List[Tuple[fx.Node, int]] = field(default_factory=list) - output: List[Union[Tuple[fx.Node], Tuple[fx.Node, SharedQuantizationSpec]]] = field( - default_factory=list - ) empty: bool = False @@ -67,8 +82,8 @@ def partition_types(self) -> list[OpOverload]: @abstractmethod def get_anchors( - self, gm: torch.fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> Optional[PartitionAnchors]: + self, gm: torch.fx.GraphModule, fused_partition: list[fx.GraphModule] + ) -> PartitionAnchors | None: pass @@ -80,11 +95,11 @@ class SharedSpecPattern(QuantizationPattern): quantization parameters (scale and zero-point). 
""" - def partition_types(self) -> List[Type[torch.nn.Module]]: + def partition_types(self) -> list[torch.nn.Module]: pass def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors | None: node = fused_partition[0].nodes[-1] assert len(fused_partition[0].input_nodes) == 1 @@ -97,7 +112,7 @@ def get_anchors( qspec = SharedQuantizationSpec(prev_node) return PartitionAnchors( - inputs=[(node, 0)], + inputs=[(node, NodeArgsIdx(0))], weights=[], biases=[], output=[ @@ -126,7 +141,7 @@ def get_anchors_for_fixed_quant_specs( ) return PartitionAnchors( - inputs=[(node, 0)], + inputs=[(node, NodeArgsIdx(0))], weights=[], biases=[], output=[ @@ -154,11 +169,11 @@ def partition_types(self): class AddmmPattern(QuantizationPattern): - def partition_types(self) -> List[OpOverload]: + def partition_types(self) -> list[OpOverload]: return [torch.ops.aten.addmm.default] def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... addmm_node = fused_partition[0].nodes[-1] @@ -176,9 +191,9 @@ def get_anchors( ) return PartitionAnchors( - inputs=[(addmm_node, 1)], - weights=[(addmm_node, 2)], - biases=[(addmm_node, 0, bias_qspec)], + inputs=[(addmm_node, NodeArgsIdx(1))], + weights=[(addmm_node, NodeArgsIdx(2))], + biases=[(addmm_node, NodeArgsIdx(0), bias_qspec)], output=[(addmm_node,)], ) @@ -190,16 +205,42 @@ class AddTensorPattern(QuantizationPattern): Basic quantization for all inputs and output. 
""" - def partition_types(self) -> List[Type[torch.nn.Module]]: + def partition_types(self) -> list[torch.nn.Module]: return [torch.ops.aten.add.Tensor] def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] + ) -> PartitionAnchors | None: + node = fused_partition[0].nodes[-1] + inputs = [(node, NodeArgsIdx(0))] + if len(fused_partition[0].input_nodes) == 2: + inputs = [(node, NodeArgsIdx(0)), (node, NodeArgsIdx(1))] + + return PartitionAnchors( + inputs=inputs, + weights=[], + biases=[], + output=[(node,)], + ) + + +class SubTensorPattern(QuantizationPattern): + """ + Quantization pattern for Sub Tensor quantization. Accepts 1 or 2 input nodes. + + Basic quantization for all inputs and output. + """ + + def partition_types(self) -> list[torch.nn.Module]: + return [torch.ops.aten.sub.Tensor] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors | None: node = fused_partition[0].nodes[-1] - inputs = [(node, 0)] + inputs = [(node, NodeArgsIdx(0))] if len(fused_partition[0].input_nodes) == 2: - inputs = [(node, 0), (node, 1)] + inputs = [(node, NodeArgsIdx(0)), (node, NodeArgsIdx(1))] return PartitionAnchors( inputs=inputs, @@ -242,13 +283,15 @@ def get_anchors( if quantized_input is not None: inputs = [] for idx, _ in enumerate(node.args[0]): - inputs.append((node, (0, idx), SharedQuantizationSpec(quantized_input))) + inputs.append( + (node, NodeArgsIdx(0, idx), SharedQuantizationSpec(quantized_input)) + ) outputs = [(node, SharedQuantizationSpec(quantized_input))] else: # No previous node was quantized => we are not able to share q-params. The conversion to IR will have to # re-quantize the inputs if necessary. 
- inputs = [(node, (0, idx)) for idx in range(len(node.args[0]))] + inputs = [(node, NodeArgsIdx(0, idx)) for idx in range(len(node.args[0]))] outputs = [(node,)] return PartitionAnchors( @@ -259,76 +302,60 @@ def get_anchors( ) -class Conv1dPattern(QuantizationPattern): - def partition_types(self) -> List[OpOverload]: - return [torch.ops.aten.conv1d.default] +class ConvPattern(QuantizationPattern): + @abstractmethod + def partition_types(self) -> list[OpOverload]: + pass def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors: - # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... - conv1d_node = fused_partition[0].nodes[-1] + conv_node = fused_partition[0].nodes[-1] - bias_qspec = DerivedQuantizationSpec( + bias_quantization_qspec = DerivedQuantizationSpec( derived_from=[ - (conv1d_node.args[0], conv1d_node), - (conv1d_node.args[1], conv1d_node), + (conv_node.args[0], conv_node), + (conv_node.args[1], conv_node), ], derive_qparams_fn=get_bias_qparams, dtype=torch.int32, - quant_min=-(2**31), + quant_min=-(2**31) + 1, quant_max=2**31 - 1, - qscheme=torch.per_tensor_affine, + qscheme=torch.per_channel_symmetric, + ch_axis=0, + ) + + weight_observer_or_fake_quant_ctr = PerChannelMinMaxObserver + weight_quantization_spec = QuantizationSpec( + dtype=torch.int8, + observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr, + quant_min=-127, + quant_max=127, + qscheme=torch.per_channel_symmetric, + ch_axis=0, ) # Keep bias empty if not supplied bias = [] - if len(conv1d_node.args) > 2 and conv1d_node.args[2] is not None: - bias = [(conv1d_node, 2, bias_qspec)] + if len(conv_node.args) > 2 and conv_node.args[2] is not None: + bias = [(conv_node, NodeArgsIdx(2), bias_quantization_qspec)] return PartitionAnchors( - inputs=[(conv1d_node, 0)], - weights=[(conv1d_node, 1)], - # pyre-fixme[6]: Incompatible parameter type + 
inputs=[(conv_node, NodeArgsIdx(0))], + weights=[(conv_node, NodeArgsIdx(1), weight_quantization_spec)], biases=bias, - output=[(conv1d_node,)], + output=[(conv_node,)], ) -class Conv2dPattern(QuantizationPattern): - def partition_types(self) -> List[OpOverload]: - return [torch.ops.aten.conv2d.default] - - def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: - # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... - conv2d_node = fused_partition[0].nodes[-1] - - bias_qspec = DerivedQuantizationSpec( - derived_from=[ - (conv2d_node.args[0], conv2d_node), - (conv2d_node.args[1], conv2d_node), - ], - derive_qparams_fn=get_bias_qparams, - dtype=torch.int32, - quant_min=-(2**31), - quant_max=2**31 - 1, - qscheme=torch.per_tensor_affine, - ) +class Conv1dPattern(ConvPattern): + def partition_types(self) -> list[OpOverload]: + return [torch.ops.aten.conv1d.default] - # Keep bias empty if not supplied - bias = [] - if len(conv2d_node.args) > 2 and conv2d_node.args[2] is not None: - bias = [(conv2d_node, 2, bias_qspec)] - return PartitionAnchors( - inputs=[(conv2d_node, 0)], - weights=[(conv2d_node, 1)], - # pyre-fixme[6]: Incompatible parameter type - biases=bias, - output=[(conv2d_node,)], - ) +class Conv2dPattern(ConvPattern): + def partition_types(self) -> list[OpOverload]: + return [torch.ops.aten.conv2d.default] class DropoutPattern(SharedSpecPattern): @@ -359,12 +386,12 @@ def partition_types(self): return [torch.ops.aten.hardtanh.default] def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors | None: node = fused_partition[0].nodes[-1] return PartitionAnchors( - inputs=[(node, 0)], + inputs=[(node, NodeArgsIdx(0))], weights=[], biases=[], output=[(node,)], @@ -384,12 +411,12 @@ def partition_types(self): return [torch.ops.aten.hardtanh_.default] def get_anchors( - self, 
gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors | None: node = fused_partition[0].nodes[-1] return PartitionAnchors( - inputs=[(node, 0)], + inputs=[(node, NodeArgsIdx(0))], weights=[], biases=[], output=[(node,)], @@ -400,13 +427,12 @@ def replacement_op(self): class LinearPattern(QuantizationPattern): - def partition_types(self) -> List[OpOverload]: + def partition_types(self) -> list[OpOverload]: return [torch.ops.aten.linear.default] def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors: - # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... linear_node = fused_partition[0].nodes[-1] bias_qspec = DerivedQuantizationSpec( @@ -424,12 +450,11 @@ def get_anchors( # Keep bias empty if not supplied bias = [] if len(linear_node.args) > 2: - bias = [(linear_node, 2, bias_qspec)] + bias = [(linear_node, NodeArgsIdx(2), bias_qspec)] return PartitionAnchors( - inputs=[(linear_node, 0)], - weights=[(linear_node, 1)], - # pyre-fixme[6]: Incompatible parameter type + inputs=[(linear_node, NodeArgsIdx(0))], + weights=[(linear_node, NodeArgsIdx(1))], biases=bias, output=[(linear_node,)], ) @@ -453,6 +478,23 @@ def partition_types(self): return [torch.ops.aten.mean.dim] +class MmPattern(QuantizationPattern): + def partition_types(self) -> list[OpOverload]: + return [torch.ops.aten.mm.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] + ) -> PartitionAnchors: + mm_node = fused_partition[0].nodes[-1] + + return PartitionAnchors( + inputs=[(mm_node, NodeArgsIdx(0))], + weights=[(mm_node, NodeArgsIdx(1))], + biases=[], + output=[(mm_node,)], + ) + + class PadPattern(SharedSpecPattern): """ Quantizer for Pad operator. 
@@ -515,7 +557,7 @@ class SoftMaxPattern(QuantizationPattern): The quantization of Softmax output is fixed to scale 1/256, zero point -128, dtype int8. """ - def partition_types(self) -> List[OpOverload]: + def partition_types(self) -> list[OpOverload]: return [torch.ops.aten.softmax.int] def get_anchors( @@ -526,33 +568,33 @@ def get_anchors( ) -class TanhPattern(QuantizationPattern): +class SigmoidPattern(QuantizationPattern): """ - Quantizer for Tanh operator. + Quantizer for Sigmoid operator. - The quantization of Tanh output is fixed to scale 1/128, zero point 0, dtype int8. + The quantization of Sigmoid output is fixed to scale 1/256, zero point -128, dtype int8. """ - def partition_types(self): - return [torch.ops.aten.tanh.default] + def partition_types(self) -> list[OpOverload]: + return [torch.ops.aten.sigmoid.default] def get_anchors( self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors: return get_anchors_for_fixed_quant_specs( - fused_partition, scale=1.0 / 128.0, zero_point=0 + fused_partition, scale=1.0 / 256.0, zero_point=-128 ) -class TanhInPlacePattern(QuantizationPattern): +class TanhPattern(QuantizationPattern): """ - Quantizer for inplace version of Tanh operator (torch.tanh_). + Quantizer for Tanh operator. The quantization of Tanh output is fixed to scale 1/128, zero point 0, dtype int8. """ def partition_types(self): - return [torch.ops.aten.tanh_.default] + return [torch.ops.aten.tanh.default] def get_anchors( self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] @@ -562,19 +604,19 @@ def get_anchors( ) -class SigmoidPattern(QuantizationPattern): +class TanhInPlacePattern(QuantizationPattern): """ - Quantizer for Sigmoid operator. + Quantizer for inplace version of Tanh operator (torch.tanh_). - The quantization of Sigmoid output is fixed to scale 1/256, zero point -128, dtype int8. + The quantization of Tanh output is fixed to scale 1/128, zero point 0, dtype int8. 
""" - def partition_types(self) -> List[OpOverload]: - return [torch.ops.aten.sigmoid.default] + def partition_types(self): + return [torch.ops.aten.tanh_.default] def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors: return get_anchors_for_fixed_quant_specs( - fused_partition, scale=1.0 / 256.0, zero_point=-128 + fused_partition, scale=1.0 / 128.0, zero_point=0 ) diff --git a/backends/nxp/quantizer/utils.py b/backends/nxp/quantizer/utils.py index ed94183c2db..12c722a8ab3 100644 --- a/backends/nxp/quantizer/utils.py +++ b/backends/nxp/quantizer/utils.py @@ -49,7 +49,7 @@ def get_bias_qparams( act_scale, _ = obs_or_fqs[0].calculate_qparams() weight_scale, _ = obs_or_fqs[1].calculate_qparams() bias_scale = act_scale * weight_scale - bias_zero_point = torch.zeros_like(bias_scale, dtype=torch.int32) + bias_zero_point = torch.zeros_like(bias_scale, dtype=torch.int64) return bias_scale, bias_zero_point diff --git a/backends/nxp/requirements-tests-eiq.txt b/backends/nxp/requirements-tests-eiq.txt index 896d2b8c07e..1fccf010e86 100644 --- a/backends/nxp/requirements-tests-eiq.txt +++ b/backends/nxp/requirements-tests-eiq.txt @@ -1,2 +1,2 @@ --index-url https://eiq.nxp.com/repository -neutron_converter_SDK_25_06 +neutron_converter_SDK_25_09 diff --git a/backends/nxp/runtime/NeutronDriver.h b/backends/nxp/runtime/NeutronDriver.h index 5ae4c3a3ff9..5c47bd74eab 100644 --- a/backends/nxp/runtime/NeutronDriver.h +++ b/backends/nxp/runtime/NeutronDriver.h @@ -18,22 +18,6 @@ extern "C" { #include "NeutronErrors.h" -/* Neutron Driver error category codes */ -typedef enum ERROR_CATEGORY_DRIVER { - ERROR_CATEGORY_DRIVER_GENERIC, /* Generic error category */ - ERROR_CATEGORY_DRIVER_UNSUPPORTED, /* Unsupported function */ - ERROR_CATEGORY_DRIVER_UCODE, /* Microcode bad magic or version incompatible. 
- */ - ERROR_CATEGORY_DRIVER_INVALID, /* Invalid arguments */ - ERROR_CATEGORY_DRIVER_BAD_HANDLE, /* Bad inference handle */ - ERROR_CATEGORY_DRIVER_NO_MEMORY, /* Not enough memory */ - ERROR_CATEGORY_DRIVER_INTERNAL_FAULT, /* Internal error */ - ERROR_CATEGORY_DRIVER_UNKNOWN_ARCH, /* Unknown architecture */ - ERROR_CATEGORY_DRIVER_TRACE_NOT_RUN, /* Tracing did not run, but trace buffer - was requested. */ - ERROR_CATEGORY_DRIVER_TIMEOUT /* Timeout error. */ -} ERROR_CATEGORY_DRIVER; - /// Trace configuration to enable kernel level tracing. #define TRACE_CONFIG_KERNEL_LEVEL (1U << 0) @@ -169,6 +153,12 @@ NeutronError neutronCustomExec( NeutronModelHandle hdl, const NeutronDataConfig* neutron_dcfg); +/// - Setup the input and output data ptr to use Neutron memory area. +/// - The input and output data ptr is stored in neutron_dcfg. +NeutronError neutronDataSetup( + NeutronModelHandle hdl, + NeutronDataConfig* neutron_dcfg); + /// - Prepare Neutron execution for a model with the given configuration. /// - This function only prepares the execution by transferring the parameters /// to the firmware. @@ -245,6 +235,29 @@ void* neutronMemAlloc(size_t alignment, size_t size); /// - This function is only available for Neutron-S in the Linux environment. void neutronMemFree(void* ptr); +/// - Allocates size bytes large buffer in DDR to be used for specialized +/// kernels (e.g. 
batch matmul) +/// Uses Linux CMA allocator +NeutronError allocateBuffer(uint64_t size, void** pBuffer, bool userspace); + +/// - Frees buffer allocated via allocateBuffer function +NeutronError releaseBuffer(void* buffer); + +/// - Clean/flush cache for DDR allocated buffer +/// TODO: rename function as "cleanCache" to satisfy neutron-software naming +/// convention +NeutronError clean_cache(const void* addr, int size); + +/// - Function for calling firmware for specialized kernel (matmul) +NeutronError matmul( + const void* info, + int sizeInfo, + const void* in, + int sizeIn, + const void* out, + int sizeOut, + int idxSlot); + /// Other functions to control the state of driver/firmware. #ifdef __cplusplus } diff --git a/backends/nxp/runtime/NeutronErrors.h b/backends/nxp/runtime/NeutronErrors.h index 5141c4bb4c5..071db8b44be 100644 --- a/backends/nxp/runtime/NeutronErrors.h +++ b/backends/nxp/runtime/NeutronErrors.h @@ -39,6 +39,32 @@ typedef enum ERROR_COMPONENT_ID { ERROR_COMPONENT_DRIVER = 0x3 } ERROR_COMPONENT_ID; +/* Neutron Firmware error category codes */ +typedef enum ERROR_CATEGORY_FW { + ERROR_CATEGORY_FW_GENERIC, /* Generic error category */ + ERROR_CATEGORY_FW_UCODE, /* Microcode bad magic or version incompatible. */ + ERROR_CATEGORY_FW_BUFFER_OVERFLOW, /* Buffer overflow error category */ + ERROR_CATEGORY_FW_NULL_POINTER, /* Pointer is null */ + ERROR_CATEGORY_FW_INTR_ERROR, /* Interrupt triggering error */ + ERROR_CATEGORY_FW_DMAPI_ERROR, /* DM API parameter error */ +} ERROR_CATEGORY_FW; + +/* Neutron Driver error category codes */ +typedef enum ERROR_CATEGORY_DRIVER { + ERROR_CATEGORY_DRIVER_GENERIC, /* Generic error category */ + ERROR_CATEGORY_DRIVER_UNSUPPORTED, /* Unsupported function */ + ERROR_CATEGORY_DRIVER_UCODE, /* Microcode bad magic or version incompatible. 
+ */ + ERROR_CATEGORY_DRIVER_INVALID, /* Invalid arguments */ + ERROR_CATEGORY_DRIVER_BAD_HANDLE, /* Bad inference handle */ + ERROR_CATEGORY_DRIVER_NO_MEMORY, /* Not enough memory */ + ERROR_CATEGORY_DRIVER_INTERNAL_FAULT, /* Internal error */ + ERROR_CATEGORY_DRIVER_UNKNOWN_ARCH, /* Unknown architecture */ + ERROR_CATEGORY_DRIVER_TRACE_NOT_RUN, /* Tracing did not run, but trace buffer + was requested. */ + ERROR_CATEGORY_DRIVER_TIMEOUT /* Timeout error. */ +} ERROR_CATEGORY_DRIVER; + /// Retrieve component name as string from NeutronError code. char* getNeutronErrorComponent(NeutronError ne); diff --git a/backends/nxp/runtime/targets.bzl b/backends/nxp/runtime/targets.bzl index 1eacbbe0a2b..3214761a9cb 100644 --- a/backends/nxp/runtime/targets.bzl +++ b/backends/nxp/runtime/targets.bzl @@ -1,20 +1,25 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci") def define_common_targets(): runtime.cxx_library( - name = "nxp_backend", + name = "nxp_backend_base", srcs = ["NeutronBackend.cpp"], - headers = ["NeutronDriver.h", "NeutronErrors.h"], - compatible_with = ["ovr_config//cpu:arm32-embedded", "@fbsource//arvr/firmware/projects/smartglasses/config:embedded-mcu-rtos"], - # Neutron runtime needs to compile with executor as whole - # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + exported_headers = [ + "NeutronDriver.h", + "NeutronErrors.h", + ], link_whole = True, # Constructor needed for backend registration. 
compiler_flags = ["-Wno-global-constructors", "-fno-rtti", "-DNO_HEAP_USAGE"], - visibility = ["@EXECUTORCH_CLIENTS"], + labels = [ci.skip_target()], + visibility = [ + "//executorch/backends/nxp/runtime/fb:nxp_fb_backend", + "//executorch/backends/nxp/runtime/fb:nxp_hifi_fb_backend", + "@EXECUTORCH_CLIENTS", + ], deps = [ "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", - "fbsource//arvr/third-party/toolchains/nxp-sdk/2.16.0/middleware/eiq/executorch/third-party/neutron/rt700:libNeutron", ], ) diff --git a/backends/nxp/tests/TARGETS b/backends/nxp/tests/TARGETS index f492111aff2..c8ccd5fe900 100644 --- a/backends/nxp/tests/TARGETS +++ b/backends/nxp/tests/TARGETS @@ -1,3 +1,4 @@ +load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest") @@ -50,5 +51,9 @@ python_pytest( "//executorch/backends/nxp:neutron_backend", ":executorch_pipeline", ":models", - ] + ], + labels = [ + "local_only", + ci.skip_test(), + ], ) diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py index f2f625ad0c8..09bceb2b0d3 100644 --- a/backends/nxp/tests/executorch_pipeline.py +++ b/backends/nxp/tests/executorch_pipeline.py @@ -15,9 +15,6 @@ from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import ( NeutronEdgePassManager, ) -from executorch.backends.nxp.edge_passes.remove_io_quant_ops_pass import ( - RemoveIOQuantOpsPass, -) from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer @@ -38,9 +35,9 @@ class ModelInputSpec: dtype: torch.dtype = torch.float32 -def _quantize_model(model, calibration_inputs: list[tuple[torch.Tensor, ...]]): - quantizer = NeutronQuantizer() - +def 
_quantize_model( + model, quantizer, calibration_inputs: list[tuple[torch.Tensor, ...]] +): m = prepare_pt2e(model, quantizer) for data in calibration_inputs: m(*data) @@ -88,9 +85,10 @@ def to_quantized_edge_program( [tuple[ModelInputSpec, ...]], list[tuple[torch.Tensor, ...]] ] = get_random_calibration_inputs, target="imxrt700", - neutron_converter_flavor="SDK_25_06", + neutron_converter_flavor="SDK_25_09", remove_quant_io_ops=False, custom_delegation_options=CustomDelegationOptions(), # noqa B008 + get_quantizer_fn=lambda: NeutronQuantizer(), ) -> EdgeProgramManager: calibration_inputs = get_calibration_inputs_fn(to_model_input_spec(input_spec)) @@ -102,7 +100,9 @@ def to_quantized_edge_program( exir_program_aten = torch.export.export(model, example_input, strict=True) exir_program_aten__module_quant = _quantize_model( - exir_program_aten.module(), calibration_inputs + exir_program_aten.module(), + get_quantizer_fn(), + calibration_inputs, ) edge_compile_config = EdgeCompileConfig(_check_ir_validity=False) @@ -112,7 +112,9 @@ def to_quantized_edge_program( edge_compile_config=edge_compile_config, ) - edge_program_manager = NeutronEdgePassManager()(edge_program_manager) + edge_program_manager = NeutronEdgePassManager( + remove_io_quant_ops=remove_quant_io_ops + )(edge_program_manager) compile_spec = generate_neutron_compile_spec( target, @@ -122,11 +124,6 @@ def to_quantized_edge_program( partitioner = NeutronPartitioner(compile_spec, custom_delegation_options) edge_program_manager = edge_program_manager.to_backend(partitioner) - if remove_quant_io_ops: - edge_program_manager = edge_program_manager.transform( - [RemoveIOQuantOpsPass(edge_program_manager=edge_program_manager)] - ) - return edge_program_manager diff --git a/backends/nxp/tests/executors.py b/backends/nxp/tests/executors.py index afdb15af106..632e3da055f 100644 --- a/backends/nxp/tests/executors.py +++ b/backends/nxp/tests/executors.py @@ -1,4 +1,4 @@ -# Copyright 2023-2024 NXP +# Copyright 2023-2025 
NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -18,10 +18,8 @@ create_channels_first_to_channels_last_permutation, create_channels_last_to_channels_first_permutation, ) -from executorch.backends.nxp.backend.ir.converter.node_converter import ( - NodeConverter, - Target, -) +from executorch.backends.nxp.backend.ir.converter.node_converter import NodeConverter +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.export import ExportedProgram from torch.fx import Node from torch.fx.graph import Graph @@ -196,6 +194,11 @@ def compare_output_arrays( assert tfl_output.shape == edge_output.shape, "Output shapes don't match!" + if (max_diff := np.abs(np.max(tfl_output - edge_output))) > 0.0: + logger.w( + f"Maximum absolute difference of the tensor '{output_name}': '{max_diff}'" + ) + assert np.allclose( tfl_output, edge_output, rtol=rtol, atol=atol, equal_nan=True ), f"Output values of the `{output_name}` tensor don't match!" 
@@ -365,10 +368,16 @@ def convert_run_compare( def graph_contains_any_of_ops(graph: Graph, ops: list) -> bool: - return any(node.target in ops for node in graph.nodes) + return graph_contains_any( + graph, condition=lambda n: hasattr(n, "target") and n.target in ops + ) + + +def graph_contains_any(graph: Graph, condition: Callable[[Node], bool]) -> bool: + return any(map(condition, graph.nodes)) -target_support_check_function = Callable[[Node, Target], bool] +target_support_check_function = Callable[[Node, NeutronTargetSpec], bool] class OverrideTargetSupportCheck: diff --git a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py index 567b593e05b..2c3107eae77 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py @@ -1,3 +1,7 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. import numpy as np import pytest import torch diff --git a/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py new file mode 100644 index 00000000000..6571ef8773e --- /dev/null +++ b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py @@ -0,0 +1,89 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import kgb +import numpy as np +import torch + +from executorch.backends.nxp.backend.edge_program_converter import ( + EdgeProgramToIRConverter, +) +from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executors import ( + convert_run_compare, + graph_contains_any_of_ops, +) +from executorch.backends.nxp.tests.models import AddmmModule, LinearModule +from executorch.exir.dialects._ops import ops as exir_ops +from torch.export import ExportedProgram + + +class TestAddmmConversion(unittest.TestCase): + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(42) + + def test_addmm_conversion(self): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + input_shape = (1, 32) + model = AddmmModule(input_shape[1]) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. + assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=[exir_ops.edge.aten.addmm.default] + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + ) + + def test_linear_conversion__with_bias(self): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + input_shape = (10, 32) + model = LinearModule(bias=True) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. 
+ assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=[exir_ops.edge.aten.addmm.default] + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py index f5945607f1b..c02d184c5ae 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py @@ -4,31 +4,33 @@ # LICENSE file in the root directory of this source tree. +import itertools +import unittest + +import kgb import numpy as np -import pytest import torch from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) -from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executorch_pipeline import ( + to_edge_program, + to_quantized_edge_program, +) from executorch.backends.nxp.tests.executors import ( convert_run_compare, + graph_contains_any, graph_contains_any_of_ops, - ToNCHWPreprocess, - ToNHWCPreprocess, + ToChannelFirstPreprocess, + ToChannelLastPreprocess, ) from executorch.exir.dialects._ops import ops as exir_ops +from parameterized import parameterized from torch import nn from torch.export import ExportedProgram -@pytest.fixture(autouse=True) -def reseed_model_per_test_run(): - torch.manual_seed(23) - np.random.seed(23) - - class SingleConvBlockWithDropout(torch.nn.Module): def __init__( self, conv_in_channels: int = 3, perform_inplace_dropout: bool = False @@ 
-74,57 +76,108 @@ def forward(self, x): return self.block(x) -@pytest.mark.parametrize("inplace_dropout", [False, True]) -@pytest.mark.parametrize("input_shape", [(1, 3, 128, 128), (1, 3, 256, 256)]) -def test_conv_dropout_quant(mocker, inplace_dropout: bool, input_shape: tuple[int]): - model = SingleConvBlockWithDropout( - conv_in_channels=input_shape[1], perform_inplace_dropout=inplace_dropout - ).eval() +class TestCloneConverter(unittest.TestCase): + __test__ = False # Prevent interfering with PyTest tests - converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(23) - quantized_program = to_quantized_edge_program(model, input_shape).exported_program() + @staticmethod + def _node_is_clone(node) -> bool: + clone_ops = [ + exir_ops.edge.aten.clone.default, + exir_ops.edge.dim_order_ops._clone_dim_order.default, + ] - tflite_flatbuffers_model, io_formats = converter_spy.spy_return - exported_program: ExportedProgram = converter_spy.call_args.args[1] - - assert not graph_contains_any_of_ops( - graph=quantized_program.graph, ops=[exir_ops.edge.aten.clone.default] - ) - - input_data = (np.random.random(input_shape) * 50).astype(np.int8) - convert_run_compare( - exported_program, - tfl_model=tflite_flatbuffers_model, - tflite_input_preprocess=ToNHWCPreprocess(), - tflite_output_preprocess=ToNCHWPreprocess(), - input_data=input_data, - atol=1.0, - ) + def target_can_be_clone(node): + if hasattr(node, "op") and node.op == "call_function": + return "clone" in node.target.__name__ + return False -@pytest.mark.parametrize("inplace_dropout", [False, True]) -def test_clone_pool_view_copy_quant( - mocker, inplace_dropout: bool, input_shape: tuple[int] = (1, 64, 25, 5) -): - model = KWSFinalBlock(input_shape).eval() + return node in clone_ops or target_can_be_clone(node) - converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") - - quantized_program = 
to_quantized_edge_program(model, input_shape).exported_program() - - tflite_flatbuffers_model, io_formats = converter_spy.spy_return - exported_program: ExportedProgram = converter_spy.call_args.args[1] - - assert not graph_contains_any_of_ops( - graph=quantized_program.graph, ops=[exir_ops.edge.aten.clone.default] + @parameterized.expand( + list(itertools.product([True, False], [(1, 3, 128, 128), (1, 3, 256, 256)])) ) - - input_data = (np.random.random(input_shape) * 50).astype(np.int8) - convert_run_compare( - exported_program, - tfl_model=tflite_flatbuffers_model, - tflite_input_preprocess=ToNHWCPreprocess(), - input_data=input_data, - atol=1.0, + def test_conv_dropout_quant(self, inplace_dropout: bool, input_shape: tuple[int]): + model = SingleConvBlockWithDropout( + conv_in_channels=input_shape[1], perform_inplace_dropout=inplace_dropout + ).eval() + + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + quantized_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + + assert not graph_contains_any( + graph=quantized_program.graph, + condition=TestCloneConverter._node_is_clone, + ) + + input_data = (np.random.random(input_shape) * 50).astype(np.int8) + convert_run_compare( + exported_program, + tfl_model=tflite_flatbuffers_model, + tflite_input_preprocess=ToChannelLastPreprocess(), + tflite_output_preprocess=ToChannelFirstPreprocess(), + input_data=input_data, + atol=1.0, + ) + + @parameterized.expand( + list(itertools.product([True, False], [(1, 3, 128, 128), (1, 3, 256, 256)])) ) + def test_conv_dropout_no_quant( + self, inplace_dropout: bool, input_shape: tuple[int] + ): + model = SingleConvBlockWithDropout( + conv_in_channels=input_shape[1], perform_inplace_dropout=inplace_dropout + ).eval() + + edge_program = to_edge_program(model, 
input_shape).exported_program() + + has_clone = graph_contains_any_of_ops( + graph=edge_program.graph, + ops=[ + exir_ops.edge.aten.clone.default, + exir_ops.edge.dim_order_ops._clone_dim_order.default, + ], + ) + + # Clone with inplace=True should not produce clone edge op and vice versa + assert inplace_dropout ^ has_clone + + def test_clone_pool_view_copy_quant(self, input_shape: tuple[int] = (1, 64, 25, 5)): + model = KWSFinalBlock(input_shape).eval() + + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + quantized_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + + assert not graph_contains_any( + graph=quantized_program.graph, + condition=TestCloneConverter._node_is_clone, + ) + + input_data = (np.random.random(input_shape) * 50).astype(np.int8) + convert_run_compare( + exported_program, + tfl_model=tflite_flatbuffers_model, + tflite_input_preprocess=ToChannelLastPreprocess(), + input_data=input_data, + atol=1.0, + ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py index 745b26ef8ff..d7a59cad6d6 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py @@ -76,7 +76,18 @@ def test_conv1d_quant_conversion(stride, dilation, kernel_size, mocker): @pytest.mark.parametrize("stride", [1, 2]) @pytest.mark.parametrize("dilation", [2, 1]) -@pytest.mark.parametrize("kernel_size", [(1,), (3,)]) +@pytest.mark.parametrize( + "kernel_size", + [ + pytest.param( + (1,), + marks=pytest.mark.xfail( + reason="Regression in Neutron SW 2.1.x (AIR-13336)", strict=True + ), + ), + (3,), + ], +) @pytest.mark.parametrize("padding", [(1,), 2]) def 
test_conv1d_quant_conversion__padded( stride, dilation, kernel_size, padding, mocker @@ -179,7 +190,18 @@ def test_conv1d_quant_conversion__depthwise(stride, dilation, kernel_size, mocke @pytest.mark.parametrize("stride", [1, 2]) @pytest.mark.parametrize("dilation", [2, 1]) -@pytest.mark.parametrize("kernel_size", [(1,), (3,)]) +@pytest.mark.parametrize( + "kernel_size", + [ + pytest.param( + (1,), + marks=pytest.mark.xfail( + reason="Regression in Neutron SW 2.1.x (AIR-13336)", strict=True + ), + ), + (3,), + ], +) @pytest.mark.parametrize("padding", [(1,), 2]) def test_conv1d_quant_conversion__depthwise__padded( stride, dilation, kernel_size, padding, mocker diff --git a/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py index e17868d16e2..c4bc559817b 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py @@ -57,7 +57,7 @@ def test_relu6_quant(mocker, input_shape: tuple[int], inplace: bool): tflite_input_preprocess=ToNHWCPreprocess(), tflite_output_preprocess=ToNCHWPreprocess(), input_data=input_data, - atol=1.0, + atol=2.0, ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_linear_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_linear_converter.py deleted file mode 100644 index 858724522cd..00000000000 --- a/backends/nxp/tests/ir/converter/node_converter/test_linear_converter.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2024 NXP -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import numpy as np -import pytest -import torch - -from executorch.backends.nxp.tests.executorch_pipeline import to_edge_program -from executorch.backends.nxp.tests.executors import convert_run_compare -from executorch.backends.nxp.tests.models import LinearModule -from executorch.exir.dialects._ops import ops as exir_ops - - -@pytest.fixture(autouse=True) -def reseed_model_per_test_run(): - torch.manual_seed(23) - np.random.seed(23) - - -def test_linear_conversion__with_bias(): - input_shape = (10, 32) - edge_program = to_edge_program( - LinearModule(bias=True), input_shape - ).exported_program() - - input_data = np.random.random(input_shape).astype(np.float32) - - nodes = list(edge_program.graph.nodes) - assert nodes[4].target == exir_ops.edge.aten.addmm.default - assert len(nodes[4].args) == 3 # Has bias. - - convert_run_compare(edge_program, input_data=input_data) - - -def test_linear_conversion__without_bias(): - input_shape = (10, 32) - edge_program = to_edge_program( - LinearModule(bias=False), input_shape - ).exported_program() - - input_data = np.random.random(input_shape).astype(np.float32) - - nodes = list(edge_program.graph.nodes) - assert nodes[3].target == exir_ops.edge.aten.mm.default - assert len(nodes[3].args) == 2 # No bias. 
- - convert_run_compare(edge_program, input_data=input_data) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py index 0032eae5c1a..a634416f8a7 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py @@ -49,6 +49,7 @@ def test_mean_dim_conv_quant_conversion(mocker, input_shape, dim, keeepdim=True) input_data=input_data, tflite_output_preprocess=ToChannelFirstPreprocess(), tfl_model=tflite_flatbuffers_model, + atol=1.0, ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py new file mode 100644 index 00000000000..609c0f6c78c --- /dev/null +++ b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py @@ -0,0 +1,89 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import kgb +import numpy as np +import torch + +from executorch.backends.nxp.backend.edge_program_converter import ( + EdgeProgramToIRConverter, +) +from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executors import ( + convert_run_compare, + graph_contains_any_of_ops, +) +from executorch.backends.nxp.tests.models import LinearModule, MmModule +from executorch.exir.dialects._ops import ops as exir_ops +from torch.export import ExportedProgram + + +class TestMmConversion(unittest.TestCase): + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(42) + + def test_mm_conversion(self): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + input_shape = (1, 32) + model = MmModule(input_shape[1]) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. + assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=[exir_ops.edge.aten.mm.default] + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + ) + + def test_linear_conversion__without_bias(self): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + input_shape = (10, 32) + model = LinearModule(bias=False) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. 
+ assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=[exir_ops.edge.aten.mm.default] + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py new file mode 100644 index 00000000000..98566ff1ad6 --- /dev/null +++ b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py @@ -0,0 +1,175 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+import numpy as np +import pytest +import torch + +from executorch.backends.nxp.backend.edge_program_converter import ( + EdgeProgramToIRConverter, +) +from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executors import ( + convert_run_compare, + ToChannelFirstPreprocess, + ToChannelLastPreprocess, +) +from executorch.backends.nxp.tests.models import ( + SubTensorConvModule, + SubTensorModule, + SubTensorOneInputModule, +) +from executorch.exir.dialects._ops import ops as exir_ops +from torch.export import ExportedProgram + + +@pytest.fixture(autouse=True) +def reseed_model_per_test_run(): + torch.manual_seed(23) + np.random.seed(23) + + +@pytest.mark.parametrize( + "input_shape", + [ + pytest.param((4,), id="1D."), + pytest.param((6, 6), id="2D."), + pytest.param((1, 4, 8), id="3D."), + pytest.param((1, 4, 8, 8), id="4D."), + ], +) +def test_sub_tensor_quant_conversion(mocker, input_shape): + model = SubTensorModule() + + converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") + + # Run conversion + _ = to_quantized_edge_program(model, [input_shape, input_shape]) + + # Capture generated model + tflite_flatbuffers_model, io_formats = converter_spy.spy_return + + # Capture converted program + exported_program: ExportedProgram = converter_spy.call_args.args[1] + + input_data_1 = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + input_data_2 = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + input_data = {0: input_data_1, 1: input_data_2} + + nodes = list(exported_program.graph.nodes) + assert nodes[4].target == exir_ops.edge.aten.sub.Tensor + + convert_run_compare( + exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data + ) + + +@pytest.mark.parametrize( + "input_shape", + [ + pytest.param((4,), id="1D."), + pytest.param((6, 6), id="2D."), + pytest.param((1, 4, 8), id="3D."), + pytest.param((1, 4, 
8, 8), id="4D."), + ], +) +def test_sub_tensor_one_input_quant_conversion(mocker, input_shape): + model = SubTensorOneInputModule() + + converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") + + # Run conversion + _ = to_quantized_edge_program(model, input_shape) + + # Capture generated model + tflite_flatbuffers_model, io_formats = converter_spy.spy_return + + # Capture converted program + exported_program: ExportedProgram = converter_spy.call_args.args[1] + + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(np.int8) + + nodes = list(exported_program.graph.nodes) + assert nodes[2].target == exir_ops.edge.aten.sub.Tensor + + convert_run_compare( + exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data + ) + + +@pytest.mark.parametrize( + "x_input_shape", + [ + pytest.param((1, 4, 8, 8), id="4D."), + pytest.param((1, 4, 5, 5), id="4D, product of dims is not a multiple of 8."), + ], +) +def test_sub_tensor_w_conv_quant_conversion(mocker, x_input_shape): + model = SubTensorConvModule() + + converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") + + n, c, h, w = x_input_shape + y_input_shape = (n, 8, h, w) + + # Run conversion + _ = to_quantized_edge_program(model, [x_input_shape, y_input_shape]) + + # Capture generated model + tflite_flatbuffers_model, io_formats = converter_spy.spy_return + + # Capture converted program + exported_program: ExportedProgram = converter_spy.call_args.args[1] + + input_data_1 = (np.random.random(x_input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + input_data_2 = (np.random.random(y_input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + input_data = {0: input_data_1, 1: input_data_2} + + nodes = list(exported_program.graph.nodes) + assert nodes[15].target == exir_ops.edge.aten.sub.Tensor + + convert_run_compare( + exported_program, + input_data=input_data, + tflite_input_preprocess=ToChannelLastPreprocess(), + 
tfl_model=tflite_flatbuffers_model, + tflite_output_preprocess=ToChannelFirstPreprocess(), + ) + + +@pytest.mark.parametrize( + "x_input_shape, y_input_shape", + [ + pytest.param((1, 4, 7), (4, 7), id="3D -> 2D."), + pytest.param((1, 4, 8), (1, 4, 4, 8), id="3D -> 4D."), + pytest.param((1, 1, 4, 4, 8), (1, 4, 4, 8), id="5D -> 4D."), + pytest.param((4,), (4, 4), id="1D -> 2D."), + pytest.param((4,), (4, 4, 4), id="1D -> 3D."), + pytest.param((6, 6), (1, 8, 6, 6), id="2D -> 4D."), + pytest.param((6, 6), (6,), id="2D -> 1D."), + ], +) +def test_sub_tensor_broadcasting_unsupported_quant_conversion( + x_input_shape, y_input_shape +): + model = SubTensorModule() + + # Run conversion + edge_program = to_quantized_edge_program( + model, [x_input_shape, y_input_shape] + ).exported_program() + nodes = list(edge_program.graph.nodes) + + # Broadcast is not supported, node is not converted + assert ( + nodes[6].target == exir_ops.edge.aten.sub.Tensor + ) # Sub Tensor is not delegated. diff --git a/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py index 40857d18eb8..bb4500bc1e2 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py @@ -27,6 +27,11 @@ class TestTanhConverter(unittest.TestCase): __test__ = False # Prevent interfering with PyTest tests + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(23) + @parameterized.expand( input=[ ( @@ -76,10 +81,5 @@ def test_conv_tanh( tflite_input_preprocess=ToChannelLastPreprocess(), tflite_output_preprocess=ToChannelFirstPreprocess(), input_data=input_data, - atol=1.0, + atol=2.0, ) - - @classmethod - def setUpClass(cls): - torch.manual_seed(23) - np.random.seed(23) diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py index bdad9ddc4b4..f613349fed0 100644 --- a/backends/nxp/tests/models.py +++ 
b/backends/nxp/tests/models.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import math from typing import Callable, Collection, Union import torch @@ -169,6 +170,32 @@ def forward(self, x): return self.linear(x) +class AddmmModule(torch.nn.Module): + def __init__(self, in_channels: int): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(in_channels, in_channels)) + self.bias = torch.nn.Parameter(torch.empty(in_channels)) + torch.nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weight) + bound = 1 / math.sqrt(fan_in) + torch.nn.init.uniform_(self.bias, -bound, bound) + self.eval() + + def forward(self, x): + return torch.addmm(self.bias, x, self.weight) + + +class MmModule(torch.nn.Module): + def __init__(self, in_channels: int): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(in_channels, in_channels)) + torch.nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + self.eval() + + def forward(self, x): + return torch.mm(x, self.weight) + + class LinearSoftmaxModule(torch.nn.Module): def __init__(self): super().__init__() @@ -424,6 +451,34 @@ def forward(x): return x + x +class SubTensorModule(torch.nn.Module): + def __init__(self): + super().__init__() + + @staticmethod + def forward(x, y): + return x - y + + +class SubTensorConvModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = Conv2dModule(padding=1, stride=1) + + def forward(self, x, y): + x = self.conv(x) + return x - y + + +class SubTensorOneInputModule(torch.nn.Module): + def __init__(self): + super().__init__() + + @staticmethod + def forward(x): + return x - x + + class MeanDimLinearModule(torch.nn.Module): def __init__(self, dim, keepdim): super().__init__() diff --git a/backends/nxp/tests/test_batch_norm_fusion.py b/backends/nxp/tests/test_batch_norm_fusion.py index 
3f1106c6d24..788d04c6dad 100644 --- a/backends/nxp/tests/test_batch_norm_fusion.py +++ b/backends/nxp/tests/test_batch_norm_fusion.py @@ -168,7 +168,7 @@ def test_batch_norm_conv_fusing__full_pipeline__1d(bias: bool): nodes = list(edge_program.graph.nodes) assert ( - len(nodes) == 13 + len(nodes) == 17 ) # 1D Conv currently isn't delegated, because it doesn't get quantized. assert not any( node.op == "call_function" and "batch_norm" in node.target.__name__ diff --git a/backends/nxp/tests/test_context_sensitive_delegation.py b/backends/nxp/tests/test_context_sensitive_delegation.py new file mode 100644 index 00000000000..1919bc63d82 --- /dev/null +++ b/backends/nxp/tests/test_context_sensitive_delegation.py @@ -0,0 +1,71 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import numpy as np +import torch + +from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import ( + ViewCopyConverter, +) +from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops +from executorch.exir.dialects._ops import ops as exir_ops + + +class SingleViewCopyModule(torch.nn.Module): + def __init__(self, new_shape: list[int]): + super().__init__() + self.new_shape = new_shape + + def forward(self, x): + return torch.reshape(x, self.new_shape) + + +class TestContextSensitiveDelegation(unittest.TestCase): + __test__ = False # Prevent interfering with PyTest tests. + + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(42) + + def test_single_view_copy_partition(self): + input_shape = (2, 10) + module = SingleViewCopyModule([1, 20]) + + ep = to_quantized_edge_program(module, input_shape).exported_program() + + # Make sure the `view_copy` was not delegated. 
+ assert graph_contains_any_of_ops( + ep.graph, [exir_ops.edge.aten.view_copy.default] + ) + assert not any("delegate" in n.name for n in ep.graph.nodes) + + def test_single_view_copy_partition__forced_delegation(self): + input_shape = (2, 10) + module = SingleViewCopyModule([1, 20]) + + def _supported_partitioning(*_): + return True + + # Replace the partition support check function, to accept anything. + original_supports_partitioning_result = ( + ViewCopyConverter.supports_partitioning_result + ) + ViewCopyConverter.supports_partitioning_result = _supported_partitioning + + with self.assertRaises(RuntimeError) as e: + to_quantized_edge_program(module, input_shape).exported_program() + assert ( + str(e.exception) + == "Model converted with neutron-converter does not contain a NeutronGraph node." + ) + + # Return to the original partition support check function. + ViewCopyConverter.supports_partitioning_result = ( + original_supports_partitioning_result + ) diff --git a/backends/nxp/tests/test_linear_and_add_fusion.py b/backends/nxp/tests/test_linear_and_add_fusion.py new file mode 100644 index 00000000000..16d3c4140a2 --- /dev/null +++ b/backends/nxp/tests/test_linear_and_add_fusion.py @@ -0,0 +1,644 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest +from copy import deepcopy + +import numpy as np +import torch + +from executorch.backends.nxp.aten_passes.fuse_linear_and_add_pass import ( + FuseLinearAndAddPass, +) +from executorch.backends.nxp.aten_passes.neutron_aten_pass_manager import ( + NeutronAtenPassManager, +) +from executorch.backends.nxp.aten_passes.remove_nodes_with_known_outputs import ( + RemoveNodesWithKnownOutputs, +) +from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops +from parameterized import parameterized + + +class LinearAddModule(torch.nn.Module): + def __init__( + self, + fc_in_features: int, + fc_out_features: int, + bias: bool, + artificial_bias_shape: list[int], + alpha=1.0, + ): + super().__init__() + self.fc_in_features = fc_in_features + self.fc_out_features = fc_out_features + self.bias = bias + self.artificial_bias_shape = artificial_bias_shape + self.alpha = alpha + self.linear = torch.nn.Linear(fc_in_features, fc_out_features, bias=bias) + self.eval() + + def forward(self, x): + artificial_bias = torch.ones(self.artificial_bias_shape, dtype=torch.float32) + x = self.linear(x) + return torch.add(x, artificial_bias, alpha=self.alpha) + + +class LinearAddModuleReverseNodeOrder(torch.nn.Module): + """The `ones` added by the `add` are only generated after the `linear` node.""" + + def __init__( + self, + fc_in_features: int, + fc_out_features: int, + bias: bool, + artificial_bias_shape: list[int], + ): + super().__init__() + self.fc_in_features = fc_in_features + self.fc_out_features = fc_out_features + self.bias = bias + self.artificial_bias_shape = artificial_bias_shape + self.linear = torch.nn.Linear(fc_in_features, fc_out_features, bias=bias) + self.eval() + + def forward(self, x): + # The `ones` are generated after the `linear` call. 
+ x = self.linear(x) + artificial_bias = torch.ones(self.artificial_bias_shape, dtype=torch.float32) + return torch.add(x, artificial_bias) + + +class LinearAddModuleReverseInputOrder(torch.nn.Module): + """The `add` has the output of the `linear` as its second input (which is the input multiplied by `alpha`).""" + + def __init__( + self, + fc_in_features: int, + fc_out_features: int, + bias: bool, + artificial_bias_shape: list[int], + alpha=1.0, + ): + super().__init__() + self.fc_in_features = fc_in_features + self.fc_out_features = fc_out_features + self.bias = bias + self.artificial_bias_shape = artificial_bias_shape + self.alpha = alpha + self.linear = torch.nn.Linear(fc_in_features, fc_out_features, bias=bias) + self.eval() + + def forward(self, x): + artificial_bias = torch.ones(self.artificial_bias_shape, dtype=torch.float32) + x = self.linear(x) + return torch.add(artificial_bias, x, alpha=self.alpha) # Reversed input order. + + +class TestLinearAndAddFusing(unittest.TestCase): + __test__ = False # Prevent interfering with PyTest tests. + + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(42) + + @parameterized.expand( + [ + ["2D", [4, 6]], + ["4D", [4, 6, 8, 10]], + ] + ) + def test_linear_add_fusing__static__no_bias__valid_shape( + self, _, input_shape: list[int] + ): + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, False, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. 
+ original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[3].target == torch.ops.aten.linear.default + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # The `add` has been removed. + assert len(modified_nodes) == 5 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert len(modified_nodes[3].args) == 3 + assert "ones" in modified_nodes[3].args[2].name + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.add.Tensor] + ) + + # Verify that the behavior has not changed. + input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + @parameterized.expand( + [ + ["2D", [8, 10]], + ] + ) + def test_linear_add_fusing__static__no_bias__invalid_shape( + self, _, input_shape: list[int] + ): + example_input = (torch.ones(input_shape),) + + module = LinearAddModule( + input_shape[-1], 5, False, [8, 5] # Unsupported `linear` bias shape. + ) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[3].target == torch.ops.aten.linear.default + assert len(original_nodes[3].args) == 2 + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # Nothing changed. 
+ assert len(modified_nodes) == 6 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert modified_nodes[4].target == torch.ops.aten.add.Tensor + + # Verify that the behavior has not changed. + input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + @parameterized.expand( + [ + ["2D", [4, 6]], + ["4D", [2, 3, 4, 5]], + ] + ) + def test_linear_add_fusing__static__bias__valid_shape( + self, _, input_shape: list[int] + ): + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, True, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 7 + assert original_nodes[3].target == torch.ops.aten.ones.default + assert original_nodes[4].target == torch.ops.aten.linear.default + assert len(original_nodes[4].args) == 3 + assert original_nodes[5].target == torch.ops.aten.add.Tensor + + # make sure the `add` and the `ones` were removed. + assert len(modified_nodes) == 5 + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.ones.default] + ) + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert len(modified_nodes[3].args) == 3 + assert "combined" in modified_nodes[3].args[2].name + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.add.Tensor] + ) + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__static__no_bias__reverse_order(self): + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + # Use a module where the `bias` is generated after the `linear` node, which prevents the change. + module = LinearAddModuleReverseNodeOrder(input_shape[-1], 5, False, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[2].target == torch.ops.aten.linear.default + assert len(original_nodes[2].args) == 2 + assert ( + original_nodes[3].target == torch.ops.aten.ones.default + ) # `ones` after `linear`. + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # The `add` has been removed. + assert len(modified_nodes) == 5 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert len(modified_nodes[3].args) == 3 + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.add.Tensor] + ) + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__static__bias__reverse_order(self): + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + # Use a module where the `bias` is generated after the `linear` node, which prevents the change. + module = LinearAddModuleReverseNodeOrder(input_shape[-1], 5, True, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 7 + assert original_nodes[3].target == torch.ops.aten.linear.default + assert len(original_nodes[3].args) == 3 + assert ( + original_nodes[4].target == torch.ops.aten.ones.default + ) # `ones` after `linear`. + assert original_nodes[5].target == torch.ops.aten.add.Tensor + + # The `add` and `ones` have been removed. + assert len(modified_nodes) == 5 + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.ones.default] + ) + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert len(modified_nodes[3].args) == 3 + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.add.Tensor] + ) + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__static__alpha__no_bias(self): + alpha = 2.34 + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, False, [5], alpha=alpha) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[2].target == torch.ops.aten.ones.default + assert original_nodes[3].target == torch.ops.aten.linear.default + assert len(original_nodes[3].args) == 2 + assert original_nodes[4].target == torch.ops.aten.add.Tensor + assert original_nodes[4].kwargs["alpha"] == alpha + + # The `add` has been removed. + assert len(modified_nodes) == 5 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert len(modified_nodes[3].args) == 3 + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.add.Tensor] + ) + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__static__alpha__bias(self): + alpha = 2.34 + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, True, [5], alpha=alpha) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 7 + assert original_nodes[3].target == torch.ops.aten.ones.default + assert original_nodes[4].target == torch.ops.aten.linear.default + assert len(original_nodes[4].args) == 3 + assert original_nodes[5].target == torch.ops.aten.add.Tensor + assert original_nodes[5].kwargs["alpha"] == alpha + + # The `add` has been removed. + assert len(modified_nodes) == 5 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert len(modified_nodes[3].args) == 3 + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.add.Tensor] + ) + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__static__alpha__reversed_add_inputs(self): + alpha = 2.34 + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + module = LinearAddModuleReverseInputOrder( + input_shape[-1], 5, True, [5], alpha=alpha + ) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 7 + assert original_nodes[3].target == torch.ops.aten.ones.default + assert original_nodes[4].target == torch.ops.aten.linear.default + assert len(original_nodes[4].args) == 3 + assert original_nodes[5].target == torch.ops.aten.add.Tensor + assert ( + original_nodes[5].args[1] == original_nodes[4] + ) # `linear` is the second input. + assert original_nodes[5].kwargs["alpha"] == alpha + + # Nothing changed (except the `ones` was replaced by static data). + assert len(modified_nodes) == 7 + assert modified_nodes[4].target == torch.ops.aten.linear.default + assert len(modified_nodes[4].args) == 3 + assert modified_nodes[5].target == torch.ops.aten.add.Tensor + assert ( + modified_nodes[5].args[1] == modified_nodes[4] + ) # `linear` is the second input. + assert modified_nodes[5].kwargs["alpha"] == alpha + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + @parameterized.expand( + [ + ["2D", [4, 6]], + ] + ) + def test_linear_add_fusing__dynamic__no_bias__valid_shape( + self, _, input_shape: list[int] + ): + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, False, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( + deepcopy(program.module()) + ).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[3].target == torch.ops.aten.linear.default + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # Nothing changed. + assert len(modified_nodes) == 6 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert modified_nodes[4].target == torch.ops.aten.add.Tensor + + # Verify that the behavior has not changed. + input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + @parameterized.expand( + [ + ["2D", [8, 10]], + ] + ) + def test_linear_add_fusing__dynamic__no_bias__invalid_shape( + self, _, input_shape: list[int] + ): + example_input = (torch.ones(input_shape),) + + module = LinearAddModule( + input_shape[-1], 5, False, [8, 5] # Unsupported `linear` bias shape. 
+ ) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( + deepcopy(program.module()) + ).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[3].target == torch.ops.aten.linear.default + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # Nothing changed. + assert len(modified_nodes) == 6 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert modified_nodes[4].target == torch.ops.aten.add.Tensor + + # Verify that the behavior has not changed. + input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + @parameterized.expand( + [ + ["2D", [4, 6]], + ] + ) + def test_linear_add_fusing__dynamic__bias__valid_shape( + self, _, input_shape: list[int] + ): + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, True, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( + deepcopy(program.module()) + ).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 7 + assert original_nodes[3].target == torch.ops.aten.ones.default + assert original_nodes[4].target == torch.ops.aten.linear.default + assert original_nodes[5].target == torch.ops.aten.add.Tensor + + # Nothing has changed, as the second bias is dynamic, so it cannot be added together with the first bias. 
+ assert len(modified_nodes) == 7 + assert modified_nodes[3].target == torch.ops.aten.ones.default + assert modified_nodes[4].target == torch.ops.aten.linear.default + assert modified_nodes[5].target == torch.ops.aten.add.Tensor + + # Verify that the behavior has not changed. + input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__dynamic__reverse_order(self): + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + # Use a module where the `bias` is generated after the `linear` node, which prevents the change. + module = LinearAddModuleReverseNodeOrder(input_shape[-1], 5, False, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( + deepcopy(program.module()) + ).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[2].target == torch.ops.aten.linear.default + assert original_nodes[3].target == torch.ops.aten.ones.default + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # Nothing has changed. + assert len(modified_nodes) == 6 + assert modified_nodes[2].target == torch.ops.aten.linear.default + assert modified_nodes[3].target == torch.ops.aten.ones.default + assert modified_nodes[4].target == torch.ops.aten.add.Tensor + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__dynamic__alpha(self): + alpha = 2.34 + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, False, [5], alpha=alpha) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( + deepcopy(program.module()) + ).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[2].target == torch.ops.aten.ones.default + assert original_nodes[3].target == torch.ops.aten.linear.default + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # Nothing has changed. + assert len(modified_nodes) == 6 + assert modified_nodes[2].target == torch.ops.aten.ones.default + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert modified_nodes[4].target == torch.ops.aten.add.Tensor + + # Verify that the behavior has not changed. + input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) diff --git a/backends/nxp/tests/test_neutron_backend.py b/backends/nxp/tests/test_neutron_backend.py index 53e54ec2f56..c9917651fbd 100644 --- a/backends/nxp/tests/test_neutron_backend.py +++ b/backends/nxp/tests/test_neutron_backend.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
diff --git a/backends/nxp/tests/test_neutron_converter_manager.py b/backends/nxp/tests/test_neutron_converter_manager.py index af723ec9c7a..2fcfd8cd987 100644 --- a/backends/nxp/tests/test_neutron_converter_manager.py +++ b/backends/nxp/tests/test_neutron_converter_manager.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -29,9 +29,7 @@ def test_conv2d_neutron_conversion__default_flavor(): ) neutron_converter_manager = NeutronConverterManager() - neutron_model = neutron_converter_manager.convert( - tflite_model, "imxrt700", "SDK_25_06" - ) + neutron_model = neutron_converter_manager.convert(tflite_model, "imxrt700") assert len( neutron_model @@ -50,9 +48,8 @@ def test__conv2d_neutron_conversion__invalid_flavor(): edge_program_manager.exported_program() ) - neutron_converter_manager = NeutronConverterManager() with pytest.raises(RuntimeError) as excinfo: - _ = neutron_converter_manager.convert(tflite_model, "imxrt700", "bad_flavor") + _ = NeutronConverterManager("bad_flavor").convert(tflite_model, "imxrt700") assert "Neutron Converter module with flavor 'bad_flavor' not found." in str( excinfo diff --git a/backends/nxp/tests/test_per_channel_conversion.py b/backends/nxp/tests/test_per_channel_conversion.py new file mode 100644 index 00000000000..043ba8fc001 --- /dev/null +++ b/backends/nxp/tests/test_per_channel_conversion.py @@ -0,0 +1,153 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import kgb +import numpy as np +import torch + +from executorch.backends.nxp.backend.edge_program_converter import ( + EdgeProgramToIRConverter, +) +from executorch.backends.nxp.quantizer.neutron_quantizer import ( + act_qspec, + NeutronAtenQuantizer, + wgt_qspec, +) +from executorch.backends.nxp.quantizer.patterns import ( + NodeArgsIdx, + PartitionAnchors, + QuantizationPattern, +) +from executorch.backends.nxp.quantizer.utils import get_bias_qparams +from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executors import ( + convert_run_compare, + ToChannelFirstPreprocess, + ToChannelLastPreprocess, +) +from executorch.backends.nxp.tests.models import Conv2dModule +from executorch.backends.nxp.tests.test_quantizer import _get_target_name + +from torch import fx +from torch._ops import OpOverload +from torch.export import ExportedProgram +from torchao.quantization.pt2e import MinMaxObserver, PerChannelMinMaxObserver +from torchao.quantization.pt2e.quantizer import ( + DerivedQuantizationSpec, + QuantizationConfig, + QuantizationSpec, +) + + +class Conv2dPatternPerChannel(QuantizationPattern): + + def __init__(self, is_per_channel: bool): + super().__init__() + self.is_per_channel = is_per_channel + + def partition_types(self) -> list[OpOverload]: + return [torch.ops.aten.conv2d.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] + ) -> PartitionAnchors: + conv2d_node = fused_partition[0].nodes[-1] + + bias_qscheme = ( + torch.per_channel_symmetric + if self.is_per_channel + else torch.per_tensor_symmetric + ) + bias_quantization_qspec = DerivedQuantizationSpec( + derived_from=[ + (conv2d_node.args[0], conv2d_node), + (conv2d_node.args[1], conv2d_node), + ], + derive_qparams_fn=get_bias_qparams, + dtype=torch.int32, + quant_min=-(2**31) + 1, + quant_max=2**31 - 1, + qscheme=bias_qscheme, + ch_axis=0, + ) + + weight_qscheme = ( + 
torch.per_channel_symmetric + if self.is_per_channel + else torch.per_tensor_symmetric + ) + weight_observer_or_fake_quant_ctr = ( + PerChannelMinMaxObserver if self.is_per_channel else MinMaxObserver + ) + weight_quantization_spec = QuantizationSpec( + dtype=torch.int8, + observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr, + quant_min=-127, + quant_max=127, + qscheme=weight_qscheme, + ch_axis=0, + ) + + return PartitionAnchors( + inputs=[(conv2d_node, NodeArgsIdx(0))], + weights=[(conv2d_node, NodeArgsIdx(1), weight_quantization_spec)], + biases=[(conv2d_node, NodeArgsIdx(2), bias_quantization_qspec)], + output=[(conv2d_node,)], + ) + + +class TestPerChannelConversion(unittest.TestCase): + __test__ = False # Prevent interfering with PyTest tests + + @classmethod + def setUpClass(cls): + torch.manual_seed(25) + np.random.seed(25) + + def test_per_channel_convolution(self): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + model = Conv2dModule( + in_channels=8, out_channels=32, kernel_size=5, padding=3 + ) + input_shape = (1, 8, 32, 32) + + static_qconfig = QuantizationConfig(act_qspec, act_qspec, wgt_qspec, None) + _ = to_quantized_edge_program( + model, + input_shape, + get_quantizer_fn=lambda: NeutronAtenQuantizer( + Conv2dPatternPerChannel(is_per_channel=True), static_qconfig + ), + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + + convert_run_compare( + exported_program, + tflite_input_preprocess=ToChannelLastPreprocess(), + tfl_model=tflite_flatbuffers_model, + tflite_output_preprocess=ToChannelFirstPreprocess(), + input_data=input_data, + atol=1.0, + ) + + nodes = list(exported_program.graph.nodes) + + assert _get_target_name(nodes[8]).endswith( + 
"quantized_decomposed.dequantize_per_channel.default" + ) + assert _get_target_name(nodes[9]).endswith( + "quantized_decomposed.dequantize_per_channel.default" + ) + assert nodes[10].name == "aten_convolution_default" diff --git a/backends/nxp/tests/test_qdq_clustering_conv.py b/backends/nxp/tests/test_qdq_clustering_conv.py index 1713aace1fe..ffae931dbb4 100644 --- a/backends/nxp/tests/test_qdq_clustering_conv.py +++ b/backends/nxp/tests/test_qdq_clustering_conv.py @@ -16,13 +16,13 @@ def test_conv2d_partitioner(): lowered_module = edge_program.exported_program().graph_module.lowered_module_0 nodes = list(lowered_module.original_module.graph.nodes) - assert len(nodes) == 7 + assert len(nodes) == 9 - q_x_node = nodes[1] - dq_w_node = nodes[2] - dq_x_node = nodes[3] - conv_node = nodes[4] - q_y_node = nodes[5] + q_x_node = nodes[3] + dq_w_node = nodes[4] + dq_x_node = nodes[5] + conv_node = nodes[6] + q_y_node = nodes[7] assert "cluster" not in q_x_node.meta assert dq_w_node.meta["cluster"] == "aten_convolution_default_cluster" diff --git a/backends/nxp/tests/test_quantizer.py b/backends/nxp/tests/test_quantizer.py index ef5fbb0cbca..624e350ed21 100644 --- a/backends/nxp/tests/test_quantizer.py +++ b/backends/nxp/tests/test_quantizer.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -34,26 +34,26 @@ def test_quantizer_conv2d(): m(*example_input) nodes = list(m.graph.nodes) - assert len(nodes) == 11 - assert nodes[7].name == "conv2d" + assert len(nodes) == 15 + assert nodes[11].name == "conv2d" # [0]: Input, [1] : weights, [2]: bias assert ( - _get_target_name(nodes[7].args[0]) + _get_target_name(nodes[11].args[0]) == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" ) assert ( - _get_target_name(nodes[7].args[1]) - == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + _get_target_name(nodes[11].args[1]) + == "torch.ops.quantized_decomposed.dequantize_per_channel.default" ) assert ( - _get_target_name(nodes[7].args[2]) - == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + _get_target_name(nodes[11].args[2]) + == "torch.ops.quantized_decomposed.dequantize_per_channel.default" ) assert ( - _get_target_name(nodes[8]) + _get_target_name(nodes[12]) == "torch.ops.quantized_decomposed.quantize_per_tensor.default" ) - assert nodes[8].args[0].name == "conv2d" + assert nodes[12].args[0].name == "conv2d" def test_quantizer_linear(): @@ -112,22 +112,22 @@ def test_quantizer_maxpool2d(): m(*example_input) nodes = list(m.graph.nodes) - assert len(nodes) == 14 + assert len(nodes) == 18 # Check if QDQ pattern: - assert nodes[10].name == "max_pool2d" + assert nodes[14].name == "max_pool2d" assert ( - _get_target_name(nodes[10].args[0]) + _get_target_name(nodes[14].args[0]) == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" ) assert ( - _get_target_name(nodes[11]) + _get_target_name(nodes[15]) == "torch.ops.quantized_decomposed.quantize_per_tensor.default" ) - assert nodes[11].args[0].name == "max_pool2d" + assert nodes[15].args[0].name == "max_pool2d" # Check if input and output quantization is same - input_quant = nodes[10].args[0].args[1:] - output_quant = nodes[11].args[1:] + input_quant = nodes[14].args[0].args[1:] + output_quant = nodes[15].args[1:] assert input_quant == output_quant @@ -207,10 
+207,10 @@ def test_quantizer_conv2d_relu(): m(*example_input) nodes = list(m.graph.nodes) - assert len(nodes) == 12 - assert nodes[7].name == "dequantize_per_tensor_default_2" - assert nodes[8].name == "relu" - assert nodes[9].name == "quantize_per_tensor_default_3" + assert len(nodes) == 14 + assert nodes[9].name == "dequantize_per_tensor_default_1" + assert nodes[10].name == "relu" + assert nodes[11].name == "quantize_per_tensor_default_2" def test_quantizer_conv2d_avg_pool2d(): @@ -230,10 +230,10 @@ def test_quantizer_conv2d_avg_pool2d(): m(*example_input) nodes = list(m.graph.nodes) - assert len(nodes) == 14 - assert nodes[9].name == "dequantize_per_tensor_default_3" - assert nodes[10].name == "avg_pool2d" - assert nodes[11].name == "quantize_per_tensor_default_4" + assert len(nodes) == 18 + assert nodes[13].name == "dequantize_per_tensor_default_1" + assert nodes[14].name == "avg_pool2d" + assert nodes[15].name == "quantize_per_tensor_default_2" def test_quantizer_conv2d_permute(): @@ -253,10 +253,11 @@ def test_quantizer_conv2d_permute(): m(*example_input) nodes = list(m.graph.nodes) - assert len(nodes) == 12 - assert nodes[7].name == "dequantize_per_tensor_default_2" - assert nodes[8].name == "permute" - assert nodes[9].name == "quantize_per_tensor_default_3" + + assert len(nodes) == 14 + assert nodes[9].name == "dequantize_per_tensor_default_1" + assert nodes[10].name == "permute" + assert nodes[11].name == "quantize_per_tensor_default_2" def test_multiple_shared_spec_ops_in_row(): @@ -281,15 +282,15 @@ def test_multiple_shared_spec_ops_in_row(): nodes = list(m.graph.nodes) - assert len(nodes) == 15 - assert nodes[-5].name == "dequantize_per_tensor_default_3" + assert len(nodes) == 17 + assert nodes[-5].name.startswith("dequantize_per_tensor_default") assert nodes[-4].name == "max_pool2d" - assert nodes[-3].name == "quantize_per_tensor_default_4" + assert nodes[-3].name.startswith("quantize_per_tensor_default") # Assert that post-ReLU quantize and 
pre-MaxPool dequantize has same specs assert nodes[-6].args[1:] == nodes[-5].args[1:] # Assert that post-Conv quantize and pre-ReLU dequantize has same specs - assert nodes[6].args[1:] == nodes[7].args[1:] + assert nodes[5].args[1:] == nodes[6].args[1:] def test_quantizers_order_invariance(): diff --git a/backends/nxp/tests/test_removing_dead_code.py b/backends/nxp/tests/test_removing_dead_code.py index 7b8641fb247..cc51746c81c 100644 --- a/backends/nxp/tests/test_removing_dead_code.py +++ b/backends/nxp/tests/test_removing_dead_code.py @@ -9,6 +9,7 @@ import pytest import torch +from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer from executorch.backends.nxp.tests.executorch_pipeline import _quantize_model from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops @@ -32,6 +33,11 @@ def forward(self, x): class TestRemovingDeadCode(unittest.TestCase): __test__ = False # Prevent interfering with PyTest tests + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(23) + def test_removing_dead_code(self): input_shape = (42,) example_inputs = (torch.ones(input_shape),) @@ -45,16 +51,12 @@ def test_removing_dead_code(self): ) # The `NeutronQuantizer` should remove the dead code in the `transform_for_annotation()` method. + quantizer = NeutronQuantizer() exir_program_aten_quant = _quantize_model( - exir_program_aten.module(), [example_inputs] + exir_program_aten.module(), quantizer, [example_inputs] ) # Make sure the is no `add` operation in the graph anymore. 
assert not any( "add" in str(node.target) for node in exir_program_aten_quant.graph.nodes ) - - @classmethod - def setUpClass(cls): - torch.manual_seed(23) - np.random.seed(23) diff --git a/backends/nxp/tests/test_split_group_convolution.py b/backends/nxp/tests/test_split_group_convolution.py index 1da53af794d..4c9f277e34d 100644 --- a/backends/nxp/tests/test_split_group_convolution.py +++ b/backends/nxp/tests/test_split_group_convolution.py @@ -17,6 +17,7 @@ ) from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec +from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer from executorch.backends.nxp.tests.executorch_pipeline import ( _quantize_model, get_random_calibration_inputs, @@ -39,8 +40,11 @@ def _quantize_and_lower_module( module: GraphModule, input_shape: tuple[int, ...], target="imxrt700" ) -> EdgeProgramManager: calibration_inputs = get_random_calibration_inputs(to_model_input_spec(input_shape)) + quantizer = NeutronQuantizer() - exir_program_aten__module_quant = _quantize_model(module, calibration_inputs) + exir_program_aten__module_quant = _quantize_model( + module, quantizer, calibration_inputs + ) edge_compile_config = EdgeCompileConfig(_check_ir_validity=False) edge_program_manager = export_to_edge( @@ -49,7 +53,7 @@ def _quantize_and_lower_module( edge_compile_config=edge_compile_config, ) - compile_spec = generate_neutron_compile_spec(target, "SDK_25_06") + compile_spec = generate_neutron_compile_spec(target, "SDK_25_09") partitioner = NeutronPartitioner(compile_spec) return edge_program_manager.to_backend(partitioner) @@ -106,7 +110,7 @@ def test_split_group_convolution__2d(self, _, input_shape: list[int], group: int input_data = torch.randn(input_shape, dtype=torch.float32) out1 = original_module(input_data).detach().numpy() out2 = modified_module(input_data).detach().numpy() - assert np.allclose(out1, out2, atol=2.0e-7) + 
assert np.allclose(out1, out2, atol=2.0e-7, rtol=1.9e-4) # Make sure the graph can be correctly quantized and lowered to edge. ep = _quantize_and_lower_module( diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt index 4d32d8932c2..736ed6d8603 100644 --- a/backends/openvino/CMakeLists.txt +++ b/backends/openvino/CMakeLists.txt @@ -53,35 +53,11 @@ target_sources( executorch_target_link_options_shared_lib(openvino_backend) -if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER) - # Build executor runner binary for openvino backend - list(APPEND openvino_executor_runner_libs openvino_backend executorch) - - set(_openvino_executor_runner__srcs - ${EXECUTORCH_ROOT}/examples/portable/executor_runner/executor_runner.cpp - ${EXECUTORCH_ROOT}/extension/data_loader/file_data_loader.cpp - ${EXECUTORCH_ROOT}/extension/evalue_util/print_evalue.cpp - ${EXECUTORCH_ROOT}/extension/runner_util/inputs.cpp - ${EXECUTORCH_ROOT}/extension/runner_util/inputs_portable.cpp - ) - add_executable(openvino_executor_runner ${_openvino_executor_runner__srcs}) - - list(APPEND openvino_executor_runner_libs) - - target_link_libraries( - openvino_executor_runner gflags portable_ops_lib - ${openvino_executor_runner_libs} - ) - target_compile_options( - openvino_executor_runner PUBLIC ${_common_compile_options} - ) -endif() - # Install OpenVINO backend library to the lib directory install( TARGETS openvino_backend EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) diff --git a/backends/openvino/README.md b/backends/openvino/README.md index 0046ad23486..5ce38ade56f 100644 --- a/backends/openvino/README.md +++ b/backends/openvino/README.md @@ -105,7 +105,7 @@ Follow the steps below to setup your build environment: ```bash ./openvino_build.sh --enable_python ``` - **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` flag to build the C++ runtime 
libraries as shown in the below command. The compiled libraries files and binaries can be found in the `/cmake-out` directory. The binary located at `/cmake-out/backends/openvino/openvino_executor_runner` can be used to run inference with vision models. + **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` flag to build the C++ runtime libraries as shown in the below command. The compiled libraries files and binaries can be found in the `/cmake-out` directory. The binary located at `/cmake-out/executor_runner` can be used to run inference with vision models. ```bash ./openvino_build.sh --cpp_runtime ``` diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index 00107959412..0d407e33f6e 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -36,6 +36,7 @@ def __init__(self): class OpenvinoOperatorsSupport(OperatorSupportBase): extended_support_dict = { "torch.ops.dim_order_ops._clone_dim_order.default": None, + "torch.ops.dim_order_ops._to_dim_order_copy.default": None, } def __init__( diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py index 72c781c0fb3..691115f6579 100644 --- a/backends/openvino/preprocess.py +++ b/backends/openvino/preprocess.py @@ -8,13 +8,14 @@ from typing import final, List -from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform from executorch.exir.backend.backend_details import ( BackendDetails, ExportedProgram, PreprocessResult, ) from executorch.exir.backend.compile_spec_schema import CompileSpec + +from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass from openvino.frontend.pytorch.torchdynamo.compile import ( # type: ignore[import-untyped] openvino_compile, ) @@ -37,8 +38,7 @@ def preprocess( Returns: PreprocessResult: The result of preprocessing, including the compiled model bytes. 
""" - # Apply RemoveCloneOpsTransform to eliminate unnecessary clone operations - transformed_ep = RemoveCloneOpsTransform()(edge_program.graph_module) + transformed_ep = DimOrderOpsRevertPass()(edge_program.graph_module) # Update the edge_program with the transformed graph if transformed_ep and transformed_ep.graph_module: diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh index b7e5f5270ab..6d7853b96e5 100755 --- a/backends/openvino/scripts/openvino_build.sh +++ b/backends/openvino/scripts/openvino_build.sh @@ -30,10 +30,11 @@ build_cpp_runtime() { -DEXECUTORCH_BUILD_OPENVINO=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index 32105597260..07166b92ea2 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -214,7 +214,7 @@ add_subdirectory( install( TARGETS qnn_executorch_backend EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} ) # QNN pybind diff --git a/backends/qualcomm/__init__.py b/backends/qualcomm/__init__.py index 04ba5fcf24b..5770dfb0fcd 100644 --- a/backends/qualcomm/__init__.py +++ b/backends/qualcomm/__init__.py @@ -1,23 +1,13 @@ import os -from .scripts.download_qnn_sdk import ( - check_glibc_exist_and_validate, - install_qnn_sdk, - is_linux_x86, -) +from .scripts.download_qnn_sdk import install_qnn_sdk, is_linux_x86 env_flag = os.getenv("EXECUTORCH_BUILDING_WHEEL", "0").lower() # If users have preinstalled 
QNN_SDK_ROOT, we will use it. qnn_sdk_root_flag = os.getenv("QNN_SDK_ROOT", None) -if ( - env_flag not in ("1", "true", "yes") - and not qnn_sdk_root_flag - and is_linux_x86() - and check_glibc_exist_and_validate() -): +if env_flag not in ("1", "true", "yes") and not qnn_sdk_root_flag and is_linux_x86(): ok = install_qnn_sdk() - if not ok: raise RuntimeError("Failed to install QNN SDK. Please check the logs above.") diff --git a/backends/qualcomm/_passes/TARGETS b/backends/qualcomm/_passes/TARGETS index 62a0fc43a78..876b51d3863 100644 --- a/backends/qualcomm/_passes/TARGETS +++ b/backends/qualcomm/_passes/TARGETS @@ -15,5 +15,6 @@ runtime.python_library( "//executorch/backends/transforms:decompose_sdpa", "//executorch/exir/backend:backend_details", "//executorch/exir/backend:compile_spec_schema", + "//executorch/backends/qualcomm/quantizer:quantizer", ], ) diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py index 15fce79ea12..154a360689e 100644 --- a/backends/qualcomm/_passes/__init__.py +++ b/backends/qualcomm/_passes/__init__.py @@ -13,14 +13,18 @@ from .convert_linear_to_conv2d import ConvertLinearToConv2d from .convert_square_to_pow import ConvertSquareToPow from .decompose_any import DecomposeAny +from .decompose_binary_alpha import DecomposeBinaryAlpha from .decompose_cdist import DecomposeCDist from .decompose_col_im import DecomposeColIm from .decompose_einsum import DecomposeEinsum from .decompose_expm1 import DecomposeExpM1 +from .decompose_floor_divide import DecomposeFloorDivide +from .decompose_glu import DecomposeGlu from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm from .decompose_minmaxdim import DecomposeMinMaxDim from .decompose_roll import DecomposeRoll from .decompose_silu import DecomposeSilu +from .decompose_threshold import DecomposeThreshold from .decompose_wrap_with_autocast import DecomposeWrapWithAutocast from .expand_broadcast_tensor_shape import ExpandBroadcastTensorShape from 
.fixed_linear_keep_dim import FixedLinearKeepDim @@ -30,6 +34,7 @@ from .i64_to_i32 import I64toI32 from .insert_io_qdq import InsertIOQDQ from .insert_requantize import InsertRequantize +from .insert_reshape_for_reduce_ops import InsertReshapeForReduceOps from .layout_transform import LayoutTransform from .lift_constant_scalar_operands import LiftConstantScalarOperands from .recompose_pixel_unshuffle import RecomposePixelUnshuffle @@ -42,7 +47,6 @@ from .seq_mse import SeqMSE from .tag_quant_io import TagQuantIO - __all__ = [ AnnotateAdaptiveAvgPool1D, AnnotateQuantAttrs, @@ -53,14 +57,18 @@ ConvertLinearToConv2d, ConvertSquareToPow, DecomposeAny, + DecomposeBinaryAlpha, DecomposeCDist, DecomposeColIm, DecomposeEinsum, DecomposeExpM1, + DecomposeFloorDivide, + DecomposeGlu, DecomposeLinalgVectorNorm, DecomposeMinMaxDim, DecomposeRoll, DecomposeSilu, + DecomposeThreshold, DecomposeWrapWithAutocast, ExpandBroadcastTensorShape, FixedLinearKeepDim, @@ -69,6 +77,7 @@ FuseConsecutiveTranspose, I64toI32, InsertIOQDQ, + InsertReshapeForReduceOps, InsertRequantize, LayoutTransform, LiftConstantScalarOperands, diff --git a/backends/qualcomm/_passes/annotate_quant_attrs.py b/backends/qualcomm/_passes/annotate_quant_attrs.py index 610e88e6d3b..6077d51b099 100644 --- a/backends/qualcomm/_passes/annotate_quant_attrs.py +++ b/backends/qualcomm/_passes/annotate_quant_attrs.py @@ -19,6 +19,7 @@ QCOM_SCALE, QCOM_ZERO_POINT, ) +from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult from .utils import get_quant_attrs @@ -38,6 +39,9 @@ def __init__( super(AnnotateQuantAttrs, self).__init__() self.edge_program = edge_program self.skip_advanced_requant = skip_advanced_requant + self.skip_requant_allowlist = { + exir_ops.edge.aten.sigmoid.default, + } def _annotate_source_nodes( self, quant_node: torch.fx.Node, quant_attrs: Dict[str, Any] @@ -80,6 +84,10 @@ def _annotate_requant(self, n): # node1 -> q_ui8 (n) -> dq_ui8 -> 
q_int32 -> dq_int32 -> node2 -> .... # We store {node2: quant_attr in dq_int32} in node1.meta if n.target in q_ops and n.args[0].target not in dq_ops: + # for some fixed scale op, there is no need to requantize it + if n.args[0].target in self.skip_requant_allowlist: + return + dq_nodes = self._find_last_dq_nodes(n) q_attrs = get_quant_attrs(self.edge_program, n) for dq_node in dq_nodes: diff --git a/backends/qualcomm/_passes/canonicalize_conv.py b/backends/qualcomm/_passes/canonicalize_conv.py index 3804fb05da0..dc5c26c1a94 100644 --- a/backends/qualcomm/_passes/canonicalize_conv.py +++ b/backends/qualcomm/_passes/canonicalize_conv.py @@ -34,6 +34,7 @@ def __init__(self, edge_program: torch.export.ExportedProgram): self.transpose_conv_set = { torch.ops.aten.conv_transpose1d.default, torch.ops.aten.conv_transpose2d.input, + torch.ops.aten.conv_transpose3d.input, } def dilate(self, tensor, dilation): diff --git a/backends/qualcomm/_passes/decompose_any.py b/backends/qualcomm/_passes/decompose_any.py index e92bf11dd18..0cb959ff77f 100644 --- a/backends/qualcomm/_passes/decompose_any.py +++ b/backends/qualcomm/_passes/decompose_any.py @@ -8,6 +8,8 @@ from executorch.exir import to_edge from executorch.exir.pass_base import ExportPass, PassResult +from .utils import merge_decomposed_graph + class Any(torch.nn.Module): def __init__(self, dim, keepdim): @@ -49,26 +51,12 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # remap is used to map original node values to new node values, # which ensures that reference to nodes are correctly updated in the new graph remap = {"x": node.args[0]} - - for decomposed_node in decomposed_module.graph.nodes: - # no need to copy existent 'output' - if decomposed_node.op == "output": - for user in node.users.copy(): - # remap - user.replace_input_with( - node, - remap[decomposed_node.args[0][0]], - ) - # no need to copy existent placeholders - elif decomposed_node.op == "placeholder": - # replace node map from string 
to graph node - remap[decomposed_node] = remap.pop(decomposed_node.name) - else: - remap[decomposed_node] = graph.node_copy( - decomposed_node, - arg_transform=lambda x, remap=remap: remap[x], - ) - + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) graph.erase_node(node) graph.eliminate_dead_code() diff --git a/backends/qualcomm/_passes/decompose_binary_alpha.py b/backends/qualcomm/_passes/decompose_binary_alpha.py new file mode 100644 index 00000000000..df767f10ca9 --- /dev/null +++ b/backends/qualcomm/_passes/decompose_binary_alpha.py @@ -0,0 +1,61 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.pass_base import ExportPass, PassResult + +from .utils import copy_meta + +decomp_set = {torch.ops.aten.add.Tensor, torch.ops.aten.sub.Tensor} + + +class DecomposeBinaryAlpha(ExportPass): + """ + QNN does not support alpha parameter for add/sub. 
+ Decompose to mul + add / mul + sub + """ + + def __init__(self) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + if ( + node.target in decomp_set + and "alpha" in node.kwargs + and node.kwargs["alpha"] != 1 + ): + alpha = node.kwargs["alpha"] + # Remove alpha from immutable dict + node.kwargs = {k: v for k, v in node.kwargs.items() if k != "alpha"} + input2_node = node.args[1] + # If input2 is constant, we can just multiply the value for optimization + if isinstance(input2_node, (int, float)): + arg_list = list(node.args) + arg_list[1] = input2_node * alpha + node.args = tuple(arg_list) + continue + with graph.inserting_before(node): + mul_op = torch.ops.aten.mul.Scalar + mul_node = graph.create_node( + "call_function", + mul_op, + ( + input2_node, + alpha, + ), + ) + mul_node.meta = copy_meta(node.meta) + node.replace_input_with(input2_node, mul_node) + node.args = ( + node.args[0], + mul_node, + ) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/decompose_cdist.py b/backends/qualcomm/_passes/decompose_cdist.py index d18a0295ffb..a3c812bdc37 100644 --- a/backends/qualcomm/_passes/decompose_cdist.py +++ b/backends/qualcomm/_passes/decompose_cdist.py @@ -7,6 +7,8 @@ import torch from executorch.exir.pass_base import ExportPass, PassResult +from .utils import merge_decomposed_graph + class CDist(torch.nn.Module): def __init__(self): @@ -54,26 +56,12 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # remap is used to map original node values to new node values, # which ensures that reference to nodes are correctly updated in the new graph remap = {"x": node.args[0], "y": node.args[1]} - - for decomposed_node in decomposed_module.graph.nodes: - # no need to copy existent 'output' - if decomposed_node.op == "output": - for user in node.users.copy(): - # 
remap - user.replace_input_with( - node, - remap[decomposed_node.args[0][0]], - ) - # no need to copy existent placeholders - elif decomposed_node.op == "placeholder": - # replace node map from string to graph node - remap[decomposed_node] = remap.pop(decomposed_node.name) - else: - remap[decomposed_node] = graph.node_copy( - decomposed_node, - arg_transform=lambda x, remap=remap: remap[x], - ) - + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) graph.erase_node(node) graph.eliminate_dead_code() diff --git a/backends/qualcomm/_passes/decompose_einsum.py b/backends/qualcomm/_passes/decompose_einsum.py index 046c1598311..464d989333f 100644 --- a/backends/qualcomm/_passes/decompose_einsum.py +++ b/backends/qualcomm/_passes/decompose_einsum.py @@ -8,7 +8,7 @@ from executorch.exir.pass_base import ExportPass, PassResult from torch.fx.experimental.proxy_tensor import make_fx -from .utils import copy_nn_module_stack +from .utils import merge_decomposed_graph class DecomposeEinsum(ExportPass): @@ -37,30 +37,13 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: for i, arg in enumerate(node.args[1]): remap[f"arg1_{i+1}"] = arg - for decomposed_node in decomposed_module.graph.nodes: - copy_nn_module_stack(node, decomposed_node) - # This is the arg[0] equation string, which is not required anymore after decomposition - if "arg0" in decomposed_node.name: - continue - - # no need to copy existent 'output' - if decomposed_node.op == "output": - for user in node.users.copy(): - # remap - user.replace_input_with( - node, - remap[decomposed_node.args[0][0]], - ) - # no need to copy existent placeholders - elif decomposed_node.op == "placeholder": - # replace node map from string to graph node - remap[decomposed_node] = remap.pop(decomposed_node.name) - else: - remap[decomposed_node] = graph.node_copy( - decomposed_node, - arg_transform=lambda x, remap=remap: remap[x], - ) - + 
merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + predicate=lambda decomp_node: "arg0" not in decomp_node.name, + ) graph.erase_node(node) graph.eliminate_dead_code() diff --git a/backends/qualcomm/_passes/decompose_floor_divide.py b/backends/qualcomm/_passes/decompose_floor_divide.py new file mode 100644 index 00000000000..f7de074259e --- /dev/null +++ b/backends/qualcomm/_passes/decompose_floor_divide.py @@ -0,0 +1,62 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.pass_base import ExportPass, PassResult + +from .utils import merge_decomposed_graph + + +class FloorDivide(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + dtype = x.dtype + result = torch.div(x, y) + result = torch.floor(result) + return result.to(dtype) + + +class DecomposeFloorDivide(ExportPass): + """ + Decompose for math equivalent op. + Since QNN does not support floor_divide operations for int32 or int64 inputs, + it is necessary to decompose the operation into a division using floating-point precision, + followed by applying the floor function. 
+ """ + + def __init__(self) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + model = FloorDivide() + if ( + torch.ops.aten.floor_divide.default == node.target + and not torch.is_floating_point(node.meta["val"]) + ): + decomposed_module = torch.export.export( + model, + (node.args[0].meta["val"], node.args[1].meta["val"]), + strict=True, + ).module() + with graph.inserting_before(node): + # remap is used to map original node values to new node values, + # which ensures that reference to nodes are correctly updated in the new graph + remap = {"x": node.args[0], "y": node.args[1]} + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) + graph.erase_node(node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/decompose_glu.py b/backends/qualcomm/_passes/decompose_glu.py new file mode 100644 index 00000000000..de363468799 --- /dev/null +++ b/backends/qualcomm/_passes/decompose_glu.py @@ -0,0 +1,55 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.pass_base import ExportPass, PassResult + +from .utils import merge_decomposed_graph + + +# this wrapper is required for IO name mapping with decomposed graph +class Glu(torch.nn.Module): + def __init__(self, dim=-1): + super().__init__() + self.glu = torch.nn.GLU(dim=dim) + + def forward(self, x): + return self.glu(x) + + +class DecomposeGlu(ExportPass): + """ + Decompose glu for quantization annotation to work properly. 
+ """ + + def __init__(self) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + if node.target == torch.ops.aten.glu.default: + ep = torch.export.export( + Glu(dim=-1 if len(node.args) < 2 else node.args[1]), + (node.args[0].meta["val"],), + ) + decomposed_module = ep.run_decompositions().graph_module + + with graph.inserting_before(node): + # remap is used to map original node values to new node values, + # which ensures that reference to nodes are correctly updated in the new graph + remap = {"x": node.args[0]} + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) + graph.erase_node(node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/decompose_linalg_vector_norm.py b/backends/qualcomm/_passes/decompose_linalg_vector_norm.py index 993f088da12..94a5b10ba3f 100644 --- a/backends/qualcomm/_passes/decompose_linalg_vector_norm.py +++ b/backends/qualcomm/_passes/decompose_linalg_vector_norm.py @@ -8,7 +8,7 @@ from executorch.exir import to_edge from executorch.exir.pass_base import ExportPass, PassResult -from .utils import copy_nn_module_stack +from .utils import merge_decomposed_graph class LinalgVectorNorm(torch.nn.Module): @@ -62,27 +62,12 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # remap is used to map original node values to new node values, # which ensures that reference to nodes are correctly updated in the new graph remap = {"x": node.args[0]} - - for decomposed_node in decomposed_module.graph.nodes: - copy_nn_module_stack(node, decomposed_node) - # no need to copy existent 'output' - if decomposed_node.op == "output": - for user in node.users.copy(): - # remap - user.replace_input_with( - node, - remap[decomposed_node.args[0][0]], - ) - # no need to copy 
existent placeholders - elif decomposed_node.op == "placeholder": - # replace node map from string to graph node - remap[decomposed_node] = remap.pop(decomposed_node.name) - else: - remap[decomposed_node] = graph.node_copy( - decomposed_node, - arg_transform=lambda x, remap=remap: remap[x], - ) - + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) graph.erase_node(node) graph.eliminate_dead_code() diff --git a/backends/qualcomm/_passes/decompose_roll.py b/backends/qualcomm/_passes/decompose_roll.py index e13433508f5..e6f60d55464 100644 --- a/backends/qualcomm/_passes/decompose_roll.py +++ b/backends/qualcomm/_passes/decompose_roll.py @@ -7,7 +7,7 @@ from executorch.exir.pass_base import ExportPass, PassResult -from .utils import copy_nn_module_stack +from .utils import merge_decomposed_graph class SliceCopy(torch.nn.Module): @@ -65,27 +65,12 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # remap is used to map original node values to new node values, # which ensures that reference to nodes are correctly updated in the new graph remap = {"x": input_node} - - for decomposed_node in decomposed_module.graph.nodes: - copy_nn_module_stack(node, decomposed_node) - # no need to copy existent 'output' - if decomposed_node.op == "output": - for user in node.users.copy(): - # remap - user.replace_input_with( - node, - remap[decomposed_node.args[0][0]], - ) - # no need to copy existent placeholders - elif decomposed_node.op == "placeholder": - # replace node map from string to graph node - remap[decomposed_node] = remap.pop(decomposed_node.name) - else: - remap[decomposed_node] = graph.node_copy( - decomposed_node, - arg_transform=lambda x, remap=remap: remap[x], - ) - + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) graph.erase_node(node) graph.eliminate_dead_code() diff --git 
a/backends/qualcomm/_passes/decompose_threshold.py b/backends/qualcomm/_passes/decompose_threshold.py new file mode 100644 index 00000000000..0f0a1bc4ea8 --- /dev/null +++ b/backends/qualcomm/_passes/decompose_threshold.py @@ -0,0 +1,61 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch + +from executorch.exir.pass_base import ExportPass, PassResult + +from .utils import merge_decomposed_graph + + +class DecomposeModule(torch.nn.Module): + def __init__(self, threshold, value): + super().__init__() + self.threshold = threshold + self.value = value + + def forward(self, x): + return torch.where(x <= self.threshold, self.value, x) + + +class DecomposeThreshold(ExportPass): + """ + Decompose threshold to less_equal and where. + """ + + def __init__(self) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + if node.target in { + torch.ops.aten.threshold_.default, + torch.ops.aten.threshold.default, + }: + input_node = node.args[0] + threshold = node.args[1] + value = node.args[2] + + model = DecomposeModule(threshold, value) + decomposed_module = torch.export.export( + model, (input_node.meta["val"],), strict=True + ).module() + + with graph.inserting_before(node): + # remap is used to map original node values to new node values, + # which ensures that reference to nodes are correctly updated in the new graph + remap = {"x": input_node} + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) + graph.erase_node(node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/decompose_wrap_with_autocast.py 
b/backends/qualcomm/_passes/decompose_wrap_with_autocast.py index 6c073bd309c..1b60b740ed3 100644 --- a/backends/qualcomm/_passes/decompose_wrap_with_autocast.py +++ b/backends/qualcomm/_passes/decompose_wrap_with_autocast.py @@ -10,7 +10,7 @@ import torch from executorch.exir.pass_base import ExportPass, PassResult -from .utils import copy_nn_module_stack +from .utils import merge_decomposed_graph class DecomposeWrapWithAutocast(ExportPass): @@ -52,7 +52,7 @@ def _replace(self, gm: torch.fx.GraphModule) -> None: graph = gm.graph for node in graph.nodes: if isinstance(node.target, torch._higher_order_ops.wrap.WrapWithAutocast): - submod, submod_name = self._get_submod(gm, node) + submod, _ = self._get_submod(gm, node) n_args = node.args input_submod = n_args[4] decomposed_module = submod @@ -61,22 +61,13 @@ def _replace(self, gm: torch.fx.GraphModule) -> None: # which ensures that reference to nodes are correctly updated in the new graph # remap = {"expand_1": node.args[5], "to_4": node.args[6]} remap = {n_args[i].name: n_args[i] for i in range(5, len(n_args))} - - for decomposed_node in decomposed_module.graph.nodes: - copy_nn_module_stack(node, decomposed_node) - # no need to copy existent 'output' - if decomposed_node.op == "output": - self._replace_output(node, decomposed_node, remap) - # no need to copy existent placeholders - elif decomposed_node.op == "placeholder": - # replace node map from string to graph node - remap[decomposed_node] = remap.pop(decomposed_node.name) - else: - remap[decomposed_node] = graph.node_copy( - decomposed_node, - arg_transform=lambda x, remap=remap: remap[x], - ) - + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + output_processor=self._replace_output, + ) graph.erase_node(node) graph.erase_node(input_submod) diff --git a/backends/qualcomm/_passes/fixed_linear_keep_dim.py b/backends/qualcomm/_passes/fixed_linear_keep_dim.py index 
19f5c631921..04c0f92cebf 100644 --- a/backends/qualcomm/_passes/fixed_linear_keep_dim.py +++ b/backends/qualcomm/_passes/fixed_linear_keep_dim.py @@ -5,10 +5,14 @@ # LICENSE file in the root directory of this source tree. import torch +from executorch.backends.qualcomm.builders.node_visitor import dq_ops +from executorch.backends.qualcomm.utils.constants import QCOM_QUANT_ATTRS from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult from executorch.exir.passes import dead_code_elimination_pass +from .utils import copy_meta, get_quant_attrs + class FixedLinearKeepDim(ExportPass): """ @@ -18,8 +22,12 @@ class FixedLinearKeepDim(ExportPass): view_copy = exir_ops.edge.aten.view_copy.default linear = exir_ops.edge.aten.linear.default - def __init__(self): + def __init__( + self, + edge_program: torch.export.ExportedProgram, + ): super(FixedLinearKeepDim, self).__init__() + self.edge_program = edge_program def _fixed_keep_dim(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: @@ -46,9 +54,15 @@ def _fixed_keep_dim(self, graph_module: torch.fx.GraphModule): ) # meta needs to be copied elementwisely for fake-tensor # to be updated correctly and not affect meta of input_node - for k, v in input_node.meta.items(): - squeeze_node.meta[k] = v + squeeze_node.meta = copy_meta(input_node.meta) squeeze_node.meta["val"] = input_tensor.reshape(squeeze_dim) + # if input_node is dequantize, we need to fetch encodings manually + # TODO: remove this when constant fold mechanism is introduced + if input_node.target in dq_ops: + squeeze_node.meta[QCOM_QUANT_ATTRS] = get_quant_attrs( + self.edge_program, input_node + ) + for user in input_users: if user == linear_node: user.replace_input_with(input_node, squeeze_node) @@ -66,8 +80,7 @@ def _fixed_keep_dim(self, graph_module: torch.fx.GraphModule): ) # meta needs to be copied elementwisely for fake-tensor # to be updated correctly and not affect 
meta of unsqueeze_node - for k, v in linear_node.meta.items(): - unsqueeze_node.meta[k] = v + unsqueeze_node.meta = copy_meta(linear_node.meta) # update linear node's shape linear_node.meta["val"] = linear_output.reshape( (squeeze_node.meta["val"].shape[0], linear_output.shape[-1]) diff --git a/backends/qualcomm/_passes/insert_reshape_for_reduce_ops.py b/backends/qualcomm/_passes/insert_reshape_for_reduce_ops.py new file mode 100644 index 00000000000..52f9546c28e --- /dev/null +++ b/backends/qualcomm/_passes/insert_reshape_for_reduce_ops.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.pass_base import ExportPass, PassResult +from executorch.exir.passes import dead_code_elimination_pass + + +class InsertReshapeForReduceOps(ExportPass): + """ + Rewrite `aten.argmax.default` with `dim=None` into + a reshape-to-1D followed by argmax(dim=0). + + PyTorch semantics: + torch.argmax(x, dim=None) -> flatten(x) then argmax along axis=0 + + QNN requires an explicit axis, so we insert the reshape. 
+ """ + + def __init__(self): + super().__init__() + self.op_map = {torch.ops.aten.argmax.default, torch.ops.aten.argmin.default} + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + modified = False + + for n in graph.nodes: + if n.target in self.op_map: + dim_arg = None if len(n.args) == 1 else n.args[1] + + if dim_arg is None: + inp = n.args[0] + + # Insert reshape before argmax + with graph.inserting_before(n): + reshape_node = graph.create_node( + "call_function", + torch.ops.aten.reshape.default, + (inp, [-1]), + {}, + ) + reshape_node.meta = dict(inp.meta) + if "val" in inp.meta: + reshape_node.meta["val"] = inp.meta["val"].reshape(-1) + + # Rewrite argmax: take reshape_node as input, set dim=0 + n.args = (reshape_node, 0, *n.args[2:]) + + modified = True + + if modified: + graph_module.recompile() + dead_code_elimination_pass(graph_module) + + return PassResult(graph_module, modified) diff --git a/backends/qualcomm/_passes/lift_constant_scalar_operands.py b/backends/qualcomm/_passes/lift_constant_scalar_operands.py index f5c5915cab2..52bdf7fa090 100644 --- a/backends/qualcomm/_passes/lift_constant_scalar_operands.py +++ b/backends/qualcomm/_passes/lift_constant_scalar_operands.py @@ -51,6 +51,7 @@ class TensorOpInfo: # The scalar number arg[1] is missing when using default. 
Result in a corner case to deal aten.leaky_relu.default: TensorOpInfo(aten.prelu.default, True, False), aten.leaky_relu_.default: TensorOpInfo(aten.prelu.default, True, False), + aten.where.ScalarSelf: TensorOpInfo(aten.where.self, False, True), aten.where.ScalarOther: TensorOpInfo(aten.where.self, False, True), aten.where.Scalar: TensorOpInfo(aten.where.self, False, True), aten.masked_fill.Scalar: TensorOpInfo(aten.masked_fill.Tensor, False, False), diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py index ffb9f3221df..360581a2929 100644 --- a/backends/qualcomm/_passes/qnn_pass_manager.py +++ b/backends/qualcomm/_passes/qnn_pass_manager.py @@ -18,14 +18,18 @@ ConvertLinearToConv2d, ConvertSquareToPow, DecomposeAny, + DecomposeBinaryAlpha, DecomposeCDist, DecomposeColIm, DecomposeEinsum, DecomposeExpM1, + DecomposeFloorDivide, + DecomposeGlu, DecomposeLinalgVectorNorm, DecomposeMinMaxDim, DecomposeRoll, DecomposeSilu, + DecomposeThreshold, DecomposeWrapWithAutocast, ExpandBroadcastTensorShape, FixedLinearKeepDim, @@ -35,6 +39,7 @@ I64toI32, InsertIOQDQ, InsertRequantize, + InsertReshapeForReduceOps, LayoutTransform, LiftConstantScalarOperands, RecomposePixelUnshuffle, @@ -193,26 +198,37 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule): self.add_pass(RecomposePixelUnshuffle(quantization_capture=True)) self.add_pass(RecomposeRmsNorm(quantization_capture=True)) self.add_pass(ReplaceArangeArgs()) + self.add_pass(DecomposeBinaryAlpha()) self.add_pass(DecomposeCDist()) self.add_pass(DecomposeScaledDotProductAttention()) self.add_pass(DecomposeRoll()) self.add_pass(DecomposeSilu()) + self.add_pass(DecomposeThreshold()) self.add_pass(DecomposeWrapWithAutocast()) self.add_pass(DecomposeEinsum()) self.add_pass(DecomposeExpM1()) + self.add_pass(DecomposeGlu()) self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True)) self.add_pass(ReplaceInfValues()) 
self.add_pass(LiftConstantScalarOperands()) + self.add_pass(InsertReshapeForReduceOps()) return self._transform(graph_module) def transform_for_export_pipeline( self, exported_program: ExportedProgram, convert_linear_to_conv2d: bool = False ): + self.add_pass(DecomposeBinaryAlpha()) self.add_pass(DecomposeCDist()) self.add_pass(DecomposeScaledDotProductAttention()) self.add_pass(DecomposeRoll()) + self.add_pass(DecomposeThreshold()) self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True)) self.add_pass(DecomposeExpM1()) + # DecomposeFloorDivide does not apply to the annotation pipeline, + # since the CPU QDQ model would reduce accuracy. + # We keep div and floor operations in floating-point to maintain precision. + # This pass is needed before to_edge pipeline to avoid mixed type for div operator with RemoveMixedTypeOperators pass. + self.add_pass(DecomposeFloorDivide()) self.add_pass(DecomposeWrapWithAutocast()) # this pass will rewrite state_dict, it needs to be accomplished before # to_edge_transform_and_lower @@ -221,6 +237,7 @@ def transform_for_export_pipeline( self.add_pass(ConvertLinearToConv2d(exported_program)) self.add_pass(ConvertSquareToPow()) self.add_pass(LiftConstantScalarOperands()) + self.add_pass(InsertReshapeForReduceOps()) self._transform(exported_program.graph_module) ep = lift_constant_tensor_pass(exported_program) return ep diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py index 6d908707892..eebfa4d9eb4 100755 --- a/backends/qualcomm/_passes/utils.py +++ b/backends/qualcomm/_passes/utils.py @@ -117,6 +117,45 @@ def copy_nn_module_stack(src, target): target.meta["nn_module_stack"] = value +def merge_decomposed_graph( + remap: Dict[str, torch.fx.Node], + target_node: torch.fx.Node, + target_graph: torch.fx.Graph, + decomposed_graph_module: torch.fx.GraphModule, + predicate: Callable[[torch.fx.Node], bool] = None, + # target_node, decomposed_output_node, remap + output_processor: Callable[ +
[torch.fx.Node, torch.fx.Node, Dict[str, torch.fx.Node]], None + ] = None, +) -> None: + def default_output_process(node): + for user in node.users.copy(): + # remap + user.replace_input_with( + node, + remap[decomposed_node.args[0][0]], + ) + + for decomposed_node in decomposed_graph_module.graph.nodes: + copy_nn_module_stack(target_node, decomposed_node) + if predicate is None or predicate(decomposed_node): + # no need to copy existent 'output' + if decomposed_node.op == "output": + if output_processor is None: + default_output_process(target_node) + else: + output_processor(target_node, decomposed_node, remap) + # no need to copy existent placeholders + elif decomposed_node.op == "placeholder": + # replace node map from string to graph node + remap[decomposed_node] = remap.pop(decomposed_node.name) + else: + remap[decomposed_node] = target_graph.node_copy( + decomposed_node, + arg_transform=lambda x, remap=remap: remap[x], + ) + + def is_float_tensor(node: torch.fx.Node) -> bool: if "val" not in node.meta or not isinstance(node.meta["val"], FakeTensor): return False diff --git a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md index 6ba4eafb01f..61ae1061214 100644 --- a/backends/qualcomm/builders/README.md +++ b/backends/qualcomm/builders/README.md @@ -365,7 +365,7 @@ Please help update following table if you are contributing new operators: + 🚫 = Deprecated, supported with other QNN Ops -| Operators | HTP - 90/116 Enabled | +| Operators | HTP - 92/116 Enabled | |-----------|---------| | Argmax | ✓ | | Argmin | ✓ | @@ -375,7 +375,7 @@ Please help update following table if you are contributing new operators: | ChannelShuffle | ✗ | | Concat | ✓ | | Conv2d | ✓ | -| Conv3d | ✗ | +| Conv3d | ✓ | | Convert | ✓ | | CreateSparse | ✗ | | CumulativeSum | ✓ | @@ -481,7 +481,7 @@ Please help update following table if you are contributing new operators: | TopK | ✓ | | TransPose | ✓ | | TransPoseConv2d | ✓ | -| TransPoseConv3d | ✗ | +| 
TransPoseConv3d | ✓ | | Unpack | ✓ | ## Issues diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index 9800fb7bdab..3fa8ae067fa 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -24,7 +24,7 @@ op_cat, op_ceil, op_clamp, - op_conv2d, + op_conv, op_copy, op_cos, op_cum_sum, @@ -129,7 +129,7 @@ op_cat, op_ceil, op_clamp, - op_conv2d, + op_conv, op_copy, op_cos, op_cum_sum, diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index bc2b62c8c0b..8cbf3a50e22 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -176,7 +176,7 @@ def make_qnn_per_block_config(self, node: torch.fx.Node, quant_attrs: Dict): user_0 = self.get_first_user(node) if "convolution" in user_0.target.__name__: # OIHW (pytorch) -> HWIO (QNN) - quant_config[QCOM_AXIS] = 3 + quant_config[QCOM_AXIS] = node.meta["val"].dim() - 1 quant_config[QCOM_AXIS_ORDER] = (2, 3, 1, 0) elif "linear" in user_0.target.__name__: # OI (pytorch) -> OI (QNN) @@ -218,7 +218,7 @@ def make_qnn_per_channel_config(self, node: torch.fx.Node, quant_attrs: Dict): user_0 = self.get_first_user(node) # Memory layout of QNN conv weight always ends in Output. 
Like conv2d is HWIO if "convolution" in user_0.target.__name__: - quant_config[QCOM_AXIS] = 3 + quant_config[QCOM_AXIS] = node.meta["val"].dim() - 1 else: quant_config[QCOM_AXIS] = quant_attrs[QCOM_AXIS] diff --git a/backends/qualcomm/builders/op_cat.py b/backends/qualcomm/builders/op_cat.py index 9f6eb6676cf..644b087ab9c 100644 --- a/backends/qualcomm/builders/op_cat.py +++ b/backends/qualcomm/builders/op_cat.py @@ -29,14 +29,15 @@ def define_node( node: torch.fx.Node, nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], ) -> PyQnnWrapper.PyQnnOpWrapper: - list_of_tensors = cast(List[torch.fx.Node], node.args[0]) - list_of_tensor_wrappers = [] + input_nodes = cast(List[torch.fx.Node], node.args[0]) + input_tensor_wrappers = [] - for tensor_input in list_of_tensors: - input_tensor = self.get_tensor(self.get_node(tensor_input), node) - list_of_tensor_wrappers.append( + for input_node in input_nodes: + source_input_node = self.get_node(input_node) + input_tensor = self.get_tensor(source_input_node, node) + input_tensor_wrappers.append( self.define_tensor( - tensor_input, + source_input_node, node, input_tensor, PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, @@ -44,7 +45,7 @@ def define_node( ) ) - if len(list_of_tensors) != len(list_of_tensor_wrappers): + if len(input_nodes) != len(input_tensor_wrappers): warnings.warn( "[QNN Delegate Op Builder]: The number or input tensors is not equal to the number of input tensor wrappers.", stacklevel=1, @@ -76,7 +77,7 @@ def define_node( QNN_OP_PACKAGE_NAME_QTI_AISW, OpConcat.op_name, ) - concat_op.AddInputTensors(list_of_tensor_wrappers) + concat_op.AddInputTensors(input_tensor_wrappers) concat_op.AddOutputTensors([output_tensor_wrapper]) concat_op.AddScalarParam( diff --git a/backends/qualcomm/builders/op_conv2d.py b/backends/qualcomm/builders/op_conv.py similarity index 82% rename from backends/qualcomm/builders/op_conv2d.py rename to backends/qualcomm/builders/op_conv.py index 1cfc1e45c9b..2bc0b41524d 
100644 --- a/backends/qualcomm/builders/op_conv2d.py +++ b/backends/qualcomm/builders/op_conv.py @@ -7,7 +7,6 @@ from typing import cast, Dict, List import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper - import numpy as np import torch from executorch.backends.qualcomm.utils.constants import QCOM_DATA @@ -16,8 +15,10 @@ from .node_visitor_manager import register_node_visitor from .qnn_constants import ( OpConv2d, + OpConv3d, OpDepthWiseConv2d, OpTransposeConv2d, + OpTransposeConv3d, QNN_OP_PACKAGE_NAME_QTI_AISW, ) from .utils import get_parameter @@ -66,7 +67,7 @@ def _add_conv_op_parameter( len(padding_shape), padding_shape, np.array( - [[padding[0], padding[0]], [padding[1], padding[1]]], + padding, dtype=np.uint32, ), True, @@ -108,8 +109,14 @@ def define_node( input_node = self.get_node(node.args[0]) input_tensor = self.get_tensor(input_node, node) assert ( - input_tensor.dim() == 4 + input_tensor.dim() != 3 ), "All Conv1D should be converted to Conv2D in CanonicalizeConv," + assert input_tensor.dim() in { + 4, + 5, + }, "Only Conv2d and Conv3d is supported in conv builder," + + is_conv2d = input_tensor.dim() == 4 input_tensor_wrapper = self.define_tensor( input_node, node, @@ -120,9 +127,15 @@ def define_node( filter_node = self.get_node(node.args[1]) filter_tensor = get_parameter(filter_node, self.edge_program) - # weight of pytorch OIHW(conv2d) | IOHW(conv_transpose2d), yet QNN is HWIO + # weight of pytorch OIHW(conv2d) / OIDHW(conv3d) or IOHW(conv_transpose2d) / IODHW(conv_transpose3d), + # yet QNN is HWIO or DHWIO is_transpose_conv = cast(bool, node.args[6]) - filter_axis_order = (2, 3, 0, 1) if is_transpose_conv else (2, 3, 1, 0) + if is_conv2d: + filter_axis_order = (2, 3, 0, 1) if is_transpose_conv else (2, 3, 1, 0) + else: + filter_axis_order = ( + (2, 3, 4, 0, 1) if is_transpose_conv else (2, 3, 4, 1, 0) + ) filter_tensor = filter_tensor.permute(dims=filter_axis_order).contiguous() filter_tensor_wrapper = self.define_tensor( 
filter_node, @@ -132,7 +145,6 @@ def define_node( nodes_to_wrappers, ) conv_input_tensors = [input_tensor_wrapper, filter_tensor_wrapper] - if node.args[2] is not None: bias_node = self.get_node(node.args[2]) bias_tensor = get_parameter(bias_node, self.edge_program) @@ -159,11 +171,10 @@ def define_node( padding = cast(List[int], node.args[4]) dilation = cast(List[int], node.args[5]) output_padding = cast(List[int], node.args[7]) - groups = cast(int, node.args[8]) - # Qnn filter tensor is (H, W, Cin, Cout) - group_input_channels = filter_tensor.shape[2] - group_output_channels = int(filter_tensor.shape[3] / groups) + # Qnn filter tensor is (H, W, Cin, Cout) or (D, H, W, Cin, Cout) + group_input_channels = filter_tensor.shape[-2] + group_output_channels = int(filter_tensor.shape[-1] / groups) # 1) groups = input_channels (i.e. group_input_channels = 1) # 2) output_channels is a positive integer multiple of input channels # TODO: Currently, negative results will be zero with Depthwise conv2d when input_channel == groups == 1 @@ -175,18 +186,23 @@ def define_node( ) if len(padding) == 1: padding = padding + padding + padding = [[x, x] for x in padding] stride_shape = [len(stride)] - padding_shape = [2, 2] + padding_shape = [len(padding), len(padding[0])] dilation_shape = [len(dilation)] output_padding_shape = [len(output_padding)] - if is_depthwise_conv: + if is_transpose_conv: + assert all( + val == 1 for val in dilation + ), "CanonicalizeConv pass should perform dilate for transpose_conv." 
+ op_class = OpTransposeConv2d if is_conv2d else OpTransposeConv3d + elif is_depthwise_conv: + assert is_conv2d, "DepthWise only supports Conv2d" op_class = OpDepthWiseConv2d - elif is_transpose_conv: - op_class = OpTransposeConv2d else: - op_class = OpConv2d + op_class = OpConv2d if is_conv2d else OpConv3d conv_op = PyQnnWrapper.PyQnnOpWrapper( node.name, diff --git a/backends/qualcomm/builders/op_index_put.py b/backends/qualcomm/builders/op_index_put.py index c3c42ed483a..23481894f0d 100644 --- a/backends/qualcomm/builders/op_index_put.py +++ b/backends/qualcomm/builders/op_index_put.py @@ -1,14 +1,19 @@ import warnings +from collections import OrderedDict from typing import Dict import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import numpy as np import torch -from executorch.backends.qualcomm.utils.constants import QCOM_DATA, QCOM_QUANT_ATTRS +from executorch.backends.qualcomm.utils.constants import ( + QCOM_DATA, + QCOM_DTYPE, + QCOM_QUANT_ATTRS, +) from executorch.exir.dialects._ops import ops as exir_ops -from .node_visitor import NodeVisitor, QNN_TENSOR_TYPE_MAP +from .node_visitor import NodeVisitor, QNN_QUANT_TYPE_MAP, QNN_TENSOR_TYPE_MAP from .node_visitor_manager import register_node_visitor from .qnn_constants import ( OpConcat, @@ -26,7 +31,7 @@ class IndexPutVisitor(NodeVisitor): def __init__(self, *args) -> None: super().__init__(*args) - def define_node( + def define_node( # noqa: C901 self, node: torch.fx.Node, nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], @@ -37,6 +42,7 @@ def define_node( if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): quant_attrs = quant_attrs.copy() input_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + input_tensor = self.get_tensor(input_node, node) input_tensor_wrapper = self.define_tensor( input_node, @@ -46,52 +52,110 @@ def define_node( nodes_to_wrappers, ) - indicies_node = node.args[1] - index_node_dim = None - index_nodes = [] - index_tensors = [] + indices_nodes = ( + 
node.args[1] if isinstance(node.args[1], list) else [node.args[1]] + ) target_index = [] + all_range_index = OrderedDict() + index_dtype = [ + node.meta["val"].dtype for node in indices_nodes if node is not None + ][0] + + # preprocess: + # - broadcast dimension for multiple specified index + # - broadcast specified index if dimensions are not matched + max_indices_in_specified_index = 0 + for index, idx_node in enumerate(indices_nodes): + if isinstance(idx_node, torch.fx.Node): + last_specified_index_node = index + if max_indices_in_specified_index < idx_node.meta["val"].nelement(): + max_indices_in_specified_index = idx_node.meta["val"].nelement() # If there is None in a list, it means all range at that dimension - # E.g., indicies_node: [None, None, aten__to_copy_default_1] - if isinstance(indicies_node, list): - for index, idx_node in enumerate(indicies_node): - # First, collect the indice_node and index of None to construct the shape of index node - # E.g., shape of input: [1, 1024, 12, 64] - # For "None" axis (assume indicies_node: [None, None, aten__to_copy_default_1]), - # target_index: [1, 1024, x], x is the shape of index_tensor, index_node_dim: 2 - if isinstance(idx_node, torch.fx.Node): - index_nodes.append(idx_node) - index_tensors.append(self.get_tensor(idx_node, idx_node)) - target_index.extend(index_tensors[-1].size()) - index_node_dim = index - elif idx_node is None and index_node_dim is None: - # E.g., indicies_node: [None, aten__to_copy_default_1, None] - # Don't need to consider "None" after index_node. 
- target_index.append(input_tensor.size(index)) - else: - warnings.warn( - f"[QNN Delegate Op Builder]: Get the index {idx_node} that is neither a node nor None", - stacklevel=1, + for index, idx_node in enumerate(indices_nodes): + # First, collect the index_node and index of None to construct the shape of index node + # E.g., shape of input: [1, 1024, 12, 64] + # For "None" axis (assume indices_node: [None, None, aten__to_copy_default_1]), + # target_index: [1, 1024, x], x is the shape of index_tensor, index_node_dim: 2 + if isinstance(idx_node, torch.fx.Node): + # e.g. for case [index_node_0, None, index_node_1], nodes will have the same number of indices + target_index.append( + self.get_tensor(idx_node, idx_node).nelement() + if last_specified_index_node == index + else 1 + ) + elif idx_node is None: + # E.g., indices_node: [None, None, aten__to_copy_default_1] + all_range_index[index] = torch.arange( + input_tensor.size(index), dtype=index_dtype + ) + target_index.append(input_tensor.size(index)) + else: + warnings.warn( + f"[QNN Delegate Op Builder]: Get the index {idx_node} that is neither a node nor None", + stacklevel=1, + ) + return + + # preprocess all range indices if any + if None in indices_nodes: + all_range_tensor = torch.cartesian_prod(*all_range_index.values()) + # repeat all_range_tensor interleavely for future concatenation + # e.g. 
input_node = [5, 4, 3, 2], indices = [index_0_node, None, index_2_node] + # index_0.shape == index_2.shape == 2 (will guarantee this condition) + # where user specified (3, 4) for index_0, (0, 1) for index_2 + # --- + # we should have all_range_tensor: [0, 1, 2, 3] + # repeat interleavely with 2 to match future tiled index_0_node & index_2_node + # we'll have 1(index_0 -> same as index_2)*4(index_1)*2(index_2) indices in total: + # | index_0_node | None | index_2_node | + # | 3 | 0 | 0 | + # | 4 | 0 | 1 | + # | 3 | 1 | 0 | + # | 4 | 1 | 1 | + # | 3 | 2 | 0 | + # | 4 | 2 | 1 | + # | 3 | 3 | 0 | + # | 4 | 3 | 1 | + all_range_tensor_aug = all_range_tensor.repeat_interleave( + max_indices_in_specified_index, dim=0 + ) + for index in all_range_index.keys(): + # Repeat index for "None" axis in indices_nodes + range_index_node = torch.fx.Node( + node.graph, + node.name + f"_all_range_index_{index}", + "call_function", + exir_ops.edge.aten.tensor.default, + (), # args + {}, # kwargs + ) + range_indices = ( + ( + all_range_tensor_aug[:, index] + if all_range_tensor_aug.dim() > 1 + else + # if there is only one None + all_range_tensor_aug ) - return - # Assume that there is only one node in list - assert len(index_nodes) == 1, "Not support multiple indices tensor" - indice_node = index_nodes[0] - indice_tensor = index_tensors[0] - indices_tensor_wrapper = self.define_tensor( - indice_node, - node, - indice_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, - nodes_to_wrappers, - ) + .reshape(-1, 1) + .contiguous() + ) + target_index_tensor_wrapper = self.define_tensor( + range_index_node, + node, + range_indices, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + ) + # store it for future concatenation + all_range_index[index] = (range_indices, target_index_tensor_wrapper) # Need to reconstruct the index tensor. # E.g., based on ScatterND Op Def in QNN Docs. 
# Torch: # Given that # shape of input: [1, 12, 1024, 64] - # indicies_node: [None, None, aten__to_copy_default_1] + # indices_node: [None, None, aten__to_copy_default_1] # shape of aten__to_copy_default_1: [1] # QNN: # Index tensor: @@ -104,113 +168,135 @@ def define_node( # update_indices = indices.shape[:-1] # for idx in np.ndindex(update_indices): # output[indices[idx]] = updates[idx] + specified_index = OrderedDict() + for i, indices_node in enumerate(indices_nodes): + if indices_node is None: + continue - # Append one dimension to specify x-tuple - index_shape = target_index + [1] - # Reshape the index_node for tile op - reshape_shape = [ - shape if id == index_node_dim else 1 for id, shape in enumerate(index_shape) - ] - reshape_output_tensor = indice_tensor.reshape(reshape_shape) - reshape_output_tensor_wrapper = self.define_custom_tensor_wrapper( - node_name=node.name + "_reshape", - tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, - dtype=QNN_TENSOR_TYPE_MAP[reshape_output_tensor.dtype], - quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED, - quant_configs={}, - dims=reshape_output_tensor.size(), - tensor=reshape_output_tensor, - is_fake_tensor=True, - nodes_to_wrappers=nodes_to_wrappers, - ) - reshape_op = PyQnnWrapper.PyQnnOpWrapper( - node.name, - QNN_OP_PACKAGE_NAME_QTI_AISW, - OpReshape.op_name, - ) - reshape_op.AddInputTensors([indices_tensor_wrapper]) - reshape_op.AddOutputTensors([reshape_output_tensor_wrapper]) - op_wrapper_list.append(reshape_op) - index_put_index_input_tensor_wrapper = reshape_output_tensor_wrapper + indices_tensor = self.get_tensor(indices_node, indices_node) + indices_tensor_wrapper = self.define_tensor( + indices_node, + node, + indices_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + if indices_tensor.nelement() < max_indices_in_specified_index: + # broadcast the specified index + indices_tensor = 
indices_tensor.repeat(max_indices_in_specified_index) + indices_multiples = [max_indices_in_specified_index] + indices_multiples_shape = [len(indices_multiples)] + indices_tile_tensor_wrapper = self.define_custom_tensor_wrapper( + node_name=node.name + f"_indices_tile_{i}", + tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + dtype=QNN_TENSOR_TYPE_MAP[indices_tensor.dtype], + quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED, + quant_configs={}, + dims=indices_tensor.size(), + tensor=indices_tensor, + is_fake_tensor=True, + nodes_to_wrappers=nodes_to_wrappers, + ) + tile_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpTile.op_name, + ) + tile_op.AddInputTensors([indices_tensor_wrapper]) + tile_op.AddOutputTensors([indices_tile_tensor_wrapper]) + tile_op.AddTensorParam( + OpTile.param_multiples, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(indices_multiples_shape), + indices_multiples_shape, + np.array(indices_multiples, dtype=np.uint32), + True, + ) + op_wrapper_list.append(tile_op) + indices_tensor_wrapper = indices_tile_tensor_wrapper - # Tile the index_node and concat the target index - if None in indicies_node: - tile_output_tensor = reshape_output_tensor.expand(index_shape) - # Tile the index_node to align with the shape of target_index - # Only need to tile the dim of None axis - # E.g., indicies_node: [None, None, aten__to_copy_default_1] - # Should tile the first two dimension. 
- multiples = [ - shape if id != index_node_dim else 1 - for id, shape in enumerate(index_shape) - ] - multiples_shape = [len(index_shape)] - tile_output_tensor_wrapper = self.define_custom_tensor_wrapper( - node_name=node.name + "_tile", + # Append one dimension to specify x-tuple + # Reshape the index_node for tile op + reshape_shape = list(indices_tensor.shape) + [1] + reshape_output_tensor = indices_tensor.reshape(reshape_shape) + reshape_output_tensor_wrapper = self.define_custom_tensor_wrapper( + node_name=node.name + f"_reshape_{i}", tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, - dtype=QNN_TENSOR_TYPE_MAP[tile_output_tensor.dtype], + dtype=QNN_TENSOR_TYPE_MAP[reshape_output_tensor.dtype], quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED, quant_configs={}, - dims=tile_output_tensor.size(), - tensor=tile_output_tensor, + dims=reshape_output_tensor.size(), + tensor=reshape_output_tensor, is_fake_tensor=True, nodes_to_wrappers=nodes_to_wrappers, ) - tile_op = PyQnnWrapper.PyQnnOpWrapper( + reshape_op = PyQnnWrapper.PyQnnOpWrapper( node.name, QNN_OP_PACKAGE_NAME_QTI_AISW, - OpTile.op_name, + OpReshape.op_name, ) - tile_op.AddInputTensors([reshape_output_tensor_wrapper]) - tile_op.AddOutputTensors([tile_output_tensor_wrapper]) - tile_op.AddTensorParam( - OpTile.param_multiples, - PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, - len(multiples_shape), - multiples_shape, - np.array(multiples, dtype=np.uint32), - True, - ) - op_wrapper_list.append(tile_op) + reshape_op.AddInputTensors([indices_tensor_wrapper]) + reshape_op.AddOutputTensors([reshape_output_tensor_wrapper]) + op_wrapper_list.append(reshape_op) + index_tensor_wrapper = reshape_output_tensor_wrapper + index_tensor = reshape_output_tensor - # Repeat index for "None" axis in indicies_node - ranges = [ - torch.arange(dim, dtype=indice_tensor.dtype) - for dim in target_index[:-1] - ] - target_index_shape = target_index + [len(ranges)] - 
target_index_tensor = torch.cartesian_prod(*ranges) - reshape_target_index_shape = [ - shape if id != index_node_dim else 1 - for id, shape in enumerate(target_index_shape) - ] - target_index_tensor = target_index_tensor.reshape( - reshape_target_index_shape - ) - target_index_tensor = target_index_tensor.expand( - target_index_shape - ).contiguous() - target_index_node = torch.fx.Node( - node.graph, - node.name + "_target_index", - "call_function", - exir_ops.edge.aten.tensor.default, - (), # args - {}, # kwargs - ) - target_index_tensor_wrapper = self.define_tensor( - target_index_node, - node, - target_index_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, - nodes_to_wrappers, - ) + # Tile the index_node and concat the target index + if None in indices_nodes: + tile_output_tensor = reshape_output_tensor.repeat( + all_range_tensor.size(0), 1 + ) + # Tile the index_node to align with the shape of target_index + # Only need to tile the dim of None axis + # E.g., indices_node: [None, None, aten__to_copy_default_1] + # Should tile the number of indices combination of first two dimension + # times number of indices specified by aten__to_copy_default_1 + multiples = [all_range_tensor.size(0), 1] + multiples_shape = [len(multiples)] + tile_output_tensor_wrapper = self.define_custom_tensor_wrapper( + node_name=node.name + f"_tile_{i}", + tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + dtype=QNN_TENSOR_TYPE_MAP[tile_output_tensor.dtype], + quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED, + quant_configs={}, + dims=tile_output_tensor.size(), + tensor=tile_output_tensor, + is_fake_tensor=True, + nodes_to_wrappers=nodes_to_wrappers, + ) + tile_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpTile.op_name, + ) + tile_op.AddInputTensors([reshape_output_tensor_wrapper]) + tile_op.AddOutputTensors([tile_output_tensor_wrapper]) + tile_op.AddTensorParam( + 
OpTile.param_multiples, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(multiples_shape), + multiples_shape, + np.array(multiples, dtype=np.uint32), + True, + ) + op_wrapper_list.append(tile_op) + index_tensor_wrapper = tile_output_tensor_wrapper + index_tensor = tile_output_tensor - # Concat target_index and tile output to reconstruct index_node - # Cannot use QNN Pack (stack) since QNN Pack is not support int32 dtype - concat_output_tensor = torch.concat( - (target_index_tensor, tile_output_tensor), dim=-1 + specified_index[i] = (index_tensor, index_tensor_wrapper) + + # Concat target_index and tile output to reconstruct index_node + # Cannot use QNN Pack (stack) since QNN Pack is not support int32 dtype + index_tensors, index_tensor_wrappers = [], [] + for i, arg in enumerate(indices_nodes): + tensor, tensor_wrapper = ( + all_range_index[i] if arg is None else specified_index[i] ) + index_tensors.append(tensor) + index_tensor_wrappers.append(tensor_wrapper) + + if len(index_tensor_wrappers) > 1: + concat_output_tensor = torch.concat(index_tensors, dim=-1) concat_output_tensor_wrapper = self.define_custom_tensor_wrapper( node_name=node.name + "_concat", tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, @@ -227,9 +313,7 @@ def define_node( QNN_OP_PACKAGE_NAME_QTI_AISW, OpConcat.op_name, ) - concat_op.AddInputTensors( - [target_index_tensor_wrapper, tile_output_tensor_wrapper] - ) + concat_op.AddInputTensors(index_tensor_wrappers) concat_op.AddOutputTensors([concat_output_tensor_wrapper]) concat_op.AddScalarParam( OpConcat.param_axis, @@ -237,7 +321,6 @@ def define_node( {QCOM_DATA: np.uint32(concat_output_tensor.dim() - 1)}, ) op_wrapper_list.append(concat_op) - index_put_index_input_tensor_wrapper = concat_output_tensor_wrapper value_node = self.get_node(node.args[2]) value_tensor = self.get_tensor(value_node, node) @@ -248,6 +331,94 @@ def define_node( PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, nodes_to_wrappers, ) + # handle 
broadcast scenario + # e.g. input_tensor: (1, 12, 1024, 64), value_tensor: (1, 64) + # => value_reshape_tensor: (1, 1, 1, 64) + new_value_shape = ( + *([1] * (input_tensor.dim() - value_tensor.dim())), + *value_tensor.shape, + ) + # reshape the value_node for tile op + value_quant_encoding, value_quant_configs = self.get_quant_encoding_conf( + value_node, node + ) + value_dtype = ( + QNN_TENSOR_TYPE_MAP[value_tensor.dtype] + if value_quant_encoding + == PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED + else QNN_QUANT_TYPE_MAP[ + ( + torch.uint16 + if value_quant_configs[QCOM_DTYPE] == torch.int32 + else value_quant_configs[QCOM_DTYPE] + ) + ] + ) + value_reshape_tensor = value_tensor.reshape(new_value_shape) + value_reshape_tensor_wrapper = self.define_custom_tensor_wrapper( + node_name=node.name + "_value_reshape", + tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + dtype=value_dtype, + quant_encoding=value_quant_encoding, + quant_configs=value_quant_configs, + dims=value_reshape_tensor.size(), + tensor=value_reshape_tensor, + is_fake_tensor=True, + nodes_to_wrappers=nodes_to_wrappers, + ) + value_reshape_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpReshape.op_name, + ) + value_reshape_op.AddInputTensors([value_tensor_wrapper]) + value_reshape_op.AddOutputTensors([value_reshape_tensor_wrapper]) + op_wrapper_list.append(value_reshape_op) + + # e.g. 
input_tensor: (1, 12, 1024, 64), index_tensor: (None, None, 2), value_tensor: (1, 64) + # => multiples: [1, 12, 2, 1] + value_multiples = [] + for i in range(input_tensor.dim() - 1, -1, -1): + if i in specified_index: + # all user specified index node wil have the same dimension + multiplier = ( + indices_nodes[i].meta["val"].nelement() // new_value_shape[i] + if i == last_specified_index_node + else 1 + ) + else: + multiplier = input_tensor.shape[i] // new_value_shape[i] + value_multiples.insert(0, multiplier) + + value_tile_tensor = value_reshape_tensor.repeat(value_multiples) + value_multiples_shape = [len(value_multiples)] + value_tile_tensor_wrapper = self.define_custom_tensor_wrapper( + node_name=node.name + "_value_tile", + tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + dtype=value_dtype, + quant_encoding=value_quant_encoding, + quant_configs=value_quant_configs, + dims=value_tile_tensor.size(), + tensor=value_tile_tensor, + is_fake_tensor=True, + nodes_to_wrappers=nodes_to_wrappers, + ) + value_tile_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpTile.op_name, + ) + value_tile_op.AddInputTensors([value_reshape_tensor_wrapper]) + value_tile_op.AddOutputTensors([value_tile_tensor_wrapper]) + value_tile_op.AddTensorParam( + OpTile.param_multiples, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(value_multiples_shape), + value_multiples_shape, + np.array(value_multiples, dtype=np.uint32), + True, + ) + op_wrapper_list.append(value_tile_op) output_tensor = self.get_tensor(node, node) output_tensor_wrapper = self.define_tensor( @@ -263,11 +434,46 @@ def define_node( QNN_OP_PACKAGE_NAME_QTI_AISW, OpScatterNd.op_name, ) + # accumulation + if len(node.args) > 3 and node.args[3]: + index_put_op.AddScalarParam( + OpScatterNd.param_reduction, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + {QCOM_DATA: 1}, + ) + + # check final index_input tensor + index_input_tensor, index_input_tensor_wrapper = ( 
+ (concat_output_tensor, concat_output_tensor_wrapper) + if len(index_tensor_wrappers) > 1 + else specified_index[last_specified_index_node] + ) + target_index_reshape_tensor = index_input_tensor.reshape((*target_index, -1)) + target_index_reshape_tensor_wrapper = self.define_custom_tensor_wrapper( + node_name=node.name + "_target_index_reshape", + tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + dtype=QNN_TENSOR_TYPE_MAP[target_index_reshape_tensor.dtype], + quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED, + quant_configs={}, + dims=target_index_reshape_tensor.size(), + tensor=target_index_reshape_tensor, + is_fake_tensor=True, + nodes_to_wrappers=nodes_to_wrappers, + ) + target_index_reshape_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpReshape.op_name, + ) + target_index_reshape_op.AddInputTensors([index_input_tensor_wrapper]) + target_index_reshape_op.AddOutputTensors([target_index_reshape_tensor_wrapper]) + op_wrapper_list.append(target_index_reshape_op) + index_put_op.AddInputTensors( [ input_tensor_wrapper, - index_put_index_input_tensor_wrapper, - value_tensor_wrapper, + target_index_reshape_tensor_wrapper, + value_tile_tensor_wrapper, ] ) index_put_op.AddOutputTensors([output_tensor_wrapper]) diff --git a/backends/qualcomm/builders/op_mean_dim.py b/backends/qualcomm/builders/op_mean_dim.py index 630b1b0b8de..10644e17c79 100644 --- a/backends/qualcomm/builders/op_mean_dim.py +++ b/backends/qualcomm/builders/op_mean_dim.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import cast, Dict, List +from typing import cast, Dict import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper @@ -40,7 +40,22 @@ def define_node( ) # mean dims and keep dims - mean_dims = cast(List[int], node.args[1]) + rank = len(input_node.meta["val"].shape) + + if rank == 0: + raise RuntimeError( + "Mean doesn't support 0d input, please report a bug in https://github.com/pytorch/executorch/issues" + ) + + dim_arg = node.args[1] + + if dim_arg is None or len(dim_arg) == 0: + mean_dims = list(range(rank)) # reduce over all dims + elif isinstance(dim_arg, int): + mean_dims = [dim_arg] + else: + mean_dims = list(dim_arg) + mean_dims = [ mean_dim % len(input_node.meta["val"].shape) for mean_dim in mean_dims ] diff --git a/backends/qualcomm/builders/op_transpose.py b/backends/qualcomm/builders/op_transpose.py index dbed10ced46..e7fd84e8e79 100644 --- a/backends/qualcomm/builders/op_transpose.py +++ b/backends/qualcomm/builders/op_transpose.py @@ -42,6 +42,8 @@ def define_node( # permutation permute_order = cast(List[int], node.args[1]) + # to prevent negative values + permute_order = [x % len(permute_order) for x in permute_order] permute_order_shape = [len(permute_order)] output_tensor = input_tensor.permute(permute_order) diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index b0c44dcae80..79a1c93d50c 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -59,6 +59,15 @@ class OpConv2d: param_dilation: str = "dilation" +@dataclass(init=False, frozen=True) +class OpConv3d: + op_name: str = "Conv3d" + param_stride: str = "stride" + param_pad_amount: str = "pad_amount" + param_group: str = "group" + param_dilation: str = "dilation" + + @dataclass(init=False, frozen=True) class OpConvert: op_name: str = "Convert" @@ -573,6 +582,15 @@ class OpTransposeConv2d: param_output_padding: str = "output_padding" +@dataclass(init=False, 
frozen=True) +class OpTransposeConv3d: + op_name: str = "TransposeConv3d" + param_stride: str = "stride" + param_pad_amount: str = "pad_amount" + param_group: str = "group" + param_output_padding: str = "output_padding" + + @dataclass(init=False, frozen=True) class OpUnpack: op_name: str = "UnPack" diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py index 7a2924fe756..0a947759538 100644 --- a/backends/qualcomm/partition/common_defs.py +++ b/backends/qualcomm/partition/common_defs.py @@ -17,6 +17,7 @@ to_be_implemented_operator = [ exir_ops.edge.aten._adaptive_avg_pool3d.default, exir_ops.edge.aten.adaptive_max_pool2d.default, + exir_ops.edge.aten.adaptive_max_pool3d.default, exir_ops.edge.aten.avg_pool3d.default, exir_ops.edge.aten.div.Tensor_mode, exir_ops.edge.aten.log10.default, diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py index 88109b51697..cf403a1a76d 100644 --- a/backends/qualcomm/quantizer/annotators.py +++ b/backends/qualcomm/quantizer/annotators.py @@ -68,7 +68,7 @@ def _is_float_tensor(node: Node): or not isinstance(node.meta["val"], FakeTensor) ): return False - return node.meta["val"].dtype == torch.float32 + return node.meta["val"].dtype in (torch.bfloat16, torch.float32) def _mark_nodes_as_annotated(nodes: List[Node]): @@ -674,7 +674,7 @@ def annotate_pad(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) -@register_annotator([torch.ops.aten.reshape.default]) +@register_annotator([torch.ops.aten.reshape.default, torch.ops.aten.unflatten.int]) def annotate_reshape(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -879,7 +879,7 @@ def annotate_unsqueeze_copy( annotate_single_in_share_out(node, quantization_config) -@register_annotator([torch.ops.aten.transpose.int]) 
+@register_annotator([torch.ops.aten.transpose.int, torch.ops.aten.swapaxes.default]) def annotate_transpose(node: Node, quantization_config: QuantizationConfig) -> None: annotate_in_out_obs_sharing_op(node, quantization_config) if not _is_annotated([node]): @@ -1094,11 +1094,13 @@ def annotate_cdist(node: Node, quantization_config: QuantizationConfig) -> None: @register_annotator( [ + torch.ops.aten.conv1d.default, torch.ops.aten.conv2d.default, torch.ops.aten.conv2d.padding, - torch.ops.aten.conv1d.default, - torch.ops.aten.conv_transpose2d.input, + torch.ops.aten.conv3d.default, torch.ops.aten.conv_transpose1d.default, + torch.ops.aten.conv_transpose2d.input, + torch.ops.aten.conv_transpose3d.input, torch.ops.aten.convolution.default, ] ) @@ -1356,7 +1358,7 @@ def annotate_chunk(node: Node, quantization_config: QuantizationConfig) -> None: ) -@register_annotator([torch.ops.aten.where.self]) +@register_annotator([torch.ops.aten.where.self, torch.ops.aten.where.ScalarSelf]) def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None: if _is_annotated([node]): return @@ -1366,7 +1368,6 @@ def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None: assert isinstance(input_node, Node) if _is_float_tensor(input_node): input_qspec_map[input_node] = quantization_config.input_activation - node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation( input_qspec_map=input_qspec_map, output_qspec=( diff --git a/backends/qualcomm/quantizer/qconfig.py b/backends/qualcomm/quantizer/qconfig.py index 2f26cd27d31..694fab3dc6b 100644 --- a/backends/qualcomm/quantizer/qconfig.py +++ b/backends/qualcomm/quantizer/qconfig.py @@ -200,12 +200,11 @@ def get_16a8w_qnn_qat_config( act_observer=MovingAverageMinMaxObserver, ) -> QuantizationConfig: extra_args: Dict[str, Any] = {"eps": 2**-20} - act_fake_quant_ctr = FakeQuantize.with_args( + act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args( dtype=torch.int32, 
quant_min=torch.iinfo(torch.uint16).min, quant_max=torch.iinfo(torch.uint16).max, qscheme=torch.per_tensor_affine, - reduce_range=True, observer=act_observer.with_args(**extra_args), ) act_quantization_spec = QuantizationSpec( @@ -220,7 +219,6 @@ def get_16a8w_qnn_qat_config( quant_min=torch.iinfo(torch.int8).min + 1, quant_max=torch.iinfo(torch.int8).max, qscheme=torch.per_tensor_symmetric, - reduce_range=True, observer=MovingAverageMinMaxObserver, ) weight_quantization_spec = QuantizationSpec( @@ -400,7 +398,7 @@ def get_ptq_per_block_quant_config( def get_8a8w_qnn_qat_config( act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver ) -> QuantizationConfig: - act_fake_quant_ctr = FakeQuantize.with_args( + act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args( dtype=torch.uint8, qscheme=( torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine @@ -421,7 +419,6 @@ def get_8a8w_qnn_qat_config( quant_min=torch.iinfo(torch.int8).min + 1, quant_max=torch.iinfo(torch.int8).max, qscheme=torch.per_tensor_symmetric, - reduce_range=True, observer=MovingAverageMinMaxObserver, ) weight_quantization_spec = QuantizationSpec( @@ -438,7 +435,6 @@ def get_8a8w_qnn_qat_config( quant_min=torch.iinfo(torch.int32).min, quant_max=torch.iinfo(torch.int32).max, qscheme=torch.per_tensor_symmetric, - reduce_range=True, observer=MovingAverageMinMaxObserver, ) bias_quantization_spec = QuantizationSpec( @@ -462,12 +458,11 @@ def get_8a8w_qnn_qat_config( def get_16a4w_qnn_qat_config( act_observer=MovingAverageMinMaxObserver, ) -> QuantizationConfig: - act_fake_quant_ctr = FakeQuantize.with_args( + act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args( dtype=torch.int32, quant_min=torch.iinfo(torch.uint16).min, quant_max=torch.iinfo(torch.uint16).max, qscheme=torch.per_tensor_affine, - reduce_range=True, observer=act_observer, ) act_quantization_spec = QuantizationSpec( @@ -484,7 +479,6 @@ def get_16a4w_qnn_qat_config( quant_max=7, 
qscheme=torch.per_tensor_symmetric, ch_axis=0, - reduce_range=True, observer=MovingAverageMinMaxObserver, ) weight_quantization_spec = QuantizationSpec( @@ -501,7 +495,6 @@ def get_16a4w_qnn_qat_config( quant_min=torch.iinfo(torch.int32).min, quant_max=torch.iinfo(torch.int32).max, qscheme=torch.per_tensor_symmetric, - reduce_range=True, observer=MovingAverageMinMaxObserver, ) bias_quantization_spec = QuantizationSpec( @@ -548,10 +541,9 @@ def get_qat_per_channel_quant_config( # If zero_point is 128, htp can do optimizations. # If we keep quant_min and quant_max none, observer will default use 128 as zero_point. # If we provide uint8 quant_min/max, it will use 127 as zero_point, which is undesired. - act_fake_quant_ctr = FakeQuantize.with_args( + act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args( dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype, qscheme=torch.per_tensor_symmetric, - reduce_range=True, observer=act_observer, ) act_quantization_spec = QuantizationSpec( @@ -561,12 +553,11 @@ def get_qat_per_channel_quant_config( observer_or_fake_quant_ctr=act_fake_quant_ctr, ) else: - act_fake_quant_ctr = FakeQuantize.with_args( + act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args( dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype, quant_min=torch.iinfo(act_dtype).min, quant_max=torch.iinfo(act_dtype).max, qscheme=torch.per_tensor_affine, - reduce_range=True, observer=act_observer, ) act_quantization_spec = QuantizationSpec( diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index 5943b54d968..44d129d5544 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -161,6 +161,7 @@ def __post_init__(self): { torch.ops.aten.conv1d.default, torch.ops.aten.conv2d.default, + torch.ops.aten.conv3d.default, torch.ops.aten.conv_transpose2d.input, } ) diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index 
c84911cf851..4cdd1efe6f4 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -86,6 +86,7 @@ if [ "$BUILD_AARCH64" = true ]; then -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ @@ -155,6 +156,7 @@ if [ "$BUILD_X86_64" = true ]; then -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ diff --git a/backends/qualcomm/scripts/download_qnn_sdk.py b/backends/qualcomm/scripts/download_qnn_sdk.py index 35006a41433..747524a0e5b 100644 --- a/backends/qualcomm/scripts/download_qnn_sdk.py +++ b/backends/qualcomm/scripts/download_qnn_sdk.py @@ -6,12 +6,15 @@ import platform import re import shutil +import subprocess +import sys import tarfile import tempfile import urllib.request import zipfile from typing import Dict, List, Optional, Tuple + logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) @@ -34,68 +37,81 @@ def is_linux_x86() -> bool: ) -import subprocess +######################### +# Cache directory helper +######################### -MINIMUM_LIBC_VERSION = 2.29 +APP_NAMESPACE = ["executorch", "qnn"] -REQUIRED_LIBC_LIBS = [ - "/lib/x86_64-linux-gnu/libc.so.6", - "/lib64/libc.so.6", - "/lib/libc.so.6", -] +def _get_staging_dir(*parts: str) -> pathlib.Path: + r""" + Return a cross-platform staging directory for staging SDKs/libraries. 
+ + - On Linux: + ~/.cache/executorch/qnn/ + (falls back to $HOME/.cache if $XDG_CACHE_HOME is unset) -def check_glibc_exist_and_validate() -> bool: + - On Windows (not supported yet, but as placeholder): + %LOCALAPPDATA%\executorch\qnn\ + (falls back to $HOME/AppData/Local if %LOCALAPPDATA% is unset) + + - Override: + If QNN_STAGING_DIR is set in the environment, that path is used instead. + + Args: + parts (str): Subdirectories to append under the root staging dir. + + Returns: + pathlib.Path: Fully qualified staging path. """ - Check if users have glibc installed. + # Environment override wins + base = os.environ.get("QNN_STAGING_DIR") + if base: + return pathlib.Path(base).joinpath(*parts) + + system = platform.system().lower() + if system == "windows": + # On Windows, prefer %LOCALAPPDATA%, fallback to ~/AppData/Local + base = pathlib.Path( + os.environ.get("LOCALAPPDATA", pathlib.Path.home() / "AppData" / "Local") + ) + elif is_linux_x86(): + # On Linux/Unix, prefer $XDG_CACHE_HOME, fallback to ~/.cache + base = pathlib.Path( + os.environ.get("XDG_CACHE_HOME", pathlib.Path.home() / ".cache") + ) + else: + raise ValueError(f"Unsupported platform: {system}") + + return base.joinpath(*APP_NAMESPACE, *parts) + + +def _atomic_download(url: str, dest: pathlib.Path): """ - exists = False - for path in REQUIRED_LIBC_LIBS: - try: - output = subprocess.check_output( - [path, "--version"], stderr=subprocess.STDOUT - ) - output = output.decode().split("\n")[0] - logger.debug(f"[QNN] glibc version for path {path} is: {output}") - match = re.search(r"version (\d+\.\d+)", output) - if match: - version = match.group(1) - if float(version) >= MINIMUM_LIBC_VERSION: - logger.debug(f"[QNN] glibc version is {version}.") - exists = True - return True - else: - logger.error( - f"[QNN] glibc version is too low. The minimum libc version is {MINIMUM_LIBC_VERSION} Please install glibc following the commands below." 
- ) - else: - logger.error("[QNN] glibc version not found.") + Download URL into dest atomically: + - Write to a temp file in the same dir + - Move into place if successful + """ + dest.parent.mkdir(parents=True, exist_ok=True) - except Exception: - continue + # Temp file in same dir (guarantees atomic rename) + with tempfile.NamedTemporaryFile(dir=dest.parent, delete=False) as tmp: + tmp_path = pathlib.Path(tmp.name) - if not exists: - logger.error( - r"""" - [QNN] glibc not found or the version is too low. Please install glibc following the commands below. - Ubuntu/Debian: - sudo apt update - sudo apt install libc6 - - Fedora/Red Hat: - sudo dnf install glibc - - Arch Linux: - sudo pacman -S glibc - - Also please make sure the glibc version is >= MINIMUM_LIBC_VERSION. You can verify the glibc version by running the following command: - Option 1: - ldd --version - Option 2: - /path/to/libc.so.6 --version - """ - ) - return exists + try: + urllib.request.urlretrieve(url, tmp_path) + tmp_path.replace(dest) # atomic rename + except Exception: + # Clean up partial file on failure + if tmp_path.exists(): + tmp_path.unlink(missing_ok=True) + raise + + +#################### +# qnn sdk download management +#################### def _download_archive(url: str, archive_path: pathlib.Path) -> bool: @@ -178,9 +194,6 @@ def _download_qnn_sdk(dst_folder=SDK_DIR) -> Optional[pathlib.Path]: if not is_linux_x86(): logger.info("[QNN] Skipping Qualcomm SDK (only supported on Linux x86).") return None - elif not check_glibc_exist_and_validate(): - logger.info("[QNN] Skipping Qualcomm SDK (glibc not found or version too old).") - return None else: logger.info("[QNN] Downloading Qualcomm SDK for Linux x86") @@ -241,6 +254,136 @@ def _extract_tar(archive_path: pathlib.Path, prefix: str, target_dir: pathlib.Pa dst.write(src.read()) +#################### +# libc management +#################### + +GLIBC_VERSION = "2.34" +GLIBC_REEXEC_GUARD = "QNN_GLIBC_REEXEC" +MINIMUM_LIBC_VERSION = 
GLIBC_VERSION + + +def _get_glibc_libdir() -> pathlib.Path: + glibc_root = _get_staging_dir(f"glibc-{GLIBC_VERSION}") + return glibc_root / "lib" + + +def _parse_version(v: str) -> tuple[int, int]: + """Turn '2.34' → (2,34) so it can be compared.""" + parts = v.split(".") + return int(parts[0]), int(parts[1]) if len(parts) > 1 else 0 + + +def _current_glibc_version() -> str: + """Return system glibc version string (via ctypes).""" + try: + libc = ctypes.CDLL("libc.so.6") + func = libc.gnu_get_libc_version + func.restype = ctypes.c_char_p + return func().decode() + except Exception as e: + return f"error:{e}" + + +def _resolve_glibc_loader() -> pathlib.Path | None: + """Return staged ld.so path if available.""" + for p in [ + _get_glibc_libdir() / f"ld-{GLIBC_VERSION}.so", + _get_glibc_libdir() / "ld-linux-x86-64.so.2", + ]: + if p.exists(): + return p + return None + + +def _stage_prebuilt_glibc(): + """Download + extract Fedora 35 glibc RPM into /tmp.""" + logger.info(">>> Staging prebuilt glibc-%s from Fedora 35 RPM", GLIBC_VERSION) + _get_glibc_libdir().mkdir(parents=True, exist_ok=True) + rpm_path = _get_staging_dir("glibc") / "glibc.rpm" + work_dir = _get_staging_dir("glibc") / "extracted" + rpm_url = ( + "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/35/" + "Everything/x86_64/os/Packages/g/glibc-2.34-7.fc35.x86_64.rpm" + ) + + rpm_path.parent.mkdir(parents=True, exist_ok=True) + logger.info("[glibc] Downloading %s -> %s", rpm_url, rpm_path) + try: + urllib.request.urlretrieve(rpm_url, rpm_path) + except Exception as e: + logger.error("[glibc] Failed to download %s: %s", rpm_url, e) + raise + + # Extract + if work_dir.exists(): + shutil.rmtree(work_dir) + work_dir.mkdir(parents=True) + subprocess.check_call(["bsdtar", "-C", str(work_dir), "-xf", str(rpm_path)]) + + # Copy runtime libs + staged = [ + "ld-linux-x86-64.so.2", + "libc.so.6", + "libdl.so.2", + "libpthread.so.0", + "librt.so.1", + "libm.so.6", + "libutil.so.1", + ] + for lib 
in staged: + src = work_dir / "lib64" / lib + if src.exists(): + shutil.copy2(src, _get_glibc_libdir() / lib) + logger.info("[glibc] Staged %s", lib) + else: + logger.warning("[glibc] Missing %s in RPM", lib) + + +def ensure_glibc_minimum(min_version: str = GLIBC_VERSION): + """ + Ensure process runs under glibc >= min_version. + - If system glibc is new enough → skip. + - Else → stage Fedora RPM and re-exec under staged loader. + """ + current = _current_glibc_version() + logger.info("[glibc] Current loaded glibc: %s", current) + + # If system glibc already sufficient → skip everything + m = re.match(r"(\d+\.\d+)", current) + if m and _parse_version(m.group(1)) >= _parse_version(min_version): + logger.info("[glibc] System glibc >= %s, no staging needed.", min_version) + return + + # Avoid infinite loop + if os.environ.get(GLIBC_REEXEC_GUARD) == "1": + logger.info("[glibc] Already re-exec'd once, continuing.") + return + + # Stage prebuilt if not already staged + if not (_get_glibc_libdir() / "libc.so.6").exists(): + _stage_prebuilt_glibc() + + loader = _resolve_glibc_loader() + if not loader: + logger.error("[glibc] Loader not found in %s", _get_glibc_libdir()) + return + + logger.info( + "[glibc] Re-execing under loader %s with libdir %s", loader, _get_glibc_libdir() + ) + os.environ[GLIBC_REEXEC_GUARD] = "1" + os.execv( + str(loader), + [str(loader), "--library-path", str(_get_glibc_libdir()), sys.executable] + + sys.argv, + ) + + +#################### +# libc++ management +#################### + LLVM_VERSION = "14.0.0" LIBCXX_BASE_NAME = f"clang+llvm-{LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04" LLVM_URL = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/{LIBCXX_BASE_NAME}.tar.xz" @@ -258,12 +401,17 @@ def _stage_libcxx(target_dir: pathlib.Path): logger.info("[libcxx] Already staged at %s, skipping download", target_dir) return - temp_tar = pathlib.Path("/tmp") / f"{LIBCXX_BASE_NAME}.tar.xz" - temp_extract = pathlib.Path("/tmp") / 
LIBCXX_BASE_NAME + libcxx_stage = _get_staging_dir(f"libcxx-{LLVM_VERSION}") + temp_tar = libcxx_stage / f"{LIBCXX_BASE_NAME}.tar.xz" + temp_extract = libcxx_stage / LIBCXX_BASE_NAME if not temp_tar.exists(): logger.info("[libcxx] Downloading %s", LLVM_URL) - urllib.request.urlretrieve(LLVM_URL, temp_tar) + _atomic_download(LLVM_URL, temp_tar) + + # Sanity check before extracting + if not temp_tar.exists() or temp_tar.stat().st_size == 0: + raise FileNotFoundError(f"[libcxx] Tarball missing or empty: {temp_tar}") logger.info("[libcxx] Extracting %s", temp_tar) with tarfile.open(temp_tar, "r:xz") as tar: @@ -437,8 +585,10 @@ def install_qnn_sdk() -> bool: Returns: True if both steps succeeded (or were already satisfied), else False. """ - if check_glibc_exist_and_validate(): - if _ensure_libcxx_stack(): - if _ensure_qnn_sdk_lib(): - return True - return False + logger.info("[QNN] Starting SDK installation") + + # Make sure we’re running under >= 2.34 + ensure_glibc_minimum(GLIBC_VERSION) + + # libc++ and QNN SDK setup + return _ensure_libcxx_stack() and _ensure_qnn_sdk_lib() diff --git a/backends/qualcomm/tests/TARGETS b/backends/qualcomm/tests/TARGETS index 639303c7eb8..d968f954485 100644 --- a/backends/qualcomm/tests/TARGETS +++ b/backends/qualcomm/tests/TARGETS @@ -47,3 +47,17 @@ runtime.python_library( ":test_qnn_delegate" ] ) + +runtime.python_test( + name = "test_passes", + srcs = [ + "test_passes.py", + ], + deps = [ + "fbsource//third-party/pypi/expecttest:expecttest", # @manual + "//caffe2:torch", + "//executorch/exir:lib", + "//executorch/backends/qualcomm/_passes:passes", + "//executorch/backends/qualcomm/builders:builders", + ], +) diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 77ff1be4562..5ea6caf54ad 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -4,8 +4,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of 
this source tree. -import torch +from typing import List, Optional, Tuple, Union +import torch # module with related operator only @@ -66,6 +67,28 @@ def forward(self, x, y): return torch.add(x, y) +class AddAlpha(torch.nn.Module): + def __init__(self, alpha): + super().__init__() + self.alpha = alpha + + def forward(self, x, y): + return torch.add(x, y, alpha=self.alpha) + + +class AddAlphaConstant(torch.nn.Module): + def __init__(self, alpha, constant_first=False): + super().__init__() + self.alpha = alpha + self.constant_first = constant_first + + def forward(self, x): + if self.constant_first: + return torch.add(5.0, x, alpha=self.alpha) + else: + return torch.add(x, 5.0, alpha=self.alpha) + + class AddConstantFloat(torch.nn.Module): def __init__(self): super().__init__() @@ -148,21 +171,23 @@ def forward(self, y): class Argmax(torch.nn.Module): - def __init__(self): + def __init__(self, dim: Optional[int] = None, keepdim: bool = False): super().__init__() + self.dim = dim + self.keepdim = keepdim def forward(self, x): - x = torch.argmax(x, dim=0, keepdim=True) - return x + return torch.argmax(x, dim=self.dim, keepdim=self.keepdim) class Argmin(torch.nn.Module): - def __init__(self): + def __init__(self, dim: Optional[int] = None, keepdim: bool = False): super().__init__() + self.dim = dim + self.keepdim = keepdim def forward(self, x): - x = torch.argmin(x, dim=0, keepdim=True) - return x + return torch.argmin(x, dim=self.dim, keepdim=self.keepdim) class ArgminViewSqueezeConv2D(torch.nn.Module): @@ -274,6 +299,15 @@ def forward(self, x, y): return torch.cat((y, y, x, x), axis=2) +class Cat5(torch.nn.Module): + def __init__(self): + super().__init__() + self.const_tensor = torch.randn(1, 1, 2, 2) + + def forward(self, x, y): + return torch.cat((x, y, self.const_tensor), axis=2) + + class CausalMask(torch.nn.Module): def __init__(self): super().__init__() @@ -588,40 +622,6 @@ def forward(self, x): return self.conv(x) -class ConvTranspose1dSingle(torch.nn.Module): 
- def __init__(self, bias=True, dilation=1): - super().__init__() - self.conv_transpose = torch.nn.ConvTranspose1d( - in_channels=1, - out_channels=3, - kernel_size=3, - stride=2, - padding=1, - dilation=dilation, - bias=bias, - ) - - def forward(self, x): - return self.conv_transpose(x) - - -class ConvTranspose2dSingle(torch.nn.Module): - def __init__(self, bias=True, dilation=1): - super().__init__() - self.conv_transpose = torch.nn.ConvTranspose2d( - in_channels=1, - out_channels=3, - kernel_size=3, - stride=2, - padding=1, - dilation=dilation, - bias=bias, - ) - - def forward(self, x): - return self.conv_transpose(x) - - class Conv2dDownUpSample(torch.nn.Module): def __init__(self, bias=True): super().__init__() @@ -706,6 +706,79 @@ def forward(self, x): return topk_values +class Conv3dSequential(torch.nn.Module): + def __init__(self, bias=True): + super().__init__() + self.first = torch.nn.Conv3d( + in_channels=1, + out_channels=3, + kernel_size=(3, 3, 3), + padding=1, + bias=bias, + ) + self.second = torch.nn.Conv3d( + in_channels=3, + out_channels=2, + kernel_size=(3, 3, 3), + padding=1, + bias=bias, + ) + + def forward(self, x): + return self.second(self.first(x)) + + +class ConvTranspose1dSingle(torch.nn.Module): + def __init__(self, bias=True, dilation=1): + super().__init__() + self.conv_transpose = torch.nn.ConvTranspose1d( + in_channels=1, + out_channels=3, + kernel_size=3, + stride=2, + padding=1, + dilation=dilation, + bias=bias, + ) + + def forward(self, x): + return self.conv_transpose(x) + + +class ConvTranspose2dSingle(torch.nn.Module): + def __init__(self, bias=True, dilation=1): + super().__init__() + self.conv_transpose = torch.nn.ConvTranspose2d( + in_channels=1, + out_channels=3, + kernel_size=3, + stride=2, + padding=1, + dilation=dilation, + bias=bias, + ) + + def forward(self, x): + return self.conv_transpose(x) + + +class ConvTranspose3dSingle(torch.nn.Module): + def __init__(self, bias=True, dilation=1): + super().__init__() + 
self.conv_transpose = torch.nn.ConvTranspose3d( + in_channels=1, + out_channels=3, + kernel_size=3, + stride=2, + padding=1, + dilation=dilation, + bias=bias, + ) + + def forward(self, x): + return self.conv_transpose(x) + + class Cos(torch.nn.Module): def __init__(self): super().__init__() @@ -1068,20 +1141,62 @@ def forward(self, input_pos, k_val): class IndexPut(torch.nn.Module): - def __init__(self, skip_mutable_buffer=False): + def __init__(self, skip_mutable_buffer=False, mode=0): super().__init__() self.skip_mutable_buffer = skip_mutable_buffer self.register_buffer( "k_cache", - torch.zeros((1, 1024, 12, 64), dtype=torch.float32), + torch.zeros((2, 1024, 12, 64), dtype=torch.float32), persistent=True, ) + self.mode = mode def forward(self, input_pos, k_val): - k_out = torch.ops.aten.index_put_(self.k_cache, [None, input_pos], k_val) + match self.mode: + case 0: + k_out = torch.ops.aten.index_put_(self.k_cache, [input_pos], k_val) + case 1: + k_out = torch.ops.aten.index_put_( + self.k_cache, [None, input_pos], k_val + ) + case 2: + k_out = torch.ops.aten.index_put_( + self.k_cache, [None, None, input_pos], k_val + ) + case 3: + k_out = torch.ops.aten.index_put_( + self.k_cache, [input_pos[0], input_pos[1]], k_val + ) + case 4: + k_out = torch.ops.aten.index_put_( + self.k_cache, [None, input_pos[0], input_pos[1]], k_val + ) + case 5: + k_out = torch.ops.aten.index_put_( + self.k_cache, [input_pos[0], None, input_pos[1]], k_val + ) + return k_out + 0 +class IndexPutSuite(torch.nn.Module): + def __init__(self, accumulate=False, in_place=False): + super().__init__() + self.accumulate = accumulate + self.in_place = in_place + + def forward(self, x, indices, values): + if self.in_place: + # Clone the input to avoid modifying it in-place + result = x.clone() + # Apply index_put_ and return the modified tensor + result.index_put_(indices, values, self.accumulate) + return result + else: + # Use the non-in-place variant which returns a new tensor + return 
torch.index_put(x, indices, values, self.accumulate) + + class IndexSelect(torch.nn.Module): def __init__(self, dim): super().__init__() @@ -1262,20 +1377,20 @@ def forward(self, x): return self.max_pool2d(x) -class MeanWKeppDim(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.mean(x, (-1, -2), keepdim=True) - - -class MeanWOKeppDim(torch.nn.Module): - def __init__(self): +class Mean(torch.nn.Module): + def __init__( + self, + dim: Optional[Union[int, Tuple[int, ...], List[int]]] = None, + keepdim: bool = False, + dtype: Optional[torch.dtype] = None, + ): super().__init__() + self.dim = dim + self.keepdim = keepdim + self.dtype = dtype def forward(self, x): - return torch.mean(x, (-1, -2)) + return torch.mean(x, dim=self.dim, keepdim=self.keepdim, dtype=self.dtype) class MaskedFill(torch.nn.Module): @@ -1436,6 +1551,15 @@ def forward(self, x): ) +class Permute(torch.nn.Module): + def __init__(self, dims: List[int]): + super().__init__() + self.dims = dims + + def forward(self, x): + return x.permute(self.dims) + + class PixelShuffle(torch.nn.Module): def __init__(self, scale): super().__init__() @@ -1469,11 +1593,12 @@ def forward(self, x): class PowTensorScalar(torch.nn.Module): - def __init__(self): + def __init__(self, exponent=2): super().__init__() + self.exponent = exponent def forward(self, x): - return torch.pow(x, 2) + return torch.pow(x, self.exponent) class PReLUDefault(torch.nn.Module): @@ -1854,6 +1979,28 @@ def forward(self, x, y): return torch.sub(x, y) +class SubAlpha(torch.nn.Module): + def __init__(self, alpha): + super().__init__() + self.alpha = alpha + + def forward(self, x, y): + return torch.sub(x, y, alpha=self.alpha) + + +class SubAlphaConstant(torch.nn.Module): + def __init__(self, alpha, constant_first=False): + super().__init__() + self.alpha = alpha + self.constant_first = constant_first + + def forward(self, x): + if self.constant_first: + return torch.sub(5.0, x, alpha=self.alpha) + 
else: + return torch.sub(x, 5.0, alpha=self.alpha) + + class SubConstantFloat(torch.nn.Module): def __init__(self): super().__init__() @@ -1890,6 +2037,16 @@ def forward(self, x): return torch.sum(x, dim=(2, 3), keepdim=True) +class SwapAxes(torch.nn.Module): + def __init__(self, axis0, axis1): + super().__init__() + self.axis0 = axis0 + self.axis1 = axis1 + + def forward(self, x): + return torch.swapaxes(x, axis0=self.axis0, axis1=self.axis1) + + class Tanh(torch.nn.Module): def __init__(self): super().__init__() @@ -1898,6 +2055,19 @@ def forward(self, x): return torch.tanh(x) +class Threshold(torch.nn.Module): + def __init__(self, threshold=0.0, value=0.0, inplace=False): + super().__init__() + self.threshold = threshold + self.value = value + self.inplace = inplace + + def forward(self, x): + return torch.nn.functional.threshold( + x, threshold=self.threshold, value=self.value, inplace=self.inplace + ) + + class TopKandIndex(torch.nn.Module): def __init__(self): super().__init__() @@ -1916,6 +2086,16 @@ def forward(self, x): return torch.unbind(x) +class Unflatten(torch.nn.Module): + def __init__(self, dim, sizes): + super().__init__() + self.dim = dim + self.sizes = sizes + + def forward(self, x): + return torch.unflatten(x, dim=self.dim, sizes=self.sizes) + + class Unfold(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_passes.py b/backends/qualcomm/tests/test_passes.py new file mode 100644 index 00000000000..94a5d08acc1 --- /dev/null +++ b/backends/qualcomm/tests/test_passes.py @@ -0,0 +1,54 @@ +import unittest + +import torch +from executorch.backends.qualcomm._passes import InsertReshapeForReduceOps + + +class TestPasses(unittest.TestCase): + def test_insert_reshape_for_argmax(self): + class ArgmaxModule(torch.nn.Module): + def forward(self, x): + return torch.argmax(x, dim=None) + + mod = ArgmaxModule() + + x = torch.tensor([[1.0, 5.0], [3.0, 2.0]]) + ep = torch.export.export(mod, (x,)) + # Run original 
module for reference + ref = mod(x) + + reshape_nodes = [ + n for n in ep.graph.nodes if n.target == torch.ops.aten.reshape.default + ] + argmax_nodes = [ + n for n in ep.graph.nodes if n.target == torch.ops.aten.argmax.default + ] + self.assertTrue(len(reshape_nodes) == 0, "Reshape node not inserted") + self.assertTrue(len(argmax_nodes) == 1, "Argmax node missing") + + InsertReshapeForReduceOps()(ep.graph_module) + + out = ep.graph_module(x) + + # Check graph structure: argmax should take a reshape as input + reshape_nodes = [ + n for n in ep.graph.nodes if n.target == torch.ops.aten.reshape.default + ] + argmax_nodes = [ + n for n in ep.graph.nodes if n.target == torch.ops.aten.argmax.default + ] + self.assertTrue(len(reshape_nodes) == 1, "Reshape node should be inserted") + self.assertTrue(len(argmax_nodes) == 1, "Argmax node missing") + + argmax_node = argmax_nodes[0] + self.assertEqual(argmax_node.args[1], 0, "Argmax dim not set to 0") + + # Execute new graph and compare with reference + out = ep.graph_module(x) + self.assertTrue( + torch.equal(*out, ref), f"Output mismatch: got {out}, expected {ref}" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 5a86d5f286d..2641acc5a2d 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
import io +import itertools import json import subprocess import sys @@ -173,14 +174,64 @@ def test_qnn_backend_arange(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_argmax(self): - module = Argmax() # noqa: F405 - sample_input = (torch.randn(16, 3, 4, 4),) - self.lower_module_and_test_output(module, sample_input) + test_cases = [ + { + QCOM_MODULE: Argmax(), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmax(dim=0, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmax(dim=1, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(8, 5),), + }, + { + QCOM_MODULE: Argmax(dim=None, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.tensor([5.0]),), + }, + { + QCOM_MODULE: Argmax(dim=2, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4),), + }, + ] + + for i, case in enumerate(test_cases): + with self.subTest(i=i): + self.lower_module_and_test_output( + case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS] + ) def test_qnn_backend_argmin(self): - module = Argmin() # noqa: F405 - sample_input = (torch.rand(3, 4),) - self.lower_module_and_test_output(module, sample_input) + test_cases = [ + { + QCOM_MODULE: Argmin(), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmin(dim=0, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmin(dim=1, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(8, 5),), + }, + { + QCOM_MODULE: Argmin(dim=None, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.tensor([5.0]),), + }, + { + QCOM_MODULE: Argmin(dim=2, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4),), + }, + ] + + for i, case in enumerate(test_cases): + with self.subTest(i=i): + self.lower_module_and_test_output( + case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS] + ) 
@unittest.expectedFailure def test_qnn_backend_asin(self): @@ -232,7 +283,7 @@ def test_qnn_backend_cast(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_cat(self): - modules = [Cat2(), Cat3(), Cat4()] # noqa: F405 + modules = [Cat2(), Cat3(), Cat4(), Cat5()] # noqa: F405 sample_input = (torch.randn(1, 1, 2, 2), torch.randn(1, 1, 4, 2)) for i, module in enumerate(modules): with self.subTest(i=i): @@ -282,6 +333,13 @@ def test_qnn_backend_conv2d_channel_last(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv3d_sequential(self): + modules = [Conv3dSequential(), Conv3dSequential(bias=False)] # noqa: F405 + sample_input = (torch.randn([2, 1, 10, 32, 32]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv_transpose1d(self): modules = [ ConvTranspose1dSingle(), # noqa: F405 @@ -306,6 +364,18 @@ def test_qnn_backend_conv_transpose2d(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv_transpose3d(self): + modules = [ + ConvTranspose3dSingle(), # noqa: F405 + ConvTranspose3dSingle(bias=False), # noqa: F405 + ConvTranspose3dSingle(dilation=2), # noqa: F405 + ConvTranspose3dSingle(dilation=(3, 2, 3)), # noqa: F405 + ] + sample_input = (torch.randn([1, 1, 3, 3, 3]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_cos(self): module = Cos() # noqa: F405 sample_input = (torch.randn(2, 5, 1, 3),) @@ -328,8 +398,8 @@ def test_qnn_backend_cumsum(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def 
test_qnn_backend_einsum_outer_product(self): module = EinsumOuterProduct() # noqa: F405 @@ -372,6 +442,24 @@ def test_qnn_backend_element_wise_add(self): ], QCOM_SAMPLE_INPUTS: [(torch.randint(0, 10, size=(2, 3)),)], }, + { + QCOM_MODULE: [ + AddAlpha(alpha=2), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [ + ( + torch.tensor([[1.2, 1.3, 1.4]]), + torch.tensor([[0.8, 1.6, 0.2]]), + ) + ], + }, + { + QCOM_MODULE: [ + AddAlphaConstant(alpha=2, constant_first=True), # noqa: F405 + AddAlphaConstant(alpha=2, constant_first=False), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [(torch.tensor([[1.2, 1.3, 1.4]]),)], + }, ] index = 0 @@ -379,8 +467,8 @@ def test_qnn_backend_element_wise_add(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_element_wise_and(self): module = And(torch.tensor(1.7), torch.tensor(0.2)) # noqa: F405 @@ -418,8 +506,8 @@ def test_qnn_backend_element_wise_div(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_element_wise_mul(self): test_comb = [ @@ -445,8 +533,8 @@ def test_qnn_backend_element_wise_mul(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_element_wise_or(self): test_comb = [ @@ -495,6 +583,24 @@ def test_qnn_backend_element_wise_sub(self): QCOM_MODULE: [SubConstantFloat()], # noqa: F405 QCOM_SAMPLE_INPUTS: [(torch.randn(2, 5, 1, 3),)], }, + { + QCOM_MODULE: [ + SubAlpha(alpha=2), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [ + ( 
+ torch.tensor([[1.2, 1.3, 1.4]]), + torch.tensor([[0.8, 1.6, 0.2]]), + ) + ], + }, + { + QCOM_MODULE: [ + SubAlphaConstant(alpha=2, constant_first=True), # noqa: F405 + SubAlphaConstant(alpha=2, constant_first=False), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [(torch.tensor([[1.2, 1.3, 1.4]]),)], + }, ] index = 0 @@ -502,8 +608,8 @@ def test_qnn_backend_element_wise_sub(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) @unittest.expectedFailure def test_qnn_backend_elu(self): @@ -545,10 +651,10 @@ def test_qnn_backend_expand(self): for module in modules: for sample_input in sample_inputs: with self.subTest(i=index): + index += 1 self.lower_module_and_test_output( module, sample_input, passes_job=passes_job ) - index += 1 def test_qnn_backend_expm1(self): sample_input = (torch.randn(3, 4, 5),) @@ -571,6 +677,21 @@ def test_qnn_backend_floor_divide(self): { QCOM_MODULE: [FloorDiv()], # noqa: F405 QCOM_SAMPLE_INPUTS: [ + (torch.randint(-100, 100, (10, 10)), torch.full((10, 10), 3)), + ( + torch.randint(-100, 100, (10, 10)).float(), + torch.full((10, 10), 2.5), + ), + (torch.randint(-1000, 1000, (10, 10)), torch.full((10, 10), 100)), + (torch.tensor([10]), torch.arange(1, 5)), # Failed + (torch.arange(-10, 10), torch.tensor([2])), + (torch.randint(-100, 100, (20,)), torch.full((20,), 2)), + (torch.randint(-100, 100, (5, 10)), torch.full((5, 10), 2)), + (torch.randint(-100, 100, (3, 4, 5)), torch.full((3, 4, 5), 2)), + ( + torch.randint(-100, 100, (2, 3, 4, 5)), + torch.full((2, 3, 4, 5), 2), + ), (torch.randn(2, 5, 1, 3), eps + torch.randn(2, 5, 1, 3)), (torch.randn([2, 5, 1, 3]), eps + torch.randn([4, 1])), ], @@ -586,8 +707,8 @@ def test_qnn_backend_floor_divide(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - 
self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_fold(self): sample_input = (torch.randn(3, 512, 256),) @@ -631,6 +752,13 @@ def test_qnn_backend_gelu(self): sample_input = (torch.randn(2, 5, 1, 3),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_glu(self): + modules = [torch.nn.GLU(), torch.nn.GLU(dim=0)] + sample_input = (torch.randn(2, 5, 1, 4),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_greater_equal(self): test_comb = [ { @@ -760,28 +888,191 @@ def test_qnn_backend_index_copy(self): ) def test_qnn_backend_index_put(self): - test_comb = [ - { - QCOM_MODULE: IndexPut(skip_mutable_buffer=False), # noqa: F405 - QCOM_SAMPLE_INPUTS: ( - torch.tensor([2], dtype=torch.int32), - torch.randn([1, 1, 12, 64]), + skip_mutable_buffer = [False, True] + total_test_combo = [] + # mode 0 + sample_inputs = [ + (torch.tensor([0], dtype=torch.int32), torch.randn([1, 1, 12, 64])), + (torch.tensor([0], dtype=torch.int32), torch.randn([1, 64])), + (torch.tensor([0, 1], dtype=torch.int32), torch.randn([2, 1, 12, 64])), + (torch.tensor([0, 1], dtype=torch.int32), torch.randn([1, 64])), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 1 + sample_inputs = [ + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 1, 12, 64])), + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 64])), + (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 2, 12, 64])), + (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 64])), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 2 + sample_inputs = [ + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 1, 1, 64])), + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 64])), + 
(torch.tensor([0, 1], dtype=torch.int32), torch.randn([1, 1, 2, 64])), + (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 64])), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 3 + sample_inputs = [ + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), ), - }, - { - QCOM_MODULE: IndexPut(skip_mutable_buffer=True), # noqa: F405 - QCOM_SAMPLE_INPUTS: ( - torch.tensor([2], dtype=torch.int32), - torch.randn([1, 1, 12, 64]), + torch.randn([2, 12, 64]), + ), + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), ), - }, + torch.randn([1, 64]), + ), ] - for i, test in enumerate(test_comb): + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 4 + sample_inputs = [ + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([2, 64]), + ), + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([1, 64]), + ), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 5 + sample_inputs = [ + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([64]), + ), + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([1]), + ), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + + for i, test_combo in enumerate(total_test_combo): + for j, combo in enumerate(test_combo): + with self.subTest(f"mode_{i}-{j}"): + self.lower_module_and_test_output( + IndexPut(skip_mutable_buffer=combo[0], mode=i), # noqa: F405 + combo[1], + skip_mutable_buffer=combo[0], + ) + + def test_qnn_backend_index_put_suite(self): + accumulate = [False, True] + in_place = [False, True] + sample_inputs = [ + # 
basic + ( + torch.rand(5, 2) * 100, + (torch.tensor([0, 2]),), + torch.tensor([10.0, 20.0]), + ), + (torch.rand(5, 2), (torch.tensor([0, 2]),), torch.tensor([10.0, 20.0])), + # shape + (torch.rand(5), (torch.tensor([0, 2]),), torch.tensor([10.0, 20.0])), + ( + torch.rand(5, 2), + (torch.tensor([0, 2]), torch.tensor([1, 1])), + torch.tensor([10.0, 20.0]), + ), + ( + torch.rand(5, 3, 2), + (torch.tensor([0, 2]), torch.tensor([1, 1]), torch.tensor([0, 1])), + torch.tensor([10.0, 20.0]), + ), + # TODO: not supported by HTP + # ( + # torch.rand(5, 3, 2, 4), + # (torch.tensor([0, 2]), torch.tensor([1, 1]), torch.tensor([0, 1]), torch.tensor([2, 3])), + # torch.tensor([10.0]), + # ), + # indices + (torch.rand(5, 2), (torch.tensor([2]),), torch.tensor([10.0])), + ( + torch.rand(5, 3), + (torch.tensor([0, 2, 4]),), + torch.tensor([10.0, 20.0, 30.0]), + ), + ( + torch.rand(5), + (torch.tensor([1, 1, 3, 3]),), + torch.tensor([10.0, 20.0, 30.0, 40.0]), + ), + # broadcasting + (torch.rand(5, 3), (torch.tensor([0, 2, 4]),), torch.tensor([42.0])), + ( + torch.rand(3, 4), + (torch.tensor([0, 1]), torch.tensor([1, 2])), + torch.tensor([10.0, 20.0]), + ), + (torch.rand(4, 2), (torch.tensor([0, 2]),), torch.tensor([5.0, 15.0])), + ( + torch.rand(3, 2, 2), + (torch.tensor([0, 1]),), + torch.tensor([[1.0, 2.0], [3.0, 4.0]]), + ), + (torch.rand(4, 2), (torch.tensor([1, 1, 1]),), torch.tensor([5.0])), + # two-index + ( + torch.rand(4, 3), + (torch.tensor([0, 1, 2]), torch.tensor([1, 0, 2])), + torch.tensor([10.0, 20.0, 30.0]), + ), + ( + torch.rand(3, 3), + (torch.tensor([0, 2]), torch.tensor([1, 1])), + torch.tensor([15.0, 25.0]), + ), + ( + torch.rand(3, 2), + (torch.tensor([1, 1, 2]), torch.tensor([0, 0, 1])), + torch.tensor([5.0, 10.0, 15.0]), + ), + ( + torch.rand(3, 2), + (torch.tensor([1]), torch.tensor([0, 0, 1])), + torch.tensor([5.0, 10.0, 15.0]), + ), + ] + test_combo = list(itertools.product(accumulate, in_place, sample_inputs)) + for i, combo in enumerate(test_combo): with 
self.subTest(i=i): self.lower_module_and_test_output( - test[QCOM_MODULE], - test[QCOM_SAMPLE_INPUTS], - skip_mutable_buffer=test[QCOM_MODULE].skip_mutable_buffer, + IndexPutSuite(accumulate=combo[0], in_place=combo[1]), # noqa: F405 + combo[2], ) def test_qnn_backend_index_select(self): @@ -860,8 +1151,8 @@ def test_qnn_backend_leaky_relu(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_less_equal(self): test_comb = [ @@ -956,12 +1247,61 @@ def test_qnn_backend_max_pool2d(self): sample_input = (torch.randn(4, 3, 24, 24),) self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_mean_dim(self): - modules = [MeanWKeppDim(), MeanWOKeppDim()] # noqa: F405 - sample_input = (torch.randn([2, 5, 1, 3]),) - for i, module in enumerate(modules): + def test_qnn_backend_mean(self): + test_comb = [ + # Reduce over last two dims, keepdim=True + { + QCOM_MODULE: Mean(dim=(-1, -2), keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),), + }, + # Reduce over last two dims, keepdim=False + { + QCOM_MODULE: Mean(dim=(-1, -2), keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),), + }, + # Default: reduce all dims + { + QCOM_MODULE: Mean(), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(10, 10),), + }, + # TODO: To be enabled via reshape input to 1d tensor + # # Scalar case + # { + # QCOM_MODULE: Mean(), + # QCOM_SAMPLE_INPUTS: (torch.tensor(5.0),), + # }, + # Edge case: dim is a empty list + { + QCOM_MODULE: Mean(dim=[]), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),), + }, + # Edge case: reduce along dim=0 (batch dimension) + { + QCOM_MODULE: Mean(dim=0), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),), + }, + # Edge case: reduce along dim=0 with keepdim=True + { + QCOM_MODULE: 
Mean(dim=0, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),), + }, + # Edge case: reduce along multiple dims + { + QCOM_MODULE: Mean(dim=(0, 2)), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(3, 4, 5),), + }, + # Edge case: high-dimensional tensor + { + QCOM_MODULE: Mean(dim=(1, 3), keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4, 5, 6),), + }, + ] + + for i, test in enumerate(test_comb): with self.subTest(i=i): - self.lower_module_and_test_output(module, sample_input) + self.lower_module_and_test_output( + test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS] + ) @unittest.skip("failed to lower in QNN 2.26") def test_qnn_backend_mha(self): @@ -1006,6 +1346,16 @@ def test_qnn_backend_pad(self): sample_input = (torch.randn([1, 8, 128]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_permute(self): + modules = [ + Permute([0, 2, 3, 1]), # noqa: F405 + Permute([-1, -3, -2, -4]), # noqa: F405 + ] + sample_input = (torch.randn([2, 3, 4, 5]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_pixel_shuffle(self): module = PixelShuffle(2) # noqa: F405 sample_input = (torch.ones([2, 4, 3, 3]),) @@ -1017,9 +1367,28 @@ def test_qnn_backend_pixel_unshuffle(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_pow_tensor_scalar(self): - module = PowTensorScalar() # noqa: F405 - sample_input = (torch.rand([2, 4, 3, 3]),) - self.lower_module_and_test_output(module, sample_input) + test_comb = [ + { + QCOM_MODULE: [ + PowTensorScalar(), # noqa: F405 + PowTensorScalar(1), # noqa: F405 + PowTensorScalar(-1), # noqa: F405 + PowTensorScalar(0.5), # noqa: F405 + ], # noqa: F405 + QCOM_SAMPLE_INPUTS: [(torch.rand(10, 10) + 0.1,)], + }, + { + QCOM_MODULE: [PowTensorScalar(10)], # noqa: F405 + QCOM_SAMPLE_INPUTS: [(torch.rand(10, 10) * 0.5 + 0.5,)], + }, + ] + index = 0 + for comb in 
test_comb: + for module in comb[QCOM_MODULE]: + for sample_input in comb[QCOM_SAMPLE_INPUTS]: + with self.subTest(i=index): + index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_prelu(self): test_comb = [ @@ -1038,8 +1407,8 @@ def test_qnn_backend_prelu(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_relu(self): module = Relu() # noqa: F405 @@ -1154,10 +1523,8 @@ def test_qnn_backend_slice_scatter(self): ], QCOM_SAMPLE_INPUTS: [ ( - ( - torch.zeros(8, 8), - torch.ones(8, 2), - ) + torch.zeros(8, 8), + torch.ones(8, 2), ) ], }, @@ -1168,8 +1535,8 @@ def test_qnn_backend_slice_scatter(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_stack(self): module = Stack() # noqa: F405 @@ -1202,11 +1569,32 @@ def test_qnn_backend_sum_int_list(self): sample_input = (torch.randn([1, 4, 8, 8]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_swapaxes(self): + module = SwapAxes(0, 1) # noqa: F405 + sample_input = (torch.randn([1, 2, 3, 4]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_tanh(self): module = Tanh() # noqa: F405 sample_input = (torch.randn(2, 5, 1, 3),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_threshold(self): + modules = [ + Threshold(), # noqa: F405 + Threshold(threshold=0.5, value=3.0, inplace=True), # noqa: F405 + Threshold(threshold=0.5, value=3.0, inplace=False), # noqa: F405 + ] + sample_input = (torch.randn(2, 5, 1, 3),) + for i, module in enumerate(modules): + with self.subTest(i=i): + 
self.lower_module_and_test_output(module, sample_input) + + def test_qnn_backend_unflatten(self): + module = Unflatten(dim=1, sizes=(2, 3, 4)) # noqa: F405 + sample_input = (torch.randn([1, 24]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_unbind(self): module = Unbind() # noqa: F405 sample_input = (torch.randn([3, 3]),) @@ -1638,16 +2026,66 @@ def test_qnn_backend_arange(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_argmax(self): - module = Argmax() # noqa: F405 - sample_input = (torch.randn(16, 3, 4, 4),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + test_cases = [ + { + QCOM_MODULE: Argmax(), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmax(dim=0, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmax(dim=1, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(8, 5),), + }, + { + QCOM_MODULE: Argmax(dim=None, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.tensor([5.0]),), + }, + { + QCOM_MODULE: Argmax(dim=2, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4),), + }, + ] + + for i, case in enumerate(test_cases): + with self.subTest(i=i): + module = self.get_qdq_module( + case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS] + ) + self.lower_module_and_test_output(module, case[QCOM_SAMPLE_INPUTS]) def test_qnn_backend_argmin(self): - module = Argmin() # noqa: F405 - sample_input = (torch.randn(16, 3, 4, 4),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + test_cases = [ + { + QCOM_MODULE: Argmin(), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmin(dim=0, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmin(dim=1, 
keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(8, 5),), + }, + { + QCOM_MODULE: Argmin(dim=None, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.tensor([5.0]),), + }, + { + QCOM_MODULE: Argmin(dim=2, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4),), + }, + ] + + for i, case in enumerate(test_cases): + with self.subTest(i=i): + module = self.get_qdq_module( + case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS] + ) + self.lower_module_and_test_output(module, case[QCOM_SAMPLE_INPUTS]) def test_qnn_backend_asin(self): module = Asin() # noqa: F405 @@ -1699,7 +2137,7 @@ def test_qnn_backend_cast(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_cat(self): - modules = [Cat2(), Cat3(), Cat4()] # noqa: F405 + modules = [Cat2(), Cat3(), Cat4(), Cat5()] # noqa: F405 sample_input = (torch.randn(1, 1, 2, 2), torch.randn(1, 1, 4, 2)) for i, module in enumerate(modules): with self.subTest(i=i): @@ -1789,6 +2227,14 @@ def test_qnn_backend_conv2d_channel_last(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv3d_sequential(self): + modules = [Conv3dSequential(), Conv3dSequential(bias=False)] # noqa: F405 + sample_input = (torch.randn([2, 1, 10, 32, 32]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + qdq_module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(qdq_module, sample_input) + def test_qnn_backend_conv_transpose1d(self): modules = [ ConvTranspose1dSingle(), # noqa: F405 @@ -1814,6 +2260,19 @@ def test_qnn_backend_conv_transpose2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv_transpose3d(self): + modules = [ + ConvTranspose3dSingle(), # noqa: F405 + ConvTranspose3dSingle(bias=False), # noqa: F405 + ConvTranspose3dSingle(dilation=2), # noqa: F405 + 
ConvTranspose3dSingle(dilation=(3, 2, 3)), # noqa: F405 + ] + sample_input = (torch.randn([1, 1, 3, 3, 3]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_cos(self): module = Cos() # noqa: F405 sample_input = (torch.randn(2, 5, 1, 3),) @@ -1863,6 +2322,24 @@ def test_qnn_backend_element_wise_add(self): QCOM_MODULE: [AddConstantFloat(), AddConstantLong()], # noqa: F405 QCOM_SAMPLE_INPUTS: [(torch.randn(2, 5, 1, 3),)], }, + { + QCOM_MODULE: [ + AddAlpha(alpha=2), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [ + ( + torch.tensor([[1.2, 1.3, 1.4]]), + torch.tensor([[0.8, 1.6, 0.2]]), + ) + ], + }, + { + QCOM_MODULE: [ + AddAlphaConstant(alpha=2, constant_first=True), # noqa: F405 + AddAlphaConstant(alpha=2, constant_first=False), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [(torch.tensor([[1.2, 1.3, 1.4]]),)], + }, ] index = 0 @@ -1870,9 +2347,9 @@ def test_qnn_backend_element_wise_add(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 gm = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(gm, sample_input) - index += 1 def test_qnn_backend_element_wise_and(self): module = And(torch.tensor(1.7), torch.tensor(0.2)) # noqa: F405 @@ -1911,9 +2388,9 @@ def test_qnn_backend_element_wise_div(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 gm = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(gm, sample_input) - index += 1 def test_qnn_backend_element_wise_mul(self): test_comb = [ @@ -1939,9 +2416,9 @@ def test_qnn_backend_element_wise_mul(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 gm = self.get_qdq_module(module, sample_input) 
self.lower_module_and_test_output(gm, sample_input) - index += 1 def test_qnn_backend_element_wise_or(self): test_comb = [ @@ -1992,6 +2469,24 @@ def test_qnn_backend_element_wise_sub(self): QCOM_MODULE: [SubConstantFloat(), SubConstantLong()], # noqa: F405 QCOM_SAMPLE_INPUTS: [(torch.randn(2, 5, 1, 3),)], }, + { + QCOM_MODULE: [ + SubAlpha(alpha=2), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [ + ( + torch.tensor([[1.2, 1.3, 1.4]]), + torch.tensor([[0.8, 1.6, 0.2]]), + ) + ], + }, + { + QCOM_MODULE: [ + SubAlphaConstant(alpha=2, constant_first=True), # noqa: F405 + SubAlphaConstant(alpha=2, constant_first=False), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [(torch.tensor([[1.2, 1.3, 1.4]]),)], + }, ] index = 0 @@ -1999,9 +2494,9 @@ def test_qnn_backend_element_wise_sub(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 gm = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(gm, sample_input) - index += 1 def test_qnn_backend_elu(self): module = Elu() # noqa: F405 @@ -2050,11 +2545,11 @@ def test_qnn_backend_expand(self): for module in modules: for sample_input in sample_inputs: with self.subTest(i=index): + index += 1 module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output( module, sample_input, passes_job=passes_job ) - index += 1 def test_qnn_backend_expm1(self): sample_input = (torch.randn(3, 4, 5),) @@ -2080,6 +2575,21 @@ def test_qnn_backend_floor_divide(self): { QCOM_MODULE: [FloorDiv()], # noqa: F405 QCOM_SAMPLE_INPUTS: [ + (torch.randint(-100, 100, (10, 10)), torch.full((10, 10), 3)), + ( + torch.randint(-100, 100, (10, 10)).float(), + torch.full((10, 10), 2.5), + ), + (torch.randint(-1000, 1000, (10, 10)), torch.full((10, 10), 100)), + (torch.tensor([10]), torch.arange(1, 5)), + (torch.arange(-10, 10), torch.tensor([2])), + (torch.randint(-100, 100, (20,)), torch.full((20,), 2)), + (torch.randint(-100, 100, (5, 10)), torch.full((5, 
10), 2)), + (torch.randint(-100, 100, (3, 4, 5)), torch.full((3, 4, 5), 2)), + ( + torch.randint(-100, 100, (2, 3, 4, 5)), + torch.full((2, 3, 4, 5), 2), + ), (torch.randn(2, 5, 1, 3), eps + torch.randn(2, 5, 1, 3)), (torch.randn([2, 5, 1, 3]), eps + torch.randn([4, 1])), ], @@ -2095,9 +2605,12 @@ def test_qnn_backend_floor_divide(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - gm = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(gm, sample_input) index += 1 + # Support int input cases with bypass_check=True + gm = self.get_qdq_module( + module, sample_input, bypass_check=True + ) + self.lower_module_and_test_output(gm, sample_input) def test_qnn_backend_fold(self): sample_input = (torch.randn(3, 512, 256),) @@ -2146,6 +2659,14 @@ def test_qnn_backend_gelu(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_glu(self): + modules = [torch.nn.GLU(), torch.nn.GLU(dim=0)] + sample_input = (torch.randn(2, 5, 1, 4),) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_greater_equal(self): test_comb = [ { @@ -2285,32 +2806,197 @@ def test_qnn_backend_index_copy(self): ) def test_qnn_backend_index_put(self): - test_comb = [ - { - QCOM_MODULE: IndexPut(skip_mutable_buffer=False), # noqa: F405 - QCOM_SAMPLE_INPUTS: ( - torch.tensor([2], dtype=torch.int32), - torch.randn([1, 1, 12, 64]), + skip_mutable_buffer = [False, True] + total_test_combo = [] + # mode 0 + sample_inputs = [ + (torch.tensor([0], dtype=torch.int32), torch.randn([1, 1, 12, 64])), + (torch.tensor([0], dtype=torch.int32), torch.randn([1, 64])), + (torch.tensor([0, 1], dtype=torch.int32), torch.randn([2, 1, 12, 64])), + (torch.tensor([0, 1], dtype=torch.int32), torch.randn([1, 64])), + ] 
+ total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 1 + sample_inputs = [ + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 1, 12, 64])), + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 64])), + (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 2, 12, 64])), + (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 64])), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 2 + sample_inputs = [ + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 1, 1, 64])), + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 64])), + (torch.tensor([0, 1], dtype=torch.int32), torch.randn([1, 1, 2, 64])), + (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 64])), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 3 + sample_inputs = [ + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), ), - }, - { - QCOM_MODULE: IndexPut(skip_mutable_buffer=True), # noqa: F405 - QCOM_SAMPLE_INPUTS: ( - torch.tensor([2], dtype=torch.int32), - torch.randn([1, 1, 12, 64]), + torch.randn([2, 12, 64]), + ), + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), ), - }, + torch.randn([1, 64]), + ), ] - for i, test in enumerate(test_comb): + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 4 + sample_inputs = [ + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([2, 64]), + ), + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([1, 64]), + ), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 5 + sample_inputs = [ + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], 
dtype=torch.int32), + ), + torch.randn([64]), + ), + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([1]), + ), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + + for i, test_combo in enumerate(total_test_combo): + for j, combo in enumerate(test_combo): + with self.subTest(f"mode_{i}-{j}"): + module = self.get_qdq_module( + IndexPut(skip_mutable_buffer=combo[0], mode=i), # noqa: F405 + combo[1], + ) + self.lower_module_and_test_output( + module, + combo[1], + skip_mutable_buffer=combo[0], + ) + + def test_qnn_backend_index_put_suite(self): + accumulate = [False, True] + in_place = [False, True] + sample_inputs = [ + # basic + ( + torch.rand(5, 2) * 100, + (torch.tensor([0, 2]),), + torch.tensor([10.0, 20.0]), + ), + (torch.rand(5, 2), (torch.tensor([0, 2]),), torch.tensor([10.0, 20.0])), + # shape + (torch.rand(5), (torch.tensor([0, 2]),), torch.tensor([10.0, 20.0])), + ( + torch.rand(5, 2), + (torch.tensor([0, 2]), torch.tensor([1, 1])), + torch.tensor([10.0, 20.0]), + ), + ( + torch.rand(5, 3, 2), + (torch.tensor([0, 2]), torch.tensor([1, 1]), torch.tensor([0, 1])), + torch.tensor([10.0, 20.0]), + ), + # TODO: not supported by HTP + # ( + # torch.rand(5, 3, 2, 4), + # (torch.tensor([0, 2]), torch.tensor([1, 1]), torch.tensor([0, 1]), torch.tensor([2, 3])), + # torch.tensor([10.0]), + # ), + # indices + (torch.rand(5, 2), (torch.tensor([2]),), torch.tensor([10.0])), + ( + torch.rand(5, 3), + (torch.tensor([0, 2, 4]),), + torch.tensor([10.0, 20.0, 30.0]), + ), + ( + torch.rand(5), + (torch.tensor([1, 1, 3, 3]),), + torch.tensor([10.0, 20.0, 30.0, 40.0]), + ), + # broadcasting + (torch.rand(5, 3), (torch.tensor([0, 2, 4]),), torch.tensor([42.0])), + ( + torch.rand(3, 4), + (torch.tensor([0, 1]), torch.tensor([1, 2])), + torch.tensor([10.0, 20.0]), + ), + (torch.rand(4, 2), (torch.tensor([0, 2]),), torch.tensor([5.0, 15.0])), + ( + torch.rand(3, 2, 2), + 
(torch.tensor([0, 1]),), + torch.tensor([[1.0, 2.0], [3.0, 4.0]]), + ), + (torch.rand(4, 2), (torch.tensor([1, 1, 1]),), torch.tensor([5.0])), + # two-index + ( + torch.rand(4, 3), + (torch.tensor([0, 1, 2]), torch.tensor([1, 0, 2])), + torch.tensor([10.0, 20.0, 30.0]), + ), + ( + torch.rand(3, 3), + (torch.tensor([0, 2]), torch.tensor([1, 1])), + torch.tensor([15.0, 25.0]), + ), + ( + torch.rand(3, 2), + (torch.tensor([1, 1, 2]), torch.tensor([0, 0, 1])), + torch.tensor([5.0, 10.0, 15.0]), + ), + ( + torch.rand(3, 2), + (torch.tensor([1]), torch.tensor([0, 0, 1])), + torch.tensor([5.0, 10.0, 15.0]), + ), + ] + test_combo = list(itertools.product(accumulate, in_place, sample_inputs)) + for i, combo in enumerate(test_combo): with self.subTest(i=i): module = self.get_qdq_module( - test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS] - ) - self.lower_module_and_test_output( - module, - test[QCOM_SAMPLE_INPUTS], - skip_mutable_buffer=test[QCOM_MODULE].skip_mutable_buffer, + IndexPutSuite(accumulate=combo[0], in_place=combo[1]), # noqa: F405 + combo[2], ) + self.lower_module_and_test_output(module, combo[2]) def test_qnn_backend_index_select(self): module = IndexSelect(dim=1) # noqa: F405 @@ -2395,9 +3081,9 @@ def test_qnn_backend_leaky_relu(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - index += 1 def test_qnn_backend_less_equal(self): test_comb = [ @@ -2529,13 +3215,62 @@ def test_qnn_backend_max_pool2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_mean_dim(self): - modules = [MeanWKeppDim(), MeanWOKeppDim()] # noqa: F405 - sample_input = (torch.randn([2, 5, 1, 3]),) - for i, module in enumerate(modules): + def test_qnn_backend_mean(self): + test_comb = [ + # Reduce over last two dims, keepdim=True + { 
+ QCOM_MODULE: Mean(dim=(-1, -2), keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),), + }, + # Reduce over last two dims, keepdim=False + { + QCOM_MODULE: Mean(dim=(-1, -2), keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),), + }, + # Default: reduce all dims + { + QCOM_MODULE: Mean(), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(10, 10),), + }, + # TODO: To be enabled via reshape input to 1d tensor + # Scalar case + # { + # QCOM_MODULE: Mean(), + # QCOM_SAMPLE_INPUTS: (torch.tensor(5.0),), + # }, + # Edge case: dim is a empty list + { + QCOM_MODULE: Mean(dim=[]), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),), + }, + # Edge case: reduce along dim=0 (batch dimension) + { + QCOM_MODULE: Mean(dim=0), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),), + }, + # Edge case: reduce along dim=0 with keepdim=True + { + QCOM_MODULE: Mean(dim=0, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),), + }, + # Edge case: reduce along multiple dims + { + QCOM_MODULE: Mean(dim=(0, 2)), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(3, 4, 5),), + }, + # Edge case: high-dimensional tensor + { + QCOM_MODULE: Mean(dim=(1, 3), keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4, 5, 6),), + }, + ] + + for i, test in enumerate(test_comb): with self.subTest(i=i): - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + module = self.get_qdq_module( + test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS] + ) + self.lower_module_and_test_output(module, test[QCOM_SAMPLE_INPUTS]) def test_qnn_backend_mha(self): module = MultiheadAttention() # noqa: F405 @@ -2585,6 +3320,17 @@ def test_qnn_backend_pad(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_permute(self): + modules = [ + Permute([0, 2, 3, 1]), # noqa: F405 + Permute([-1, -3, -2, 
-4]), # noqa: F405 + ] + sample_input = (torch.randn([2, 3, 4, 5]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_pixel_shuffle(self): module = PixelShuffle(2) # noqa: F405 sample_input = (torch.ones([2, 4, 3, 3]),) @@ -2598,10 +3344,29 @@ def test_qnn_backend_pixel_unshuffle(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_pow_tensor_scalar(self): - module = PowTensorScalar() # noqa: F405 - sample_input = (torch.rand([2, 4, 3, 3]),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + test_comb = [ + { + QCOM_MODULE: [ + PowTensorScalar(), # noqa: F405 + PowTensorScalar(1), # noqa: F405 + PowTensorScalar(-1), # noqa: F405 + PowTensorScalar(0.5), # noqa: F405 + ], # noqa: F405 + QCOM_SAMPLE_INPUTS: [(torch.rand(10, 10) + 0.1,)], + }, + { + QCOM_MODULE: [PowTensorScalar(10)], # noqa: F405 + QCOM_SAMPLE_INPUTS: [(torch.rand(10, 10) * 0.5 + 0.5,)], + }, + ] + index = 0 + for comb in test_comb: + for module in comb[QCOM_MODULE]: + for sample_input in comb[QCOM_SAMPLE_INPUTS]: + with self.subTest(i=index): + index += 1 + qdq_module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(qdq_module, sample_input) def test_qnn_backend_prelu(self): test_comb = [ @@ -2620,9 +3385,9 @@ def test_qnn_backend_prelu(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - index += 1 def test_qnn_backend_relu(self): module = Relu() # noqa: F405 @@ -2760,10 +3525,8 @@ def test_qnn_backend_slice_scatter(self): ], QCOM_SAMPLE_INPUTS: [ ( - ( - torch.zeros(8, 8), - torch.ones(8, 2), - ) + torch.zeros(8, 8), + torch.ones(8, 2), ) ], }, @@ 
-2774,9 +3537,9 @@ def test_qnn_backend_slice_scatter(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - index += 1 def test_qnn_backend_softmax(self): modules = [Softmax(dim=1), Softmax(dim=-1)] # noqa: F405 @@ -2814,12 +3577,36 @@ def test_qnn_backend_sum_int_list(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_swapaxes(self): + module = SwapAxes(0, 1) # noqa: F405 + sample_input = (torch.randn([1, 2, 3, 4]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_tanh(self): module = Tanh() # noqa: F405 sample_input = (torch.randn(2, 5, 1, 3),) module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_threshold(self): + modules = [ + Threshold(), # noqa: F405 + Threshold(threshold=0.5, value=3.0, inplace=True), # noqa: F405 + Threshold(threshold=0.5, value=3.0, inplace=False), # noqa: F405 + ] + sample_input = (torch.randn(2, 5, 1, 3),) + for i, module in enumerate(modules): + with self.subTest(i=i): + qdq_module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(qdq_module, sample_input) + + def test_qnn_backend_unflatten(self): + module = Unflatten(dim=1, sizes=(2, 3, 4)) # noqa: F405 + sample_input = (torch.randn([1, 24]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_unbind(self): module = Unbind() # noqa: F405 sample_input = (torch.randn([3, 3]),) @@ -2943,6 +3730,51 @@ def test_qnn_backend_chunk_add(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def 
test_qnn_backend_conformer(self): + from typing import Tuple + + import torchaudio + + class PatchedConformer(torch.nn.Module): + """ + A lightly modified version of the top-level Conformer module, such that it can be exported. + Instead of taking lengths and computing the padding mask, it takes the padding mask directly. + See https://github.com/pytorch/audio/blob/main/src/torchaudio/models/conformer.py#L215 + """ + + def __init__(self, conformer): + super().__init__() + self.conformer = conformer + + def forward( + self, input: torch.Tensor, encoder_padding_mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = input.transpose(0, 1) + for layer in self.conformer.conformer_layers: + x = layer(x, encoder_padding_mask) + return x.transpose(0, 1) + + inner_model = torchaudio.models.Conformer( + input_dim=80, + num_heads=4, + ffn_dim=128, + num_layers=4, + depthwise_conv_kernel_size=31, + ) + lengths = torch.randint(1, 400, (10,)) + encoder_padding_mask = torchaudio.models.conformer._lengths_to_padding_mask( + lengths + ) + sample_input = ( + torch.rand(10, int(lengths.max()), 80), + encoder_padding_mask.to(torch.float32), + ) + module = PatchedConformer(inner_model).eval() + module = self.get_qdq_module( + module, sample_input, quant_dtype=QuantDtype.use_16a8w + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv1d_relu_log_softmax(self): modules = [ Conv1dReluLogSoftmax(dim=1), # noqa: F405 @@ -4680,6 +5512,65 @@ def test_qnn_backend_seq_mse(self): class TestExampleLLMScript(TestQNN): + def test_static_gemma_2b(self): + if not self.required_envs(): + self.skipTest("missing required envs") + + prompt = "My favourite condiment is " + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + 
"--decoder_model", + "gemma-2b", + "--model_mode", + "kv", + "--max_seq_len", + "1024", + "--eval_perplexity", + "--tasks", + "wikitext", + "--limit", + "1", + ] + if self.compile_only: + cmds.extend(["--compile_only"]) + elif self.device: + cmds.extend(["--device", self.device]) + if self.host: + cmds.extend(["--host", self.host]) + elif self.enable_x86_64: + cmds.extend(["--enable_x86_64"]) + if self.pre_gen_pte: + cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + inference_speed_ref = {"SM8650": 32, "SM8750": 36} + self.assertLessEqual(msg["wiki_ppl"], 35) + self.assertLessEqual(msg["pte_size"], 2_700_000_000) # 2.7GB + if self.model in inference_speed_ref: + self.assertGreaterEqual( + msg["inference_speed"], inference_speed_ref[self.model] + ) + def test_static_gemma3_1b(self): if not self.required_envs(): self.skipTest("missing required envs") @@ -5438,6 +6329,43 @@ def test_conv_former(self): self.assertGreaterEqual(msg["top_1"], 70) self.assertGreaterEqual(msg["top_5"], 92) + def test_convnext_small(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/convnext_small.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + "--seed", + str(1126), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) 
+ else: + self.assertGreaterEqual(msg["top_1"], 76) + self.assertGreaterEqual(msg["top_5"], 97) + def test_cvt(self): if not self.required_envs([self.image_dataset]): self.skipTest("missing required envs") @@ -5936,6 +6864,43 @@ def test_gMLP(self): self.assertGreaterEqual(msg["top_1"], 70) self.assertGreaterEqual(msg["top_5"], 88) + def test_maxvit_t(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/maxvit_t.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + "--seed", + str(1126), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 72) + self.assertGreaterEqual(msg["top_5"], 91) + @unittest.skip("Only outputs good accuracy in QNN 2.29") def test_mobilevit_v2(self): if not self.required_envs([self.image_dataset]): @@ -6282,6 +7247,43 @@ def test_swin_transformer(self): self.assertGreaterEqual(msg["top_1"], 71) self.assertGreaterEqual(msg["top_5"], 90) + def test_swin_v2_t(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/swin_v2_t.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + "--seed", + str(1126), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = 
subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 63) + self.assertGreaterEqual(msg["top_5"], 92) + def test_t5(self): if not self.required_envs([self.qa_dataset]): self.skipTest("missing required envs") @@ -6318,6 +7320,43 @@ def test_t5(self): else: self.assertGreaterEqual(msg["f1"], 0.72) + def test_vit_b_16(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/vit_b_16.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + "--seed", + str(1126), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 72) + self.assertGreaterEqual(msg["top_5"], 96) + def test_whisper(self): if not self.required_envs(): self.skipTest("missing required envs") diff --git a/backends/samsung/CMakeLists.txt b/backends/samsung/CMakeLists.txt index fff3ece5239..6ea020c0970 100644 --- a/backends/samsung/CMakeLists.txt +++ b/backends/samsung/CMakeLists.txt @@ -161,7 +161,7 @@ if(${ANDROID}) install( TARGETS enn_backend enn_logging EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endif() diff --git a/backends/samsung/_passes/annotate_qparams.py b/backends/samsung/_passes/annotate_qparams.py new file mode 100644 index 00000000000..663d1fdf5fa --- /dev/null +++ 
b/backends/samsung/_passes/annotate_qparams.py @@ -0,0 +1,201 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import operator +from typing import Any, Dict, List, Optional + +import torch +from executorch.backends.samsung.utils.constants import QuantConstants +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch._export.utils import get_buffer +from torch.export import ExportedProgram +from torch.fx import GraphModule, Node + + +class AnnotateQparamsPass(ExportPass): + """This parse is to add quantize properties to node need to be quantized. + + Annotate Quant params: + For src_node->Q->DQ->..., we will add the quant params from Q->DQ node + to the src_node + + Annotate Requantize: + For src_node->Q->DQ->Q->DQ->..., if the multiple Q->DQ contains + different quant params, we will mark the src_node as need requantize, + and add Q->DQ after removing all the Q->DQs. + """ + + propagate_nodes = { + exir_ops.edge.aten.view_copy.default, + exir_ops.edge.aten.permute_copy.default, + exir_ops.edge.aten.squeeze_copy.default, + exir_ops.edge.aten.squeeze_copy.dim, + exir_ops.edge.aten.squeeze_copy.dims, + exir_ops.edge.aten.slice_copy.Tensor, + exir_ops.edge.aten.unsqueeze_copy.default, + exir_ops.edge.aten.concat.default, + exir_ops.edge.aten.cat.default, + exir_ops.edge.aten.expand_copy.default, + } + + def __init__(self, edge_program: ExportedProgram): + super().__init__() + self.edge_program = edge_program + + def _get_last_dqs(self, node: Node) -> List[Node]: + r"""From one Q-DQ node, find the last DQs in the quantization node chain. + + + need to consider such case: + /--Q-DQ-node1 + node->Q->DQ--node-node2 + \--Q-DQ-node3 + This is a dfs implemention, so result will keep sorted + Args: + node (Node): Search DQ from this node. 
+ + Returns: + List[Node]: list of DQ node by original sequence + """ + + def _impl(node: Node, res_list: List[Node]): + if ( + node.target not in QuantConstants.QUANT_OPS_KEY_MAP + and node.target not in QuantConstants.DEQUANT_OPS_KEY_MAP + ): + return + for user in node.users.keys(): + if ( + user.target not in QuantConstants.QUANT_OPS_KEY_MAP + and user.target not in QuantConstants.DEQUANT_OPS_KEY_MAP + ): + res_list.append(node) + else: + _impl(user, res_list) + + res_list: List[Node] = [] + for user in node.users: + _impl(user, res_list) + return res_list + + def _propagate_quant_params(self, node: Node): + assert ( + quantize_attrs := node.meta.get("quantize_attrs") + ), "Must be annotated node." + requantize_map: Dict[Node, Node] = node.meta.get("requantize", {}) + while node.users: + if len(node.users) != 1: + break + user = list(node.users.keys())[0] + if ( + user.target not in QuantConstants.QUANT_OPS_KEY_MAP + and user.target not in QuantConstants.DEQUANT_OPS_KEY_MAP + ): + break + node = user + # Case1: ...-q-dq(cur)-propagate_node-node(not d-dq) + # Case2: propagate_node(propagateed)-propagate_node-node(not q-dq) + for idx, user in enumerate(node.users.keys()): + # For the branch who need to be requantized, we propagate the requantize params + user_attrs = requantize_map.get(idx, quantize_attrs) + if user.target not in self.propagate_nodes: + continue + if len(user.users) == 1: + # Possibily no need for checking len(users)>1 + user_of_user = list(user.users)[0] + # node-q-dq-propagate-q-dq not need for propagatey + if ( + user_of_user.target in QuantConstants.QUANT_OPS_KEY_MAP + or user_of_user.target in QuantConstants.DEQUANT_OPS_KEY_MAP + ): + continue + # propagate quant for node-q-dq-propagate_node-node(not qdq) + user.meta["quantize_attrs"] = user_attrs + self._propagate_quant_params(user) + + def _annotate_requantize(self, node: Node): + assert ( + ori_quant_attrs := node.meta.get("quantize_attrs") + ), "No quant parameters found" + 
list_for_requantize = self._get_last_dqs(node) + node.meta["requantize"] = node.meta.get("requantize", {}) + + # We use index to mark the output to be requantized + # Because user obj and name may change when we requantize them. + + def _check_same(requant_obj, ori_obj) -> bool: + if type(requant_obj) != type(ori_obj): # noqa E721 + # We need actually same type here. + return False + if not isinstance(requant_obj, torch.Tensor): + return requant_obj == ori_obj + if requant_obj.shape != ori_obj.shape: + return False + return bool((requant_obj == ori_obj).all()) + + requantize_map: Dict[int, Dict] = node.meta["requantize"] + for idx, dq in enumerate(list_for_requantize): + q = dq.all_input_nodes[0] + if q.target not in QuantConstants.QUANT_OPS_KEY_MAP: + continue + key_map = QuantConstants.DEQUANT_OPS_KEY_MAP[dq.target] + requantize_attrs = self.get_quant_attrs(q, key_map) + if not all( + _check_same(ori_quant_attrs[key], requantize_attrs[key]) + for key in key_map.values() + ): + requantize_map[idx] = requantize_attrs + + def _annotate(self, graph_module: GraphModule): + for node in graph_module.graph.nodes: + key_map = QuantConstants.QUANT_OPS_KEY_MAP.get(node.target, None) + if not key_map: + continue + source_node = node.args[0] + if source_node.target in ( + *QuantConstants.QUANT_OPS_KEY_MAP, + *QuantConstants.DEQUANT_OPS_KEY_MAP, + ): + # Currently, don't add quant info for d_qd node here. 
+ continue + elif source_node.target == operator.getitem: + source_node = source_node.args[0] + quant_attrs = self.get_quant_attrs(node, key_map) + source_node.meta["quantize_attrs"] = quant_attrs + self._annotate_requantize(source_node) + self._propagate_quant_params(source_node) + + def call(self, graph_module: GraphModule): + self._annotate(graph_module) + graph_module.recompile() + return PassResult(graph_module, True) + + def get_quant_attrs( + self, quant_node: torch.fx.Node, key_map: Optional[Dict] = None + ) -> Dict[str, Any]: + quant_attr_keys = [arg.name for arg in quant_node.target._schema.arguments] + quant_attrs = dict.fromkeys(quant_attr_keys) + for key, attr in zip(quant_attr_keys[1:], quant_node.args[1:]): + # For channel-wise quantization, params are stored by buffer nodes. + if isinstance(attr, torch.fx.Node): + attr = get_buffer(self.edge_program, attr) + quant_attrs[key] = attr + quant_attrs["target"] = quant_node.target + if key_map is None: + return quant_attrs + miss_attrs = [] + for aten_attr, snc_attr in key_map.items(): + if aten_attr not in quant_attrs: + miss_attrs.append(aten_attr) + continue + attr = quant_attrs[aten_attr] + quant_attrs.pop(aten_attr) + quant_attrs[snc_attr] = attr + assert ( + not miss_attrs + ), f"Miss quant attrs {miss_attrs} for node {quant_node.name}" + return quant_attrs diff --git a/backends/samsung/_passes/annotate_scalar_parameters.py b/backends/samsung/_passes/annotate_scalar_parameters.py new file mode 100644 index 00000000000..643685bdb25 --- /dev/null +++ b/backends/samsung/_passes/annotate_scalar_parameters.py @@ -0,0 +1,65 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +from executorch.backends.samsung.quantizer.quantizer import global_quant_info +from executorch.backends.samsung.utils.constants import QuantConstants +from executorch.backends.transforms.utils import get_param_tensor, is_param_node +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.export import ExportedProgram + + +class AnnotateScalarParametersPass(ExportPass): + """ + Need to add quantization parameters for scalars for some ops + Ifm(Quantized)------TargetOP--- + Scalar(Non-Quant)---/ + Notice: Such scalars are converted to tensor node by default pass + """ + + TARGET_OPS = { + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.div.Tensor, + } + + def __init__(self, edge_program: ExportedProgram): + super().__init__() + self.edge_program = edge_program + + def annotate(self, graph_module: torch.fx.GraphModule): + for node in graph_module.graph.nodes: + if node.target not in self.TARGET_OPS or "quantize_attrs" not in node.meta: + continue + torch_quant_dtype = global_quant_info.weight_precison.torch_dtype + for input_arg in node.all_input_nodes: + if input_arg.op not in ("placeholder", "get_attr") or not is_param_node( + self.edge_program, input_arg + ): + continue + else: + tensor = get_param_tensor(self.edge_program, input_arg) + if not tensor.shape: + qparams = { + QuantConstants.QUANT_KEY.scale: float(tensor), + QuantConstants.QUANT_KEY.quant_dtype: torch_quant_dtype, + QuantConstants.QUANT_KEY.quant_max: torch.iinfo( + torch_quant_dtype + ).max, + QuantConstants.QUANT_KEY.quant_min: torch.iinfo( + torch_quant_dtype + ).min, + QuantConstants.QUANT_KEY.zero_point: 0, + } + input_arg.meta["quantize_attrs"] = qparams + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + self.annotate(graph_module) + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff 
--git a/backends/samsung/_passes/conv1d_to_conv2d.py b/backends/samsung/_passes/conv1d_to_conv2d.py index 57f1074b348..1b8782d956b 100644 --- a/backends/samsung/_passes/conv1d_to_conv2d.py +++ b/backends/samsung/_passes/conv1d_to_conv2d.py @@ -5,84 +5,93 @@ # LICENSE file in the root directory of this source tree. import torch +from executorch.backends.transforms.utils import get_param_tensor from executorch.exir import ExportedProgram from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -from torch._export.utils import get_param class Conv1dToConv2d(ExportPass): - def __init__(self, edge_program: ExportedProgram): super().__init__() self.edge_program = edge_program + def update_kernel(self, weight_node: torch.Tensor): + # lifted tensor in tensor constant + weight_3d = get_param_tensor(self.edge_program, weight_node) + if param_name := self.edge_program.graph_signature.inputs_to_parameters.get( + weight_node.name + ): + new_weight_param = torch.nn.Parameter( + data=weight_3d.data.contiguous().unsqueeze(dim=-1), requires_grad=False + ) + self.edge_program.state_dict[param_name] = new_weight_param + elif tensor_name := self.edge_program.graph_signature.inputs_to_lifted_tensor_constants.get( + weight_node.name + ): + self.edge_program.constants[tensor_name] = torch.unsqueeze(weight_3d, -1) + else: + RuntimeError("Weight of 1d conv should be constant tensor or Parameter obj") + weight_node.meta["val"] = weight_node.meta["val"].data.unsqueeze(dim=-1) + def call(self, graph_module: torch.fx.GraphModule): graph = graph_module.graph node_list = list(graph.nodes) for node in node_list: - if node.op == "call_function": - if node.target == exir_ops.edge.aten.convolution.default: - stride = list(node.args[3]) - if len(stride) != 1: - continue + if node.op != "call_function": + continue + if node.target != exir_ops.edge.aten.convolution.default: + continue + stride = list(node.args[3]) + if len(stride) != 1: + 
continue - # convert 3dim weight to 4dim - weight_node = node.args[1] - weight_3dim = get_param(self.edge_program, weight_node) - weight_4dim = torch.nn.Parameter( - data=weight_3dim.data.contiguous().unsqueeze(dim=-1), - requires_grad=False, - ) - parameter_name = ( - self.edge_program.graph_signature.inputs_to_parameters[ - weight_node.name - ] - ) - self.edge_program.state_dict[parameter_name] = weight_4dim - weight_node.meta["val"] = weight_node.meta["val"].data.unsqueeze( - dim=-1 - ) + # convert 3dim weight to 4dim + weight_node = node.args[1] + self.update_kernel(weight_node) - # Extend stride, padding, and dilation - node.args = ( - node.args[0], - node.args[1], - node.args[2], - node.args[3] + [1], # stride - node.args[4] + [0], # padding - node.args[5] + [1], # dilation - node.args[6], - node.args[7], - node.args[8], - ) + # Extend stride, padding, and dilation + node.args = ( + node.args[0], + node.args[1], + node.args[2], + node.args[3] + [1], # stride + node.args[4] + [0], # padding + node.args[5] + [1], # dilation + node.args[6], + node.args[7], + node.args[8], + ) + # unsqueeze -> conv2d -> squeeze - # unsqueeze -> conv2d -> squeeze - with graph.inserting_before(node): - input_node = node.args[0] - unsqueeze_before = graph.create_node( - "call_function", exir_ops.edge.aten.unsqueeze_copy.default - ) - unsqueeze_before.args = ( - input_node, - -1, - ) - node.replace_input_with(input_node, unsqueeze_before) + with graph.inserting_before(node): + input_node = node.args[0] + prev_qparams = input_node.meta.get("quantize_attrs") + unsqueeze_before = graph.create_node( + "call_function", exir_ops.edge.aten.unsqueeze_copy.default + ) + unsqueeze_before.args = ( + input_node, + -1, + ) + node.replace_input_with(input_node, unsqueeze_before) - with graph.inserting_after(node): - squeeze_after = graph.create_node( - "call_function", exir_ops.edge.aten.squeeze_copy.dims - ) - squeeze_after.args = ( - node, - [-1], - ) - original_users = [ - user for user in 
node.users if user != squeeze_after - ] - for user in original_users: - user.replace_input_with(node, squeeze_after) + with graph.inserting_after(node): + squeeze_after = graph.create_node( + "call_function", exir_ops.edge.aten.squeeze_copy.dims + ) + squeeze_after.args = ( + node, + [-1], + ) + original_users = [user for user in node.users if user != squeeze_after] + for user in original_users: + user.replace_input_with(node, squeeze_after) + if quant_attr := node.meta.get("quantize_attrs"): + squeeze_after.meta["quantize_attrs"] = quant_attr + if prev_qparams is not None: + unsqueeze_before.meta["quantize_attrs"] = prev_qparams graph_module.recompile() - graph_module = super().call(graph_module).graph_module + _ = super().call(graph_module).graph_module return PassResult(graph_module, True) diff --git a/backends/samsung/_passes/fold_qdq.py b/backends/samsung/_passes/fold_qdq.py new file mode 100644 index 00000000000..c6f3699ece7 --- /dev/null +++ b/backends/samsung/_passes/fold_qdq.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from executorch.backends.samsung.utils.constants import QuantConstants +from executorch.exir.pass_base import ExportPass, PassResult +from executorch.exir.passes import dead_code_elimination_pass +from torch.fx import GraphModule + + +class FoldQDQPass(ExportPass): + def __init__(self): + super().__init__() + + def _fold( + self, + graph_module: GraphModule, + ): + for node in graph_module.graph.nodes: + if node.target not in ( + *QuantConstants.QUANT_OPS_KEY_MAP.keys(), + *QuantConstants.DEQUANT_OPS_KEY_MAP.keys(), + ): + continue + for user in [user for user in node.users.keys()]: # noqa: C416 + user.replace_input_with(node, node.args[0]) + graph_module.graph.erase_node(node) + + def call(self, graph_module: GraphModule): + self._fold(graph_module) + graph_module.recompile() + dead_code_elimination_pass(graph_module) + _ = super().call(graph_module).graph_module + return PassResult(graph_module, True) diff --git a/backends/samsung/_passes/fuse_conv_act.py b/backends/samsung/_passes/fuse_conv_act.py new file mode 100644 index 00000000000..c034c98bb14 --- /dev/null +++ b/backends/samsung/_passes/fuse_conv_act.py @@ -0,0 +1,77 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Optional + +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from executorch.exir.passes import dead_code_elimination_pass +from torch.fx import GraphModule + + +def map_hardtan_relux(tanhnode: torch.fx.node.Node) -> Optional[str]: + assert ( + tanhnode.target == exir_ops.edge.aten.hardtanh.default + ), "Must be a hardtanh node" + if not tanhnode.args[1] == 0.0: + return None + if tanhnode.args[2] == 6.0: + return "RELU6" + return None + + +class FuseConvActPass(ExportPass): + TARGET_ACTS_MAP = { + exir_ops.edge.aten.relu.default: (lambda x: "RELU"), + exir_ops.edge.aten.relu_.default: (lambda x: "RELU"), + exir_ops.edge.aten.relu6.default: (lambda x: "RELU6"), + exir_ops.edge.aten.relu6_.default: (lambda x: "RELU6"), + exir_ops.edge.aten.hardtanh.default: map_hardtan_relux, + exir_ops.edge.aten.hardtanh_.default: map_hardtan_relux, + } + + def _fuse( + self, + graph_module: GraphModule, + ): + for target_conv, target_act in self.get_target_conv_act(graph_module): + assert ( + act_name := self.TARGET_ACTS_MAP.get(target_act.target)(target_act) + ), f"Not supported {target_act.name} now." 
+ target_conv.meta["activation"] = act_name + if "quantize_attrs" in target_act.meta: + target_conv.meta["quantize_attrs"] = target_act.meta["quantize_attrs"] + + # If we merge the real out activation to conv, the conv should be the real out + if "real_out" in target_act.meta: + target_conv.meta["real_out"] = target_act.meta["real_out"] + for user in [user for user in target_act.users.keys()]: # noqa: C416 + user.replace_input_with(target_act, target_conv) + graph_module.graph.erase_node(target_act) + + def get_target_conv_act(self, graph_module: GraphModule): + for node in graph_module.graph.nodes: + if node.target != exir_ops.edge.aten.convolution.default: + continue + if len(node.users) != 1: + # Such cases couldn't be conv + act + continue + act_node = list(node.users.keys())[0] + if act_node.target not in self.TARGET_ACTS_MAP: + continue + if "quantize_attrs" in node.meta: + # If the conv's output is quantized + # We do not fuse them + continue + yield node, act_node + + def call(self, graph_module: GraphModule): + self._fuse(graph_module) + graph_module.recompile() + dead_code_elimination_pass(graph_module) + _ = super().call(graph_module).graph_module + return PassResult(graph_module, True) diff --git a/backends/samsung/_passes/insert_qdq.py b/backends/samsung/_passes/insert_qdq.py new file mode 100644 index 00000000000..a59b011ac4b --- /dev/null +++ b/backends/samsung/_passes/insert_qdq.py @@ -0,0 +1,164 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from enum import Enum +from typing import Any, Dict + +import torch +from executorch.backends.samsung._passes.utils import none_quant_tensor_quant_meta +from executorch.backends.samsung.utils.constants import QuantConstants +from executorch.backends.samsung.utils.utils import is_graph_input, is_graph_output + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.export import ExportedProgram +from torch.fx import GraphModule + + +class QType(Enum): + Quant = 0 + Dequant = 1 + + +class InsertQDQPass(ExportPass): + QDQ_MAP = { + # per tensor + exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, + exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor: exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, + # per channel + exir_ops.edge.quantized_decomposed.quantize_per_channel.default: exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, + } + + def __init__(self, edge_program: ExportedProgram): + super().__init__() + self.edge_program = edge_program + + def _create_qdq_node( + self, + graph_module: GraphModule, + qtype: QType, + input_node: torch.fx.Node, + quant_attrs: Dict[str, Any], + ) -> torch.fx.Node: + assert (target := quant_attrs.get("target")), "" + new_node_args = [input_node] + new_node_meta_val = input_node.meta["val"] + new_node_quant_attrs = {} + if qtype == QType.Dequant: + target = self.QDQ_MAP[target] + else: + # For input node, we should set the val type as quant type + key = QuantConstants.QUANT_KEY.quant_dtype + new_node_meta_val = new_node_meta_val.to(quant_attrs[key]) + new_node_quant_attrs.update(quant_attrs) + + for arg in target._schema.arguments[1:]: + name = arg.name + if name == "out_dtype": + continue + if qtype == QType.Quant: + key = QuantConstants.QUANT_OPS_KEY_MAP[target].get(name, name) + else: + key = 
QuantConstants.DEQUANT_OPS_KEY_MAP[target].get(name, name) + arg_value = quant_attrs[key] + if isinstance(arg.type, torch.Tensor) and ( + isinstance(arg_value, int) or isinstance(arg_value, float) + ): + arg_value = torch.Tensor(arg_value) + new_node_args.append(arg_value) + + new_node = graph_module.graph.create_node( + "call_function", target, tuple(new_node_args) + ) + if new_node_quant_attrs: + new_node.meta["quantize_attrs"] = new_node_quant_attrs + else: + new_node.meta["quantize_attrs"] = { + QuantConstants.QUANT_KEY.quant_dtype: torch.float32, + QuantConstants.QUANT_KEY.scale: [1.0], + QuantConstants.QUANT_KEY.zero_point: [0], + } + new_node.meta["val"] = new_node_meta_val + return new_node + + def _add_dq_after(self, graph_module: GraphModule, node: torch.fx.Node): + if not (quant_attrs := node.meta.get("quantize_attrs")): + return + with graph_module.graph.inserting_after(node): + new_node = self._create_qdq_node( + graph_module, QType.Dequant, node, quant_attrs + ) + users = [user for user in node.users.keys() if (user.op == "output")] + for user in users: + user.replace_input_with(node, new_node) + + def _add_q_after(self, graph_module: GraphModule, node: torch.fx.Node): + # In node don't need quant attrs after insert new quantize node. 
+ if not (quant_attrs := node.meta.pop("quantize_attrs", None)): + return + node.meta["quantize_attrs"] = none_quant_tensor_quant_meta() + with graph_module.graph.inserting_after(node): + users = list(node.users.keys()) + new_node = self._create_qdq_node( + graph_module, QType.Quant, node, quant_attrs + ) + for user in users: + if user.target not in QuantConstants.QUANT_OPS_KEY_MAP: + user.replace_input_with(node, new_node) + + def _add_q_before( + self, + graph_module: GraphModule, + node: torch.fx.Node, + from_node: torch.fx.Node, + quantize_attrs: Dict, + ): + with graph_module.graph.inserting_before(node): + new_quant_node = self._create_qdq_node( + graph_module, QType.Quant, from_node, quantize_attrs + ) + node.replace_input_with(from_node, new_quant_node) + return new_quant_node + + def _add_dq_before( + self, + graph_module: GraphModule, + node: torch.fx.Node, + from_node: torch.fx.Node, + quantize_attrs: Dict, + ): + with graph_module.graph.inserting_before(node): + new_dequant_node = self._create_qdq_node( + graph_module, QType.Dequant, from_node, quantize_attrs + ) + node.replace_input_with(from_node, new_dequant_node) + return new_dequant_node + + def _add_qdq_for_requantize(self, graph_module: GraphModule): + for node in graph_module.graph.nodes: + requant_map: Dict[int, Dict] = node.meta.get("requantize") + if requant_map is None: + continue + assert (ori_quant_attrs := node.meta.get("quantize_attrs")) + usr_list = list(node.users.keys()) + for user_idx, requant_params in requant_map.items(): + user = usr_list[user_idx] + q_node = self._add_q_before(graph_module, user, node, requant_params) + _ = self._add_dq_before(graph_module, q_node, node, ori_quant_attrs) + + def _add_qdq(self, graph_module: GraphModule): + for node in list(graph_module.graph.nodes): + if is_graph_input(self.edge_program, node): + self._add_q_after(graph_module, node) + elif is_graph_output(node): + self._add_dq_after(graph_module, node) + + def call(self, graph_module: 
GraphModule): + self._add_qdq(graph_module) + self._add_qdq_for_requantize(graph_module) + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/samsung/_passes/remove_useless_ops.py b/backends/samsung/_passes/remove_useless_ops.py new file mode 100644 index 00000000000..c88a2d4a5d8 --- /dev/null +++ b/backends/samsung/_passes/remove_useless_ops.py @@ -0,0 +1,87 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from executorch.exir.passes import dead_code_elimination_pass +from torch.fx import GraphModule + + +class RemoveUselessOpPass(ExportPass): + # such ops should be single-in and single-out + USELESS_OP_SET = { + exir_ops.edge.aten._to_copy.default, + exir_ops.edge.aten.clone.default, + exir_ops.edge.aten.clone.default, + exir_ops.edge.aten.alias.default, + exir_ops.edge.aten.lift_fresh_copy.default, + exir_ops.edge.dim_order_ops._to_dim_order_copy.default, + } + + def __init__(self): + super().__init__() + + def gen_pattern_as_strided_copy(self, graph_module: GraphModule): + for node in list(graph_module.graph.nodes): # noqa: C416 + if node.target != exir_ops.edge.aten.mean.dim: + continue + if len(node.users) != 1: + continue + successor = list(node.users.keys())[0] + if successor.target != exir_ops.edge.aten.as_strided_copy.default: + continue + is_pattern = True + count = 0 + for i, stride in enumerate(successor.args[2]): + if stride < node.meta["val"].size()[i]: + if stride == 1: + count += 1 + else: + is_pattern = False + break + if count >= 2: + is_pattern = False + break + if is_pattern: + yield successor + + def _fold_as_strided_copy( + self, + graph_module: GraphModule, + ): + for 
as_strided_copy_node in self.gen_pattern_as_strided_copy(graph_module): + for user in list(as_strided_copy_node.users.keys()): + user.replace_input_with( + as_strided_copy_node, as_strided_copy_node.args[0] + ) + graph_module.graph.erase_node(as_strided_copy_node) + + def _remove_useless( + self, + graph_module: GraphModule, + ): + for node in graph_module.graph.nodes: + if node.target not in self.USELESS_OP_SET: + continue + + # Prevent from removing if data type may change. + if ( + node.target == exir_ops.edge.aten._to_copy.default + or node.target == exir_ops.edge.dim_order_ops._to_dim_order_copy.default + ) and "memory_format" not in node.kwargs: + continue + + for user in [user for user in node.users.keys()]: # noqa: C416 + user.replace_input_with(node, node.all_input_nodes[0]) + graph_module.graph.erase_node(node) + self._fold_as_strided_copy(graph_module) + + def call(self, graph_module: GraphModule): + self._remove_useless(graph_module) + graph_module.recompile() + dead_code_elimination_pass(graph_module) + _ = super().call(graph_module).graph_module + return PassResult(graph_module, True) diff --git a/backends/samsung/_passes/utils.py b/backends/samsung/_passes/utils.py new file mode 100644 index 00000000000..afa7c72c601 --- /dev/null +++ b/backends/samsung/_passes/utils.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch + + +def none_quant_tensor_quant_meta(): + return { + "quant_dtype": torch.float32, + "scales": 1, + "zero_points": 0, + } diff --git a/backends/samsung/build.sh b/backends/samsung/build.sh index dfa6407ff50..4845c760f0c 100755 --- a/backends/samsung/build.sh +++ b/backends/samsung/build.sh @@ -45,6 +45,7 @@ function build_x86_64() { -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -S ${PROJECT_DIR} \ -B ${X86_64_BUILD_DIR} @@ -77,6 +78,7 @@ function build_android() { -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_LOGGING=1 \ -DEXECUTORCH_BUILD_DEVTOOLS=ON \ diff --git a/backends/samsung/builders/__init__.py b/backends/samsung/builders/__init__.py index 02a457fd06e..978da82b370 100644 --- a/backends/samsung/builders/__init__.py +++ b/backends/samsung/builders/__init__.py @@ -14,11 +14,13 @@ op_clamp, op_constant_pad_nd, op_conv2d, + op_dequantize, op_div, op_embedding, op_expand_copy, op_gelu, op_getitem, + op_hardsigmoid, op_hardswish, op_hardtanh, op_layer_norm, @@ -32,6 +34,7 @@ op_mul, op_permute, op_pixel_shuffle, + op_quantize, op_relu, op_reshape, op_rsqrt, @@ -57,6 +60,7 @@ op_clamp, op_conv2d, op_constant_pad_nd, + op_dequantize, op_div, op_embedding, op_expand_copy, @@ -64,6 +68,7 @@ op_getitem, op_hardswish, op_hardtanh, + op_hardsigmoid, op_layer_norm, op_leaky_relu, op_linear, @@ -75,6 +80,7 @@ op_mul, op_permute, op_pixel_shuffle, + op_quantize, op_relu, op_reshape, op_rsqrt, diff --git a/backends/samsung/builders/node_visitor.py b/backends/samsung/builders/node_visitor.py index a35c0b4715d..0d2707da8f5 100644 --- a/backends/samsung/builders/node_visitor.py +++ 
b/backends/samsung/builders/node_visitor.py @@ -14,6 +14,7 @@ get_tensor_type, ) from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph +from executorch.backends.samsung.utils.constants import QuantConstants from executorch.backends.transforms.utils import is_param_node from torch.export import ExportedProgram @@ -61,18 +62,26 @@ def define_tensor( dims = [1] if len(tensor.size()) == 0 else list(tensor.size()) + quant_attrs = node.meta.get("quantize_attrs") enn_tensor_id = enn_graph.define_tensor( node.name, dims, data_type, tensor_type.name, const_data, + quant_param=quant_attrs, ) assert enn_tensor_id is not None vals_to_ids[node] = enn_tensor_id return enn_tensor_id + def _update_params_qdtype(self, node: torch.fx.Node, params: Dict): + if qdtype := node.meta.get("quantize_attrs", {}).get( + QuantConstants.QUANT_KEY.quant_dtype + ): + params["quant_dtype"] = EnnGraph._affine_meta_param(qdtype) + _node_visitor_dict = {} @@ -92,6 +101,7 @@ def register_node_visitor(visitor): raise TypeError( f"target of vistor should be str|Tuple[str]|List[str], not{type(visitor.target)}" ) + return visitor def get_node_visitors(*args) -> Dict[str, NodeVisitor]: diff --git a/backends/samsung/builders/op_add.py b/backends/samsung/builders/op_add.py index 1b0dddb0d02..a6eb79897dd 100644 --- a/backends/samsung/builders/op_add.py +++ b/backends/samsung/builders/op_add.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ from typing import Dict import torch @@ -28,9 +29,13 @@ def define_node( ) -> None: input1 = node.args[0] input_id_1 = self.define_tensor(input1, enn_graph, vals_to_ids) + params = {} + self._update_params_qdtype(node, params) input2 = node.args[1] input_id_2 = self.define_tensor(input2, enn_graph, vals_to_ids) output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "ELTSUM", [input_id_1, input_id_2], [output_id]) + enn_graph.define_op( + node.name, "ELTSUM", [input_id_1, input_id_2], [output_id], params + ) diff --git a/backends/samsung/builders/op_avg_pool2d.py b/backends/samsung/builders/op_avg_pool2d.py index ad7ccbac3ae..bfca8b89b22 100644 --- a/backends/samsung/builders/op_avg_pool2d.py +++ b/backends/samsung/builders/op_avg_pool2d.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + from typing import cast, Dict, List import torch @@ -49,6 +50,7 @@ def define_node( params["stride_w"] = stride[1] params["padding"] = "EXPLICIT" params["explicit_padding"] = explicit_padding + self._update_params_qdtype(node, params) if len(node.args) > 4: ceil_mode = cast(bool, node.args[4]) @@ -64,7 +66,5 @@ def define_node( assert ( divisor_override == kernel_size[0] * kernel_size[1] ), "Not supported divisor_override which is not equal to pooling region." 
- output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "AVGPOOL2D", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_bmm.py b/backends/samsung/builders/op_bmm.py index 6ba8864ebb3..13e0d19cb14 100644 --- a/backends/samsung/builders/op_bmm.py +++ b/backends/samsung/builders/op_bmm.py @@ -16,7 +16,7 @@ @register_node_visitor class BMMVisitor(NodeVisitor): - target = "aten.bmm.default" + target = ["aten.bmm.default"] def __init__(self, *args) -> None: super().__init__(*args) @@ -29,12 +29,15 @@ def define_node( ) -> None: input1 = node.args[0] input_id_1 = self.define_tensor(input1, enn_graph, vals_to_ids) + input2 = node.args[1] input_id_2 = self.define_tensor(input2, enn_graph, vals_to_ids) # output output_id = self.define_tensor(node, enn_graph, vals_to_ids) + params = {} + self._update_params_qdtype(node, params) enn_graph.define_op( - node.name, "BATCH_MATMUL", [input_id_1, input_id_2], [output_id] + node.name, "BATCH_MATMUL", [input_id_1, input_id_2], [output_id], params ) diff --git a/backends/samsung/builders/op_cat.py b/backends/samsung/builders/op_cat.py index e9c0a32b389..09387f2e361 100644 --- a/backends/samsung/builders/op_cat.py +++ b/backends/samsung/builders/op_cat.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ from typing import cast, Dict, List import torch @@ -12,6 +13,7 @@ ) from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph from executorch.backends.transforms import get_shape +from executorch.backends.transforms.utils import is_param_node @register_node_visitor @@ -29,14 +31,20 @@ def define_node( ) -> None: tensors = cast(List[torch.fx.Node], node.args[0]) input_tensor_ids = [] - - for in_tensor in tensors: + constant_idx = None + for idx, in_tensor in enumerate(tensors): + if is_param_node(self.exported_program, in_tensor): + assert constant_idx is None, "Only support at most 1 constant tensor" + constant_idx = idx input_id = self.define_tensor(in_tensor, enn_graph, vals_to_ids) input_tensor_ids.append(input_id) in_shape = get_shape(node) axis = cast(int, node.args[1]) % len(in_shape) if len(node.args) >= 2 else 0 params = {"axis": axis} + if constant_idx is not None: + params["constant_index"] = constant_idx + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) enn_graph.define_op(node.name, "CONCAT", input_tensor_ids, [output_id], params) diff --git a/backends/samsung/builders/op_clamp.py b/backends/samsung/builders/op_clamp.py index c5670b80fa3..74af83212a5 100644 --- a/backends/samsung/builders/op_clamp.py +++ b/backends/samsung/builders/op_clamp.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ from typing import cast, Dict import torch @@ -32,12 +33,15 @@ def define_node( # The default value of lower bound and upper bound output_min = torch.finfo(torch.float32).min output_max = torch.finfo(torch.float32).max + if node.args[1] is not None: output_min = cast(float, node.args[1]) if len(node.args) > 2 and node.args[2] is not None: output_max = cast(float, node.args[2]) params = {"minimum": output_min, "maximum": output_max} + self._update_params_qdtype(node, params) + output_id = self.define_tensor(node, enn_graph, vals_to_ids) enn_graph.define_op(node.name, "CLIP", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_conv2d.py b/backends/samsung/builders/op_conv2d.py index 881a533801f..ab77d8df626 100644 --- a/backends/samsung/builders/op_conv2d.py +++ b/backends/samsung/builders/op_conv2d.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + from typing import cast, Dict, List import torch @@ -56,6 +57,9 @@ def define_node( input_shape = get_shape(input) kernel_shape = get_shape(weight_node) params = {} + self._update_params_qdtype(node, params) + if "activation" in node.meta: + params["activation"] = node.meta["activation"] params["kernel_h"] = kernel_shape[2] params["kernel_w"] = kernel_shape[3] params["stride_h"] = stride[0] diff --git a/backends/samsung/builders/op_dequantize.py b/backends/samsung/builders/op_dequantize.py new file mode 100644 index 00000000000..a1c31af4037 --- /dev/null +++ b/backends/samsung/builders/op_dequantize.py @@ -0,0 +1,19 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from executorch.backends.samsung.builders.node_visitor import register_node_visitor +from executorch.backends.samsung.builders.op_quantize import _QuantOpVistorBase + + +# Dequant ops here +@register_node_visitor +class DequantizeVistor(_QuantOpVistorBase): + target = [ + "quantized_decomposed.dequantize_per_tensor.default", + "quantized_decomposed.dequantize_per_tensor.tensor", + "quantized_decomposed.dequantize_per_channel.default", + "quantized_decomposed.dequantize_per_channel.tensor", + ] diff --git a/backends/samsung/builders/op_div.py b/backends/samsung/builders/op_div.py index 89d773ddb0e..8b0e7cdd5af 100644 --- a/backends/samsung/builders/op_div.py +++ b/backends/samsung/builders/op_div.py @@ -27,13 +27,16 @@ def define_node( enn_graph: EnnGraph, vals_to_ids: Dict[torch.Tensor, int], ) -> None: - # inputs input1 = node.args[0] input_id_1 = self.define_tensor(input1, enn_graph, vals_to_ids) + input2 = node.args[1] input_id_2 = self.define_tensor(input2, enn_graph, vals_to_ids) - + params = {} + self._update_params_qdtype(node, params) # output output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "ELTDIV", [input_id_1, input_id_2], [output_id]) + enn_graph.define_op( + node.name, "ELTDIV", [input_id_1, input_id_2], [output_id], params + ) diff --git a/backends/samsung/builders/op_gelu.py b/backends/samsung/builders/op_gelu.py index 059a3b77850..88417f688f9 100644 --- a/backends/samsung/builders/op_gelu.py +++ b/backends/samsung/builders/op_gelu.py @@ -27,8 +27,14 @@ def define_node( enn_graph: EnnGraph, vals_to_ids: Dict[torch.Tensor, int], ) -> None: - input_id = self.define_tensor(node.args[0], enn_graph, vals_to_ids) + # input1 + input = node.args[0] + input_id = self.define_tensor(input, enn_graph, vals_to_ids) + # output output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "GELU", [input_id], [output_id]) + params = {} + self._update_params_qdtype(node, params) + + 
enn_graph.define_op(node.name, "GELU", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_hardsigmoid.py b/backends/samsung/builders/op_hardsigmoid.py new file mode 100644 index 00000000000..3a50d65da41 --- /dev/null +++ b/backends/samsung/builders/op_hardsigmoid.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict + +import torch +from executorch.backends.samsung.builders.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph + + +@register_node_visitor +class HardSigmoidVisitor(NodeVisitor): + target = "aten.hardsigmoid.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + enn_graph: EnnGraph, + vals_to_ids: Dict[torch.Tensor, int], + ) -> None: + input = node.args[0] + input_id = self.define_tensor(input, enn_graph, vals_to_ids) + output_id = self.define_tensor(node, enn_graph, vals_to_ids) + params = {} + self._update_params_qdtype(node, params) + enn_graph.define_op(node.name, "HardSigmoid", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_hardswish.py b/backends/samsung/builders/op_hardswish.py index 72a99d17b83..8c30125e8a4 100644 --- a/backends/samsung/builders/op_hardswish.py +++ b/backends/samsung/builders/op_hardswish.py @@ -29,7 +29,7 @@ def define_node( ) -> None: input = node.args[0] input_id = self.define_tensor(input, enn_graph, vals_to_ids) - + params = {} + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) - - enn_graph.define_op(node.name, "HARDSWISH", [input_id], [output_id]) + enn_graph.define_op(node.name, "HARDSWISH", [input_id], [output_id], params) diff --git 
a/backends/samsung/builders/op_hardtanh.py b/backends/samsung/builders/op_hardtanh.py index 4f667bf5299..7d65e97a566 100644 --- a/backends/samsung/builders/op_hardtanh.py +++ b/backends/samsung/builders/op_hardtanh.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + from typing import cast, Dict import torch @@ -29,9 +30,12 @@ def define_node( input = node.args[0] input_id = self.define_tensor(input, enn_graph, vals_to_ids) + # default value of output_min and output_max output_min = cast(float, node.args[1]) if len(node.args) > 1 else -1 output_max = cast(float, node.args[2]) if len(node.args) > 2 else 1 + params = {"minimum": output_min, "maximum": output_max} + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) diff --git a/backends/samsung/builders/op_layer_norm.py b/backends/samsung/builders/op_layer_norm.py index e6f853178d8..098bc92dc84 100644 --- a/backends/samsung/builders/op_layer_norm.py +++ b/backends/samsung/builders/op_layer_norm.py @@ -46,9 +46,8 @@ def define_node( epsilon = node.args[4] if len(node.args) > 4 else 1e-5 params = {"epsilon": epsilon} - + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op( node.name, "LAYERNORM", all_input_tensors, [output_id], params ) diff --git a/backends/samsung/builders/op_linear.py b/backends/samsung/builders/op_linear.py index 2f7aa1e6415..720439de976 100644 --- a/backends/samsung/builders/op_linear.py +++ b/backends/samsung/builders/op_linear.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ from typing import Dict import torch @@ -43,6 +44,7 @@ def define_node( weight_shape = get_shape(weight_node) params = {"in_channels": weight_shape[1], "out_channels": weight_shape[0]} + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) diff --git a/backends/samsung/builders/op_max_pool2d.py b/backends/samsung/builders/op_max_pool2d.py index d386dd30b1a..57b716fcb34 100644 --- a/backends/samsung/builders/op_max_pool2d.py +++ b/backends/samsung/builders/op_max_pool2d.py @@ -73,6 +73,7 @@ def define_node( params["explicit_padding"] = explicit_padding params["dilation_h"] = dilation[0] params["dilation_w"] = dilation[1] + self._update_params_qdtype(node, params) if len(node.args) > 5: ceil_mode = cast(bool, node.args[5]) diff --git a/backends/samsung/builders/op_mean_dim.py b/backends/samsung/builders/op_mean_dim.py index 2f07f870ec4..3d0377703a7 100644 --- a/backends/samsung/builders/op_mean_dim.py +++ b/backends/samsung/builders/op_mean_dim.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ from typing import cast, Dict, List import torch @@ -27,6 +28,7 @@ def define_node( enn_graph: EnnGraph, vals_to_ids: Dict[torch.Tensor, int], ) -> None: + # input input = node.args[0] input_id = self.define_tensor(input, enn_graph, vals_to_ids) @@ -37,8 +39,11 @@ def define_node( in_shape = get_shape(input) for dim in dims: reduce_axes.append(dim % len(in_shape)) - reduce_axes.sort() + + if len(node.args[1]) > 1: + reduce_axes.sort() keep_dim = node.args[2] if len(node.args) >= 3 else False params = {"keep_dims": keep_dim, "axis": reduce_axes} + self._update_params_qdtype(node, params) enn_graph.define_op(node.name, "REDUCEMEAN", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_mul.py b/backends/samsung/builders/op_mul.py index dce531ff0b0..6dd7c0dd9f0 100644 --- a/backends/samsung/builders/op_mul.py +++ b/backends/samsung/builders/op_mul.py @@ -1,5 +1,9 @@ -# Copyright (c) 2024 Samsung Electronics Co. LTD +# Copyright (c) 2025 Samsung Electronics Co. LTD # All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ from typing import Dict import torch @@ -23,11 +27,17 @@ def define_node( enn_graph: EnnGraph, vals_to_ids: Dict[torch.Tensor, int], ) -> None: + input1 = node.args[0] input_id_1 = self.define_tensor(input1, enn_graph, vals_to_ids) + input2 = node.args[1] input_id_2 = self.define_tensor(input2, enn_graph, vals_to_ids) + params = {} + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "ELTMUL", [input_id_1, input_id_2], [output_id]) + enn_graph.define_op( + node.name, "ELTMUL", [input_id_1, input_id_2], [output_id], params + ) diff --git a/backends/samsung/builders/op_quantize.py b/backends/samsung/builders/op_quantize.py new file mode 100644 index 00000000000..dcf30e291f9 --- /dev/null +++ b/backends/samsung/builders/op_quantize.py @@ -0,0 +1,60 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Dict + +import torch +from executorch.backends.samsung.builders.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph +from executorch.backends.samsung.utils.constants import QuantConstants + + +class _QuantOpVistorBase(NodeVisitor): + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + enn_graph: EnnGraph, + vals_to_ids: Dict[torch.Tensor, int], + ) -> None: + # input + input = node.args[0] + input_id = self.define_tensor(input, enn_graph, vals_to_ids) + + scales = node.args[1] + if isinstance(scales, torch.Tensor): + scales = scales.tolist() + elif not isinstance(scales, list): + scales = torch.tensor(scales).reshape([1]).tolist() + zero_points = node.args[2] + if isinstance(zero_points, torch.Tensor): + zero_points = zero_points.tolist() + elif not isinstance(zero_points, list): + zero_points = torch.tensor(zero_points).reshape([1]).tolist() + + output_id = self.define_tensor(node, enn_graph, vals_to_ids) + + params = {"scales": scales, "zero_points": zero_points} + + if node.target in QuantConstants.QUANT_OPS_KEY_MAP: + enn_graph.define_op(node.name, "QUANTIZE", [input_id], [output_id], params) + else: + enn_graph.define_op( + node.name, "DEQUANTIZE", [input_id], [output_id], params + ) + + +@register_node_visitor +class QuantizeVistor(_QuantOpVistorBase): + target = [ + "quantized_decomposed.quantize_per_tensor.default", + "quantized_decomposed.quantize_per_channel.default", + ] diff --git a/backends/samsung/builders/op_relu.py b/backends/samsung/builders/op_relu.py index ba90116be1d..a4a2b6bc4f0 100644 --- a/backends/samsung/builders/op_relu.py +++ b/backends/samsung/builders/op_relu.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ from typing import Dict import torch @@ -30,5 +31,7 @@ def define_node( input_id = self.define_tensor(input, enn_graph, vals_to_ids) output_id = self.define_tensor(node, enn_graph, vals_to_ids) + params = {} + self._update_params_qdtype(node, params) - enn_graph.define_op(node.name, "RELU", [input_id], [output_id]) + enn_graph.define_op(node.name, "RELU", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_softmax.py b/backends/samsung/builders/op_softmax.py index 1e2e4a378dc..7f569cea6fc 100644 --- a/backends/samsung/builders/op_softmax.py +++ b/backends/samsung/builders/op_softmax.py @@ -35,5 +35,5 @@ def define_node( axis = cast(int, node.args[1]) params = {"axis": axis} - + self._update_params_qdtype(node, params) enn_graph.define_op(node.name, "SOFTMAX", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_squeeze.py b/backends/samsung/builders/op_squeeze.py index d165a22fcb3..82fa17fbc95 100644 --- a/backends/samsung/builders/op_squeeze.py +++ b/backends/samsung/builders/op_squeeze.py @@ -33,4 +33,5 @@ def define_node( # output output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "RESHAPE", [input_id], [output_id]) + params = {"new_shape": [*node.meta["val"].shape]} + enn_graph.define_op(node.name, "RESHAPE", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_to_copy.py b/backends/samsung/builders/op_to_copy.py index 545672ef6a3..c770602bb5f 100644 --- a/backends/samsung/builders/op_to_copy.py +++ b/backends/samsung/builders/op_to_copy.py @@ -11,6 +11,8 @@ NodeVisitor, register_node_visitor, ) + +from executorch.backends.samsung.builders.utils import get_map_dtype, get_tensor from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph @@ -35,5 +37,8 @@ def define_node( input_id = self.define_tensor(input, enn_graph, vals_to_ids) output_id = self.define_tensor(node, enn_graph, vals_to_ids) + params = {} + out_tensor = 
get_tensor(self.exported_program, node) + params["out_dtype"] = get_map_dtype(out_tensor.dtype) - enn_graph.define_op(node.name, "CAST", [input_id], [output_id]) + enn_graph.define_op(node.name, "CAST", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_unsqueeze.py b/backends/samsung/builders/op_unsqueeze.py index 942c3307de7..61fa06e6310 100644 --- a/backends/samsung/builders/op_unsqueeze.py +++ b/backends/samsung/builders/op_unsqueeze.py @@ -31,4 +31,5 @@ def define_node( output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "RESHAPE", [input_id], [output_id]) + params = {"new_shape": [*node.meta["val"].shape]} + enn_graph.define_op(node.name, "RESHAPE", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_upsample_bilinear2d.py b/backends/samsung/builders/op_upsample_bilinear2d.py index a934b2789ba..d4b040460e3 100644 --- a/backends/samsung/builders/op_upsample_bilinear2d.py +++ b/backends/samsung/builders/op_upsample_bilinear2d.py @@ -46,6 +46,7 @@ def define_node( "upsampling_factor": scale_factor, "half_pixel_centers": True, } + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) enn_graph.define_op( node.name, "RESIZE_BILINEAR", [input_id], [output_id], params diff --git a/backends/samsung/builders/utils.py b/backends/samsung/builders/utils.py index 58c84ff6d31..a640071c798 100644 --- a/backends/samsung/builders/utils.py +++ b/backends/samsung/builders/utils.py @@ -9,7 +9,6 @@ import torch from executorch.backends.samsung.utils.utils import is_graph_input, is_graph_output from executorch.backends.transforms.utils import get_param_tensor, is_param_node - from torch.export import ExportedProgram DATA_TYPE_STR_MAPPING = { diff --git a/backends/samsung/enn_preprocess.py b/backends/samsung/enn_preprocess.py index dde01bc09c7..0847ec0adeb 100644 --- a/backends/samsung/enn_preprocess.py +++ b/backends/samsung/enn_preprocess.py @@ 
-9,10 +9,16 @@ import executorch.backends.samsung.python.PyEnnWrapperAdaptor as PyEnnWrapper import torch +from executorch.backends.samsung._passes.annotate_qparams import AnnotateQparamsPass +from executorch.backends.samsung._passes.annotate_scalar_parameters import ( + AnnotateScalarParametersPass, +) from executorch.backends.samsung._passes.conv1d_to_conv2d import Conv1dToConv2d from executorch.backends.samsung._passes.customized_constant_prop import ( ConstantPropPass, ) +from executorch.backends.samsung._passes.fold_qdq import FoldQDQPass +from executorch.backends.samsung._passes.insert_qdq import InsertQDQPass from executorch.backends.samsung._passes.replace_scalar_ops import ReplaceOpsWithScalar from executorch.backends.samsung.builders.node_visitor import get_node_visitors from executorch.backends.samsung.serialization.compile_options import ( @@ -53,12 +59,16 @@ def preprocess( enn_preprocess_passes = PassManager( passes=[ + AnnotateQparamsPass(edge_program), + FoldQDQPass(), ConstantPropPass(edge_program), Conv1dToConv2d(edge_program), FuseBatchNormWithConvPass(edge_program), AddmmToLinearTransform(), ReplaceOpsWithScalar(), RemoveGetItemPass(), + InsertQDQPass(edge_program), + AnnotateScalarParametersPass(edge_program), ] ) pass_result = enn_preprocess_passes(edge_program.graph_module) diff --git a/backends/samsung/partition/enn_partitioner.py b/backends/samsung/partition/enn_partitioner.py index 952cb000429..368d069c380 100644 --- a/backends/samsung/partition/enn_partitioner.py +++ b/backends/samsung/partition/enn_partitioner.py @@ -129,5 +129,6 @@ def ops_to_not_decompose( torch.ops.aten.prelu.default, torch.ops.aten.layer_norm.default, torch.ops.aten.pixel_shuffle.default, + torch.ops.aten.hardsigmoid.default, ] return (ops_not_to_decompose, None) diff --git a/backends/samsung/quantizer/__init__.py b/backends/samsung/quantizer/__init__.py new file mode 100644 index 00000000000..621eec69240 --- /dev/null +++ b/backends/samsung/quantizer/__init__.py @@ 
-0,0 +1,10 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from .qconfig import Precision +from .quantizer import EnnQuantizer + +__all__ = [EnnQuantizer, Precision] diff --git a/backends/samsung/quantizer/annotator.py b/backends/samsung/quantizer/annotator.py new file mode 100644 index 00000000000..31015698006 --- /dev/null +++ b/backends/samsung/quantizer/annotator.py @@ -0,0 +1,871 @@ +# Copyright (c) Qualcomm Innovation Center, Inc +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Callable, Dict, List + +import torch +from torch._ops import OpOverload +from torch._subclasses import FakeTensor + +from torch.fx import Graph, Node + +from torchao.quantization.pt2e import FixedQParamsObserver +from torchao.quantization.pt2e.quantizer import ( + annotate_output_qspec, + QuantizationAnnotation, + QuantizationSpec, + SharedQuantizationSpec, +) + +from .qconfig import QuantizationConfig + +OP_ANNOTATOR: Dict[OpOverload, Callable] = {} + +ADD_OPS = [ + torch.ops.aten.add, + torch.ops.aten.add.Tensor, + torch.ops.aten.add_.Tensor, +] + + +def register_annotator(ops: List[OpOverload]): + def decorator(annotator: Callable): + for op in ops: + OP_ANNOTATOR[op] = annotator + + return decorator + + +def annotate(graph: Graph, quant_config: QuantizationConfig) -> None: + # Pattern annotation + _annotate_fused_activation_pattern(graph, quant_config) + + # Per-op annotation + for node in graph.nodes: + if node.op == "placeholder": + annotate_placeholder(node, quant_config) + elif node.op == "call_function": + annotate_func = OP_ANNOTATOR.get(node.target, None) + if annotate_func is not None: + annotate_func(node, quant_config) + + +def 
_is_annotated(nodes: List[Node]): + """ + Given a list of nodes (that represents an operator pattern), + return True if any of the node + is annotated, otherwise return False + """ + annotated = False + for node in nodes: + annotated = annotated or ( + "quantization_annotation" in node.meta + and node.meta["quantization_annotation"]._annotated + ) + return annotated + + +def _is_fake_tensor(node: Node): + if ( + isinstance(node, Node) + and "val" in node.meta + and isinstance(node.meta["val"], FakeTensor) + ): + return True + return False + + +def _is_float_tensor(node: Node): + """Check if the node's tensor is a float tensor, + so that we can skip quantization for the node + since observers only works with float Tensors + """ + if not _is_fake_tensor(node): + return False + return node.meta["val"].dtype in [torch.float32, torch.float16] + + +def _mark_nodes_as_annotated(nodes: List[Node]): + for node in nodes: + if "quantization_annotation" not in node.meta: + node.meta["quantization_annotation"] = QuantizationAnnotation() + node.meta["quantization_annotation"]._annotated = True + + +# for nodes whose targets ars placehold (not call_function) +def annotate_placeholder(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + + if _is_float_tensor(node): + annotate_output_qspec(node, quant_config.output_activation) + + _mark_nodes_as_annotated([node]) + + +# CASE 1: fused_activation case (ex. 
Conv2D + ReLU) +def _is_hardtanh_for_relux(relu_node: torch.fx.node.Node): + if relu_node.target in [ + torch.ops.aten.hardtanh.default, + torch.ops.aten.hardtanh_.default, + ]: + # checking if hardtanh is convertable to ReLU6 + # ReLU1 is not supported now + if not relu_node.args[1] == 0.0: + return False + if relu_node.args[2] == 6.0: # for ReLU6 + return True + return True + + +def _annotate_fused_activation_pattern( + graph: Graph, quant_config: QuantizationConfig +) -> None: + for relu_node in graph.nodes: + # Check relu/relu6 node + if relu_node.op != "call_function": + continue + if relu_node.target not in [ + # The strategy of ReLU and ReLU6 is fold_activation in ENNQuant + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + torch.ops.aten.relu6.default, + torch.ops.aten.relu6_.default, + torch.ops.aten.hardtanh.default, + torch.ops.aten.hardtanh_.default, + ]: + continue + + if not _is_hardtanh_for_relux(relu_node): + continue + + producer_node = relu_node.args[0] + if not isinstance(producer_node, Node): + continue + if producer_node.op != "call_function": + continue + if len(producer_node.users) != 1: + continue + + # Handle affine + relu fusion + if producer_node.target in [ + torch.ops.aten.conv1d.default, + torch.ops.aten.conv2d.default, + torch.ops.aten.linear.default, + ]: + # input & weight (or bias) setting for Conv node(producer_node) + quantization_annotation = producer_node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + input = producer_node.args[0] + quantization_annotation.input_qspec_map[input] = ( + quant_config.input_activation + ) + + quantization_annotation.input_qspec_map[producer_node.args[1]] = ( + quant_config.weight + ) + if len(producer_node.args) > 2 and quant_config.bias is not None: + quantization_annotation.input_qspec_map[producer_node.args[2]] = ( + quant_config.bias + ) + + 
producer_node.meta["quantization_annotation"] = quantization_annotation + producer_node.meta["quantization_annotation"]._annotated = True + # out setting for activation node (relu_node) + quantization_annotation = relu_node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + quantization_annotation.output_qspec = quant_config.output_activation + + relu_node.meta["quantization_annotation"] = quantization_annotation + relu_node.meta["quantization_annotation"]._annotated = True + continue + + +# CASE 2-1: two input case without Shared Quant +@register_annotator( + [ + torch.ops.aten.div, + torch.ops.aten.div.Tensor, + torch.ops.aten.divide.Tensor, + torch.ops.aten.matmul.default, + torch.ops.aten.bmm.default, + torch.ops.aten.sum.dim_IntList, + ] +) +def annotate_2in1out(node: Node, quant_config: QuantizationConfig) -> None: + input_act0 = node.args[0] + input_act1 = node.args[1] + # skipping quantization if 1st input is not float. + if _is_annotated([node]) or not _is_float_tensor(input_act0): + return + + input_act_qspec = quant_config.input_activation + output_act_qspec = ( + quant_config.output_activation if _is_float_tensor(node) else None + ) + + input_qspec_map = {} + if _is_float_tensor(input_act0): + input_qspec_map[input_act0] = input_act_qspec + + if _is_float_tensor(input_act1): + input_qspec_map[input_act1] = input_act_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + + +# getting QuantAnnot though the first input +def _get_quantization_annotation(node: Node): + if node.op == "placeholder": + return False + elif "quantization_annotation" in node.meta: + return node + elif node.args == (): + return False + elif isinstance(node.args[0], Node): + return _get_quantization_annotation(node.args[0]) + elif isinstance(node.args[0], list): + # for cat, concatenate and stack + if isinstance(node.args[0][0], Node): + return 
_get_quantization_annotation(node.args[0][0]) + else: + return False + else: + return False + + +# CASE 2-2: two input case with Shared Quant +# ops.add / ops.add_ are processed by another annotator +@register_annotator( + [ + torch.ops.aten.sub, + torch.ops.aten.mul, + torch.ops.aten.sub.Tensor, + torch.ops.aten.mul.Tensor, + torch.ops.aten.sub_.Tensor, + torch.ops.aten.mul_.Tensor, + torch.ops.aten.rsub.Scalar, + torch.ops.aten.mul.Scalar, + ] +) +def annotate_2in1out_with_SharedQuant( + node: Node, quant_config: QuantizationConfig +) -> None: + + input_qspec_map = {} + input0 = node.args[0] + input1 = node.args[1] + + # skipping quantization if 1st input is not float. + if _is_annotated([node]) or not _is_float_tensor(input0): + return + if ( + isinstance(input0, Node) + and isinstance(input1, float) + and not _get_quantization_annotation(input0) + ): + return + if ( + isinstance(input0, float) + and isinstance(input1, Node) + and not _get_quantization_annotation(input1) + ): + return + if isinstance(input0, Node) and isinstance(input1, Node): + shared_qspec = SharedQuantizationSpec((input0, node)) + input_qspec_map[input0] = quant_config.input_activation + input_qspec_map[input1] = shared_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_qspec, + _annotated=True, + ) + + else: + input_act_qspec = quant_config.input_activation + output_act_qspec = ( + quant_config.output_activation if _is_float_tensor(node) else None + ) + + input_qspec_map = {} + input_act0 = node.args[0] + if _is_float_tensor(input_act0): + input_qspec_map[input_act0] = input_act_qspec + + input_act1 = node.args[1] + if _is_float_tensor(input_act1): + input_qspec_map[input_act1] = input_act_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + + +# CASE 2-3: only for add ops +@register_annotator(ADD_OPS) 
+def annotate_add_ops_with_SharedQuant( + node: Node, quant_config: QuantizationConfig +) -> None: + + input_qspec_map = {} + input0 = node.args[0] + input1 = node.args[1] + + # skipping quantization if 1st input is not float. + if _is_annotated([node]) or not _is_float_tensor(input0): + return + + if isinstance(input0, Node) and isinstance(input1, Node): + NonQuantShare_ops_for_add = [torch.ops.aten.dropout.default] + ADD_OPS + if ( + input0.op == "call_function" and input0.target in NonQuantShare_ops_for_add + ) or ( + input1.op == "call_function" and input1.target in NonQuantShare_ops_for_add + ): + input_act_qspec = quant_config.input_activation + output_act_qspec = ( + quant_config.output_activation if _is_float_tensor(node) else None + ) + + input_qspec_map = {} + input_act0 = node.args[0] + if _is_float_tensor(input_act0): + input_qspec_map[input_act0] = input_act_qspec + + input_act1 = node.args[1] + if _is_float_tensor(input_act1): + input_qspec_map[input_act1] = input_act_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + else: + shared_qspec = SharedQuantizationSpec((input0, node)) + input_qspec_map[input0] = quant_config.input_activation + input_qspec_map[input1] = shared_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_qspec, + _annotated=True, + ) + elif ( + isinstance(input0, Node) + and isinstance(input1, float) + and not _get_quantization_annotation(input0) + ): + pass + elif ( + isinstance(input0, float) + and isinstance(input1, Node) + and not _get_quantization_annotation(input1) + ): + pass + else: + input_act_qspec = quant_config.input_activation + output_act_qspec = ( + quant_config.output_activation if _is_float_tensor(node) else None + ) + + input_qspec_map = {} + input_act0 = node.args[0] + if _is_float_tensor(input_act0): + 
input_qspec_map[input_act0] = input_act_qspec + + input_act1 = node.args[1] + if _is_float_tensor(input_act1): + input_qspec_map[input_act1] = input_act_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + + +# CASE 3-1: Single input + Single Out case without Shared Quant +@register_annotator( + [ + torch.ops.aten.ceil.default, + torch.ops.aten.clamp.default, + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + torch.ops.aten.relu6.default, + torch.ops.aten.relu6_.default, + torch.ops.aten.cos.default, + torch.ops.aten.sin.default, + torch.ops.aten.tanh.default, + torch.ops.aten.hardswish.default, + torch.ops.aten.hardswish_.default, + torch.ops.aten.hardsigmoid.default, + torch.ops.aten.hardsigmoid_.default, + torch.ops.aten.hardtanh.default, + torch.ops.aten.hardtanh_.default, + torch.ops.aten.mean.default, + torch.ops.aten.adaptive_avg_pool2d.default, + torch.ops.aten.avg_pool2d.default, + torch.ops.aten.leaky_relu.default, + torch.ops.aten.leaky_relu_.default, + torch.ops.aten.prelu.default, + torch.ops.aten.upsample_bilinear2d.vec, + torch.ops.aten.upsample_nearest2d.vec, + torch.ops.aten.mean.dim, + torch.ops.aten.sqrt.default, + torch.ops.aten.gelu.default, + torch.ops.aten.scaled_dot_product_attention.default, + torch.ops.aten.rsqrt.default, + torch.ops.aten.pow.Tensor_Scalar, + torch.ops.aten.topk.default, + ] +) +def annotate_1in1out(node: Node, quant_config: QuantizationConfig) -> None: + # skipping quantization if input is not float. + if _is_annotated([node]) or not _is_float_tensor(node.args[0]): + return + + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + # one inputs + one output case. 
+ input_act_qspec = quant_config.input_activation + quantization_annotation.input_qspec_map[node.args[0]] = input_act_qspec + quantization_annotation.output_qspec = quant_config.output_activation + + node.meta["quantization_annotation"] = quantization_annotation + node.meta["quantization_annotation"]._annotated = True + + +# CASE 3-2: Single input + Single Out case with Shared Quant +@register_annotator( + [ + torch.ops.aten.permute.default, + torch.ops.aten.view.default, + torch.ops.aten._unsafe_view.default, + torch.ops.aten.squeeze.default, + torch.ops.aten.squeeze.dim, + torch.ops.aten.squeeze_copy.dims, + torch.ops.aten.unsqueeze.default, + torch.ops.aten.unsqueeze_copy.default, + torch.ops.aten.transpose.int, + torch.ops.aten.expand.default, + torch.ops.aten.max_pool2d.default, + torch.ops.aten.max_pool2d_with_indices.default, + torch.ops.aten.reshape.default, + torch.ops.aten.select.int, + torch.ops.aten.flatten.using_ints, + torch.ops.aten.pad.default, + torch.ops.aten.slice.Tensor, + torch.ops.aten.to.dtype, + ] +) +def annotate_1in1out_with_SharedQuant( + node: Node, quant_config: QuantizationConfig +) -> None: + input_qspec_map = {} + input = node.args[0] + assert isinstance(input, Node) + if _is_annotated([node]) or not _is_float_tensor(input): + return + + shared_qspec = SharedQuantizationSpec((input, node)) + + # get QuantAnnot from the input path + shared_quant_node = _get_quantization_annotation(input) + if shared_quant_node: + input_qspec_map[shared_quant_node] = SharedQuantizationSpec(shared_quant_node) + shared_qspec = SharedQuantizationSpec((shared_quant_node, node)) + else: + # if no QuantAnnot in the input path + input_qspec_map[input] = quant_config.input_activation + shared_qspec = SharedQuantizationSpec((input, node)) + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_qspec, + _annotated=True, + ) + + +# CASE 3-3: Single input + Single Out case with FP 
+@register_annotator( + [ + torch.ops.aten.softmax.int, + torch.ops.aten._softmax.default, + torch.ops.aten._safe_softmax.default, + torch.ops.aten.log_softmax.int, + ] +) +def annotate_1in1out_with_SharedQuant_for_FP( + node: Node, quant_config: QuantizationConfig +) -> None: + input_qspec_map = {} + input = node.args[0] + assert isinstance(input, Node) + + if _is_annotated([node]) or not _is_float_tensor(input): + return + + if input.target in ADD_OPS and _is_annotated([input]): + del input.meta["quantization_annotation"] + + # get QuantAnnot from the input path + shared_quant_node = _get_quantization_annotation(input) + if shared_quant_node: + # if QuantAnnot in the input path, input_qspec is shared, but output_qspec is not. + input_qspec_map[shared_quant_node] = SharedQuantizationSpec(shared_quant_node) + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quant_config.output_activation, + _annotated=True, + ) + else: + # if no QuantAnnot in the input path + node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=quant_config.output_activation, + _annotated=True, + ) + + +# CASE 4: One value input + one index input with Shared Quant +@register_annotator([torch.ops.aten.index.Tensor]) +def annotate_index(node: Node, quant_config: QuantizationConfig) -> None: + input_qspec_map = {} + input = node.args[0] + assert isinstance(input, Node) + + if _is_annotated([node]) or not _is_float_tensor(input): + return + + # get QuantAnnt from the input path + shared_quant_node = _get_quantization_annotation(input) + if shared_quant_node: + shared_qspec = SharedQuantizationSpec((shared_quant_node, node)) + input_qspec_map[input] = quant_config.input_activation + + # sharing QuantAnnot with the parent + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_qspec, + _annotated=True, + ) + + +# CASE 5 input + index + value & output 
with Shared Quant +@register_annotator( + [torch.ops.aten.index_put.default, torch.ops.aten.index_put_.default] +) +def annotate_index_put(node: Node, quant_config: QuantizationConfig) -> None: + input_qspec_map = {} + input = node.args[0] # from KVCache in LLAMA + value = node.args[2] # from linear projection layer + assert isinstance(input, Node) + assert isinstance(value, Node) + + if _is_annotated([node]) or not _is_float_tensor(input): + return + + # get QuantAnnot from input path + shared_quant_node = _get_quantization_annotation(input) + if shared_quant_node: + shared_qspec = SharedQuantizationSpec((shared_quant_node, node)) + input_qspec_map[input] = shared_qspec + input_qspec_map[value] = shared_qspec + output_qspec = shared_qspec + else: + # if no QuantAnnot in input path, asign the default QuantAnnot from quant_config. + input_qspec_map[input] = quant_config.input_activation + input_qspec_map[value] = SharedQuantizationSpec((input, node)) + output_qspec = SharedQuantizationSpec((input, node)) + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_qspec, + _annotated=True, + ) + + +# CASE 6 unbind + getitem case +# (inputQuant--unbinde--no Qunat) --> (no Qunat--getitem--outputQuant) +@register_annotator([torch.ops.aten.unbind.int]) +def annotate_unbind(node: Node, quant_config: QuantizationConfig) -> None: + input_qspec_map = {} + input = node.args[0] + assert isinstance(input, Node) + + if _is_annotated([node]) or not _is_float_tensor(input): + return + + # get QuantAnnot from input path + shared_quant_node = _get_quantization_annotation(input) + if shared_quant_node: + input_qspec_map[input] = quant_config.input_activation + shared_qspec = SharedQuantizationSpec((shared_quant_node, node)) + else: + # if no QuantAnnot in input path, asign the default QuantAnnot from quant_config. 
+ input_qspec_map[input] = quant_config.input_activation + shared_qspec = SharedQuantizationSpec((input, node)) + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_qspec, + _annotated=True, + ) + + for users_node in node.users: + users_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=shared_qspec, + _annotated=True, + ) + + +# CASE 7: stand-alone Conv2d and Conv1d +@register_annotator( + [ + torch.ops.aten.conv2d.default, + torch.ops.aten.conv1d.default, + torch.ops.aten.linear.default, + ] +) +def annotate_conv2d(node: Node, quant_config: QuantizationConfig) -> None: + # skipping quantization if weights are not float + if _is_annotated([node]) or not _is_float_tensor(node.args[1]): + return + + input = node.args[0] + # input & weight (or bias) setting for Conv node(producer_node) + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + shared_quant_node = _get_quantization_annotation(input) + if shared_quant_node: + quantization_annotation.input_qspec_map[input] = SharedQuantizationSpec( + shared_quant_node + ) + else: + quantization_annotation.input_qspec_map[input] = quant_config.input_activation + quantization_annotation.input_qspec_map[node.args[1]] = quant_config.weight + if len(node.args) > 2 and quant_config.bias is not None: + quantization_annotation.input_qspec_map[node.args[2]] = quant_config.bias + quantization_annotation.output_qspec = quant_config.output_activation + + node.meta["quantization_annotation"] = quantization_annotation + node.meta["quantization_annotation"]._annotated = True + + +# CASE 8: embedding +@register_annotator([torch.ops.aten.embedding.default]) +def annotate_embedding(node: Node, quant_config: QuantizationConfig) -> None: + input_qspec_map = {} + weight = node.args[0] + if 
_is_annotated([node]) or not _is_float_tensor(weight): + return + + input_qspec_map[weight] = quant_config.input_activation + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quant_config.output_activation, + _annotated=True, + ) + + +# CASE 9: Concat & Stack +@register_annotator( + [ + torch.ops.aten.cat.default, + torch.ops.aten.concat.default, + torch.ops.aten.stack.default, + ] +) +def annotate_cat(node: Node, quant_config: QuantizationConfig) -> None: + inputs = node.args[0] + first_input = inputs[0] + assert isinstance(inputs, list) + assert isinstance(first_input, Node) + + if _is_annotated([node]) or not _is_float_tensor(first_input): + return + + input_qspec_map = {} + shared_qspec = SharedQuantizationSpec((first_input, node)) + for input in inputs: + if input == first_input: + input_qspec_map[input] = quant_config.input_activation + else: + input_qspec_map[input] = shared_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_qspec, + _annotated=True, + ) + + +# CASE 10: various normalizations +@register_annotator([torch.ops.aten.rms_norm.default]) +def annotate_rms_norm(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + quantization_annotation.input_qspec_map[node.args[0]] = ( + quant_config.input_activation + ) # active + quantization_annotation.input_qspec_map[node.args[2]] = ( + quant_config.input_activation + ) # weight + quantization_annotation.output_qspec = quant_config.output_activation + node.meta["quantization_annotation"] = quantization_annotation + node.meta["quantization_annotation"]._annotated = True + + 
+@register_annotator([torch.ops.aten.group_norm.default]) +def annotate_group_norm(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + quantization_annotation.input_qspec_map[node.args[0]] = ( + quant_config.input_activation + ) # active + quantization_annotation.input_qspec_map[node.args[2]] = ( + quant_config.weight + ) # weight + quantization_annotation.output_qspec = quant_config.output_activation + + node.meta["quantization_annotation"] = quantization_annotation + node.meta["quantization_annotation"]._annotated = True + + +@register_annotator([torch.ops.aten.layer_norm.default]) +def annotate_layer_norm(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + quantization_annotation.input_qspec_map[node.args[0]] = ( + quant_config.input_activation + ) # active + quantization_annotation.input_qspec_map[node.args[2]] = ( + quant_config.input_activation + ) # weight + quantization_annotation.output_qspec = quant_config.output_activation + + node.meta["quantization_annotation"] = quantization_annotation + node.meta["quantization_annotation"]._annotated = True + + +@register_annotator([torch.ops.aten._native_batch_norm_legit_no_training.default]) +def annotate_batch_norm(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + 
quantization_annotation.input_qspec_map[node.args[0]] = ( + quant_config.input_activation + ) # active + + quantization_annotation.input_qspec_map[node.args[1]] = ( + quant_config.input_activation + ) # weight + quantization_annotation.output_qspec = quant_config.output_activation + + node.meta["quantization_annotation"] = quantization_annotation + node.meta["quantization_annotation"]._annotated = True + + +# CASE 11: Sigmoid +@register_annotator([torch.ops.aten.sigmoid, torch.ops.aten.sigmoid.default]) +def annotate_sigmoid(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + + input_qspec_map = {} + input_act = node.args[0] + input_qspec_map[input_act] = quant_config.input_activation + + assert isinstance(input_act, Node) + out_qconf = quant_config.output_activation + + q_max = ( + torch.iinfo(out_qconf.dtype).max + if out_qconf.quant_max is None + else out_qconf.quant_max + ) + q_min = ( + torch.iinfo(out_qconf.dtype).min + if out_qconf.quant_min is None + else out_qconf.quant_min + ) + + scale = 1 / (q_max - q_min + 1) + + bias_obs_ctr = FixedQParamsObserver.with_args( + scale=scale, + zero_point=0, + dtype=quant_config.output_activation.dtype, + qscheme=torch.torch.per_tensor_affine, + quant_max=q_max, + quant_min=q_min, + ) + + # make sigmoid map to the range between 0~1 + out_act_quantization_spec = QuantizationSpec( + dtype=quant_config.output_activation.dtype, + quant_max=q_max, + quant_min=q_min, + observer_or_fake_quant_ctr=bias_obs_ctr, + qscheme=torch.torch.per_tensor_affine, + ) + + if _is_float_tensor(node): + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=out_act_quantization_spec, + _annotated=True, + ) diff --git a/backends/samsung/quantizer/qconfig.py b/backends/samsung/quantizer/qconfig.py new file mode 100644 index 00000000000..f32c8d39796 --- /dev/null +++ b/backends/samsung/quantizer/qconfig.py @@ -0,0 +1,174 @@ +# Copyright (c) 2025 
Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass +from enum import IntEnum, unique +from typing import Callable, Optional + +import torch +from torchao.quantization.pt2e import ( + FakeQuantize, + MinMaxObserver, + PerChannelMinMaxObserver, +) +from torchao.quantization.pt2e.quantizer import QuantizationSpec + + +@unique +class Precision(IntEnum): + A8W8 = 3 + + +@dataclass(eq=True, frozen=True) +class QuantizationConfig: + input_activation: Optional[QuantizationSpec] + output_activation: Optional[QuantizationSpec] + weight: Optional[QuantizationSpec] + bias: Optional[QuantizationSpec | Callable] + + +def get_quant_config( + precision: Precision, + is_per_channel: bool = False, + is_qat: bool = False, +) -> QuantizationConfig: + + precision_mappings = { + Precision.A8W8: get_a8w8_enn_quant_config, + } + if precision not in precision_mappings: + raise RuntimeError("Unrecognized precision setting.") + + is_weight_symm = is_per_channel + + qconfig_fn = precision_mappings[precision] + return qconfig_fn(is_per_channel, is_qat, wei_symmetric=is_weight_symm) + + +def _get_activation_qspec( + dtype, + is_symmetric, + is_qat, + observer_cls=MinMaxObserver, + quant_min=None, + quant_max=None, +): + eps_value = 2**-12 + if quant_max is None: + quant_max = torch.iinfo(dtype).max + if quant_min is None: + quant_min = torch.iinfo(dtype).min + + qscheme = torch.per_tensor_symmetric if is_symmetric else torch.per_tensor_affine + if is_qat: + observer_or_fake_quant = FakeQuantize.with_args( + observer=observer_cls, eps=eps_value + ) + else: + observer_or_fake_quant = observer_cls.with_args(eps=eps_value) + + return QuantizationSpec( + dtype=dtype, + quant_min=quant_min, + quant_max=quant_max, + qscheme=qscheme, + observer_or_fake_quant_ctr=observer_or_fake_quant, + ) + + +def _get_weight_qspec( + dtype, 
is_symmetric, is_per_channel, is_qat, quant_min=None, quant_max=None +): + assert is_symmetric or not is_per_channel, "Not support asymm+perchannel mode" + + eps_value = 2**-12 + + if quant_max is None: + quant_max = torch.iinfo(dtype).max + if quant_min is None: + quant_min = torch.iinfo(dtype).min + + if not is_per_channel: + qscheme = ( + torch.per_tensor_symmetric if is_symmetric else torch.per_tensor_affine + ) + observer_cls = MinMaxObserver + else: + qscheme = ( + torch.per_channel_symmetric if is_symmetric else torch.per_channel_affine + ) + observer_cls = PerChannelMinMaxObserver + + if is_qat: + observer_or_fake_quant = FakeQuantize.with_args( + observer=observer_cls, eps=eps_value + ) + else: + observer_or_fake_quant = observer_cls.with_args(eps=eps_value) + + return QuantizationSpec( + dtype=dtype, + quant_min=quant_min, + quant_max=quant_max, + qscheme=qscheme, + ch_axis=0, + observer_or_fake_quant_ctr=observer_or_fake_quant, + ) + + +def get_a8w8_enn_quant_config( + is_per_channel=True, is_qat=False, act_symmetric=False, wei_symmetric=False +) -> QuantizationConfig: + act_quantization_spec = _get_activation_qspec(torch.int8, act_symmetric, is_qat) + wgt_quantization_spec = _get_weight_qspec( + torch.int8, wei_symmetric, is_per_channel, is_qat + ) + bias_quantization_spec = None + quantization_config = QuantizationConfig( + input_activation=act_quantization_spec, + output_activation=act_quantization_spec, + weight=wgt_quantization_spec, + bias=bias_quantization_spec, + ) + return quantization_config + + +class QuantInfo: + def __init__(self, torch_dtype: torch.dtype, string: str): + self._torch_dtype = torch_dtype + self._string = string + + @property + def torch_dtype(self): + return self._torch_dtype + + @property + def string(self): + return self._string + + +class QuantInfoManager: + QUANT_INFO_MAP = { + Precision.A8W8: (QuantInfo(torch.int8, "INT8"), QuantInfo(torch.int8, "INT8")), + } + FP_INFO = ( + QuantInfo(torch.float32, "FLOAT32"), + 
QuantInfo(torch.float32, "FLOAT32"), + ) + + def __init__(self): + self.precision = None + + def set_precision(self, precision: Precision): + self.precision = precision + + @property + def weight_precison(self) -> Optional[QuantInfo]: + return self.QUANT_INFO_MAP.get(self.precision, self.FP_INFO)[0] + + @property + def act_precision(self) -> Optional[QuantInfo]: + return self.QUANT_INFO_MAP.get(self.precision, self.FP_INFO)[1] diff --git a/backends/samsung/quantizer/quantizer.py b/backends/samsung/quantizer/quantizer.py new file mode 100644 index 00000000000..cf46677d000 --- /dev/null +++ b/backends/samsung/quantizer/quantizer.py @@ -0,0 +1,65 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Callable, Sequence + +import torch +from torch.fx import GraphModule +from torchao.quantization.pt2e.quantizer import Quantizer + +from .annotator import annotate +from .qconfig import get_quant_config, Precision, QuantInfoManager + + +global_quant_info = QuantInfoManager() + + +class EnnQuantizer(Quantizer): + + def __init__(self): + super().__init__() + + self._precision = Precision.A8W8 + global_quant_info.set_precision(self._precision) + self._is_per_channel = True + self._is_qat = False + self.custom_quant_annotations: Sequence[Callable] = [] + + def setup_precision(self, quant_dtype: Precision) -> None: + assert quant_dtype in Precision, f"No support for Precision {quant_dtype}." + self._precision = quant_dtype + global_quant_info.set_precision(self._precision) + + def setup_quant_params( + self, quant_dtype: Precision, is_per_channel=True, is_qat=False + ) -> None: + assert quant_dtype in Precision, f"No support for Precision {quant_dtype}." 
+ self._precision = quant_dtype + self._is_per_channel = is_per_channel + self._is_qat = is_qat + + def annotate(self, model: GraphModule) -> GraphModule: + self._annotate(model) + self._annotate_custom_annotation(model) + return model + + def _annotate(self, gm: GraphModule) -> None: + quant_config = get_quant_config( + self._precision, self._is_per_channel, self._is_qat + ) + annotate(gm.graph, quant_config) + + def add_custom_quant_annotations( + self, custom_quant_annotations: Sequence[Callable] + ) -> None: + self.custom_quant_annotations = custom_quant_annotations + + def _annotate_custom_annotation(self, gm: GraphModule) -> None: + for annotation_func in self.custom_quant_annotations: + annotation_func(gm) + + def validate(self, model: torch.fx.GraphModule) -> None: + return diff --git a/backends/samsung/serialization/compile_options.py b/backends/samsung/serialization/compile_options.py index 1ad2350cfeb..a4af40368e9 100644 --- a/backends/samsung/serialization/compile_options.py +++ b/backends/samsung/serialization/compile_options.py @@ -11,7 +11,8 @@ from dataclasses import dataclass from enum import IntEnum, unique -import pkg_resources +from importlib.resources import files + from executorch.exir._serialize._dataclass import _DataclassEncoder from executorch.exir._serialize._flatbuffer import _flatc_compile from executorch.exir.backend.backend_details import CompileSpec @@ -36,12 +37,15 @@ def gen_samsung_backend_compile_spec_core(options: EnnExecuTorchOptions) -> Comp with tempfile.TemporaryDirectory() as d: # schema schema_path = os.path.join(d, "{}.fbs".format(COMPILE_OPTION_SCHEMA_NAME)) + + schema_content = ( + files(__package__) + .joinpath(f"{COMPILE_OPTION_SCHEMA_NAME}.fbs") + .read_bytes() + ) + with open(schema_path, "wb") as schema_file: - schema_file.write( - pkg_resources.resource_string( - __name__, "{}.fbs".format(COMPILE_OPTION_SCHEMA_NAME) - ) - ) + schema_file.write(schema_content) # dump json json_path = os.path.join(d, 
"{}.json".format(COMPILE_OPTION_SCHEMA_NAME)) enn_options_json = json.dumps(options, cls=_DataclassEncoder, indent=4) diff --git a/backends/samsung/serialization/enn_graph_schema.py b/backends/samsung/serialization/enn_graph_schema.py index 7e74182f9d7..5209a8672ee 100644 --- a/backends/samsung/serialization/enn_graph_schema.py +++ b/backends/samsung/serialization/enn_graph_schema.py @@ -5,13 +5,16 @@ # LICENSE file in the root directory of this source tree. import logging -from typing import Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union import executorch.backends.samsung.python.PyGraphWrapperAdaptor as PyGraphWrapper import numpy as np import torch +from executorch.backends.samsung.builders.utils import DATA_TYPE_STR_MAPPING +from executorch.backends.samsung.utils.constants import QuantConstants +from executorch.backends.samsung.utils.utils import quantize_tensor class EnnGraph: @@ -24,6 +27,10 @@ def __init__(self): self.inputs = [] self.outputs = [] + def init(self, name: str, soc_name): + self.name = name + self.soc_name = soc_name + def define_op( self, name, @@ -46,22 +53,54 @@ def define_op( py_param_wrapper.SetScalarValue(params[key]) else: logging.error("Unsupported param type.") + # Set op.AddOpParam(py_param_wrapper) self.graph.DefineOpNode(op) - def define_tensor( + def define_tensor( # noqa: C901 self, name: str, shape: List, data_type: str, tensor_type: str, data: Optional[Union[np.ndarray, torch.Tensor]] = None, + quant_param: Optional[Dict[str, Any]] = None, ) -> int: layout = "NCHW" if len(shape) == 4 else "UNDEFINED" + if quant_param is not None: + data_type = DATA_TYPE_STR_MAPPING[ + quant_param[QuantConstants.QUANT_KEY.quant_dtype] + ] + tensor = PyGraphWrapper.PyEnnTensorWrapper(name, shape, data_type, layout) + if quant_param is not None: + need_quantize = True + + scales = self._affine_meta_param( + quant_param[QuantConstants.QUANT_KEY.scale] + ) + zero_points = self._affine_meta_param( + 
quant_param[QuantConstants.QUANT_KEY.zero_point] + ) + q_dtype = self._affine_meta_param( + quant_param[QuantConstants.QUANT_KEY.quant_dtype] + ) + tensor.AddQuantizeParam(q_dtype, scales, zero_points) + + if need_quantize and data is not None: + if isinstance(data, np.ndarray): + data = torch.tensor(data) + data = quantize_tensor( + data, + scales, + zero_points, + quant_param[QuantConstants.QUANT_KEY.quant_dtype], + axis=quant_param.get("axis"), + ) + if data is not None: if isinstance(data, torch.Tensor): data = data.detach().numpy() @@ -83,3 +122,20 @@ def finish(self): def serialize(self): return self.graph.Serialize() + + @staticmethod + def _affine_meta_param(param: Any) -> str: + type_str_affine_table = { + torch.int8: "AINT8", + } + if isinstance(param, str): + return param + if isinstance(param, (float, int)): + return [param] + if hasattr(param, "tolist"): + return param.tolist() + if isinstance(param, torch.dtype): + # Convenient for debugging + param = type_str_affine_table.get(param, "") + + return param diff --git a/backends/samsung/utils/constants.py b/backends/samsung/utils/constants.py new file mode 100644 index 00000000000..7c3997b9fe2 --- /dev/null +++ b/backends/samsung/utils/constants.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from executorch.exir.dialects._ops import ops as exir_ops + + +class QuantConstants: + # TODO: check keys + class QUANT_KEY: + scale = "scales" + zero_point = "zero_points" + quant_min = "quant_min" + quant_max = "quant_max" + quant_dtype = "quant_dtype" + + PERCHANNEL_KEY_MAP = { + "scales": QUANT_KEY.scale, + "zero_points": QUANT_KEY.zero_point, + "quant_min": QUANT_KEY.quant_min, + "quant_max": QUANT_KEY.quant_max, + "dtype": QUANT_KEY.quant_dtype, + } + # SNC ir always use key 'scales' and 'zero_points' + PERTENSOR_KEY_MAP = { + "scale": QUANT_KEY.scale, + "zero_point": QUANT_KEY.zero_point, + "quant_min": QUANT_KEY.quant_min, + "quant_max": QUANT_KEY.quant_max, + "dtype": QUANT_KEY.quant_dtype, + } + + QUANT_OPS_KEY_MAP = { + exir_ops.edge.quantized_decomposed.quantize_per_channel.default: PERCHANNEL_KEY_MAP, + exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: PERTENSOR_KEY_MAP, + exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor: PERTENSOR_KEY_MAP, + } + + DEQUANT_OPS_KEY_MAP = { + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: PERTENSOR_KEY_MAP, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor: PERTENSOR_KEY_MAP, + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: PERCHANNEL_KEY_MAP, + } diff --git a/backends/samsung/utils/export_utils.py b/backends/samsung/utils/export_utils.py index aaf407ef0b3..39992f2ea2a 100644 --- a/backends/samsung/utils/export_utils.py +++ b/backends/samsung/utils/export_utils.py @@ -4,20 +4,30 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import Optional, Tuple +import logging +from typing import List, Optional, Tuple import executorch.exir as exir import torch +from executorch.backends.samsung._passes.fuse_conv_act import FuseConvActPass +from executorch.backends.samsung._passes.remove_useless_ops import RemoveUselessOpPass from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner +from executorch.backends.samsung.quantizer.quantizer import EnnQuantizer, Precision +from executorch.backends.transforms.decompose_sdpa import ( + DecomposeScaledDotProductAttention, +) from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform from executorch.exir import EdgeCompileConfig from executorch.exir.backend.backend_details import CompileSpec - from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_manager import PassType from executorch.exir.program._program import to_edge_transform_and_lower +from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e def get_edge_compile_config(): + # Maybe most ops in non-decomposition list should be added here + # TODO: to confirm whether all op in none-decomposed table should be added here return EdgeCompileConfig( _skip_dim_order=True, _core_aten_ops_exception_list=[ @@ -29,24 +39,55 @@ def get_edge_compile_config(): exir_ops.edge.aten._safe_softmax.default, exir_ops.edge.aten.layer_norm.default, exir_ops.edge.aten.matmul.default, + exir_ops.edge.aten.hardsigmoid.default, ], ) +def get_enn_pass_list() -> List[PassType]: + return [ + RemoveUselessOpPass(), + RemoveCloneOpsTransform(), + FuseConvActPass(), + ] + + +def quantize_module( + module: torch.nn.Module, + inputs, + calibration_dataset, + precision: Precision, + is_per_channel: bool = True, + is_qat: bool = False, +) -> torch.nn.Module: + quantizer = EnnQuantizer() + quantizer.setup_quant_params(precision, is_per_channel, is_qat) + logging.info("Export nn module for quantization...") + exported_module = 
torch.export.export_for_training(module, inputs).module()
+    DecomposeScaledDotProductAttention()(exported_module)
+    logging.info("Quantizing the module...")
+    annotated_module = prepare_pt2e(exported_module, quantizer)
+    for data in calibration_dataset:
+        annotated_module(*data)
+    quantized_module = convert_pt2e(annotated_module, fold_quantize=False)
+    logging.info("Quantizing finished.")
+    return quantized_module
+
+
 def to_edge_transform_and_lower_to_enn(
     module: torch.nn.Module,
     inputs: Tuple[torch.Tensor],
+    custom_pass_config: List[PassType] = None,
     compile_specs: Optional[CompileSpec] = None,
 ) -> exir.ExecutorchProgramManager:
-    assert (
-        compile_specs is not None
-    ), "Please provide compile specifications for enn backend"
+    assert compile_specs is not None, "For now, we must deliver compile specs"
     prog = torch.export.export(module, inputs)
-
-    ahead_pass_list = [RemoveCloneOpsTransform()]
+    pass_list = get_enn_pass_list()
+    if custom_pass_config:
+        pass_list.extend(custom_pass_config)
     return to_edge_transform_and_lower(
         prog,
-        ahead_pass_list,
+        pass_list,
         {"forward": [EnnPartitioner(compile_specs)]},
         compile_config=get_edge_compile_config(),
     )
diff --git a/backends/samsung/utils/utils.py b/backends/samsung/utils/utils.py
index 5da9808f38f..bbbec518b2a 100644
--- a/backends/samsung/utils/utils.py
+++ b/backends/samsung/utils/utils.py
@@ -4,12 +4,13 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-from typing import List +from typing import List, Optional, Tuple import torch from executorch.backends.transforms.utils import is_param_node from executorch.exir.backend.backend_details import CompileSpec +from executorch.exir.dialects._ops import ops as exir_ops from torch.export.exported_program import ExportedProgram @@ -35,3 +36,90 @@ def is_graph_output(node: torch.fx.Node) -> bool: ): return True return False + + +def _quantize_per_tensor( + in_tensor: torch.Tensor, + scales: List[float], + zeropoints: List[int], + dtype: torch.dtype, + qrange: Optional[Tuple[int, int]], +): + assert ( + len(scales) == 1 + ), "For per-tensor quantization, there should be only one scale/zeropoint" + return exir_ops.edge.quantized_decomposed.quantize_per_tensor.default( + in_tensor, + torch.Tensor(scales), + torch.Tensor(zeropoints), + qrange[0], + qrange[1], + dtype, + ) + + +def _quantize_per_channel( + in_tensor: torch.Tensor, + scales: List[float], + zeropoints: List[int], + dtype: torch.dtype, + qrange: Optional[Tuple[int, int]], + axis: Optional[int], # Only for per-channel +): + assert ( + len(scales) == in_tensor.shape[axis] + ), "Shape not match for quant params and input tensor" + return exir_ops.edge.quantized_decomposed.quantize_per_channel.default( + in_tensor, + torch.Tensor(scales), + torch.Tensor(zeropoints), + axis, + qrange[0], + qrange[1], + dtype, + ) + + +def quantize_tensor( + in_tensor: torch.Tensor, + scales: List[float], + zeropoints: List[int], + dtype: torch.dtype, + qrange: Optional[Tuple[int, int]] = None, + axis: Optional[int] = None, # Only for per-channel +) -> torch.Tensor: + """ + To quantize constant tensor by executorch OPs. If `axis` not set, we quantize the tensor by per tensor. + If `axis` was set, we do per-channel quantize. + + :param in_tensor: The tensor to be quantized + :param scales: List of scales. For per-tensor quantization, it should contain only one element + :param zeropoints: List of zeropoints. 
For per-tensor quantization, it should contain only one element
+    :param dtype: The output dtype
+    :param qrange: The quantization range (qmin, qmax).
+        If not set, we will get the maximum range of the dtype by `torch.iinfo`
+    :param axis: We do per-channel quantize by which axis.
+        Only when this parameter set, we do per-channel quantization
+    :type in_tensor: torch.Tensor
+    :type scales: List[float]
+    :type zeropoints: List[int]
+    :type dtype: torch.dtype
+    :type qrange: Optional[Tuple[int,int]]
+    :type axis: Optional[int]
+    :return: The quantized tensor
+    """
+    assert len(scales) == len(
+        zeropoints
+    ), "scales should have same shape with zeropoints"
+    if not qrange:
+        qrange = (torch.iinfo(dtype).min, torch.iinfo(dtype).max)
+
+    if axis is not None:
+        return _quantize_per_channel(in_tensor, scales, zeropoints, dtype, qrange, axis)
+    return _quantize_per_tensor(
+        in_tensor,
+        scales,
+        zeropoints,
+        dtype,
+        qrange,
+    )
diff --git a/backends/test/harness/tester.py b/backends/test/harness/tester.py
index 351bab4a605..02c6fc4c82d 100644
--- a/backends/test/harness/tester.py
+++ b/backends/test/harness/tester.py
@@ -1,3 +1,8 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+ import random from collections import Counter, OrderedDict from typing import Any, Callable, Dict, List, Optional, Tuple @@ -62,6 +67,7 @@ def __init__( StageType.RUN_PASSES: [ StageType.PARTITION, StageType.TO_EDGE_TRANSFORM_AND_LOWER, + StageType.TO_EXECUTORCH, ], # TODO Make this Stage optional StageType.PARTITION: [StageType.TO_EXECUTORCH], diff --git a/backends/test/multi_method_delegate_test.cpp b/backends/test/multi_method_delegate_test.cpp index e24585434c4..bf17d7c8743 100644 --- a/backends/test/multi_method_delegate_test.cpp +++ b/backends/test/multi_method_delegate_test.cpp @@ -5,6 +5,10 @@ #include #include +#include + +#include +#include #include #include @@ -12,6 +16,11 @@ #include #include +using executorch::backends::xnnpack::workspace_sharing_mode_option_key; +using executorch::backends::xnnpack::WorkspaceSharingMode; +using executorch::backends::xnnpack::xnnpack_backend_key; + +using executorch::runtime::BackendOptions; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::HierarchicalAllocator; @@ -126,34 +135,61 @@ class XNNPACKMultiDelegateTest : public ETPTEMethodRunBaseTest { num_threads = 40; kMethodName = "forward"; } -}; -// This test is to validate the assumption that the delegate is thread safe. -// That includes the following: -// 1. The delegate can be initilized by multiple threads in parallel. -// 2. The delegate can be executed by multiple threads in parallel. -// 3. The delegate can be destroyed by multiple threads in parallel. -// Regardless of the underlying implementation of the delegate. -// This is particularly important when we have shared resources across -// delegate instances through a singleton backend instance. 
-TEST_F(XNNPACKMultiDelegateTest, MultipleThreads) { - ASSERT_NE(kTestPTE1Path.size(), 0); - ASSERT_NE(kTestPTE2Path.size(), 0); - ASSERT_NE(num_threads, 0); - ASSERT_NE(kMethodName.size(), 0); - - std::vector threads(num_threads); - std::atomic count{0}; - - for (int i = 0; i < num_threads; i++) { - threads[i] = std::thread([&, i]() { - run(i, i % 7 ? kTestPTE1Path : kTestPTE2Path, kMethodName, count); - }); + // This test is to validate the assumption that the delegate is thread safe. + // That includes the following: + // 1. The delegate can be initilized by multiple threads in parallel. + // 2. The delegate can be executed by multiple threads in parallel. + // 3. The delegate can be destroyed by multiple threads in parallel. + // Regardless of the underlying implementation of the delegate. + // This is particularly important when we have shared resources across + // delegate instances through a singleton backend instance. + void runStressTest() { + ASSERT_NE(kTestPTE1Path.size(), 0); + ASSERT_NE(kTestPTE2Path.size(), 0); + ASSERT_NE(num_threads, 0); + ASSERT_NE(kMethodName.size(), 0); + + std::vector threads(num_threads); + std::atomic count{0}; + + for (int i = 0; i < num_threads; i++) { + threads[i] = std::thread([&, i]() { + run(i, i % 7 ? 
kTestPTE1Path : kTestPTE2Path, kMethodName, count); + }); + } + for (int i = 0; i < num_threads; i++) { + threads[i].join(); + } + ASSERT_EQ(count, num_threads); } - for (int i = 0; i < num_threads; i++) { - threads[i].join(); + + void setWorkspaceSharingMode(WorkspaceSharingMode mode) { + executorch::runtime::runtime_init(); + + BackendOptions<1> backend_options; + backend_options.set_option( + workspace_sharing_mode_option_key, static_cast(mode)); + + auto status = executorch::runtime::set_option( + xnnpack_backend_key, backend_options.view()); + ASSERT_EQ(status, Error::Ok); } - ASSERT_EQ(count, num_threads); +}; + +TEST_F(XNNPACKMultiDelegateTest, MultipleThreadsSharingDisabled) { + setWorkspaceSharingMode(WorkspaceSharingMode::Disabled); + runStressTest(); +} + +TEST_F(XNNPACKMultiDelegateTest, MultipleThreadsPerModelSharing) { + setWorkspaceSharingMode(WorkspaceSharingMode::PerModel); + runStressTest(); +} + +TEST_F(XNNPACKMultiDelegateTest, MultipleThreadsGlobalSharing) { + setWorkspaceSharingMode(WorkspaceSharingMode::Global); + runStressTest(); } // TODO(T208989291): Add more tests here. For example, diff --git a/backends/test/suite/README.md b/backends/test/suite/README.md index 564f44362ad..901cd461dbe 100644 --- a/backends/test/suite/README.md +++ b/backends/test/suite/README.md @@ -5,37 +5,71 @@ This directory contains tests that validate correctness and coverage of backends These tests are intended to ensure that backends are robust and provide a smooth, "out-of-box" experience for users across the full span of input patterns. They are not intended to be a replacement for backend-specific tests, as they do not attempt to validate performance or that backends delegate operators that they expect to. ## Running Tests and Interpreting Output -Tests can be run from the command line, either using the runner.py entry point or the standard Python unittest runner. 
When running through runner.py, the test runner will report test statistics, including the number of tests with each result type. +Tests can be run from the command line using pytest. When generating a JSON test report, the runner will report detailed test statistics, including output accuracy, delegated nodes, lowering timing, and more. -Backends can be specified with the `ET_TEST_ENABLED_BACKENDS` environment variable. By default, all available backends are enabled. Note that backends such as Core ML or Vulkan may require specific hardware or software to be available. See the documentation for each backend for information on requirements. +Each backend and test flow (recipe) registers a pytest [marker](https://docs.pytest.org/en/stable/example/markers.html) that can be passed to pytest with the `-m marker` argument to filter execution. -Example: +To run all XNNPACK backend operator tests: ``` -ET_TEST_ENABLED_BACKENDS=xnnpack python -m executorch.backends.test.suite.runner +pytest -c /dev/nul backends/test/suite/operators/ -m backend_xnnpack -n auto ``` +To run all model tests for the CoreML static int8 lowering flow: +``` +pytest -c /dev/nul backends/test/suite/models/ -m flow_coreml_static_int8 -n auto ``` -2465 Passed / 2494 -16 Failed -13 Skipped -[Success] -736 Delegated -1729 Undelegated +To run a specific test: +``` +pytest -c /dev/nul backends/test/suite/ -k "test_prelu_f32_custom_init[xnnpack]" +``` -[Failure] -5 Lowering Fail -3 PTE Run Fail -8 Output Mismatch Fail +To generate a JSON report: +``` +pytest -c /dev/nul backends/test/suite/operators/ -n auto --json-report --json-report-file="test_report.json" ``` -Outcomes can be interpreted as follows: - * Success (delegated): The test passed and at least one op was delegated by the backend. - * Success (undelegated): The test passed with no ops delegated by the backend. This is a pass, as the partitioner works as intended. - * Skipped: test fails in eager or export (indicative of a test or dynamo issue). 
- * Lowering fail: The test fails in to_edge_transform_and_lower. - * PTE run failure: The test errors out when loading or running the method. - * Output mismatch failure: Output delta (vs eager) exceeds the configured tolerance. +See [pytest-json-report](https://pypi.org/project/pytest-json-report/) for information on the report format. The test logic in this repository attaches additional metadata to each test entry under the `metadata`/`subtests` keys. One entry is created for each call to `test_runner.lower_and_run_model`. + +Here is a excerpt from a test run, showing a successful run of the `test_add_f32_bcast_first[xnnpack]` test. +```json +"tests": [ + { + "nodeid": "operators/test_add.py::test_add_f32_bcast_first[xnnpack]", + "lineno": 38, + "outcome": "passed", + "keywords": [ + "test_add_f32_bcast_first[xnnpack]", + "flow_xnnpack", + "backend_xnnpack", + ... + ], + "metadata": { + "subtests": [ + { + "Test ID": "test_add_f32_bcast_first[xnnpack]", + "Test Case": "test_add_f32_bcast_first", + "Subtest": 0, + "Flow": "xnnpack", + "Result": "Pass", + "Result Detail": "", + "Error": "", + "Delegated": "True", + "Quantize Time (s)": null, + "Lower Time (s)": "2.881", + "Output 0 Error Max": "0.000", + "Output 0 Error MAE": "0.000", + "Output 0 SNR": "inf", + "Delegated Nodes": 1, + "Undelegated Nodes": 0, + "Delegated Ops": { + "aten::add.Tensor": 1 + }, + "PTE Size (Kb)": "1.600" + } + ] + } +``` ## Backend Registration @@ -43,11 +77,11 @@ To plug into the test framework, each backend should provide an implementation o At a minimum, the backend will likely need to provide a custom implementation of the Partition and ToEdgeTransformAndLower stages using the appropriate backend partitioner. See backends/xnnpack/test/tester/tester.py for an example implementation. -Once a tester is available, the backend flow(s) can be added in __init__.py in this directory by adding an entry to `ALL_TESTER_FLOWS`. 
Each flow entry consists of a name (used in the test case naming) and a function to instantiate a tester for a given model and input tuple. +Once a tester is available, the backend flow(s) can be added under flows/ and registered in flow.py. It is intended that this will be unified with the lowering recipes under executorch/export in the near future. ## Test Cases -Operator test cases are defined under the operators/ directory. Tests are written in a backend-independent manner, and each test is programmatically expanded to generate a variant for each registered backend flow. The `@operator_test` decorator is applied to each test class to trigger this behavior. Tests can also be tagged with an appropriate type specifier, such as `@dtype_test`, to generate variants for each dtype. The decorators and "magic" live in __init__.py in this directory. +Operator test cases are defined under the operators/ directory. Model tests are under models/. Tests are written in a backend-independent manner, and each test is programmatically expanded to generate a variant for each registered backend flow by use of the `test_runner` fixture parameter. Tests can additionally be parameterized using standard pytest decorators. Parameterizing over dtype is a common use case. ## Evolution of this Test Suite diff --git a/backends/test/suite/__init__.py b/backends/test/suite/__init__.py index 43d4e16818f..734a6690fd2 100644 --- a/backends/test/suite/__init__.py +++ b/backends/test/suite/__init__.py @@ -11,6 +11,7 @@ import os import executorch.backends.test.suite.flow +import torch from executorch.backends.test.suite.flow import TestFlow from executorch.backends.test.suite.runner import runner_main @@ -55,6 +56,11 @@ def get_test_flows() -> dict[str, TestFlow]: return _ALL_TEST_FLOWS +def dtype_to_str(dtype: torch.dtype) -> str: + # Strip off "torch." 
+ return str(dtype)[6:] + + def load_tests(loader, suite, pattern): package_dir = os.path.dirname(__file__) discovered_suite = loader.discover( diff --git a/backends/test/suite/conftest.py b/backends/test/suite/conftest.py new file mode 100644 index 00000000000..70a97454c4e --- /dev/null +++ b/backends/test/suite/conftest.py @@ -0,0 +1,182 @@ +from typing import Any + +import pytest +import torch + +from executorch.backends.test.suite.flow import all_flows +from executorch.backends.test.suite.reporting import _sum_op_counts +from executorch.backends.test.suite.runner import run_test + + +def pytest_configure(config): + backends = set() + + for flow in all_flows().values(): + config.addinivalue_line( + "markers", + f"flow_{flow.name}: mark a test as testing the {flow.name} flow", + ) + + if flow.backend not in backends: + config.addinivalue_line( + "markers", + f"backend_{flow.backend}: mark a test as testing the {flow.backend} backend", + ) + backends.add(flow.backend) + + +class TestRunner: + def __init__(self, flow, test_name, test_base_name): + self._flow = flow + self._test_name = test_name + self._test_base_name = test_base_name + self._subtest = 0 + self._results = [] + + def lower_and_run_model( + self, + model: torch.nn.Module, + inputs: Any, + generate_random_test_inputs=True, + dynamic_shapes=None, + ): + run_summary = run_test( + model, + inputs, + self._flow, + self._test_name, + self._test_base_name, + self._subtest, + None, + generate_random_test_inputs=generate_random_test_inputs, + dynamic_shapes=dynamic_shapes, + ) + + self._subtest += 1 + self._results.append(run_summary) + + if not run_summary.result.is_success(): + if run_summary.result.is_backend_failure(): + raise RuntimeError("Test failure.") from run_summary.error + else: + # Non-backend failure indicates a bad test. Mark as skipped. + pytest.skip( + f"Test failed for reasons other than backend failure. 
Error: {run_summary.error}" + ) + + +@pytest.fixture( + params=[ + pytest.param( + f, + marks=[ + getattr(pytest.mark, f"flow_{f.name}"), + getattr(pytest.mark, f"backend_{f.backend}"), + ], + ) + for f in all_flows().values() + ], + ids=str, +) +def test_runner(request): + return TestRunner(request.param, request.node.name, request.node.originalname) + + +@pytest.hookimpl(optionalhook=True) +def pytest_json_runtest_metadata(item, call): + # Store detailed results in the test report under the metadata key. + metadata = {"subtests": []} + + if hasattr(item, "funcargs") and "test_runner" in item.funcargs: + runner_instance = item.funcargs["test_runner"] + + for record in runner_instance._results: + subtest_metadata = {} + + error_message = "" + if record.error is not None: + error_str = str(record.error) + if len(error_str) > 400: + error_message = error_str[:200] + "..." + error_str[-200:] + else: + error_message = error_str + + subtest_metadata["Test ID"] = record.name + subtest_metadata["Test Case"] = record.base_name + subtest_metadata["Subtest"] = record.subtest_index + subtest_metadata["Flow"] = record.flow + subtest_metadata["Result"] = record.result.to_short_str() + subtest_metadata["Result Detail"] = record.result.to_detail_str() + subtest_metadata["Error"] = error_message + subtest_metadata["Delegated"] = "True" if record.is_delegated() else "False" + subtest_metadata["Quantize Time (s)"] = ( + f"{record.quantize_time.total_seconds():.3f}" + if record.quantize_time + else None + ) + subtest_metadata["Lower Time (s)"] = ( + f"{record.lower_time.total_seconds():.3f}" + if record.lower_time + else None + ) + + for output_idx, error_stats in enumerate(record.tensor_error_statistics): + subtest_metadata[f"Output {output_idx} Error Max"] = ( + f"{error_stats.error_max:.3f}" + ) + subtest_metadata[f"Output {output_idx} Error MAE"] = ( + f"{error_stats.error_mae:.3f}" + ) + subtest_metadata[f"Output {output_idx} SNR"] = f"{error_stats.sqnr:.3f}" + + 
subtest_metadata["Delegated Nodes"] = _sum_op_counts( + record.delegated_op_counts + ) + subtest_metadata["Undelegated Nodes"] = _sum_op_counts( + record.undelegated_op_counts + ) + if record.delegated_op_counts: + subtest_metadata["Delegated Ops"] = dict(record.delegated_op_counts) + if record.undelegated_op_counts: + subtest_metadata["Undelegated Ops"] = dict(record.undelegated_op_counts) + subtest_metadata["PTE Size (Kb)"] = ( + f"{record.pte_size_bytes / 1000.0:.3f}" if record.pte_size_bytes else "" + ) + + metadata["subtests"].append(subtest_metadata) + return metadata + + +@pytest.hookimpl(optionalhook=True) +def pytest_json_modifyreport(json_report): + # Post-process the report, mainly to populate metadata for crashed tests. The runtest_metadata + # hook doesn't seem to be called when there's a native crash, but xdist still creates a report + # entry. + + for test_data in json_report["tests"]: + if "metadata" not in test_data: + test_data["metadata"] = {} + metadata = test_data["metadata"] + if "subtests" not in metadata: + metadata["subtests"] = [] + subtests = metadata["subtests"] + + # Native crashes are recorded differently and won't have the full metadata. + # Pytest-xdist records crash info under the "???" key. + if "???" 
in test_data: + test_id = test_data["nodeid"].removeprefix("::") # Remove leading :: + test_base_id = test_id.split("[")[ + 0 + ] # Strip parameterization to get the base test case + params = test_id[len(test_base_id) + 1 : -1].split("-") + flow = params[0] + + crashed_test_meta = { + "Test ID": test_id, + "Test Case": test_base_id, + "Flow": flow, + "Result": "Fail", + "Result Detail": "Process Crash", + "Error": test_data["???"].get("longrepr", "Process crashed."), + } + subtests.append(crashed_test_meta) diff --git a/backends/test/suite/flow.py b/backends/test/suite/flow.py index a4b34fee98d..29394951bd7 100644 --- a/backends/test/suite/flow.py +++ b/backends/test/suite/flow.py @@ -1,3 +1,8 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + import logging from dataclasses import dataclass, field @@ -44,6 +49,9 @@ class TestFlow: def should_skip_test(self, test_name: str) -> bool: return any(pattern in test_name for pattern in self.skip_patterns) + def __str__(self): + return self.name + def all_flows() -> dict[str, TestFlow]: flows = [] @@ -119,10 +127,18 @@ def all_flows() -> dict[str, TestFlow]: logger.info(f"Skipping QNN flow registration: {e}") try: - from executorch.backends.test.suite.flows.arm import ARM_TOSA_FLOW + from executorch.backends.test.suite.flows.arm import ( + ARM_ETHOS_U55_FLOW, + ARM_ETHOS_U85_FLOW, + ARM_TOSA_FP_FLOW, + ARM_TOSA_INT_FLOW, + ) flows += [ - ARM_TOSA_FLOW, + ARM_TOSA_FP_FLOW, + ARM_TOSA_INT_FLOW, + ARM_ETHOS_U55_FLOW, + ARM_ETHOS_U85_FLOW, ] except Exception as e: logger.info(f"Skipping ARM flow registration: {e}") diff --git a/backends/test/suite/flows/arm.py b/backends/test/suite/flows/arm.py index baa2df79de9..85674331eda 100644 --- a/backends/test/suite/flows/arm.py +++ b/backends/test/suite/flows/arm.py @@ -1,24 +1,68 @@ +# Copyright 2025 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Create flows for Arm Backends used to test operator and model suits + +from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec +from executorch.backends.arm.quantizer import get_symmetric_quantization_config from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec +from executorch.backends.arm.util._factory import create_quantizer from executorch.backends.test.suite.flow import TestFlow +from executorch.backends.xnnpack.test.tester.tester import Quantize -def _create_arm_tester_tosa_fp(*args, **kwargs) -> ArmTester: - kwargs["compile_spec"] = common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP") +def _create_arm_flow( + name, + compile_spec: ArmCompileSpec, + symmetric_io_quantization: bool = False, + per_channel_quantization: bool = True, +) -> TestFlow: - return ArmTester( - *args, - **kwargs, - ) + def _create_arm_tester(*args, **kwargs) -> ArmTester: + kwargs["compile_spec"] = compile_spec + return ArmTester(*args, **kwargs) + + support_serialize = not isinstance(compile_spec, TosaCompileSpec) + quantize = compile_spec.tosa_spec.support_integer() + + if quantize is True: + def create_quantize_stage() -> Quantize: + quantizer = create_quantizer(compile_spec) + quantization_config = get_symmetric_quantization_config( + is_per_channel=per_channel_quantization + ) + if symmetric_io_quantization: + quantizer.set_io(quantization_config) + return Quantize(quantizer, quantization_config) -def _create_tosa_flow() -> TestFlow: return TestFlow( - "arm_tosa", + name, backend="arm", - tester_factory=_create_arm_tester_tosa_fp, - supports_serialize=False, + tester_factory=_create_arm_tester, + supports_serialize=support_serialize, + quantize=quantize, + 
quantize_stage_factory=(create_quantize_stage if quantize is True else False), ) -ARM_TOSA_FLOW = _create_tosa_flow() +ARM_TOSA_FP_FLOW = _create_arm_flow( + "arm_tosa_fp", + common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"), +) +ARM_TOSA_INT_FLOW = _create_arm_flow( + "arm_tosa_int", + common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"), +) +ARM_ETHOS_U55_FLOW = _create_arm_flow( + "arm_ethos_u55", + common.get_u55_compile_spec(), +) +ARM_ETHOS_U85_FLOW = _create_arm_flow( + "arm_ethos_u85", + common.get_u85_compile_spec(), +) diff --git a/backends/test/suite/flows/qualcomm.py b/backends/test/suite/flows/qualcomm.py index 9998caa51b6..99deb3d4877 100644 --- a/backends/test/suite/flows/qualcomm.py +++ b/backends/test/suite/flows/qualcomm.py @@ -42,7 +42,7 @@ def create_quantize_stage() -> Quantize: QNN_TEST_FLOW = _create_qnn_flow("qnn") QNN_16A16W_TEST_FLOW = _create_qnn_flow( - "qnn_16a16w", quantize=True, quant_dtype=QuantDtype.use_8a8w, use_fp16=False + "qnn_16a16w", quantize=True, quant_dtype=QuantDtype.use_16a16w, use_fp16=False ) QNN_16A8W_TEST_FLOW = _create_qnn_flow( "qnn_16a8w", quantize=True, quant_dtype=QuantDtype.use_16a8w, use_fp16=False diff --git a/backends/test/suite/generate_markdown_summary.py b/backends/test/suite/generate_markdown_summary.py index 73da8fba678..e54fc691723 100644 --- a/backends/test/suite/generate_markdown_summary.py +++ b/backends/test/suite/generate_markdown_summary.py @@ -1,44 +1,69 @@ import argparse import csv +import json import sys -# -# A standalone script to generate a Markdown representation of a test report. -# This is primarily intended to be used with GitHub actions to generate a nice -# representation of the test results when looking at the action run. -# -# Usage: python executorch/backends/test/suite/generate_markdown_summary.py -# Markdown is written to stdout. 
-# +from dataclasses import dataclass, field -def escape_for_markdown(text: str) -> str: +@dataclass +class ResultCounts: """ - Modify a string to properly display in a markdown table cell. + Represents aggregated result counts for each status. """ - if not text: - return text - # Replace newlines with
tags - escaped = text.replace("\n", "
") + total: int = 0 + passes: int = 0 + fails: int = 0 + skips: int = 0 + by_detail: dict[str, int] = field(default_factory=lambda: {}) - # Escape backslashes. - escaped = escaped.replace("\\", "\\\\") + def add_row(self, result_value: str, result_detail: str) -> None: + """ + Update the result counts for the specified row. + """ - # Escape pipe characters that would break table structure - escaped = escaped.replace("|", "\\|") + self.total += 1 - return escaped + if result_value == "Pass": + self.passes += 1 + elif result_value == "Fail": + self.fails += 1 + elif result_value == "Skip": + self.skips += 1 + else: + raise RuntimeError(f"Unknown result value {result_value}") + if result_detail: + if result_detail not in self.by_detail: + self.by_detail[result_detail] = 0 + + self.by_detail[result_detail] += 1 + + +@dataclass +class AggregatedSummary: + """ + Represents aggegrated summary data for the test run. + """ + + counts: ResultCounts + counts_by_params: dict[str, ResultCounts] + failed_tests: list[list[str]] + header: list[str] + + +# +# A standalone script to generate a Markdown representation of a test report. +# This is primarily intended to be used with GitHub actions to generate a nice +# representation of the test results when looking at the action run. +# +# Usage: python executorch/backends/test/suite/generate_markdown_summary.py +# Markdown is written to stdout. +# -def generate_markdown(csv_path: str, exit_code: int = 0): # noqa (C901) - # Print warning if exit code is non-zero - if exit_code != 0: - print("> [!WARNING]") - print( - f"> Exit code {exit_code} was non-zero. Test process may have crashed. 
Check the job logs for more information.\n" - ) +def aggregate_results(csv_path: str) -> AggregatedSummary: with open(csv_path, newline="", encoding="utf-8") as f: reader = csv.reader(f) rows = list(reader) @@ -46,24 +71,28 @@ def generate_markdown(csv_path: str, exit_code: int = 0): # noqa (C901) header = rows[0] data_rows = rows[1:] - # Find the Result and Result Detail column indices - result_column_index = None - result_detail_column_index = None - for i, col in enumerate(header): - if col.lower() == "result": - result_column_index = i - elif col.lower() == "result detail": - result_detail_column_index = i + header_indices_by_name = {n.lower(): i for (i, n) in enumerate(header)} + params_column_index = header_indices_by_name.get("params", None) + result_column_index = header_indices_by_name["result"] + result_detail_column_index = header_indices_by_name["result detail"] # Count results and prepare data - pass_count = 0 - fail_count = 0 - skip_count = 0 + counts = ResultCounts() failed_tests = [] - processed_rows = [] - result_detail_counts = {} + counts_by_param = {} for row in data_rows: + result = row[result_column_index] + result_detail = row[result_detail_column_index] + + counts.add_row(result, result_detail) + + params = row[params_column_index] if params_column_index else None + if params: + if params not in counts_by_param: + counts_by_param[params] = ResultCounts() + counts_by_param[params].add_row(result, result_detail) + # Make a copy of the row to avoid modifying the original processed_row = [escape_for_markdown(cell) for cell in row] @@ -71,54 +100,130 @@ def generate_markdown(csv_path: str, exit_code: int = 0): # noqa (C901) if result_column_index is not None and result_column_index < len(row): result_value = row[result_column_index].strip().lower() if result_value == "pass": - pass_count += 1 processed_row[result_column_index] = ( 'Pass' ) elif result_value == "fail": - fail_count += 1 processed_row[result_column_index] = ( 'Fail' ) 
failed_tests.append(processed_row.copy()) elif result_value == "skip": - skip_count += 1 processed_row[result_column_index] = ( 'Skip' ) - # Count result details (excluding empty ones) - if result_detail_column_index is not None and result_detail_column_index < len( - row - ): - result_detail_value = row[result_detail_column_index].strip() - if result_detail_value: # Only count non-empty result details - if result_detail_value in result_detail_counts: - result_detail_counts[result_detail_value] += 1 - else: - result_detail_counts[result_detail_value] = 1 + return AggregatedSummary( + counts=counts, + failed_tests=failed_tests, + counts_by_params=counts_by_param, + header=header, + ) + + +def escape_for_markdown(text: str) -> str: + """ + Modify a string to properly display in a markdown table cell. + """ + if not text: + return text + + # Replace newlines with
tags + escaped = text.replace("\n", "
") - processed_rows.append(processed_row) + # Escape backslashes. + escaped = escaped.replace("\\", "\\\\") + + # Escape pipe characters that would break table structure + escaped = escaped.replace("|", "\\|") + + return escaped + + +def generate_markdown(csv_path: str, exit_code: int = 0): # noqa (C901) + # Print warning if exit code is non-zero + if exit_code != 0: + print("> [!WARNING]") + print( + f"> Exit code {exit_code} was non-zero. Test process may have crashed. Check the job logs for more information.\n" + ) + + results = aggregate_results(csv_path) # Generate Summary section - total_rows = len(data_rows) print("# Summary\n") - print(f"- **Pass**: {pass_count}/{total_rows}") - print(f"- **Fail**: {fail_count}/{total_rows}") - print(f"- **Skip**: {skip_count}/{total_rows}") + total_excluding_skips = results.counts.passes + results.counts.fails + pass_fraction = results.counts.passes / total_excluding_skips + fail_fraction = results.counts.fails / total_excluding_skips + print( + f"- **Pass**: {results.counts.passes}/{total_excluding_skips} ({pass_fraction*100:.2f}%)" + ) + print( + f"- **Fail**: {results.counts.fails}/{total_excluding_skips} ({fail_fraction*100:.2f}%)" + ) + print(f"- **Skip**: {results.counts.skips}") + + if results.counts_by_params: + print("\n## Results by Parameters\n") + + # Extract all unique parameter keys from the JSON strings + all_param_keys = set() + parsed_params = {} + + for params_str in results.counts_by_params.keys(): + # Parse the JSON string (it's a string representation of a dict) + params_dict = json.loads(params_str) + parsed_params[params_str] = params_dict + all_param_keys.update(params_dict.keys()) + + if parsed_params and len(parsed_params) > 1: + # Sort parameter keys for consistent column ordering + sorted_param_keys = sorted(all_param_keys) + + # Create table header + header_cols = sorted_param_keys + ["Pass", "Fail", "Skip", "Pass %"] + print("| " + " | ".join(header_cols) + " |") + print("|" + "|".join(["---"] 
* len(header_cols)) + "|") + + # Create table rows + for params_str, counts in results.counts_by_params.items(): + if params_str in parsed_params: + params_dict = parsed_params[params_str] + row_values = [] + + # Add parameter values + for key in sorted_param_keys: + value = params_dict.get(key, "") + row_values.append(str(value)) + + pass_fraction = counts.passes / (counts.passes + counts.fails) + + # Add count values + row_values.extend( + [ + str(counts.passes), + str(counts.fails), + str(counts.skips), + f"{pass_fraction*100:.2f}%", + ] + ) + + print("| " + " | ".join(row_values) + " |") + + print() print("## Failure Breakdown:") - total_rows_with_result_detail = sum(result_detail_counts.values()) - for detail, count in sorted(result_detail_counts.items()): + total_rows_with_result_detail = sum(results.counts.by_detail.values()) + for detail, count in sorted(results.counts.by_detail.items()): print(f"- **{detail}**: {count}/{total_rows_with_result_detail}") # Generate Failed Tests section print("# Failed Tests\n") - if failed_tests: - escaped_header = [escape_for_markdown(col) for col in header] + if results.failed_tests: + escaped_header = [escape_for_markdown(col) for col in results.header] print("| " + " | ".join(escaped_header) + " |") - print("|" + "|".join(["---"] * len(header)) + "|") - for row in failed_tests: + print("|" + "|".join(["---"] * len(results.header)) + "|") + for row in results.failed_tests: print("| " + " | ".join(row) + " |") else: print("No failed tests.\n") diff --git a/backends/test/suite/generate_markdown_summary_json.py b/backends/test/suite/generate_markdown_summary_json.py new file mode 100644 index 00000000000..4b6edc2a635 --- /dev/null +++ b/backends/test/suite/generate_markdown_summary_json.py @@ -0,0 +1,229 @@ +import argparse +import json + +from dataclasses import dataclass, field + + +@dataclass +class ResultCounts: + """ + Represents aggregated result counts for each status. 
+ """ + + total: int = 0 + passes: int = 0 + fails: int = 0 + skips: int = 0 + by_detail: dict[str, int] = field(default_factory=lambda: {}) + + def add_row(self, result_value: str, result_detail: str) -> None: + """ + Update the result counts for the specified row. + """ + + self.total += 1 + + if result_value == "Pass": + self.passes += 1 + elif result_value == "Fail": + self.fails += 1 + elif result_value == "Skip": + self.skips += 1 + else: + raise RuntimeError(f"Unknown result value {result_value}") + + if result_detail: + if result_detail not in self.by_detail: + self.by_detail[result_detail] = 0 + + self.by_detail[result_detail] += 1 + + +@dataclass +class AggregatedSummary: + """ + Represents aggegrated summary data for the test run. + """ + + counts: ResultCounts + counts_by_params: dict[str, ResultCounts] + failed_tests: list[list[str]] + + +# +# A standalone script to generate a Markdown representation of a test report. +# This is primarily intended to be used with GitHub actions to generate a nice +# representation of the test results when looking at the action run. +# +# Usage: python executorch/backends/test/suite/generate_markdown_summary.py +# Markdown is written to stdout. 
+# + + +def aggregate_results(json_path: str) -> AggregatedSummary: + with open(json_path) as f: + data = json.load(f) + + # Count results and prepare data + counts = ResultCounts() + failed_tests = [] + counts_by_param = {} + + for test_data in data["tests"]: + result_meta = test_data["metadata"] + for subtest_meta in result_meta["subtests"]: + result = subtest_meta["Result"] + result_detail = subtest_meta.get("Result Detail") or "" + + counts.add_row(result, result_detail) + + test_id = subtest_meta["Test ID"] + base_test = subtest_meta["Test Case"] + params = test_id[len(base_test) + 1 : -1] + + if params: + if params not in counts_by_param: + counts_by_param[params] = ResultCounts() + counts_by_param[params].add_row(result, result_detail) + + if result.lower() == "fail": + failed_tests.append(subtest_meta) + + return AggregatedSummary( + counts=counts, + failed_tests=failed_tests, + counts_by_params=counts_by_param, + ) + + +def escape_for_markdown(text: str) -> str: + """ + Modify a string to properly display in a markdown table cell. + """ + if not text: + return text + + # Replace newlines with
tags + escaped = text.replace("\n", "
") + + # Escape backslashes. + escaped = escaped.replace("\\", "\\\\") + + # Escape pipe characters that would break table structure + escaped = escaped.replace("|", "\\|") + + return escaped + + +def generate_markdown(json_path: str, exit_code: int = 0): # noqa (C901) + results = aggregate_results(json_path) + + # Generate Summary section + print("# Summary\n") + total_excluding_skips = results.counts.passes + results.counts.fails + pass_fraction = results.counts.passes / total_excluding_skips + fail_fraction = results.counts.fails / total_excluding_skips + print( + f"- **Pass**: {results.counts.passes}/{total_excluding_skips} ({pass_fraction*100:.2f}%)" + ) + print( + f"- **Fail**: {results.counts.fails}/{total_excluding_skips} ({fail_fraction*100:.2f}%)" + ) + print(f"- **Skip**: {results.counts.skips}") + + if results.counts_by_params: + print("\n## Results by Parameters\n") + + if len(results.counts_by_params) > 0: + # Create table header + header_cols = ["Params", "Pass", "Fail", "Skip", "Pass %"] + print("| " + " | ".join(header_cols) + " |") + print("|" + "|".join(["---"] * len(header_cols)) + "|") + + # Create table rows + for params_str, counts in results.counts_by_params.items(): + row_values = [params_str] + + # Add parameter values + pass_fraction = counts.passes / (counts.passes + counts.fails) + + # Add count values + row_values.extend( + [ + str(counts.passes), + str(counts.fails), + str(counts.skips), + f"{pass_fraction*100:.2f}%", + ] + ) + + print("| " + " | ".join(row_values) + " |") + + print() + + print("## Failure Breakdown:") + total_rows_with_result_detail = sum(results.counts.by_detail.values()) + for detail, count in sorted(results.counts.by_detail.items()): + print(f"- **{detail}**: {count}/{total_rows_with_result_detail}") + + # Generate Failed Tests section + print("# Failed Tests\n") + print( + "To reproduce, run the following command from the root of the ExecuTorch repository:" + ) + print("```") + print('pytest -c /dev/nul 
backends/test/suite/ -k ""') + print("```") + if results.failed_tests: + header = build_header(results.failed_tests) + + escaped_header = [escape_for_markdown(col) for col in header.keys()] + print("| " + " | ".join(escaped_header) + " |") + print("|" + "|".join(["---"] * len(escaped_header)) + "|") + for rec in results.failed_tests: + row = build_row(rec, header) + print("| " + " | ".join(row) + " |") + else: + print("No failed tests.\n") + + +def build_header(data) -> dict[str, int]: + """ + Find the union of all keys and return a dict of header keys and indices. Try to preserve + ordering as much as possible. + """ + + keys = max(data, key=len) + + header = {k: i for (i, k) in enumerate(keys)} + + for rec in data: + keys = set(rec.keys()) + for k in keys: + if k not in header: + header[k] = len(header) + + return header + + +def build_row(rec, header: dict[str, int]) -> list[str]: + row = [""] * len(header) + for k, v in rec.items(): + row[header[k]] = escape_for_markdown(str(v)) + return row + + +def main(): + parser = argparse.ArgumentParser( + description="Generate a Markdown representation of a test report." + ) + parser.add_argument("json_path", help="Path to the test report CSV file.") + parser.add_argument( + "--exit-code", type=int, default=0, help="Exit code from the test process." + ) + args = parser.parse_args() + generate_markdown(args.json_path, args.exit_code) + + +if __name__ == "__main__": + main() diff --git a/backends/test/suite/models/__init__.py b/backends/test/suite/models/__init__.py index ea44275a463..6ac1a72bde6 100644 --- a/backends/test/suite/models/__init__.py +++ b/backends/test/suite/models/__init__.py @@ -5,136 +5,3 @@ # LICENSE file in the root directory of this source tree. 
# pyre-unsafe - -import itertools -import os -import unittest -from typing import Any, Callable - -import torch -from executorch.backends.test.suite import get_test_flows -from executorch.backends.test.suite.context import get_active_test_context, TestContext -from executorch.backends.test.suite.flow import TestFlow -from executorch.backends.test.suite.reporting import log_test_summary -from executorch.backends.test.suite.runner import run_test - - -DTYPES: list[torch.dtype] = [ - torch.float16, - torch.float32, -] - - -def load_tests(loader, suite, pattern): - package_dir = os.path.dirname(__file__) - discovered_suite = loader.discover( - start_dir=package_dir, pattern=pattern or "test_*.py" - ) - suite.addTests(discovered_suite) - return suite - - -def _create_test( - cls, - test_func: Callable, - flow: TestFlow, - dtype: torch.dtype, - use_dynamic_shapes: bool, -): - dtype_name = str(dtype)[6:] # strip "torch." - test_name = f"{test_func.__name__}_{flow.name}_{dtype_name}" - if use_dynamic_shapes: - test_name += "_dynamic_shape" - - def wrapped_test(self): - params = { - "dtype": dtype, - "use_dynamic_shapes": use_dynamic_shapes, - } - with TestContext(test_name, test_func.__name__, flow.name, params): - if flow.should_skip_test(test_name): - raise unittest.SkipTest( - f"Skipping test due to matching flow {flow.name} skip patterns" - ) - - test_func(self, flow, dtype, use_dynamic_shapes) - - wrapped_test._name = test_func.__name__ # type: ignore - wrapped_test._flow = flow # type: ignore - - setattr(cls, test_name, wrapped_test) - - -# Expand a test into variants for each registered flow. 
-def _expand_test(cls, test_name: str) -> None: - test_func = getattr(cls, test_name) - supports_dynamic_shapes = getattr(test_func, "supports_dynamic_shapes", True) - dynamic_shape_values = [True, False] if supports_dynamic_shapes else [False] - dtypes = getattr(test_func, "dtypes", DTYPES) - - for flow, dtype, use_dynamic_shapes in itertools.product( - get_test_flows().values(), dtypes, dynamic_shape_values - ): - _create_test(cls, test_func, flow, dtype, use_dynamic_shapes) - delattr(cls, test_name) - - -def model_test_cls(cls) -> Callable | None: - """Decorator for model tests. Handles generating test variants for each test flow and configuration.""" - for key in dir(cls): - if key.startswith("test_"): - _expand_test(cls, key) - return cls - - -def model_test_params( - supports_dynamic_shapes: bool = True, - dtypes: list[torch.dtype] | None = None, -) -> Callable: - """Optional parameter decorator for model tests. Specifies test pararameters. Only valid with a class decorated by model_test_cls.""" - - def inner_decorator(func: Callable) -> Callable: - func.supports_dynamic_shapes = supports_dynamic_shapes # type: ignore - - if dtypes is not None: - func.dtypes = dtypes # type: ignore - - return func - - return inner_decorator - - -def run_model_test( - model: torch.nn.Module, - inputs: tuple[Any], - flow: TestFlow, - dtype: torch.dtype, - dynamic_shapes: Any | None, -): - model = model.to(dtype) - context = get_active_test_context() - - # This should be set in the wrapped test. See _create_test above. - assert context is not None, "Missing test context." 
- - run_summary = run_test( - model, - inputs, - flow, - context.test_name, - context.test_base_name, - 0, # subtest_index - currently unused for model tests - context.params, - dynamic_shapes=dynamic_shapes, - ) - - log_test_summary(run_summary) - - if not run_summary.result.is_success(): - if run_summary.result.is_backend_failure(): - raise RuntimeError("Test failure.") from run_summary.error - else: - # Non-backend failure indicates a bad test. Mark as skipped. - raise unittest.SkipTest( - f"Test failed for reasons other than backend failure. Error: {run_summary.error}" - ) diff --git a/backends/test/suite/models/test_torchaudio.py b/backends/test/suite/models/test_torchaudio.py index 69f6de4684f..2287b226c37 100644 --- a/backends/test/suite/models/test_torchaudio.py +++ b/backends/test/suite/models/test_torchaudio.py @@ -9,15 +9,11 @@ import unittest from typing import Tuple +import pytest import torch import torchaudio -from executorch.backends.test.suite.flow import TestFlow -from executorch.backends.test.suite.models import ( - model_test_cls, - model_test_params, - run_model_test, -) +from executorch.backends.test.suite import dtype_to_str from torch.export import Dim # @@ -47,64 +43,68 @@ def forward( return x.transpose(0, 1) -@model_test_cls -class TorchAudio(unittest.TestCase): - @model_test_params(dtypes=[torch.float32], supports_dynamic_shapes=False) - def test_conformer( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - inner_model = torchaudio.models.Conformer( - input_dim=80, - num_heads=4, - ffn_dim=128, - num_layers=4, - depthwise_conv_kernel_size=31, - ) - model = PatchedConformer(inner_model) - lengths = torch.randint(1, 400, (10,)) +@pytest.mark.parametrize("dtype", [torch.float32], ids=dtype_to_str) +@pytest.mark.parametrize("use_dynamic_shapes", [False], ids=["static_shapes"]) +def test_conformer(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + inner_model = torchaudio.models.Conformer( + input_dim=80, + 
num_heads=4, + ffn_dim=128, + num_layers=4, + depthwise_conv_kernel_size=31, + ) + model = PatchedConformer(inner_model).eval().to(dtype) + lengths = torch.randint(1, 400, (10,)) - encoder_padding_mask = torchaudio.models.conformer._lengths_to_padding_mask( - lengths - ) - inputs = ( - torch.rand(10, int(lengths.max()), 80), - encoder_padding_mask, - ) + encoder_padding_mask = torchaudio.models.conformer._lengths_to_padding_mask(lengths) + inputs = ( + torch.rand(10, int(lengths.max()), 80), + encoder_padding_mask, + ) + + test_runner.lower_and_run_model(model, inputs) - run_model_test(model, inputs, flow, dtype, None) - - @model_test_params(dtypes=[torch.float32]) - def test_wav2letter( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchaudio.models.Wav2Letter() - inputs = (torch.randn(1, 1, 1024, dtype=dtype),) - dynamic_shapes = ( - { - "x": { - 2: Dim("d", min=900, max=1024), - } + +@pytest.mark.parametrize("dtype", [torch.float32], ids=dtype_to_str) +@pytest.mark.parametrize( + "use_dynamic_shapes", [False, True], ids=["static_shapes", "dynamic_shapes"] +) +def test_wav2letter(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchaudio.models.Wav2Letter().to(dtype) + inputs = (torch.randn(1, 1, 1024, dtype=dtype),) + dynamic_shapes = ( + { + "x": { + 2: Dim("d", min=900, max=1024), } - if use_dynamic_shapes - else None - ) - run_model_test(model, inputs, flow, dtype, dynamic_shapes) - - @unittest.skip("This model times out on all backends.") - def test_wavernn( - self, - flow: TestFlow, - dtype: torch.dtype, - use_dynamic_shapes: bool, - ): - model = torchaudio.models.WaveRNN( + } + if use_dynamic_shapes + else None + ) + + test_runner.lower_and_run_model(model, inputs, dynamic_shapes=dynamic_shapes) + + +@pytest.mark.parametrize("dtype", [torch.float32], ids=dtype_to_str) +@pytest.mark.parametrize("use_dynamic_shapes", [False], ids=["static_shapes"]) +@unittest.skip("This model times out on all 
backends.") +def test_wavernn( + test_runner, + dtype: torch.dtype, + use_dynamic_shapes: bool, +): + model = ( + torchaudio.models.WaveRNN( upsample_scales=[5, 5, 8], n_classes=512, hop_length=200 - ).eval() - - # See https://docs.pytorch.org/audio/stable/generated/torchaudio.models.WaveRNN.html#forward - inputs = ( - torch.randn(1, 1, (64 - 5 + 1) * 200), # waveform - torch.randn(1, 1, 128, 64), # specgram ) + .eval() + .to(dtype) + ) + + # See https://docs.pytorch.org/audio/stable/generated/torchaudio.models.WaveRNN.html#forward + inputs = ( + torch.randn(1, 1, (64 - 5 + 1) * 200).to(dtype), # waveform + torch.randn(1, 1, 128, 64).to(dtype), # specgram + ) - run_model_test(model, inputs, flow, dtype, None) + test_runner.lower_and_run_model(model, inputs) diff --git a/backends/test/suite/models/test_torchvision.py b/backends/test/suite/models/test_torchvision.py index e69de80a871..58cf6a990d4 100644 --- a/backends/test/suite/models/test_torchvision.py +++ b/backends/test/suite/models/test_torchvision.py @@ -6,17 +6,12 @@ # pyre-unsafe -import unittest +import pytest import torch import torchvision +from executorch.backends.test.suite import dtype_to_str -from executorch.backends.test.suite.flow import TestFlow -from executorch.backends.test.suite.models import ( - model_test_cls, - model_test_params, - run_model_test, -) from torch.export import Dim # @@ -25,148 +20,175 @@ # multiple size variants, one small or medium variant is used. 
# +PARAMETERIZE_DTYPE = pytest.mark.parametrize("dtype", [torch.float32], ids=dtype_to_str) +PARAMETERIZE_DYNAMIC_SHAPES = pytest.mark.parametrize( + "use_dynamic_shapes", [False, True], ids=["static_shapes", "dynamic_shapes"] +) +PARAMETERIZE_STATIC_ONLY = pytest.mark.parametrize( + "use_dynamic_shapes", [False], ids=["static_shapes"] +) + + +def _test_cv_model( + model: torch.nn.Module, + test_runner, + dtype: torch.dtype, + use_dynamic_shapes: bool, +): + model = model.eval().to(dtype) + + # Test a CV model that follows the standard conventions. + inputs = (torch.randn(1, 3, 224, 224, dtype=dtype),) -@model_test_cls -class TorchVision(unittest.TestCase): - def _test_cv_model( - self, - model: torch.nn.Module, - flow: TestFlow, - dtype: torch.dtype, - use_dynamic_shapes: bool, - ): - # Test a CV model that follows the standard conventions. - inputs = (torch.randn(1, 3, 224, 224, dtype=dtype),) - - dynamic_shapes = ( - ( - { - 2: Dim("height", min=1, max=16) * 16, - 3: Dim("width", min=1, max=16) * 16, - }, - ) - if use_dynamic_shapes - else None + dynamic_shapes = ( + ( + { + 2: Dim("height", min=1, max=16) * 16, + 3: Dim("width", min=1, max=16) * 16, + }, ) + if use_dynamic_shapes + else None + ) + + test_runner.lower_and_run_model(model, inputs, dynamic_shapes=dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_alexnet(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.alexnet() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_convnext_small(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.convnext_small() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_densenet161(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.densenet161() + _test_cv_model(model, 
test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_efficientnet_b4(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.efficientnet_b4() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_efficientnet_v2_s(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.efficientnet_v2_s() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_googlenet(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.googlenet() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_inception_v3(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.inception_v3() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_STATIC_ONLY +def test_maxvit_t(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.maxvit_t() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_mnasnet1_0(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.mnasnet1_0() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_mobilenet_v2(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.mobilenet_v2() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_mobilenet_v3_small(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.mobilenet_v3_small() + _test_cv_model(model, test_runner, dtype, 
use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_regnet_y_1_6gf(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.regnet_y_1_6gf() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_resnet50(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.resnet50() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_resnext50_32x4d(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.resnext50_32x4d() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_shufflenet_v2_x1_0(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.shufflenet_v2_x1_0() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_squeezenet1_1(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.squeezenet1_1() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_swin_v2_t(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.swin_v2_t() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_vgg11(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.vgg11() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_STATIC_ONLY +def test_vit_b_16(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.vit_b_16() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + - run_model_test(model, 
inputs, flow, dtype, dynamic_shapes) - - def test_alexnet( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.alexnet() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_convnext_small( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.convnext_small() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_densenet161( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.densenet161() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_efficientnet_b4( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.efficientnet_b4() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_efficientnet_v2_s( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.efficientnet_v2_s() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_googlenet( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.googlenet() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_inception_v3( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.inception_v3() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - @model_test_params(supports_dynamic_shapes=False) - def test_maxvit_t( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.maxvit_t() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_mnasnet1_0( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.mnasnet1_0() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_mobilenet_v2( - self, flow: TestFlow, dtype: 
torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.mobilenet_v2() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_mobilenet_v3_small( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.mobilenet_v3_small() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_regnet_y_1_6gf( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.regnet_y_1_6gf() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_resnet50( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.resnet50() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_resnext50_32x4d( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.resnext50_32x4d() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_shufflenet_v2_x1_0( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.shufflenet_v2_x1_0() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_squeezenet1_1( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.squeezenet1_1() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_swin_v2_t( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.swin_v2_t() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_vgg11(self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool): - model = torchvision.models.vgg11() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - @model_test_params(supports_dynamic_shapes=False) - def test_vit_b_16( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.vit_b_16() - 
self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_wide_resnet50_2( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.wide_resnet50_2() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_wide_resnet50_2(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.wide_resnet50_2() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) diff --git a/backends/test/suite/operators/__init__.py b/backends/test/suite/operators/__init__.py index 9c550b3a49c..7475af29e15 100644 --- a/backends/test/suite/operators/__init__.py +++ b/backends/test/suite/operators/__init__.py @@ -6,19 +6,14 @@ # pyre-unsafe -import copy import os +import sys import unittest from enum import Enum -from typing import Callable +import pytest import torch -from executorch.backends.test.suite import get_test_flows -from executorch.backends.test.suite.context import get_active_test_context, TestContext -from executorch.backends.test.suite.flow import TestFlow -from executorch.backends.test.suite.reporting import log_test_summary -from executorch.backends.test.suite.runner import run_test def load_tests(loader, suite, pattern): @@ -66,112 +61,48 @@ def dtype_test(func): return func -# Class annotation for operator tests. This triggers the test framework to register -# the tests. -def operator_test(cls): - _create_tests(cls) - return cls - - -# Generate test cases for each backend flow. -def _create_tests(cls): - for key in dir(cls): - if key.startswith("test_"): - _expand_test(cls, key) - - -# Expand a test into variants for each registered flow. 
-def _expand_test(cls, test_name: str): - test_func = getattr(cls, test_name) - for flow in get_test_flows().values(): - _create_test_for_backend(cls, test_func, flow) - delattr(cls, test_name) +class OperatorTest(unittest.TestCase): + pass -def _make_wrapped_test( - test_func: Callable, - test_name: str, - test_base_name: str, - flow: TestFlow, - params: dict | None = None, -): - def wrapped_test(self): - with TestContext(test_name, test_base_name, flow.name, params): - if flow.should_skip_test(test_name): - raise unittest.SkipTest( - f"Skipping test due to matching flow {flow.name} skip patterns" - ) +class TestCaseShim: + def __init__(self, test_runner): + self._test_runner = test_runner - test_kwargs = copy.copy(params) or {} - test_kwargs["flow"] = flow + def _test_op(self, model, args, flow, generate_random_test_inputs=True): + self._test_runner.lower_and_run_model( + model, args, generate_random_test_inputs=generate_random_test_inputs + ) - test_func(self, **test_kwargs) - wrapped_test._name = test_name - wrapped_test._flow = flow +def wrap_test(original_func, test_type): + if test_type == TestType.STANDARD: - return wrapped_test + def wrapped_func(test_runner): + shim = TestCaseShim(test_runner) + original_func(shim, test_runner._flow) + return wrapped_func + elif test_type == TestType.DTYPE: -def _create_test_for_backend( - cls, - test_func: Callable, - flow: TestFlow, -): - test_type = getattr(test_func, "test_type", TestType.STANDARD) + @pytest.mark.parametrize("dtype", [torch.float32], ids=lambda s: str(s)[6:]) + def wrapped_func(test_runner, dtype): + shim = TestCaseShim(test_runner) + original_func(shim, test_runner._flow, dtype) - if test_type == TestType.STANDARD: - test_name = f"{test_func.__name__}_{flow.name}" - wrapped_test = _make_wrapped_test( - test_func, test_name, test_func.__name__, flow - ) - setattr(cls, test_name, wrapped_test) - elif test_type == TestType.DTYPE: - for dtype in DTYPES: - dtype_name = str(dtype)[6:] # strip "torch." 
- test_name = f"{test_func.__name__}_{dtype_name}_{flow.name}" - wrapped_test = _make_wrapped_test( - test_func, - test_name, - test_func.__name__, - flow, - {"dtype": dtype}, - ) - setattr(cls, test_name, wrapped_test) + return wrapped_func else: - raise NotImplementedError(f"Unknown test type {test_type}.") + raise ValueError() -class OperatorTest(unittest.TestCase): - def _test_op( - self, model, inputs, flow: TestFlow, generate_random_test_inputs: bool = True - ): - context = get_active_test_context() - - # This should be set in the wrapped test. See _make_wrapped_test above. - assert context is not None, "Missing test context." - - run_summary = run_test( - model, - inputs, - flow, - context.test_name, - context.test_base_name, - context.subtest_index, - context.params, - generate_random_test_inputs=generate_random_test_inputs, - ) - - log_test_summary(run_summary) +def operator_test(cls): + parent_module = sys.modules[cls.__module__] - # This is reset when a new test is started - it creates the context per-test. - context.subtest_index = context.subtest_index + 1 + for func_name in dir(cls): + if func_name.startswith("test"): + original_func = getattr(cls, func_name) + test_type = getattr(original_func, "test_type", TestType.STANDARD) + wrapped_func = wrap_test(original_func, test_type) + setattr(parent_module, func_name, wrapped_func) - if not run_summary.result.is_success(): - if run_summary.result.is_backend_failure(): - raise RuntimeError("Test failure.") from run_summary.error - else: - # Non-backend failure indicates a bad test. Mark as skipped. - raise unittest.SkipTest( - f"Test failed for reasons other than backend failure. 
Error: {run_summary.error}" - ) + return None diff --git a/backends/test/suite/operators/test_add.py b/backends/test/suite/operators/test_add.py index 6b21c3bf985..850e6f5132c 100644 --- a/backends/test/suite/operators/test_add.py +++ b/backends/test/suite/operators/test_add.py @@ -7,14 +7,8 @@ # pyre-unsafe +import pytest import torch -from executorch.backends.test.suite.flow import TestFlow - -from executorch.backends.test.suite.operators import ( - dtype_test, - operator_test, - OperatorTest, -) class Model(torch.nn.Module): @@ -31,55 +25,52 @@ def forward(self, x, y): return torch.add(x, y, alpha=self.alpha) -@operator_test -class Add(OperatorTest): - @dtype_test - def test_add_dtype(self, flow: TestFlow, dtype) -> None: - self._test_op( - Model(), - ( - (torch.rand(2, 10) * 100).to(dtype), - (torch.rand(2, 10) * 100).to(dtype), - ), - flow, - ) - - def test_add_f32_bcast_first(self, flow: TestFlow) -> None: - self._test_op( - Model(), - ( - torch.randn(5), - torch.randn(1, 5, 1, 5), - ), - flow, - ) - - def test_add_f32_bcast_second(self, flow: TestFlow) -> None: - self._test_op( - Model(), - ( - torch.randn(4, 4, 2, 7), - torch.randn(2, 7), - ), - flow, - ) - - def test_add_f32_bcast_unary(self, flow: TestFlow) -> None: - self._test_op( - Model(), - ( - torch.randn(5), - torch.randn(1, 1, 5), - ), - flow, - ) - - def test_add_f32_alpha(self, flow: TestFlow) -> None: - self._test_op( - ModelAlpha(alpha=2), - ( - torch.randn(1, 25), - torch.randn(1, 25), - ), - flow, - ) +@pytest.mark.parametrize("dtype", [torch.float32], ids=lambda s: str(s)[6:]) +def test_add_dtype(test_runner, dtype) -> None: + test_runner.lower_and_run_model( + Model(), + ( + (torch.rand(2, 10) * 100).to(dtype), + (torch.rand(2, 10) * 100).to(dtype), + ), + ) + + +def test_add_f32_bcast_first(test_runner) -> None: + test_runner.lower_and_run_model( + Model(), + ( + torch.randn(5), + torch.randn(1, 5, 1, 5), + ), + ) + + +def test_add_f32_bcast_second(test_runner) -> None: + 
test_runner.lower_and_run_model( + Model(), + ( + torch.randn(4, 4, 2, 7), + torch.randn(2, 7), + ), + ) + + +def test_add_f32_bcast_unary(test_runner) -> None: + test_runner.lower_and_run_model( + Model(), + ( + torch.randn(5), + torch.randn(1, 1, 5), + ), + ) + + +def test_add_f32_alpha(test_runner) -> None: + test_runner.lower_and_run_model( + ModelAlpha(alpha=2), + ( + torch.randn(1, 25), + torch.randn(1, 25), + ), + ) diff --git a/backends/test/suite/operators/test_lstm.py b/backends/test/suite/operators/test_lstm.py index 91dd73c9052..11632e1e055 100644 --- a/backends/test/suite/operators/test_lstm.py +++ b/backends/test/suite/operators/test_lstm.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -15,6 +16,11 @@ operator_test, OperatorTest, ) +from torch.nn.quantizable.modules.rnn import LSTM as QuantizableLSTM + + +def _get_lstm_cls(use_quantizable_lstm: bool): + return QuantizableLSTM if use_quantizable_lstm else torch.nn.LSTM class Model(torch.nn.Module): @@ -27,9 +33,11 @@ def __init__( batch_first=True, dropout=0.0, bidirectional=False, + use_quantizable_lstm: bool = False, ): super().__init__() - self.lstm = torch.nn.LSTM( + lstm_cls = _get_lstm_cls(use_quantizable_lstm) + self.lstm = lstm_cls( input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, @@ -47,106 +55,133 @@ def forward(self, x): class LSTM(OperatorTest): @dtype_test def test_lstm_dtype(self, flow: TestFlow, dtype) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(num_layers=2).to(dtype), + Model(num_layers=2, use_quantizable_lstm=use_quantizable_lstm).to(dtype), ((torch.rand(1, 10, 64) * 10).to(dtype),), # (batch=1, seq_len, input_size) flow, ) @dtype_test def test_lstm_no_bias_dtype(self, flow: TestFlow, dtype) -> None: + 
use_quantizable_lstm = flow.quantize self._test_op( - Model(num_layers=2, bias=False).to(dtype), + Model( + num_layers=2, bias=False, use_quantizable_lstm=use_quantizable_lstm + ).to(dtype), ((torch.rand(1, 10, 64) * 10).to(dtype),), flow, ) def test_lstm_feature_sizes(self, flow: TestFlow) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(input_size=32, hidden_size=16), + Model( + input_size=32, + hidden_size=16, + use_quantizable_lstm=use_quantizable_lstm, + ), (torch.randn(1, 8, 32),), # (batch=1, seq_len, input_size) flow, ) self._test_op( - Model(input_size=128, hidden_size=64), + Model( + input_size=128, + hidden_size=64, + use_quantizable_lstm=use_quantizable_lstm, + ), (torch.randn(1, 12, 128),), flow, ) self._test_op( - Model(input_size=256, hidden_size=128), + Model( + input_size=256, + hidden_size=128, + use_quantizable_lstm=use_quantizable_lstm, + ), (torch.randn(1, 6, 256),), flow, ) self._test_op( - Model(input_size=16, hidden_size=32), + Model( + input_size=16, + hidden_size=32, + use_quantizable_lstm=use_quantizable_lstm, + ), (torch.randn(1, 5, 16),), flow, ) def test_lstm_batch_sizes(self, flow: TestFlow) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(), + Model(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(8, 10, 64),), flow, ) self._test_op( - Model(), + Model(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(32, 10, 64),), flow, ) self._test_op( - Model(), + Model(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(100, 10, 64),), flow, ) def test_lstm_seq_lengths(self, flow: TestFlow) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(), + Model(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 5, 64),), flow, ) self._test_op( - Model(), + Model(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 20, 64),), flow, ) self._test_op( - Model(), + Model(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 50, 64),), flow, ) def 
test_lstm_batch_first_false(self, flow: TestFlow) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(batch_first=False), + Model(batch_first=False, use_quantizable_lstm=use_quantizable_lstm), (torch.randn(10, 1, 64),), # (seq_len, batch=1, input_size) flow, ) def test_lstm_num_layers(self, flow: TestFlow) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(num_layers=2), + Model(num_layers=2, use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 10, 64),), flow, ) self._test_op( - Model(num_layers=3), + Model(num_layers=3, use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 10, 64),), flow, ) def test_lstm_bidirectional(self, flow: TestFlow) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(bidirectional=True), + Model(bidirectional=True, use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 10, 64),), flow, ) def test_lstm_with_dropout(self, flow: TestFlow) -> None: # Note: Dropout is only effective with num_layers > 1 + use_quantizable_lstm = flow.quantize self._test_op( - Model(num_layers=2, dropout=0.2), + Model(num_layers=2, dropout=0.2, use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 10, 64),), flow, ) @@ -154,9 +189,10 @@ def test_lstm_with_dropout(self, flow: TestFlow) -> None: def test_lstm_with_initial_states(self, flow: TestFlow) -> None: # Create a model that accepts initial states class ModelWithStates(torch.nn.Module): - def __init__(self): + def __init__(self, use_quantizable_lstm: bool = False): super().__init__() - self.lstm = torch.nn.LSTM( + lstm_cls = _get_lstm_cls(use_quantizable_lstm) + self.lstm = lstm_cls( input_size=64, hidden_size=32, num_layers=2, @@ -169,9 +205,10 @@ def forward(self, x, h0, c0): batch_size = 1 num_layers = 2 hidden_size = 32 + use_quantizable_lstm = flow.quantize self._test_op( - ModelWithStates(), + ModelWithStates(use_quantizable_lstm=use_quantizable_lstm), ( torch.randn(batch_size, 10, 64), # input torch.randn(num_layers, 
batch_size, hidden_size), # h0 @@ -183,9 +220,10 @@ def forward(self, x, h0, c0): def test_lstm_return_hidden_states(self, flow: TestFlow) -> None: # Create a model that returns both output and hidden states class ModelWithHiddenStates(torch.nn.Module): - def __init__(self): + def __init__(self, use_quantizable_lstm: bool = False): super().__init__() - self.lstm = torch.nn.LSTM( + lstm_cls = _get_lstm_cls(use_quantizable_lstm) + self.lstm = lstm_cls( input_size=64, hidden_size=32, num_layers=2, @@ -200,9 +238,10 @@ def forward(self, x): batch_size = 1 seq_len = 10 input_size = 64 + use_quantizable_lstm = flow.quantize self._test_op( - ModelWithHiddenStates(), + ModelWithHiddenStates(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(batch_size, seq_len, input_size),), flow, ) diff --git a/backends/test/suite/operators/test_rsqrt.py b/backends/test/suite/operators/test_rsqrt.py index 705833194fb..bb51b213dd4 100644 --- a/backends/test/suite/operators/test_rsqrt.py +++ b/backends/test/suite/operators/test_rsqrt.py @@ -37,15 +37,28 @@ def test_rsqrt_dtype(self, flow: TestFlow, dtype) -> None: def test_rsqrt_shapes(self, flow: TestFlow) -> None: # Test with different tensor shapes - # 1D tensor - self._test_op(RsqrtModel(), (torch.rand(20) + 0.01,), flow) - + self._test_op( + RsqrtModel(), + (torch.rand(20) + 0.01,), + flow, + generate_random_test_inputs=False, + ) # 2D tensor - self._test_op(RsqrtModel(), (torch.rand(5, 10) + 0.01,), flow) + self._test_op( + RsqrtModel(), + (torch.rand(5, 10) + 0.01,), + flow, + generate_random_test_inputs=False, + ) # 3D tensor - self._test_op(RsqrtModel(), (torch.rand(3, 4, 5) + 0.01,), flow) + self._test_op( + RsqrtModel(), + (torch.rand(3, 4, 5) + 0.01,), + flow, + generate_random_test_inputs=False, + ) @unittest.skip("NaN and Inf are not enforced for backends.") def test_rsqrt_edge_cases(self, flow: TestFlow) -> None: diff --git a/backends/test/suite/operators/test_sqrt.py b/backends/test/suite/operators/test_sqrt.py index 
3d327ade6a5..92fbc64878e 100644 --- a/backends/test/suite/operators/test_sqrt.py +++ b/backends/test/suite/operators/test_sqrt.py @@ -39,13 +39,19 @@ def test_sqrt_shapes(self, flow: TestFlow) -> None: # Test with different tensor shapes # 1D tensor - self._test_op(SqrtModel(), (torch.rand(20),), flow) + self._test_op( + SqrtModel(), (torch.rand(20),), flow, generate_random_test_inputs=False + ) # 2D tensor - self._test_op(SqrtModel(), (torch.rand(5, 10),), flow) + self._test_op( + SqrtModel(), (torch.rand(5, 10),), flow, generate_random_test_inputs=False + ) # 3D tensor - self._test_op(SqrtModel(), (torch.rand(3, 4, 5),), flow) + self._test_op( + SqrtModel(), (torch.rand(3, 4, 5),), flow, generate_random_test_inputs=False + ) @unittest.skip("NaN and Inf are not enforced for backends.") def test_sqrt_edge_cases(self, flow: TestFlow) -> None: diff --git a/backends/test/suite/operators/test_sub.py b/backends/test/suite/operators/test_sub.py index be7b871fdad..2243eb6ee71 100644 --- a/backends/test/suite/operators/test_sub.py +++ b/backends/test/suite/operators/test_sub.py @@ -6,7 +6,6 @@ # pyre-unsafe - import torch from executorch.backends.test.suite.flow import TestFlow diff --git a/backends/test/suite/reporting.py b/backends/test/suite/reporting.py index cdf2ce870e1..09e950ab672 100644 --- a/backends/test/suite/reporting.py +++ b/backends/test/suite/reporting.py @@ -1,4 +1,5 @@ import csv +import json from collections import Counter from dataclasses import dataclass, field @@ -343,7 +344,9 @@ def _sum_op_counts(counter: Counter | None) -> int | None: def _serialize_params(params: dict[str, Any] | None) -> str: if params is not None: - return str(dict(sorted(params.items()))) + # Convert values to strings - JSON conversion doesn't like dtypes. 
+ str_params = {k: str(v) for k, v in params.items()} + return json.dumps(str_params) else: return "" diff --git a/backends/test/suite/runner.py b/backends/test/suite/runner.py index eeea09e0fc1..a6d7d07bce0 100644 --- a/backends/test/suite/runner.py +++ b/backends/test/suite/runner.py @@ -57,7 +57,7 @@ def _graph_has_unsupported_patterns(program: torch.export.ExportedProgram) -> bo and node.target == exir_ops.edge.aten.convolution.default ): in_rank = node.args[0].meta["val"].dim() - if in_rank != 4: + if in_rank > 4: return True return False diff --git a/backends/test/suite/tests/test_reporting.py b/backends/test/suite/tests/test_reporting.py index 58ff76cba17..e42681fc678 100644 --- a/backends/test/suite/tests/test_reporting.py +++ b/backends/test/suite/tests/test_reporting.py @@ -1,3 +1,4 @@ +import json import unittest from csv import DictReader @@ -102,14 +103,16 @@ def test_csv_report_simple(self): self.assertEqual(records[2]["Test Case"], "test2") self.assertEqual(records[2]["Flow"], "flow1") self.assertEqual(records[2]["Result"], "Pass") - self.assertEqual(records[2]["Params"], str({"dtype": torch.float32})) + self.assertEqual(records[2]["Params"], json.dumps({"dtype": "torch.float32"})) # Validate fourth record: test2, backend2, EXPORT_FAIL with use_dynamic_shapes param self.assertEqual(records[3]["Test ID"], "test2_backend2_flow1") self.assertEqual(records[3]["Test Case"], "test2") self.assertEqual(records[3]["Flow"], "flow1") self.assertEqual(records[3]["Result"], "Skip") - self.assertEqual(records[3]["Params"], str({"use_dynamic_shapes": True})) + self.assertEqual( + records[3]["Params"], json.dumps({"use_dynamic_shapes": "True"}) + ) def test_count_ops(self): """ diff --git a/backends/transforms/decompose_sdpa.py b/backends/transforms/decompose_sdpa.py index d49e0da0c9b..6c36d1803fc 100644 --- a/backends/transforms/decompose_sdpa.py +++ b/backends/transforms/decompose_sdpa.py @@ -7,6 +7,7 @@ # pyre-strict import math +from typing import Set, Type 
import torch from executorch.exir.pass_base import ExportPass, PassResult @@ -19,6 +20,8 @@ class DecomposeScaledDotProductAttention(ExportPass): Decompose from scaled_dot_product_attention to multiple nodes. """ + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, allow_non_fake_inputs: bool = True) -> None: super().__init__() # With allow_non_fake_inputs=False, we don't get _unsafe_view ops diff --git a/backends/transforms/fuse_view_copy.py b/backends/transforms/fuse_view_copy.py index c740515cdcc..1972513d2ef 100644 --- a/backends/transforms/fuse_view_copy.py +++ b/backends/transforms/fuse_view_copy.py @@ -7,6 +7,8 @@ # pyre-strict +from typing import Set, Type + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -62,6 +64,8 @@ def remove_noop_view_copy(graph: torch.fx.Graph) -> tuple[torch.fx.Graph, bool]: class FuseViewCopyTransform(ExportPass): + _passes_required_after: Set[Type[ExportPass]] = set() + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: graph_module.graph, merge_modified = merge_view_copy_chains(graph_module.graph) graph_module.graph, noop_modified = remove_noop_view_copy(graph_module.graph) diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt index 29ff90e7293..d9acde79ecf 100644 --- a/backends/vulkan/CMakeLists.txt +++ b/backends/vulkan/CMakeLists.txt @@ -105,17 +105,33 @@ target_include_directories( $ ) +# vulkan runtime utils files + +file(GLOB_RECURSE vulkan_runtime_utils_cpp ${RUNTIME_PATH}/utils/*.cpp) + # vulkan_backend +# Try to find boost to log stack traces when throwing exceptions +find_package(Boost 1.89 COMPONENTS stacktrace_basic stacktrace_addr2line) + file(GLOB vulkan_backend_cpp ${RUNTIME_PATH}/*.cpp) list(APPEND vulkan_backend_cpp ${vulkan_graph_cpp}) list(APPEND vulkan_backend_cpp ${vulkan_standard_shaders_cpp}) +list(APPEND vulkan_backend_cpp ${vulkan_runtime_utils_cpp}) 
add_library(vulkan_backend ${vulkan_backend_cpp}) target_include_directories( vulkan_backend PRIVATE ${SCHEMA_INCLUDE_DIR} ${COMMON_INCLUDES} ) target_link_libraries(vulkan_backend PRIVATE vulkan_schema executorch_core) +# Optionally link boost for stacktraces if boost is available +if(DEFINED Boost_STACKTRACE_BASIC_LIBRARY) + target_link_libraries( + vulkan_backend PRIVATE ${Boost_STACKTRACE_LIBRARY} + ${Boost_STACKTRACE_ADDR2LINE_LIBRARY} + ) + list(APPEND VULKAN_CXX_FLAGS "-DETVK_BOOST_STACKTRACE_AVAILABLE") +endif() target_compile_options(vulkan_backend PRIVATE ${VULKAN_CXX_FLAGS}) # Link this library with --whole-archive due to dynamic backend registration executorch_target_link_options_shared_lib(vulkan_backend) @@ -127,7 +143,7 @@ set_property(TARGET vulkan_backend PROPERTY CXX_STANDARD 17) install( TARGETS vulkan_backend vulkan_schema EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${COMMON_INCLUDES} ) diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md index e0a953d05fe..63a9b0b049a 100644 --- a/backends/vulkan/README.md +++ b/backends/vulkan/README.md @@ -150,7 +150,7 @@ when building with CMake. First, make sure that you have the Android NDK installed; any NDK version past NDK r19c should work. Note that the examples in this doc have been validated with -NDK r27b. The Android SDK should also be installed so that you have access to `adb`. +NDK r28c. The Android SDK should also be installed so that you have access to `adb`. The instructions in this page assumes that the following environment variables are set. 
diff --git a/backends/vulkan/_passes/TARGETS b/backends/vulkan/_passes/TARGETS index aed41114ada..ae1a0b79654 100644 --- a/backends/vulkan/_passes/TARGETS +++ b/backends/vulkan/_passes/TARGETS @@ -117,6 +117,19 @@ runtime.python_library( ], ) +runtime.python_library( + name = "replace_qdq", + srcs = ["replace_qdq.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/vulkan:utils_lib", + "//executorch/exir:pass_base", + ], +) + runtime.python_library( name = "fuse_patterns", srcs = ["fuse_patterns.py"], @@ -150,6 +163,7 @@ runtime.python_library( ":remove_asserts", ":remove_local_scalar_dense", ":remove_redundant_ops", + ":replace_qdq", ":squeeze_unsqueeze_inputs", ":tag_memory_meta_pass", ] diff --git a/backends/vulkan/_passes/__init__.py b/backends/vulkan/_passes/__init__.py index f4ef6b2ac0e..169bd60543c 100644 --- a/backends/vulkan/_passes/__init__.py +++ b/backends/vulkan/_passes/__init__.py @@ -22,6 +22,7 @@ from executorch.backends.vulkan._passes.remove_redundant_ops import ( RemoveRedundantOpsTransform, ) +from executorch.backends.vulkan._passes.replace_qdq import ReplaceQDQPass from executorch.backends.vulkan._passes.squeeze_unsqueeze_inputs import ( SqueezeUnsqueezeInputs, ) @@ -36,6 +37,7 @@ "RemoveAssertsTransform", "RemoveLocalScalarDenseOpsTransform", "RemoveRedundantOpsTransform", + "ReplaceQDQPass", "SqueezeUnsqueezeInputs", "TagMemoryMetaPass", ] diff --git a/backends/vulkan/_passes/fold_qdq.py b/backends/vulkan/_passes/fold_qdq.py index 3beccc2205c..a6a5e751c05 100644 --- a/backends/vulkan/_passes/fold_qdq.py +++ b/backends/vulkan/_passes/fold_qdq.py @@ -17,9 +17,8 @@ class FoldQDQPass(ExportPass): valid quant op patterns have already been fused before this pass. 
""" - def __init__(self, edge_program: torch.export.ExportedProgram): - super(FoldQDQPass, self).__init__() - self.edge_program = edge_program + def __init__(self): + super().__init__() def call(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: diff --git a/backends/vulkan/_passes/fuse_patterns.py b/backends/vulkan/_passes/fuse_patterns.py index 6ced1f32a7c..1575dd6a4f6 100644 --- a/backends/vulkan/_passes/fuse_patterns.py +++ b/backends/vulkan/_passes/fuse_patterns.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Optional + import executorch.backends.vulkan.patterns as vk_patterns import torch @@ -13,13 +15,15 @@ class FusePatternsPass(ExportPass): - def __init__(self, exported_program: ExportedProgram) -> None: + def __init__(self) -> None: super().__init__() - self.program = exported_program + self._exported_program: Optional[ExportedProgram] = None def call(self, graph_module: torch.fx.GraphModule): + assert self._exported_program is not None + total_replaced = vk_patterns.replace_all_fusable_subgraphs( - self.program, graph_module + self._exported_program, graph_module ) if total_replaced > 0: diff --git a/backends/vulkan/_passes/fuse_quantized_ops.py b/backends/vulkan/_passes/fuse_quantized_ops.py index ca9f7541159..bb8cf5f2e64 100644 --- a/backends/vulkan/_passes/fuse_quantized_ops.py +++ b/backends/vulkan/_passes/fuse_quantized_ops.py @@ -211,18 +211,20 @@ def fuse_into_linear_qcnw_node( class FuseQuantizedOpsTransform(ExportPass): - def __init__(self, exported_program: ExportedProgram) -> None: + def __init__(self) -> None: super().__init__() - self.program = exported_program + self._exported_program: Optional[ExportedProgram] = None def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + assert self._exported_program is not None + for node in graph_module.graph.nodes: # Check for linear_qcnw 
pattern (weight-only quantization) - qcnw_details = matches_linear_qcnw_pattern(self.program, node) + qcnw_details = matches_linear_qcnw_pattern(self._exported_program, node) if qcnw_details is not None: qcnw_method, qcnw_nbits = qcnw_details fuse_into_linear_qcnw_node( - self.program, graph_module, node, qcnw_method, qcnw_nbits + self._exported_program, graph_module, node, qcnw_method, qcnw_nbits ) continue diff --git a/backends/vulkan/_passes/replace_qdq.py b/backends/vulkan/_passes/replace_qdq.py new file mode 100644 index 00000000000..fcfcdfc4c18 --- /dev/null +++ b/backends/vulkan/_passes/replace_qdq.py @@ -0,0 +1,94 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import executorch.backends.vulkan.utils as utils +import torch +from executorch.exir.dialects._ops import ops as exir_ops + +from executorch.exir.pass_base import ExportPass, PassResult + + +class ReplaceQDQPass(ExportPass): + """ + Replace standard quantize/dequantize ops with custom conv-specific ops when they + feed into/from quantized convolution operations. This optimization allows the + backend to handle quantization more efficiently for convolution operations. 
+ """ + + def __init__(self): + super(ReplaceQDQPass, self).__init__() + + def call(self, graph_module: torch.fx.GraphModule): + # Track nodes that need to be replaced + nodes_to_replace = [] + + for node in graph_module.graph.nodes: + # Check if this is the custom quantized conv2d op + if node.target in [ + exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to.default, + exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to_dw.default, + exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default, + ]: + # Replace quantize op feeding into conv2d (first argument is the quantized input) + quantized_input_node = node.args[0] + if isinstance( + quantized_input_node, torch.fx.Node + ) and utils.is_quant_node(quantized_input_node): + # Get the arguments from the original quantize node + input_tensor = quantized_input_node.args[0] + scale = quantized_input_node.args[1] + zero_point = quantized_input_node.args[2] + + nodes_to_replace.append( + { + "old_node": quantized_input_node, + "new_target": exir_ops.edge.et_vk.quantize_q8ta_for_conv2d.default, + "args": (input_tensor, scale, zero_point), + "node_type": "quantize_input", + } + ) + + # Find dequantize ops that consume the output of this conv2d + for user in node.users: + if utils.is_dequant_node(user): + # Get the arguments from the original dequantize node + scale = user.args[1] + zero_point = user.args[2] + + nodes_to_replace.append( + { + "old_node": user, + "new_target": exir_ops.edge.et_vk.dequantize_q8to_from_conv2d.default, + "args": ( + node, + scale, + zero_point, + ), # node is the conv2d output + "node_type": "dequantize_output", + } + ) + + # Apply the replacements + for replacement in nodes_to_replace: + old_node = replacement["old_node"] + new_target = replacement["new_target"] + new_args = replacement["args"] + + with graph_module.graph.inserting_before(old_node): + new_node = graph_module.graph.create_node( + "call_function", new_target, args=new_args + ) + new_node.meta = old_node.meta.copy() + old_node.replace_all_uses_with(new_node) 
+ + # Clean up the graph + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + + # Re-trace to validate everything is ok + graph_module = super().call(graph_module).graph_module + + return PassResult(graph_module, True) diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py index db53cc666a8..8ed71aa1dae 100644 --- a/backends/vulkan/_passes/tag_memory_meta_pass.py +++ b/backends/vulkan/_passes/tag_memory_meta_pass.py @@ -230,6 +230,10 @@ def get_arg_tensor_source_repset( """ arg_node = op_node.args[arg_i] + # For non-tensor arguments, return ANY_STORAGE + if not utils.is_tensor_arg_node(arg_node): + return utils.ANY_STORAGE + # Special case for cat - use the first tensor in the list as representative if isinstance(arg_node, list): arg_node = arg_node[0] diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake index 1b6838c4dfd..16a60abf6f3 100644 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ b/backends/vulkan/cmake/ShaderLibrary.cmake @@ -24,22 +24,17 @@ if(NOT EXECUTORCH_ROOT) message("WARNING: EXECUTORCH_ROOT is not set! A failure is likely imminent.") endif() -if(ANDROID) - if(NOT ANDROID_NDK) - message(FATAL_ERROR "ANDROID_NDK not set") - endif() - - if(NOT GLSLC_PATH) - set(GLSLC_PATH - "${ANDROID_NDK}/shader-tools/${ANDROID_NDK_HOST_SYSTEM_NAME}/glslc" - ) - endif() -else() - find_program(GLSLC_PATH glslc PATHS $ENV{PATH}) +find_program(GLSLC_PATH glslc PATHS $ENV{PATH}) - if(NOT GLSLC_PATH) - message(FATAL_ERROR "USE_VULKAN glslc not found") - endif() +if(NOT GLSLC_PATH) + message( + FATAL_ERROR + "glslc from the Vulkan SDK must be installed to build the Vulkan backend. " + "Please install the Vulkan SDK 1.4.321.0 or newer from " + "https://vulkan.lunarg.com/sdk/home and ensure that the glslc binary is in your PATH. 
" + "Note that the glslc distributed with the Android NDK is not compatible since it " + "does not support the GL_EXT_integer_dot_product extension. " + ) endif() # Required to enable linking with --whole-archive diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py index 56e803b9127..6e5aa926d37 100644 --- a/backends/vulkan/custom_ops_lib.py +++ b/backends/vulkan/custom_ops_lib.py @@ -354,18 +354,20 @@ def linear_q8ta_q8csw( lib.impl(name, linear_q8ta_q8csw, "CompositeExplicitAutograd") qa_q8csw_linear = getattr(getattr(torch.ops, namespace), name) -####################### -## conv2d_q8ta_q8csw ## -####################### +############################ +## conv2d_q8ta_q8csw_q8to ## +############################ -def conv2d_q8ta_q8csw( +def conv2d_q8ta_q8csw_q8to( x: torch.Tensor, input_scale: float, input_zero_point: int, weights: torch.Tensor, weight_sums: torch.Tensor, weight_scales: torch.Tensor, + output_scale: float, + output_zero_point: int, bias: Optional[torch.Tensor], kernel_size: list, stride: list, @@ -373,27 +375,103 @@ def conv2d_q8ta_q8csw( dilation: list, groups: int, ): - IC = x.shape[1] + x = torch.ops.quantized_decomposed.dequantize_per_tensor( + x, input_scale, input_zero_point, -128, 127, x.dtype + ) + + # Calculate weight dimensions + OC = weights.shape[0] + assert OC % groups == 0, "Output channels must be divisible by groups" + IC_per_group = int(x.shape[1] / groups) K_h, K_w = kernel_size[0], kernel_size[1] - canonical_weight_K_dim = K_h * K_w * IC + orig_weight_K_dim = K_h * K_w * IC_per_group + # Remove any padding added to in_features dim to align to a multiple of 4 + if weights.shape[-1] > orig_weight_K_dim: + weights = weights[:, :orig_weight_K_dim] + # Remove any padding added to output channels dim to align to a multiple of 4 - if weights.shape[-1] != canonical_weight_K_dim: - weights = weights[:, :canonical_weight_K_dim] - weight_scales = weight_scales[:canonical_weight_K_dim] + if weight_scales.shape[0] 
> OC: + weight_scales = weight_scales[:OC] if bias is not None: - bias = bias[:canonical_weight_K_dim] + bias = bias[:OC] + + # Reshape to original 4D format (OC, IC, H, W) + weights = weights.view(OC, IC_per_group, K_h, K_w) weight_zeros = torch.zeros_like(weight_scales, dtype=torch.int32) + # Dequantize weights + weights = torch.ops.quantized_decomposed.dequantize_per_channel( + weights, + weight_scales, + weight_zeros, + 0, # axis=0 for output channel quantization + -127, + 127, + torch.int8, + ) - # Calculate dimensions - OC = weights.shape[0] - in_features = weights.shape[1] - IC = in_features // (K_h * K_w) + # Perform convolution + out = torch.nn.functional.conv2d( + x, weights, bias, stride, padding, dilation, groups + ) - # Reshape to original 4D format (OC, IC, H, W) - weights = weights.view(OC, IC, K_h, K_w) + out = torch.ops.quantized_decomposed.quantize_per_tensor( + out, output_scale, output_zero_point, -128, 127, torch.int8 + ) + return out + + +name = "conv2d_q8ta_q8csw_q8to" +lib.define( + f""" + {name}( + Tensor x, + float input_scale, + int input_zero_point, + Tensor weights, + Tensor weight_sums, + Tensor weight_scales, + float output_scale, + int output_zero_point, + Tensor? 
bias, + SymInt[] kernel_size, + SymInt[] stride, + SymInt[] padding, + SymInt[] dilation, + SymInt groups) -> Tensor + """ +) +lib.impl(name, conv2d_q8ta_q8csw_q8to, "CompositeExplicitAutograd") +conv2d_q8ta_q8csw_op = getattr(getattr(torch.ops, namespace), name) + + +def conv2d_q8ta_q8csw_q8to_dw( + x: torch.Tensor, + input_scale: float, + input_zero_point: int, + weights: torch.Tensor, + weight_sums: torch.Tensor, + weight_scales: torch.Tensor, + output_scale: float, + output_zero_point: int, + bias: Optional[torch.Tensor], + kernel_size: list, + stride: list, + padding: list, + dilation: list, + groups: int, +): + x = torch.ops.quantized_decomposed.dequantize_per_tensor( + x, input_scale, input_zero_point, -128, 127, x.dtype + ) + + # Restore weight to original data layout + K_h, K_w, OC = weights.shape + weights = weights.permute(2, 0, 1).reshape(OC, 1, K_h, K_w) + + weight_zeros = torch.zeros_like(weight_scales, dtype=torch.int32) # Dequantize weights weights = torch.ops.quantized_decomposed.dequantize_per_channel( weights, @@ -410,10 +488,14 @@ def conv2d_q8ta_q8csw( x, weights, bias, stride, padding, dilation, groups ) + out = torch.ops.quantized_decomposed.quantize_per_tensor( + out, output_scale, output_zero_point, -128, 127, torch.int8 + ) + return out -name = "conv2d_q8ta_q8csw" +name = "conv2d_q8ta_q8csw_q8to_dw" lib.define( f""" {name}( @@ -423,6 +505,8 @@ def conv2d_q8ta_q8csw( Tensor weights, Tensor weight_sums, Tensor weight_scales, + float output_scale, + int output_zero_point, Tensor? 
bias, SymInt[] kernel_size, SymInt[] stride, @@ -431,8 +515,8 @@ def conv2d_q8ta_q8csw( SymInt groups) -> Tensor """ ) -lib.impl(name, conv2d_q8ta_q8csw, "CompositeExplicitAutograd") -conv2d_q8ta_q8csw_op = getattr(getattr(torch.ops, namespace), name) +lib.impl(name, conv2d_q8ta_q8csw_q8to_dw, "CompositeExplicitAutograd") +conv2d_q8ta_q8csw_dw_op = getattr(getattr(torch.ops, namespace), name) ###################### ## apply_rotary_emb ## @@ -452,3 +536,81 @@ def apply_rotary_emb_impl( ) lib.impl(name, apply_rotary_emb_impl, "CompositeExplicitAutograd") apply_rotary_emb_op = getattr(getattr(torch.ops, namespace), name) + +############################# +## quantize/dequantize ops ## +############################# + + +def quantize_q8ta_for_conv2d_impl( + input: torch.Tensor, + scale: float, + zero_point: int, +): + return torch.ops.quantized_decomposed.quantize_per_tensor( + input, scale, zero_point, -128, 127, torch.int8 + ) + + +name = "quantize_q8ta_for_conv2d" +lib.define(f"{name}(Tensor input, float scale, int zero_point) -> Tensor") +lib.impl(name, quantize_q8ta_for_conv2d_impl, "CompositeExplicitAutograd") +quantize_q8ta_for_conv2d_op = getattr(getattr(torch.ops, namespace), name) + + +def dequantize_q8to_from_conv2d_impl( + input: torch.Tensor, + scale: float, + zero_point: int, +): + return torch.ops.quantized_decomposed.dequantize_per_tensor( + input, scale, zero_point, -128, 127, input.dtype + ) + + +name = "dequantize_q8to_from_conv2d" +lib.define(f"{name}(Tensor input, float scale, int zero_point) -> Tensor") +lib.impl(name, dequantize_q8to_from_conv2d_impl, "CompositeExplicitAutograd") +dequantize_q8to_from_conv2d_op = getattr(getattr(torch.ops, namespace), name) + +######################## +## add_q8ta_q8ta_q8to ## +######################## + + +def add_q8ta_q8ta_q8to_impl( + input_a: torch.Tensor, + input_b: torch.Tensor, + input_a_scale: float, + input_a_zero_point: int, + input_b_scale: float, + input_b_zero_point: int, + output_scale: float, + 
output_zero_point: int, + alpha: float, +): + # Dequantize inputs to float + dequant_a = torch.ops.quantized_decomposed.dequantize_per_tensor( + input_a, input_a_scale, input_a_zero_point, -128, 127, input_a.dtype + ) + dequant_b = torch.ops.quantized_decomposed.dequantize_per_tensor( + input_b, input_b_scale, input_b_zero_point, -128, 127, input_b.dtype + ) + + # Perform addition with alpha scaling + result = dequant_a + alpha * dequant_b + + # Quantize the result back to int8 + quantized_result = torch.ops.quantized_decomposed.quantize_per_tensor( + result, output_scale, output_zero_point, -128, 127, torch.int8 + ) + + return quantized_result + + +name = "add_q8ta_q8ta_q8to" +lib.define( + f"{name}(Tensor input_a, Tensor input_b, float input_a_scale, int input_a_zero_point, float input_b_scale, int input_b_zero_point, float output_scale, int output_zero_point, float alpha) -> Tensor" +) +lib.impl(name, add_q8ta_q8ta_q8to_impl, "CompositeExplicitAutograd") +add_q8ta_q8ta_q8to_op = getattr(getattr(torch.ops, namespace), name) diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index 4c686e0cfc5..63b57a0e79c 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -16,8 +16,6 @@ import torch -from executorch.backends.vulkan.serialization.vulkan_graph_schema import VkMemoryLayout - from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload @@ -48,6 +46,9 @@ class OpFeatures: # Optional check function used during partitioning to determine if a node's # inputs are supported by the operator implementation. "are_node_inputs_supported_fn", + # Optional function to determine valid representation sets for input and outputs + # once a node's actual inputs are known. 
+ "pick_io_storage_fn", ] def __init__( @@ -61,6 +62,7 @@ def __init__( supports_resize: bool = False, supports_prepacking: bool = False, are_node_inputs_supported_fn: Optional[Callable] = allow_node, + pick_io_storage_fn: Optional[Callable] = None, ): self.inputs_storage: utils.TensorRepSetList = utils.TensorRepSetList( inputs_storage if inputs_storage is not None else [] @@ -77,15 +79,21 @@ def __init__( self.supports_prepacking = supports_prepacking self.are_node_inputs_supported_fn = are_node_inputs_supported_fn + self.pick_io_storage_fn = pick_io_storage_fn def make_op_repsets( self, op_node: torch.fx.Node, texture_limits: utils.ImageExtents = utils.DEFAULT_TEXTURE_LIMITS, ) -> utils.OpRepSets: - return utils.OpRepSets( - self.inputs_storage, self.outputs_storage, op_node, texture_limits - ) + inputs_storage = self.inputs_storage + outputs_storage = self.outputs_storage + if self.pick_io_storage_fn is not None: + i_storage, o_storage = self.pick_io_storage_fn(op_node) + inputs_storage = utils.TensorRepSetList(i_storage) + outputs_storage = utils.TensorRepSetList(o_storage) + + return utils.OpRepSets(inputs_storage, outputs_storage, op_node, texture_limits) ####################### @@ -410,28 +418,16 @@ def register_softmax_op(): ) def register_reduce_op(): def check_reduce_node(node: torch.fx.Node) -> bool: + # Only one argument implies that the reduction is over the entire tensor, which + # is not supported yet. + if len(node.args) == 1: + return False + dim_list = node.args[1] + # Only 1D and 2D reductions are supported at the moment. 
if isinstance(dim_list, list) and len(dim_list) > 2: return False - if isinstance(dim_list, list) and len(dim_list) == 2: - # Try to get the memory layout for this node - try: - memory_layout = utils.get_node_memory_layout(node) - - # If we have memory layout information, check if any dimension in dim_list corresponds to a packed dimension - if ( - memory_layout is not None - and memory_layout != VkMemoryLayout.DEFAULT_LAYOUT - ): - # For now only default layout is supported for 2D reduction. - # Because we can't determine if the input is NCHW or NHWC here, - # assume the reduction dimension is packed so we cannot support it. - return False - except (AssertionError, KeyError, AttributeError): - # If we can't get memory layout information, we'll assume the dims aren't packed - pass - def try_find_keepdim_arg(node: torch.fx.Node) -> bool: for arg in node.args: if isinstance(arg, bool): @@ -446,10 +442,41 @@ def try_find_keepdim_arg(node: torch.fx.Node) -> bool: return True + def pick_io_storage_for_reduce(node: torch.fx.Node): + inputs_storage = utils.ANY_TEXTURE + outputs_storage = utils.ANY_TEXTURE + + input_tensor = node.args[0] + ndim = input_tensor.meta["val"].ndim + dim_list = node.args[1] + if isinstance(dim_list, list) and len(dim_list) == 2: + reduce_dim1_whcn = utils.nchw_dim_to_whcn_dim(dim_list[0], ndim) + reduce_dim2_whcn = utils.nchw_dim_to_whcn_dim(dim_list[1], ndim) + + possible_packed_dims = {0, 1, 2} + possible_packed_dims.discard(reduce_dim1_whcn) + possible_packed_dims.discard(reduce_dim2_whcn) + + packed_dim = possible_packed_dims.pop() + assert packed_dim in [0, 1, 2] + + if packed_dim == 0: + inputs_storage = utils.WIDTH_PACKED_TEXTURE + outputs_storage = utils.WIDTH_PACKED_TEXTURE + elif packed_dim == 1: + inputs_storage = utils.HEIGHT_PACKED_TEXTURE + outputs_storage = utils.HEIGHT_PACKED_TEXTURE + else: + inputs_storage = utils.CHANNELS_PACKED_TEXTURE + outputs_storage = utils.CHANNELS_PACKED_TEXTURE + + return inputs_storage, 
outputs_storage + return OpFeatures( inputs_storage=utils.ANY_TEXTURE, supports_resize=True, are_node_inputs_supported_fn=check_reduce_node, + pick_io_storage_fn=pick_io_storage_for_reduce, ) @@ -474,6 +501,23 @@ def register_2d_pool_op(): ] ) def register_convolution_op(): + def check_conv_node(node: torch.fx.Node) -> bool: + x = node.args[0] + x_shape = x.meta["val"].size() + # 4-D input implies 2D convolution + if len(x_shape) == 4: + batches = x.meta["val"].size()[0] + if batches != 1: + return False + # 3-D input implies 1D convolution + if len(x_shape) == 3: + transpose = node.args[6] + # Transposed 1D convolution is not supported yet + if transpose: + return False + + return True + return OpFeatures( inputs_storage=[ utils.CHANNELS_PACKED_TEXTURE, # input @@ -490,23 +534,27 @@ def register_convolution_op(): ], supports_resize=True, supports_prepacking=True, + are_node_inputs_supported_fn=check_conv_node, ) @update_features( [ - exir_ops.edge.et_vk.conv2d_q8ta_q8csw.default, + exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to.default, + exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to_dw.default, ] ) def register_quantized_conv_op(): return OpFeatures( inputs_storage=[ - utils.CHANNELS_PACKED_TEXTURE, # input + utils.PACKED_INT8_4W4C_BUFFER, # input utils.NO_STORAGE, # input_scale (non tensor) utils.NO_STORAGE, # input_zero_point (non tensor) utils.NO_STORAGE, # weight (prepacked) utils.NO_STORAGE, # weight_sums (prepacked) utils.NO_STORAGE, # weight_scales (prepacked) + utils.NO_STORAGE, # output_scale (non tensor) + utils.NO_STORAGE, # output_zero_point (non tensor) utils.NO_STORAGE, # bias (prepacked) utils.NO_STORAGE, # kernel_size (non tensor) utils.NO_STORAGE, # stride (non tensor) @@ -520,6 +568,53 @@ def register_quantized_conv_op(): ) +@update_features( + [ + exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default, + ] +) +def register_quantized_binary_op(): + return OpFeatures( + inputs_storage=utils.PACKED_INT8_4W4C_BUFFER, + supports_resize=False, + 
supports_prepacking=True, + ) + + +@update_features( + [ + exir_ops.edge.et_vk.quantize_q8ta_for_conv2d.default, + ] +) +def register_quantize_for_conv2d_op(): + return OpFeatures( + inputs_storage=[ + utils.CHANNELS_PACKED_TEXTURE, + ], + outputs_storage=[ + utils.PACKED_INT8_4W4C_BUFFER, + ], + supports_resize=False, + ) + + +@update_features( + [ + exir_ops.edge.et_vk.dequantize_q8to_from_conv2d.default, + ] +) +def register_dequantize_for_conv2d_op(): + return OpFeatures( + inputs_storage=[ + utils.PACKED_INT8_4W4C_BUFFER, + ], + outputs_storage=[ + utils.CHANNELS_PACKED_TEXTURE, + ], + supports_resize=False, + ) + + @update_features("llama::sdpa_with_kv_cache") def register_sdpa_with_kv_cache_op(): return OpFeatures( @@ -666,6 +761,7 @@ def register_ported_ops_with_prepacking(): return OpFeatures( inputs_storage=utils.CHANNELS_PACKED_TEXTURE, supports_prepacking=True, + supports_resize=True, ) @@ -696,6 +792,7 @@ def register_ported_ops_with_prepacking_all_dims(): return OpFeatures( inputs_storage=utils.ANY_TEXTURE, supports_prepacking=True, + supports_resize=True, ) diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py index e5b2d0f7864..0bdc16616ef 100644 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ b/backends/vulkan/partitioner/vulkan_partitioner.py @@ -36,7 +36,7 @@ Partitioner, PartitionResult, ) -from executorch.exir.backend.utils import tag_constant_data +from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer from executorch.exir.dialects._ops import ops as exir_ops from torch.export.exported_program import ExportedProgram @@ -254,9 +254,10 @@ def _is_node_supported(self, node: torch.fx.Node) -> bool: # noqa: C901 self.log_skip(node, "permute node of non compatible linear node") return False - is_in_local_scalar_dense_chain, dst_node_is_compatible = ( - self.is_in_local_scalar_dense_chain(node) - ) + ( + is_in_local_scalar_dense_chain, + 
dst_node_is_compatible, + ) = self.is_in_local_scalar_dense_chain(node) if is_in_local_scalar_dense_chain and dst_node_is_compatible: return True elif is_in_local_scalar_dense_chain: @@ -419,6 +420,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: logger.info(f"Found {pl} Vulkan subgraphs to be partitioned.") tag_constant_data(exported_program) + tag_mutated_buffer(exported_program) return PartitionResult( tagged_exported_program=exported_program, partition_tags=partition_tags diff --git a/backends/vulkan/patterns/TARGETS b/backends/vulkan/patterns/TARGETS index 791edf58984..285efe2b933 100644 --- a/backends/vulkan/patterns/TARGETS +++ b/backends/vulkan/patterns/TARGETS @@ -11,6 +11,7 @@ runtime.python_library( "rope.py", "quantized_linear.py", "quantized_convolution.py", + "quantized_binary.py", ], visibility = [ "//executorch/backends/...", diff --git a/backends/vulkan/patterns/__init__.py b/backends/vulkan/patterns/__init__.py index 8ffad98b3c3..e23dfc7629c 100644 --- a/backends/vulkan/patterns/__init__.py +++ b/backends/vulkan/patterns/__init__.py @@ -6,6 +6,8 @@ from typing import List +import executorch.backends.vulkan.patterns.quantized_binary # noqa + import executorch.backends.vulkan.patterns.quantized_convolution # noqa import executorch.backends.vulkan.patterns.quantized_linear # noqa diff --git a/backends/vulkan/patterns/quantized_binary.py b/backends/vulkan/patterns/quantized_binary.py new file mode 100644 index 00000000000..da4985b931d --- /dev/null +++ b/backends/vulkan/patterns/quantized_binary.py @@ -0,0 +1,161 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Optional + +import executorch.backends.vulkan.utils as utils + +import torch + +from executorch.backends.vulkan.patterns.pattern_registry import ( + PatternMatch, + register_pattern_detector, + register_pattern_replacement, +) + +from executorch.exir import ExportedProgram +from executorch.exir.dialects._ops import ops as exir_ops + + +class QuantizedBinaryMatch(PatternMatch): + def __init__(self, binary_node: torch.fx.Node) -> None: + self.anchor_node = binary_node + self.match_found = False + self.all_nodes = [self.anchor_node] + + # Extract alpha parameter if it exists (for add operations) + self.alpha = 1.0 + if len(binary_node.args) > 2 and binary_node.args[2] is not None: + # Alpha is typically a scalar value + if isinstance(binary_node.args[2], (int, float)): + self.alpha = binary_node.args[2] + + # Identify input nodes - both should be dequantize nodes for static quantization + if len(binary_node.args) < 2: + return + + input_a_node = binary_node.args[0] + assert isinstance(input_a_node, torch.fx.Node) + input_b_node = binary_node.args[1] + assert isinstance(input_b_node, torch.fx.Node) + + # Both arguments must be dequant nodes for static quantization + if not utils.is_dequant_node(input_a_node) or not utils.is_dequant_node( + input_b_node + ): + return + + self.dequantize_input_a_node = input_a_node + self.dequantize_input_b_node = input_b_node + + # Extract quantization parameters for input A + self.quantize_input_a_node = self.dequantize_input_a_node.args[0] + self.input_a_scales_node = self.dequantize_input_a_node.args[1] + self.input_a_zeros_node = self.dequantize_input_a_node.args[2] + + # Extract quantization parameters for input B + self.quantize_input_b_node = self.dequantize_input_b_node.args[0] + self.input_b_scales_node = self.dequantize_input_b_node.args[1] + self.input_b_zeros_node = self.dequantize_input_b_node.args[2] + + self.all_nodes.extend( + [self.dequantize_input_a_node, self.dequantize_input_b_node] + ) + + # 
Identify output node + self.output_node = self.anchor_node + + # The binary operation output must have only one user; it will be either a relu node + # or a quantize node. + if len(self.output_node.users) != 1: + return + + cur_node = list(self.output_node.users)[0] + self.relu_node = None + if cur_node.target == exir_ops.edge.aten.relu.default: + self.relu_node = cur_node + self.all_nodes.append(self.relu_node) + # If there's a relu, get its user (should be the quantize node) + if len(cur_node.users) != 1: + return + cur_node = list(cur_node.users)[0] + + if not utils.is_quant_node(cur_node): + return + + self.quantize_output_node = cur_node + self.output_scales_node = self.quantize_output_node.args[1] + self.output_zeros_node = self.quantize_output_node.args[2] + + self.all_nodes.append(self.quantize_output_node) + + self.match_found = True + + +# Define the binary operation anchor nodes that we support +binary_anchor_nodes = { + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.add_.Tensor, +} + + +@register_pattern_detector("quantized_binary") +def find_quantized_binary_patterns( + node: torch.fx.Node, +) -> Optional[QuantizedBinaryMatch]: + if node.target not in binary_anchor_nodes: + return None + + matched_pattern = QuantizedBinaryMatch(node) + if matched_pattern.match_found: + return matched_pattern + + return None + + +## +## Pattern Replacement +## + + +@register_pattern_replacement("quantized_binary") +def make_add_q8ta_q8ta_q8to_custom_op( + ep: ExportedProgram, + graph_module: torch.fx.GraphModule, + match: QuantizedBinaryMatch, +): + # Determine the operation type based on the anchor node + op_target = None + if match.anchor_node.target in { + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.add_.Tensor, + }: + op_target = exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default + else: + # For future binary operations, add more mappings here + raise NotImplementedError( + f"Unsupported binary operation: {match.anchor_node.target}" + ) + + with 
graph_module.graph.inserting_before(match.output_node): + qbinary_node = graph_module.graph.create_node( + "call_function", + op_target, + args=( + match.quantize_input_a_node, + match.quantize_input_b_node, + match.input_a_scales_node, + match.input_a_zeros_node, + match.input_b_scales_node, + match.input_b_zeros_node, + match.output_scales_node, + match.output_zeros_node, + match.alpha, # Alpha parameter for scaling + ), + ) + + qbinary_node.meta["val"] = match.output_node.meta["val"] + match.quantize_output_node.replace_all_uses_with(qbinary_node) diff --git a/backends/vulkan/patterns/quantized_convolution.py b/backends/vulkan/patterns/quantized_convolution.py index 65b51b5e103..522a19c58d6 100644 --- a/backends/vulkan/patterns/quantized_convolution.py +++ b/backends/vulkan/patterns/quantized_convolution.py @@ -76,11 +76,13 @@ def __init__(self, conv_node: torch.fx.Node) -> None: # Identify output node self.output_node = self.anchor_node - out_channels = self.output_node.meta["val"].shape[-1] - # The implementation requires that for grouped convolutions, a group does not - # cross any texel boundary. The output channels per group must be a multiple of - # 4. If this is not true, then don't match the pattern. - if self.groups > 1 and (out_channels / self.groups) % 4 == 0: + out_channels = self.output_node.meta["val"].shape[-3] + # The implementation requires that for non-depthwise grouped convolutions, a + # group does not cross the texel boundary. The output channels per group must be + # a multiple of 4. If this is not true, then don't match the pattern. 
+ if (self.groups > 1 and self.groups < out_channels) and ( + out_channels / self.groups + ) % 4 != 0: return # Identify bias node, if applicable @@ -93,23 +95,37 @@ def __init__(self, conv_node: torch.fx.Node) -> None: self.all_nodes.extend(arg_chain) # Identify input node - self.fp_input_node, self.quantize_input_node, dq_node = ( - utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0]) - ) - assert self.fp_input_node is not None - self.all_nodes.append(self.fp_input_node) - assert self.quantize_input_node is not None - assert dq_node is not None - - self.input_scales_node = self.quantize_input_node.args[1] - self.input_zeros_node = self.quantize_input_node.args[2] - - self.all_nodes.extend( - [ - self.quantize_input_node, - dq_node, - ] - ) + primary_input_node = self.anchor_node.args[0] + assert isinstance(primary_input_node, torch.fx.Node) + # Argument must be a dequant node for static quantization + if not utils.is_dequant_node(primary_input_node): + return + + self.dequantize_input_node = primary_input_node + self.quantize_input_node = self.dequantize_input_node.args[0] + + self.input_scales_node = self.dequantize_input_node.args[1] + self.input_zeros_node = self.dequantize_input_node.args[2] + + self.all_nodes.extend([self.dequantize_input_node]) + + # The convolution output must have only one user; it will be either a relu node + # or a dequantize node. 
+ if len(self.output_node.users) != 1: + return + + cur_node = list(self.output_node.users)[0] + self.relu_node = None + if cur_node.target == exir_ops.edge.aten.relu.default: + self.relu_node = cur_node + cur_node = list(cur_node.users)[0] + + if not utils.is_quant_node(cur_node): + return + + self.quantize_output_node = cur_node + self.output_scales_node = self.quantize_output_node.args[1] + self.output_zeros_node = self.quantize_output_node.args[2] self.match_found = True @@ -161,13 +177,26 @@ def make_conv2d_q8ta_q8csw_custom_op( bias_tensor = get_param_tensor(ep, match.bias_node) assert bias_tensor is not None - OC, IC, H, W = weight_tensor.shape + OC, IC_per_group, H, W = weight_tensor.shape - # Reshape weight tensor from (OC, IC, H, W) to (OC, H * W * IC) (i.e. matrix format) - # This prepares the weights for Im2Col-based convolution - weight_tensor = ( - weight_tensor.permute(0, 2, 3, 1).contiguous().view(OC, H * W * IC).contiguous() - ) + is_depthwise_conv = IC_per_group == 1 and match.groups == OC + + if is_depthwise_conv: + assert OC % 4 == 0, "depthwise conv requires that OC is divisible by 4" + # Depthwise convs use a specialized layout; the weight tensor is reshaped to + # (H, W, OC) + weight_tensor = ( + weight_tensor.permute(2, 3, 1, 0).contiguous().view(H, W, OC).contiguous() + ) + else: + # Reshape weight tensor from (OC, IC_per_group, H, W) to (OC, H * W * IC_per_group) + # (i.e. matrix format). This prepares the weights for Im2Col-based convolution. + weight_tensor = ( + weight_tensor.permute(0, 2, 3, 1) + .contiguous() + .view(OC, H * W * IC_per_group) + .contiguous() + ) # Need to make sure that OC dim is a multiple of 4 so that data load/stores are well # aligned with texel boundaries. 
Add padding to align to the next multiple of 4 if @@ -178,6 +207,7 @@ def make_conv2d_q8ta_q8csw_custom_op( utils.align_width_and_update_state_dict( ep, match.weight_scales_node, weight_scales_tensor ) + if bias_tensor is not None: utils.align_width_and_update_state_dict(ep, match.bias_node, bias_tensor) @@ -185,7 +215,7 @@ def make_conv2d_q8ta_q8csw_custom_op( with graph_module.graph.inserting_before(first_graph_node): qweight_tensor_name = utils.get_tensor_name(ep, match.weight_node) # Pre-compute the weight sums which are needed to apply activation zero point - # when using integer accumulation. For the reshaped 2D weight matrix (IC * H * W, OC), + # when using integer accumulation. For the reshaped 2D weight matrix (IC_per_group * H * W, OC), # sum over dimension 0 to get sums per output channel sum_per_output_channel = weight_tensor.sum(dim=1).to(torch.int32).contiguous() sums_name = qweight_tensor_name + "_sums" @@ -201,16 +231,22 @@ def make_conv2d_q8ta_q8csw_custom_op( ) with graph_module.graph.inserting_before(match.output_node): + op_target = exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to.default + if is_depthwise_conv: + op_target = exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to_dw.default + qconv_node = graph_module.graph.create_node( "call_function", - exir_ops.edge.et_vk.conv2d_q8ta_q8csw.default, + op_target, args=( - match.fp_input_node, + match.quantize_input_node, match.input_scales_node, match.input_zeros_node, match.weight_node, weight_sums_node, match.weight_scales_node, + match.output_scales_node, + match.output_zeros_node, match.bias_node, # Add bias after weight_scales [H, W], # Pass kernel size information before stride match.stride, @@ -221,4 +257,4 @@ def make_conv2d_q8ta_q8csw_custom_op( ) qconv_node.meta["val"] = match.output_node.meta["val"] - match.output_node.replace_all_uses_with(qconv_node) + match.quantize_output_node.replace_all_uses_with(qconv_node) diff --git a/backends/vulkan/patterns/quantized_linear.py 
b/backends/vulkan/patterns/quantized_linear.py index 882d0d41e6d..374e29c634d 100644 --- a/backends/vulkan/patterns/quantized_linear.py +++ b/backends/vulkan/patterns/quantized_linear.py @@ -92,9 +92,11 @@ def __init__(self, mm_node: torch.fx.Node) -> None: return # Identify input node - self.fp_input_node, self.quantize_input_node, dq_node = ( - utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0]) - ) + ( + self.fp_input_node, + self.quantize_input_node, + dq_node, + ) = utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0]) assert self.fp_input_node is not None self.all_nodes.append(self.fp_input_node) @@ -386,7 +388,7 @@ def make_linear_dq8ca_q4gsw_op( weight_sums_node = create_constant_placeholder( exp_program=ep, graph=graph_module.graph, - kind=InputKind.CONSTANT_TENSOR, + kind=InputKind.PARAMETER, name=sums_name, data=sum_per_quant_group, ) @@ -429,7 +431,7 @@ def make_linear_q8ta_q8csw_custom_op( weight_sums_node = create_constant_placeholder( exp_program=ep, graph=graph_module.graph, - kind=InputKind.CONSTANT_TENSOR, + kind=InputKind.PARAMETER, name=sums_name, data=sum_per_output_channel, ) diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 67b646ae1a8..fe8cc83c481 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -139,6 +139,10 @@ utils::GPUMemoryLayout get_memory_layout( return utils::kHeightPacked; case vkgraph::VkMemoryLayout::TENSOR_CHANNELS_PACKED: return utils::kChannelsPacked; + case vkgraph::VkMemoryLayout::PACKED_INT8_4W4C: + return utils::kPackedInt8_4W4C; + case vkgraph::VkMemoryLayout::PACKED_INT8_4H4W: + return utils::kPackedInt8_4H4W; default: break; } diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 433ae15db4e..d798b203673 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -14,6 
+14,21 @@ namespace vkcompute { namespace api { +/* + * For PackedInt8 memory layouts, ensure that the scalar type used for the + * tensor is kInt8x4. Otherwise, return the original scalar type. + */ +vkapi::ScalarType get_effective_scalar_type( + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout) { + vkapi::ScalarType effective_dtype = dtype; + if (utils::is_packed_int8_layout(memory_layout)) { + VK_CHECK_COND(dtype == vkapi::kInt8x4 || dtype == vkapi::kChar); + effective_dtype = vkapi::kInt8x4; + } + return effective_dtype; +} + /* * Used to infer the sizes of a tensor that would correspond to a given * VulkanImage. @@ -187,6 +202,7 @@ std::vector calculate_padded_sizes( utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim) { utils::uvec3 extents({1, 1, 1}); @@ -205,6 +221,28 @@ utils::uvec3 calculate_image_extents( extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); } + // For "regular" tensor dtypes, 4 elements along the packed dim are packed + // into one texel (4-component vectorized type). However, for packed int8 + // memory layouts, an additional level of packing is employed where 4 int8 + // elements are packed into one int32, and then 4 int32 are packed into each + // ivec4 texel. + if (utils::is_packed_int8_layout(memory_layout)) { + // Each int in the ivec4 contains 4 channels. The overall ivec4 contains + // data for a 1Hx4Wx4C block of the input tensor. + if (memory_layout == utils::kPackedInt8_4W4C) { + VK_CHECK_COND(packed_dim == 2); + extents[axis_map.at(0)] = utils::div_up(extents[axis_map.at(0)], 4u); + } + // Each int in the ivec4 contains 4 elements along the width dim. The + // overall ivec4 contains data for a 4Hx4W block of the input tensor. 
+ else if (memory_layout == utils::kPackedInt8_4H4W) { + VK_CHECK_COND(packed_dim == 0); + extents[axis_map.at(1)] = utils::div_up(extents[axis_map.at(1)], 4u); + } else { + VK_THROW("Unhandled packed int8 memory layout!"); + } + } + // axis_map[3] indicates the WHCN index of the dimension used for batch // concatenation. Thus a double lookup is required to determine the image axis // used for batch concatenation. @@ -215,6 +253,7 @@ utils::uvec3 calculate_image_extents( VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); extents[axis_map.at(packed_dim)] /= 4; + return extents; } @@ -247,35 +286,72 @@ utils::uvec3 calculate_logical_limits( */ utils::uvec3 calculate_logical_limits( const std::vector& sizes, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim) { return calculate_logical_limits( calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim), + calculate_padded_sizes(sizes, packed_dim), + memory_layout, + axis_map, + packed_dim), axis_map); } int64_t calculate_gpu_buffer_numel( + const std::vector& sizes, + const utils::GPUMemoryLayout memory_layout, + const vkapi::ScalarType dtype) { + size_t numel; + + // Mirrors the logic in calculate_image_extents for packed int8 memory layouts + if (dtype == vkapi::kInt8x4) { + VK_CHECK_COND(utils::is_packed_int8_layout(memory_layout)); + std::vector blocks_in_dim = + flip_and_unsqueeze(sizes, kTensorSizes, 0); + // Each ivec4 contains data for a 1Hx4Wx4C block of the input + if (memory_layout == utils::kPackedInt8_4W4C) { + blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); + blocks_in_dim[2] = utils::div_up_4(blocks_in_dim[2]); + } + // Each ivec4 contains data for a 4Hx4W block of the input + else if (memory_layout == utils::kPackedInt8_4H4W) { + blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); + blocks_in_dim[1] = utils::div_up_4(blocks_in_dim[1]); + } else { + VK_THROW("Unhandled packed int8 memory layout!"); + } + // 
Each block is represented as an ivec4, and the base dtype of the buffer + // is int. Therefore, need to multiply the number of blocks by 4 to obtain + // the number of int elements in the data buffer. + numel = utils::multiply_integers(blocks_in_dim) * 4; + } + // Case for "regular" dtypes/memory layouts + else { + numel = utils::multiply_integers(sizes); + + // For 8-bit types, align to the next multiple of 4. For devices that do not + // support 8-bit storage buffers, the tensor data will be interpreted as an + // array of int32 instead. + if (vkapi::element_size(dtype) == 1) { + numel = utils::align_up_4(numel); + } + } + return numel; +} + +int64_t calculate_staging_or_gpu_buffer_numel( Context* const context, const std::vector& sizes, const utils::uvec3 image_extents, const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, const vkapi::ScalarType dtype) { // For texture backed tensors, simply multiply the total number of texels by 4 if (storage_type != utils::kBuffer) { return image_extents[0] * image_extents[1] * image_extents[2] * 4; } - const bool is_int8 = dtype == vkapi::kChar; - const bool int8_supported = - context->adapter_ptr()->has_full_int8_buffers_support(); - const size_t numel = utils::multiply_integers(sizes); - // For int8 tensors, if the device does not support int8 buffers, then int32 - // is used instead to represent the buffer data. Therefore the number of - // elements in the buffer is aligned to the next multiple of 4. 
- if (is_int8 && int8_supported) { - return utils::align_up_4(numel); - } - return numel; + return calculate_gpu_buffer_numel(sizes, memory_layout, dtype); } template ::value>> @@ -332,10 +408,12 @@ vkapi::VulkanImage allocate_image( Context* const context_ptr, utils::uvec3& image_extents, const utils::StorageType storage_type, - const VkFormat image_format, + const vkapi::ScalarType dtype, const bool allocate_memory) { vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); + const VkFormat image_format = vkcompute::vkapi::to_vkformat(dtype); + vkapi::ImageSampler::Properties sampler_props{ VK_FILTER_NEAREST, VK_SAMPLER_MIPMAP_MODE_NEAREST, @@ -420,6 +498,7 @@ vkapi::VulkanBuffer allocate_buffer( vTensorStorage::vTensorStorage( Context* const context, const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim, const std::vector& sizes, @@ -429,20 +508,22 @@ vTensorStorage::vTensorStorage( storage_type_{storage_type}, image_extents_(calculate_image_extents( calculate_padded_sizes(sizes, packed_dim), + memory_layout, axis_map, packed_dim)), - buffer_length_{calculate_gpu_buffer_numel( + buffer_length_{calculate_staging_or_gpu_buffer_numel( context_, sizes, image_extents_, storage_type, + memory_layout, dtype)}, buffer_offset_{0}, image_(allocate_image( context_, image_extents_, storage_type_, - to_vkformat(dtype), + dtype, allocate_memory)), buffer_(allocate_buffer( context_, @@ -553,7 +634,7 @@ vTensor::vTensor( const utils::GPUMemoryLayout memory_layout, const bool allocate_memory, const utils::AxisMapLayout axis_map_layout) - : dtype_(dtype), + : dtype_(get_effective_scalar_type(dtype, memory_layout)), // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), packed_dim_(utils::to_packed_dim(memory_layout)), @@ -576,6 +657,7 @@ vTensor::vTensor( storage_(std::make_shared( context, storage_type, + memory_layout, axis_map_, packed_dim_, sizes, @@ -785,6 +867,16 @@ 
vkapi::VulkanBuffer& vTensor::buffer( } utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { + if (dtype_ == vkapi::kInt8x4) { + switch (packed_dim_) { + case WHCN::kChannelsDim: + return utils::kPackedInt8_4W4C; + case WHCN::kWidthDim: + return utils::kPackedInt8_4H4W; + default: + VK_THROW("Invalid packed dim for Tensor with kInt8x4 type"); + } + } switch (packed_dim_) { case WHCN::kWidthDim: return utils::kWidthPacked; @@ -914,8 +1006,8 @@ void vTensor::update_metadata() { flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_); uniform_data_->strides_v = flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_); - uniform_data_->logical_limits.limits = - calculate_logical_limits(sizes_, axis_map_, packed_dim_); + uniform_data_->logical_limits.limits = calculate_logical_limits( + sizes_, estimate_memory_layout(), axis_map_, packed_dim_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); @@ -942,11 +1034,15 @@ void vTensor::update_metadata() { } void vTensor::check_sizes(const std::vector& sizes) const { + utils::GPUMemoryLayout est_memory_layout = estimate_memory_layout(); if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. utils::uvec3 virtual_extents = calculate_image_extents( - calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_); + calculate_padded_sizes(sizes_, packed_dim_), + est_memory_layout, + axis_map_, + packed_dim_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -958,9 +1054,10 @@ void vTensor::check_sizes(const std::vector& sizes) const { valid_resize, "tensor sizes requires a larger texture than the current one."); } else { - // For buffer storage check that the current buffer is large enough for the - // new sizes of the tensor. 
- int64_t numel = utils::multiply_integers(sizes); + // For buffer storage check that the current buffer is large enough for + // the new sizes of the tensor. + int64_t numel = + calculate_gpu_buffer_numel(sizes_, est_memory_layout, dtype_); bool valid_resize = numel + storage_->buffer_offset_ <= storage_->buffer_length_; VK_CHECK_COND( diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 66c1fd1e4da..d9fc7784cbc 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -99,6 +99,7 @@ class vTensorStorage final { vTensorStorage( Context* context, const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim, const std::vector& sizes, diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl new file mode 100644 index 00000000000..8b69642d2e9 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl @@ -0,0 +1,78 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define NAME ${VARIANT_NAME} + +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +$if IO_STORAGE == "buffer": + #define PACKED_INT8_OUTPUT_BUFFER + #define PACKED_INT8_INPUT_BUFFER + +#define op(X, Y) ${OPERATOR} + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#extension GL_EXT_debug_printf : enable +#define DEBUG_MODE +#include "indexing.glslh" +#include "common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_out", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_in_a", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_in_b", "int", IO_STORAGE, is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "out_sizes")} + +layout(push_constant) uniform restrict Block { + float input_a_scale; + int input_a_zp; + float input_b_scale; + int input_b_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const int tid = int(gl_GlobalInvocationID.x); + + const int W4 = div_up_4(out_sizes.x); + const int H = out_sizes.y; + const int C4 = div_up_4(out_sizes.z); + const int N = out_sizes.w; + + if (tid >= W4 * H * C4 * N) { + return; + } + + const ivec4 in_block_1 = t_packed_int8_in_a[tid]; + const ivec4 in_block_2 = t_packed_int8_in_b[tid]; + + ivec4 out_block = ivec4(pack_into_int32(ivec4(output_zp))); + + for (int row = 0; row < 4; row++) { + vec4 in_texel_1 = unpack_and_dequantize( + in_block_1[row], input_a_scale, input_a_zp); + vec4 in_texel_2 = unpack_and_dequantize( + in_block_2[row], input_b_scale, input_b_zp); + + vec4 out_texel = op(in_texel_1, in_texel_2); + out_block[row] = quantize_and_pack(out_texel, output_inv_scale, output_zp); + } + + t_packed_int8_out[tid] = out_block; +} diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml new file mode 100644 index 00000000000..e19ed8839eb --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +binary_q8ta_q8ta_q8to: + parameter_names_with_default_values: + OPERATOR: X + Y + NDIM: 3 + DTYPE: float + PACKING: C_packed + IO_STORAGE: buffer + generate_variant_forall: + IO_STORAGE: + - VALUE: buffer + shader_variants: + - NAME: add_q8ta_q8ta_q8to + OPERATOR: X + Y diff --git a/backends/vulkan/runtime/graph/ops/glsl/common.glslh b/backends/vulkan/runtime/graph/ops/glsl/common.glslh index 732b7006c2c..eb0ee02c2b4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/common.glslh @@ -33,6 +33,59 @@ struct TensorIndex4D { ivec4 data; }; +int sign_extend_8bit(const int val) { + if ((val & 0x80) != 0) { + return val | (~0xFF); + } + return val; +} + +int extract_8bit_from_packed_int_le(const int packed, const int i) { + // account for little endian + int byte = sign_extend_8bit(packed >> (8 * i) & 0xFF); + return byte; +} + +ivec4 unpack_int8x4(const int packed) { + return ivec4( + extract_8bit_from_packed_int_le(packed, 0), + extract_8bit_from_packed_int_le(packed, 1), + extract_8bit_from_packed_int_le(packed, 2), + extract_8bit_from_packed_int_le(packed, 3)); +} + +int pack_4xqint_into_int32( + const int val0, + const int val1, + const int val2, + const int val3) { + int packed = (val0 & 0xFF) | ((val1 & 0xFF) << 8) | ((val2 & 0xFF) << 16) | + ((val3 & 0xFF) << 24); + + return packed; +} + +int pack_into_int32(const ivec4 quant_vals) { + int packed = ((quant_vals[0] & 0xFF) << 0) | ((quant_vals[1] & 0xFF) << 8) 
| + ((quant_vals[2] & 0xFF) << 16) | ((quant_vals[3] & 0xFF) << 24); + + return packed; +} + +vec4 unpack_and_dequantize( + const int packed_int8_vals, + const float scale, + const int zp) { + ivec4 unpacked = unpack_int8x4(packed_int8_vals); + return vec4(unpacked - zp) * scale; +} + +int quantize_and_pack(const vec4 vals, const float inv_scale, const int zp) { + ivec4 quantized = ivec4(round(vals * inv_scale) + zp); + quantized = clamp(quantized, -128, 127); + return pack_into_int32(quantized); +} + #ifdef DEBUG_MODE #extension GL_EXT_debug_printf : require diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl index 0f5dbc41273..88746c5594e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl @@ -60,7 +60,7 @@ void main() { int num_steps = ((-ipos.y) + dilation.y - 1) / dilation.y; start.y = ipos.y + num_steps * dilation.y; } - const ivec2 end = min(ipos + overlay_region.xy, ivec2(in_sizes.xy)); + const ivec2 end = min(ipos + overlay_region.xy, in_sizes.xy); // Compute the start of the kernel based on how far we are skipping ahead when // reading the input. Note that these are "canonical" indices. 
ivec2 kstart = (start - ipos) / dilation; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh index 41825cba867..6f460d1398c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh @@ -27,6 +27,60 @@ struct Conv2DParams { int K4; }; +struct Conv2dTensorIndex { + ivec3 data; + int texel_i; +}; + +struct Conv2dBlockIndex { + ivec3 data; +}; + +Conv2dTensorIndex block_idx_to_tensor_idx(const Conv2dBlockIndex block_idx) { + Conv2dTensorIndex tensor_idx; + tensor_idx.data.x = mul_4(block_idx.data.x); + tensor_idx.data.y = block_idx.data.y; + tensor_idx.data.z = block_idx.data.z; + tensor_idx.texel_i = 0; + return tensor_idx; +} + +struct Conv2dBlockExtents { + ivec3 data; + int data_xz; +}; + +Conv2dBlockExtents make_block_extents(const ivec4 tensor_sizes) { + Conv2dBlockExtents block_sizes; + block_sizes.data.x = div_up_4(tensor_sizes.x); + block_sizes.data.y = tensor_sizes.y; + block_sizes.data.z = div_up_4(tensor_sizes.z); + + block_sizes.data_xz = block_sizes.data.x * block_sizes.data.z; + + return block_sizes; +} + +Conv2dBlockIndex linear_idx_to_block_idx( + const int idx, const Conv2dBlockExtents block_extents) { + Conv2dBlockIndex block_idx; + block_idx.data.z = idx % block_extents.data.z; + + const int row = idx / block_extents.data.z; + block_idx.data.x = row % block_extents.data.x; + block_idx.data.y = row / block_extents.data.x; + + return block_idx; +} + +bool block_idx_out_of_bounds( + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents) { + return block_idx.data.x >= block_extents.data.x || + block_idx.data.y >= block_extents.data.y || + block_idx.data.z >= block_extents.data.z; +} + #ifdef DEBUG_MODE void printConv2DParams(const Conv2DParams params) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl 
index 02fbef29b75..9089f87d658 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl @@ -54,7 +54,7 @@ void main() { // Compute the start and end of the input indices to load. Padding is assumed // to be constant 0 padding, so reads from the padding region are skipped. const ivec2 start = ipos; - const ivec2 end = ipos + overlay_region.xy; + const ivec2 end = min(ipos + overlay_region.xy, in_sizes.xy); VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); int kx = 0; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl index 19250419baf..7448b042cad 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -97,6 +97,10 @@ void main() { for (int y = start.y, i = 0; i < TILE_SIZE + BATCH_SIZE_Y - 1; y += dilation.y, i++) { for (int x = start.x, j = 0; j < TILE_SIZE + BATCH_SIZE_X - 1; x += dilation.x, j++) { in_texels[j] = texelFetch(t_in, ivec3(x, y, pos.z), 0); + // Set to zero if reading out of bounds + if (any(greaterThanEqual(ivec2(x, y), in_sizes.xy))) { + in_texels[j] = VEC4_T(0); + } } // from 2nd iteration onwards accumulate dot product in 2nd sum diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8_utils.glslh new file mode 100644 index 00000000000..f1d90aa83cb --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8_utils.glslh @@ -0,0 +1,214 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef CONV2D_DW_Q8_UTILS_GLSLH +#define CONV2D_DW_Q8_UTILS_GLSLH + +#extension GL_EXT_control_flow_attributes : require + +struct InputWindow1D { + vec4[MAX_WINDOW_WIDTH] data; + int len; +}; + +InputWindow1D initial_input_window() { + InputWindow1D input_window; + for (int i = 0; i < MAX_WINDOW_WIDTH; ++i) { + input_window.data[i] = vec4(0); + } + input_window.len = 0; + return input_window; +} + +vec4 dequantize(const int packed_texel, const float scale, const int zp) { + return vec4(unpack_int8x4(packed_texel) - zp) * scale; +} + +vec4 dequantize(const int packed_texel, const vec4 scales) { + return vec4(unpack_int8x4(packed_texel)) * scales; +} + +bool in_bounds( + const int block_w, + const int block_h, + const int block_c4, + const Conv2dBlockExtents block_extents) { + ivec3 idx = ivec3(block_w, block_h, block_c4); + if (any(lessThan(idx, ivec3(0)))) { + return false; + } + if (any(greaterThanEqual(idx, block_extents.data))) { + return false; + } + + return true; +} + +InputWindow1D load_input_window( + const int w_start, + const int w_end, + const int h, + const int c4, + const Conv2dBlockExtents block_extents, + const float input_scale, + const int input_zp, + const ivec4 input_zps) { + InputWindow1D input_window = initial_input_window(); + + const int block_w_start = div_4(w_start); + const int block_w_end = div_4(w_end); + + int window_i = 0; + for (int block_w = block_w_start; block_w <= block_w_end; ++block_w) { + ivec4 input_block = input_zps; + + if (in_bounds(block_w, h, c4, block_extents)) { +#ifdef PACKED_INT8_INPUT_BUFFER + const int buffer_idx = + h * block_extents.data_xz + block_w * block_extents.data.z + c4; + input_block = t_packed_int8_input[buffer_idx]; +#else + input_block = texelFetch(t_packed_int8_input, ivec3(block_w, h, c4), 0); +#endif + } + + const int loaded_w_start = mul_4(block_w); + for (int row = 0; row < 4; ++row) { + if (loaded_w_start + row >= w_start && loaded_w_start + row <= w_end) { + 
input_window.data[window_i++] = + dequantize(input_block[row], input_scale, input_zp); + } + } + } + input_window.len = window_i; + return input_window; +} + +struct WeightRow { + vec4[MAX_KERNEL_WIDTH] data; + int len; +}; + +WeightRow initial_weight_row() { + WeightRow weight_row; + for (int i = 0; i < MAX_KERNEL_WIDTH; ++i) { + weight_row.data[i] = vec4(0); + } + weight_row.len = 0; + return weight_row; +} + +WeightRow load_weight_row( + const int oc4, + const int ky, + const int OC4, + const int Kw, + const int Kw4, + const vec4 weight_scales) { + WeightRow weight_row = initial_weight_row(); + + int k4 = ky * Kw4; + int row_idx = 0; + for (int w = 0; w < Kw; w += 4) { +#ifdef WEIGHT_BUFFER + const ivec4 weight_block = t_packed_int8_weight[k4 * OC4 + oc4]; +#else + const ivec4 weight_block = texelFetch( + t_packed_int8_weight, ivec2(oc4, k4), 0); +#endif + + for (int row = 0; row < 4; ++row) { + if (w + row < Kw) { + weight_row.data[row_idx++] = dequantize(weight_block[row], weight_scales); + } + } + k4++; + } + weight_row.len = row_idx; + return weight_row; +} + +struct FPOutBlock { + vec4[4] data; +}; + +void perform_conv1d( + inout FPOutBlock out_block, + const InputWindow1D input_window, + const WeightRow weight_row) { + for (int out_w = 0; out_w < 4; ++out_w) { + [[unroll]] for (int kx = 0; kx < weight_row.len; ++kx) { + const int in_w = out_w * conv2d_params.stride.x; + out_block.data[out_w] = fma( + input_window.data[in_w + kx], + weight_row.data[kx], + out_block.data[out_w]); + } + } +} + +ivec4 quantize( + const vec4 texel, const float inv_scale, const int zp) { + vec4 quantized = round(texel * inv_scale) + zp; + return clamp(ivec4(quantized), -128, 127); +} + +ivec4 quantize_and_pack( + FPOutBlock out_block, const float inv_scale, const int zp) { + ivec4 packed_block; + for (int row = 0; row < 4; ++row) { + ivec4 quantized_texel = quantize(out_block.data[row], inv_scale, zp); + packed_block[row] = pack_into_int32(quantized_texel); + } + return 
packed_block; +} + +#ifdef DEBUG_MODE + +void printInputWindow1D(const InputWindow1D input_window) { + debugPrintfEXT("InputWindow1D contents (len = %d): \\n", input_window.len); + for (int i = 0; i < min(input_window.len, MAX_WINDOW_WIDTH); ++i) { + debugPrintfEXT( + " [%d]: (%.3f, %.3f, %.3f, %.3f) \\n", + i, + input_window.data[i].x, + input_window.data[i].y, + input_window.data[i].z, + input_window.data[i].w); + } +} + +void printWeightRow(const WeightRow weight_row) { + debugPrintfEXT("WeightRow contents (len = %d): \\n", weight_row.len); + for (int i = 0; i < min(weight_row.len, MAX_KERNEL_WIDTH); ++i) { + debugPrintfEXT( + " [%d]: (%.3f, %.3f, %.3f, %.3f) \\n", + i, + weight_row.data[i].x, + weight_row.data[i].y, + weight_row.data[i].z, + weight_row.data[i].w); + } +} + +void printFPOutBlock(const FPOutBlock out_block) { + debugPrintfEXT("FPOutBlock contents: \\n"); + for (int i = 0; i < 4; ++i) { + debugPrintfEXT( + " [%d]: (%.3f, %.3f, %.3f, %.3f) \\n", + i, + out_block.data[i].x, + out_block.data[i].y, + out_block.data[i].z, + out_block.data[i].w); + } + } + +#endif // DEBUG_MODE + +#endif // CONV2D_DW_Q8_UTILS_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.glsl new file mode 100644 index 00000000000..8994ced3acb --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.glsl @@ -0,0 +1,121 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +$if IO_STORAGE == "buffer": + #define PACKED_INT8_OUTPUT_BUFFER + #define PACKED_INT8_INPUT_BUFFER +$if WEIGHT_STORAGE == "buffer": + #define WEIGHT_BUFFER + +#define MAX_WINDOW_WIDTH 12 +#define MAX_KERNEL_WIDTH 5 + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "output_sizes")} +${layout_declare_ubo(B, "ivec4", "input_sizes")} +${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float input_scale; + int input_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "1")} + +#include "conv2d_dw_q8_utils.glslh" + +void main() { + const int tid = int(gl_GlobalInvocationID.x); + Conv2dBlockExtents out_block_extents = make_block_extents(output_sizes); + + Conv2dBlockIndex out_block_idx = linear_idx_to_block_idx( + tid, out_block_extents); + + if (block_idx_out_of_bounds(out_block_idx, out_block_extents)) { + return; + } + + const int out_w = mul_4(out_block_idx.data.x); + const int w_start = + (out_w * conv2d_params.stride.x) - conv2d_params.padding.x; + const int w_end = 
((out_w + 3) * conv2d_params.stride.x) - + conv2d_params.padding.x + + (conv2d_params.kernel_size.x - 1) * conv2d_params.dilation.x; + + Conv2dBlockExtents in_block_extents = make_block_extents(input_sizes); + + const ivec4 input_zps = ivec4(pack_into_int32(ivec4(input_zp))); + const vec4 weight_scales = vec4(t_weight_scales[out_block_idx.data.z]); + + const int Kw4 = div_up_4(conv2d_params.kernel_size.x); + + FPOutBlock out_block; + for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) { + const int out_h = out_block_idx.data.y; + const int h = out_h * conv2d_params.stride.y - conv2d_params.padding.y + + ky * conv2d_params.dilation.y; + + InputWindow1D input_window = load_input_window( + w_start, + w_end, + h, + out_block_idx.data.z, + in_block_extents, + input_scale, + input_zp, + input_zps); + + WeightRow weight_row = load_weight_row( + out_block_idx.data.z, + ky, + out_block_extents.data.z, + conv2d_params.kernel_size.x, + Kw4, + weight_scales); + + perform_conv1d(out_block, input_window, weight_row); + } + + if (apply_bias > 0) { + const vec4 bias = vec4(t_bias[out_block_idx.data.z]); + for (int row = 0; row < 4; row++) { + out_block.data[row] += bias; + } + } + + const ivec4 packed_out_block = quantize_and_pack( + out_block, output_inv_scale, output_zp); + +#ifdef PACKED_INT8_OUTPUT_BUFFER + t_packed_int8_output[tid] = packed_out_block; +#else + imageStore(t_packed_int8_output, out_block_idx.data, packed_out_block); +#endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.yaml new file mode 100644 index 00000000000..77f801668a4 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.yaml @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +conv2d_dw_q8ta_q8csw_q8to: + parameter_names_with_default_values: + DTYPE: float + IO_STORAGE: texture3d + WEIGHT_STORAGE: texture2d + generate_variant_forall: + combination: + parameter_names: [IO_STORAGE, WEIGHT_STORAGE] + combos: + - parameter_values: [buffer, texture2d] + DTYPE: + - VALUE: float + shader_variants: + - NAME: conv2d_dw_q8ta_q8csw_q8to diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh new file mode 100644 index 00000000000..be8a76421a5 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh @@ -0,0 +1,34 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef CONV2D_FP_INPUT_TILE_LOAD +#define CONV2D_FP_INPUT_TILE_LOAD + +#extension GL_EXT_control_flow_attributes : require + +#include "linear_fp_input_tile.glslh" + +VEC4_T load_fp_input_texel(const Conv2dTensorIndex tidx) { + return texelFetch(t_fp_input, tidx.data, 0); +} + +void load_fp_input_tile( + out FPInputTile tile, + const Conv2dBlockIndex block_idx) { +#if TILE_M == 4 && TILE_K4 == 1 + Conv2dTensorIndex load_tidx = block_idx_to_tensor_idx(block_idx); + [[unroll]] for (int w = 0; w < TILE_M; w++) { + tile.data[w][0] = load_fp_input_texel(load_tidx); + load_tidx.data.x++; + } +#else + not_implemented; +#endif +} + +#endif // CONV2D_FP_INPUT_TILE_LOAD diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_block_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_block_load.glslh new file mode 100644 index 00000000000..44c226f6891 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_block_load.glslh @@ -0,0 +1,30 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef CONV2D_INT8_INPUT_BLOCK_LOAD +#define CONV2D_INT8_INPUT_BLOCK_LOAD + +#extension GL_EXT_control_flow_attributes : require + +#include "conv2d_common.glslh" +#include "conv2d_int8_activation_block.glslh" + +void store_packed_int8_input_block( + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents, + const Int8ActivationBlock packed_int8_block) { +#ifdef OUTPUT_BUFFER + const int buffer_idx = block_idx.data.y * block_extents.data_xz + + block_idx.data.x * block_extents.data.z + block_idx.data.z; + t_packed_int8_input[buffer_idx] = packed_int8_block.data; +#else + imageStore(t_packed_int8_input, block_idx.data, packed_int8_block.data); +#endif +} + +#endif // CONV2D_INT8_INPUT_BLOCK_LOAD diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_tile_load.glslh new file mode 100644 index 00000000000..44aa09912ec --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_tile_load.glslh @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef CONV2D_INT8_INPUT_TILE_LOAD +#define CONV2D_INT8_INPUT_TILE_LOAD + +#extension GL_EXT_control_flow_attributes : require + +#include "linear_int8_input_tile.glslh" + +struct Int8InputTileIndex { +#ifdef PACKED_INT8_INPUT_BUFFER + int data; +#else + ivec3 data; +#endif +}; + +Int8InputTileIndex make_initial_int8_input_tile_index( + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents) { + Int8InputTileIndex idx; +#ifdef PACKED_INT8_INPUT_BUFFER + idx.data = block_idx.data.y * block_extents.data_xz + + block_idx.data.x * block_extents.data.z; +#else + idx.data = ivec3(block_idx.data.x, block_idx.data.y, 0); +#endif + return idx; +} + +Int8InputTileIndex make_initial_int8_input_tile_index( + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents, + const int group_k4_offset) { + Int8InputTileIndex idx; +#ifdef PACKED_INT8_INPUT_BUFFER + idx.data = block_idx.data.y * block_extents.data_xz + + block_idx.data.x * block_extents.data.z + group_k4_offset; +#else + idx.data = ivec3(block_idx.data.x, block_idx.data.y, group_k4_offset); +#endif + return idx; +} + +void load_packed_int8_input_tile( + out Int8InputTile int8_tile, + const Int8InputTileIndex idx) { +#ifdef PACKED_INT8_INPUT_BUFFER + int8_tile.data[0][0] = t_packed_int8_input[idx.data]; +#else + int8_tile.data[0][0] = texelFetch(t_packed_int8_input, idx.data, 0); +#endif + + // Guard against unsupported tile sizes +#if TILE_M4 != 1 || TILE_K4 != 1 + not_implemented; +#endif +} + +void increment_k4(inout Int8InputTileIndex idx) { +#ifdef PACKED_INT8_INPUT_BUFFER + idx.data += 1; +#else + idx.data.z += 1; +#endif +} + +#endif // CONV2D_INT8_INPUT_TILE_LOAD diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_output_tile_store.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_output_tile_store.glslh new file mode 100644 index 00000000000..0a490360f98 --- /dev/null +++ 
b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_output_tile_store.glslh @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef CONV2D_INT8_OUTPUT_TILE_STORE +#define CONV2D_INT8_OUTPUT_TILE_STORE + +#extension GL_EXT_control_flow_attributes : require + +#include "conv2d_common.glslh" +#include "linear_int8_output_tile.glslh" + +void store_packed_int8_output_tile( + const Int8OutTile int8_tile, + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents) { +#ifdef PACKED_INT8_OUTPUT_BUFFER + [[unroll]] for (int m4 = 0; m4 < TILE_M4; m4++) { + int buffer_idx = block_idx.data.y * block_extents.data_xz + + (block_idx.data.x + m4) * block_extents.data.z + block_idx.data.z; + [[unroll]] for (int n4 = 0; n4 < TILE_N4; n4++) { + if (block_idx.data.x + m4 < block_extents.data.x && + block_idx.data.z + n4 < block_extents.data.z) { + t_packed_int8_output[buffer_idx++] = int8_tile.data[m4][n4]; + } + } + } +#else + [[unroll]] for (int m4 = 0; m4 < TILE_M4; m4++) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; n4++) { + if (block_idx.data.x + m4 < block_extents.data.x && + block_idx.data.z + n4 < block_extents.data.z) { + imageStore( + t_packed_int8_output, block_idx.data, int8_tile.data[m4][n4]); + } + } + } +#endif +} + +#endif // CONV2D_INT8_OUTPUT_TILE_STORE diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_q8ta_q8csw_q8to_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_q8ta_q8csw_q8to_tiled.glsl new file mode 100644 index 00000000000..16c12b3ee5a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_q8ta_q8csw_q8to_tiled.glsl @@ -0,0 +1,144 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +$if IO_STORAGE == "buffer": + #define PACKED_INT8_OUTPUT_BUFFER + #define PACKED_INT8_INPUT_BUFFER +$if WEIGHT_STORAGE == "buffer": + #define WEIGHT_BUFFER + +// corresponds to input/output width dim +#define TILE_M4 1 +// corresponds to input channels dim +#define TILE_K4 1 +// corresponds to output channels dim +#define TILE_N4 2 + +#define TILE_M 4 +#define TILE_K 4 +#define TILE_N 8 + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "output_sizes")} +${layout_declare_ubo(B, "ivec4", "input_sizes")} +${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float input_scale; + int input_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "1")} + +#include "conv2d_int8_input_tile_load.glslh" +#include "linear_int8_weight_tile_load.glslh" +#include "linear_fp_output_tile_int8_int8_compute.glslh" +#include 
"linear_int_weight_sums_load.glslh" +#include "linear_fp_weight_scales_load.glslh" +#include "linear_fp_bias_load.glslh" +#include "linear_int8_output_tile_compute.glslh" +#include "conv2d_int8_output_tile_store.glslh" + +void main() { + Conv2dBlockIndex output_block_idx; + output_block_idx.data.z = int(gl_GlobalInvocationID.x) * TILE_N4; + output_block_idx.data.x = int(gl_GlobalInvocationID.y) * TILE_M4; + output_block_idx.data.y = int(gl_GlobalInvocationID.z); + + Conv2dBlockExtents output_block_extents = make_block_extents(output_sizes); + if (block_idx_out_of_bounds(output_block_idx, output_block_extents)) { + return; + } + + Conv2dBlockExtents input_block_extents = make_block_extents(input_sizes); + + Int32Accum out_accum; + initialize(out_accum); + + Int8InputTile int8_input_tile; + Int8WeightTile int8_weight_tile; + + Int8InputTileIndex input_idx = make_initial_int8_input_tile_index( + output_block_idx, input_block_extents); + + for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) { + load_packed_int8_input_tile(int8_input_tile, input_idx); + + load_int8_weight_tile( + int8_weight_tile, + output_block_idx.data.z, + k4, + output_block_extents.data.z); + + int_accumulate_with_int8_weight( + out_accum, int8_input_tile, int8_weight_tile); + + increment_k4(input_idx); + } + + FPPerOutChannelParams weight_scales_tile; + load_weight_scales_tile(weight_scales_tile, output_block_idx.data.z); + + IntPerOutChannelParams weight_sums_tile; + load_weight_sums_tile(weight_sums_tile, output_block_idx.data.z); + + Int8OutTile int8_out_tile; + initialize(int8_out_tile); + + if (apply_bias > 0) { + FPPerOutChannelParams bias_tile; + load_bias_tile(bias_tile, output_block_idx.data.z); + + compute_int8_out_tile_with_int32_accum( + int8_out_tile, + out_accum, + input_scale, + input_zp, + output_inv_scale, + output_zp, + weight_sums_tile, + weight_scales_tile, + bias_tile); + } + else { + compute_int8_out_tile_with_int32_accum( + int8_out_tile, + out_accum, + input_scale, + 
input_zp, + output_inv_scale, + output_zp, + weight_sums_tile, + weight_scales_tile); + } + + store_packed_int8_output_tile( + int8_out_tile, output_block_idx, output_block_extents); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_q8ta_q8csw_q8to_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_q8ta_q8csw_q8to_tiled.yaml new file mode 100644 index 00000000000..23803dc6da1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_q8ta_q8csw_q8to_tiled.yaml @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_pw_q8ta_q8csw_q8to_tiled: + parameter_names_with_default_values: + DTYPE: float + IO_STORAGE: texture3d + WEIGHT_STORAGE: texture2d + generate_variant_forall: + combination: + parameter_names: [IO_STORAGE, WEIGHT_STORAGE] + combos: + - parameter_values: [buffer, texture2d] + DTYPE: + - VALUE: float + shader_variants: + - NAME: conv2d_pw_q8ta_q8csw_q8to_tiled diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl index 9f84afeb1a1..ef50a1aca9f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl @@ -12,10 +12,12 @@ #define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} +$if DTYPE == "half": + #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require + #define VEC4_T f16vec4 +$else: + #define VEC4_T ${texel_type(DTYPE)} -#define TILE_SIZE_X uint16_t(${TILE_SIZE_X}) -#define TILE_SIZE_Y uint16_t(${TILE_SIZE_Y}) #define op(X, A, B) ${OPERATOR} @@ -50,119 +52,90 @@ ${layout_declare_spec_const(C, "int", "ngroups", "1")} * size is only 1x1, making it easier to re-use loaded texels from t_kernel. 
*/ void main() { - const int out_limits_scaled[2] = - {(out_limits.x + (TILE_SIZE_X - 1)) / TILE_SIZE_X, - (out_limits.y + (TILE_SIZE_Y - 1)) / TILE_SIZE_Y}; - const uint16_t div_by_x = uint16_t(gl_GlobalInvocationID.x / out_limits_scaled[0]); - const uint16_t out_pos_xy[2] = {uint16_t(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x}; - const int out_pos_z = int(gl_GlobalInvocationID.y); + int inputAndOutputWidth = out_limits.x; + int inputAndOutputHeight = out_limits.y; + int outputChannel = out_limits.z*4; - // If the top left position is out of bounds, then this invocation will have - // no work to do. - if (out_pos_xy[1] >= out_limits_scaled[1] || out_pos_z >= out_limits.z) { + // Divided by 4 because the input channels are packed + int inputChannel = in_group_size/4; + + int threadHW = int(gl_GlobalInvocationID.x); + int threadOutChannel = int(gl_GlobalInvocationID.y); + + int xIdx = threadHW % inputAndOutputWidth; + int yIdx = threadHW / inputAndOutputWidth; + + if (threadHW >= inputAndOutputWidth * inputAndOutputHeight && threadOutChannel >= outputChannel) { return; } - // Output position for TILE_SIZE = 2 - // +--------+--------+ - // | pos[0] | pos[1] | - // +--------+--------+ - // | pos[2] | pos[3] | - // +--------+--------+ - uint16_t pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; - for (uint16_t y = uint16_t(0), i = uint16_t(0); y < TILE_SIZE_Y; ++y) { - for (uint16_t x = uint16_t(0); x < TILE_SIZE_X; ++x) { - pos[i * 2] = out_pos_xy[0] * TILE_SIZE_X + x; - pos[i * 2 + 1] = out_pos_xy[1] * TILE_SIZE_Y + y; - i++; - } - } + VEC4_T outputTexel = VEC4_T(texelFetch(t_bias, ivec2(threadOutChannel, 0), 0)); - // Final output array where each element is a tensor value. - // Tuple of consecutive 4 elements represents a single output texel. 
- float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; + VEC4_T inputVec; + VEC4_T weight1OutputChannelPacked; + VEC4_T weight2OutputChannelPacked; + VEC4_T weight3OutputChannelPacked; + VEC4_T weight4OutputChannelPacked; - // Initialize the output array with the bias value - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i++) { - sum[i] = 0; - } + // By unrolling the loop in sets of 4, this significantly reduces the number of branching instructions + // and enables the compiler to rearrange instructions for more efficient memory retrieval and compute + for (int inputC = 0; inputC < inputChannel; inputC += 1) { - int z4 = 0; - // Since the kernel is 1x1, we only have to loop over the depth dimension. - for (int z = 0; z < in_group_size; z += 4, ++z4) { - // During prepacking, the weight tensor has been permuted so that the - // channel (IC) dim is along the x-axis, and the batch (OC) dim is along - // the z-axis. - float kernel_values[4 * 4]; // 4 channels, 4 elements per channel - - // Load kernel values from texels to array - [[unroll]] for (int i = 0; i < 4; ++i) { - const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, out_pos_z), 0); - kernel_values[i * 4 + 0] = k_tex.x; - kernel_values[i * 4 + 1] = k_tex.y; - kernel_values[i * 4 + 2] = k_tex.z; - kernel_values[i * 4 + 3] = k_tex.w; - } - - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const vec4 in_tex = texelFetch(t_in, ivec3(pos[i * 2], pos[i * 2 + 1], z4), 0); - // Load the input texel into an array - float tex_values[4]; - tex_values[0] = in_tex.x; - tex_values[1] = in_tex.y; - tex_values[2] = in_tex.z; - tex_values[3] = in_tex.w; - - // For 2x2 tile size algorithm works as follows. 
- // To explain the calculations below, the contents of one in_tex and the - // group of 4 texels loaded from t_kernel are shown: - // - // in_tex t_kernel - // -x-> ---x---> - // +---+ +----+----+----+----+ - // ^ | w | ^ | D0 | D1 | D2 | D3 | - // | +---+ | +----+----+----+----+ - // | | z | | | C0 | C1 | C2 | C3 | - // z +---+ z +----+----+----+----+ - // | | y | | | B0 | B2 | B2 | B3 | - // | +---+ | +----+----+----+----+ - // | x | | A0 | A1 | A2 | A3 | - // +---+ +----+----+----+----+ - // - // In the t_kernel graphic, cells sharing the same letter are from - // the same batch/output channel index, and the number denotes a unique - // channel index. To calculate the output texel, the following - // calculation is performed: - // - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | D0 | | y | | D1 | | z | | D2 | | w | | D3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | C0 | | y | | C1 | | z | | C2 | | w | | C3 | - // +---+X+----+ + +---+X+----+ + +---+X+----+ + +---+X+----+ - // | x | | B0 | | y | | B1 | | z | | B2 | | w | | B3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | A0 | | y | | A1 | | z | | A2 | | w | | A3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // - // which is what is expressed in the following calculations. This is done - // for each output position. 
- for (int j = 0; j < 4; ++j) { - sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j]; - } - } - } + inputVec = VEC4_T(texelFetch(t_in, ivec3(xIdx, yIdx, inputC), 0)); + + weight1OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 0, threadOutChannel), 0)); + weight2OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 1, threadOutChannel), 0)); + weight3OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 2, threadOutChannel), 0)); + weight4OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 3, threadOutChannel), 0)); + + outputTexel[0] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[0], weight2OutputChannelPacked[0], weight3OutputChannelPacked[0], weight4OutputChannelPacked[0])); + outputTexel[1] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[1], weight2OutputChannelPacked[1], weight3OutputChannelPacked[1], weight4OutputChannelPacked[1])); + outputTexel[2] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[2], weight2OutputChannelPacked[2], weight3OutputChannelPacked[2], weight4OutputChannelPacked[2])); + outputTexel[3] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[3], weight2OutputChannelPacked[3], weight3OutputChannelPacked[3], weight4OutputChannelPacked[3])); + + inputC += 1; + + inputVec = VEC4_T(texelFetch(t_in, ivec3(xIdx, yIdx, inputC), 0)); - const vec4 bias = texelFetch(t_bias, ivec2(out_pos_z, 0), 0); + weight1OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 0, threadOutChannel), 0)); + weight2OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 1, threadOutChannel), 0)); + weight3OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 2, threadOutChannel), 0)); + weight4OutputChannelPacked = 
VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 3, threadOutChannel), 0)); - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], out_pos_z); - if (all(lessThan(pos_l.xy, out_limits.xy))) { - const vec4 out_sum = vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]); - imageStore(t_out, pos_l, op(out_sum + bias, out_min, out_max)); - } + outputTexel[0] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[0], weight2OutputChannelPacked[0], weight3OutputChannelPacked[0], weight4OutputChannelPacked[0])); + outputTexel[1] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[1], weight2OutputChannelPacked[1], weight3OutputChannelPacked[1], weight4OutputChannelPacked[1])); + outputTexel[2] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[2], weight2OutputChannelPacked[2], weight3OutputChannelPacked[2], weight4OutputChannelPacked[2])); + outputTexel[3] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[3], weight2OutputChannelPacked[3], weight3OutputChannelPacked[3], weight4OutputChannelPacked[3])); + + inputC += 1; + + inputVec = VEC4_T(texelFetch(t_in, ivec3(xIdx, yIdx, inputC), 0)); + + weight1OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 0, threadOutChannel), 0)); + weight2OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 1, threadOutChannel), 0)); + weight3OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 2, threadOutChannel), 0)); + weight4OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 3, threadOutChannel), 0)); + + outputTexel[0] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[0], weight2OutputChannelPacked[0], weight3OutputChannelPacked[0], weight4OutputChannelPacked[0])); + outputTexel[1] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[1], weight2OutputChannelPacked[1], weight3OutputChannelPacked[1], weight4OutputChannelPacked[1])); + outputTexel[2] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[2], 
weight2OutputChannelPacked[2], weight3OutputChannelPacked[2], weight4OutputChannelPacked[2])); + outputTexel[3] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[3], weight2OutputChannelPacked[3], weight3OutputChannelPacked[3], weight4OutputChannelPacked[3])); + + inputC += 1; + + inputVec = VEC4_T(texelFetch(t_in, ivec3(xIdx, yIdx, inputC), 0)); + + weight1OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 0, threadOutChannel), 0)); + weight2OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 1, threadOutChannel), 0)); + weight3OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 2, threadOutChannel), 0)); + weight4OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 3, threadOutChannel), 0)); + + outputTexel[0] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[0], weight2OutputChannelPacked[0], weight3OutputChannelPacked[0], weight4OutputChannelPacked[0])); + outputTexel[1] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[1], weight2OutputChannelPacked[1], weight3OutputChannelPacked[1], weight4OutputChannelPacked[1])); + outputTexel[2] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[2], weight2OutputChannelPacked[2], weight3OutputChannelPacked[2], weight4OutputChannelPacked[2])); + outputTexel[3] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[3], weight2OutputChannelPacked[3], weight3OutputChannelPacked[3], weight4OutputChannelPacked[3])); } + + imageStore(t_out, ivec3(xIdx, yIdx, threadOutChannel), op(vec4(outputTexel), out_min, out_max)); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml index ebfee11c405..bab3c715540 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml @@ -9,8 +9,6 @@ conv2d_pw_s1p0: OPERATOR: X NDIM: 3 DTYPE: float - TILE_SIZE_X: 1 - TILE_SIZE_Y: 4 generate_variant_forall: DTYPE: - VALUE: half diff 
--git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8_utils.glslh new file mode 100644 index 00000000000..279f4f17f13 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8_utils.glslh @@ -0,0 +1,151 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef CONV2D_Q8_UTILS_GLSLH +#define CONV2D_Q8_UTILS_GLSLH + +#extension GL_EXT_control_flow_attributes : require +#extension GL_EXT_integer_dot_product : require + +#include "linear_int_accumulator.glslh" + +struct Int8InputWindow1D { + int[MAX_WINDOW_WIDTH] data; + int len; +}; + +Int8InputWindow1D initial_input_window() { + Int8InputWindow1D input_window; + for (int i = 0; i < MAX_WINDOW_WIDTH; ++i) { + input_window.data[i] = 0; + } + input_window.len = 0; + return input_window; +} + +bool in_bounds( + const int block_w, + const int block_h, + const int block_c4, + const Conv2dBlockExtents block_extents) { + ivec3 idx = ivec3(block_w, block_h, block_c4); + if (any(lessThan(idx, ivec3(0)))) { + return false; + } + if (any(greaterThanEqual(idx, block_extents.data))) { + return false; + } + + return true; +} + +Int8InputWindow1D load_input_window( + const int w_start, + const int w_end, + const int h, + const int c4, + const Conv2dBlockExtents block_extents, + const ivec4 input_zps) { + Int8InputWindow1D input_window = initial_input_window(); + + const int block_w_start = div_4(w_start); + const int block_w_end = div_4(w_end); + + int window_i = 0; + for (int block_w = block_w_start; block_w <= block_w_end; ++block_w) { + ivec4 input_block = input_zps; + + if (in_bounds(block_w, h, c4, block_extents)) { +#ifdef PACKED_INT8_INPUT_BUFFER + const int buffer_idx = + h * block_extents.data_xz + block_w * block_extents.data.z + c4; + input_block = 
t_packed_int8_input[buffer_idx]; +#else + input_block = texelFetch(t_packed_int8_input, ivec3(block_w, h, c4), 0); +#endif + } + + const int loaded_w_start = mul_4(block_w); + for (int row = 0; row < 4; ++row) { + if (loaded_w_start + row >= w_start && loaded_w_start + row <= w_end) { + input_window.data[window_i++] = input_block[row]; + } + } + } + input_window.len = window_i; + return input_window; +} + +ivec4 load_weight_block( + const int ic4, + const int kx, + const int ky, + const int oc4, + const int IC4, + const int Kw, + const int Kh, + const int OC4) { +#ifdef PACKED_INT8_WEIGHTS_BUFFER + const int block_x = oc4 * Kw + kx; + const int block_y = ky * IC4 + ic4; + return t_packed_int8_weight[block_y * (Kw * OC4) + block_x]; +#else + return texelFetch( + t_packed_int8_weight, ivec2(oc4 * Kw + kx, ky * IC4 + ic4), 0); +#endif +} + +void perform_conv1d( + inout Int32Accum accum, + const Int8InputWindow1D input_window, + const ivec4 weight_block, + const int kx) { + [[unroll]] for (int out_w = 0; out_w < 4; ++out_w) { + const int window_i = out_w * conv2d_params.stride.x + kx; + [[unroll]] for (int out_c = 0; out_c < 4; ++out_c) { + accum.data[out_w][0][out_c] = dotPacked4x8AccSatEXT( + input_window.data[window_i], + weight_block[out_c], + accum.data[out_w][0][out_c]); + } + } +} + +#ifdef DEBUG_MODE + +void printInt8InputWindow1D(const Int8InputWindow1D input_window) { + debugPrintfEXT("Int8InputWindow1D contents (len = %d): \\n", input_window.len); + for (int i = 0; i < min(input_window.len, MAX_WINDOW_WIDTH); ++i) { + ivec4 unpacked = unpack_int8x4(input_window.data[i]); + debugPrintfEXT( + " [%d]: (%d, %d, %d, %d) \\n", + i, + unpacked.x, + unpacked.y, + unpacked.z, + unpacked.w); + } +} + +void printWeightBlock(const ivec4 weight_block) { + debugPrintfEXT("WeightBlock contents: \\n"); + for (int i = 0; i < 4; ++i) { + ivec4 unpacked = unpack_int8x4(weight_block[i]); + debugPrintfEXT( + " [%d]: (%d, %d, %d, %d) \\n", + i, + unpacked.x, + unpacked.y, + 
unpacked.z, + unpacked.w); + } +} + +#endif // DEBUG_MODE + +#endif // CONV2D_Q8_UTILS_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.glsl new file mode 100644 index 00000000000..5839b13aeaa --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.glsl @@ -0,0 +1,173 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +$if IO_STORAGE == "buffer": + #define PACKED_INT8_OUTPUT_BUFFER + #define PACKED_INT8_INPUT_BUFFER +$if WEIGHT_STORAGE == "buffer": + #define WEIGHT_BUFFER + +#define MAX_WINDOW_WIDTH 16 + +// corresponds to input/output width dim +#define TILE_M4 1 +// corresponds to input channels dim +#define TILE_K4 1 +// corresponds to output channels dim +#define TILE_N4 1 + +#define TILE_M 4 +#define TILE_K 4 +#define TILE_N 4 + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "output_sizes")} +${layout_declare_ubo(B, "ivec4", "input_sizes")} 
+${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float input_scale; + int input_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "1")} + +#include "im2col_packed_int8_utils.glslh" +#include "conv2d_int8_input_tile_load.glslh" +#include "linear_int8_weight_tile_load.glslh" +#include "linear_fp_output_tile_int8_int8_compute.glslh" +#include "linear_int_weight_sums_load.glslh" +#include "linear_fp_weight_scales_load.glslh" +#include "linear_fp_bias_load.glslh" +#include "linear_int8_output_tile_compute.glslh" +#include "conv2d_int8_output_tile_store.glslh" + +#include "conv2d_q8_utils.glslh" + +void main() { + Conv2dBlockIndex out_block_idx; + out_block_idx.data.z = int(gl_GlobalInvocationID.x) * TILE_N4; + out_block_idx.data.x = int(gl_GlobalInvocationID.y) * TILE_M4; + out_block_idx.data.y = int(gl_GlobalInvocationID.z); + + Conv2dBlockExtents out_block_extents = make_block_extents(output_sizes); + if (block_idx_out_of_bounds(out_block_idx, out_block_extents)) { + return; + } + + const int out_w = mul_4(out_block_idx.data.x); + const int w_start = + (out_w * conv2d_params.stride.x) - conv2d_params.padding.x; + const int w_end = ((out_w + 3) * conv2d_params.stride.x) - + conv2d_params.padding.x + + (conv2d_params.kernel_size.x - 1) * conv2d_params.dilation.x; + + Conv2dBlockExtents in_block_extents = make_block_extents(input_sizes); + + const ivec4 input_zps = ivec4(pack_into_int32(ivec4(input_zp))); + const vec4 weight_scales = vec4(t_weight_scales[out_block_idx.data.z]); + + Int32Accum out_accum; + initialize(out_accum); + + const int IC4_per_group = div_up_4(conv2d_params.in_channels_per_group); + + const int n = mul_4(out_block_idx.data.z); + const int group_idx = n / conv2d_params.out_channels_per_group; + const int group_ic4_offset = group_idx * IC4_per_group; + + for 
(int ky = 0; ky < conv2d_params.kernel_size.y; ky++) { + const int h = out_block_idx.data.y * conv2d_params.stride.y - + conv2d_params.padding.y + ky * conv2d_params.dilation.y; + + for (int ic4 = 0; ic4 < IC4_per_group; ic4++) { + Int8InputWindow1D int8_input_window = load_input_window( + w_start, + w_end, + h, + group_ic4_offset + ic4, + in_block_extents, + input_zps); + + for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) { + const ivec4 weight_block = load_weight_block( + ic4, + kx, + ky, + out_block_idx.data.z, + IC4_per_group, + conv2d_params.kernel_size.x, + conv2d_params.kernel_size.y, + out_block_extents.data.z); + + perform_conv1d(out_accum, int8_input_window, weight_block, kx); + } + } + } + + FPPerOutChannelParams weight_scales_tile; + load_weight_scales_tile(weight_scales_tile, out_block_idx.data.z); + + IntPerOutChannelParams weight_sums_tile; + load_weight_sums_tile(weight_sums_tile, out_block_idx.data.z); + + Int8OutTile int8_out_tile; + initialize(int8_out_tile); + + if (apply_bias > 0) { + FPPerOutChannelParams bias_tile; + load_bias_tile(bias_tile, out_block_idx.data.z); + + compute_int8_out_tile_with_int32_accum( + int8_out_tile, + out_accum, + input_scale, + input_zp, + output_inv_scale, + output_zp, + weight_sums_tile, + weight_scales_tile, + bias_tile); + } + else { + compute_int8_out_tile_with_int32_accum( + int8_out_tile, + out_accum, + input_scale, + input_zp, + output_inv_scale, + output_zp, + weight_sums_tile, + weight_scales_tile); + } + + store_packed_int8_output_tile( + int8_out_tile, out_block_idx, out_block_extents); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.yaml new file mode 100644 index 00000000000..5da9cc14584 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.yaml @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_q8ta_q8csw_q8to: + parameter_names_with_default_values: + DTYPE: float + IO_STORAGE: texture3d + WEIGHT_STORAGE: texture2d + generate_variant_forall: + combination: + parameter_names: [IO_STORAGE, WEIGHT_STORAGE] + combos: + - parameter_values: [buffer, texture2d] + DTYPE: + - VALUE: float + shader_variants: + - NAME: conv2d_q8ta_q8csw_q8to diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.glsl new file mode 100644 index 00000000000..b44e37766fc --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.glsl @@ -0,0 +1,149 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +$if IO_STORAGE == "buffer": + #define PACKED_INT8_OUTPUT_BUFFER + #define PACKED_INT8_INPUT_BUFFER +$if WEIGHT_STORAGE == "buffer": + #define WEIGHT_BUFFER + +// corresponds to input/output width dim +#define TILE_M4 1 +// corresponds to input channels dim +#define TILE_K4 1 +// corresponds to output channels dim +#define TILE_N4 2 + +#define TILE_M 4 +#define TILE_K 4 +#define TILE_N 8 + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "output_sizes")} +${layout_declare_ubo(B, "ivec4", "im2col_sizes")} +${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float input_scale; + int input_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "1")} + +#include "conv2d_int8_input_tile_load.glslh" +#include "linear_int8_weight_tile_load.glslh" +#include "linear_fp_output_tile_int8_int8_compute.glslh" +#include "linear_int_weight_sums_load.glslh" +#include "linear_fp_weight_scales_load.glslh" +#include "linear_fp_bias_load.glslh" +#include 
"linear_int8_output_tile_compute.glslh" +#include "conv2d_int8_output_tile_store.glslh" + +void main() { + Conv2dBlockIndex output_block_idx; + output_block_idx.data.z = int(gl_GlobalInvocationID.x) * TILE_N4; + output_block_idx.data.x = int(gl_GlobalInvocationID.y) * TILE_M4; + output_block_idx.data.y = int(gl_GlobalInvocationID.z); + + Conv2dBlockExtents output_block_extents = make_block_extents(output_sizes); + if (block_idx_out_of_bounds(output_block_idx, output_block_extents)) { + return; + } + + const int n = mul_4(output_block_idx.data.z); + + const int group_idx = n / conv2d_params.out_channels_per_group; + const int group_k4_offset = group_idx * conv2d_params.K4_per_group; + + Conv2dBlockExtents input_block_extents = make_block_extents(im2col_sizes); + + Int32Accum out_accum; + initialize(out_accum); + + Int8InputTile int8_input_tile; + Int8WeightTile int8_weight_tile; + + Int8InputTileIndex input_idx = make_initial_int8_input_tile_index( + output_block_idx, input_block_extents, group_k4_offset); + + for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) { + load_packed_int8_input_tile(int8_input_tile, input_idx); + + load_int8_weight_tile( + int8_weight_tile, + output_block_idx.data.z, + k4, + output_block_extents.data.z); + + int_accumulate_with_int8_weight( + out_accum, int8_input_tile, int8_weight_tile); + + increment_k4(input_idx); + } + + FPPerOutChannelParams weight_scales_tile; + load_weight_scales_tile(weight_scales_tile, output_block_idx.data.z); + + IntPerOutChannelParams weight_sums_tile; + load_weight_sums_tile(weight_sums_tile, output_block_idx.data.z); + + Int8OutTile int8_out_tile; + initialize(int8_out_tile); + + if (apply_bias > 0) { + FPPerOutChannelParams bias_tile; + load_bias_tile(bias_tile, output_block_idx.data.z); + + compute_int8_out_tile_with_int32_accum( + int8_out_tile, + out_accum, + input_scale, + input_zp, + output_inv_scale, + output_zp, + weight_sums_tile, + weight_scales_tile, + bias_tile); + } + else { + 
compute_int8_out_tile_with_int32_accum( + int8_out_tile, + out_accum, + input_scale, + input_zp, + output_inv_scale, + output_zp, + weight_sums_tile, + weight_scales_tile); + } + + store_packed_int8_output_tile( + int8_out_tile, output_block_idx, output_block_extents); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.yaml new file mode 100644 index 00000000000..fa92481f5ef --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.yaml @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_q8ta_q8csw_q8to_linear_tiled: + parameter_names_with_default_values: + DTYPE: float + IO_STORAGE: texture3d + WEIGHT_STORAGE: texture2d + generate_variant_forall: + combination: + parameter_names: [IO_STORAGE, WEIGHT_STORAGE] + combos: + - parameter_values: [buffer, texture2d] + DTYPE: + - VALUE: float + shader_variants: + - NAME: conv2d_q8ta_q8csw_q8to_linear_tiled diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.yaml b/backends/vulkan/runtime/graph/ops/glsl/full.yaml index eff78a7938d..1a5b0cb235e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/full.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/full.yaml @@ -14,5 +14,6 @@ full: DTYPE: - VALUE: half - VALUE: float + - VALUE: int32 shader_variants: - NAME: full diff --git a/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.glsl b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.glsl new file mode 100644 index 00000000000..3ecaa597ecc --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.glsl @@ -0,0 +1,73 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +$if STORAGE == "buffer": + #define PACKED_INT8_OUTPUT_BUFFER + #define PACKED_INT8_INPUT_BUFFER + +#define TILE_M4 1 +#define TILE_N4 1 +#define TILE_K4 1 + +#define TILE_M 4 +#define TILE_N 4 +#define TILE_K 4 + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", STORAGE, is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "im2col_sizes")} +// Sizes of the output image +${layout_declare_ubo(B, "ivec4", "output_sizes")} +// Sizes of the input image +${layout_declare_ubo(B, "ivec4", "input_sizes")} + +${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float inv_scale; + int zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "conv2d_int8_output_tile_store.glslh" +#include "im2col_packed_int8_utils.glslh" + +void main() { + const int out_buf_idx = int(gl_GlobalInvocationID.x); + Conv2dBlockExtents im2col_block_extents = make_block_extents(im2col_sizes); + + Conv2dBlockIndex im2col_block_idx = linear_idx_to_block_idx( + out_buf_idx, im2col_block_extents); + + if (block_idx_out_of_bounds(im2col_block_idx, im2col_block_extents)) { + return; + } + + Im2ColBlockLoadIndices load_ixs = im2col_block_idx_to_load_ixs( + im2col_block_idx); + + Conv2dBlockExtents input_block_extents = make_block_extents(input_sizes); + + const ivec4 input_zps = ivec4(pack_into_int32(ivec4(zp))); + Int8OutTile int8_im2col_tile; + int8_im2col_tile.data[0][0] = load_im2col_block( + load_ixs, input_block_extents, zp, input_zps); + + store_packed_int8_output_tile( + int8_im2col_tile, im2col_block_idx, im2col_block_extents); +} 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.yaml b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.yaml new file mode 100644 index 00000000000..1c14f1fdc5a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.yaml @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +im2col_packed_int8: + parameter_names_with_default_values: + STORAGE: buffer + generate_variant_forall: + STORAGE: + - VALUE: buffer + shader_variants: + - NAME: im2col_packed_int8 diff --git a/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8_utils.glslh new file mode 100644 index 00000000000..2b1870c493d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8_utils.glslh @@ -0,0 +1,287 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef IM2COL_PACKED_INT8_GLSLH +#define IM2COL_PACKED_INT8_GLSLH + +#include "common.glslh" + +struct Conv2dBlockElementIndex { + int x4; + int y; + int z4; + + int row; + int col; +}; + +struct Im2ColBlockLoadIndices { + bool block_aligned; + bool cols_aligned; + bool rows_contiguous; + + int im2col_w_start; + int im2col_h; + int k_in_group_start; + int group_idx; + + Conv2dBlockElementIndex block_idx_start; +}; + +Conv2dBlockElementIndex tidx_to_block_elem_idx(const TensorIndex4D tidx) { + Conv2dBlockElementIndex block_idx; + block_idx.x4 = div_4(tidx.data.x); + block_idx.row = mod_4(tidx.data.x); + + block_idx.y = tidx.data.y; + + block_idx.z4 = div_4(tidx.data.z); + block_idx.col = mod_4(tidx.data.z); + + return block_idx; +} + +TensorIndex4D get_input_tensor_tidx( + const int w, + const int h, + const int k_in_group, + const int group_idx) { + TensorIndex4D tidx; + tidx.data.w = 0; + + const int c_in_group = k_in_group % conv2d_params.in_channels_per_group; + const int row = k_in_group / conv2d_params.in_channels_per_group; + const int kernel_x = row % conv2d_params.kernel_size.x; + const int kernel_y = row / conv2d_params.kernel_size.x; + + tidx.data.z = group_idx * conv2d_params.in_channels_per_group + c_in_group; + + tidx.data.x = (w * conv2d_params.stride.x) - conv2d_params.padding.x + + (kernel_x * conv2d_params.dilation.x); + tidx.data.y = (h * conv2d_params.stride.y) - conv2d_params.padding.y + + (kernel_y * conv2d_params.dilation.y); + + return tidx; +} + +Im2ColBlockLoadIndices im2col_block_idx_to_load_ixs( + Conv2dBlockIndex im2col_block_idx) { + const int im2col_w = mul_4(im2col_block_idx.data.x); + const int im2col_h = im2col_block_idx.data.y; + const int im2col_k = mul_4(im2col_block_idx.data.z); + + const int group_idx = im2col_k / conv2d_params.K_per_group; + const int k_in_group = im2col_k % conv2d_params.K_per_group; + + TensorIndex4D input_tidx = + get_input_tensor_tidx(im2col_w, im2col_h, k_in_group, group_idx); + + bool 
cols_aligned = (mod_4(input_tidx.data.z) == 0) && + (input_tidx.data.z + 3 < conv2d_params.in_channels_per_group); + + bool rows_aligned = mod_4(input_tidx.data.x) == 0; + bool rows_contiguous = conv2d_params.stride.x == 1; + + Im2ColBlockLoadIndices load_ixs; + load_ixs.block_aligned = cols_aligned && rows_aligned && rows_contiguous; + load_ixs.cols_aligned = cols_aligned; + load_ixs.rows_contiguous = rows_contiguous; + + load_ixs.im2col_w_start = im2col_w; + load_ixs.im2col_h = im2col_h; + load_ixs.k_in_group_start = k_in_group; + load_ixs.group_idx = group_idx; + + load_ixs.block_idx_start = tidx_to_block_elem_idx(input_tidx); + + return load_ixs; +} + +bool is_block_elem_idx_in_bounds( + const Conv2dBlockElementIndex idx, + const Conv2dBlockExtents block_extents) { + const ivec3 block_idx = ivec3(idx.x4, idx.y, idx.z4); + if (any(lessThan(block_idx, ivec3(0))) || + any(greaterThanEqual(block_idx, block_extents.data))) { + return false; + } + return true; +} + +int load_packed_int8_input_element( + const Conv2dBlockElementIndex idx, + const Conv2dBlockExtents block_extents, + const int input_zp) { + // bounds checking + if (!is_block_elem_idx_in_bounds(idx, block_extents)) { + return input_zp; + } +#ifdef PACKED_INT8_INPUT_BUFFER + const int buf_idx = + idx.y * block_extents.data_xz + idx.x4 * block_extents.data.z + idx.z4; + const ivec4 tile = t_packed_int8_input[buf_idx]; +#else + const ivec4 tile = + texelFetch(t_packed_int8_input, ivec3(idx.x4, idx.y, idx.z4), 0); +#endif + return extract_8bit_from_packed_int_le(tile[idx.row], idx.col); +} + +Conv2dBlockElementIndex get_packed_int8_input_element_idx( + const int im2col_w, + const int im2col_h, + const int k_in_group, + const int group_idx) { + TensorIndex4D input_tidx = + get_input_tensor_tidx(im2col_w, im2col_h, k_in_group, group_idx); + + return tidx_to_block_elem_idx(input_tidx); +} + +ivec4 load_im2col_block_aligned( + const Im2ColBlockLoadIndices load_ixs, + const Conv2dBlockExtents block_extents) { 
+#ifdef PACKED_INT8_INPUT_BUFFER + const int buf_idx = load_ixs.block_idx_start.y * block_extents.data_xz + + load_ixs.block_idx_start.x4 * block_extents.data.z + + load_ixs.block_idx_start.z4; + return t_packed_int8_input[buf_idx]; +#else + return texelFetch( + t_packed_int8_input, + ivec3( + load_ixs.block_idx_start.x4, + load_ixs.block_idx_start.y, + load_ixs.block_idx_start.z4), + 0); +#endif +} + +ivec4 load_im2col_block_c_aligned_w_contiguous( + const Im2ColBlockLoadIndices load_ixs, + const Conv2dBlockExtents block_extents, + const ivec4 input_zps) { + ivec4 im2col_block; + Conv2dBlockElementIndex block_elem_idx = load_ixs.block_idx_start; + +#ifdef PACKED_INT8_INPUT_BUFFER + int buf_idx = load_ixs.block_idx_start.y * block_extents.data_xz + + load_ixs.block_idx_start.x4 * block_extents.data.z + + load_ixs.block_idx_start.z4; +#endif + + ivec4 in_block = input_zps; + if (is_block_elem_idx_in_bounds(block_elem_idx, block_extents)) { +#ifdef PACKED_INT8_INPUT_BUFFER + in_block = t_packed_int8_input[buf_idx]; +#else + in_block = texelFetch( + t_packed_int8_input, + ivec3(block_elem_idx.x4, block_elem_idx.y, block_elem_idx.z4), + 0); +#endif + } + + int current_row = 0; + int r_limit = min(4 - block_elem_idx.row, 4); + for (int r = 0; r < r_limit; r++) { + im2col_block[current_row++] = in_block[r + block_elem_idx.row]; + } + + in_block = input_zps; + block_elem_idx.x4++; +#ifdef PACKED_INT8_INPUT_BUFFER + buf_idx += block_extents.data.z; +#endif + + if (is_block_elem_idx_in_bounds(block_elem_idx, block_extents)) { +#ifdef PACKED_INT8_INPUT_BUFFER + in_block = t_packed_int8_input[buf_idx]; +#else + in_block = texelFetch( + t_packed_int8_input, + ivec3(block_elem_idx.x4, block_elem_idx.y, block_elem_idx.z4), + 0); +#endif + } + + for (int r = 0; current_row < 4; ++r) { + im2col_block[current_row++] = in_block[r]; + } + + return im2col_block; +} + +ivec4 load_im2col_block_no_alignment( + const Im2ColBlockLoadIndices load_ixs, + const Conv2dBlockExtents 
block_extents, + const int input_zp) { + ivec4 im2col_block; + + for (int r = 0; r < 4; r++) { + const int im2col_w = load_ixs.im2col_w_start + r; + ivec4 row_values; + for (int c = 0; c < 4; c++) { + const int k_in_group = load_ixs.k_in_group_start + c; + + if (k_in_group >= conv2d_params.logical_K_per_group) { + row_values[c] = input_zp; + continue; + } + + Conv2dBlockElementIndex block_idx = get_packed_int8_input_element_idx( + im2col_w, load_ixs.im2col_h, k_in_group, load_ixs.group_idx); + + row_values[c] = + load_packed_int8_input_element(block_idx, block_extents, input_zp); + } + + im2col_block[r] = pack_into_int32(row_values); + } + return im2col_block; +} + +ivec4 load_im2col_block( + const Im2ColBlockLoadIndices load_ixs, + const Conv2dBlockExtents block_extents, + const int input_zp, + const ivec4 input_zps) { + if (load_ixs.cols_aligned && load_ixs.rows_contiguous) { + return load_im2col_block_c_aligned_w_contiguous( + load_ixs, block_extents, input_zps); + } + return load_im2col_block_no_alignment(load_ixs, block_extents, input_zp); +} + +#ifdef DEBUG_MODE + +void printLoadIndices(const Im2ColBlockLoadIndices load_ixs) { + debugPrintfEXT("LoadIndices: \\n"); + + if (load_ixs.block_aligned) { + debugPrintfEXT(" block_aligned \\n"); + } + if (load_ixs.cols_aligned) { + debugPrintfEXT(" cols_aligned \\n"); + } + if (load_ixs.rows_contiguous) { + debugPrintfEXT(" rows_contiguous \\n"); + } + + debugPrintfEXT( + " block_idx_start: %d %d %d || %d %d \\n", + load_ixs.block_idx_start.x4, + load_ixs.block_idx_start.y, + load_ixs.block_idx_start.z4, + load_ixs.block_idx_start.row, + load_ixs.block_idx_start.col); +} + +#endif + +#endif // IM2COL_PACKED_INT8_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh index da326b26e93..c95abdcb230 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh @@ 
-16,19 +16,6 @@ #include "common.glslh" -int sign_extend_8bit(const int val) { - if ((val & 0x80) != 0) { - return val | (~0xFF); - } - return val; -} - -int extract_8bit_from_packed_int_le(const int packed, const int i) { - // account for little endian - int byte = sign_extend_8bit(packed >> (8 * i) & 0xFF); - return byte; -} - // Extract a 4-bit value from a packed int (little endian) // It is assumed that the 4-bit value is in the range [0, 15] int extract_4bit_from_packed_int_le(const int packed, const int col) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh index ca25e406ac1..850dc7943c0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh @@ -75,7 +75,7 @@ void accumulate_out_tile_with_int_accum( input_zp_vec * weight_sums.data[n4] + accum.data[m][n4]; out_tile.data[m][n4] = fma(VEC4_T(accum_adjusted), - VEC4_T(input_q_scale * weight_scales.data[0]), + VEC4_T(input_q_scale * weight_scales.data[n4]), out_tile.data[m][n4]); } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh index a6dbd7e78a2..8f19418cd19 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh @@ -43,13 +43,6 @@ ivec4 quantize( return clamp(ivec4(quantized), -128, 127); } -int pack_into_int32(const ivec4 quant_vals) { - int packed = ((quant_vals[0] & 0xFF) << 0) | ((quant_vals[1] & 0xFF) << 8) | - ((quant_vals[2] & 0xFF) << 16) | ((quant_vals[3] & 0xFF) << 24); - - return packed; -} - void quantize_and_pack( out Int8InputBlock packed, const FPInputTile in_block, diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile.glslh new file mode 100644 index 00000000000..14aa6558bfc --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile.glslh @@ -0,0 +1,67 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Macro Settings: + * - TILE_M + * - TILE_N4 + */ + +#ifndef LINEAR_INT8_OUTPUT_TILE_GLSLH +#define LINEAR_INT8_OUTPUT_TILE_GLSLH + +#extension GL_EXT_control_flow_attributes : require + +struct Int8OutTile { + ivec4 data[TILE_M4][TILE_N4]; +}; + +void initialize(out Int8OutTile tile) { + [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + tile.data[m4][n4] = ivec4(0); + } + } +} + +#ifdef DEBUG_MODE + +#include "linear_common.glslh" + +void printInt8OutTile(const Int8OutTile tile) { + debugPrintfEXT( + "Int8InputTile [TILE_M4=%d][TILE_N4=%d]:\\n", TILE_M4, TILE_N4); + + [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + debugPrintfEXT(" tile[%d][%d] (ivec4): ", m4, n4); + + // Each ivec4 contains 4 packed integers, each integer contains 4 8-bit + // values + [[unroll]] for (int vec_idx = 0; vec_idx < 4; ++vec_idx) { + int packed_int = tile.data[m4][n4][vec_idx]; + debugPrintfEXT("packed_int[%d]=%d -> [", vec_idx, packed_int); + + // Extract 4 8-bit values from this packed integer + [[unroll]] for (int byte_idx = 0; byte_idx < 4; ++byte_idx) { + int val = extract_8bit_from_packed_int_le(packed_int, byte_idx); + if (byte_idx < 3) { + debugPrintfEXT("%d, ", val); + } else { + debugPrintfEXT("%d] ", val); + } + } + } + debugPrintfEXT("\\n"); + } + } +} + +#endif // DEBUG_MODE + +#endif // LINEAR_INT8_OUTPUT_TILE_GLSLH diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile_compute.glslh new file mode 100644 index 00000000000..1251ca60b87 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile_compute.glslh @@ -0,0 +1,93 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Defines functions to compute a FPOutTile using int8 input and weight tiles. + * + * Settings: + * - TILE_M: The number of rows in the output tile. + * - TILE_N4: The number of (groups of 4) columns in the output tile. + */ + +#ifndef LINEAR_INT8_OUTPUT_TILE_INT8_INT8_COMPUTE_GLSLH +#define LINEAR_INT8_OUTPUT_TILE_INT8_INT8_COMPUTE_GLSLH + +#extension GL_EXT_control_flow_attributes : require +#extension GL_EXT_integer_dot_product : require + +#include "linear_fp_per_out_channel_params.glslh" +#include "linear_int8_output_tile.glslh" +#include "linear_int_accumulator.glslh" +#include "linear_int_per_out_channel_params.glslh" + +void compute_int8_out_tile_with_int32_accum( + out Int8OutTile out_tile, + const Int32Accum accum, + const float input_q_scale, + const int input_q_zp, + const float output_q_inv_scale, + const int output_q_zp, + const IntPerOutChannelParams weight_sums, + const FPPerOutChannelParams weight_scales) { + ivec4 input_zp_vec = ivec4(-input_q_zp); + ivec4 output_zp_vec = ivec4(-output_q_zp); + [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { + [[unroll]] for (int m4i = 0; m4i < 4; ++m4i) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + const int m = mul_4(m4) + m4i; + // Compute floating point output values + ivec4 accum_adjusted = + input_zp_vec * weight_sums.data[n4] + accum.data[m][n4]; + vec4 float_out_texel = + vec4(accum_adjusted) * vec4(weight_scales.data[n4] * input_q_scale); + // 
Requantize to int8 + float_out_texel = + round(float_out_texel * output_q_inv_scale) + output_q_zp; + ivec4 quantized_out_texel = clamp(ivec4(float_out_texel), -128, 127); + + out_tile.data[m4][n4][m4i] = pack_into_int32(quantized_out_texel); + } + } + } +} + +void compute_int8_out_tile_with_int32_accum( + out Int8OutTile out_tile, + const Int32Accum accum, + const float input_q_scale, + const int input_q_zp, + const float output_q_inv_scale, + const int output_q_zp, + const IntPerOutChannelParams weight_sums, + const FPPerOutChannelParams weight_scales, + const FPPerOutChannelParams bias) { + ivec4 input_zp_vec = ivec4(-input_q_zp); + ivec4 output_zp_vec = ivec4(-output_q_zp); + [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { + [[unroll]] for (int m4i = 0; m4i < 4; ++m4i) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + const int m = mul_4(m4) + m4i; + // Compute floating point output values + ivec4 accum_adjusted = + input_zp_vec * weight_sums.data[n4] + accum.data[m][n4]; + vec4 float_out_texel = + fma(vec4(accum_adjusted), + vec4(weight_scales.data[n4]) * input_q_scale, + vec4(bias.data[n4])); + // Requantize to int8 + float_out_texel = + round(float_out_texel * output_q_inv_scale) + output_q_zp; + ivec4 quantized_out_texel = clamp(ivec4(float_out_texel), -128, 127); + + out_tile.data[m4][n4][m4i] = pack_into_int32(quantized_out_texel); + } + } + } +} + +#endif // LINEAR_INT8_OUTPUT_TILE_INT8_INT8_COMPUTE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl index 0ad91643219..878821d4189 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl @@ -76,9 +76,6 @@ void main() { const int N4 = div_up_4(output_sizes.x); // number of texels in each row const int N8 = div_up_8(output_sizes.x); // number of texels in each row - bool should_print = (n8 == 0) && (m4 == 0); - 
should_print = false; - // VEC4_T out_texels[4][2]; FPOutTile out_tile; initialize(out_tile); diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml index aa1de3077fc..989729f2d7f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml @@ -11,7 +11,7 @@ linear_q8ta_q8csw_tiled: PACKED_INT8_INPUT_STORAGE: buffer WEIGHT_STORAGE: texture2d TILE_M4: 1 - TILE_N4: 1 + TILE_N4: 2 TILE_K4: 1 generate_variant_forall: DTYPE: diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_dw_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_dw_weights.glsl new file mode 100644 index 00000000000..da4162b6e58 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_dw_weights.glsl @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +${define_active_storage_type(STORAGE)} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_packed_int8_weight", "int", STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_int8_weight", "int", "buffer")} + +layout(push_constant) uniform restrict Block { + ivec4 qmat2_sizes; + ivec3 orig_sizes; // [K_h, aligned_K_w, OC] +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "common.glslh" + +void main() { + // The size of the source weight tensor is [K_h, aligned_K_w, OC] for depthwise conv. + // Each shader invocation processes a 4x4 block of weights for a group of output channels. 
+ const int oc4 = int(gl_GlobalInvocationID.x); + const int k4 = int(gl_GlobalInvocationID.y); + const int k = mul_4(k4); + + const int H = orig_sizes.x; + const int orig_W = orig_sizes.y; + const int W4 = div_up_4(orig_W); + const int OC = orig_sizes.z; + + const int h = k4 / W4; + const int w4 = k4 % W4; + const int w = mul_4(w4); + + // Determine the total number of blocks and check bounds + const int OC4 = div_up_4(OC); + const int K4 = H * W4; + + if (oc4 >= OC4 || k4 >= K4) { + return; + } + + ivec4 packed_block; + + int buf_idx = (h * orig_W + w) * OC4 + oc4; + int r_limit = min(4, orig_W - w); + [[unroll]] for (int r = 0; r < r_limit; r++) { + packed_block[r] = t_int8_weight[buf_idx]; + buf_idx += OC4; + } + [[unroll]] for (int r = r_limit; r < 4; r++) { + packed_block[r] = 0; + } + +#ifdef USING_BUFFER + t_packed_int8_weight[k4 * OC4 + oc4] = packed_block; +#else + imageStore(t_packed_int8_weight, ivec2(oc4, k4), packed_block); +#endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_dw_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_dw_weights.yaml new file mode 100644 index 00000000000..9cfa3108ff0 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_dw_weights.yaml @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +pack_q8_conv2d_dw_weights: + parameter_names_with_default_values: + STORAGE: buffer + generate_variant_forall: + STORAGE: + - VALUE: buffer + - VALUE: texture2d + shader_variants: + - NAME: pack_q8_conv2d_dw_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_weights.glsl new file mode 100644 index 00000000000..e9982a8273d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_weights.glsl @@ -0,0 +1,82 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +${define_active_storage_type(STORAGE)} + +#extension GL_EXT_control_flow_attributes : require + +${define_required_extensions("int8")} + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_packed_int8_weight", "int", STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_int8_weight", "int8", "buffer")} + +layout(push_constant) uniform restrict Block { + ivec4 qmat2_sizes; + ivec4 orig_sizes; // [OC, K_h, K_w, IC] +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "common.glslh" + +void main() { + const int block_x = int(gl_GlobalInvocationID.x); + const int block_y = int(gl_GlobalInvocationID.y); + + const int kx = block_x % orig_sizes.z; + const int oc4 = block_x / orig_sizes.z; + + const int OC4 = div_up_4(orig_sizes.x); + const int IC4 = div_up_4(orig_sizes.w); + + const int nblocks_x = orig_sizes.z * OC4; + const int nblocks_y = IC4 * orig_sizes.y; + + const int ic4 = block_y % IC4; + const int ky = block_y / IC4; + + if (block_x >= nblocks_x || block_y >= nblocks_y) { + return; + } + + const int oc = mul_4(oc4); + const int ic = mul_4(ic4); + + const int oc_stride = align_up_4(orig_sizes.y * 
orig_sizes.z * orig_sizes.w); + const int oc_offset = oc * oc_stride; + const int ky_offset = ky * (orig_sizes.z * orig_sizes.w); + const int kx_offset = kx * orig_sizes.w; + int buf_idx = oc_offset + ky_offset + kx_offset + ic; + + ivec4 packed_block = ivec4(0); + for (int row = 0; row < 4; row++) { + if (oc + row < orig_sizes.x) { + ivec4 weight_vals = ivec4(0); + for (int col = 0; col < 4; col++) { + if (ic + col < orig_sizes.w) { + weight_vals[col] = int(t_int8_weight[buf_idx + col]); + } + } + packed_block[row] = pack_into_int32(weight_vals); + } + buf_idx += oc_stride; + } + +#ifdef USING_BUFFER + const int out_buf_idx = block_y * (nblocks_x) + block_x; + t_packed_int8_weight[out_buf_idx] = packed_block; +#else + imageStore(t_packed_int8_weight, ivec2(block_x, block_y), packed_block); +#endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_weights.yaml new file mode 100644 index 00000000000..9331de6e758 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_weights.yaml @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +pack_q8_conv2d_weights: + parameter_names_with_default_values: + STORAGE: buffer + generate_variant_forall: + STORAGE: + - VALUE: buffer + - VALUE: texture2d + shader_variants: + - NAME: pack_q8_conv2d_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.glsl new file mode 100644 index 00000000000..d485523709b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.glsl @@ -0,0 +1,77 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)} +#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)} + +// corresponds to the input width dim +#define TILE_M4 1 +// corresponds to the input channels dim +#define TILE_K4 1 + +#define TILE_M 4 + +$if OUTPUT_STORAGE == "buffer": + #define OUTPUT_BUFFER +$if INPUT_STORAGE == "buffer": + #define INPUT_BUFFER + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_fp_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "input_sizes")} + +layout(push_constant) uniform restrict Block { + float inv_scale; + int zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "conv2d_fp_input_tile_load.glslh" +#include "linear_int8_input_block.glslh" + +void store_packed_int8_block( + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents, + const Int8InputBlock packed_int8_block) { +#ifdef OUTPUT_BUFFER + const int buffer_idx = block_idx.data.y * block_extents.data_xz + + block_idx.data.x * block_extents.data.z + block_idx.data.z; + t_packed_int8_input[buffer_idx] = packed_int8_block.data; +#else + imageStore(t_packed_int8_input, block_idx.data, packed_int8_block.data); +#endif +} + +void main() { + Conv2dBlockIndex block_idx; + block_idx.data = ivec3(gl_GlobalInvocationID); + + Conv2dBlockExtents block_extents = make_block_extents(input_sizes); + if (block_idx_out_of_bounds(block_idx, block_extents)) { + return; + } + + FPInputTile fp_tile; + load_fp_input_tile(fp_tile, block_idx); + + Int8InputBlock int8_block; + 
quantize_and_pack(int8_block, fp_tile, inv_scale, zp); + + store_packed_int8_block(block_idx, block_extents, int8_block); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.yaml new file mode 100644 index 00000000000..712d3156e2e --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.yaml @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +quantize_and_pack_q8ta_conv2d_input: + parameter_names_with_default_values: + DTYPE: float + OUTPUT_STORAGE: texture3d + INPUT_STORAGE: texture3d + generate_variant_forall: + combination: + parameter_names: [OUTPUT_STORAGE, INPUT_STORAGE] + combos: + - parameter_values: [texture3d, texture3d] + - parameter_values: [buffer, texture3d] + DTYPE: + - VALUE: float + shader_variants: + - NAME: quantize_and_pack_q8ta_conv2d_input diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_fp_k_cache_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/sdpa_fp_k_cache_tile_load.glslh index 03132db1348..1880397181d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/sdpa_fp_k_cache_tile_load.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/sdpa_fp_k_cache_tile_load.glslh @@ -44,7 +44,6 @@ void load_k_cache_tile_no_checks( const int context_len, const int C, const int KV_H) { - bool should_print = d4_start == 0 && c_start == 0 && kv_h == 0; [[unroll]] for (int c = 0; c < TILE_N; ++c) { const int c4 = div_4(c); const int c4i = mod_4(c); diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl index d35492bc367..86a2229c416 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl +++ 
b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl @@ -42,7 +42,8 @@ layout(constant_id = 5) const int group_dim = 1; // work group will write into its assigned element in the shared array. #define MAX_NTHREADS 16 -shared vec4 shared_vecs[MAX_NTHREADS]; +shared vec4 shared_max[MAX_NTHREADS]; +shared vec4 shared_sum[MAX_NTHREADS]; #include "indexing_utils.h" @@ -102,13 +103,13 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { max_elements = max(max_elements, load_texel(tin, scan_pos)); } - shared_vecs[smi] = max_elements; + shared_max[smi] = max_elements; barrier(); // Iterate over the partial maximums to obtain the overall maximum group_i = tid.y * NWORKERS; - max_elements = shared_vecs[group_i++]; + max_elements = shared_max[group_i++]; for (int i = 1; i < NWORKERS; ++i, group_i++) { - max_elements = max(max_elements, shared_vecs[group_i]); + max_elements = max(max_elements, shared_max[group_i]); } scan_pos[reduce_dim] = tid.x; @@ -118,13 +119,13 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { denominators += exp(load_texel(tin, scan_pos) - max_elements); } - shared_vecs[smi] = denominators; + shared_sum[smi] = denominators; barrier(); // Iterate over the partial sums to obtain the overall sum group_i = tid.y * NWORKERS; - denominators = shared_vecs[group_i++]; + denominators = shared_sum[group_i++]; for (int i = 1; i < NWORKERS; ++i, group_i++) { - denominators += shared_vecs[group_i]; + denominators += shared_sum[group_i]; } // Determine if there are any padding elements in the final texel of the @@ -184,13 +185,13 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { max_elements.x = max(intex[i], max_elements.x); } } - shared_vecs[smi] = max_elements; + shared_max[smi] = max_elements; barrier(); // Iterate over the partial maximums to obtain the overall maximum group_i = tid.y * NWORKERS; - max_elements = 
shared_vecs[group_i++]; + max_elements = shared_max[group_i++]; for (int i = 1; i < NWORKERS; ++i, group_i++) { - max_elements = max(max_elements, shared_vecs[group_i]); + max_elements = max(max_elements, shared_max[group_i]); } // Each element of the texel is itself a partial maximum; iterate over the // texel to find the actual maximum @@ -214,13 +215,13 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { denominators.x += exp(intex[i] - max_element); } } - shared_vecs[smi] = denominators; + shared_sum[smi] = denominators; barrier(); // Iterate over the partial sums to obtain the overall sum group_i = tid.y * NWORKERS; - denominators = shared_vecs[group_i++]; + denominators = shared_sum[group_i++]; for (int i = 1; i < NWORKERS; ++i, group_i++) { - denominators += shared_vecs[group_i]; + denominators += shared_sum[group_i]; } // Reduce over the accumulated texel to find the overall sum float denominator = 0; diff --git a/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.glsl b/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.glsl new file mode 100644 index 00000000000..798366b523a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.glsl @@ -0,0 +1,117 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)} +#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)} + +// corresponds to the output width dim +#define TILE_M4 1 +// corresponds to the output channels dim +#define TILE_K4 1 + +#define TILE_M 4 + +$if OUTPUT_STORAGE == "buffer": + #define OUTPUT_BUFFER +$if INPUT_STORAGE == "buffer": + #define INPUT_BUFFER + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_fp_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_output", "int", INPUT_STORAGE, is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "output_sizes")} + +layout(push_constant) uniform restrict Block { + float scale; + int zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "linear_fp_input_tile.glslh" +#include "linear_int8_input_tile.glslh" + +void load_packed_int8_tile( + out Int8InputTile int8_tile, + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents) { +#ifdef INPUT_BUFFER + const int buffer_idx = block_idx.data.y * block_extents.data_xz + + block_idx.data.x * block_extents.data.z + block_idx.data.z; + int8_tile.data[0][0] = t_packed_int8_output[buffer_idx]; +#else + int8_tile.data[0][0] = texelFetch(t_packed_int8_output, block_idx.data, 0); +#endif +} + +VEC4_T +dequantize_8bit(const ivec4 val, const float q_scale, const int q_zero_point) { + return VEC4_T(val - q_zero_point) * q_scale; +} + +void unpack_and_dequantize( + out FPInputTile fp_tile, + const Int8InputTile int8_tile, + const float q_scale, + const int q_zero_point) { + [[unroll]] for (int w = 0; w < 4; ++w) { + int packed = int8_tile.data[0][0][w]; + fp_tile.data[w][0] = dequantize_8bit( + ivec4( + extract_8bit_from_packed_int_le(packed, 0), + extract_8bit_from_packed_int_le(packed, 1), + 
extract_8bit_from_packed_int_le(packed, 2), + extract_8bit_from_packed_int_le(packed, 3)), + q_scale, + q_zero_point); + } +} + +void store_fp_output_texel( + const Conv2dTensorIndex tidx, + const VEC4_T out_texel) { + imageStore(t_fp_output, tidx.data, out_texel); +} + +void store_fp_tile( + const FPInputTile block, + const Conv2dBlockIndex block_idx) { + Conv2dTensorIndex store_tidx = block_idx_to_tensor_idx(block_idx); + [[unroll]] for (int w = 0; w < 4; w++) { + store_fp_output_texel(store_tidx, block.data[w][0]); + store_tidx.data.x++; + } +} + +void main() { + Conv2dBlockIndex block_idx; + block_idx.data = ivec3(gl_GlobalInvocationID); + + Conv2dBlockExtents block_extents = make_block_extents(output_sizes); + if (block_idx_out_of_bounds(block_idx, block_extents)) { + return; + } + + Int8InputTile int8_tile; + load_packed_int8_tile(int8_tile, block_idx, block_extents); + + FPInputTile fp_tile; + unpack_and_dequantize( + fp_tile, int8_tile, scale, zp); + + store_fp_tile(fp_tile, block_idx); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.yaml b/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.yaml new file mode 100644 index 00000000000..24b253da343 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.yaml @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +unpack_and_dequantize_q8ta_conv2d_output: + parameter_names_with_default_values: + DTYPE: float + OUTPUT_STORAGE: texture3d + INPUT_STORAGE: texture3d + generate_variant_forall: + combination: + parameter_names: [OUTPUT_STORAGE, INPUT_STORAGE] + combos: + - parameter_values: [texture3d, texture3d] + - parameter_values: [texture3d, buffer] + DTYPE: + - VALUE: float + shader_variants: + - NAME: unpack_and_dequantize_q8ta_conv2d_output diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp index 757afd06849..a6dd8f07f53 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp @@ -19,6 +19,18 @@ namespace vkcompute { +void resize_batch_norm_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + const ValueRef out = args.at(0).refs.at(0); + const ValueRef self = args.at(1).refs.at(0); + + // For batch norm, output dimensions are the same as input dimensions + std::vector new_out_sizes = graph->sizes_of(self); + graph->virtual_resize(out, new_out_sizes); +} + ValueRef check_and_prepack_arg( ComputeGraph& graph, ValueRef arg_ref, @@ -101,7 +113,7 @@ void add_native_batch_norm_node( // Resize Args {}, // Resizing Logic - nullptr)); + resize_batch_norm_node)); } void native_batch_norm(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.cpp b/backends/vulkan/runtime/graph/ops/impl/Common.cpp index 6c701224f7f..71690ffc604 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Common.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Common.cpp @@ -56,4 +56,27 @@ utils::uvec3 pick_hw_square_wg_size( return {16u, 4u, 1u}; } +utils::uvec3 pick_wc_square_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)graph; + (void)shader; + 
(void)args; + (void)resize_args; + // Some inactive invocations are okay; set 6 as the threshold to use the + // a square wg size. + if (global_workgroup_size[0u] >= 6 && global_workgroup_size[2u] >= 6) { + return {8u, 1u, 8u}; + } + // If channels dim is sufficiently small, then bias towards width dim to + // reduce the number of inactive invocations. + if (global_workgroup_size[2u] < 2u) { + return {64u, 1u, 1u}; + } + return {16u, 1u, 4u}; +} + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.h b/backends/vulkan/runtime/graph/ops/impl/Common.h index 1831ab2a845..b412f737c13 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Common.h +++ b/backends/vulkan/runtime/graph/ops/impl/Common.h @@ -54,4 +54,11 @@ utils::uvec3 pick_hw_square_wg_size( const std::vector& args, const std::vector& resize_args); +utils::uvec3 pick_wc_square_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args); + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index b83164f27d2..479bb44ae6f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -365,6 +365,10 @@ utils::uvec3 conv2d_global_wg_size( if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) { wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; + + if (shader.kernel_name.find("s1p0") != std::string::npos) { + wg_size[0] *= 4; + } } return wg_size; diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp index 9ac4c963bc3..329620e80e6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -109,11 +109,15 @@ void add_permute_node( { IntListPtr permute_dims_ptr 
= graph.get_int_list(permute_dims); const int32_t permute_ndim = - utils::safe_downcast(permute_dims_ptr->size()); + utils::safe_downcast(permute_dims_ptr->size()); for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0; nchw_i--, whcn_i++) { - const int32_t permute_dim_nchw = permute_dims_ptr->at(nchw_i); + int32_t permute_dim_nchw = + utils::safe_downcast(permute_dims_ptr->at(nchw_i)); + if (permute_dim_nchw < 0) { + permute_dim_nchw += permute_ndim; + } const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw; whcn_permute_dims[whcn_i] = permute_dim_whcn; diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp index 250fcdd5490..879f59667d6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp @@ -137,7 +137,7 @@ void max_pool2d(ComputeGraph& graph, const std::vector& args) { struct DivisorParams final { int32_t divisor_override; - bool count_include_pad; + int32_t count_include_pad; }; DivisorParams create_divisor_params( @@ -148,7 +148,7 @@ DivisorParams create_divisor_params( graph.val_is_int(divisor_override) ? static_cast(graph.get_int(divisor_override)) : 0, - graph.get_bool(count_include_pad)}; + int32_t(graph.get_bool(count_include_pad))}; } void add_avg_pool2d_node( diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp new file mode 100644 index 00000000000..4b359f12700 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp @@ -0,0 +1,210 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include + +namespace vkcompute { + +// +// Shader dispatch utilities +// + +utils::uvec3 pick_q8ta_q8ta_q8to_binary_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef packed_int8_output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, packed_int8_output); + const uint32_t H = graph->size_at(-2, packed_int8_output); + const uint32_t C = graph->size_at(-3, packed_int8_output); + + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {W4 * H * C4, 1, 1}; +} + +// +// Dispatch nodes +// + +void add_q8ta_q8ta_q8to_binary_node( + ComputeGraph& graph, + const ValueRef packed_int8_input_a, + const ValueRef packed_int8_input_b, + const ValueRef input_a_scale, + const ValueRef input_a_zp, + const ValueRef input_b_scale, + const ValueRef input_b_zp, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef alpha, + const ValueRef packed_int8_output, + const std::string& op_name) { + float input_a_scale_val = graph.extract_scalar(input_a_scale); + int32_t input_a_zp_val = graph.extract_scalar(input_a_zp); + float input_b_scale_val = graph.extract_scalar(input_b_scale); + int32_t input_b_zp_val = graph.extract_scalar(input_b_zp); + + float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); + int32_t output_zp_val = graph.extract_scalar(output_zp); + + float alpha_val = 1.0f; + // String is checked since some ops pass in an unused string argument in + // place of alpha + if (is_valid(alpha) && !graph.val_is_string(alpha)) { + alpha_val = graph.extract_scalar(alpha); + } + + std::string kernel_name = op_name + "_q8ta_q8ta_q8to"; + add_storage_type_suffix( + kernel_name, graph.storage_type_of(packed_int8_output)); + + vkapi::ParamsBindList param_buffers = {graph.sizes_ubo(packed_int8_output)}; + + std::vector push_constants = { + 
PushConstantDataInfo(&input_a_scale_val, sizeof(input_a_scale_val)), + PushConstantDataInfo(&input_a_zp_val, sizeof(input_a_zp_val)), + PushConstantDataInfo(&input_b_scale_val, sizeof(input_b_scale_val)), + PushConstantDataInfo(&input_b_zp_val, sizeof(input_b_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + PushConstantDataInfo(&alpha_val, sizeof(alpha_val)), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_q8ta_q8ta_q8to_binary_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{packed_int8_output, vkapi::kWrite}, + {{packed_int8_input_a, packed_int8_input_b}, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {}, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + +// +// High level operator impl +// + +void add_q8ta_q8ta_q8to( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef packed_int8_input_a = args.at(idx++); + const ValueRef packed_int8_input_b = args.at(idx++); + const ValueRef input_a_scale = args.at(idx++); + const ValueRef input_a_zp = args.at(idx++); + const ValueRef input_b_scale = args.at(idx++); + const ValueRef input_b_zp = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef alpha = args.at(idx++); + const ValueRef packed_int8_output = args.at(idx++); + + add_q8ta_q8ta_q8to_binary_node( + graph, + packed_int8_input_a, + packed_int8_input_b, + input_a_scale, + input_a_zp, + input_b_scale, + input_b_zp, + output_scale, + output_zp, + alpha, + packed_int8_output, + "add"); +} + +// +// Test operators +// + +void add_q8ta_q8ta_q8to_test( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef fp_input_a = args.at(idx++); + const 
ValueRef fp_input_b = args.at(idx++); + const ValueRef input_a_scale = args.at(idx++); + const ValueRef input_a_zp = args.at(idx++); + const ValueRef input_b_scale = args.at(idx++); + const ValueRef input_b_zp = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef alpha = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + TmpTensor packed_int8_input_a( + &graph, + graph.sizes_of(fp_input_a), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + TmpTensor packed_int8_input_b( + &graph, + graph.sizes_of(fp_input_b), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + TmpTensor packed_int8_output( + &graph, + graph.sizes_of(fp_output), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + add_quantize_and_pack_q8ta_conv2d_input_node( + graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a); + + add_quantize_and_pack_q8ta_conv2d_input_node( + graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b); + + std::vector add_args = { + packed_int8_input_a, + packed_int8_input_b, + input_a_scale, + input_a_zp, + input_b_scale, + input_b_zp, + output_scale, + output_zp, + alpha, + packed_int8_output}; + + add_q8ta_q8ta_q8to(graph, add_args); + + add_unpack_and_dequantize_q8ta_conv2d_output_node( + graph, packed_int8_output, output_scale, output_zp, fp_output); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(et_vk.add_q8ta_q8ta_q8to.default, add_q8ta_q8ta_q8to); + VK_REGISTER_OP(et_vk.add_q8ta_q8ta_q8to.test, add_q8ta_q8ta_q8to_test); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp index 51f8138485e..775e4534cfb 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp @@ -9,6 +9,7 @@ #include #include +#include #include 
#include #include @@ -19,6 +20,86 @@ namespace vkcompute { // Utility functions // +bool is_pointwise(ComputeGraph* graph, const ValueRef& kernel_size) { + const auto kernel_size_list = graph->get_int_list(kernel_size); + return kernel_size_list->at(0) == 1 && kernel_size_list->at(1) == 1; +} + +bool is_s1p1d1( + ComputeGraph* graph, + const ValueRef& stride, + const ValueRef& padding, + const ValueRef& dilation) { + const auto stride_list = graph->get_int_list(stride); + const auto padding_list = graph->get_int_list(padding); + const auto dilation_list = graph->get_int_list(dilation); + if (stride_list->at(0) != 1 && stride_list->at(1) != 1) { + return false; + } + if (padding_list->at(0) != 1 && padding_list->at(1) != 1) { + return false; + } + if (dilation_list->at(0) != 1 && dilation_list->at(1) != 1) { + return false; + } + return true; +} + +bool is_s1p0d1_pointwise( + ComputeGraph* graph, + const ValueRef& kernel_size, + const ValueRef& stride, + const ValueRef& padding, + const ValueRef& dilation) { + if (is_pointwise(graph, kernel_size)) { + const auto stride_list = graph->get_int_list(stride); + const auto padding_list = graph->get_int_list(padding); + const auto dilation_list = graph->get_int_list(dilation); + if (stride_list->at(0) != 1 && stride_list->at(1) != 1) { + return false; + } + if (padding_list->at(0) != 0 && padding_list->at(1) != 0) { + return false; + } + if (dilation_list->at(0) != 1 && dilation_list->at(1) != 1) { + return false; + } + return true; + } + return false; +} + +bool should_use_im2col( + ComputeGraph* graph, + const ValueRef kernel_size, + const ValueRef groups) { + const auto kernel_size_list = graph->get_int_list(kernel_size); + + // Always use im2col for pointwise convolutions + if (kernel_size_list->at(0) * kernel_size_list->at(1) == 1) { + return true; + } + + // For large kernel sizes, the im2col matrix will be too big. 
Not only will + // this result in a larger footprint for the im2col matrix, but the cost of + // performing the im2col procedure will also become prohibitive. In these + // cases it is faster to just compute convolution directly without going + // through im2col. Empirically, im2col works well for 3x3 convolution and + // not for 5x5 convolution, so set the limit at 10. + if (kernel_size_list->at(0) * kernel_size_list->at(1) > 10) { + return false; + } + + // Only use im2col for non-grouped convolutions; manual experimentation shows + // that im2col becomes very slow when dealing with grouped convolutions. The + // reason for this is likely that memory access in the im2col shader becomes + // too non-linear due to needed to keep convolution groups contiguous in + // in memory. This means that the channels of the input tensor (which are + // originally contiguous in memory) will be split up during the im2col + // procedure. + return graph->get_int(groups) == 1; +} + struct Conv2DParams { utils::ivec2 kernel_size; utils::ivec2 stride; @@ -135,6 +216,43 @@ std::vector calculate_input_im2col_sizes( return {M, K}; } +std::vector calculate_packed_int8_input_im2col_sizes( + ComputeGraph* graph, + const ValueRef& input, + const ValueRef& output, + const ValueRef& kernel_size, + const ValueRef& groups) { + std::vector in_sizes = graph->sizes_of(input); + const int64_t in_channels = utils::val_at(-3, in_sizes); + + std::vector out_sizes = graph->sizes_of(output); + const int64_t out_height = utils::val_at(-2, out_sizes); + const int64_t out_width = utils::val_at(-1, out_sizes); + + // Represents the number of channel groups + const int64_t groups_val = graph->extract_scalar(groups); + // No need to div_up because in_channels % groups_val = 0 + const int64_t in_channels_per_group = in_channels / groups_val; + + const auto kernel_size_list = graph->get_int_list(kernel_size); + + // Align to the next multiple of 4 to ensure that data loads align nicely with + // texel 
boundaries. We want to ensure that the first data element of each + // group is at the start of its texel. + const int64_t flattened_kernel_len = utils::align_up_4( + in_channels_per_group * kernel_size_list->at(0) * + kernel_size_list->at(1)); + + // K -> flattened convolution window (repeated for each group) + const int64_t K = flattened_kernel_len * groups_val; + // M -> number of elements in 2D output plane. This is aligned to the next + // multiple of 4 since the im2col shader operates on 4x4 blocks. + const int64_t W = utils::align_up_4(out_width); + const int64_t H = out_height; + + return {K, H, W}; +} + std::vector calculate_output_im2col_sizes( ComputeGraph* graph, const ValueRef& output) { @@ -156,6 +274,40 @@ std::vector calculate_output_im2col_sizes( // Shader dispatch utilities // +utils::uvec3 pick_quantize_and_pack_conv2d_input_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef fp_input = args.at(1).refs.at(0); + + const uint32_t W = graph->size_at(-1, fp_input); + const uint32_t H = graph->size_at(-2, fp_input); + const uint32_t C = graph->size_at(-3, fp_input); + + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {W4, H, C4}; +} + +utils::uvec3 pick_unpack_and_dequantize_conv2d_output_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef fp_output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, fp_output); + const uint32_t H = graph->size_at(-2, fp_output); + const uint32_t C = graph->size_at(-3, fp_output); + + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {W4, H, C4}; +} + utils::uvec3 im2col_global_wg_size( ComputeGraph* graph, const vkapi::ShaderInfo& shader, @@ -178,6 +330,33 @@ utils::uvec3 im2col_global_wg_size( return {K4, M4, 1}; } 
+utils::uvec3 im2col_packed_int8_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef input_im2col = args.at(0).refs.at(0); + + std::vector im2col_sizes = graph->sizes_of(input_im2col); + const uint32_t K = utils::safe_downcast(im2col_sizes[0]); + const uint32_t H = utils::safe_downcast(im2col_sizes[1]); + const uint32_t W = utils::safe_downcast(im2col_sizes[2]); + + const uint32_t K4 = utils::div_up(K, 4u); + const uint32_t W4 = utils::div_up(W, 4u); + + return {K4 * W4 * H, 1, 1}; +} + +utils::uvec3 im2col_packed_int8_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + return {64, 1, 1}; +} + utils::uvec3 col2im_global_wg_size( ComputeGraph* graph, const vkapi::ShaderInfo& shader, @@ -197,6 +376,229 @@ utils::uvec3 col2im_global_wg_size( return {N4, M4, 1}; } +utils::uvec3 pick_static_quantized_conv2d_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef packed_int8_output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, packed_int8_output); + const uint32_t H = graph->size_at(-2, packed_int8_output); + const uint32_t C = graph->size_at(-3, packed_int8_output); + + uint32_t C_per_tile = 4; + uint32_t W_per_tile = 4; + + if (shader.kernel_name.find("linear") != std::string::npos) { + C_per_tile = 8; + } + + const uint32_t num_W_tiles = utils::div_up(W, W_per_tile); + const uint32_t num_C_tiles = utils::div_up(C, C_per_tile); + + return {num_C_tiles, num_W_tiles, H}; +} + +utils::uvec3 pick_static_quantized_conv2d_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + return 
pick_hw_square_wg_size( + graph, shader, global_workgroup_size, args, resize_args); +} + +utils::uvec3 int8_conv2d_dw_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef packed_int8_output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, packed_int8_output); + const uint32_t H = graph->size_at(-2, packed_int8_output); + const uint32_t C = graph->size_at(-3, packed_int8_output); + + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {C4 * W4 * H, 1, 1}; +} + +// +// Prepack nodes +// + +ValueRef prepack_quantized_conv2d_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef input, + const ValueRef output, + const ValueRef groups, + const ValueRef kernel_size) { + VK_CHECK_COND(weight_quant_config.nbits == 8); + VK_CHECK_COND(weight_quant_config.is_symmetric); + + const int32_t groups_val = graph.get_int(groups); + + const int64_t OC = graph.size_at(-3, output); + const int64_t IC = graph.size_at(-3, input) / groups_val; + + int64_t K_h; + int64_t K_w; + + { + const auto kernel_size_list = graph.get_int_list(kernel_size); + K_h = kernel_size_list->at(0); + K_w = kernel_size_list->at(1); + } + + const int64_t num_blocks_OC = utils::div_up_4(OC); + const int64_t num_blocks_IC = utils::div_up_4(IC); + + const int64_t num_blocks_y = num_blocks_IC * K_h; + const int64_t num_blocks_x = K_w * num_blocks_OC; + + // The packed tensor arranges blocks as [OC_blocks * K_total, IC_blocks] + const int64_t output_height = num_blocks_y; + const int64_t output_width = num_blocks_x * 4; + + // Store the original sizes of the weight data to pass to the shader + utils::ivec4 orig_sizes = { + utils::safe_downcast(OC), + utils::safe_downcast(K_h), + utils::safe_downcast(K_w), + utils::safe_downcast(IC)}; + + std::vector packed_weight_sizes{output_height, 
output_width}; + + utils::StorageType storage_type = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (output_width > max_extent * 4 || output_height > max_extent) { + storage_type = utils::kBuffer; + } + + ValueRef packed_weight = graph.add_tensor( + packed_weight_sizes, + vkcompute::vkapi::kInt, + storage_type, + utils::kWidthPacked); + + utils::uvec3 global_wg_size = { + utils::safe_downcast(num_blocks_x), + utils::safe_downcast(num_blocks_y), + 1u}; + + std::string kernel_name = "pack_q8_conv2d_weights"; + add_storage_type_suffix(kernel_name, storage_type); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + weight_data, + packed_weight, + // UBOs + {}, + // Specialization Constants + {}, + // Push Constants + {graph.sizes_pc_of(packed_weight), + PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec4))})); + + return packed_weight; +} + +ValueRef prepack_quantized_conv2d_dw_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef kernel_size) { + VK_CHECK_COND(weight_quant_config.nbits == 8); + VK_CHECK_COND(weight_quant_config.is_symmetric); + + std::vector weight_orig_sizes = graph.sizes_of(weight_data); + const int64_t ndim = graph.dim_of(weight_data); + + // For depthwise convolution, expect weight layout [K_h, aligned_K_w, OC] + VK_CHECK_COND(ndim == 3); + int64_t K_h = weight_orig_sizes.at(0); + int64_t K_w = weight_orig_sizes.at(1); + int64_t aligned_K_w = utils::align_up_4(K_w); + int64_t OC = weight_orig_sizes.at(2); + + // The packing format packs the weight tensor into blocks of 4 output channels + // (OC) and 4 kernel elements (K_h * aligned_K_w) + int64_t OC_per_block = 4; + int64_t K_per_block = 4; + + // To figure out the size of the output tensor, determine the number of blocks + // along 
each dimension. + const int64_t total_K_elements = K_h * aligned_K_w; + const int64_t num_blocks_K = utils::div_up(total_K_elements, K_per_block); + const int64_t num_blocks_OC = utils::div_up(OC, OC_per_block); + + // The blocks are arranged in a transposed manner, such that the transposed + // weight block is indexed like packed_weights[k4][oc4] - this is to allow for + // optimal memory coalescing when computing the depthwise convolution. + int64_t output_height = num_blocks_K; + // The base dtype of the packed tensor is int32 (each int32 contains 4x 8bit + // values) and each block is represented as a ivec4. Therefore the width dim + // of the packed tensor is multiplied by 4. + int64_t output_width = num_blocks_OC * 4; + + // Store the original sizes of the weight data to pass to the shader + utils::ivec3 orig_sizes = { + utils::safe_downcast(K_h), + utils::safe_downcast(K_w), + utils::safe_downcast(OC)}; + + std::vector packed_weight_sizes{output_height, output_width}; + + utils::StorageType storage_type = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (output_width > max_extent * 4 || output_height > max_extent) { + storage_type = utils::kBuffer; + } + + ValueRef packed_weight = graph.add_tensor( + packed_weight_sizes, + vkcompute::vkapi::kInt, + storage_type, + utils::kWidthPacked); + + utils::uvec3 global_wg_size = { + utils::safe_downcast(num_blocks_OC), + utils::safe_downcast(num_blocks_K), + 1u}; + + std::string kernel_name = "pack_q8_conv2d_dw_weights"; + add_storage_type_suffix(kernel_name, storage_type); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + weight_data, + packed_weight, + // UBOs + {}, + // Specialization Constants + {}, + // Push Constants + {graph.sizes_pc_of(packed_weight), + PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec3))})); + + 
return packed_weight; +} + // // Dispatch nodes // @@ -251,6 +653,145 @@ void add_input_im2col_node( nullptr)); } +void add_input_im2col_packed_int8_node( + ComputeGraph& graph, + const ValueRef input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef output, + const ValueRef input_im2col) { + Conv2DParams conv_params = create_conv2d_params( + graph, input, output, kernel_size, stride, padding, dilation, groups); + + float inv_scale = 1.0f / graph.extract_scalar(input_scale); + int32_t zp = graph.extract_scalar(input_zp); + + std::string kernel_name = "im2col_packed_int8"; + add_storage_type_suffix(kernel_name, graph.storage_type_of(input_im2col)); + + vkapi::ParamsBindList param_buffers = { + graph.sizes_ubo(input_im2col), + graph.sizes_ubo(output), + graph.sizes_ubo(input), + graph.create_params_buffer(conv_params)}; + + std::vector push_constants = { + PushConstantDataInfo(&inv_scale, sizeof(inv_scale)), + PushConstantDataInfo(&zp, sizeof(zp)), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + im2col_packed_int8_global_wg_size, + im2col_packed_int8_local_wg_size, + // Inputs and Outputs + {{input_im2col, vkapi::kWrite}, {input, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {}, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + +void add_quantize_and_pack_q8ta_conv2d_input_node( + ComputeGraph& graph, + const ValueRef fp_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_int8_input) { + float inv_scale = 1.0f / graph.extract_scalar(input_scale); + int32_t zp = graph.extract_scalar(input_zp); + + // Get shader for quantized conv2d linear tiled + std::string kernel_name = "quantize_and_pack_q8ta_conv2d_input"; + 
add_storage_type_suffix( + kernel_name, graph.storage_type_of(packed_int8_input)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(fp_input)); + add_dtype_suffix(kernel_name, graph.dtype_of(fp_input)); + + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + vkapi::ParamsBindList param_buffers = {graph.sizes_ubo(fp_input)}; + + std::vector push_constants = { + PushConstantDataInfo(&inv_scale, sizeof(inv_scale)), + PushConstantDataInfo(&zp, sizeof(zp)), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_quantize_and_pack_conv2d_input_global_wg_size, + pick_wc_square_wg_size, + // Inputs and Outputs + {{packed_int8_input, vkapi::kWrite}, {fp_input, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {}, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + +void add_unpack_and_dequantize_q8ta_conv2d_output_node( + ComputeGraph& graph, + const ValueRef packed_int8_output, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef fp_output) { + float scale = graph.extract_scalar(output_scale); + int32_t zp = graph.extract_scalar(output_zp); + + // Get shader for quantized conv2d linear tiled + std::string kernel_name = "unpack_and_dequantize_q8ta_conv2d_output"; + add_storage_type_suffix(kernel_name, graph.storage_type_of(fp_output)); + add_storage_type_suffix( + kernel_name, graph.storage_type_of(packed_int8_output)); + add_dtype_suffix(kernel_name, graph.dtype_of(fp_output)); + + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + vkapi::ParamsBindList param_buffers = {graph.sizes_ubo(fp_output)}; + + std::vector push_constants = { + PushConstantDataInfo(&scale, sizeof(scale)), + PushConstantDataInfo(&zp, sizeof(zp)), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + 
pick_unpack_and_dequantize_conv2d_output_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{fp_output, vkapi::kWrite}, {packed_int8_output, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {}, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + void add_quantize_and_pack_im2col_node( ComputeGraph& graph, const ValueRef input_image, @@ -307,19 +848,178 @@ void add_quantize_and_pack_im2col_node( // Push Constants push_constants, // Specialization Constants - {}, + {}, + // Resize args + {output_image, kernel_size, groups}, + // Resizing Logic + nullptr)); +} + +void add_conv2d_q8csw_linear_node( + ComputeGraph& graph, + const ValueRef input_im2col, + const ValueRef input_image, + const ValueRef packed_weight, + const ValueRef packed_weight_scales, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef output_image) { + Conv2DParams conv_params = create_conv2d_params( + graph, + input_image, + output_image, + kernel_size, + stride, + padding, + dilation, + groups); + + // One limitation of the current implementation is that for grouped convs, + // the number of output_image channels per group must be a multiple of 4. One + // loaded 4x4 weight tile must all belong to the same group. 
+ if (conv_params.groups > 1) { + VK_CHECK_COND(conv_params.out_channels_per_group % 4 == 0); + } + + std::string kernel_name = "conv2d_q8csw_linear_tiled"; + add_storage_type_suffix(kernel_name, graph.storage_type_of(output_image)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(input_im2col)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); + add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + vkapi::ParamsBindList param_buffers = { + graph.sizes_ubo(output_image), + graph.sizes_ubo(input_image), + graph.create_params_buffer(conv_params)}; + + uint32_t apply_bias = 1; + if (graph.val_is_none(bias_data)) { + apply_bias = 0; + } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + col2im_global_wg_size, + quantized_linear_local_wg_size, + // Inputs and Outputs + {{output_image, vkapi::kWrite}, + {{input_im2col, packed_weight, packed_weight_scales, packed_bias}, + vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + {}, + // Specialization Constants + {apply_bias}, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + +void add_conv2d_q8ta_q8csw_linear_node( + ComputeGraph& graph, + const ValueRef input_int_im2col, + const ValueRef input_image, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef weight_data, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef output_image) { + Conv2DParams conv_params = create_conv2d_params( + graph, + input_image, + output_image, + kernel_size, + stride, + padding, + dilation, + groups); + + // One limitation of the current implementation is 
that for grouped convs, + // the number of output channels per group must be a multiple of 4. One loaded + // 4x4 weight tile must all belong to the same group. + if (conv_params.groups > 1) { + VK_CHECK_COND(conv_params.out_channels_per_group % 4 == 0); + } + + float scale = graph.extract_scalar(input_scale); + int32_t zp = graph.extract_scalar(input_zp); + + std::string kernel_name = "conv2d_q8ta_q8csw_linear_tiled"; + add_storage_type_suffix(kernel_name, graph.storage_type_of(output_image)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(input_int_im2col)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); + add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + vkapi::ParamsBindList param_buffers = { + graph.sizes_ubo(output_image), + graph.sizes_ubo(input_image), + graph.create_params_buffer(conv_params)}; + + std::vector push_constants = { + PushConstantDataInfo(&scale, sizeof(scale)), + PushConstantDataInfo(&zp, sizeof(zp)), + }; + + uint32_t apply_bias = 1; + if (graph.val_is_none(bias_data)) { + apply_bias = 0; + } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + col2im_global_wg_size, + quantized_linear_local_wg_size, + // Inputs and Outputs + {{output_image, vkapi::kWrite}, + {{input_int_im2col, + packed_weight, + packed_weight_sums, + packed_weight_scales, + packed_bias}, + vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {apply_bias}, // Resize args - {output_image, kernel_size, groups}, + {weight_data}, // Resizing Logic nullptr)); } -void add_conv2d_q8csw_linear_node( +void add_conv2d_q8ta_q8csw_q8to_node( ComputeGraph& graph, - const ValueRef input_im2col, - const ValueRef input_image, + const ValueRef packed_int8_input, + const ValueRef packed_int8_input_im2col, + const ValueRef input_scale, + 
const ValueRef input_zp, const ValueRef packed_weight, + const ValueRef packed_weight_sums, const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, const ValueRef bias_data, const ValueRef packed_bias, const ValueRef kernel_size, @@ -327,36 +1027,45 @@ void add_conv2d_q8csw_linear_node( const ValueRef padding, const ValueRef dilation, const ValueRef groups, - const ValueRef output_image) { + const ValueRef packed_int8_output) { Conv2DParams conv_params = create_conv2d_params( graph, - input_image, - output_image, + packed_int8_input, + packed_int8_output, kernel_size, stride, padding, dilation, groups); - // One limitation of the current implementation is that for grouped convs, - // the number of output_image channels per group must be a multiple of 4. One - // loaded 4x4 weight tile must all belong to the same group. - if (conv_params.groups > 1) { - VK_CHECK_COND(conv_params.out_channels_per_group % 4 == 0); - } + const bool use_im2col = should_use_im2col(&graph, kernel_size, groups); - std::string kernel_name = "conv2d_q8csw_linear_tiled"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(output_image)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_im2col)); + float input_scale_val = graph.extract_scalar(input_scale); + int32_t input_zp_val = graph.extract_scalar(input_zp); + + float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); + int32_t output_zp_val = graph.extract_scalar(output_zp); + + std::string kernel_name = use_im2col ? 
"conv2d_q8ta_q8csw_q8to_linear_tiled" + : "conv2d_q8ta_q8csw_q8to"; + add_storage_type_suffix( + kernel_name, graph.storage_type_of(packed_int8_output)); add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); + add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(output_image), - graph.sizes_ubo(input_image), + graph.sizes_ubo(packed_int8_output), + graph.sizes_ubo(packed_int8_input_im2col), graph.create_params_buffer(conv_params)}; + std::vector push_constants = { + PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), + PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + }; + uint32_t apply_bias = 1; if (graph.val_is_none(bias_data)) { apply_bias = 0; @@ -365,16 +1074,20 @@ void add_conv2d_q8csw_linear_node( graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, VK_KERNEL_FROM_STR(kernel_name), - col2im_global_wg_size, - quantized_linear_local_wg_size, + pick_static_quantized_conv2d_global_wg_size, + pick_static_quantized_conv2d_local_wg_size, // Inputs and Outputs - {{output_image, vkapi::kWrite}, - {{input_im2col, packed_weight, packed_weight_scales, packed_bias}, + {{packed_int8_output, vkapi::kWrite}, + {{packed_int8_input_im2col, + packed_weight, + packed_weight_sums, + packed_weight_scales, + packed_bias}, vkapi::kRead}}, // Shader params buffers param_buffers, // Push Constants - {}, + push_constants, // Specialization Constants {apply_bias}, // Resize args @@ -383,16 +1096,16 @@ void add_conv2d_q8csw_linear_node( nullptr)); } -void add_conv2d_q8ta_q8csw_linear_node( +void add_conv2d_dw_q8ta_q8csw_q8to_node( ComputeGraph& graph, - const ValueRef input_int_im2col, - const 
ValueRef input_image, + const ValueRef packed_int8_input, const ValueRef input_scale, const ValueRef input_zp, - const ValueRef weight_data, const ValueRef packed_weight, const ValueRef packed_weight_sums, const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, const ValueRef bias_data, const ValueRef packed_bias, const ValueRef kernel_size, @@ -400,42 +1113,45 @@ void add_conv2d_q8ta_q8csw_linear_node( const ValueRef padding, const ValueRef dilation, const ValueRef groups, - const ValueRef output_image) { + const ValueRef packed_int8_output) { Conv2DParams conv_params = create_conv2d_params( graph, - input_image, - output_image, + packed_int8_input, + packed_int8_output, kernel_size, stride, padding, dilation, groups); - // One limitation of the current implementation is that for grouped convs, - // the number of output channels per group must be a multiple of 4. One loaded - // 4x4 weight tile must all belong to the same group. - if (conv_params.groups > 1) { - VK_CHECK_COND(conv_params.out_channels_per_group % 4 == 0); - } + // Verify this is actually a depthwise convolution + const int64_t groups_val = graph.extract_scalar(groups); + const int64_t in_channels = graph.size_at(-3, packed_int8_input); + VK_CHECK_COND(groups_val == in_channels); - float scale = graph.extract_scalar(input_scale); - int32_t zp = graph.extract_scalar(input_zp); + float input_scale_val = graph.extract_scalar(input_scale); + int32_t input_zp_val = graph.extract_scalar(input_zp); - std::string kernel_name = "conv2d_q8ta_q8csw_linear_tiled"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(output_image)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_int_im2col)); + float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); + int32_t output_zp_val = graph.extract_scalar(output_zp); + + std::string kernel_name = "conv2d_dw_q8ta_q8csw_q8to"; + add_storage_type_suffix( + kernel_name, 
graph.storage_type_of(packed_int8_output)); add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); + add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(output_image), - graph.sizes_ubo(input_image), + graph.sizes_ubo(packed_int8_output), + graph.sizes_ubo(packed_int8_input), graph.create_params_buffer(conv_params)}; std::vector push_constants = { - PushConstantDataInfo(&scale, sizeof(scale)), - PushConstantDataInfo(&zp, sizeof(zp)), + PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), + PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), }; uint32_t apply_bias = 1; @@ -446,11 +1162,11 @@ void add_conv2d_q8ta_q8csw_linear_node( graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, VK_KERNEL_FROM_STR(kernel_name), - col2im_global_wg_size, - quantized_linear_local_wg_size, + int8_conv2d_dw_global_wg_size, + default_pick_local_wg_size, // Inputs and Outputs - {{output_image, vkapi::kWrite}, - {{input_int_im2col, + {{packed_int8_output, vkapi::kWrite}, + {{packed_int8_input, packed_weight, packed_weight_sums, packed_weight_scales, @@ -463,7 +1179,7 @@ void add_conv2d_q8ta_q8csw_linear_node( // Specialization Constants {apply_bias}, // Resize args - {weight_data}, + {}, // Resizing Logic nullptr)); } @@ -564,16 +1280,12 @@ void quantized_conv2d_impl( ValueRef packed_weight_sums = prepack_standard( graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); - // Allocate quantized + packed im2col matrix for input - const int64_t num_blocks_M = utils::div_up_4(input_im2col_sizes.at(0)); - const int64_t num_blocks_K = utils::div_up_4(input_im2col_sizes.at(1)); - TmpTensor 
input_int_im2col( &graph, - {num_blocks_M, num_blocks_K * 4}, - vkapi::kInt, + input_im2col_sizes, + vkapi::kInt8x4, utils::kBuffer, - utils::kWidthPacked); + utils::kPackedInt8_4H4W); add_quantize_and_pack_im2col_node( graph, @@ -687,9 +1399,343 @@ void conv2d_q8csw(ComputeGraph& graph, const std::vector& args) { output_image); } +// Implementation for statically quantized conv2d, which expects input, weight, +// and output tensors to all have packed int8 dtype/memory layout. +void static_quantized_conv2d_impl( + ComputeGraph& graph, + const QuantizationConfig& input_quant_config, + const QuantizationConfig& weight_quant_config, + const QuantizationConfig& output_quant_config, + const ValueRef packed_int8_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef weight_data, + const ValueRef weight_sums_data, + const ValueRef weight_scales_data, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output) { + // Currently, only certain quantization configs are supported + VK_CHECK_COND(input_quant_config.granularity == kPerTensor); + VK_CHECK_COND(input_quant_config.nbits == 8); + + VK_CHECK_COND(weight_quant_config.granularity == kPerChannel); + VK_CHECK_COND(weight_quant_config.nbits == 8); + VK_CHECK_COND(weight_quant_config.is_symmetric); + + VK_CHECK_COND(output_quant_config.granularity == kPerTensor); + VK_CHECK_COND(output_quant_config.nbits == 8); + + // Check for depthwise conv + const int64_t groups_val = graph.extract_scalar(groups); + const int64_t in_channels = graph.size_at(-3, packed_int8_input); + + // Depthwise convs have a specialized implementation, since the regular conv + // implementations requires that the number of input and output channels per + // groups is a multiple of 4. 
This is so that all values that are part of the + // same 4Wx4C block have the same group index. + const bool is_depthwise = (groups_val == in_channels); + + const bool use_im2col = should_use_im2col(&graph, kernel_size, groups); + // For pointwise convolution with stride = 1, padding = 0, dilation = 1, the + // input tensor is already equivalent to its im2col representation. In this + // case we can skip the im2col procedure and pass in the input image to the + // convolution_as_matmul implementation directly. + const bool is_optimizable_pw = + is_s1p0d1_pointwise(&graph, kernel_size, stride, padding, dilation); + + ValueRef packed_weight; + if (is_depthwise) { + packed_weight = prepack_quantized_conv2d_dw_weight( + graph, weight_quant_config, weight_data, kernel_size); + } else if (use_im2col) { + packed_weight = prepack_quantized_linear_weight( + graph, weight_quant_config, weight_data); + } else { + packed_weight = prepack_quantized_conv2d_weight( + graph, + weight_quant_config, + weight_data, + packed_int8_input, + packed_int8_output, + groups, + kernel_size); + } + + ValueRef packed_weight_sums = prepack_standard( + graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); + + ValueRef packed_weight_scales = prepack_standard( + graph, weight_scales_data, utils::kBuffer, utils::kWidthPacked); + + // See quantized_conv2d_impl for why this is needed + TmpTensor dummy_bias( + &graph, + {}, + graph.dtype_of(weight_scales_data), + utils::kBuffer, + utils::kWidthPacked); + + ValueRef packed_bias = dummy_bias.vref; + if (graph.val_is_not_none(bias_data)) { + packed_bias = + prepack_standard(graph, bias_data, utils::kBuffer, utils::kWidthPacked); + } + + // Depthwise conv path + if (is_depthwise) { + add_conv2d_dw_q8ta_q8csw_q8to_node( + graph, + packed_int8_input, + input_scale, + input_zp, + packed_weight, + packed_weight_sums, + packed_weight_scales, + output_scale, + output_zp, + bias_data, + packed_bias, + kernel_size, + stride, + padding, + dilation, + 
groups, + packed_int8_output); + return; + } + + std::vector input_im2col_sizes = + calculate_packed_int8_input_im2col_sizes( + &graph, packed_int8_input, packed_int8_output, kernel_size, groups); + + ValueRef packed_int8_input_im2col = packed_int8_input; + if (use_im2col && !is_optimizable_pw) { + TmpTensor packed_int8_input_im2col_tensor( + &graph, + input_im2col_sizes, + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + packed_int8_input_im2col = packed_int8_input_im2col_tensor.vref; + + add_input_im2col_packed_int8_node( + graph, + packed_int8_input, + input_scale, + input_zp, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output, + packed_int8_input_im2col); + } + + add_conv2d_q8ta_q8csw_q8to_node( + graph, + packed_int8_input, + packed_int8_input_im2col, + input_scale, + input_zp, + packed_weight, + packed_weight_sums, + packed_weight_scales, + output_scale, + output_zp, + bias_data, + packed_bias, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output); +} + +void conv2d_q8ta_q8csw_q8to( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef packed_int8_input = args.at(idx++); + const ValueRef input_scale = args.at(idx++); + const ValueRef input_zp = args.at(idx++); + const ValueRef weight_data = args.at(idx++); + const ValueRef weight_sums_data = args.at(idx++); + const ValueRef weight_scales_data = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef bias_data = args.at(idx++); + const ValueRef kernel_size = args.at(idx++); + const ValueRef stride = args.at(idx++); + const ValueRef padding = args.at(idx++); + const ValueRef dilation = args.at(idx++); + const ValueRef groups = args.at(idx++); + const ValueRef packed_int8_output = args.at(idx++); + + QuantizationConfig input_quant_config(8, kPerTensor, {}); + QuantizationConfig weight_quant_config(8, kPerChannel, {}); + QuantizationConfig 
output_quant_config(8, kPerTensor, {}); + + static_quantized_conv2d_impl( + graph, + input_quant_config, + weight_quant_config, + output_quant_config, + packed_int8_input, + input_scale, + input_zp, + weight_data, + weight_sums_data, + weight_scales_data, + output_scale, + output_zp, + bias_data, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output); +} + +// +// Quantize and dequantize operators +// + +void quantize_q8ta_for_conv2d( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef fp_input = args.at(idx++); + const ValueRef scale = args.at(idx++); + const ValueRef zero_point = args.at(idx++); + const ValueRef packed_int8_input = args.at(idx++); + + add_quantize_and_pack_q8ta_conv2d_input_node( + graph, fp_input, scale, zero_point, packed_int8_input); +} + +void dequantize_q8to_from_conv2d( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef packed_int8_output = args.at(idx++); + const ValueRef scale = args.at(idx++); + const ValueRef zero_point = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + add_unpack_and_dequantize_q8ta_conv2d_output_node( + graph, packed_int8_output, scale, zero_point, fp_output); +} + +void qdq8ta_conv2d_input( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef fp_input = args.at(idx++); + const ValueRef scale = args.at(idx++); + const ValueRef zero_point = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + TmpTensor packed_int8_input( + &graph, + graph.sizes_of(fp_input), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + add_quantize_and_pack_q8ta_conv2d_input_node( + graph, fp_input, scale, zero_point, packed_int8_input); + + add_unpack_and_dequantize_q8ta_conv2d_output_node( + graph, packed_int8_input, scale, zero_point, fp_output); +} + +// +// Test operators +// + +void conv2d_q8ta_q8csw_q8to_test( + ComputeGraph& graph, + const std::vector& args) { + 
int32_t idx = 0; + const ValueRef fp_input = args.at(idx++); + const ValueRef input_scale = args.at(idx++); + const ValueRef input_zp = args.at(idx++); + const ValueRef weight_data = args.at(idx++); + const ValueRef weight_sums_data = args.at(idx++); + const ValueRef weight_scales_data = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef bias_data = args.at(idx++); + const ValueRef kernel_size = args.at(idx++); + const ValueRef stride = args.at(idx++); + const ValueRef padding = args.at(idx++); + const ValueRef dilation = args.at(idx++); + const ValueRef groups = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + TmpTensor packed_int8_input( + &graph, + graph.sizes_of(fp_input), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + TmpTensor packed_int8_output( + &graph, + graph.sizes_of(fp_output), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + add_quantize_and_pack_q8ta_conv2d_input_node( + graph, fp_input, input_scale, input_zp, packed_int8_input); + + std::vector conv2d_args = { + packed_int8_input, + input_scale, + input_zp, + weight_data, + weight_sums_data, + weight_scales_data, + output_scale, + output_zp, + bias_data, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output}; + + conv2d_q8ta_q8csw_q8to(graph, conv2d_args); + + add_unpack_and_dequantize_q8ta_conv2d_output_node( + graph, packed_int8_output, output_scale, output_zp, fp_output); +} + REGISTER_OPERATORS { VK_REGISTER_OP(et_vk.conv2d_q8ta_q8csw.default, conv2d_q8ta_q8csw); VK_REGISTER_OP(et_vk.conv2d_q8csw.default, conv2d_q8csw); + VK_REGISTER_OP(etvk.qdq8ta_conv2d_input.default, qdq8ta_conv2d_input); + VK_REGISTER_OP(etvk.conv2d_q8ta_q8csw_q8to.test, conv2d_q8ta_q8csw_q8to_test); + VK_REGISTER_OP( + et_vk.quantize_q8ta_for_conv2d.default, quantize_q8ta_for_conv2d); + VK_REGISTER_OP( + et_vk.dequantize_q8to_from_conv2d.default, 
dequantize_q8to_from_conv2d); + VK_REGISTER_OP(et_vk.conv2d_q8ta_q8csw_q8to.default, conv2d_q8ta_q8csw_q8to); + VK_REGISTER_OP( + et_vk.conv2d_q8ta_q8csw_q8to_dw.default, conv2d_q8ta_q8csw_q8to); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h new file mode 100644 index 00000000000..33474cee47b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace vkcompute { + +// +// Quantize and dequantize functions for conv2d that can be reused by other +// operations +// + +/** + * Add a dispatch node to quantize a floating-point input tensor to a packed + * int8 tensor for use in quantized operations. + */ +void add_quantize_and_pack_q8ta_conv2d_input_node( + ComputeGraph& graph, + const ValueRef fp_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_int8_input); + +/** + * Add a dispatch node to unpack and dequantize a packed int8 output tensor back + * to a floating-point tensor. 
+ */ +void add_unpack_and_dequantize_q8ta_conv2d_output_node( + ComputeGraph& graph, + const ValueRef packed_int8_output, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef fp_output); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp index 7fbfcee5cb1..97566038501 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp @@ -77,6 +77,10 @@ utils::uvec3 quantized_linear_global_wg_size( M_per_tile = 1; } + if (shader.kernel_name.find("q8ta_q8csw_tiled") != std::string::npos) { + N_per_tile = 8; + } + const uint32_t num_N_tiles = utils::div_up(N, N_per_tile); const uint32_t num_M_tiles = utils::div_up(M, M_per_tile); @@ -802,20 +806,12 @@ void quantized_linear_impl( graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); // Allocate temporary tensor to store quantized and packed input - - int64_t num_blocks_M, num_blocks_K; - std::tie(num_blocks_M, num_blocks_K) = - get_quantized_input_num_blocks(graph, fp_input); - - const int64_t int_input_height = num_blocks_M; - const int64_t int_input_width = num_blocks_K * 4; - TmpTensor packed_int_input( &graph, - {int_input_height, int_input_width}, - vkapi::kInt, + graph.sizes_of(fp_input), + vkapi::kInt8x4, utils::kBuffer, - utils::kWidthPacked); + utils::kPackedInt8_4H4W); // Non dynamically quantized input case if (!input_quant_config.is_dynamic) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp index 13801b45cc7..e2b73b2f3f2 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp @@ -32,8 +32,13 @@ void add_squeeze_copy_dims_node( // 2. Squeeze outter most dim // For these cases, just pass input to output via clone. 
for (int i = 0; i < dims.size(); ++i) { - if (dims.at(i) != 0 && in_sizes.at(dims.at(i)) == 1) { - squeeze_dims.push_back(dims.at(i)); + // adjust negative dims + int64_t dim_val = dims.at(i); + if (dim_val < 0) { + dim_val += in_dim; + } + if (dims.at(i) != 0 && in_sizes.at(dim_val) == 1) { + squeeze_dims.push_back(dim_val); } } if (squeeze_dims.size() == 0) { diff --git a/backends/vulkan/runtime/utils/StorageUtils.cpp b/backends/vulkan/runtime/utils/StorageUtils.cpp new file mode 100644 index 00000000000..cfe3d9e159a --- /dev/null +++ b/backends/vulkan/runtime/utils/StorageUtils.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace vkcompute { +namespace utils { + +bool is_packed_int8_layout(const GPUMemoryLayout layout) { + switch (layout) { + case kPackedInt8_4W4C: + case kPackedInt8_4H4W: + return true; + default: + return false; + } +} + +} // namespace utils +} // namespace vkcompute diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h index 20addf88c53..76edec897c7 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ b/backends/vulkan/runtime/utils/StorageUtils.h @@ -84,9 +84,24 @@ enum class GPUMemoryLayout : uint8_t { * 2. For texture backed tensors, the packed dim will be the specified dim. * The axis map will be `{0, 1, 2, 2}`. */ + TENSOR_WIDTH_PACKED = 0u, TENSOR_HEIGHT_PACKED = 1u, TENSOR_CHANNELS_PACKED = 2u, + + /* + * The following memory layouts are used for quantized int8 tensors. For the + * above "standard" memory layouts, 4 elements along the packed dim are stored + * in each texel (4-component vectorized type). 
However, for packed int8 + * memory layouts, an additional level of packing is used where 4 int8 values + * are packed into each int32, and each int32 is packed into each ivec4. + * Conceptually, this allows an additional packed dimension to be used. + * When loading a ivec4 from the GPU storage buffer / texture, data for a + * 16 element block is loaded, rather than 4 elements along one dimension. + */ + + TENSOR_PACKED_INT8_4W4C = 3u, + TENSOR_PACKED_INT8_4H4W = 4u, }; static constexpr GPUMemoryLayout kWidthPacked = @@ -98,6 +113,12 @@ static constexpr GPUMemoryLayout kHeightPacked = static constexpr GPUMemoryLayout kChannelsPacked = GPUMemoryLayout::TENSOR_CHANNELS_PACKED; +static constexpr GPUMemoryLayout kPackedInt8_4W4C = + GPUMemoryLayout::TENSOR_PACKED_INT8_4W4C; + +static constexpr GPUMemoryLayout kPackedInt8_4H4W = + GPUMemoryLayout::TENSOR_PACKED_INT8_4H4W; + template T to_packed_dim(const GPUMemoryLayout layout) { switch (layout) { @@ -107,11 +128,17 @@ T to_packed_dim(const GPUMemoryLayout layout) { return 1; case kChannelsPacked: return 2; + case kPackedInt8_4W4C: + return 2; + case kPackedInt8_4H4W: + return 0; }; // Should be unreachable return 0; } +bool is_packed_int8_layout(const GPUMemoryLayout layout); + inline std::ostream& operator<<( std::ostream& os, const StorageType storage_type) { @@ -142,6 +169,12 @@ inline std::ostream& operator<<( case kChannelsPacked: os << "TENSOR_CHANNELS_PACKED"; break; + case kPackedInt8_4W4C: + os << "TENSOR_PACKED_INT8_4W4C"; + break; + case kPackedInt8_4H4W: + os << "TENSOR_PACKED_INT8_4H4W"; + break; } return os; } diff --git a/backends/vulkan/runtime/vk_api/Exception.cpp b/backends/vulkan/runtime/vk_api/Exception.cpp index d3efa81e52a..5bcf047aaf1 100644 --- a/backends/vulkan/runtime/vk_api/Exception.cpp +++ b/backends/vulkan/runtime/vk_api/Exception.cpp @@ -10,6 +10,13 @@ #include +#ifdef ETVK_BOOST_STACKTRACE_AVAILABLE +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif // _GNU_SOURCE +#include +#endif // 
ETVK_BOOST_STACKTRACE_AVAILABLE + namespace vkcompute { namespace vkapi { @@ -65,6 +72,11 @@ Error::Error(SourceLocation source_location, std::string msg) std::ostringstream oss; oss << "Exception raised from " << source_location_ << ": "; oss << msg_; +#ifdef ETVK_BOOST_STACKTRACE_AVAILABLE + oss << "\n"; + oss << "Stack trace:\n"; + oss << boost::stacktrace::stacktrace(); +#endif // ETVK_BOOST_STACKTRACE_AVAILABLE what_ = oss.str(); } @@ -74,6 +86,11 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg) oss << "Exception raised from " << source_location_ << ": "; oss << "(" << cond << ") is false! "; oss << msg_; +#ifdef ETVK_BOOST_STACKTRACE_AVAILABLE + oss << "\n"; + oss << "Stack trace:\n"; + oss << boost::stacktrace::stacktrace(); +#endif // ETVK_BOOST_STACKTRACE_AVAILABLE what_ = oss.str(); } diff --git a/backends/vulkan/runtime/vk_api/Types.h b/backends/vulkan/runtime/vk_api/Types.h index b3309aa6c69..f4415b5c08f 100644 --- a/backends/vulkan/runtime/vk_api/Types.h +++ b/backends/vulkan/runtime/vk_api/Types.h @@ -43,7 +43,8 @@ _(double, VK_FORMAT_R64G64B64A64_SFLOAT, Double) \ _(int8_t, VK_FORMAT_R8G8B8A8_SINT, QInt8) \ _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, QUInt8) \ - _(int32_t, VK_FORMAT_R32G32B32A32_SINT, QInt32) + _(int32_t, VK_FORMAT_R32G32B32A32_SINT, QInt32) \ + _(int32_t, VK_FORMAT_R32G32B32A32_SINT, Int8x4) namespace vkcompute { namespace vkapi { diff --git a/backends/vulkan/serialization/schema.fbs b/backends/vulkan/serialization/schema.fbs index 4bc12208ce7..9d738bc386f 100644 --- a/backends/vulkan/serialization/schema.fbs +++ b/backends/vulkan/serialization/schema.fbs @@ -40,6 +40,8 @@ enum VkMemoryLayout : ubyte { TENSOR_WIDTH_PACKED = 0, TENSOR_HEIGHT_PACKED = 1, TENSOR_CHANNELS_PACKED = 2, + PACKED_INT8_4W4C = 3, + PACKED_INT8_4H4W = 4, DEFAULT_LAYOUT = 255, } diff --git a/backends/vulkan/serialization/vulkan_graph_schema.py b/backends/vulkan/serialization/vulkan_graph_schema.py index cf5326f40cf..236183ce42f 100644 
--- a/backends/vulkan/serialization/vulkan_graph_schema.py +++ b/backends/vulkan/serialization/vulkan_graph_schema.py @@ -48,6 +48,8 @@ class VkMemoryLayout(IntEnum): TENSOR_WIDTH_PACKED = 0 TENSOR_HEIGHT_PACKED = 1 TENSOR_CHANNELS_PACKED = 2 + PACKED_INT8_4W4C = 3 + PACKED_INT8_4H4W = 4 DEFAULT_LAYOUT = 255 def __str__(self) -> str: diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index a9ba62b6f9f..c48ce0a452b 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -19,6 +19,8 @@ def get_vulkan_preprocessor_flags(no_volk, is_fbcode): default_flags = [] android_flags = [] + debug_mode = read_config("etvk", "debug", "0") == "1" + if not no_volk: for flags in [default_flags, android_flags]: flags.append("-DUSE_VULKAN_WRAPPER") @@ -32,6 +34,10 @@ def get_vulkan_preprocessor_flags(no_volk, is_fbcode): if link_moltenvk: mac_flags = [] + if debug_mode: + mac_flags.append("-DETVK_BOOST_STACKTRACE_AVAILABLE") + default_flags.append("-DETVK_BOOST_STACKTRACE_AVAILABLE") + VK_API_PREPROCESSOR_FLAGS += select({ "DEFAULT": default_flags, "ovr_config//os:android": android_flags, @@ -59,7 +65,6 @@ def get_vulkan_preprocessor_flags(no_volk, is_fbcode): if etvk_default_cache_path != "": VK_API_PREPROCESSOR_FLAGS += ["-DETVK_DEFAULT_CACHE_PATH={}".format(etvk_default_cache_path)] - debug_mode = read_config("etvk", "debug", "0") == "1" if debug_mode: VK_API_PREPROCESSOR_FLAGS += ["-DVULKAN_DEBUG"] @@ -136,6 +141,8 @@ def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False, no_volk = Fal ) def define_common_targets(is_fbcode = False): + debug_mode = read_config("etvk", "debug", "0") == "1" + runtime.python_library( name = "gen_vulkan_spv_lib", srcs = [ @@ -200,6 +207,10 @@ def define_common_targets(is_fbcode = False): "//third-party/khronos:moltenVK_static" ] + if debug_mode: + mac_deps.append("fbsource//third-party/boost:boost") + default_deps.append("fbsource//third-party/boost:boost") + VK_API_DEPS += select({ "DEFAULT": 
default_deps, "ovr_config//os:android": android_deps, diff --git a/backends/vulkan/test/TARGETS b/backends/vulkan/test/TARGETS index 53fad86f90c..ee296a4f68f 100644 --- a/backends/vulkan/test/TARGETS +++ b/backends/vulkan/test/TARGETS @@ -34,7 +34,6 @@ python_unittest( deps = [ "//caffe2:torch", "//executorch/backends/vulkan/_passes:vulkan_passes", - "//executorch/backends/vulkan/quantizer:vulkan_quantizer", "//executorch/backends/vulkan:vulkan_preprocess", "//pytorch/ao:torchao", # @manual ] diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt index 97b632338db..fc1d33391d4 100644 --- a/backends/vulkan/test/custom_ops/CMakeLists.txt +++ b/backends/vulkan/test/custom_ops/CMakeLists.txt @@ -95,4 +95,8 @@ if(TARGET vulkan_backend) add_operator_prototype(q8csw_conv2d) add_operator_prototype(q4gsw_linear) add_operator_prototype(choose_qparams_per_row) + add_operator_prototype(qdq8ta_conv2d_activations) + add_operator_prototype(q8ta_q8csw_q8to_conv2d) + add_operator_prototype(q8ta_q8csw_q8to_conv2d_dw) + add_operator_prototype(q8ta_q8ta_q8to_add) endif() diff --git a/backends/vulkan/test/custom_ops/conv2d_utils.cpp b/backends/vulkan/test/custom_ops/conv2d_utils.cpp new file mode 100644 index 00000000000..74c26cef5a1 --- /dev/null +++ b/backends/vulkan/test/custom_ops/conv2d_utils.cpp @@ -0,0 +1,10 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include "conv2d_utils.h" + +// Implementation file for conv2d utilities. +// Currently all functionality is implemented inline in the header. 
diff --git a/backends/vulkan/test/custom_ops/conv2d_utils.h b/backends/vulkan/test/custom_ops/conv2d_utils.h new file mode 100644 index 00000000000..cad52219062 --- /dev/null +++ b/backends/vulkan/test/custom_ops/conv2d_utils.h @@ -0,0 +1,88 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include + +namespace executorch { +namespace vulkan { +namespace prototyping { + +// Component structs for better readability +struct KernelSize { + int32_t h; + int32_t w; + + KernelSize(int32_t height, int32_t width) : h(height), w(width) {} +}; + +struct Stride { + int32_t h; + int32_t w; + + Stride(int32_t height, int32_t width) : h(height), w(width) {} +}; + +struct Padding { + int32_t h; + int32_t w; + + Padding(int32_t height, int32_t width) : h(height), w(width) {} +}; + +struct Dilation { + int32_t h; + int32_t w; + + Dilation(int32_t height = 1, int32_t width = 1) : h(height), w(width) {} +}; + +struct OutInChannels { + int32_t out; + int32_t in; + + OutInChannels(int32_t out_channels, int32_t in_channels) + : out(out_channels), in(in_channels) {} +}; + +struct InputSize2D { + int32_t h; + int32_t w; + + InputSize2D(int32_t height, int32_t width) : h(height), w(width) {} +}; + +// Conv2d configuration struct +struct Conv2dConfig { + OutInChannels channels; + InputSize2D input_size; + KernelSize kernel; + Stride stride; + Padding padding; + Dilation dilation; + int32_t groups; // Number of groups for grouped convolution + std::string test_case_name = "placeholder"; + std::string op_name = "conv2d"; + + // Calculate output dimensions + int64_t get_output_height() const { + return (input_size.h + 2 * padding.h - dilation.h * (kernel.h - 1) - 1) / + stride.h + + 1; + } + + int64_t get_output_width() const { + return (input_size.w + 2 * padding.w - dilation.w * (kernel.w - 1) 
- 1) / + stride.w + + 1; + } +}; + +} // namespace prototyping +} // namespace vulkan +} // namespace executorch diff --git a/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp b/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp index d566e5b2646..219bccb04c3 100644 --- a/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp +++ b/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp @@ -8,6 +8,7 @@ #include #include #include +#include "conv2d_utils.h" #include "utils.h" #include @@ -18,76 +19,6 @@ using namespace vkcompute; static constexpr int64_t kRefDimSizeLimit = 100; -// Component structs for better readability -struct KernelSize { - int32_t h; - int32_t w; - - KernelSize(int32_t height, int32_t width) : h(height), w(width) {} -}; - -struct Stride { - int32_t h; - int32_t w; - - Stride(int32_t height, int32_t width) : h(height), w(width) {} -}; - -struct Padding { - int32_t h; - int32_t w; - - Padding(int32_t height, int32_t width) : h(height), w(width) {} -}; - -struct Dilation { - int32_t h; - int32_t w; - - Dilation(int32_t height = 1, int32_t width = 1) : h(height), w(width) {} -}; - -struct OutInChannels { - int32_t out; - int32_t in; - - OutInChannels(int32_t out_channels, int32_t in_channels) - : out(out_channels), in(in_channels) {} -}; - -struct InputSize2D { - int32_t h; - int32_t w; - - InputSize2D(int32_t height, int32_t width) : h(height), w(width) {} -}; - -// Conv2d configuration struct -struct Conv2dConfig { - OutInChannels channels; - InputSize2D input_size; - KernelSize kernel; - Stride stride; - Padding padding; - Dilation dilation; - int32_t groups; // Number of groups for grouped convolution - std::string test_case_name = "placeholder"; - std::string op_name = "conv2d_q8ta_q8csw"; - - // Calculate output dimensions - int64_t get_output_height() const { - return (input_size.h + 2 * padding.h - dilation.h * (kernel.h - 1) - 1) / - stride.h + - 1; - } - - int64_t get_output_width() const { - return (input_size.w + 2 * padding.w - dilation.w * (kernel.w - 
1) - 1) / - stride.w + - 1; - } -}; - // Utility function to create a test case from a Conv2dConfig TestCase create_test_case_from_config( const Conv2dConfig& config, @@ -366,13 +297,20 @@ std::vector generate_quantized_conv2d_test_cases() { Stride(1, 1), Padding(1, 1), Dilation(1, 1), - 8}, + 1}, {OutInChannels(128, 64), InputSize2D(128, 128), KernelSize(3, 3), Stride(1, 1), Padding(1, 1), Dilation(1, 1), + 1}, + {OutInChannels(128, 1024), + InputSize2D(128, 128), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), 1}}; // Test with different storage types and data types @@ -394,6 +332,7 @@ std::vector generate_quantized_conv2d_test_cases() { std::to_string(config.kernel.h) + "/" + std::to_string(config.kernel.w); + config.op_name = "conv2d_q8ta_q8csw"; config.test_case_name = prefix + suffix; // The default operator tested is activation + weight quantized conv2d; // however, only test this if the int8 dot product extension is supported @@ -763,7 +702,7 @@ int64_t quantized_conv2d_flop_calculator(const TestCase& test_case) { int main(int argc, char* argv[]) { set_debugging(false); set_print_output(false); - set_print_latencies(false); + set_print_latencies(true); set_use_gpu_timestamps(true); print_performance_header(); diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp new file mode 100644 index 00000000000..8762fe4c0d1 --- /dev/null +++ b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp @@ -0,0 +1,628 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include +#include +#include "conv2d_utils.h" +#include "utils.h" + +#include + +using namespace executorch::vulkan::prototyping; + +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 100; + +// Utility function to create a test case from a Conv2dConfig +TestCase create_test_case_from_config( + const Conv2dConfig& config, + utils::StorageType storage_type, + vkapi::ScalarType input_dtype) { + TestCase test_case; + + // Create a descriptive name for the test case + std::string storage_str = + (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; + std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; + + std::string test_name = + config.test_case_name + "_" + storage_str + "_" + dtype_str; + test_case.set_name(test_name); + + // Set the operator name for the test case + std::string operator_name = "etvk." + config.op_name + ".test"; + test_case.set_operator_name(operator_name); + + // Calculate output dimensions + int64_t H_out = config.get_output_height(); + int64_t W_out = config.get_output_width(); + + // Input tensor (float/half) - [1, C_in, H_in, W_in] (batch size always 1) + std::vector input_size = { + 1, config.channels.in, config.input_size.h, config.input_size.w}; + + ValueSpec input_tensor( + input_size, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::RANDOM); + + if (debugging()) { + print_valuespec_data(input_tensor, "input_tensor"); + } + + float input_scale_val = 0.008123; + ValueSpec input_scale(input_scale_val); + + int32_t input_zero_point_val = 2; + ValueSpec input_zero_point(input_zero_point_val); + + // Quantized weight tensor (int8) - [C_out, C_in_per_group * K_h * K_w] + // Memory layout: height, width, then channels - in_c is innermost (stride 1) + // in the second dimension + const int64_t in_channels_per_group = config.channels.in / config.groups; + const int64_t in_features = utils::align_up_4( + in_channels_per_group * config.kernel.h * 
config.kernel.w); + std::vector weight_size = {config.channels.out, in_features}; + ValueSpec quantized_weight( + weight_size, + vkapi::kChar, // int8 for quantized weights + storage_type, + utils::kWidthPacked, + DataGenType::RANDINT8); + quantized_weight.set_constant(true); + + if (debugging()) { + print_valuespec_data(quantized_weight, "weight_tensor"); + } + + const int64_t aligned_out_channels = utils::align_up_4(config.channels.out); + + // Weight quantization scales (float/half, per-channel) + ValueSpec weight_scales( + {aligned_out_channels}, // Per output channel + input_dtype, + storage_type, + utils::kWidthPacked, + DataGenType::RANDOM_SCALES); + weight_scales.set_constant(true); + + ValueSpec weight_sums( + {aligned_out_channels}, // Per output channel + vkapi::kInt, + storage_type, + utils::kWidthPacked, + DataGenType::ZEROS); + weight_sums.set_constant(true); + + // Compute weight_sums data based on quantized weights + compute_weight_sums( + weight_sums, quantized_weight, config.channels.out, in_features); + + // Bias (optional, float/half) - [C_out] + ValueSpec bias( + {aligned_out_channels}, // Per output channel + input_dtype, + storage_type, + utils::kWidthPacked, + DataGenType::ZEROS); + bias.set_constant(true); + + // Output quantization parameters + // float output_scale_val = 0.01432; + float output_scale_val = 0.05314; + ValueSpec output_scale(output_scale_val); + + int32_t output_zero_point_val = -1; + ValueSpec output_zero_point(output_zero_point_val); + + // Stride and padding parameters + ValueSpec stride({config.stride.h, config.stride.w}); + ValueSpec padding({config.padding.h, config.padding.w}); + + // Dilation and groups parameters + ValueSpec dilation({config.dilation.h, config.dilation.w}); + ValueSpec groups(config.groups); + + // Kernel size parameters + ValueSpec kernel_size({config.kernel.h, config.kernel.w}); + + // Output tensor (float/half) - [1, C_out, H_out, W_out] (batch size always 1) + ValueSpec output( + {1, 
config.channels.out, H_out, W_out}, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::ZEROS); + + // Add all specs to test case for q8ta_q8csw_q8to operation + test_case.add_input_spec(input_tensor); + test_case.add_input_spec(input_scale); + test_case.add_input_spec(input_zero_point); + test_case.add_input_spec(quantized_weight); + test_case.add_input_spec(weight_sums); + test_case.add_input_spec(weight_scales); + test_case.add_input_spec(output_scale); + test_case.add_input_spec(output_zero_point); + test_case.add_input_spec(bias); + test_case.add_input_spec(kernel_size); + test_case.add_input_spec(stride); + test_case.add_input_spec(padding); + test_case.add_input_spec(dilation); + test_case.add_input_spec(groups); + + test_case.add_output_spec(output); + + test_case.set_abs_tolerance(output_scale_val + 1e-4f); + + return test_case; +} + +// Generate easy test cases for quantized conv2d operation (for debugging) +std::vector generate_quantized_conv2d_easy_cases() { + std::vector test_cases; + + // Single simple configuration for debugging + Conv2dConfig config = { + OutInChannels(16, 8), // channels (out, in) + InputSize2D(21, 17), // input_size (h, w) + KernelSize(3, 3), // kernel + Stride(1, 1), // stride + Padding(1, 1), // padding + Dilation(1, 1), // dilation + 2, // groups + }; + config.op_name = "conv2d_q8ta_q8csw_q8to"; + + // Test with both storage types and data types for completeness + std::vector storage_types = {utils::kTexture3D}; + std::vector float_types = {vkapi::kFloat}; + + // Generate test cases for each combination + for (const auto& storage_type : storage_types) { + for (const auto& input_dtype : float_types) { + test_cases.push_back( + create_test_case_from_config(config, storage_type, input_dtype)); + } + } + + return test_cases; +} + +// Generate test cases for quantized conv2d operation +std::vector generate_quantized_conv2d_test_cases() { + std::vector test_cases; + + std::vector configs = { + // Pointwise 
convolutions: kernel size 1x1 + {OutInChannels(32, 3), + InputSize2D(64, 64), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), + 1}, + {OutInChannels(64, 32), + InputSize2D(32, 32), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), + 1}, + {OutInChannels(96, 64), + InputSize2D(16, 16), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), + 1}, + {OutInChannels(13, 7), + InputSize2D(57, 33), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), + 1}, + // General 2D convolutions + {OutInChannels(32, 3), + InputSize2D(64, 64), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(32, 3), + InputSize2D(64, 64), + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(64, 32), + InputSize2D(8, 8), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(64, 32), + InputSize2D(64, 64), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(64, 32), + InputSize2D(64, 64), + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(16, 32), + InputSize2D(77, 77), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 1}, + // Grouped convolutions + {OutInChannels(64, 32), + InputSize2D(64, 64), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 2}, + {OutInChannels(96, 96), + InputSize2D(81, 81), + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + 3}, + {OutInChannels(96, 96), + InputSize2D(64, 64), + KernelSize(5, 5), + Stride(2, 2), + Padding(2, 2), + Dilation(1, 1), + 4}, + // Performance cases (pointwise) + {OutInChannels(128, 128), + InputSize2D(128, 128), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), + 1}, + {OutInChannels(128, 128), + InputSize2D(128, 128), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + 
Dilation(1, 1), + 1}, + // Performance cases (general 2d convs) + {OutInChannels(32, 3), + InputSize2D(256, 256), + KernelSize(3, 3), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), + 1}, + {OutInChannels(64, 32), + InputSize2D(128, 128), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(64, 64), + InputSize2D(128, 128), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(128, 128), + InputSize2D(128, 128), + KernelSize(5, 5), + Stride(2, 2), + Padding(2, 2), + Dilation(1, 1), + 4}}; + + // Test with different storage types and data types + std::vector storage_types = {utils::kTexture3D}; + + // Generate test cases for each combination + for (auto& config : configs) { + for (const auto& storage_type : storage_types) { + // Generate test case name programmatically + bool is_performance = config.channels.out > kRefDimSizeLimit || + config.channels.in > kRefDimSizeLimit || + config.input_size.h > kRefDimSizeLimit || + config.input_size.w > kRefDimSizeLimit; + std::string prefix = is_performance ? 
"performance_" : "correctness_"; + std::string suffix = std::to_string(config.channels.out) + "/" + + std::to_string(config.channels.in) + "_" + + std::to_string(config.input_size.h) + "/" + + std::to_string(config.input_size.w) + "_" + + std::to_string(config.kernel.h) + "/" + + std::to_string(config.kernel.w); + + config.op_name = "conv2d_q8ta_q8csw_q8to"; + config.test_case_name = prefix + suffix; + + // Only test q8ta_q8csw_q8to if the int8 dot product extension is + // supported + if (vkcompute::api::context() + ->adapter_ptr() + ->supports_int8_dot_product()) { + test_cases.push_back( + create_test_case_from_config(config, storage_type, vkapi::kFloat)); + } + } + } + + return test_cases; +} + +// Reference implementation for activation, weight, and output quantized conv2d +void conv2d_q8ta_q8csw_q8to_reference_impl(TestCase& test_case) { + // Extract input specifications + int32_t idx = 0; + const ValueSpec& input_spec = test_case.inputs()[idx++]; + const ValueSpec& input_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& input_zeros_spec = test_case.inputs()[idx++]; + const ValueSpec& weight_spec = test_case.inputs()[idx++]; + const ValueSpec& weight_sums_spec = test_case.inputs()[idx++]; + (void)weight_sums_spec; + const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; + const ValueSpec& output_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& output_zeros_spec = test_case.inputs()[idx++]; + const ValueSpec& bias_spec = test_case.inputs()[idx++]; + const ValueSpec& kernel_size_spec = test_case.inputs()[idx++]; + const ValueSpec& stride_spec = test_case.inputs()[idx++]; + const ValueSpec& padding_spec = test_case.inputs()[idx++]; + const ValueSpec& dilation_spec = test_case.inputs()[idx++]; + const ValueSpec& groups_spec = test_case.inputs()[idx++]; + + // Extract output specification (mutable reference) + ValueSpec& output_spec = test_case.outputs()[0]; + + // Get tensor dimensions + auto input_sizes = 
input_spec.get_tensor_sizes(); // [N, C_in, H_in, W_in] + auto weight_sizes = + weight_spec.get_tensor_sizes(); // [C_out, C_in_per_group * K_h * K_w] + auto output_sizes = + output_spec.get_tensor_sizes(); // [N, C_out, H_out, W_out] + + int64_t N = input_sizes[0]; + int64_t C_in = input_sizes[1]; + int64_t H_in = input_sizes[2]; + int64_t W_in = input_sizes[3]; + int64_t C_out = output_sizes[1]; + int64_t H_out = output_sizes[2]; + int64_t W_out = output_sizes[3]; + + // Get kernel dimensions from kernel_size ValueSpec + auto kernel_size_data = kernel_size_spec.get_int32_data(); + int64_t K_h = kernel_size_data[0]; + int64_t K_w = kernel_size_data[1]; + + // Get stride, padding, dilation, and groups + auto stride_data = stride_spec.get_int32_data(); + auto padding_data = padding_spec.get_int32_data(); + auto dilation_data = dilation_spec.get_int32_data(); + int64_t stride_h = stride_data[0]; + int64_t stride_w = stride_data[1]; + int64_t pad_h = padding_data[0]; + int64_t pad_w = padding_data[1]; + int64_t dilation_h = dilation_data[0]; + int64_t dilation_w = dilation_data[1]; + int64_t groups = groups_spec.get_int_value(); + + // Skip for large tensors since computation time will be extremely slow + if (N > kRefDimSizeLimit || C_in > kRefDimSizeLimit || + H_in > kRefDimSizeLimit || W_in > kRefDimSizeLimit || + C_out > kRefDimSizeLimit) { + throw std::invalid_argument( + "One or more dimensions exceed the allowed limit for reference implementation."); + } + + if (input_spec.dtype != vkapi::kFloat) { + throw std::invalid_argument("Unsupported dtype"); + } + + // Get raw data pointers + auto& input_data = input_spec.get_float_data(); + const float input_scale = input_scale_spec.get_float_value(); + const int32_t input_zero_point = input_zeros_spec.get_int_value(); + + auto& weight_data = weight_spec.get_int8_data(); + auto& weight_scales_data = weight_scales_spec.get_float_data(); + auto& bias_data = bias_spec.get_float_data(); + + const float output_scale = 
output_scale_spec.get_float_value(); + const int32_t output_zero_point = output_zeros_spec.get_int_value(); + + // Calculate channels per group for grouped convolution + int64_t C_in_per_group = C_in / groups; + int64_t C_out_per_group = C_out / groups; + + // Calculate number of output elements + int64_t num_output_elements = N * C_out * H_out * W_out; + + auto& ref_data = output_spec.get_ref_float_data(); + ref_data.resize(num_output_elements); + + const int in_features = utils::align_up_4(C_in_per_group * K_h * K_w); + + // Perform activation, weight, and output quantized conv2d operation + for (int64_t n = 0; n < N; ++n) { + for (int64_t out_c = 0; out_c < C_out; ++out_c) { + for (int64_t out_h = 0; out_h < H_out; ++out_h) { + for (int64_t out_w = 0; out_w < W_out; ++out_w) { + int32_t int_sum = 0; + int32_t weight_sum = 0; // Track weight sum on the fly + + // Determine which group this output channel belongs to + int64_t group_idx = out_c / C_out_per_group; + int64_t in_c_start = group_idx * C_in_per_group; + int64_t in_c_end = (group_idx + 1) * C_in_per_group; + + // Convolution operation with integer accumulation + for (int64_t in_c = in_c_start; in_c < in_c_end; ++in_c) { + for (int64_t kh = 0; kh < K_h; ++kh) { + for (int64_t kw = 0; kw < K_w; ++kw) { + // Calculate input position with dilation + int64_t in_h = out_h * stride_h - pad_h + kh * dilation_h; + int64_t in_w = out_w * stride_w - pad_w + kw * dilation_w; + + // Check bounds (zero padding) + if (in_h >= 0 && in_h < H_in && in_w >= 0 && in_w < W_in) { + // Get input value and quantize to int8 + int64_t input_idx = n * (C_in * H_in * W_in) + + in_c * (H_in * W_in) + in_h * W_in + in_w; + + float quant_input_f = + std::round(input_data[input_idx] / input_scale) + + input_zero_point; + quant_input_f = + std::min(std::max(quant_input_f, -128.0f), 127.0f); + int8_t quantized_input = static_cast(quant_input_f); + + // Get quantized weight (already int8) + // Weight layout: [C_out, C_in_per_group * K_h * 
K_w] + int64_t weight_idx = out_c * in_features + + (kh * (K_w * C_in_per_group) + kw * C_in_per_group + + (in_c % C_in_per_group)); + int8_t quantized_weight = weight_data[weight_idx]; + + // Integer multiplication and accumulation + int_sum += static_cast(quantized_input) * + static_cast(quantized_weight); + + // Track weight sum for this output channel on the fly + weight_sum += static_cast(quantized_weight); + } else { + // For zero padding, we still need to account for the weight + // in weight_sum when input is effectively 0 (but quantized 0 + // is input_zero_point) + int64_t weight_idx = out_c * in_features + + (kh * (K_w * C_in_per_group) + kw * C_in_per_group + + (in_c % C_in_per_group)); + int8_t quantized_weight = weight_data[weight_idx]; + + // Add contribution from zero-padded input (quantized zero = + // input_zero_point) + int_sum += static_cast(input_zero_point) * + static_cast(quantized_weight); + + // Track weight sum for this output channel on the fly + weight_sum += static_cast(quantized_weight); + } + } + } + } + + // Convert accumulated integer result to float and apply scales + // Final result = (int_sum - zero_point_correction) * input_scale * + // weight_scale + bias zero_point_correction = input_zero_point * + // sum_of_weights_for_this_output_channel + int32_t zero_point_correction = input_zero_point * weight_sum; + int32_t accum_adjusted = int_sum - zero_point_correction; + float float_result = + accum_adjusted * input_scale * weight_scales_data[out_c]; + + // Add bias and store result + float_result += bias_data[out_c]; + + // Quantize the output to int8 + float quant_output_f = + std::round(float_result / output_scale) + output_zero_point; + quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f); + int8_t quantized_output = static_cast(quant_output_f); + + // Dequantize back to float + float dequant_output = + (static_cast(quantized_output) - output_zero_point) * + output_scale; + + int64_t output_idx = n * (C_out * 
H_out * W_out) + + out_c * (H_out * W_out) + out_h * W_out + out_w; + ref_data[output_idx] = dequant_output; + } + } + } + } +} + +void reference_impl(TestCase& test_case) { + conv2d_q8ta_q8csw_q8to_reference_impl(test_case); +} + +// Custom FLOP calculator for quantized conv2d operation +int64_t quantized_conv2d_flop_calculator(const TestCase& test_case) { + int kernel_idx = 9; // kernel_size is at index 9 for q8ta_q8csw_q8to + + // Get input and weight dimensions + const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); + const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes(); + + const auto& kernel_sizes = test_case.inputs()[kernel_idx].get_int32_data(); + + int64_t N = input_sizes[0]; + int64_t C_in = input_sizes[1]; + int64_t C_out = output_sizes[1]; + int64_t K_h = kernel_sizes[0]; + int64_t K_w = kernel_sizes[1]; + int64_t H_out = output_sizes[2]; + int64_t W_out = output_sizes[3]; + + // Calculate FLOPs for quantized conv2d operation + // Each output element requires: + // - C_in * K_h * K_w multiply-accumulate operations + // - Additional operations for quantization/dequantization + int64_t output_elements = N * C_out * H_out * W_out; + int64_t ops_per_output = C_in * K_h * K_w; + + int64_t flop = output_elements * (ops_per_output); + + return flop; +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout + << "Quantized Conv2d Operation with Output Quantization Prototyping Framework" + << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = reference_impl; + + // Execute test cases using the new framework with custom FLOP calculator + auto results = execute_test_cases( + generate_quantized_conv2d_test_cases, + quantized_conv2d_flop_calculator, + "QuantizedConv2dQ8ToQ8To", + 0, + 10, + ref_fn); + + return 0; +} diff --git 
a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp new file mode 100644 index 00000000000..c259b45de06 --- /dev/null +++ b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp @@ -0,0 +1,592 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include +#include +#include "conv2d_utils.h" +#include "utils.h" + +#include + +using namespace executorch::vulkan::prototyping; + +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 100; + +// Utility function to create a test case from a Conv2dConfig for depthwise +// convolution +TestCase create_test_case_from_config( + const Conv2dConfig& config, + utils::StorageType storage_type, + vkapi::ScalarType input_dtype) { + TestCase test_case; + + // Create a descriptive name for the test case + std::string storage_str = + (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; + std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; + + std::string test_name = + config.test_case_name + "_" + storage_str + "_" + dtype_str; + test_case.set_name(test_name); + + // Set the operator name for the test case + std::string operator_name = "etvk." 
+ config.op_name + ".test"; + test_case.set_operator_name(operator_name); + + // Calculate output dimensions + int64_t H_out = config.get_output_height(); + int64_t W_out = config.get_output_width(); + + // Input tensor (float/half) - [1, C_in, H_in, W_in] (batch size always 1) + std::vector input_size = { + 1, config.channels.in, config.input_size.h, config.input_size.w}; + + ValueSpec input_tensor( + input_size, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::RANDOM); + + if (debugging()) { + print_valuespec_data(input_tensor, "input_tensor", false, 64); + } + + float input_scale_val = 0.008123; + ValueSpec input_scale(input_scale_val); + + int32_t input_zero_point_val = 2; + ValueSpec input_zero_point(input_zero_point_val); + + // Quantized weight tensor (int8) for depthwise convolution + // Memory layout: [K_h, K_w, OC] + // For depthwise conv: groups = channels.out, in_channels_per_group = 1 + std::vector weight_size = { + config.kernel.h, config.kernel.w, config.channels.out}; + ValueSpec quantized_weight( + weight_size, + vkapi::kChar, // int8 for quantized weights + storage_type, + utils::kWidthPacked, + DataGenType::RANDINT8); + quantized_weight.set_constant(true); + + if (debugging()) { + print_valuespec_data(quantized_weight, "weight_tensor", false, 64); + } + + // Weight quantization scales (float/half, per-channel) + ValueSpec weight_scales( + {config.channels.out}, // Per output channel + input_dtype, + storage_type, + utils::kWidthPacked, + DataGenType::RANDOM_SCALES); + weight_scales.set_constant(true); + + ValueSpec weight_sums( + {config.channels.out}, // Per output channel + vkapi::kInt, + storage_type, + utils::kWidthPacked, + DataGenType::ZEROS); + weight_sums.set_constant(true); + + // Compute weight_sums data based on quantized weights for depthwise layout + // For depthwise conv: each output channel has K_h * K_w weights + // Custom computation for depthwise layout [K_h, K_w, OC] + auto& weight_sums_data = 
weight_sums.get_int32_data(); + auto& quantized_weight_data = quantized_weight.get_int8_data(); + + weight_sums_data.resize(config.channels.out); + + for (int64_t out_c = 0; out_c < config.channels.out; ++out_c) { + int32_t sum = 0; + for (int64_t kh = 0; kh < config.kernel.h; ++kh) { + for (int64_t kw = 0; kw < config.kernel.w; ++kw) { + // Weight indexing for depthwise layout [K_h, K_w, OC] + int64_t weight_idx = kh * (config.kernel.w * config.channels.out) + + kw * config.channels.out + out_c; + sum += static_cast(quantized_weight_data[weight_idx]); + } + } + weight_sums_data[out_c] = sum; + } + + // Bias (optional, float/half) - [C_out] + ValueSpec bias( + {config.channels.out}, // Per output channel + input_dtype, + storage_type, + utils::kWidthPacked, + DataGenType::RANDOM); + bias.set_constant(true); + + // Output quantization parameters + float output_scale_val = 0.05314; + ValueSpec output_scale(output_scale_val); + + int32_t output_zero_point_val = -1; + ValueSpec output_zero_point(output_zero_point_val); + + // Stride and padding parameters + ValueSpec stride({config.stride.h, config.stride.w}); + ValueSpec padding({config.padding.h, config.padding.w}); + + // Dilation and groups parameters + ValueSpec dilation({config.dilation.h, config.dilation.w}); + ValueSpec groups(config.groups); + + // Kernel size parameters + ValueSpec kernel_size({config.kernel.h, config.kernel.w}); + + // Output tensor (float/half) - [1, C_out, H_out, W_out] (batch size always 1) + ValueSpec output( + {1, config.channels.out, H_out, W_out}, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::ZEROS); + + // Add all specs to test case for q8ta_q8csw_q8to operation + test_case.add_input_spec(input_tensor); + test_case.add_input_spec(input_scale); + test_case.add_input_spec(input_zero_point); + test_case.add_input_spec(quantized_weight); + test_case.add_input_spec(weight_sums); + test_case.add_input_spec(weight_scales); + test_case.add_input_spec(output_scale); 
+ test_case.add_input_spec(output_zero_point); + test_case.add_input_spec(bias); + test_case.add_input_spec(kernel_size); + test_case.add_input_spec(stride); + test_case.add_input_spec(padding); + test_case.add_input_spec(dilation); + test_case.add_input_spec(groups); + + test_case.add_output_spec(output); + + test_case.set_abs_tolerance(output_scale_val + 1e-4f); + + return test_case; +} + +// Generate easy test cases for quantized depthwise conv2d operation (for +// debugging) +std::vector generate_quantized_conv2d_dw_easy_cases() { + std::vector test_cases; + + // Single simple configuration for debugging - depthwise convolution + Conv2dConfig config = { + OutInChannels(8, 8), // channels (out, in) - equal for depthwise + InputSize2D(8, 8), // input_size (h, w) + KernelSize(3, 3), // kernel + Stride(2, 2), // stride + Padding(1, 1), // padding + Dilation(1, 1), // dilation + 8, // groups = channels.out for depthwise + }; + config.op_name = "conv2d_q8ta_q8csw_q8to"; + + // Test with both storage types and data types for completeness + std::vector storage_types = {utils::kTexture3D}; + std::vector float_types = {vkapi::kFloat}; + + // Generate test cases for each combination + for (const auto& storage_type : storage_types) { + for (const auto& input_dtype : float_types) { + test_cases.push_back( + create_test_case_from_config(config, storage_type, input_dtype)); + } + } + + return test_cases; +} + +// Generate test cases for quantized depthwise conv2d operation +std::vector generate_quantized_conv2d_dw_test_cases() { + std::vector test_cases; + + std::vector configs = { + // Depthwise convolutions: groups = channels.out, channels.in = + // channels.out + {OutInChannels(32, 32), + InputSize2D(64, 64), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 32}, + {OutInChannels(64, 64), + InputSize2D(32, 32), + KernelSize(3, 3), + Stride(2, 2), + Padding(2, 2), + Dilation(1, 1), + 64}, + {OutInChannels(64, 64), + InputSize2D(32, 32), + 
KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + 64}, + {OutInChannels(80, 80), + InputSize2D(16, 16), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 80}, + {OutInChannels(16, 16), + InputSize2D(57, 33), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 16}, + // Different kernel sizes for depthwise + {OutInChannels(32, 32), + InputSize2D(64, 64), + KernelSize(5, 5), + Stride(1, 1), + Padding(2, 2), + Dilation(1, 1), + 32}, + {OutInChannels(96, 96), + InputSize2D(64, 64), + KernelSize(5, 5), + Stride(2, 2), + Padding(2, 2), + Dilation(1, 1), + 96}, + // Performance cases + {OutInChannels(128, 128), + InputSize2D(128, 128), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 128}, + {OutInChannels(64, 64), + InputSize2D(256, 256), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 64}, + {OutInChannels(288, 288), + InputSize2D(16, 16), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 288}, + {OutInChannels(32, 32), + InputSize2D(128, 128), + KernelSize(3, 3), + Stride(1, 1), + Padding(2, 2), + Dilation(1, 1), + 32}}; + + // Test with different storage types and data types + std::vector storage_types = {utils::kTexture3D}; + + // Generate test cases for each combination + for (auto& config : configs) { + for (const auto& storage_type : storage_types) { + // Generate test case name programmatically + bool is_performance = config.channels.out > kRefDimSizeLimit || + config.channels.in > kRefDimSizeLimit || + config.input_size.h > kRefDimSizeLimit || + config.input_size.w > kRefDimSizeLimit; + std::string prefix = + is_performance ? 
"performance_dw_" : "correctness_dw_"; + std::string suffix = std::to_string(config.channels.out) + "/" + + std::to_string(config.channels.in) + "_" + + std::to_string(config.input_size.h) + "/" + + std::to_string(config.input_size.w) + "_" + + std::to_string(config.kernel.h) + "/" + + std::to_string(config.kernel.w); + + config.op_name = "conv2d_q8ta_q8csw_q8to"; + config.test_case_name = prefix + suffix; + + // Only test q8ta_q8csw_q8to if the int8 dot product extension is + // supported + if (vkcompute::api::context() + ->adapter_ptr() + ->supports_int8_dot_product()) { + test_cases.push_back( + create_test_case_from_config(config, storage_type, vkapi::kFloat)); + } + } + } + + return test_cases; +} + +// Reference implementation for activation, weight, and output quantized +// depthwise conv2d +void conv2d_q8ta_q8csw_q8to_dw_reference_impl(TestCase& test_case) { + // Extract input specifications + int32_t idx = 0; + const ValueSpec& input_spec = test_case.inputs()[idx++]; + const ValueSpec& input_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& input_zeros_spec = test_case.inputs()[idx++]; + const ValueSpec& weight_spec = test_case.inputs()[idx++]; + const ValueSpec& weight_sums_spec = test_case.inputs()[idx++]; + (void)weight_sums_spec; + const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; + const ValueSpec& output_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& output_zeros_spec = test_case.inputs()[idx++]; + const ValueSpec& bias_spec = test_case.inputs()[idx++]; + const ValueSpec& kernel_size_spec = test_case.inputs()[idx++]; + const ValueSpec& stride_spec = test_case.inputs()[idx++]; + const ValueSpec& padding_spec = test_case.inputs()[idx++]; + const ValueSpec& dilation_spec = test_case.inputs()[idx++]; + const ValueSpec& groups_spec = test_case.inputs()[idx++]; + + // Extract output specification (mutable reference) + ValueSpec& output_spec = test_case.outputs()[0]; + + // Get tensor dimensions + auto input_sizes = 
input_spec.get_tensor_sizes(); // [N, C_in, H_in, W_in] + auto weight_sizes = + weight_spec.get_tensor_sizes(); // [K_h, align_up_4(K_w), OC] + auto output_sizes = + output_spec.get_tensor_sizes(); // [N, C_out, H_out, W_out] + + int64_t N = input_sizes[0]; + int64_t C_in = input_sizes[1]; + int64_t H_in = input_sizes[2]; + int64_t W_in = input_sizes[3]; + int64_t C_out = output_sizes[1]; + int64_t H_out = output_sizes[2]; + int64_t W_out = output_sizes[3]; + + // Get kernel dimensions from kernel_size ValueSpec + auto kernel_size_data = kernel_size_spec.get_int32_data(); + int64_t K_h = kernel_size_data[0]; + int64_t K_w = kernel_size_data[1]; + + // Get stride, padding, dilation, and groups + auto stride_data = stride_spec.get_int32_data(); + auto padding_data = padding_spec.get_int32_data(); + auto dilation_data = dilation_spec.get_int32_data(); + int64_t stride_h = stride_data[0]; + int64_t stride_w = stride_data[1]; + int64_t pad_h = padding_data[0]; + int64_t pad_w = padding_data[1]; + int64_t dilation_h = dilation_data[0]; + int64_t dilation_w = dilation_data[1]; + int64_t groups = groups_spec.get_int_value(); + + // Skip for large tensors since computation time will be extremely slow + if (N > kRefDimSizeLimit || C_in > kRefDimSizeLimit || + H_in > kRefDimSizeLimit || W_in > kRefDimSizeLimit || + C_out > kRefDimSizeLimit) { + throw std::invalid_argument( + "One or more dimensions exceed the allowed limit for reference implementation."); + } + + if (input_spec.dtype != vkapi::kFloat) { + throw std::invalid_argument("Unsupported dtype"); + } + + // Verify this is a depthwise convolution + if (groups != C_out || C_in != C_out) { + throw std::invalid_argument( + "This is not a depthwise convolution configuration"); + } + + // Get raw data pointers + auto& input_data = input_spec.get_float_data(); + const float input_scale = input_scale_spec.get_float_value(); + const int32_t input_zero_point = input_zeros_spec.get_int_value(); + + auto& weight_data = 
weight_spec.get_int8_data(); + auto& weight_scales_data = weight_scales_spec.get_float_data(); + auto& bias_data = bias_spec.get_float_data(); + + const float output_scale = output_scale_spec.get_float_value(); + const int32_t output_zero_point = output_zeros_spec.get_int_value(); + + // Calculate number of output elements + int64_t num_output_elements = N * C_out * H_out * W_out; + + auto& ref_data = output_spec.get_ref_float_data(); + ref_data.resize(num_output_elements); + + // Perform activation, weight, and output quantized depthwise conv2d operation + for (int64_t n = 0; n < N; ++n) { + for (int64_t out_c = 0; out_c < C_out; ++out_c) { + for (int64_t out_h = 0; out_h < H_out; ++out_h) { + for (int64_t out_w = 0; out_w < W_out; ++out_w) { + int32_t int_sum = 0; + int32_t weight_sum = 0; // Track weight sum on the fly + + // For depthwise convolution, each output channel corresponds to one + // input channel + int64_t in_c = out_c; + + // Convolution operation with integer accumulation + for (int64_t kh = 0; kh < K_h; ++kh) { + for (int64_t kw = 0; kw < K_w; ++kw) { + // Calculate input position with dilation + int64_t in_h = out_h * stride_h - pad_h + kh * dilation_h; + int64_t in_w = out_w * stride_w - pad_w + kw * dilation_w; + + // Check bounds (zero padding) + if (in_h >= 0 && in_h < H_in && in_w >= 0 && in_w < W_in) { + // Get input value and quantize to int8 + int64_t input_idx = n * (C_in * H_in * W_in) + + in_c * (H_in * W_in) + in_h * W_in + in_w; + + float quant_input_f = + std::round(input_data[input_idx] / input_scale) + + input_zero_point; + quant_input_f = + std::min(std::max(quant_input_f, -128.0f), 127.0f); + int8_t quantized_input = static_cast(quant_input_f); + + // Get quantized weight using depthwise layout [K_h, K_w, OC] + int64_t weight_idx = kh * (K_w * C_out) + kw * C_out + out_c; + int8_t quantized_weight = weight_data[weight_idx]; + + if (false && in_w == 0 && in_h == 0 && out_c == 0) { + std::cout << "input: " << 
input_data[input_idx] << std::endl; + std::cout << "quantized_input: " << (int)quantized_input + << std::endl; + std::cout << "quantized_weight: " << (int)quantized_weight + << std::endl; + } + // Integer multiplication and accumulation + int_sum += static_cast(quantized_input) * + static_cast(quantized_weight); + + // Track weight sum for this output channel on the fly + weight_sum += static_cast(quantized_weight); + } else { + // For zero padding, we still need to account for the weight + // in weight_sum when input is effectively 0 (but quantized 0 + // is input_zero_point) + int64_t weight_idx = kh * (K_w * C_out) + kw * C_out + out_c; + int8_t quantized_weight = weight_data[weight_idx]; + + // Add contribution from zero-padded input (quantized zero = + // input_zero_point) + int_sum += static_cast(input_zero_point) * + static_cast(quantized_weight); + + // Track weight sum for this output channel on the fly + weight_sum += static_cast(quantized_weight); + } + } + } + + // Convert accumulated integer result to float and apply scales + // Final result = (int_sum - zero_point_correction) * input_scale * + // weight_scale + bias zero_point_correction = input_zero_point * + // sum_of_weights_for_this_output_channel + int32_t zero_point_correction = input_zero_point * weight_sum; + int32_t accum_adjusted = int_sum - zero_point_correction; + float float_result = + accum_adjusted * input_scale * weight_scales_data[out_c]; + + // Add bias and store result + float_result += bias_data[out_c]; + + // Quantize the output to int8 + float quant_output_f = + std::round(float_result / output_scale) + output_zero_point; + quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f); + int8_t quantized_output = static_cast(quant_output_f); + + if (false && out_c < 4 && out_h < 1 && out_w < 4) { + std::cout << "int_sum[" << out_c << ", " << out_h << ", " << out_w + << "] = " << int_sum << ", " << float_result << ", " + << output_scale << ", " << quant_output_f << 
std::endl; + } + + // Dequantize back to float + float dequant_output = + (static_cast(quantized_output) - output_zero_point) * + output_scale; + + int64_t output_idx = n * (C_out * H_out * W_out) + + out_c * (H_out * W_out) + out_h * W_out + out_w; + ref_data[output_idx] = dequant_output; + } + } + } + } +} + +void reference_impl(TestCase& test_case) { + conv2d_q8ta_q8csw_q8to_dw_reference_impl(test_case); +} + +// Custom FLOP calculator for quantized depthwise conv2d operation +int64_t quantized_conv2d_dw_flop_calculator(const TestCase& test_case) { + int kernel_idx = 9; // kernel_size is at index 9 for q8ta_q8csw_q8to + + // Get input and weight dimensions + const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); + const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes(); + + const auto& kernel_sizes = test_case.inputs()[kernel_idx].get_int32_data(); + + int64_t N = input_sizes[0]; + int64_t C_out = output_sizes[1]; + int64_t K_h = kernel_sizes[0]; + int64_t K_w = kernel_sizes[1]; + int64_t H_out = output_sizes[2]; + int64_t W_out = output_sizes[3]; + + // Calculate FLOPs for quantized depthwise conv2d operation + // Each output element requires: + // - K_h * K_w multiply-accumulate operations (only one input channel per + // output channel) + // - Additional operations for quantization/dequantization + int64_t output_elements = N * C_out * H_out * W_out; + int64_t ops_per_output = K_h * K_w; + + int64_t flop = output_elements * ops_per_output; + + return flop; +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout + << "Quantized Depthwise Conv2d Operation with Output Quantization Prototyping Framework" + << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = reference_impl; + + // Execute test cases using the new framework with custom FLOP calculator + auto results = 
execute_test_cases( + generate_quantized_conv2d_dw_test_cases, + quantized_conv2d_dw_flop_calculator, + "QuantizedDepthwiseInt8Conv2d", + 0, + 1, + ref_fn); + + return 0; +} diff --git a/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp b/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp new file mode 100644 index 00000000000..5799bc194c9 --- /dev/null +++ b/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp @@ -0,0 +1,265 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include +#include +#include "utils.h" + +using namespace executorch::vulkan::prototyping; + +// Utility function to create a test case for quantized add operation +TestCase create_quantized_add_test_case( + const std::vector& sizes, + utils::StorageType storage_type, + vkapi::ScalarType input_dtype) { + TestCase test_case; + + // Create a descriptive name for the test case + std::string size_str = ""; + for (size_t i = 0; i < sizes.size(); ++i) { + size_str += std::to_string(sizes[i]); + if (i < sizes.size() - 1) + size_str += "x"; + } + + std::string storage_str = + (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; + std::string dtype_str = (input_dtype == vkapi::kFloat) ? 
"Float" : "Half"; + + std::string test_name = + "QuantizedAdd_" + size_str + "_" + storage_str + "_" + dtype_str; + test_case.set_name(test_name); + + // Set the operator name for the test case + test_case.set_operator_name("et_vk.add_q8ta_q8ta_q8to.test"); + + // Input tensor A (float/half) + ValueSpec input_a( + sizes, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::RANDOM); + + // Input tensor B (float/half) + ValueSpec input_b( + sizes, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::RANDOM); + + // Quantization parameters for input A + float input_a_scale_val = 0.007843; // 2/255 approximately + ValueSpec input_a_scale(input_a_scale_val); + + int32_t input_a_zero_point_val = 3; + ValueSpec input_a_zero_point(input_a_zero_point_val); + + // Quantization parameters for input B + float input_b_scale_val = 0.009412; // 2.4/255 approximately + ValueSpec input_b_scale(input_b_scale_val); + + int32_t input_b_zero_point_val = -2; + ValueSpec input_b_zero_point(input_b_zero_point_val); + + // Output quantization parameters + float output_scale_val = 0.015686; // 4/255 approximately + ValueSpec output_scale(output_scale_val); + + int32_t output_zero_point_val = 1; + ValueSpec output_zero_point(output_zero_point_val); + + // Alpha parameter + float alpha_val = 1.0f; + ValueSpec alpha(alpha_val); + + // Output tensor (float/half) + ValueSpec output( + sizes, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::ZEROS); + + // Add all specs to test case for q8ta_q8ta_q8to add operation + test_case.add_input_spec(input_a); + test_case.add_input_spec(input_b); + test_case.add_input_spec(input_a_scale); + test_case.add_input_spec(input_a_zero_point); + test_case.add_input_spec(input_b_scale); + test_case.add_input_spec(input_b_zero_point); + test_case.add_input_spec(output_scale); + test_case.add_input_spec(output_zero_point); + test_case.add_input_spec(alpha); + + test_case.add_output_spec(output); + + 
test_case.set_abs_tolerance(output_scale_val + 1e-4f); + + return test_case; +} + +// Generate test cases for quantized add operation +std::vector generate_quantized_add_test_cases() { + std::vector test_cases; + + // Define different input size configurations + std::vector> size_configs = { + {3, 32, 32}, // Small square + {8, 64, 64}, // Medium square + {16, 16, 16}, // 3D cube + {8, 32, 16}, // 3D rectangular + {7, 7, 13}, // Irregular sizes + }; + + // Storage types to test + std::vector storage_types = {utils::kTexture3D}; + + // Data types to test + std::vector data_types = {vkapi::kFloat}; + + // Generate test cases for each combination + for (const auto& sizes : size_configs) { + for (const auto& storage_type : storage_types) { + for (const auto& data_type : data_types) { + test_cases.push_back( + create_quantized_add_test_case(sizes, storage_type, data_type)); + } + } + } + + return test_cases; +} + +// Reference implementation for quantized add operation +void add_q8ta_q8ta_q8to_reference_impl(TestCase& test_case) { + // Extract input specifications + int32_t idx = 0; + const ValueSpec& input_a_spec = test_case.inputs()[idx++]; + const ValueSpec& input_b_spec = test_case.inputs()[idx++]; + const ValueSpec& input_a_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& input_a_zero_point_spec = test_case.inputs()[idx++]; + const ValueSpec& input_b_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& input_b_zero_point_spec = test_case.inputs()[idx++]; + const ValueSpec& output_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& output_zero_point_spec = test_case.inputs()[idx++]; + const ValueSpec& alpha_spec = test_case.inputs()[idx++]; + + // Extract output specification (mutable reference) + ValueSpec& output_spec = test_case.outputs()[0]; + + // Get tensor dimensions + auto input_sizes = input_a_spec.get_tensor_sizes(); + int64_t num_elements = input_a_spec.numel(); + + if (input_a_spec.dtype != vkapi::kFloat) { + throw 
std::invalid_argument("Unsupported dtype"); + } + + // Get raw data pointers + auto& input_a_data = input_a_spec.get_float_data(); + auto& input_b_data = input_b_spec.get_float_data(); + + const float input_a_scale = input_a_scale_spec.get_float_value(); + const int32_t input_a_zero_point = input_a_zero_point_spec.get_int_value(); + const float input_b_scale = input_b_scale_spec.get_float_value(); + const int32_t input_b_zero_point = input_b_zero_point_spec.get_int_value(); + const float output_scale = output_scale_spec.get_float_value(); + const int32_t output_zero_point = output_zero_point_spec.get_int_value(); + const float alpha = alpha_spec.get_float_value(); + + auto& ref_data = output_spec.get_ref_float_data(); + ref_data.resize(num_elements); + + // Perform quantized add operation + for (int64_t i = 0; i < num_elements; ++i) { + // Quantize input A to int8 + float quant_a_f = + std::round(input_a_data[i] / input_a_scale) + input_a_zero_point; + quant_a_f = std::min(std::max(quant_a_f, -128.0f), 127.0f); + int8_t quantized_a = static_cast(quant_a_f); + + // Quantize input B to int8 + float quant_b_f = + std::round(input_b_data[i] / input_b_scale) + input_b_zero_point; + quant_b_f = std::min(std::max(quant_b_f, -128.0f), 127.0f); + int8_t quantized_b = static_cast(quant_b_f); + + // Dequantize both inputs to a common scale for addition + float dequant_a = + (static_cast(quantized_a) - input_a_zero_point) * input_a_scale; + float dequant_b = + (static_cast(quantized_b) - input_b_zero_point) * input_b_scale; + + // Perform addition in float space with alpha + float float_result = dequant_a + alpha * dequant_b; + + // Quantize the result to int8 + float quant_output_f = + std::round(float_result / output_scale) + output_zero_point; + quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f); + int8_t quantized_output = static_cast(quant_output_f); + + // Dequantize back to float for comparison + float dequant_output = + (static_cast(quantized_output) 
- output_zero_point) * + output_scale; + + ref_data[i] = dequant_output; + } +} + +void reference_impl(TestCase& test_case) { + add_q8ta_q8ta_q8to_reference_impl(test_case); +} + +// Custom FLOP calculator for quantized add operation +int64_t quantized_add_flop_calculator(const TestCase& test_case) { + // Calculate total elements from the first input tensor + int64_t total_elements = 1; + if (!test_case.empty() && test_case.num_inputs() > 0 && + test_case.inputs()[0].is_tensor()) { + const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); + for (int64_t size : sizes) { + total_elements *= size; + } + } + + // Quantized add operation includes: + // - 2 quantizations (float to int8) + // - 2 dequantizations (int8 to float) + // - 1 addition + // For simplicity, we count this as 1 FLOP per element (the addition) + return total_elements; +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "Quantized Add Operation (q8ta_q8ta_q8to) Prototyping Framework" + << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = reference_impl; + + // Execute test cases using the new framework with custom FLOP calculator + auto results = execute_test_cases( + generate_quantized_add_test_cases, + quantized_add_flop_calculator, + "QuantizedAddQ8taQ8taQ8to", + 0, + 1, + ref_fn); + + return 0; +} diff --git a/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations.cpp b/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations.cpp new file mode 100644 index 00000000000..5275e6c9335 --- /dev/null +++ b/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations.cpp @@ -0,0 +1,251 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include +#include +#include +#include +#include "utils.h" + +#include + +using namespace executorch::vulkan::prototyping; +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 512; + +// QDQ8TA Conv2D configuration struct for 4D tensor quantize-dequantize testing +struct QDQ8TAConv2DConfig { + int64_t batch_size; // N dimension + int64_t in_channels; // C dimension + int64_t height; // H dimension + int64_t width; // W dimension + std::string test_case_name = "placeholder"; + std::string op_name = "qdq8ta_conv2d_input"; +}; + +// Utility function to create a test case from a QDQ8TAConv2DConfig +TestCase create_test_case_from_config( + const QDQ8TAConv2DConfig& config, + utils::StorageType storage_type, + vkapi::ScalarType input_dtype) { + TestCase test_case; + + // Create a descriptive name for the test case + std::string storage_str = + (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; + std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; + + std::string test_name = + config.test_case_name + "_" + storage_str + "_" + dtype_str; + test_case.set_name(test_name); + + // Set the operator name for the test case + std::string operator_name = "etvk." 
+ config.op_name + ".default"; + test_case.set_operator_name(operator_name); + + // Input tensor (float) - [N, C, H, W] + std::vector input_size = { + config.batch_size, config.in_channels, config.height, config.width}; + ValueSpec input_tensor( + input_size, + input_dtype, + storage_type, + utils::kChannelsPacked, // Use channels packed for conv2d tensors + DataGenType::RANDOM); + + float scale_val = 0.007112; + ValueSpec scale(scale_val); + + // Generate random zero point within quantization range + int32_t zero_point_val = -2; + ValueSpec zero_point(zero_point_val); + + // Output tensor (float) - same shape as input [N, C, H, W] + ValueSpec output_tensor( + input_size, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::ZEROS); + + // Add all specs to test case + test_case.add_input_spec(input_tensor); + test_case.add_input_spec(scale); + test_case.add_input_spec(zero_point); + test_case.add_output_spec(output_tensor); + + test_case.set_abs_tolerance(scale_val + 1e-4); + + return test_case; +} + +// Generate easy test cases for qdq8ta_conv2d operation (for debugging) +std::vector generate_qdq8ta_conv2d_easy_cases() { + std::vector test_cases; + + // Single simple configuration for debugging + QDQ8TAConv2DConfig config = { + 1, // batch_size + 3, // in_channels + 4, // height + 4, // width + "simple", // test_case_name + }; + + // Test with both storage types + std::vector storage_types = {utils::kTexture3D}; + std::vector float_types = {vkapi::kFloat}; + + // Generate test cases for each combination + for (const auto& storage_type : storage_types) { + for (const auto& input_dtype : float_types) { + test_cases.push_back( + create_test_case_from_config(config, storage_type, input_dtype)); + } + } + + return test_cases; +} + +// Generate test cases for qdq8ta_conv2d operation +std::vector generate_qdq8ta_conv2d_test_cases() { + std::vector test_cases; + + std::vector configs = { + // Small test cases for correctness + {1, 3, 16, 16}, + {1, 8, 
32, 32}, + {1, 16, 24, 24}, + {1, 32, 12, 12}, + {1, 1, 64, 64}, + {1, 3, 64, 64}, + {1, 4, 16, 16}, + + // Different tensor sizes + {1, 8, 20, 20}, + {1, 16, 14, 14}, + {1, 8, 28, 28}, + + // Odd tensor sizes + {1, 3, 15, 15}, + {1, 13, 31, 31}, + {1, 17, 23, 23}, + + // Performance test cases (larger tensors) + {1, 64, 128, 128}, + {1, 32, 64, 64}, + {1, 128, 56, 56}, + }; + + // Test with different storage types + std::vector storage_types = {utils::kTexture3D}; + + for (auto config : configs) { + std::string prefix = + (config.batch_size < kRefDimSizeLimit && + config.in_channels < kRefDimSizeLimit && + config.height < kRefDimSizeLimit && config.width < kRefDimSizeLimit) + ? "correctness_" + : "performance_"; + std::string generated_test_case_name = prefix + + std::to_string(config.batch_size) + "_" + + std::to_string(config.in_channels) + "_" + + std::to_string(config.height) + "_" + std::to_string(config.width); + + config.test_case_name = generated_test_case_name; + + for (const auto& storage_type : storage_types) { + test_cases.push_back( + create_test_case_from_config(config, storage_type, vkapi::kFloat)); + } + } + + return test_cases; +} + +// Reference implementation for qdq8ta_conv2d operation +void qdq8ta_conv2d_reference_impl(TestCase& test_case) { + int32_t idx = 0; + const ValueSpec& input_spec = test_case.inputs()[idx++]; + const ValueSpec& scale_spec = test_case.inputs()[idx++]; + const ValueSpec& zero_point_spec = test_case.inputs()[idx++]; + + // Extract output specification + ValueSpec& output_spec = test_case.outputs()[0]; + + // Get tensor dimensions + auto input_sizes = input_spec.get_tensor_sizes(); // [N, C, H, W] + int64_t N = input_sizes[0]; + int64_t C = input_sizes[1]; + int64_t H = input_sizes[2]; + int64_t W = input_sizes[3]; + + // Skip for large tensors since computation time will be extremely slow + if (N > kRefDimSizeLimit || C > kRefDimSizeLimit || H > kRefDimSizeLimit || + W > kRefDimSizeLimit) { + throw std::invalid_argument( 
+ "One or more dimensions (N, C, H, W) exceed the allowed limit for reference implementation."); + } + + if (input_spec.dtype != vkapi::kFloat) { + throw std::invalid_argument("Unsupported dtype"); + } + + // Get raw data pointers + auto& input_data = input_spec.get_float_data(); + + // Extract the randomized scale and zero point values (following + // q8csw_conv2d.cpp pattern) + float scale = scale_spec.get_float_value(); + int32_t zero_point = zero_point_spec.get_int_value(); + int32_t quant_min = -128; + int32_t quant_max = 127; + + // Prepare output data + auto& ref_data = output_spec.get_ref_float_data(); + int64_t num_elements = N * C * H * W; + ref_data.resize(num_elements); + + // Perform quantize-dequantize operation on each element + for (int64_t i = 0; i < num_elements; ++i) { + float input_val = input_data[i]; + + // Quantize: quantized = round(input / scale + zero_point) + float quantized_float = std::round(input_val / scale) + zero_point; + + // Clamp to quantization range + quantized_float = std::max(quantized_float, static_cast(quant_min)); + quantized_float = std::min(quantized_float, static_cast(quant_max)); + + int32_t quantized_int = static_cast(quantized_float); + + // Dequantize: output = (quantized - zero_point) * scale + float dequantized = (quantized_int - zero_point) * scale; + + ref_data[i] = dequantized; + } +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "QDQ8TA Conv2D Operation Prototyping Framework" << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = qdq8ta_conv2d_reference_impl; + + auto results = execute_test_cases( + generate_qdq8ta_conv2d_test_cases, "QDQ8TAConv2D", 0, 1, ref_fn); + + return 0; +} diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index 3162857c2d3..4ef1cdd7fed 100644 --- 
a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -60,9 +60,11 @@ def define_common_targets(is_fbcode = False): ], headers = [ "utils.h", + "conv2d_utils.h", ], exported_headers = [ "utils.h", + "conv2d_utils.h", ], platforms = get_platforms(), deps = [ @@ -97,3 +99,7 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("q8csw_conv2d") define_custom_op_test_binary("choose_qparams_per_row") define_custom_op_test_binary("q4gsw_linear") + define_custom_op_test_binary("qdq8ta_conv2d_activations") + define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d") + define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d_dw") + define_custom_op_test_binary("q8ta_q8ta_q8to_add") diff --git a/backends/vulkan/test/custom_ops/utils.cpp b/backends/vulkan/test/custom_ops/utils.cpp index 2aa827a4d5a..4de6c32ac25 100644 --- a/backends/vulkan/test/custom_ops/utils.cpp +++ b/backends/vulkan/test/custom_ops/utils.cpp @@ -661,7 +661,12 @@ float collect_gpu_timing_us(ComputeGraph& graph) { float total_duration_us = 0.0f; for (const auto& shader_result : results) { if (shader_result.kernel_name.find("nchw_to") == std::string::npos && - shader_result.kernel_name.find("to_nchw") == std::string::npos) { + shader_result.kernel_name.find("to_nchw") == std::string::npos && + shader_result.kernel_name.find( + "quantize_and_pack_q8ta_conv2d_input") == std::string::npos && + shader_result.kernel_name.find( + "unpack_and_dequantize_q8ta_conv2d_output") == + std::string::npos) { // Calculate duration from start and end times, convert from ns to μs uint64_t duration_ns = shader_result.end_time_ns - shader_result.start_time_ns; @@ -1715,6 +1720,41 @@ void compute_weight_sums( } } +// Compute weight sums for 4D quantized conv2d operations +// Weight layout: [C_out, K_h, K_w, align_up_4(C_in_per_group)] +void compute_weight_sums_4d( + ValueSpec& weight_sums, + const ValueSpec& quantized_weight, + int64_t out_channels, + int64_t kernel_h, + 
int64_t kernel_w, + int64_t aligned_in_channels) { + auto& weight_sums_data = weight_sums.get_int32_data(); + auto& quantized_weight_data = quantized_weight.get_int8_data(); + + weight_sums_data.resize(out_channels); + + // For each output channel, compute the sum of quantized weights + for (int64_t out_c = 0; out_c < out_channels; ++out_c) { + int32_t sum = 0; + + for (int64_t kh = 0; kh < kernel_h; ++kh) { + for (int64_t kw = 0; kw < kernel_w; ++kw) { + for (int64_t in_c = 0; in_c < aligned_in_channels; ++in_c) { + // Weight indexing: [out_c, kh, kw, in_c] + int64_t weight_idx = + out_c * (kernel_h * kernel_w * aligned_in_channels) + + kh * (kernel_w * aligned_in_channels) + kw * aligned_in_channels + + in_c; + sum += static_cast(quantized_weight_data[weight_idx]); + } + } + } + + weight_sums_data[out_c] = sum; + } +} + // Helper function to unpack 4-bit values from uint8 (same as in // q4gsw_linear.cpp) std::pair unpack_4bit_utils(uint8_t packed) { diff --git a/backends/vulkan/test/custom_ops/utils.h b/backends/vulkan/test/custom_ops/utils.h index f1736f1d144..b80f28639e8 100644 --- a/backends/vulkan/test/custom_ops/utils.h +++ b/backends/vulkan/test/custom_ops/utils.h @@ -653,6 +653,16 @@ void compute_weight_sums( int64_t out_features, int64_t elements_per_output_feature); +// Compute weight sums for 4D quantized conv2d operations +// Weight layout: [C_out, K_h, K_w, align_up_4(C_in_per_group)] +void compute_weight_sums_4d( + ValueSpec& weight_sums, + const ValueSpec& quantized_weight, + int64_t out_channels, + int64_t kernel_h, + int64_t kernel_w, + int64_t aligned_in_channels); + // Compute weight sums for 4-bit group symmetric quantized weights void compute_weight_sums_4bit_grouped( ValueSpec& weight_sums, diff --git a/backends/vulkan/test/scripts/test_model.sh b/backends/vulkan/test/scripts/test_model.sh index 5f06d2c039b..40ec88bae70 100755 --- a/backends/vulkan/test/scripts/test_model.sh +++ b/backends/vulkan/test/scripts/test_model.sh @@ -111,6 +111,7 @@ 
build_core_libraries_and_devtools() { -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ diff --git a/backends/vulkan/test/scripts/test_op.sh b/backends/vulkan/test/scripts/test_op.sh index 1ec07b7f75f..797089e54dc 100755 --- a/backends/vulkan/test/scripts/test_op.sh +++ b/backends/vulkan/test/scripts/test_op.sh @@ -138,6 +138,7 @@ build_core_libraries() { -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index f8194f0b32c..f92cea64767 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -2482,6 +2482,7 @@ def forward(self, x): rtol=1e-1, ) + @unittest.skip("Cannot run on swiftshader due to no integer dot product support") def test_vulkan_backend_xnnpack_pt2e_quantized_conv_sequence(self): """ Test a sequence of convolution layers quantized with PT2E quantization. @@ -2572,6 +2573,7 @@ def forward(self, x): rtol=1e-1, ) + @unittest.skip("Cannot run on swiftshader due to no integer dot product support") def test_vulkan_backend_xnnpack_pt2e_quantized_conv_sequence_all_reduced(self): """ Test a sequence of convolution layers quantized with PT2E quantization. 
diff --git a/backends/vulkan/test/test_vulkan_passes.py b/backends/vulkan/test/test_vulkan_passes.py index 4a30ab6c2de..438126a179f 100644 --- a/backends/vulkan/test/test_vulkan_passes.py +++ b/backends/vulkan/test/test_vulkan_passes.py @@ -3,15 +3,8 @@ import torch -from executorch.backends.transforms.addmm_mm_to_linear import AddmmToLinearTransform -from executorch.backends.vulkan._passes import FuseQuantizedOpsTransform from executorch.backends.vulkan._passes.fuse_patterns import FusePatternsPass -from executorch.backends.vulkan.quantizer.vulkan_quantizer import ( - get_symmetric_quantization_config, - VulkanQuantizer, -) - from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge from executorch.exir.backend.canonical_partitioners.config_partitioner import ( @@ -94,66 +87,6 @@ def op_node_count(graph_module: torch.fx.GraphModule, canonical_op_name: str) -> class TestVulkanPasses(unittest.TestCase): - def test_fuse_int8pack_mm(self): - K = 256 - N = 256 - model = SingleLinearModule(K, N) - sample_inputs = model.get_sample_inputs() - - quantizer = VulkanQuantizer() - quantizer.set_global( - get_symmetric_quantization_config(is_dynamic=False, weight_bits=8) - ) - - edge_manager = quantize_and_lower_module( - model, - sample_inputs, - quantizer, - ) - - ep = edge_manager._edge_programs["forward"] - edge_manager.transform( - [ - AddmmToLinearTransform(), - FuseQuantizedOpsTransform(ep), - ] - ) - - gm = ep.graph_module - - self.assertEqual(op_node_count(gm, "_weight_int8pack_mm.default"), 1) - self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0) - - def test_fuse_linear_qcs4w(self): - K = 256 - N = 256 - model = SingleLinearModule(K, N) - sample_inputs = model.get_sample_inputs() - - quantizer = VulkanQuantizer() - quantizer.set_global( - get_symmetric_quantization_config(is_dynamic=False, weight_bits=4) - ) - - edge_manager = quantize_and_lower_module( - model, - sample_inputs, - quantizer, - ) - - ep = 
edge_manager._edge_programs["forward"] - edge_manager.transform( - [ - AddmmToLinearTransform(), - FuseQuantizedOpsTransform(ep), - ] - ) - - gm = ep.graph_module - - self.assertEqual(op_node_count(gm, "linear_qcs4w.default"), 1) - self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0) - def test_fuse_rotary_emb(self): """Test conversion of rotary embedding pattern to et_vk.apply_rotary_emb custom op.""" @@ -238,7 +171,8 @@ def _reshape_for_broadcast(self, freqs_cis: torch.Tensor, x: torch.Tensor): # Apply the rotary embedding pass ep = edge_manager._edge_programs["forward"] - rotary_pass = FusePatternsPass(ep) + rotary_pass = FusePatternsPass() + rotary_pass._exported_program = ep result = rotary_pass.call(ep.graph_module) # Verify that the pass was successful diff --git a/backends/vulkan/test/utils.py b/backends/vulkan/test/utils.py index 41c1d92bd00..a887c53473a 100644 --- a/backends/vulkan/test/utils.py +++ b/backends/vulkan/test/utils.py @@ -90,7 +90,9 @@ def export_model_to_vulkan( qmode=QuantizationMode.NONE, ): compile_options = {} - exported_graph = get_exported_graph(model, sample_inputs, qmode=qmode) + exported_graph = get_exported_graph( + model, sample_inputs, dynamic_shapes=dynamic_shapes, qmode=qmode + ) program = export( exported_graph, sample_inputs, @@ -303,13 +305,13 @@ def run_and_check_output( Returns: bool: True if outputs match within tolerance, False otherwise """ - # Load the ExecutorTorch program + # Load the ExecuTorch program executorch_module = _load_for_executorch_from_buffer(executorch_program.buffer) # Flatten inputs for execution inputs_flattened, _ = tree_flatten(sample_inputs) - # Run the ExecutorTorch program + # Run the ExecuTorch program model_output = executorch_module.run_method("forward", tuple(inputs_flattened)) # Generate reference outputs using the reference model diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 
a193d02da88..189562178a7 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -187,6 +187,8 @@ std::vector get_reference_strides( default: return {}; } + default: + VK_THROW("Unsupported memory layout: ", layout); } return {}; } diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py index 96f200eecbc..09c57f649ae 100644 --- a/backends/vulkan/utils.py +++ b/backends/vulkan/utils.py @@ -128,7 +128,7 @@ def is_param_node(program: ExportedProgram, node: torch.fx.Node) -> bool: is_get_attr_node(node) or is_param(program, node) or is_buffer(program, node) - or is_constant(program, node) + or is_lifted_tensor_constant(program, node) ) @@ -206,6 +206,8 @@ def is_tensor_arg_node(node: Any) -> bool: if isinstance(node, torch.fx.Node): return is_tensor_node(node) elif isinstance(node, (list, tuple)): + if len(node) == 0: + return False return all(is_tensor_node(n) for n in node) return False @@ -348,6 +350,8 @@ def find_quant_user(node: torch.fx.Node) -> Optional[torch.fx.Node]: VkMemoryLayout.TENSOR_WIDTH_PACKED, VkMemoryLayout.TENSOR_HEIGHT_PACKED, VkMemoryLayout.TENSOR_CHANNELS_PACKED, + VkMemoryLayout.PACKED_INT8_4W4C, + VkMemoryLayout.PACKED_INT8_4H4W, } MemoryLayoutSet = Set[VkMemoryLayout] @@ -400,6 +404,12 @@ def required_image_extents(sizes: torch.Size, layout: VkMemoryLayout) -> ImageEx height = (height + 3) // 4 elif layout == VkMemoryLayout.TENSOR_CHANNELS_PACKED: channels = (channels + 3) // 4 + elif layout == VkMemoryLayout.PACKED_INT8_4W4C: + width = (width + 3) // 4 + channels = (channels + 3) // 4 + elif layout == VkMemoryLayout.PACKED_INT8_4H4W: + height = (height + 3) // 4 + width = (width + 3) // 4 else: raise RuntimeError(f"Unsupported memory layout {layout}") @@ -692,6 +702,8 @@ def make_filtered_tensor_repset( ## Convenience TensorRepSet definitions +PACKED_INT8_4W4C_BUFFER = TensorRepSet({VkMemoryLayout.PACKED_INT8_4W4C}, set()) + CONTIGUOUS_ANY = TensorRepSet( 
{VkMemoryLayout.TENSOR_WIDTH_PACKED}, {VkMemoryLayout.TENSOR_WIDTH_PACKED} ) @@ -1218,6 +1230,16 @@ def is_in_8bit_range(tensor: torch.Tensor) -> bool: ## +def nchw_dim_to_whcn_dim(nchw_dim: int, ndim: int) -> int: + # Handle negative indices for nchw_dim + if nchw_dim < 0: + nchw_dim += ndim + + assert nchw_dim >= 0 and nchw_dim < ndim + whcn_dim = (ndim - 1) - nchw_dim + return whcn_dim + + def get_tensor_val_str(tensor_val: FakeTensor) -> str: return f"{tensor_val.dtype}: {tensor_val.shape}" @@ -1269,6 +1291,7 @@ def update_program_state_dict( updated_tensor: torch.Tensor, ) -> None: target_name = None + kind = None # Iterate over all the tensors in the graph signature, and find # the one corresponding to the parameter/buffer name for input_ in program.graph_signature.input_specs: @@ -1277,6 +1300,7 @@ def update_program_state_dict( and isinstance(input_.arg, TensorArgument) and input_.arg.name == buffer_name ): + kind = input_.kind target_name = input_.target break @@ -1286,6 +1310,9 @@ def update_program_state_dict( ), f"could not find {buffer_name} in source program signature" assert target_name in program.state_dict, f"could not find {target_name}" + if kind == InputKind.PARAMETER: + updated_tensor = torch.nn.Parameter(updated_tensor, requires_grad=False) + # Finally, overwrite the current tensor with updated tensor program.state_dict[target_name] = updated_tensor diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 95da66494e0..876f7fa8900 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -8,7 +8,7 @@ from functools import partial -from typing import Any, Dict, final, List +from typing import Any, Callable, Dict, final, List import executorch.backends.vulkan.utils as utils @@ -24,6 +24,7 @@ insert_prepack_nodes, RemoveLocalScalarDenseOpsTransform, RemoveRedundantOpsTransform, + ReplaceQDQPass, SqueezeUnsqueezeInputs, TagMemoryMetaPass, ) @@ -55,7 +56,9 @@ from 
executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass -from executorch.exir.program._program import _copy_module +from executorch.exir.program._program import _transform + +from torch._export.verifier import Verifier from torch.export._remove_auto_functionalized_pass import ( unsafe_remove_auto_functionalized_pass, @@ -64,28 +67,34 @@ DEFAULT_DEBUG_HANDLE = 65535 +class _any_op(Verifier): + # Set training dialect to skip functional check in base verifier + dialect = "TRAINING" + + def allowed_op_types(self): + return (Callable,) + + # pyre-ignore def apply_passes(program: ExportedProgram, passes) -> ExportedProgram: for p in passes: - if issubclass(type(p), ExportPass) or issubclass(type(p), PassBase): - new_gm = program.graph_module - # This is a workaround to allow the memory planning pass to work without - # having to first apply ToOutVarPass(). See the `greedy()` function in - # `exir.memory_planning`; if this attribute isn't set, assertions in - # `collect_spec_from_nodes()` will fail. - if isinstance(p, MemoryPlanningPass): - new_gm.encounter_to_out_var_failure = True - - new_gm_res = p(new_gm) - assert new_gm_res is not None - new_gm = new_gm_res.graph_module - + if isinstance(p, MemoryPlanningPass) and hasattr(p, "run"): + p.run(program.graph_module) + + elif issubclass(type(p), ExportPass) or issubclass(type(p), PassBase): + # Some passes require the ep to be provided. However, since the ep may be + # updated with each pass applied, the ep must be set right before calling + # the pass. _exported_program is the attribute used by XNNPACK and Vulkan + # passes to store the exported program. + if hasattr(p, "_exported_program"): + p._exported_program = program + + program = _transform(program, p, override_verifiers=[_any_op]) # See the application of this function in exir/program/_program.py for more # details on why this step is necessary. 
if isinstance(p, SpecPropPass): - p.update_placeholder_tensor_specs(program, new_gm) + p.update_placeholder_tensor_specs(program, program.graph_module) - _copy_module(program.graph_module, new_gm) else: program = p(program) @@ -158,16 +167,17 @@ def preprocess( # noqa: C901 program = apply_passes( program, [ - FusePatternsPass(program), - RemoveRedundantOpsTransform(), + FuseBatchNormPass(program), + FusePatternsPass(), + FuseClampPass(), AddmmToLinearTransform(), - FuseQuantizedOpsTransform(program), - FoldQDQPass(program), + RemoveRedundantOpsTransform(), + FuseQuantizedOpsTransform(), + ReplaceQDQPass(), + FoldQDQPass(), SqueezeUnsqueezeInputs(), FuseViewCopyTransform(), ViewCopyToSqueezeUnsqueezePass(), - FuseBatchNormPass(program), - FuseClampPass(), ], ) @@ -213,6 +223,11 @@ def preprocess( # noqa: C901 mem_planning_suite = MemoryPlanningAlgorithmSuite( algo_list=[greedy_memory_planning] ) + # This is a workaround to allow the memory planning pass to work without having + # to first apply ToOutVarPass(). See the `greedy()` function in + # `exir.memory_planning`; if this attribute isn't set, assertions in + # `collect_spec_from_nodes()` will fail. 
+ program.graph_module.encounter_to_out_var_failure = True program = apply_passes( program, [ diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py index 85e9889ca36..c1bc3a54f7c 100644 --- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py +++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py @@ -110,7 +110,9 @@ def is_nhwc_node(node: torch.fx.Node) -> bool: if len(quantize_node.all_input_nodes) > 0: actual_node = quantize_node.args[0] if actual_node.op == "placeholder": - return not actual_node.meta["val"][0].is_contiguous() + return ChannelsLastTaggedReshapePass._is_nhwc_tensor( + actual_node.meta["val"][0] + ) else: return actual_node.meta.get( ChannelsLastTaggedReshapePass.XNN_NHWC_NODE, False @@ -125,7 +127,9 @@ def is_nchw_node(node: torch.fx.Node) -> bool: if len(quantize_node.all_input_nodes) > 0: actual_node = quantize_node.args[0] if actual_node.op == "placeholder": - return actual_node.meta["val"][0].is_contiguous() + return not ChannelsLastTaggedReshapePass._is_nhwc_tensor( + actual_node.meta["val"][0] + ) else: return not actual_node.meta.get( ChannelsLastTaggedReshapePass.XNN_NHWC_NODE, False @@ -133,6 +137,26 @@ def is_nchw_node(node: torch.fx.Node) -> bool: return not ChannelsLastTaggedReshapePass.is_nhwc_node(node) + @staticmethod + def _is_nhwc_tensor(tensor: torch.Tensor) -> bool: + nhwc = tensor.is_contiguous(memory_format=torch.channels_last) + nchw = tensor.is_contiguous() + # if both are true false + # if both nchw and nhwc are true + # then we want to see this is nchw hence return false + # if either of nchw or nhwc is false, then just rely on hwc + # if both are false, mayb channels_last_3d, then return nhwc + # however this should not happen here + # return (not (nchw and nhwc)) and nhwc + # Readable version + if nchw and nhwc: + return False + else: + return nhwc + + def _is_nhwc(self, tensor: torch.Tensor) -> bool: + return 
ChannelsLastTaggedReshapePass._is_nhwc_tensor(tensor) + def requires_nhwc_input(self, node: torch.fx.Node) -> bool: return node.target in self.memory_sensitive_ops_nhwc @@ -315,11 +339,8 @@ def input_dim_order( self, input_node: torch.fx.Node, input_order: InputDimOrder ) -> bool: if input_node.op == "placeholder": - return ( - input_node.meta["val"].is_contiguous() - if input_order == InputDimOrder.NCHW - else not input_node.meta["val"].is_contiguous() - ) + is_nhwc = self._is_nhwc(input_node.meta["val"]) + return not is_nhwc if input_order == InputDimOrder.NCHW else is_nhwc else: return ( ChannelsLastTaggedReshapePass.is_nchw_node(input_node) @@ -348,7 +369,7 @@ def input_to_nhwc( self.mark_as_nhwc_node(input_node) if input_node.op == "placeholder": - if not input_node.meta["val"][0].is_contiguous(): + if self._is_nhwc(input_node.meta["val"][0]): return elif ChannelsLastTaggedReshapePass.is_nhwc_node(input_node): return @@ -420,7 +441,7 @@ def input_to_nchw( self.mark_as_nchw_node(input_node) if input_node.op == "placeholder": - if input_node.meta["val"].is_contiguous(): + if not self._is_nhwc(input_node.meta["val"]): return elif ChannelsLastTaggedReshapePass.is_nchw_node(input_node): return @@ -462,17 +483,17 @@ def call(self, graph_module: torch.fx.GraphModule): # noqa: C901 and isinstance(node.meta["val"], torch.Tensor) and len(node.meta["val"].shape) == 4 ): - if node.meta["val"].is_contiguous(): - self.mark_as_nchw_node(node) - else: + if self._is_nhwc(node.meta["val"]): self.mark_as_nhwc_node(node) + else: + self.mark_as_nchw_node(node) continue # Need special case for output node because it can have multiple output dim orders as we can output a tuple multiple nodes if node.op == "output": out_tuple = node.args[0] for out_node in out_tuple: - if out_node.meta["val"].is_contiguous(): + if not self._is_nhwc(out_node.meta["val"]): self.input_to_nchw(graph_module, out_node, node) else: self.input_to_nhwc(graph_module, out_node, node) diff --git 
a/backends/xnnpack/operators/__init__.py b/backends/xnnpack/operators/__init__.py index d17b7abd6a1..93424b1c84d 100644 --- a/backends/xnnpack/operators/__init__.py +++ b/backends/xnnpack/operators/__init__.py @@ -41,6 +41,7 @@ op_relu, op_rsqrt, op_sigmoid, + op_sin, op_skip_ops, op_slice_copy, op_softmax, diff --git a/backends/xnnpack/operators/op_sin.py b/backends/xnnpack/operators/op_sin.py new file mode 100644 index 00000000000..56fe9396103 --- /dev/null +++ b/backends/xnnpack/operators/op_sin.py @@ -0,0 +1,52 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict + +import torch +from executorch.backends.xnnpack.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( + XNNGraph, + XNNSin, + XNode, +) +from executorch.backends.xnnpack.utils.utils import get_input_node + + +@register_node_visitor +class SinVisitor(NodeVisitor): + target = "aten.sin.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + self.define_nodes_tensor_inputs_outputs(node, xnn_graph, vals_to_ids) + + # input + input_id = vals_to_ids[get_input_node(node, 0)] + + # output + output_id = vals_to_ids[node] + + ser_node = XNode( + xnode_union=XNNSin( + input_id=input_id, + output_id=output_id, + flags=0, + ), + debug_handle=debug_handle, + ) + xnn_graph.xnodes.append(ser_node) diff --git a/backends/xnnpack/partition/config/__init__.py b/backends/xnnpack/partition/config/__init__.py index e393f1c9ac8..86baba3e3f7 100644 --- a/backends/xnnpack/partition/config/__init__.py +++ b/backends/xnnpack/partition/config/__init__.py @@ -45,6 +45,7 @@ 
ReciprocalSquareRootConfig, ReLUConfig, SigmoidConfig, + SinConfig, SliceCopyConfig, SoftmaxConfig, SquareRootConfig, @@ -105,6 +106,7 @@ TanhConfig, ToDimOrderCopyConfig, SigmoidConfig, + SinConfig, SliceCopyConfig, SoftmaxConfig, SquareRootConfig, diff --git a/backends/xnnpack/partition/config/generic_node_configs.py b/backends/xnnpack/partition/config/generic_node_configs.py index 559d1522275..06024c632c9 100644 --- a/backends/xnnpack/partition/config/generic_node_configs.py +++ b/backends/xnnpack/partition/config/generic_node_configs.py @@ -636,3 +636,10 @@ class BMMConfig(GenericNodePartitionerConfig): def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32] + + +class SinConfig(GenericNodePartitionerConfig): + target_name = "sin.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 78eaaf6d039..b71ab08ea45 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -174,13 +174,12 @@ payload (deprecated) or via offsets to the constant_data_ptr. If no constant data associated with the tensor value, then returns nullptr. 
*/ const uint8_t* getConstantDataPtr( - const fb_xnnpack::XNNTensorValue* tensor_value, + uint32_t buffer_idx, GraphPtr flatbuffer_graph, const uint8_t* constant_data_ptr, const NamedDataMap* named_data_map, std::vector& freeable_buffers, XNNWeightsCache* weights_cache) { - auto buffer_idx = tensor_value->constant_buffer_idx(); if (buffer_idx) { if (!constant_data_ptr) { // TODO(T172265611): Remove constant_buffer in flatbuffer path after BC @@ -230,6 +229,22 @@ const uint8_t* getConstantDataPtr( return nullptr; } +const uint8_t* getConstantDataPtr( + const fb_xnnpack::XNNTensorValue* tensor_value, + GraphPtr flatbuffer_graph, + const uint8_t* constant_data_ptr, + const NamedDataMap* named_data_map, + std::vector& freeable_buffers, + XNNWeightsCache* weights_cache) { + return getConstantDataPtr( + tensor_value->constant_buffer_idx(), + flatbuffer_graph, + constant_data_ptr, + named_data_map, + freeable_buffers, + weights_cache); +} + /** Define serialized tensor value into the subgraph. While also keeping track of the remapped ids from @@ -434,22 +449,15 @@ Error defineTensor( const float* scale = qparams->scale()->data(); if (qparams->scale_buffer_idx() != 0) { - // if scales are stored in named data, then retrieve it - ConstantDataOffsetPtr scale_buffer_offset = - flatbuffer_graph->constant_data()->Get( - qparams->scale_buffer_idx()); - const std::string& data_name = - scale_buffer_offset->named_key()->str(); - Result scale_buffer = - named_data_map->get_data(data_name.c_str()); + scale = reinterpret_cast(getConstantDataPtr( + qparams->scale_buffer_idx(), + flatbuffer_graph, + constant_data_ptr, + named_data_map, + freeable_buffers, + weights_cache)); ET_CHECK_OR_RETURN_ERROR( - scale_buffer.ok(), - Internal, - "Failed to get constant data for key %s from named_data_map. 
Error code: %u", - data_name.c_str(), - static_cast(scale_buffer.error())); - scale = reinterpret_cast(scale_buffer.get().data()); - freeable_buffers.push_back(std::move(scale_buffer.get())); + scale != nullptr, Internal, "Failed to load scale data."); } status = xnn_define_channelwise_quantized_tensor_value_v2( /*subgraph=*/subgraph_ptr, @@ -483,22 +491,15 @@ Error defineTensor( // Block scales are preferably serialized as bf16 but can also be // serialized as fp32 for backwards compatability. if (qparams->scale_buffer_idx() != 0) { - ConstantDataOffsetPtr scale_buffer_offset = - flatbuffer_graph->constant_data()->Get( - qparams->scale_buffer_idx()); - const std::string& data_name = - scale_buffer_offset->named_key()->str(); - Result scale_buffer = - named_data_map->get_data(data_name.c_str()); + scale_data = reinterpret_cast(getConstantDataPtr( + qparams->scale_buffer_idx(), + flatbuffer_graph, + constant_data_ptr, + named_data_map, + freeable_buffers, + weights_cache)); ET_CHECK_OR_RETURN_ERROR( - scale_buffer.ok(), - Internal, - "Failed to get constant data for key %s from named_data_map. Error code: %u", - data_name.c_str(), - static_cast(scale_buffer.error())); - scale_data = - reinterpret_cast(scale_buffer.get().data()); - freeable_buffers.push_back(std::move(scale_buffer.get())); + scale_data != nullptr, Internal, "Failed to load scale data."); scale_numel = qparams->num_scales(); } else { // Read fp32 scales, convert to bf16. 
@@ -1689,6 +1690,7 @@ _DEFINE_UNARY_NODE_NO_PARAMS(Log, xnn_unary_log) _DEFINE_UNARY_NODE_NO_PARAMS(Negate, xnn_unary_negate) _DEFINE_UNARY_NODE_NO_PARAMS(Square, xnn_unary_square) _DEFINE_UNARY_NODE_NO_PARAMS(Abs, xnn_unary_abs) +_DEFINE_UNARY_NODE_NO_PARAMS(Sin, xnn_unary_sine) // Unary Ops with min/max params _DEFINE_UNARY_NODE_WITH_MINMAX(Clamp, xnn_unary_clamp) @@ -1736,6 +1738,7 @@ DefineNodeFunc getDefineNodeFunc(fb_xnnpack::XNodeUnion nodeType) { _DEFINE(Floor) _DEFINE(PReLU) _DEFINE(Sigmoid) + _DEFINE(Sin) // Others _DEFINE(FullyConnected) @@ -1895,9 +1898,8 @@ ET_NODISCARD Error XNNCompiler::compileModel( xnn_weights_cache_t weights_cache_ptr = nullptr; #endif -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - ET_CHECK_OR_RETURN_ERROR( - workspace != nullptr, Internal, "Failed to initialize XNNPACK workspace"); + // NOLINTBEGIN(facebook-hte-NullableDereference) - weights cache is allowed to + // be null status = xnn_create_runtime_v4( subgraph.get(), weights_cache_ptr, @@ -1905,14 +1907,7 @@ ET_NODISCARD Error XNNCompiler::compileModel( ::executorch::extension::threadpool::get_pthreadpool(), runtime_flags, &runtime_ptr); -#else - status = xnn_create_runtime_v3( - subgraph.get(), - weights_cache_ptr, - ::executorch::extension::threadpool::get_pthreadpool(), - runtime_flags, - &runtime_ptr); -#endif + // NOLINTEND(facebook-hte-NullableDereference) ET_CHECK_OR_RETURN_ERROR( xnn_status_success == status, diff --git a/backends/xnnpack/runtime/XNNExecutor.h b/backends/xnnpack/runtime/XNNExecutor.h index f7084a5dd88..c7926744dd6 100644 --- a/backends/xnnpack/runtime/XNNExecutor.h +++ b/backends/xnnpack/runtime/XNNExecutor.h @@ -9,13 +9,13 @@ #pragma once #include +#include #include #include #include #include #include -#include #include #include @@ -35,9 +35,11 @@ class XNNExecutor { std::vector output_ids_; std::vector externals_; std::vector packed_data_names_; + std::shared_ptr workspace_; public: - XNNExecutor() = default; + XNNExecutor(std::shared_ptr workspace) + : 
workspace_(workspace) {} inline size_t getNumInputs() { return input_ids_.size(); @@ -51,6 +53,10 @@ class XNNExecutor { return packed_data_names_; } + inline std::shared_ptr get_workspace() { + return workspace_; + } + /** * Initialize the XNNExecutor with a given runtime and input/output ids. * The input/output ids are expected to be sorted in order of their diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index b05919ecf2b..70845b6cab1 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -7,7 +7,10 @@ */ #include +#include #include +#include +#include #include #include #include @@ -21,14 +24,18 @@ namespace executorch { namespace backends { +using executorch::backends::xnnpack::WorkspaceSharingMode; +using executorch::backends::xnnpack::XNNWorkspace; using executorch::backends::xnnpack::delegate::XNNWeightsCache; using executorch::ET_RUNTIME_NAMESPACE::Backend; using executorch::ET_RUNTIME_NAMESPACE::BackendExecutionContext; using executorch::ET_RUNTIME_NAMESPACE::BackendInitContext; +using executorch::ET_RUNTIME_NAMESPACE::BackendOptionContext; using executorch::ET_RUNTIME_NAMESPACE::CompileSpec; using executorch::ET_RUNTIME_NAMESPACE::DelegateHandle; using executorch::ET_RUNTIME_NAMESPACE::NamedDataMap; using executorch::runtime::ArrayRef; +using executorch::runtime::BackendOption; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; @@ -51,23 +58,8 @@ class XnnpackBackend final return; } -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - // Create a workspace for the XNNExecutor to use. This workspace will be - // shared across all delegate instances. 
- ET_LOG(Debug, "Creating XNN workspace"); - xnn_workspace_t workspace = nullptr; - status = xnn_create_workspace(&workspace); - if (status != xnn_status_success) { - ET_LOG( - Error, - "Failed to create XNN workspace, XNNPACK status: 0x%x", - (unsigned int)status); - workspace = nullptr; - return; - } - workspace_.reset(workspace); - ET_LOG(Debug, "Created XNN workspace: %p", workspace_.get()); -#endif // ENABLE_XNNPACK_SHARED_WORKSPACE + // Workspace manager is initialized with the appropriate default mode in its + // constructor } bool is_available() const override { @@ -85,11 +77,12 @@ class XnnpackBackend final } const NamedDataMap* named_data_map = context.get_named_data_map(); - // thread safe. This can heppen when multiple threads call init() on + // thread safe. This can happen when multiple threads call init() on // the same backend instance. -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - const std::lock_guard lock(workspace_mutex_); -#endif + + auto program_id = + reinterpret_cast(context.get_runtime_allocator()); + auto workspace = ET_UNWRAP(get_or_create_workspace(program_id)); #ifdef ENABLE_XNNPACK_WEIGHTS_CACHE const std::lock_guard lock_weight_cache(weights_cache_mutex_); @@ -97,17 +90,19 @@ class XnnpackBackend final context.get_runtime_allocator(), named_data_map); #endif + auto [workspace_lock, workspace_ptr] = workspace->acquire(); + // Executor has been allocated but not constructed, ensure that runtime_ is // nullptr by constructing it in place here. NOTE: Since we use placement // new and since this type is not trivially destructible, we must call the // destructor manually in destroy(). - new (executor) xnnpack::delegate::XNNExecutor; + new (executor) xnnpack::delegate::XNNExecutor(workspace); Error err = xnnpack::delegate::XNNCompiler::compileModel( processed->data(), processed->size(), executor, weights_cache_.get(), - workspace_.get(), + workspace_ptr, named_data_map); // This backend does not need its processed data after compiling the model. 
processed->Free(); @@ -130,14 +125,12 @@ Span args) const override { auto executor = static_cast(handle); -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - const std::lock_guard lock(workspace_mutex_); -#endif - #ifdef ENABLE_XNNPACK_WEIGHTS_CACHE const std::lock_guard lock_weights_cache(weights_cache_mutex_); #endif + auto [raii_lock, _] = executor->get_workspace()->acquire(); + // Prepare Inputs/Outputs and Propagate Input Shapes Error err = executor->prepare_args(args); if (err != Error::Ok) { @@ -158,13 +151,6 @@ void destroy(DelegateHandle* handle) const override { if (handle != nullptr) { - // This is needed to serialize access to xnn_delete_runtime which is not - // thread safe. This can heppen when multiple threads call destroy() on - // the same backend instance. -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - const std::lock_guard lock(workspace_mutex_); -#endif - auto executor = static_cast(handle); #ifdef ENABLE_XNNPACK_PROFILING @@ -176,18 +162,87 @@ weights_cache_mutex_); weights_cache_->delete_packed_data(executor->get_packed_data_names()); #endif + + // This is needed to serialize access to xnn_delete_runtime which is not + // thread safe. This can happen when multiple threads call destroy() on + // the same backend instance. Make sure to hold onto the workspace + // shared_ptr, as the pointer in the executor is freed, which includes + // the mutex referenced by raii_lock. + auto workspace = executor->get_workspace(); + auto [raii_lock, _] = workspace->acquire(); + + // XNNExecutor is not trivially destructible. Since this was constructed + // manually in init(), we must destroy it manually here. + executor->~XNNExecutor(); + } + } + + Error get_option_internal( + BackendOptionContext& context, + executorch::runtime::Span& + backend_options) const { + // Intentionally not locking here as it is not required.
+ + // Verify that the expected option key is present and modify the value + for (size_t i = 0; i < backend_options.size(); ++i) { + if (strcmp( + backend_options[i].key, + xnnpack::workspace_sharing_mode_option_key) == 0) { + // Set the value to what was stored by set_option + backend_options[i].value = + static_cast(workspace_manager_.get_sharing_mode()); + } + } + + return Error::Ok; + } + + Error get_option( + BackendOptionContext& context, + executorch::runtime::Span& + backend_options) override { + return get_option_internal(context, backend_options); + } + + Error set_option( + BackendOptionContext& context, + const executorch::runtime::Span& + backend_options) override { + if (backend_options.size() > 0) { + for (const auto& option : backend_options) { + if (strcmp(option.key, xnnpack::workspace_sharing_mode_option_key) == + 0) { + if (auto* val = std::get_if(&option.value)) { + if (*val < 0 || + *val > static_cast(WorkspaceSharingMode::Count)) { + ET_LOG( + Error, + "XNNPACK workspace sharing mode must be between 0 and %d, inclusive, but was %d.", + static_cast(WorkspaceSharingMode::Count), + *val); + return Error::InvalidArgument; + } + + ET_LOG( + Debug, "Setting XNNPACK workspace sharing mode to %d.", *val); + auto status = workspace_manager_.set_sharing_mode( + static_cast(*val)); + if (status != Error::Ok) { + return status; + } + } else { + ET_LOG(Error, "XNNPACK workspace sharing mode must be an integer."); + return Error::InvalidArgument; + } + } + } + } + return Error::Ok; + } + private: - // This is a global workspace for all delegate instances. - mutable std::mutex workspace_mutex_; - std::unique_ptr workspace_{ - nullptr, - &xnn_release_workspace}; + // Workspace manager for handling workspace sharing modes + mutable xnnpack::XNNWorkspaceManager workspace_manager_; // Weights cache is global to all delegate instances. 
mutable std::mutex weights_cache_mutex_; @@ -195,13 +250,21 @@ class XnnpackBackend final std::make_unique(); // Lock Hiearchy for Mutexes: - // workspace_mutex_ // weights_cache_mutex_ + // workspace_meta_mutex_ + // workspace_mutex_ (owned by executor) + + // Retrieve a workspace for the given method ID, depending on the sharing + // mode. + Result> get_or_create_workspace( + uintptr_t program_id) const { + return workspace_manager_.get_or_create_workspace(program_id); + } }; namespace { -auto cls = XnnpackBackend(); -Backend backend{"XnnpackBackend", &cls}; +auto backend_instance = XnnpackBackend(); +Backend backend{xnnpack::xnnpack_backend_key, &backend_instance}; static auto success_with_compiler = register_backend(backend); } // namespace diff --git a/backends/xnnpack/runtime/XNNPACKBackend.h b/backends/xnnpack/runtime/XNNPACKBackend.h new file mode 100644 index 00000000000..aca72f8652b --- /dev/null +++ b/backends/xnnpack/runtime/XNNPACKBackend.h @@ -0,0 +1,42 @@ +#pragma once + +namespace executorch::backends::xnnpack { +/// The key for the backend. This is used to register the backend, check +/// availability, and get/set options. +const char xnnpack_backend_key[] = "XnnpackBackend"; + +/// The key for the workspace sharing option. See the WorkspaceSharingMode enum +/// for a description of the associated functionality. +const char workspace_sharing_mode_option_key[] = "workspace_sharing_mode"; + +/// Workspace sharing mode. This is a backend option that can be set via the +/// set_option API to control memory sharing between CALL_DELEGATE instances. +/// This is useful for reducing memory consumption. +enum class WorkspaceSharingMode { + /// No workspace sharing. Each CALL_DELEGATE instance will have its own + /// workspace (memory arena). + Disabled = 0, + + /// All CALL_DELEGATE instances in a given program will share a workspace. 
+ /// This reduces memory consumption + /// for methods with multiple delegate calls, at the cost of only allowing one + /// method to execute at a time. + PerModel = 1, + + /// All CALL_DELEGATE instances across all loaded methods will share a + /// workspace. This reduces memory + /// consumption by overlapping activation memory between methods but enforces + /// synchronization between + /// methods. If multiple methods are run concurrently, it may block as only + /// one delegate call can occur + /// at a time. Additionally, the workspace does not shrink when a method is + /// unloaded, so memory will + /// only be reclaimed when all XNNPACK-delegated methods are unloaded. + Global = 2, + + /// The number of workspace sharing modes. This is not a valid mode and is + /// only used for tracking the + /// maximum enum value. + Count, +}; +} // namespace executorch::backends::xnnpack diff --git a/backends/xnnpack/runtime/XNNWorkspace.h b/backends/xnnpack/runtime/XNNWorkspace.h new file mode 100644 index 00000000000..36596b05089 --- /dev/null +++ b/backends/xnnpack/runtime/XNNWorkspace.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace executorch::backends::xnnpack { + +using WorkspacePtr = + std::unique_ptr; + +/// A lightweight wrapper around an underlying xnn_workspace_t instance, bundled +/// with appropriate synchronization. +class XNNWorkspace { + public: + XNNWorkspace(WorkspacePtr workspace) : workspace_(std::move(workspace)){}; + XNNWorkspace(const XNNWorkspace&) = delete; + XNNWorkspace& operator=(const XNNWorkspace&) = delete; + // Not moveable due to std::mutex.
+ XNNWorkspace(XNNWorkspace&&) = delete; + XNNWorkspace& operator=(XNNWorkspace&&) = delete; + + std::pair, xnn_workspace_t> acquire() { + auto lock = std::unique_lock(mutex_); + return {std::move(lock), workspace_.get()}; + } + + // Return the workspace pointer without acquiring the lock. This should be used + // carefully, as it can lead to crashes or data corruption if the workspace is + // used concurrently. + xnn_workspace_t unsafe_get_workspace() { + return workspace_.get(); + } + + static runtime::Result> create() { + // Because this class can't be moved, we need to construct it in-place. + xnn_workspace_t workspace = nullptr; + auto status = xnn_create_workspace(&workspace); + if (status != xnn_status_success) { + ET_LOG( + Error, + "Failed to create XNN workspace, XNNPACK status: 0x%x", + (unsigned int)status); + return runtime::Error::Internal; + } + + return std::make_shared( + WorkspacePtr(workspace, &xnn_release_workspace)); + } + + private: + std::mutex mutex_; + WorkspacePtr workspace_; +}; + +} // namespace executorch::backends::xnnpack diff --git a/backends/xnnpack/runtime/XNNWorkspaceManager.cpp b/backends/xnnpack/runtime/XNNWorkspaceManager.cpp new file mode 100644 index 00000000000..d8c6dae4d6d --- /dev/null +++ b/backends/xnnpack/runtime/XNNWorkspaceManager.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
+ */ + +#include +#include +#include // For PRIuPTR + +namespace executorch::backends::xnnpack { + +using executorch::runtime::Error; +using executorch::runtime::Result; + +XNNWorkspaceManager::XNNWorkspaceManager() { +#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE + sharing_mode_ = WorkspaceSharingMode::Global; +#else + sharing_mode_ = WorkspaceSharingMode::Disabled; +#endif // ENABLE_XNNPACK_SHARED_WORKSPACE +} + +runtime::Error XNNWorkspaceManager::set_sharing_mode( + WorkspaceSharingMode mode) { + // Validate that the mode is valid + if (static_cast(mode) < 0 || + static_cast(mode) >= static_cast(WorkspaceSharingMode::Count)) { + ET_LOG( + Error, + "XNNPACK workspace sharing mode must be between 0 and %d, inclusive, but was %d.", + static_cast(WorkspaceSharingMode::Count) - 1, + static_cast(mode)); + return runtime::Error::InvalidArgument; + } + + sharing_mode_ = mode; + return runtime::Error::Ok; +} + +WorkspaceSharingMode XNNWorkspaceManager::get_sharing_mode() const { + return sharing_mode_.load(); +} + +Result> +XNNWorkspaceManager::get_or_create_workspace(uintptr_t program_id) const { + auto mode = sharing_mode_.load(); + + // Get or create the workspace according to the current sharing mode. + if (mode == WorkspaceSharingMode::Disabled) { + ET_LOG(Debug, "Instantiating workspace."); + auto create_result = XNNWorkspace::create(); + if (!create_result.ok()) { + return create_result.error(); + } + + return create_result.get(); + } else if (mode == WorkspaceSharingMode::PerModel) { + return get_or_create_model_workspace(program_id); + } else if (mode == WorkspaceSharingMode::Global) { + return get_or_create_global_workspace(); + } else { + ET_LOG( + Error, "Invalid workspace sharing mode: %d.", static_cast(mode)); + return Error::Internal; + } +} + +Result> +XNNWorkspaceManager::get_or_create_global_workspace() const { + std::scoped_lock lock(workspace_meta_mutex_); + + // Check for an existing (live) global workspace. 
+ std::shared_ptr workspace = {}; + if (auto live_workspace = global_workspace_.lock()) { + workspace = live_workspace; + } + + // Allocate a new workspace if needed. + if (!workspace) { + auto create_result = XNNWorkspace::create(); + if (!create_result.ok()) { + return create_result.error(); + } + workspace = create_result.get(); + ET_LOG( + Debug, + "Created global workspace %p.", + workspace->unsafe_get_workspace()); + global_workspace_ = workspace; + } + + return workspace; +} + +Result> +XNNWorkspaceManager::get_or_create_model_workspace(uintptr_t program_id) const { + std::scoped_lock lock(workspace_meta_mutex_); + + // Check for an existing (live) workspace for this program. + auto match = model_workspaces_.find(program_id); + std::shared_ptr workspace = {}; + if (match != model_workspaces_.end()) { + if (auto live_workspace = match->second.lock()) { + workspace = live_workspace; + } + } + + // Allocate a new workspace if needed. + if (!workspace) { + auto create_result = XNNWorkspace::create(); + if (!create_result.ok()) { + return create_result.error(); + } + workspace = create_result.get(); + ET_LOG( + Debug, + "Created workspace %p for program %" PRIuPTR ".", + workspace->unsafe_get_workspace(), + program_id); + model_workspaces_.insert( + {program_id, std::weak_ptr(workspace)}); + } + + return workspace; +} + +} // namespace executorch::backends::xnnpack diff --git a/backends/xnnpack/runtime/XNNWorkspaceManager.h b/backends/xnnpack/runtime/XNNWorkspaceManager.h new file mode 100644 index 00000000000..52db1184bbd --- /dev/null +++ b/backends/xnnpack/runtime/XNNWorkspaceManager.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +namespace executorch::backends::xnnpack { + +/** + * XNNWorkspaceManager manages XNNPACK workspaces based on the configured + * workspace sharing mode. + * + * It supports three modes: + * - Disabled: Each delegate instance gets its own workspace + * - PerModel: All delegate instances in a model share a workspace + * - Global: All delegate instances across all models share a workspace + */ +class XNNWorkspaceManager { + public: + XNNWorkspaceManager(); + ~XNNWorkspaceManager() = default; + + /** + * Set the workspace sharing mode. + * + * @param mode The workspace sharing mode to set. + * @return Error::Ok if the mode was set successfully. + */ + runtime::Error set_sharing_mode(WorkspaceSharingMode mode); + + /** + * Get the current workspace sharing mode. + * + * @return The current workspace sharing mode. + */ + WorkspaceSharingMode get_sharing_mode() const; + + /** + * Retrieve a workspace for the given program ID, depending on the sharing + * mode. A workspace will be created if needed. + * + * @param program_id The ID of the program requesting a workspace. + * @return A Result containing a shared_ptr to the workspace, or an error. + */ + runtime::Result> get_or_create_workspace( + uintptr_t program_id) const; + + private: + // The active sharing mode. Changes to this affect only models loaded after + // the change. + std::atomic sharing_mode_; + + // A mutex guarding global_workspace_ and model_workspaces_. Note that this + // mutex only guards the top-level definitions, not the contents of the + // workspace. The contents of the workspace are guarded by the workspace's own + // mutex in the XNNWorkspace class. + mutable std::mutex workspace_meta_mutex_; + + // A global workspace for all delegate instances, if global sharing is + // enabled. Lazy initialized. Stored as a weak pointer to allow automatic + // cleanup when all references are released. 
+ mutable std::weak_ptr global_workspace_; + + // A map from program id to workspace for delegate instances, if per model + // sharing is enabled. Workspaces are owned by the executor instances via + // shared_ptr. They are tracked here via weak pointers to allow automatic + // cleanup when the executors are destroyed while being retrievable when + // instantiating new executors. + mutable std::unordered_map> + model_workspaces_; + + // Retrieve the global workspace, lazy initializing it if needed. + runtime::Result> + get_or_create_global_workspace() const; + + // Get or create a workspace for the given program ID. + runtime::Result> get_or_create_model_workspace( + uintptr_t program_id) const; +}; + +} // namespace executorch::backends::xnnpack diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs index 950318f18dc..239f92d899e 100644 --- a/backends/xnnpack/serialization/runtime_schema.fbs +++ b/backends/xnnpack/serialization/runtime_schema.fbs @@ -156,6 +156,7 @@ union XNodeUnion { XNNGelu: _XNNNode1x1, XNNTanh: _XNNNode1x1, XNNExp: _XNNNode1x1, + XNNSin: _XNNNode1x1, } union XValueUnion { diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs index a4efc627cbb..92a61c5537b 100644 --- a/backends/xnnpack/serialization/schema.fbs +++ b/backends/xnnpack/serialization/schema.fbs @@ -152,6 +152,7 @@ union XNodeUnion { XNNGelu: _XNNNode1x1, XNNTanh: _XNNNode1x1, XNNExp: _XNNNode1x1, + XNNSin: _XNNNode1x1, } union XValueUnion { diff --git a/backends/xnnpack/serialization/xnnpack_graph_schema.py b/backends/xnnpack/serialization/xnnpack_graph_schema.py index 99b64708f86..2b3f8e74202 100644 --- a/backends/xnnpack/serialization/xnnpack_graph_schema.py +++ b/backends/xnnpack/serialization/xnnpack_graph_schema.py @@ -347,6 +347,11 @@ class XNNPReLU(XNNNode2x1): pass +@dataclass +class XNNSin(XNNNode1x1): + pass + + @dataclass class XNNScaledDotProductAttention: query_id: 
int @@ -402,6 +407,8 @@ class XNNScaledDotProductAttention: XNNLog, XNNGelu, XNNTanh, + XNNExp, + XNNSin, ] diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index 0eab89a00f9..796fd887e33 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -59,6 +59,9 @@ def define_common_targets(): exported_deps = [ "//executorch/runtime/backend:interface" + aten_suffix, ], + exported_headers = [ + "runtime/XNNPACKBackend.h", + ], deps = [ third_party_dep("XNNPACK"), "//executorch/backends/xnnpack/serialization:xnnpack_flatbuffer_header", @@ -70,3 +73,13 @@ def define_common_targets(): # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) link_whole = True, ) + + runtime.cxx_library( + name = "xnnpack_interface", + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + exported_headers = [ + "runtime/XNNPACKBackend.h", + ], + ) diff --git a/backends/xnnpack/test/ops/test_sin.py b/backends/xnnpack/test/ops/test_sin.py new file mode 100644 index 00000000000..6a1b323e14c --- /dev/null +++ b/backends/xnnpack/test/ops/test_sin.py @@ -0,0 +1,87 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import torch +from executorch.backends.xnnpack.test.tester import Tester + + +class TestSin(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + + class Sin(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + z = torch.sin(x) + return z + + def _test_sin(self, inputs, legacy_mode: bool = False): + tester = ( + Tester(self.Sin(), inputs) + .export() + .check_count({"torch.ops.aten.sin.default": 1}) + ) + + if legacy_mode: + tester = tester.to_edge().partition() + else: + tester = tester.to_edge_transform_and_lower() + + ( + tester.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .check_not(["executorch_exir_dialects_edge__ops_aten_sin_default"]) + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) + + def test_fp16_sin(self): + inputs = ( + torch.Tensor( + [ + [0.0, 0.1, 0.5, 0.785398], + [-0.5, -0.785398, 1.5708, -1.5708], + ], + ).to(torch.float16), + ) + self._test_sin(inputs, legacy_mode=False) + + def test_fp16_sin_legacy_mode(self): + inputs = ( + torch.Tensor( + [ + [0.0, 0.1, 0.5, 0.785398], + [-0.5, -0.785398, 1.5708, -1.5708], + ], + ).to(torch.float16), + ) + self._test_sin(inputs, legacy_mode=True) + + def test_fp32_sin(self): + inputs = ( + torch.Tensor( + [ + [0.0, 0.1, 0.5, 0.785398], + [-0.5, -0.785398, 1.5708, -1.5708], + ], + ), + ) + self._test_sin(inputs, legacy_mode=False) + + def test_fp32_sin_legacy_mode(self): + inputs = ( + torch.Tensor( + [ + [0.0, 0.1, 0.5, 0.785398], + [-0.5, -0.785398, 1.5708, -1.5708], + ], + ), + ) + self._test_sin(inputs, legacy_mode=True) diff --git a/backends/xnnpack/test/runtime/test_workspace_manager.cpp b/backends/xnnpack/test/runtime/test_workspace_manager.cpp new file mode 100644 index 00000000000..ddb7074a1ce --- /dev/null +++ b/backends/xnnpack/test/runtime/test_workspace_manager.cpp @@ -0,0 +1,280 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include + +#include + +using namespace ::testing; + +using executorch::backends::xnnpack::WorkspaceSharingMode; +using executorch::backends::xnnpack::XNNWorkspace; +using executorch::backends::xnnpack::XNNWorkspaceManager; +using executorch::runtime::Error; +using executorch::runtime::Result; + +class XNNWorkspaceManagerTest : public ::testing::Test { + protected: + void SetUp() override { + // Log calls will abort if PAL is not initialized. + executorch::runtime::runtime_init(); + + // Initialize a new workspace manager for each test. + workspace_manager_ = std::make_unique(); + } + + std::unique_ptr workspace_manager_; +}; + +TEST_F(XNNWorkspaceManagerTest, SetAndGetSharingMode) { + // Test setting and getting the sharing mode + EXPECT_EQ( + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Disabled), + Error::Ok); + EXPECT_EQ( + workspace_manager_->get_sharing_mode(), WorkspaceSharingMode::Disabled); + + EXPECT_EQ( + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::PerModel), + Error::Ok); + EXPECT_EQ( + workspace_manager_->get_sharing_mode(), WorkspaceSharingMode::PerModel); + + EXPECT_EQ( + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Global), + Error::Ok); + EXPECT_EQ( + workspace_manager_->get_sharing_mode(), WorkspaceSharingMode::Global); +} + +TEST_F(XNNWorkspaceManagerTest, SetInvalidSharingMode) { + // First set a valid mode to ensure we're starting from a known state. + EXPECT_EQ( + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Disabled), + Error::Ok); + EXPECT_EQ( + workspace_manager_->get_sharing_mode(), WorkspaceSharingMode::Disabled); + + // Try to set an invalid mode. 
+ WorkspaceSharingMode invalid_mode = static_cast(70); + EXPECT_EQ( + workspace_manager_->set_sharing_mode(invalid_mode), + Error::InvalidArgument); + + // The mode should not have changed. + EXPECT_EQ( + workspace_manager_->get_sharing_mode(), WorkspaceSharingMode::Disabled); +} + +TEST_F(XNNWorkspaceManagerTest, DisabledMode) { + // Verify that each call retrieves a new workspace when sharing is disabled. + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Disabled); + + uintptr_t program_id = 12345; + auto workspace1_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace1_result.ok()); + auto workspace1 = workspace1_result.get(); + + auto workspace2_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace2_result.ok()); + auto workspace2 = workspace2_result.get(); + + auto workspace3_result = + workspace_manager_->get_or_create_workspace(program_id + 1); + ASSERT_TRUE(workspace3_result.ok()); + auto workspace3 = workspace3_result.get(); + + EXPECT_NE(workspace1, workspace2); + EXPECT_NE(workspace1, workspace3); + EXPECT_NE(workspace2, workspace3); + EXPECT_NE( + workspace1->unsafe_get_workspace(), workspace2->unsafe_get_workspace()); + EXPECT_NE( + workspace1->unsafe_get_workspace(), workspace3->unsafe_get_workspace()); + EXPECT_NE( + workspace2->unsafe_get_workspace(), workspace3->unsafe_get_workspace()); +} + +TEST_F(XNNWorkspaceManagerTest, PerModelMode) { + // In PerModel mode, calls with the same program_id should return the same + // workspace. + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::PerModel); + + // Get two workspaces with the same program ID and one different. 
+ uintptr_t program_id = 12345; + auto workspace1_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace1_result.ok()); + auto workspace1 = workspace1_result.get(); + + auto workspace2_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace2_result.ok()); + auto workspace2 = workspace2_result.get(); + + auto workspace3_result = + workspace_manager_->get_or_create_workspace(program_id + 1); + ASSERT_TRUE(workspace3_result.ok()); + auto workspace3 = workspace3_result.get(); + + // Workspace 1 and 2 should be the same, but different from workspace 3. + EXPECT_EQ(workspace1, workspace2); + EXPECT_EQ( + workspace1->unsafe_get_workspace(), workspace2->unsafe_get_workspace()); + + EXPECT_NE(workspace1, workspace3); + EXPECT_NE( + workspace1->unsafe_get_workspace(), workspace3->unsafe_get_workspace()); +} + +TEST_F(XNNWorkspaceManagerTest, GlobalMode) { + // In Global mode, all calls should return the same workspace. + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Global); + + // Get workspaces with different program IDs + uintptr_t program_id1 = 12345; + auto workspace1_result = + workspace_manager_->get_or_create_workspace(program_id1); + ASSERT_TRUE(workspace1_result.ok()); + auto workspace1 = workspace1_result.get(); + + uintptr_t program_id2 = 67890; + auto workspace2_result = + workspace_manager_->get_or_create_workspace(program_id2); + ASSERT_TRUE(workspace2_result.ok()); + auto workspace2 = workspace2_result.get(); + + EXPECT_EQ(workspace1, workspace2); + EXPECT_EQ( + workspace1->unsafe_get_workspace(), workspace2->unsafe_get_workspace()); +} + +TEST_F(XNNWorkspaceManagerTest, PerModelModeCleanup) { + // Test that workspaces are properly cleaned up when shared_ptr is destroyed + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::PerModel); + + uintptr_t program_id = 12345; + xnn_workspace_t raw_workspace1 = nullptr; + + // Create a scope to control the lifetime of 
workspace1 + { + auto workspace1_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace1_result.ok()); + auto workspace1 = workspace1_result.get(); + + // Store the raw pointer for later comparison + raw_workspace1 = workspace1->unsafe_get_workspace(); + + // Let workspace1 go out of scope and be destroyed + } + + // Get a new workspace with the same program ID + auto workspace2_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace2_result.ok()); + auto workspace2 = workspace2_result.get(); + + // Since the previous workspace was destroyed, we should get a new one. + EXPECT_NE(workspace2->unsafe_get_workspace(), raw_workspace1); +} + +TEST_F(XNNWorkspaceManagerTest, GlobalModeCleanup) { + // Test that global workspaces are properly cleaned up when all users + // are destroyed. + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Global); + + uintptr_t program_id = 12345; + xnn_workspace_t raw_workspace1 = nullptr; + + // Create a scope to control the lifetime of workspace1 + { + auto workspace1_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace1_result.ok()); + auto workspace1 = workspace1_result.get(); + + // Store the raw pointer for later comparison + raw_workspace1 = workspace1->unsafe_get_workspace(); + + // Let workspace1 go out of scope and be destroyed + } + + // Get a new workspace (program ID doesn't matter in Global mode) + auto workspace2_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace2_result.ok()); + auto workspace2 = workspace2_result.get(); + + // Since the previous workspace was destroyed, we should get a new one. 
+ EXPECT_NE(workspace2->unsafe_get_workspace(), raw_workspace1); +} + +TEST_F(XNNWorkspaceManagerTest, SwitchingModes) { + // Test switching between different sharing modes + + // Start with Disabled mode + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Disabled); + + // Get a workspace + uintptr_t program_id = 12345; + auto workspace1_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace1_result.ok()); + auto workspace1 = workspace1_result.get(); + + // Switch to PerModel mode + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::PerModel); + + // Get another workspace with the same program ID + auto workspace2_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace2_result.ok()); + auto workspace2 = workspace2_result.get(); + + // Should be a different workspace + EXPECT_NE(workspace1, workspace2); + + // Get another workspace with the same program ID in PerModel mode + auto workspace3_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace3_result.ok()); + auto workspace3 = workspace3_result.get(); + + // Should be the same workspace as workspace2 + EXPECT_EQ(workspace2, workspace3); + + // Switch to Global mode + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Global); + + // Get another workspace + auto workspace4_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace4_result.ok()); + auto workspace4 = workspace4_result.get(); + + // Should be a different workspace since we switched modes + EXPECT_NE(workspace3, workspace4); + + // Get a workspace with a different program ID in Global mode + uintptr_t different_program_id = 67890; + auto workspace5_result = + workspace_manager_->get_or_create_workspace(different_program_id); + ASSERT_TRUE(workspace5_result.ok()); + auto workspace5 = workspace5_result.get(); + + // Should be the same workspace as workspace4 + EXPECT_EQ(workspace4, 
workspace5); +} diff --git a/backends/xnnpack/test/runtime/test_workspace_sharing.cpp b/backends/xnnpack/test/runtime/test_workspace_sharing.cpp new file mode 100644 index 00000000000..66f0d012acd --- /dev/null +++ b/backends/xnnpack/test/runtime/test_workspace_sharing.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; + +using executorch::backends::xnnpack::workspace_sharing_mode_option_key; +using executorch::backends::xnnpack::WorkspaceSharingMode; +using executorch::backends::xnnpack::xnnpack_backend_key; +using executorch::extension::Module; +using executorch::extension::TensorPtr; +using executorch::runtime::BackendOption; +using executorch::runtime::BackendOptions; +using executorch::runtime::Error; + +TensorPtr create_input_tensor(float val); +void run_and_validate_two_models( + std::optional mode1 = std::nullopt, + std::optional mode2 = std::nullopt); +void set_and_check_workspace_sharing_mode(WorkspaceSharingMode mode); + +TEST(WorkspaceSharing, SetMode) { + // Try setting and reading back the mode a few times. + set_and_check_workspace_sharing_mode(WorkspaceSharingMode::Disabled); + set_and_check_workspace_sharing_mode(WorkspaceSharingMode::PerModel); + set_and_check_workspace_sharing_mode(WorkspaceSharingMode::Global); +} + +TEST(WorkspaceSharing, SetInvalidMode) { + // Make sure we can't set an invalid mode. + + // Set to an initial known value. + set_and_check_workspace_sharing_mode(WorkspaceSharingMode::PerModel); + + // Set to a bad value. 
+ BackendOptions<1> backend_options; + backend_options.set_option(workspace_sharing_mode_option_key, 70); + + auto status = executorch::runtime::set_option( + xnnpack_backend_key, backend_options.view()); + ASSERT_EQ(status, Error::InvalidArgument); + + // Make sure the option is still set to a valid value. + BackendOption read_option; + strcpy(read_option.key, workspace_sharing_mode_option_key); + read_option.value = -1; + status = get_option(xnnpack_backend_key, read_option); + + ASSERT_TRUE( + std::get(read_option.value) == + static_cast(WorkspaceSharingMode::PerModel)); +} + +TEST(WorkspaceSharing, RunWithDisabledMode) { + // Load and run some PTEs with workspace sharing disabled. + run_and_validate_two_models(WorkspaceSharingMode::Disabled); +} + +TEST(WorkspaceSharing, RunWithPerModelMode) { + // Load and run some PTEs with per-model workspace sharing. + run_and_validate_two_models(WorkspaceSharingMode::PerModel); +} + +TEST(WorkspaceSharing, RunWithGlobalMode) { + // Load and run some PTEs with global workspace sharing. + run_and_validate_two_models(WorkspaceSharingMode::Global); +} + +TEST(WorkspaceSharing, RunWithModeSwitch) { + // Check each pair of modes, loading one model in one mode and the other in + // the other mode. + + std::array modes = { + WorkspaceSharingMode::Disabled, + WorkspaceSharingMode::PerModel, + WorkspaceSharingMode::Global}; + + for (auto i = 0; i < modes.size(); ++i) { + for (auto j = i + 1; j < modes.size(); ++j) { + run_and_validate_two_models(modes[i], modes[j]); + } + } +} + +TensorPtr create_input_tensor(float val) { + // Create an f32 tensor with shape [10, 10, 10], matching the input of the + // test models. + std::vector data(1000, val); + + // Note that the tensor pointer takes ownership of the data vector. 
+ return executorch::extension::make_tensor_ptr({10, 10, 10}, std::move(data)); +} + +void run_and_validate_two_models( + std::optional mode1, + std::optional mode2) { + // Load and run two models, verifying that the output tensors are correct, + // optionally setting sharing mode. + + if (mode1) { + set_and_check_workspace_sharing_mode(*mode1); + } + + Module mod1(std::getenv("ET_XNNPACK_GENERATED_ADD_LARGE_PTE_PATH")); + + auto a = create_input_tensor(1.0); + auto b = create_input_tensor(2.0); + auto c = create_input_tensor(3.0); + + auto result = mod1.forward({a, b, c}); + EXPECT_TRUE(result.ok()); + + // Expected output is 2a + 2b + c. + auto output_val = 1.0 * 2 + 2.0 * 2 + 3.0; + auto& output_tensor = result.get()[0].toTensor(); + for (auto i = 0; i < output_tensor.numel(); ++i) { + ASSERT_EQ(output_tensor.const_data_ptr()[i], output_val); + } + + if (mode2) { + set_and_check_workspace_sharing_mode(*mode2); + } + + Module mod2(std::getenv("ET_XNNPACK_GENERATED_SUB_LARGE_PTE_PATH")); + + auto result2 = mod2.forward({a, b, c}); + EXPECT_TRUE(result2.ok()); + + // Expected output is zero (the subtract operations cancel out). 
+ auto& output_tensor2 = result2.get()[0].toTensor(); + for (auto i = 0; i < output_tensor2.numel(); ++i) { + ASSERT_EQ(output_tensor2.const_data_ptr()[i], 0); + } + + // Run mod1 again to validate that it gives correct results in the second mode + auto result3 = mod1.forward({a, b, c}); + EXPECT_TRUE(result3.ok()); + + // Expected output is still 2a + 2b + c + auto& output_tensor3 = result3.get()[0].toTensor(); + for (auto i = 0; i < output_tensor3.numel(); ++i) { + ASSERT_EQ(output_tensor3.const_data_ptr()[i], output_val); + } +} + +void set_and_check_workspace_sharing_mode(WorkspaceSharingMode mode) { + executorch::runtime::runtime_init(); + + BackendOptions<1> backend_options; + backend_options.set_option( + workspace_sharing_mode_option_key, static_cast(mode)); + + auto status = executorch::runtime::set_option( + xnnpack_backend_key, backend_options.view()); + ASSERT_EQ(status, Error::Ok); + + // Read the option back to sanity check. + BackendOption read_option; + strcpy(read_option.key, workspace_sharing_mode_option_key); + read_option.value = -1; + status = get_option(xnnpack_backend_key, read_option); + + ASSERT_TRUE(std::get(read_option.value) == static_cast(mode)); +} diff --git a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp index b2a56f6283d..568c3c4ec35 100644 --- a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp +++ b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp @@ -18,7 +18,7 @@ using executorch::runtime::Span; using executorch::runtime::testing::TensorFactory; TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) { - XNNExecutor executor; + XNNExecutor executor({}); xnn_subgraph_t subgraph = nullptr; xnn_runtime_t rt = nullptr; et_pal_init(); diff --git a/backends/xnnpack/test/targets.bzl b/backends/xnnpack/test/targets.bzl index f175e9655ea..04517c035fe 100644 --- a/backends/xnnpack/test/targets.bzl +++ b/backends/xnnpack/test/targets.bzl @@ -63,3 +63,26 @@ def 
define_common_targets(): "ET_MODULE_LINEAR_XNN_DATA_PATH": "$(location fbcode//executorch/test/models:exported_xnnpack_program_and_data[ModuleLinear.ptd])", }, ) + + runtime.cxx_test( + name = "test_workspace_sharing", + srcs = ["runtime/test_workspace_sharing.cpp"], + deps = [ + "//executorch/extension/module:module", + "//executorch/extension/tensor:tensor", + "//executorch/backends/xnnpack:xnnpack_backend", + ], + env = { + "ET_XNNPACK_GENERATED_ADD_LARGE_PTE_PATH": "$(location fbcode//executorch/test/models:exported_xnnp_delegated_programs[ModuleAddLarge.pte])", + "ET_XNNPACK_GENERATED_SUB_LARGE_PTE_PATH": "$(location fbcode//executorch/test/models:exported_xnnp_delegated_programs[ModuleSubLarge.pte])", + }, + ) + + runtime.cxx_test( + name = "test_workspace_manager", + srcs = ["runtime/test_workspace_manager.cpp"], + deps = [ + third_party_dep("XNNPACK"), + "//executorch/backends/xnnpack:xnnpack_backend", + ], + ) diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py index 05fb53a837d..cdceb8a90a1 100644 --- a/backends/xnnpack/xnnpack_preprocess.py +++ b/backends/xnnpack/xnnpack_preprocess.py @@ -71,6 +71,11 @@ def generate_node_to_external_map( if node.op == "output": for output_nodes in node.args: for output_node in output_nodes: + if output_node in node_to_external_map: + raise RuntimeError( + f"Output node '{output_node}' is already in the inputs. " + "This is likely due to pass through arguments, which are not supported in XNNPACK Delegate." + ) node_to_external_map[output_node] = ExternalMeta( external_id=len(node_to_external_map), io_type=XNN_VALUE_FLAG_EXTERNAL_OUTPUT, diff --git a/codegen/tools/CMakeLists.txt b/codegen/tools/CMakeLists.txt index 489a96aafb6..2d61a4d68c1 100644 --- a/codegen/tools/CMakeLists.txt +++ b/codegen/tools/CMakeLists.txt @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. 
# # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -24,10 +25,23 @@ target_include_directories( # Compile options target_compile_options( - selective_build PUBLIC -Wno-deprecated-declarations -fPIC -frtti -fexceptions + selective_build + PUBLIC -Wno-deprecated-declarations + -fPIC + -frtti + -fexceptions + -Werror + -Wunused-variable + -Wno-unknown-argument ) +# We suppress -Wno-unknown-argument because our build system passes -fPIC for +# Unix builds, but we also build on Windows where it's ignored # Link against required libraries +if(TARGET bundled_program) + target_compile_definitions(selective_build PRIVATE -DET_BUNDLE_IO) + target_link_libraries(selective_build PRIVATE bundled_program) +endif() target_link_libraries(selective_build PRIVATE executorch_core program_schema) # Install the module diff --git a/codegen/tools/combine_prim_ops_headers.py b/codegen/tools/combine_prim_ops_headers.py new file mode 100644 index 00000000000..b579de2047d --- /dev/null +++ b/codegen/tools/combine_prim_ops_headers.py @@ -0,0 +1,164 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Script to combine multiple selected_prim_ops.h header files into a single header. +This is used by selected_prim_operators_genrule to merge prim ops headers from dependencies. +""" + +import argparse +import os +import sys +from pathlib import Path +from typing import List, Set + + +def read_header_file(file_path: Path) -> Set[str]: + """ + Read a selected_prim_ops.h file and extract the macros and comments. 
+ + Args: + file_path: Path to the header file + + Returns: + macros_set where macros_set contains unique macro defines + """ + macros = set() + + try: + with open(file_path, "r") as f: + for line in f: + line = line.strip() + + # Extract #define statements for prim ops + if line.startswith("#define INCLUDE_") and not line.startswith( + "#define EXECUTORCH_ENABLE" + ): + macros.add(line) + except FileNotFoundError: + print(f"Warning: Header file not found: {file_path}", file=sys.stderr) + except Exception as e: + print(f"Error reading {file_path}: {e}", file=sys.stderr) + + return macros + + +def combine_prim_ops_headers(header_file_paths: List[str], output_path: str) -> None: + """ + Combine multiple selected_prim_ops.h files into a single header. + + Args: + header_files: List of paths to header files to combine + output_path: Path to output the combined header + """ + all_macros = set() + has_selective_build = False + + # Read all header files and collect unique macros + for header_file_path in header_file_paths: + header_file = Path(header_file_path) / "selected_prim_ops.h" + if os.path.exists(header_file): + macros = read_header_file(header_file) + all_macros.update(macros) + if len(all_macros) > 0: + has_selective_build = True + else: + print( + f"Warning: Header file does not exist: {header_file}", file=sys.stderr + ) + + # Generate combined header + header_content = [ + "// Combined header for selective prim ops build", + "// This file is auto-generated by combining multiple selected_prim_ops.h files", + "// Do not edit manually.", + "", + "#pragma once", + "", + ] + + if all_macros and has_selective_build: + header_content.extend( + [ + "// Enable selective build for prim ops", + "#define EXECUTORCH_ENABLE_PRIM_OPS_SELECTIVE_BUILD", + "", + "// Combined prim ops macros from all dependencies", + ] + ) + + # Sort macros for deterministic output + sorted_macros = sorted(all_macros) + header_content.extend(sorted_macros) + else: + header_content.extend( + [ + 
"// No prim ops found in dependencies - all prim ops will be included", + "// Selective build is disabled", + ] + ) + + header_content.append("") + + # Write the combined header + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w") as f: + f.write("\n".join(header_content)) + + +def _get_header_file_paths_from_query_output(query_output_file: str) -> List[str]: + """ + Parse the output of a Buck query command to extract header file paths. + + Args: + query_output_file: Path to the file containing the query output + + Returns: + List of header file paths + """ + header_file_paths = [] + assert ( + query_output_file[0] == "@" + ), "query_output_file is not a valid file path, or it doesn't start with '@'." + query_output_file = query_output_file[1:] + + with open(query_output_file, "r") as f: + for line in f: + # Extract the header file path from the query output + header_file_paths += line.split() + return header_file_paths + + +def main(): + parser = argparse.ArgumentParser( + description="Combine multiple selected_prim_ops.h header files" + ) + parser.add_argument( + "--header_files", + required=True, + help="Comma-separated list of header file paths", + ) + parser.add_argument( + "--output_dir", required=True, help="Output directory for combined header" + ) + + args = parser.parse_args() + import os + + header_file_paths = _get_header_file_paths_from_query_output(args.header_files) + + if not header_file_paths: + print("Error: No header files provided", file=sys.stderr) + sys.exit(1) + + # Generate output path + output_path = os.path.join(args.output_dir, "selected_prim_ops.h") + + combine_prim_ops_headers(header_file_paths, output_path) + + +if __name__ == "__main__": + main() diff --git a/codegen/tools/gen_all_oplist.py b/codegen/tools/gen_all_oplist.py index 5cb93bb9153..f33c3dc935d 100644 --- a/codegen/tools/gen_all_oplist.py +++ b/codegen/tools/gen_all_oplist.py @@ -10,7 +10,7 @@ import sys from functools import reduce from 
pathlib import Path -from typing import Any, List +from typing import Any, Dict, List import yaml from torchgen.selective_build.selector import ( @@ -72,6 +72,19 @@ def _raise_if_check_prim_ops_fail(options): raise Exception(error) +def _selected_ops_model_dict_is_empty(model_dict: Dict[str, Any]) -> bool: + return ( + not model_dict.get("build_features", []) + and not model_dict.get("custom_classes", []) + and not model_dict.get("et_kernel_metadata", None) + and not model_dict.get("include_all_non_op_selectives", False) + and not model_dict.get("include_all_operators", False) + and not model_dict.get("kernel_metadata", {}) + and not model_dict.get("operators", {}) + ) + + +# flake8: noqa: C901 def main(argv: List[Any]) -> None: """This binary generates 3 files: @@ -171,6 +184,11 @@ def main(argv: List[Any]) -> None: ), f"{model_file_name} is not a valid file path. This is likely a BUCK issue." with open(model_file_name, "rb") as model_file: model_dict = yaml.safe_load(model_file) + # It is possible that we created an empty yaml file. + # This is because et_operator_library may only contain prim ops. + # In that case selected_operators.yaml will be empty. 
+ if _selected_ops_model_dict_is_empty(model_dict): + continue resolved = resolve_model_file_path_to_buck_target(model_file_name) for op in model_dict["operators"]: model_dict["operators"][op]["debug_info"] = [resolved] diff --git a/codegen/tools/gen_oplist.py b/codegen/tools/gen_oplist.py index cca5bf1b1d2..28506050a8e 100644 --- a/codegen/tools/gen_oplist.py +++ b/codegen/tools/gen_oplist.py @@ -9,6 +9,7 @@ import os import sys from enum import IntEnum +from pathlib import Path from typing import Any, Dict, List, Optional, Set import yaml @@ -158,7 +159,7 @@ def _get_et_kernel_metadata_from_ops_yaml(ops_yaml_path: str) -> Dict[str, List[ def _dump_yaml( op_list: List[str], - output_path: str, + output_path: Path, model_name: Optional[str] = None, et_kernel_metadata: Optional[Dict[str, List[str]]] = None, include_all_operators: bool = False, @@ -212,20 +213,23 @@ def create_kernel_key(maybe_kernel_key: str) -> str: def gen_oplist( - output_path: str, + output_path: Path, model_file_path: Optional[str] = None, ops_schema_yaml_path: Optional[str] = None, root_ops: Optional[str] = None, ops_dict: Optional[str] = None, include_all_operators: bool = False, ): - assert ( + if not ( model_file_path or ops_schema_yaml_path or root_ops or ops_dict or include_all_operators - ), "Need to provide either model_file_path or ops_schema_yaml_path or root_ops or ops_dict or include_all_operators." + ): + # dump empty yaml file + _dump_yaml([], output_path) + return assert output_path, "Need to provide output_path for dumped yaml file." 
op_set = set() @@ -326,9 +330,15 @@ def main(args: List[Any]) -> None: ) options = parser.parse_args(args) + # check if the output_path is a directory, then generate operators + # under selected_operators.yaml + if Path(options.output_path).is_dir(): + output_path = Path(options.output_path) / "selected_operators.yaml" + else: + output_path = Path(options.output_path) try: gen_oplist( - output_path=options.output_path, + output_path=output_path, model_file_path=options.model_file_path, ops_schema_yaml_path=options.ops_schema_yaml_path, root_ops=options.root_ops, diff --git a/codegen/tools/gen_selected_prim_ops.py b/codegen/tools/gen_selected_prim_ops.py new file mode 100644 index 00000000000..4535ffaa57a --- /dev/null +++ b/codegen/tools/gen_selected_prim_ops.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import argparse +import os +import sys +from typing import Any, List + +from torchgen.code_template import CodeTemplate # type: ignore[import-not-found] + + +selected_prim_ops_h_template_str = """#pragma once +/** + * Generated by executorch/codegen/tools/gen_selected_prim_ops.py + */ + +$defines +""" +selected_prim_ops_h_template = CodeTemplate(selected_prim_ops_h_template_str) + + +def normalize_op_name(op_name: str) -> str: + """ + Normalize an operator name to a macro-safe format. 
+ Convert op names like "executorch_prim::et_view.default" to "EXECUTORCH_PRIM_ET_VIEW_DEFAULT" + or "aten::sym_size.int" to "ATEN_SYM_SIZE_INT" + """ + # Remove namespace separator and replace with underscore + normalized = op_name.replace("::", "_") + # Replace dots with underscores + normalized = normalized.replace(".", "_") + # Convert to uppercase + normalized = normalized.upper() + # Add INCLUDE_ prefix + normalized = f"INCLUDE_{normalized}" + return normalized + + +def write_selected_prim_ops(prim_op_names: List[str], output_dir: str) -> None: + """ + Generate selected_prim_ops.h from a list of prim op names. + + Args: + prim_op_names: List of prim op names like ["executorch_prim::et_view.default", "aten::sym_size.int"] + output_dir: Directory where to write selected_prim_ops.h + """ + # Generate #define statements for each op + defines = [] + for op_name in prim_op_names: + macro_name = normalize_op_name(op_name) + defines.append(f"#define {macro_name}") + + # Join all defines with newlines + defines_str = "\n".join(defines) + + # Generate header content + header_contents = selected_prim_ops_h_template.substitute(defines=defines_str) + + # Write to file + selected_prim_ops_path = os.path.join(output_dir, "selected_prim_ops.h") + with open(selected_prim_ops_path, "wb") as out_file: + out_file.write(header_contents.encode("utf-8")) + + +def main(argv: List[Any]) -> None: + parser = argparse.ArgumentParser(description="Generate selected prim ops header") + parser.add_argument( + "--prim-op-names", + "--prim_op_names", + help="Comma-separated list of prim op names to include", + required=True, + ) + parser.add_argument( + "--output-dir", + "--output_dir", + help="The directory to store the output header file (selected_prim_ops.h)", + required=True, + ) + + options = parser.parse_args(argv) + + # Parse comma-separated prim op names + prim_op_names = [ + name.strip() for name in options.prim_op_names.split(",") if name.strip() + ] + + 
write_selected_prim_ops(prim_op_names, options.output_dir) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/codegen/tools/selective_build.cpp b/codegen/tools/selective_build.cpp index d33ff12ec9f..a34789e129d 100644 --- a/codegen/tools/selective_build.cpp +++ b/codegen/tools/selective_build.cpp @@ -1,16 +1,21 @@ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. + * Copyright 2025 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ +#include +#include #include #include -#include -#include +#ifdef ET_BUNDLE_IO +#include +#include +#endif namespace py = pybind11; @@ -186,8 +191,39 @@ get_kernel_tensor_metadatas_from_execution_plan( const executorch_flatbuffer::Program* _get_program_from_buffer( const py::bytes& buffer) { + // Access the Python bytes without copying and get raw pointer/size. + const std::string_view sv = buffer.cast(); +#ifdef ET_BUNDLE_IO + void* buf_ptr = const_cast(static_cast(sv.data())); + const size_t buf_len = sv.size(); + + // If this is a bundled program, extract the inner ExecuTorch program bytes. + if (executorch::bundled_program::is_bundled_program(buf_ptr, buf_len)) { + const void* program_data = nullptr; + size_t program_size = 0; + + const auto status = executorch::bundled_program::get_program_data( + buf_ptr, // serialized BundledProgram start + buf_len, // total size of the BundledProgram blob + &program_data, // [out] pointer to inner .pte bytes + &program_size // [out] size of inner .pte bytes + ); + + if (status != ::executorch::runtime::Error::Ok || program_data == nullptr || + program_size == 0) { + throw std::runtime_error( + "bundled_program::get_program_data() failed or returned empty data"); + } + + // program_data points directly at the flatbuffer-encoded Program region. 
+ return executorch_flatbuffer::GetProgram( + reinterpret_cast(program_data)); + } +#endif + // Otherwise treat the buffer as a raw .pte (flatbuffer Program with optional + // extended header). return executorch_flatbuffer::GetProgram( - buffer.cast().data()); + reinterpret_cast(sv.data())); } py::list _get_program_operators(const executorch_flatbuffer::Program* program) { diff --git a/codegen/tools/targets.bzl b/codegen/tools/targets.bzl index acea3370e7d..c11982409f0 100644 --- a/codegen/tools/targets.bzl +++ b/codegen/tools/targets.bzl @@ -17,10 +17,8 @@ def define_common_targets(is_fbcode = False): ], deps = [ "//executorch/codegen:gen_lib", - ] + ([] if runtime.is_oss else select({ - "DEFAULT": [], - "ovr_config//os:linux": ["//executorch/codegen/tools:selective_build"], # TODO(larryliu0820) :selective_build doesn't build in OSS yet - })), + "//executorch/codegen/tools:selective_build", + ], ) runtime.python_binary( @@ -29,7 +27,7 @@ def define_common_targets(is_fbcode = False): deps = [ ":gen_oplist_lib", ], - preload_deps = [] if runtime.is_oss else ["//executorch/codegen/tools:selective_build"], # TODO(larryliu0820) :selective_build doesn't build in OSS yet + preload_deps = ["//executorch/codegen/tools:selective_build"], package_style = "inplace", visibility = [ "//executorch/...", @@ -103,6 +101,26 @@ def define_common_targets(is_fbcode = False): _is_external_target = True, ) + runtime.python_library( + name = "combine_prim_ops_headers_lib", + srcs = ["combine_prim_ops_headers.py"], + base_module = "executorch.codegen.tools", + visibility = ["//executorch/..."], + ) + + runtime.python_binary( + name = "combine_prim_ops_headers", + main_module = "executorch.codegen.tools.combine_prim_ops_headers", + package_style = "inplace", + visibility = [ + "PUBLIC", + ], + deps = [ + ":combine_prim_ops_headers_lib", + ], + _is_external_target = True, + ) + runtime.python_test( name = "test_gen_all_oplist", srcs = [ @@ -155,27 +173,48 @@ def 
define_common_targets(is_fbcode = False): _is_external_target = True, ) - if not runtime.is_oss: - runtime.cxx_python_extension( - name = "selective_build", - srcs = [ - "selective_build.cpp", - ], - base_module = "executorch.codegen.tools", - types = ["selective_build.pyi"], - preprocessor_flags = [ - "-DEXECUTORCH_PYTHON_MODULE_NAME=selective_build", - ], - deps = [ - "//executorch/runtime/core:core", - "//executorch/schema:program", - ], - external_deps = [ - "pybind11", - ], - use_static_deps = True, - visibility = ["//executorch/codegen/..."], - ) + runtime.python_library( + name = "gen_selected_prim_ops_lib", + srcs = ["gen_selected_prim_ops.py"], + base_module = "executorch.codegen.tools", + visibility = ["//executorch/..."], + external_deps = ["torchgen"], + ) + + runtime.python_binary( + name = "gen_selected_prim_ops", + main_module = "executorch.codegen.tools.gen_selected_prim_ops", + package_style = "inplace", + visibility = [ + "PUBLIC", + ], + deps = [ + ":gen_selected_prim_ops_lib", + ], + _is_external_target = True, + ) + + + runtime.cxx_python_extension( + name = "selective_build", + srcs = [ + "selective_build.cpp", + ], + base_module = "executorch.codegen.tools", + types = ["selective_build.pyi"], + preprocessor_flags = [ + "-DEXECUTORCH_PYTHON_MODULE_NAME=selective_build", + ], + deps = [ + "//executorch/runtime/core:core", + "//executorch/schema:program", + ], + external_deps = [ + "pybind11", + ], + use_static_deps = True, + visibility = ["//executorch/codegen/..."], + ) # TODO(larryliu0820): This is a hack to only run these two on fbcode. These targets depends on exir which is only available in fbcode. @@ -214,10 +253,12 @@ def define_common_targets(is_fbcode = False): ], ) + if runtime.is_oss or is_fbcode: + # Doesn't work on xplat. But works on fbcode and OSS. 
runtime.python_test( - name = "test_selective_build", + name = "test_tools_selective_build", srcs = [ - "test/test_selective_build.py", + "test/test_tools_selective_build.py", ], package_style = "inplace", visibility = [ diff --git a/codegen/tools/test/test_gen_oplist.py b/codegen/tools/test/test_gen_oplist.py index f5c6829d6a0..18689cd2505 100644 --- a/codegen/tools/test/test_gen_oplist.py +++ b/codegen/tools/test/test_gen_oplist.py @@ -8,6 +8,7 @@ import os import tempfile import unittest +from pathlib import Path from typing import Dict, List from unittest.mock import NonCallableMock, patch @@ -77,7 +78,7 @@ def test_gen_op_list_with_valid_root_ops( gen_oplist.main(args) mock_dump_yaml.assert_called_once_with( ["aten::add", "aten::mul"], - output_path, + Path(output_path), None, {"aten::add": ["default"], "aten::mul": ["default"]}, False, @@ -100,7 +101,7 @@ def test_gen_op_list_with_root_ops_and_dtypes( gen_oplist.main(args) mock_dump_yaml.assert_called_once_with( ["aten::add", "aten::mul"], - output_path, + Path(output_path), None, { "aten::add": [ @@ -129,7 +130,7 @@ def test_gen_op_list_with_both_op_list_and_ops_schema_yaml_merges( gen_oplist.main(args) mock_dump_yaml.assert_called_once_with( ["aten::add.out", "aten::mul.out", "aten::relu.out"], - output_path, + Path(output_path), test_path, { "aten::relu.out": ["default"], @@ -153,7 +154,7 @@ def test_gen_op_list_with_include_all_operators( gen_oplist.main(args) mock_dump_yaml.assert_called_once_with( ["aten::add", "aten::mul"], - output_path, + Path(output_path), None, {"aten::add": ["default"], "aten::mul": ["default"]}, True, @@ -164,7 +165,7 @@ def test_get_custom_build_selector_with_both_allowlist_and_yaml( ) -> None: op_list = ["aten::add", "aten::mul"] filename = os.path.join(self.temp_dir.name, "selected_operators.yaml") - gen_oplist._dump_yaml(op_list, filename, "model.pte") + gen_oplist._dump_yaml(op_list, Path(filename), "model.pte") self.assertTrue(os.path.isfile(filename)) with open(filename) 
as f: es = yaml.safe_load(f) diff --git a/codegen/tools/test/test_selective_build.py b/codegen/tools/test/test_tools_selective_build.py similarity index 100% rename from codegen/tools/test/test_selective_build.py rename to codegen/tools/test/test_tools_selective_build.py diff --git a/configurations/CMakeLists.txt b/configurations/CMakeLists.txt index fa5412ac476..fb154ff88bc 100644 --- a/configurations/CMakeLists.txt +++ b/configurations/CMakeLists.txt @@ -63,6 +63,6 @@ if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) install( TARGETS optimized_native_cpu_ops_lib EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endif() diff --git a/devtools/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp index d095844986f..fd35caca557 100644 --- a/devtools/etdump/tests/etdump_test.cpp +++ b/devtools/etdump/tests/etdump_test.cpp @@ -345,7 +345,7 @@ TEST_F(ProfilerETDumpTest, DebugEventTensorList) { EValue* values_p[2] = {&evalue_1, &evalue_2}; BoxedEvalueList a_box(values_p, storage, 2); - EValue evalue(a_box); + EValue evalue(&a_box); evalue.tag = Tag::ListTensor; etdump_gen[i]->create_event_block("test_block"); diff --git a/devtools/scripts/profile_model.sh b/devtools/scripts/profile_model.sh index 8697c97cd02..a4d50f6c6fc 100755 --- a/devtools/scripts/profile_model.sh +++ b/devtools/scripts/profile_model.sh @@ -7,7 +7,7 @@ #!/bin/bash -# ExecutorTorch Model Profiling Script +# ExecuTorch Model Profiling Script # # This script automates the process of building executor_runner with profiling enabled, # running model inference with ETDump collection, and generating CSV profiling reports. 
diff --git a/docs/.gitignore b/docs/.gitignore index 980fbad8320..b9b2a3753e5 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -3,3 +3,4 @@ /sphinxbuild_py /sphinxbuild_cpp /src +source/sg_execution_times.rst diff --git a/docs/Makefile b/docs/Makefile index 219998d4b4d..627358d0387 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -10,6 +10,9 @@ BUILDDIR = _build # Put it first so that "make" without argument is like "make help". +html-noplot: + $(SPHINXBUILD) -D plot_gallery=0 -b html $(SPHINXOPTS) "$(SOURCEDIR)" "$(BUILDDIR)/html" + help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md index e30decb9362..845267b32f6 100644 --- a/docs/README.md +++ b/docs/README.md @@ -43,7 +43,7 @@ To build the documentation locally: git clone -b viable/strict https://github.com/pytorch/executorch.git && cd executorch ``` -1. If you don't have it already, start either a Python virtual envitonment: +1. If you don't have it already, start either a Python virtual environment: ```bash python3 -m venv .venv && source .venv/bin/activate && pip install --upgrade pip @@ -111,7 +111,7 @@ You can use the variables in both regular text and code blocks. ## Including READMEs to the Documentation Build You might want to include some of the `README.md` files from various directories -in this repositories in your documentation build. To do that, create an `.md` +in this repository in your documentation build. To do that, create an `.md` file and use the `{include}` directive to insert your `.md` files. Example: ```` @@ -177,7 +177,7 @@ file: ```` In the `index.md` file, I would add `tutorials/selective-build-tutorial` in -both the `toctree` and the `cusotmcarditem` sections. +both the `toctree` and the `customcarditem` sections. 
# Auto-generated API documentation diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css deleted file mode 100644 index 3ae9585701e..00000000000 --- a/docs/source/_static/css/custom.css +++ /dev/null @@ -1,194 +0,0 @@ -/** - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* sphinx-design styles for cards/tabs -*/ -:root { - --sd-color-info: #ee4c2c; - --sd-color-primary: #6c6c6d; - --sd-color-primary-highlight: #f3f4f7; - --sd-color-card-border-hover: #ee4c2c; - --sd-color-card-border: #f3f4f7; - --sd-color-card-background: #fff; - --sd-color-card-text: inherit; - --sd-color-card-header: transparent; - --sd-color-card-footer: transparent; - --sd-color-tabs-label-active: #ee4c2c; - --sd-color-tabs-label-hover: #ee4c2c; - --sd-color-tabs-label-inactive: #6c6c6d; - --sd-color-tabs-underline-active: #ee4c2c; - --sd-color-tabs-underline-hover: #fabdbd; - --sd-color-tabs-underline-inactive: transparent; - --sd-color-tabs-overline: rgb(222, 222, 222); - --sd-color-tabs-underline: rgb(222, 222, 222); -} - -.sd-text-info { - color: #ee4c2c; -} - -.sd-card-img-top { - background: #ee4c2c; - height: 5px !important; -} - -.sd-card { - position: relative; - background-color: #fff; - opacity: 1.0; - border-radius: 0px; - width: 30%; - border: none; - padding-bottom: 0px; -} - - -.sd-card-img:hover { - opacity: 1.0; - background-color: #f3f4f7; -} - - -.sd-card:after { - display: block; - opacity: 1; - content: ''; - border-bottom: solid 1px #ee4c2c; - background-color: #fff; - transform: scaleX(0); - transition: transform .250s ease-in-out; - transform-origin: 0% 50%; -} - -.sd-card:hover { - background-color: #fff; - opacity: 1; - border-top: 1px solid #f3f4f7; - border-left: 1px solid #f3f4f7; - border-right: 1px solid #f3f4f7; -} - -.sd-card:hover:after { - transform: 
scaleX(1); -} - -.card-prerequisites:hover { - transition: none; - border: none; -} - -.card-prerequisites:hover:after { - transition: none; - transform: none; -} - -.card-prerequisites:after { - display: block; - content: ''; - border-bottom: none; - background-color: #fff; - transform: none; - transition: none; - transform-origin: none; -} - - -details.sd-dropdown { - font-weight: 300; - width: auto; -} - -details.sd-dropdown:after { - border: none; - transition: none; -} - -details.sd-dropdown:hover { - border: none; - transition: none; -} - -details.sd-dropdown .sd-summary-content { - font-weight: 300; -} - -details.sd-dropdown .highlight .n { - font-weight: normal; -} - -.et-page-column1 { - float: left; - width: 70%; - font-size: 1rem; -} - -.et-page-column2 { - float: right; - padding-top: 40px; - padding-left: 60px; - padding-right: 60px; - padding-bottom: 60px; - width: 30%; -} - -.et-page-column-row:after { - content: ""; - display: table; - clear: both; -} - -/* For screens smaller than 768px (typical mobile devices) */ -@media screen and (max-width: 768px) { - .et-page-column1, .et-page-column2 { - float: none; /* Remove floats */ - width: 100%; /* Full width for both columns */ - padding: 0; - font-size: 1rem; - } - - .et-page-column2 img { - display: none; - } - .et-page-column-row:after { - content: ""; - display: table; - clear: both; - } -} - -article.pytorch-article .class .method dt { - border-top: none; -} - -article.pytorch-article .class .simple dt { - border-top: none; -} - -article.pytorch-article .function dt.sig { - border-top: none; -} - -/* styles needed for 3rd level left nav */ - -.pytorch-left-menu ul, .pytorch-right-menu ul { - margin-left: 1.2em; -} - -.pytorch-left-menu li.toctree-l2.current > a { - color: #e44c2c; -} - -/* The next two styles enable normal hihglighting in the third level nav -in right side bar.*/ -#pytorch-right-menu .side-scroll-highlight { - color: #6c6c6d; -} - -#pytorch-right-menu 
a.reference.internal.side-scroll-highlight-local { - color: #ee4c2c; -} diff --git a/docs/source/_static/css/progress-bar.css b/docs/source/_static/css/progress-bar.css deleted file mode 100644 index 9b3aeb9d301..00000000000 --- a/docs/source/_static/css/progress-bar.css +++ /dev/null @@ -1,117 +0,0 @@ -/** - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -.progress-bar-wrapper { - margin-top: auto; - display: flex; - justify-content: space-between; - margin-bottom: 20px; - position: sticky; - top: 0; - background: white; - padding-top: 20px; - padding-bottom: 20px; - z-index: 2; -} - -.progress-bar-item { - position: relative; - display: flex; - flex-direction: column; - align-items: center; - flex: 1; - - @media (max-width: 768px) { - font-size: 12px; - } -} - -.progress-bar-item::before { - position: absolute; - content: ""; - border-bottom: 2px solid #ccc; - width: 100%; - top: 20px; - left: -50%; - z-index: 2; -} - -.progress-bar-item::after { - position: absolute; - content: ""; - border-bottom: 2px solid #ccc; - width: 100%; - top: 20px; - left: 50%; - z-index: 2; -} - -.progress-bar-item .step-number { - position: relative; - z-index: 5; - display: flex; - justify-content: center; - align-items: center; - width: 40px; - height: 40px; - border-radius: 50%; - border-color: #812CE5; - border-style: solid; - border-width: 1px; - color: #812CE5; - background: #fff; - margin-bottom: 6px; -} - -.progress-bar-item.active { - font-weight: bold; -} - -.progress-bar-item.completed .step-number { - background-color: #812CE5; - color: white; -} - -.progress-bar-item.completed::after { - position: absolute; - content: ""; - border-bottom: 2px solid #812CE5; - width: 100%; - top: 20px; - left: 50%; - z-index: 3; -} - -.progress-bar-item:first-child::before { - content: none; -} - 
-.progress-bar-item:last-child::after { - content: none; -} - -.progress-bar-item a:link { - color: #262626 !important; -} - -.step-caption:first-child { - margin-left: 10px; -} - -.step-caption { - text-align: center; -} - -.step-caption a:link { - color: #262626 !important; -} - -.step-caption a:hover { - color: #ee4c2c; - text-decoration: underline; -} diff --git a/docs/source/_static/img/ExecuTorch-Logo-cropped.svg b/docs/source/_static/img/ExecuTorch-Logo-cropped.svg deleted file mode 100644 index 9e0ef52fbd8..00000000000 --- a/docs/source/_static/img/ExecuTorch-Logo-cropped.svg +++ /dev/null @@ -1,57 +0,0 @@ - - - - - - - - - - - diff --git a/docs/source/_static/img/executorch-chip-logo-circle-16.png b/docs/source/_static/img/executorch-chip-logo-circle-16.png new file mode 100644 index 00000000000..a3966ae27db Binary files /dev/null and b/docs/source/_static/img/executorch-chip-logo-circle-16.png differ diff --git a/docs/source/_static/img/executorch-chip-logo-circle-32.png b/docs/source/_static/img/executorch-chip-logo-circle-32.png new file mode 100644 index 00000000000..83f1018a76c Binary files /dev/null and b/docs/source/_static/img/executorch-chip-logo-circle-32.png differ diff --git a/docs/source/_static/img/executorch-chip-logo.svg b/docs/source/_static/img/executorch-chip-logo.svg new file mode 100644 index 00000000000..11e5ed60956 --- /dev/null +++ b/docs/source/_static/img/executorch-chip-logo.svg @@ -0,0 +1,205 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/js/progress-bar.js b/docs/source/_static/js/progress-bar.js deleted file mode 100644 index 878251cfc60..00000000000 --- a/docs/source/_static/js/progress-bar.js +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -document.addEventListener("DOMContentLoaded", function() { - const steps = Array.from(document.querySelectorAll('.progress-bar-item')); - const h2s = Array.from(document.querySelectorAll('h2')); - - // Populate captions from h2s - h2s.forEach((h2, index) => { - const captionElem = document.getElementById(`caption-${index + 1}`); - if (captionElem) { - captionElem.innerText = h2.innerText; - } - }); - - // Throttle function to optimize performance - function throttle(func, delay) { - let lastCall = 0; - return function() { - const now = Date.now(); - if (now - lastCall < delay) return; - lastCall = now; - func.apply(this, arguments); - } - } - - document.addEventListener("scroll", throttle(function() { - let activeIndex = 0; - let closestDistance = Number.MAX_VALUE; - const totalHeight = document.documentElement.scrollHeight; - const viewportHeight = window.innerHeight; - const scrollBottom = window.scrollY + viewportHeight; - const isAtBottom = totalHeight === scrollBottom; - - h2s.forEach((h2, index) => { - const rect = h2.getBoundingClientRect(); - const distanceToTop = Math.abs(rect.top); - if (distanceToTop < closestDistance) { - closestDistance = distanceToTop; - activeIndex = index; - } - }); - - steps.forEach((step, index) => { - if (isAtBottom) { - step.classList.remove('active'); - step.classList.add('completed'); - } else { - if (index < activeIndex) { - step.classList.remove('active'); - step.classList.add('completed'); - } else if (index === activeIndex) { - step.classList.add('active'); - step.classList.remove('completed'); - } else { - step.classList.remove('active', 'completed'); - } - } - }); - }, 100)); -}); diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html deleted file mode 100644 index 55f91103b35..00000000000 --- a/docs/source/_templates/layout.html +++ /dev/null @@ 
-1,145 +0,0 @@ -{% extends "!layout.html" %} - -{% block extrahead %} -{% if 'getting-started-setup' in pagename%} - - -{% elif 'compiler-delegate-and-partitioner' in pagename%} - - -{% elif 'xtensa' in pagename%} - - -{% elif 'qualcomm-ai-engine-direct-backend' in pagename%} - - -{% elif 'coreml' in pagename%} - - -{% elif 'mps' in pagename%} - - -{% endif %} -{{ super() }} -{% endblock %} - - -{% block sidebartitle %} - - {% include "searchbox.html" %} -{% endblock %} - -{%- block content %} -{% if 'tutorials' in pagename %} - - - -{% endif %} -{{ super() }} - -{% endblock %} - - - -{% block menu %} - {% if 'singlehtml' not in builder %} - {% set global_toc = toctree(collapse=theme_collapse_navigation|tobool, - includehidden=theme_includehidden|tobool, - titles_only=theme_titles_only|tobool) %} - {% endif %} - {% if global_toc %} - {{ global_toc }} - {% else %} - -
{{ toc }}
- {% endif %} -{% endblock %} - - -{% block footer %} -{{ super() }} - - -{{ super() }} - - -{{ super() }} - -{% endblock %} diff --git a/docs/source/advanced-topics-section.md b/docs/source/advanced-topics-section.md new file mode 100644 index 00000000000..e7b7f5490c6 --- /dev/null +++ b/docs/source/advanced-topics-section.md @@ -0,0 +1,112 @@ +(advanced-topics-section)= + +# Advanced + +Deep dive into ExecuTorch's advanced features for optimization, customization, and integration. + +This section covers advanced concepts for developers who need to customize ExecuTorch for specific use cases, optimize performance, or integrate with custom hardware backends. + +## Quantization & Optimization + +Techniques for model compression and performance optimization. + +**→ {doc}`quantization-optimization` — Quantization strategies and performance optimization** + +Key topics: + +- Quantization strategies and techniques +- Performance profiling and optimization + +## Model Export + +Learn the core ExecuTorch workflow, exporting PyTorch models to the `.pte` format for edge deployment. + +**→ {doc}`using-executorch-export`** - Model Export & Lowering + +Key topics: + +- Export and Lowering Workflow +- Hardware Backend Selection & Optimization +- Dynamic Shapes & Advanced Model Features + + +## Kernel Library + +Deep dive into ExecuTorch's kernel implementation and customization. + +**→ {doc}`kernel-library-advanced` — Kernel library deep dive and customization** + +Key topics: + +- Kernel library architecture +- Custom kernel implementation +- Selective build and optimization + +## Backend & Delegates + +**→ {doc}`backend-delegate-advanced` — Backend delegate integration** + +Key topics: + +- Learn how to integrate Backend Delegate into ExecuTorch and more +- XNNPACK Delegate Internals +- Debugging Delegation + + +## Runtime & Integration + +Advanced runtime features and backend integration. 
+ +**→ {doc}`runtime-integration-advanced` — Runtime customization and backend integration** + +Key topics: + +- Backend delegate implementation +- Platform abstraction layer +- Custom runtime integration + +## Compiler & IR + +Advanced compiler features and intermediate representation details. + +**→ {doc}`compiler-ir-advanced` — Compiler passes and IR specification** + +Key topics: + +- Custom compiler passes +- Memory planning strategies +- Backend dialect and EXIR +- Ops set definition + + +## File Formats + +ExecuTorch file format specifications and internals. + +**→ {doc}`file-formats-advanced` — PTE and PTD file format specifications** + +Key topics: + +- PTE file format internals +- PTD file format specification +- Custom file format handling + +## Next Steps + +After exploring advanced topics: + +- **{doc}`tools-sdk-section`** - Developer tools for debugging and profiling +- **{doc}`api-section`** - Complete API reference documentation + +```{toctree} +:hidden: +:maxdepth: 2 +:caption: Advanced Topics + +quantization-optimization +using-executorch-export +kernel-library-advanced +backend-delegate-advanced +runtime-integration-advanced +compiler-ir-advanced +file-formats-advanced diff --git a/docs/source/android-arm-vgf.md b/docs/source/android-arm-vgf.md new file mode 100644 index 00000000000..cc39b53e176 --- /dev/null +++ b/docs/source/android-arm-vgf.md @@ -0,0 +1 @@ +```{include} backends-arm-vgf.md diff --git a/docs/source/android-backends.md b/docs/source/android-backends.md new file mode 100644 index 00000000000..d506813990b --- /dev/null +++ b/docs/source/android-backends.md @@ -0,0 +1,28 @@ +(android-backends)= +# Backends + +Available hardware acceleration backends for Android deployment. 
+ +## CPU Acceleration + +- {doc}`android-xnnpack` — XNNPACK CPU acceleration + +## GPU Acceleration + +- {doc}`android-vulkan` — Vulkan GPU acceleration + +## NPU/Accelerator Backends + +- {doc}`android-qualcomm` — Qualcomm AI Engine (NPU) +- {doc}`android-mediatek` — MediaTek NPU acceleration +- {doc}`android-arm-vgf` — ARM VGF Backend +- {doc}`android-samsung-exynos` — Samsung Exynos NPU + +```{toctree} +:hidden: +android-xnnpack +android-vulkan +android-qualcomm +android-mediatek +android-arm-vgf +android-samsung-exynos diff --git a/docs/source/android-examples.md b/docs/source/android-examples.md new file mode 100644 index 00000000000..65580870c57 --- /dev/null +++ b/docs/source/android-examples.md @@ -0,0 +1,9 @@ +# Examples & Demos + +- [Working with LLMs - Android Examples](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android) +- [Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app) +- {doc}`tutorial-arm-vgf` — Export a simple PyTorch model for the ExecuTorch VGF backend + +```{toctree} +:hidden: +tutorial-arm-vgf diff --git a/docs/source/android-mediatek.md b/docs/source/android-mediatek.md new file mode 100644 index 00000000000..7034fe439dd --- /dev/null +++ b/docs/source/android-mediatek.md @@ -0,0 +1 @@ +```{include} backends-mediatek.md diff --git a/docs/source/android-qualcomm.md b/docs/source/android-qualcomm.md new file mode 100644 index 00000000000..f484d771a8b --- /dev/null +++ b/docs/source/android-qualcomm.md @@ -0,0 +1 @@ +```{include} backends-qualcomm.md diff --git a/docs/source/android-samsung-exynos.md b/docs/source/android-samsung-exynos.md new file mode 100644 index 00000000000..4c5a470edca --- /dev/null +++ b/docs/source/android-samsung-exynos.md @@ -0,0 +1 @@ +```{include} backends-samsung-exynos.md diff --git a/docs/source/android-section.md b/docs/source/android-section.md new file mode 100644 index 00000000000..a5774352bc1 --- /dev/null 
+++ b/docs/source/android-section.md @@ -0,0 +1,23 @@ +(android-section)= + +# Android + +Deploy ExecuTorch on Android devices with hardware acceleration support. + +## Quick Start & Integration + +- {doc}`using-executorch-android` — Complete Android integration guide + +## Backends + +- {doc}`android-backends` — Available Android backends and acceleration options + +## Examples & Demos + +- {doc}`android-examples` — Explore Android Examples & Demos + +```{toctree} +:hidden: +using-executorch-android +android-backends +android-examples diff --git a/docs/source/android-vulkan.md b/docs/source/android-vulkan.md new file mode 100644 index 00000000000..6399ac4ec7c --- /dev/null +++ b/docs/source/android-vulkan.md @@ -0,0 +1 @@ +```{include} backends-vulkan.md diff --git a/docs/source/android-xnnpack.md b/docs/source/android-xnnpack.md new file mode 100644 index 00000000000..315dd747006 --- /dev/null +++ b/docs/source/android-xnnpack.md @@ -0,0 +1 @@ +```{include} backends-xnnpack.md diff --git a/docs/source/api-section.md b/docs/source/api-section.md new file mode 100644 index 00000000000..ab2573aefa9 --- /dev/null +++ b/docs/source/api-section.md @@ -0,0 +1,26 @@ +(api-section)= +# API + +In this section, find complete API documentation for ExecuTorch's export, runtime, and extension interfaces. Includes comprehensive references for Python, C++, and Java APIs across all supported platforms. 
+ +- {doc}`export-to-executorch-api-reference` — Export to ExecuTorch API Reference +- {doc}`executorch-runtime-api-reference` — ExecuTorch Runtime API Reference +- {doc}`runtime-python-api-reference` — Runtime Python API Reference +- {doc}`api-life-cycle` — API Life Cycle +- [Android doc →](https://pytorch.org/executorch/main/javadoc/) — Android API Documentation +- {doc}`extension-module` — Extension Module +- {doc}`extension-tensor` — Extension Tensor +- {doc}`running-a-model-cpp-tutorial` — Detailed C++ Runtime APIs Tutorial + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: API Reference + +export-to-executorch-api-reference +executorch-runtime-api-reference +runtime-python-api-reference +api-life-cycle +extension-module +extension-tensor +running-a-model-cpp-tutorial diff --git a/docs/source/backend-delegate-advanced.md b/docs/source/backend-delegate-advanced.md new file mode 100644 index 00000000000..752bd1cdc02 --- /dev/null +++ b/docs/source/backend-delegate-advanced.md @@ -0,0 +1,33 @@ +(backend-delegate-advanced)= + +# Backend & Delegates + +## Integration + +- {doc}`backend-delegates-integration` — Learn how to integrate a backend delegate into ExecuTorch + +## XNNPACK Reference + +- {doc}`backend-delegates-xnnpack-reference` — Deep dive into XNNPACK delegate internals and implementation details + +## Dependency Management + +- {doc}`backend-delegates-dependencies` — Manage third-party dependencies for backend delegates + +## Overview + +- {doc}`compiler-delegate-and-partitioner` — Understanding backends, delegates, and the partitioner system + +## Debugging + +- {doc}`debug-backend-delegate` — Tools and techniques for debugging delegation issues + +```{toctree} +:hidden: +:maxdepth: 1 + +backend-delegates-integration +backend-delegates-xnnpack-reference +backend-delegates-dependencies +compiler-delegate-and-partitioner +debug-backend-delegate diff --git a/docs/source/backend-delegates-dependencies.md b/docs/source/backend-delegates-dependencies.md 
index f2068989bd2..06f23ca36bc 100644 --- a/docs/source/backend-delegates-dependencies.md +++ b/docs/source/backend-delegates-dependencies.md @@ -49,7 +49,7 @@ for these third-party dependencies. `executorch/third-party` then try to use that if possible. This helps with reducing the binary size when the delegate is enabled. * The rest of the ExecuTorch code, outside of the delegate, should not depend on - this. And it should should build and run correctly without this dependency + this. And it should build and run correctly without this dependency when the delegate is disabled at build time. More details in the section [below](#runtime-dependencies). diff --git a/docs/source/backend-delegates-integration.md b/docs/source/backend-delegates-integration.md index 0179ceff872..130da0d3225 100644 --- a/docs/source/backend-delegates-integration.md +++ b/docs/source/backend-delegates-integration.md @@ -23,12 +23,13 @@ the top level ExecuTorch package. For third-party dependencies, please refer to At a minimum, a delegate must provide CMake support for building its C++ sources. -For the CMake setup, the delegate dir should be included by the -top level `CMakeLists.txt` file using `add_subdirectory` CMake command, and -should be built conditionally with an ExecuTorch build flag like -`EXECUTORCH_BUILD_`, see `EXECUTORCH_BUILD_XNNPACK` for example. -For third-party dependencies, please refer to -[this](backend-delegates-dependencies.md). +For the CMake setup: + +- The delegate directory should be included by the top-level `CMakeLists.txt` file using the `add_subdirectory` command. +- It should be built conditionally using an ExecuTorch build flag like `EXECUTORCH_BUILD_`. +(See `EXECUTORCH_BUILD_XNNPACK` for an example.) + +For third-party dependencies, please refer to [this](backend-delegates-dependencies.md). on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information. 
An example implementation is available in the `executor_runner` (see [tutorial here](tutorial-xnnpack-delegate-lowering.md#profiling)). [comment]: <> (TODO: Refactor quantizer to a more official quantization doc) diff --git a/docs/source/backend-development.md b/docs/source/backend-development.md new file mode 100644 index 00000000000..ec5ceb3b37a --- /dev/null +++ b/docs/source/backend-development.md @@ -0,0 +1,11 @@ +# Backend Development + +```{toctree} +:maxdepth: 1 + +backend-delegates-integration +backend-delegates-xnnpack-reference +backend-delegates-dependencies +compiler-delegate-and-partitioner +debug-backend-delegate +``` diff --git a/docs/source/backends-arm-ethos-u.md b/docs/source/backends-arm-ethos-u.md index 9b3d02b21c1..2dfddacd20f 100644 --- a/docs/source/backends-arm-ethos-u.md +++ b/docs/source/backends-arm-ethos-u.md @@ -1,7 +1,7 @@ # Arm® Ethos™-U NPU Backend The Arm® Ethos™-U backend targets Edge/IoT-type AI use-cases by enabling optimal execution of quantized models on -[Arm® Ethos™-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55), [Arm® Ethos™-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u65), and +[Arm® Ethos™-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55), [Arm® Ethos™-U65 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u65), and [Arm® Ethos™-U85 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u85), leveraging [TOSA](https://www.mlplatform.org/tosa/) and the [ethos-u-vela](https://pypi.org/project/ethos-u-vela/) graph compiler. This document is a technical reference for using the Ethos-U backend, for a top level view with code examples please refer to the [Arm Ethos-U Backend Tutorial](https://docs.pytorch.org/executorch/stable/tutorial-arm-ethos-u.html). 
@@ -268,10 +268,18 @@ You can see how this coupling between the memory mode and runtime application i The arm_executor_runner supports [bundled-io](https://docs.pytorch.org/executorch/0.4/bundled-io.html) and [ETdump](https://docs.pytorch.org/executorch/stable/etdump.html) debugging tools. -To enable bundled-io, set `EXECUTORCH_BUILD_DEVTOOLS` when building Executorch and `DET_BUNDLE_IO` when building the executor_runner. Currently using bundled-io requires specifying your -non delegated Aten ops manually by setting `EXECUTORCH_SELECT_OPS_LIST`. To enable ETdump, set `EXECUTORCH_BUILD_ARM_ETDUMP` when building Executorch and `DEXECUTORCH_ENABLE_EVENT_TRACER` +To enable bundled-io, set `EXECUTORCH_BUILD_DEVTOOLS` when building Executorch and `DET_BUNDLE_IO` when building the executor_runner. To enable ETdump, set `EXECUTORCH_BUILD_ARM_ETDUMP` when building Executorch and `DEXECUTORCH_ENABLE_EVENT_TRACER` when building the executor_runner. +## Memory formats + +Tensors of rank 4 and higher have two differing [memory format](https://pytorch.org/blog/tensor-memory-format-matters/) standards used. +Pytorch defaults to contiguous/ channels first/ NCHW memory formats, compared to TOSA which only supports channels last/NHWC memory format. +To support this, the backend inserts a transpose in the beginning if the incoming memory format is contiguous, and correspondingly a +transpose in the end if the outgoing memory format is contiguous. Note that this means that you may avoid transposing the data unneccessarily if the runtime integration and +full network is converted to use channels last. A word of caution must be given here however - changing memory format has been noted to have side effects such as +unsupported ops being inserted into the graph, and it is currently not widely tested, so the feature must so far be viewed as experimental. 
+ ## See Also -- [Arm Ethos-U Backend Tutorial](tutorial-arm.md) \ No newline at end of file +- [Arm Ethos-U Backend Tutorial](tutorial-arm-ethos-u.md) \ No newline at end of file diff --git a/docs/source/backends-coreml.md b/docs/source/backends-coreml.md index fe6748617a0..3ab0d3d3435 100644 --- a/docs/source/backends-coreml.md +++ b/docs/source/backends-coreml.md @@ -61,7 +61,7 @@ The Core ML partitioner API allows for configuration of the model delegation to - `skip_ops_for_coreml_delegation`: Allows you to skip ops for delegation by Core ML. By default, all ops that Core ML supports will be delegated. See [here](https://github.com/pytorch/executorch/blob/14ff52ff89a89c074fc6c14d3f01683677783dcd/backends/apple/coreml/test/test_coreml_partitioner.py#L42) for an example of skipping an op for delegation. - `compile_specs`: A list of `CompileSpec`s for the Core ML backend. These control low-level details of Core ML delegation, such as the compute unit (CPU, GPU, ANE), the iOS deployment target, and the compute precision (FP16, FP32). These are discussed more below. - `take_over_mutable_buffer`: A boolean that indicates whether PyTorch mutable buffers in stateful models should be converted to [Core ML `MLState`](https://developer.apple.com/documentation/coreml/mlstate). If set to `False`, mutable buffers in the PyTorch graph are converted to graph inputs and outputs to the Core ML lowered module under the hood. Generally, setting `take_over_mutable_buffer` to true will result in better performance, but using `MLState` requires iOS >= 18.0, macOS >= 15.0, and Xcode >= 16.0. -- `take_over_constant_data`: A boolean that indicates whether PyTorch constant data like model weights should be consumed by the Core ML delegate. If set to False, constant data is passed to the Core ML delegate as inputs. By deafault, take_over_constant_data=True. 
+- `take_over_constant_data`: A boolean that indicates whether PyTorch constant data like model weights should be consumed by the Core ML delegate. If set to False, constant data is passed to the Core ML delegate as inputs. By default, take_over_constant_data=True. - `lower_full_graph`: A boolean that indicates whether the entire graph must be lowered to Core ML. If set to True and Core ML does not support an op, an error is raised during lowering. If set to False and Core ML does not support an op, the op is executed on the CPU by ExecuTorch. Although setting `lower_full_graph`=False can allow a model to lower where it would otherwise fail, it can introduce performance overhead in the model when there are unsupported ops. You will see warnings about unsupported ops during lowering if there are any. By default, `lower_full_graph`=False. @@ -187,7 +187,7 @@ To quantize a PyTorch model for the Core ML backend, use the `CoreMLQuantizer`. Quantization with the Core ML backend requires exporting the model for iOS 17 or later. To perform 8-bit quantization with the PT2E flow, follow these steps: -1) Create a [`coremltools.optimize.torch.quantization.LinearQuantizerConfig`](https://apple.github.io/coremltools/source/coremltools.optimize.torch.quantization.html#coremltools.optimize.torch.quantization.LinearQuantizerConfig) and use to to create an instance of a `CoreMLQuantizer`. +1) Create a [`coremltools.optimize.torch.quantization.LinearQuantizerConfig`](https://apple.github.io/coremltools/source/coremltools.optimize.torch.quantization.html#coremltools.optimize.torch.quantization.LinearQuantizerConfig) and use it to create an instance of a `CoreMLQuantizer`. 2) Use `torch.export.export` to export a graph module that will be prepared for quantization. 3) Call `prepare_pt2e` to prepare the model for quantization. 4) Run the prepared model with representative samples to calibrate the quantizated tensor activation ranges. 
@@ -386,4 +386,4 @@ If you're using Python 3.13, try reducing your python version to Python 3.12. c ### At runtime 1. [ETCoreMLModelCompiler.mm:55] [Core ML] Failed to compile model, error = Error Domain=com.apple.mlassetio Code=1 "Failed to parse the model specification. Error: Unable to parse ML Program: at unknown location: Unknown opset 'CoreML7'." UserInfo={NSLocalizedDescription=Failed to par$ -This means the model requires the the Core ML opset 'CoreML7', which requires running the model on iOS >= 17 or macOS >= 14. +This means the model requires the Core ML opset 'CoreML7', which requires running the model on iOS >= 17 or macOS >= 14. diff --git a/docs/source/backends-mediatek.md b/docs/source/backends-mediatek.md index a562cea13bd..34cd56f971b 100644 --- a/docs/source/backends-mediatek.md +++ b/docs/source/backends-mediatek.md @@ -23,7 +23,7 @@ The MediaTek backend enables acceleration of PyTorch models on edge devices with ``` - NeuroPilot SDK Python wheels (download from [NeuroPilot Express SDK](https://neuropilot.mediatek.com/resources/public/npexpress/en/docs/npexpress)): ```bash - pip3 install mtk_neuron-8.2.19-py3-none-linux_x86_64.whl + pip3 install mtk_neuron-8.2.23-py3-none-linux_x86_64.whl pip3 install mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ``` diff --git a/docs/source/backends-nxp.md b/docs/source/backends-nxp.md index f02f495f685..f4f7762c769 100644 --- a/docs/source/backends-nxp.md +++ b/docs/source/backends-nxp.md @@ -1,5 +1,79 @@ # NXP eIQ Neutron Backend -See -[NXP eIQ Neutron Backend](https://github.com/pytorch/executorch/blob/main/backends/nxp/README.md) -for current status about running ExecuTorch on NXP eIQ Neutron Backend. +This manual page is dedicated to introduction of using the ExecuTorch with NXP eIQ Neutron Backend. +NXP offers accelerated machine learning models inference on edge devices. 
+To learn more about NXP's machine learning acceleration platform, please refer to [the official NXP website](https://www.nxp.com/applications/technologies/ai-and-machine-learning:MACHINE-LEARNING). + +
+For up-to-date status about running ExecuTorch on Neutron Backend please visit the manual page. +
+ +## Features + +ExecuTorch v1.0 supports running machine learning models on selected NXP chips (for now only i.MXRT700). +Among currently supported machine learning models are: +- Convolution-based neutral networks +- Full support for MobileNetV2 and CifarNet + +## Prerequisites (Hardware and Software) + +In order to successfully build ExecuTorch project and convert models for NXP eIQ Neutron Backend you will need a computer running Linux. + +If you want to test the runtime, you'll also need: +- Hardware with NXP's [i.MXRT700](https://www.nxp.com/products/i.MX-RT700) chip or a testing board like MIMXRT700-AVK +- [MCUXpresso IDE](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/mcuxpresso-integrated-development-environment-ide:MCUXpresso-IDE) or [MCUXpresso Visual Studio Code extension](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/mcuxpresso-for-visual-studio-code:MCUXPRESSO-VSC) + +## Using NXP backend + +To test converting a neural network model for inference on NXP eIQ Neutron Backend, you can use our example script: + +```shell +# cd to the root of executorch repository +./examples/nxp/aot_neutron_compile.sh [model (cifar10 or mobilenetv2)] +``` + +For a quick overview how to convert a custom PyTorch model, take a look at our [example python script](https://github.com/pytorch/executorch/tree/release/1.0/examples/nxp/aot_neutron_compile.py). + +### Partitioner API + +The partitioner is defined in `NeutronPartitioner` in `backends/nxp/neutron_partitioner.py`. It has the following +arguments: +* `compile_spec` - list of key-value pairs defining compilation. E.g. for specifying platform (i.MXRT700) and Neutron Converter flavor. +* `custom_delegation_options` - custom options for specifying node delegation. + +### Quantization + +The quantization for Neutron Backend is defined in `NeutronQuantizer` in `backends/nxp/quantizer/neutron_quantizer.py`. 
+The quantization follows PT2E workflow, INT8 quantization is supported. Operators are quantized statically, activations +follow affine and weights symmetric per-tensor quantization scheme. + +#### Supported operators + +List of Aten operators supported by Neutron quantizer: + +`abs`, `adaptive_avg_pool2d`, `addmm`, `add.Tensor`, `avg_pool2d`, `cat`, `conv1d`, `conv2d`, `dropout`, +`flatten.using_ints`, `hardtanh`, `hardtanh_`, `linear`, `max_pool2d`, `mean.dim`, `pad`, `permute`, `relu`, `relu_`, +`reshape`, `view`, `softmax.int`, `sigmoid`, `tanh`, `tanh_` + +#### Example +```python +import torch +from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer +from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e + +# Prepare your model in Aten dialect +aten_model = get_model_in_aten_dialect() +# Prepare calibration inputs, each tuple is one example, example tuple has items for each model input +calibration_inputs: list[tuple[torch.Tensor, ...]] = get_calibration_inputs() +quantizer = NeutronQuantizer() + +m = prepare_pt2e(aten_model, quantizer) +for data in calibration_inputs: + m(*data) +m = convert_pt2e(m) +``` + +## Runtime Integration + +To learn how to run the converted model on the NXP hardware, use one of our example projects on using ExecuTorch runtime from MCUXpresso IDE example projects list. +For more finegrained tutorial, visit [this manual page](https://mcuxpresso.nxp.com/mcuxsdk/latest/html/middleware/eiq/executorch/docs/nxp/topics/example_applications.html). diff --git a/docs/source/backends-overview.md b/docs/source/backends-overview.md index c83ace26853..4a3313964a8 100644 --- a/docs/source/backends-overview.md +++ b/docs/source/backends-overview.md @@ -1,21 +1,64 @@ -# Backend Overview +# Backends -ExecuTorch backends provide hardware acceleration for a specific hardware target. 
In order to achieve maximum performance on target hardware, ExecuTorch optimizes the model for a specific backend during the export and lowering process. This means that the resulting .pte file is specialized for the specific hardware. In order to deploy to multiple backends, such as Core ML on iOS and Arm CPU on Android, it is common to generate a dedicated .pte file for each. +## Backend Overview -The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requires and level of model support. See the documentation for each hardware backend for more details. +ExecuTorch backends provide hardware acceleration for specific hardware targets, enabling models to run efficiently on devices ranging from mobile phones to embedded systems and DSPs. During the export and lowering process, ExecuTorch optimizes your model for the chosen backend, resulting in a `.pte` file specialized for that hardware. To support multiple platforms (e.g., Core ML on iOS, Arm CPU on Android), you typically generate a dedicated `.pte` file for each backend. -As part of the .pte file creation process, ExecuTorch identifies portions of the model (partitions) that are supported for the given backend. These sections are processed by the backend ahead of time to support efficient execution. Portions of the model that are not supported on the delegate, if any, are executed using the portable fallback implementation on CPU. This allows for partial model acceleration when not all model operators are supported on the backend, but may have negative performance implications. In addition, multiple partitioners can be specified in order of priority. This allows for operators not supported on GPU to run on CPU via XNNPACK, for example. +The choice of backend is informed by the hardware your model will run on. Each backend has its own hardware requirements and level of model/operator support. 
See the documentation for each backend for details. -### Available Backends +As part of `.pte` file creation, ExecuTorch identifies model partitions supported by the backend. These are processed ahead of time for efficient execution. Operators not supported by the delegate are executed using the portable CPU fallback (e.g., XNNPACK), allowing for partial acceleration. You can also specify multiple partitioners in order of priority, so unsupported GPU ops can fall back to CPU, for example. -Commonly used hardware backends are listed below. For mobile, consider using XNNPACK for Android and XNNPACK or Core ML for iOS. To create a .pte file for a specific backend, pass the appropriate partitioner class to `to_edge_transform_and_lower`. See the appropriate backend documentation for more information. +--- -- [XNNPACK (Mobile CPU)](backends-xnnpack.md) -- [Core ML (iOS)](backends-coreml.md) -- [Metal Performance Shaders (iOS GPU)](backends-mps.md) -- [Vulkan (Android GPU)](backends-vulkan.md) -- [Qualcomm NPU](backends-qualcomm.md) -- [MediaTek NPU](backends-mediatek.md) -- [ARM Ethos-U NPU](backends-arm-ethos-u.md) -- [ARM VGF](backends-arm-vgf.md) -- [Cadence DSP](backends-cadence.md) +## Why Backends Matter + +Backends are the bridge between your exported model and the hardware it runs on. Choosing the right backend ensures your model takes full advantage of device-specific acceleration, balancing performance, compatibility, and resource usage. 
+ +--- + +## Choosing a Backend + +| Backend | Platform(s) | Hardware Type | Typical Use Case | +|------------------------------------------|---------------------|---------------|---------------------------------| +| [XNNPACK](backends-xnnpack) | All | CPU | General-purpose, fallback | +| [Core ML](backends-coreml) | iOS, macOS | NPU/GPU | Apple devices, high performance | +| [Metal Performance Shaders](backends-mps)| iOS, macOS | GPU | Apple GPU acceleration | +| [Vulkan ](backends-vulkan) | Android | GPU | Android GPU acceleration | +| [Qualcomm](backends-qualcomm) | Android | NPU | Qualcomm SoCs | +| [MediaTek](backends-mediatek) | Android | NPU | MediaTek SoCs | +| [ARM EthosU](backends-arm-ethos-u) | Embedded | NPU | ARM MCUs | +| [ARM VGF](backends-arm-vgf) | Android | NPU | ARM platforms | +| [OpenVINO](build-run-openvino) | Embedded | CPU/GPU/NPU | Intel SoCs | +| [NXP](backends-nxp) | Embedded | NPU | NXP SoCs | +| [Cadence](backends-cadence) | Embedded | DSP | DSP-optimized workloads | +| [Samsung Exynos](backends-samsung-exynos)| Android | NPU | Samsung SoCs | + +**Tip:** For best performance, export a `.pte` file for each backend you plan to support. + +--- + +## Best Practices + +- **Test on all target devices:** Operator support may vary by backend. +- **Use fallback wisely:** If a backend doesn't support an operator, ExecuTorch will run it on CPU. +- **Consult backend docs:** Each backend has unique setup and tuning options. 
+ +--- + +```{toctree} +:maxdepth: 1 +:hidden: +:caption: Backend Overview + +backends-xnnpack +backends-coreml +backends-mps +backends-vulkan +backends-qualcomm +backends-mediatek +backends-arm-ethos-u +backends-arm-vgf +build-run-openvino +backends-nxp +backends-cadence +backends-samsung-exynos diff --git a/docs/source/backends-qualcomm.md b/docs/source/backends-qualcomm.md index 45f932da491..74089885fcf 100644 --- a/docs/source/backends-qualcomm.md +++ b/docs/source/backends-qualcomm.md @@ -74,10 +74,9 @@ This example is verified with SM8550 and SM8450. - A compiler to compile AOT parts, e.g., the GCC compiler comes with Ubuntu LTS. - [Android NDK](https://developer.android.com/ndk). This example is verified with NDK 26c. - [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) - - Click the "Get Software" button to download a version of QNN SDK. - - However, at the moment of updating this tutorial, the above website doesn't provide QNN SDK newer than 2.22.6. - - The below is public links to download various QNN versions. Hope they can be publicly discoverable soon. - - [QNN 2.28.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.28.0.241029.zip) + - Click the "Get Software" button to download the latest version of the QNN SDK. + - Although newer versions are available, we have verified and recommend using QNN 2.37.0 for stability. + - You can download it directly from the following link: [QNN 2.37.0](https://softwarecenter.qualcomm.com/api/download/software/sdks/Qualcomm_AI_Runtime_Community/All/2.37.0.250724/v2.37.0.250724.zip) The directory with installed Qualcomm AI Engine Direct SDK looks like: ``` @@ -136,86 +135,6 @@ cd $EXECUTORCH_ROOT ./backends/qualcomm/scripts/build.sh --release ``` -### AOT (Ahead-of-time) components: - -Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct binary. 
- -```bash -cd $EXECUTORCH_ROOT -mkdir build-x86 -cd build-x86 -# Note that the below command might change. -# Please refer to the above build.sh for latest workable commands. -cmake .. \ - -DCMAKE_INSTALL_PREFIX=$PWD \ - -DEXECUTORCH_BUILD_QNN=ON \ - -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ - -DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DPYTHON_EXECUTABLE=python3 - -# nproc is used to detect the number of available CPU. -# If it is not applicable, please feel free to use the number you want. -cmake --build $PWD --target "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j$(nproc) - -# install Python APIs to correct import path -# The filename might vary depending on your Python and host version. -cp -f backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python -cp -f backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python - -# Workaround for .fbs files in exir/_serialize -cp $EXECUTORCH_ROOT/schema/program.fbs $EXECUTORCH_ROOT/exir/_serialize/program.fbs -cp $EXECUTORCH_ROOT/schema/scalar_type.fbs $EXECUTORCH_ROOT/exir/_serialize/scalar_type.fbs -``` - -### Runtime: - -An example `qnn_executor_runner` executable would be used to run the compiled `pte` model. - -Commands to build `qnn_executor_runner` for Android: - -```bash -cd $EXECUTORCH_ROOT -mkdir build-android -cd build-android -# build executorch & qnn_executorch_backend -cmake .. 
\ - -DCMAKE_INSTALL_PREFIX=$PWD \ - -DEXECUTORCH_BUILD_QNN=ON \ - -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ - -DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DPYTHON_EXECUTABLE=python3 \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI='arm64-v8a' \ - -DANDROID_PLATFORM=android-30 - -# nproc is used to detect the number of available CPU. -# If it is not applicable, please feel free to use the number you want. -cmake --build $PWD --target install -j$(nproc) - -cmake ../examples/qualcomm \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI='arm64-v8a' \ - -DANDROID_PLATFORM=android-30 \ - -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \ - -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ - -DPYTHON_EXECUTABLE=python3 \ - -Bexamples/qualcomm - -cmake --build examples/qualcomm -j$(nproc) - -# qnn_executor_runner can be found under examples/qualcomm -# The full path is $EXECUTORCH_ROOT/build-android/examples/qualcomm/executor_runner/qnn_executor_runner -ls examples/qualcomm -``` - -**Note:** If you want to build for release, add `-DCMAKE_BUILD_TYPE=Release` to the `cmake` command options. 
- ## Deploying and running on device @@ -315,9 +234,11 @@ adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV79Stub.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v79/unsigned/libQnnHtpV79Skel.so ${DEVICE_DIR} ``` ***Step 2***. We also need to indicate dynamic linkers on Android and Hexagon @@ -363,13 +284,13 @@ The model, inputs, and output location are passed to `qnn_executorch_runner` by ## Supported model list -Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `EXECUTORCH_ROOT/examples/qualcomm/oss_scripts/` to the list of supported models. +Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `$EXECUTORCH_ROOT/examples/qualcomm/oss_scripts/` to the list of supported models. 
## How to Support a Custom Model in HTP Backend ### Step-by-Step Implementation Guide -Please reference [the simple example](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/export_example.py) and [more compilated examples](https://github.com/pytorch/executorch/tree/main/examples/qualcomm/scripts) for reference +Please reference [the simple example](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/export_example.py) and [more complicated examples](https://github.com/pytorch/executorch/tree/main/examples/qualcomm/scripts) for reference #### Step 1: Prepare Your Model ```python import torch @@ -476,4 +397,4 @@ print(f"Model successfully exported to {model_name}") ## FAQ If you encounter any issues while reproducing the tutorial, please file a github -issue on ExecuTorch repo and tag use `#qcom_aisw` tag +[issue](https://github.com/pytorch/executorch/issues) on ExecuTorch repo and tag use `#qcom_aisw` tag diff --git a/docs/source/backends-samsung-exynos.md b/docs/source/backends-samsung-exynos.md new file mode 100644 index 00000000000..0d77936bf7f --- /dev/null +++ b/docs/source/backends-samsung-exynos.md @@ -0,0 +1 @@ +# Samsung Exynos Backend (TBD) diff --git a/docs/source/backends-section.md b/docs/source/backends-section.md new file mode 100644 index 00000000000..29a235a9416 --- /dev/null +++ b/docs/source/backends-section.md @@ -0,0 +1 @@ +```{include} backends-overview.md diff --git a/docs/source/backends-vulkan.md b/docs/source/backends-vulkan.md index 3ae80950645..531deece4e2 100644 --- a/docs/source/backends-vulkan.md +++ b/docs/source/backends-vulkan.md @@ -150,7 +150,7 @@ when building with CMake. First, make sure that you have the Android NDK installed; any NDK version past NDK r19c should work. Note that the examples in this doc have been validated with -NDK r27b. The Android SDK should also be installed so that you have access to `adb`. +NDK r28c. 
The Android SDK should also be installed so that you have access to `adb`. The instructions in this page assumes that the following environment variables are set. diff --git a/docs/source/backends-xnnpack.md b/docs/source/backends-xnnpack.md index d1a120e69fa..42e76741ec8 100644 --- a/docs/source/backends-xnnpack.md +++ b/docs/source/backends-xnnpack.md @@ -67,10 +67,11 @@ The XNNPACK delegate can also be used as a backend to execute symmetrically quan ### Supported Quantization Schemes The XNNPACK delegate supports the following quantization schemes: + - 8-bit symmetric weights with 8-bit asymmetric activations (via the PT2E quantization flow). - - Supports both static and dynamic activations. - - Supports per-channel and per-tensor schemes. - - Supports linear, convolution, add, mul, cat, and adaptive avg pool 2d operators. + - Supports both static and dynamic activations. + - Supports per-channel and per-tensor schemes. + - Supports linear, convolution, add, mul, cat, and adaptive avg pool 2d operators. Weight-only quantization is not currently supported on XNNPACK. @@ -81,7 +82,7 @@ To perform 8-bit quantization with the PT2E flow, perform the following steps pr 1) Create an instance of the `XnnpackQuantizer` class. Set quantization parameters. 2) Use `torch.export.export` to prepare for quantization. 3) Call `prepare_pt2e` to prepare the model for quantization. -4) For static quantization, run the prepared model with representative samples to calibrate the quantizated tensor activation ranges. +4) For static quantization, run the prepared model with representative samples to calibrate the quantized tensor activation ranges. 5) Call `convert_pt2e` to quantize the model. 6) Export and lower the model using the standard flow. 
diff --git a/docs/source/build-run-openvino.md b/docs/source/build-run-openvino.md index dc6f098850f..9b4c48fee5a 100644 --- a/docs/source/build-run-openvino.md +++ b/docs/source/build-run-openvino.md @@ -61,7 +61,7 @@ For more information about OpenVINO build, refer to the [OpenVINO Build Instruct Follow the steps below to setup your build environment: -1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](getting-started-setup.md#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment. +1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](using-executorch-building-from-source.md#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment. 2. **Setup OpenVINO Backend Environment** - Install the dependent libs. Ensure that you are inside `executorch/backends/openvino/` directory @@ -92,7 +92,7 @@ The exported model will be saved as 'resnet50.pte' in the current directory. ### Build C++ OpenVINO Examples -After building the OpenVINO backend following the [instructions](#setup) above, the executable will be saved in `/cmake-out/backends/openvino/`. +After building the OpenVINO backend following the [instructions](#setup) above, the executable will be saved in `/cmake-out/`. The executable requires a model file (`.pte` file generated in the aot step) and the number of inference executions. @@ -101,7 +101,7 @@ The executable requires a model file (`.pte` file generated in the aot step) and Run inference with a given model for 10 executions: ``` -./openvino_executor_runner \ +./executor_runner \ --model_path=model.pte \ --num_executions=10 ``` diff --git a/docs/source/bundled-io.md b/docs/source/bundled-io.md index 79897737268..c0b03938374 100644 --- a/docs/source/bundled-io.md +++ b/docs/source/bundled-io.md @@ -17,7 +17,7 @@ This stage mainly focuses on the creation of a `BundledProgram` and dumping it o ### Step 1: Create a Model and Emit its ExecuTorch Program. 
-ExecuTorch Program can be emitted from user's model by using ExecuTorch APIs. Follow the [Generate and emit sample ExecuTorch program](getting-started.md#exporting) or [Exporting to ExecuTorch tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial). +ExecuTorch Program can be emitted from user's model by using ExecuTorch APIs. Follow the [Generate and emit sample ExecuTorch program](getting-started.md#exporting) or [Exporting to ExecuTorch tutorial](tutorials/export-to-executorch-tutorial) . ### Step 2: Construct `List[MethodTestSuite]` to hold test info @@ -194,7 +194,7 @@ regenerate_bundled_program = deserialize_from_flatbuffer_to_bundled_program(seri ``` ## Runtime Stage -This stage mainly focuses on executing the model with the bundled inputs and and comparing the model's output with the bundled expected output. We provide multiple APIs to handle the key parts of it. +This stage mainly focuses on executing the model with the bundled inputs and comparing the model's output with the bundled expected output. We provide multiple APIs to handle the key parts of it. ### Get ExecuTorch Program Pointer from `BundledProgram` Buffer diff --git a/docs/source/compiler-delegate-and-partitioner.md b/docs/source/compiler-delegate-and-partitioner.md index c633bb1fd12..b057f3afa2e 100644 --- a/docs/source/compiler-delegate-and-partitioner.md +++ b/docs/source/compiler-delegate-and-partitioner.md @@ -1,4 +1,4 @@ -# Backends and Delegates +# Understanding Backends and Delegates Audience: Vendors, Backend Delegate developers, who are interested in integrating their own compilers and hardware as part of ExecuTorch @@ -37,7 +37,7 @@ The diagram looks like following There are mainly two Ahead-of-Time entry point for backend to implement: `partition` and `preprocess`. `partitioner` is an algorithm implemented by the backend to tag the nodes to be lowered to the backend. 
`to_backend` API will apply the partition algorithm and lower each subgraph, which consists of connected tagged nodes, to the targeted backend. Every subgraph -will be sent to the `preprocess` part provided by the backend to compiled as a binary blob. +will be sent to the `preprocess` part provided by the backend to be compiled as a binary blob. During partition, the `exported_program` is not allowed to mutate the program, and it's supposed to apply tag to each node. The `PartitionResult` includes both tagged exported program and the partition tags dictionary for `to_backend` to look up the tag and @@ -194,8 +194,8 @@ qnnpack is one backend and xnnpack is another backend. We haven't open-sourced these two backends delegates yet, and this example won't run out of box. It can be used as a reference to see how it can be done. -This option is easy to try becuase usually all backends will implement their own -parititioner. However this option may get different results if we change the +This option is easy to try because usually all backends will implement their own +partitioner. However this option may get different results if we change the order of to_backend call. If we want to have a better control on the nodes, like which backend they should go, option 2 is better. diff --git a/docs/source/compiler-entry-points.md b/docs/source/compiler-entry-points.md new file mode 100644 index 00000000000..ac5623c6769 --- /dev/null +++ b/docs/source/compiler-entry-points.md @@ -0,0 +1,9 @@ +# Compiler Entry Points + +```{toctree} +:maxdepth: 1 + +compiler-backend-dialect +compiler-custom-compiler-passes +compiler-memory-planning +``` diff --git a/docs/source/compiler-ir-advanced.md b/docs/source/compiler-ir-advanced.md new file mode 100644 index 00000000000..b6d24026d5a --- /dev/null +++ b/docs/source/compiler-ir-advanced.md @@ -0,0 +1,31 @@ +(compiler-ir-advanced)= +# Compiler & IR + +Advanced compiler features and intermediate representation specifications. 
+ +## Compiler Passes + +- {doc}`compiler-custom-compiler-passes` — Custom compiler passes and optimization + +## Memory Management + +- {doc}`compiler-memory-planning` — Advanced memory planning strategies + +## Intermediate Representation + +- {doc}`ir-exir` — EXIR (Export Intermediate Representation) specification +- {doc}`ir-ops-set-definition` — Ops set definition and operator standardization + +## Backend dialect + +- {doc}`compiler-backend-dialect` — Backend dialect and compiler integration + +```{toctree} +:hidden: +:maxdepth: 1 + +compiler-custom-compiler-passes +compiler-memory-planning +ir-exir +ir-ops-set-definition +compiler-backend-dialect diff --git a/docs/source/conf.py b/docs/source/conf.py index 65845c03868..31abdef2820 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,7 +24,7 @@ import sys from typing import Any -import pytorch_sphinx_theme +import pytorch_sphinx_theme2 # type: ignore[import-not-found] # To let us import ./custom_directives.py sys.path.insert(0, os.path.abspath(".")) @@ -63,13 +63,10 @@ "sphinx_design", "sphinx_gallery.gen_gallery", "sphinx_reredirects", + "sphinx_sitemap", + "sphinxcontrib.mermaid", ] -if not FBCODE: - extensions += [ - "executorch_custom_versions", - ] - this_file_dir = os.path.abspath(os.path.dirname(__file__)) doxygen_xml_dir = os.path.join( os.path.dirname(this_file_dir), # {repo_root}/docs/ @@ -77,7 +74,7 @@ "xml", # {repo_root}/docs/cpp/build/xml ) -html_favicon = "_static/img/ExecuTorch-Logo-cropped.svg" +html_favicon = "_static/img/executorch-chip-logo.svg" # Get ET_VERSION_DOCS during the build. 
et_version_docs = os.environ.get("ET_VERSION_DOCS", None) @@ -99,14 +96,23 @@ print(f"Version: {version}") html_title = " ".join((project, version, "documentation")) +html_baseurl = "https://docs.pytorch.org/executorch/" # needed for sphinx-sitemap +sitemap_locales = [None] +sitemap_excludes = [ + "search.html", + "genindex.html", +] +sitemap_url_scheme = "{link}" + breathe_projects = {"ExecuTorch": "../build/xml/"} breathe_default_project = "ExecuTorch" -templates_path = ["_templates"] autodoc_typehints = "description" myst_enable_extensions = [ "colon_fence", + "deflist", + "html_image", ] myst_heading_anchors = 4 @@ -162,23 +168,78 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "pytorch_sphinx_theme" -html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] +html_theme = "pytorch_sphinx_theme2" +html_theme_path = [pytorch_sphinx_theme2.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # + +switcher_version = version + html_theme_options = { + "logo": { + "image_light": "_static/img/et-logo.png", + "image_dark": "_static/img/et-logo.png", + }, + "navigation_with_keys": False, + "canonical_url": "https://docs.pytorch.org/executorch/stable/", + "switcher": { + "json_url": "https://docs.pytorch.org/executorch/executorch-versions.json", # for testing only, will need to replace to the correct json file on the executorch website when it's added in the repo. 
+ "version_match": switcher_version, + }, + "show_toc_level": 2, + "analytics_id": "GTM-T8XT4PS", + "icon_links": [ + { + "name": "X", + "url": "https://x.com/PyTorch", + "icon": "fa-brands fa-x-twitter", + }, + { + "name": "GitHub", + "url": "https://github.com/pytorch/executorch", + "icon": "fa-brands fa-github", + }, + { + "name": "Discourse", + "url": "https://discuss.pytorch.org/", + "icon": "fa-brands fa-discourse", + }, + { + "name": "PyPi", + "url": "https://pypi.org/project/executorch", + "icon": "fa-brands fa-python", + }, + ], + "show_version_warning_banner": True, + "use_edit_page_button": True, + "header_links_before_dropdown": 8, + "navbar_align": "left", + "navbar_start": ["navbar-logo", "version-switcher"], + "navbar_center": ["navbar-nav"], + "navbar_end": ["search-field-custom", "theme-switcher", "navbar-icon-links"], + "navbar_persistent": [], +} + +theme_variables = pytorch_sphinx_theme2.get_theme_variables() +templates_path = [ + "_templates", + os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"), +] + +html_context = { + "theme_variables": theme_variables, + "display_github": True, + "github_url": "https://github.com", + "github_user": "pytorch", + "github_repo": "executorch", + "feedback_url": "https://github.com/pytorch/executorch", + "github_version": "main", + "doc_path": "docs/source", "pytorch_project": "executorch", "display_version": True, - "logo_only": True, - "collapse_navigation": True, # changed to True to enable 3rd level nav. - "sticky_navigation": False, - "navigation_depth": 4, - "includehidden": True, - "titles_only": False, - "analytics_id": "GTM-T8XT4PS", } # Add any paths that contain custom static files (such as style sheets) here, @@ -186,14 +247,15 @@ # so a file named "default.css" will overwrite the builtin "default.css". 
html_static_path = ["_static"] -html_css_files = ["css/custom.css", "progress-bar.css"] -html_js_files = ["js/progress-bar.js"] +# Add custom 404 page for GitHub Pages +html_additional_pages = {"404": "404.html"} + # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { "python": ("https://docs.python.org/", None), "numpy": ("https://numpy.org/doc/stable/", None), - "torch": ("https://pytorch.org/docs/stable/", None), + "torch": ("https://docs.pytorch.org/docs/stable/", None), } # Redirects for moved pages diff --git a/docs/source/debug-backend-delegate.md b/docs/source/debug-backend-delegate.md index 86dddd75868..efb4653a994 100644 --- a/docs/source/debug-backend-delegate.md +++ b/docs/source/debug-backend-delegate.md @@ -6,60 +6,607 @@ We provide a list of util functions to give users insights on what happened to t The `get_delegation_info()` method provides a summary of what happened to the model after the `to_backend()` call: ```python +import torch +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.exir import to_edge_transform_and_lower +from torch.export import Dim, export +from torchvision.models.mobilenetv2 import MobileNet_V2_Weights +import torchvision.models as models + +# Dependency needed for debugging delegates from executorch.devtools.backend_debug import get_delegation_info from tabulate import tabulate -# ... 
After call to to_backend(), but before to_executorch() -graph_module = edge_manager.exported_program().graph_module + +model = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() +sample_inputs = (torch.randn(1, 3, 224, 224), ) + +et_program = to_edge_transform_and_lower( + torch.export.export(model, sample_inputs), + partitioner=[XnnpackPartitioner()] +) +graph_module = et_program.exported_program().graph_module delegation_info = get_delegation_info(graph_module) +# print the summary like the number of delegated nodes, non-delegated nodes, etc print(delegation_info.get_summary()) df = delegation_info.get_operator_delegation_dataframe() +# print the table including op_type, occurrences_in_delegated_graphs, occurrences_in_non_delegated_graphs print(tabulate(df, headers="keys", tablefmt="fancy_grid")) ``` Example printout: ``` -Total delegated subgraphs: 86 -Number of delegated nodes: 473 -Number of non-delegated nodes: 430 +Total delegated subgraphs: 2 +Number of delegated nodes: 203 +Number of non-delegated nodes: 4 ``` +| | op_type | occurrences_in_delegated_graphs | occurrences_in_non_delegated_graphs | +|----|---------------------------------------------------|---------------------------------|-------------------------------------| +| 0 | aten__native_batch_norm_legit_no_training_default | 52 | 0 | +| 1 | aten_add_tensor | 10 | 0 | +| 2 | aten_convolution_default | 52 | 0 | +| 3 | aten_hardtanh_default | 35 | 0 | +| 4 | aten_linear_default | 1 | 0 | +| 5 | aten_mean_dim | 1 | 0 | +| 6 | aten_view_copy_default | 0 | 1 | +| 7 | dim_order_ops__clone_dim_order_default | 0 | 1 | +| 8 | getitem | 52 | 2 | +| 9 | **Total** | **203** | **4** | -| | op_type | occurrences_in_delegated_graphs | occurrences_in_non_delegated_graphs | -|----|---------------------------------|------- |-----| -| 0 | aten__softmax_default | 12 | 0 | -| 1 | aten_add_tensor | 37 | 0 | -| 2 | aten_addmm_default | 48 | 0 | -| 3 | aten_arange_start_step | 0 | 25 | -| | ... 
| | | -| 23 | aten_view_copy_default | 170 | 48 | -| | ... | | | -| 26 | Total | 473 | 430 | -From the table, the operator `aten_view_copy_default` appears 170 times in delegate graphs and 48 times in non-delegated graphs. Users can use information like this to debug. +From the table, the operator `aten_view_copy_default` appears 0 times in delegate graphs and 1 times in non-delegated graphs. Users can use information like this to debug. `get_item node` is a special case, it means getting the output from the delegate subgraph. ## Visualize delegated graph -To see a more detailed view, use the `format_delegated_graph()` method to get a str of printout of the whole graph or use `print_delegated_graph()` to print directly: +To see a more detailed view, use the `format_delegated_graph()` method to get a string representation of the entire graph or use `print_delegated_graph()` to print directly: ```python from executorch.exir.backend.utils import format_delegated_graph -graph_module = edge_manager.exported_program().graph_module +graph_module = et_program.exported_program().graph_module print(format_delegated_graph(graph_module)) # or call print_delegated_graph(graph_module) ``` -It will print the whole model as well as the subgraph consumed by the backend. The generic debug function provided by fx like `print_tabular()` or `print_readable()` will only show `call_delegate` but hide the the subgraph consumes by the backend, while this function exposes the contents inside the subgraph. +It will print the whole model as well as the subgraph consumed by the backend. The generic debug function provided by fx like `print_tabular()` or `print_readable()` will only show `call_delegate` and hide the subgraph consumed by the backend, while this function exposes the contents inside the subgraph. -In the example printout below, observe that `embedding` and `add` operators are delegated to `XNNPACK` while the `sub` operator is not. 
+In the example printout below, observe that there are two subgraphs; `aten_view_copy_default` is not delegated, while most of the other ops are delegated. +
``` -%aten_unsqueeze_copy_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_23, -2), kwargs = {}) - %aten_unsqueeze_copy_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_24, -1), kwargs = {}) +graph(): + %b_features_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_0_1_num_batches_tracked] + %b_features_1_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_1_conv_0_1_num_batches_tracked] + %b_features_1_conv_2_num_batches_tracked : [num_users=0] = placeholder[target=b_features_1_conv_2_num_batches_tracked] + %b_features_2_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_2_conv_0_1_num_batches_tracked] + %b_features_2_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_2_conv_1_1_num_batches_tracked] + %b_features_2_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_2_conv_3_num_batches_tracked] + %b_features_3_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_3_conv_0_1_num_batches_tracked] + %b_features_3_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_3_conv_1_1_num_batches_tracked] + %b_features_3_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_3_conv_3_num_batches_tracked] + %b_features_4_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_4_conv_0_1_num_batches_tracked] + %b_features_4_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_4_conv_1_1_num_batches_tracked] + %b_features_4_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_4_conv_3_num_batches_tracked] + %b_features_5_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_5_conv_0_1_num_batches_tracked] + 
%b_features_5_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_5_conv_1_1_num_batches_tracked] + %b_features_5_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_5_conv_3_num_batches_tracked] + %b_features_6_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_6_conv_0_1_num_batches_tracked] + %b_features_6_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_6_conv_1_1_num_batches_tracked] + %b_features_6_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_6_conv_3_num_batches_tracked] + %b_features_7_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_7_conv_0_1_num_batches_tracked] + %b_features_7_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_7_conv_1_1_num_batches_tracked] + %b_features_7_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_7_conv_3_num_batches_tracked] + %b_features_8_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_8_conv_0_1_num_batches_tracked] + %b_features_8_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_8_conv_1_1_num_batches_tracked] + %b_features_8_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_8_conv_3_num_batches_tracked] + %b_features_9_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_9_conv_0_1_num_batches_tracked] + %b_features_9_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_9_conv_1_1_num_batches_tracked] + %b_features_9_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_9_conv_3_num_batches_tracked] + %b_features_10_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_10_conv_0_1_num_batches_tracked] + %b_features_10_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_10_conv_1_1_num_batches_tracked] + 
%b_features_10_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_10_conv_3_num_batches_tracked] + %b_features_11_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_11_conv_0_1_num_batches_tracked] + %b_features_11_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_11_conv_1_1_num_batches_tracked] + %b_features_11_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_11_conv_3_num_batches_tracked] + %b_features_12_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_12_conv_0_1_num_batches_tracked] + %b_features_12_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_12_conv_1_1_num_batches_tracked] + %b_features_12_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_12_conv_3_num_batches_tracked] + %b_features_13_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_13_conv_0_1_num_batches_tracked] + %b_features_13_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_13_conv_1_1_num_batches_tracked] + %b_features_13_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_13_conv_3_num_batches_tracked] + %b_features_14_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_14_conv_0_1_num_batches_tracked] + %b_features_14_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_14_conv_1_1_num_batches_tracked] + %b_features_14_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_14_conv_3_num_batches_tracked] + %b_features_15_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_15_conv_0_1_num_batches_tracked] + %b_features_15_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_15_conv_1_1_num_batches_tracked] + %b_features_15_conv_3_num_batches_tracked : [num_users=0] = 
placeholder[target=b_features_15_conv_3_num_batches_tracked] + %b_features_16_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_16_conv_0_1_num_batches_tracked] + %b_features_16_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_16_conv_1_1_num_batches_tracked] + %b_features_16_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_16_conv_3_num_batches_tracked] + %b_features_17_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_17_conv_0_1_num_batches_tracked] + %b_features_17_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_17_conv_1_1_num_batches_tracked] + %b_features_17_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_17_conv_3_num_batches_tracked] + %b_features_18_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_18_1_num_batches_tracked] + %x : [num_users=1] = placeholder[target=x] %lowered_module_0 : [num_users=1] = get_attr[target=lowered_module_0] backend_id: XnnpackBackend lowered graph(): - %aten_embedding_default : [num_users=1] = placeholder[target=aten_embedding_default] - %aten_embedding_default_1 : [num_users=1] = placeholder[target=aten_embedding_default_1] - %aten_add_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_embedding_default, %aten_embedding_default_1), kwargs = {}) - return (aten_add_tensor,) - %executorch_call_delegate : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %aten_embedding_default, %aten_embedding_default_1), kwargs = {}) - %aten_sub_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.sub.Tensor](args = (%aten_unsqueeze_copy_default, %aten_unsqueeze_copy_default_1), kwargs = {}) + %p_features_0_0_weight : [num_users=1] = placeholder[target=p_features_0_0_weight] + %p_features_0_1_weight : 
[num_users=1] = placeholder[target=p_features_0_1_weight] + %p_features_0_1_bias : [num_users=1] = placeholder[target=p_features_0_1_bias] + %p_features_1_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_1_conv_0_0_weight] + %p_features_1_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_1_conv_0_1_weight] + %p_features_1_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_1_conv_0_1_bias] + %p_features_1_conv_1_weight : [num_users=1] = placeholder[target=p_features_1_conv_1_weight] + %p_features_1_conv_2_weight : [num_users=1] = placeholder[target=p_features_1_conv_2_weight] + %p_features_1_conv_2_bias : [num_users=1] = placeholder[target=p_features_1_conv_2_bias] + %p_features_2_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_2_conv_0_0_weight] + %p_features_2_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_2_conv_0_1_weight] + %p_features_2_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_2_conv_0_1_bias] + %p_features_2_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_2_conv_1_0_weight] + %p_features_2_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_2_conv_1_1_weight] + %p_features_2_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_2_conv_1_1_bias] + %p_features_2_conv_2_weight : [num_users=1] = placeholder[target=p_features_2_conv_2_weight] + %p_features_2_conv_3_weight : [num_users=1] = placeholder[target=p_features_2_conv_3_weight] + %p_features_2_conv_3_bias : [num_users=1] = placeholder[target=p_features_2_conv_3_bias] + %p_features_3_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_3_conv_0_0_weight] + %p_features_3_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_3_conv_0_1_weight] + %p_features_3_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_3_conv_0_1_bias] + %p_features_3_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_3_conv_1_0_weight] + 
%p_features_3_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_3_conv_1_1_weight] + %p_features_3_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_3_conv_1_1_bias] + %p_features_3_conv_2_weight : [num_users=1] = placeholder[target=p_features_3_conv_2_weight] + %p_features_3_conv_3_weight : [num_users=1] = placeholder[target=p_features_3_conv_3_weight] + %p_features_3_conv_3_bias : [num_users=1] = placeholder[target=p_features_3_conv_3_bias] + %p_features_4_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_4_conv_0_0_weight] + %p_features_4_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_4_conv_0_1_weight] + %p_features_4_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_4_conv_0_1_bias] + %p_features_4_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_4_conv_1_0_weight] + %p_features_4_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_4_conv_1_1_weight] + %p_features_4_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_4_conv_1_1_bias] + %p_features_4_conv_2_weight : [num_users=1] = placeholder[target=p_features_4_conv_2_weight] + %p_features_4_conv_3_weight : [num_users=1] = placeholder[target=p_features_4_conv_3_weight] + %p_features_4_conv_3_bias : [num_users=1] = placeholder[target=p_features_4_conv_3_bias] + %p_features_5_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_5_conv_0_0_weight] + %p_features_5_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_5_conv_0_1_weight] + %p_features_5_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_5_conv_0_1_bias] + %p_features_5_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_5_conv_1_0_weight] + %p_features_5_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_5_conv_1_1_weight] + %p_features_5_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_5_conv_1_1_bias] + %p_features_5_conv_2_weight : [num_users=1] = 
placeholder[target=p_features_5_conv_2_weight] + %p_features_5_conv_3_weight : [num_users=1] = placeholder[target=p_features_5_conv_3_weight] + %p_features_5_conv_3_bias : [num_users=1] = placeholder[target=p_features_5_conv_3_bias] + %p_features_6_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_6_conv_0_0_weight] + %p_features_6_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_6_conv_0_1_weight] + %p_features_6_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_6_conv_0_1_bias] + %p_features_6_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_6_conv_1_0_weight] + %p_features_6_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_6_conv_1_1_weight] + %p_features_6_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_6_conv_1_1_bias] + %p_features_6_conv_2_weight : [num_users=1] = placeholder[target=p_features_6_conv_2_weight] + %p_features_6_conv_3_weight : [num_users=1] = placeholder[target=p_features_6_conv_3_weight] + %p_features_6_conv_3_bias : [num_users=1] = placeholder[target=p_features_6_conv_3_bias] + %p_features_7_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_7_conv_0_0_weight] + %p_features_7_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_7_conv_0_1_weight] + %p_features_7_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_7_conv_0_1_bias] + %p_features_7_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_7_conv_1_0_weight] + %p_features_7_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_7_conv_1_1_weight] + %p_features_7_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_7_conv_1_1_bias] + %p_features_7_conv_2_weight : [num_users=1] = placeholder[target=p_features_7_conv_2_weight] + %p_features_7_conv_3_weight : [num_users=1] = placeholder[target=p_features_7_conv_3_weight] + %p_features_7_conv_3_bias : [num_users=1] = placeholder[target=p_features_7_conv_3_bias] + %p_features_8_conv_0_0_weight 
: [num_users=1] = placeholder[target=p_features_8_conv_0_0_weight] + %p_features_8_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_8_conv_0_1_weight] + %p_features_8_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_8_conv_0_1_bias] + %p_features_8_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_8_conv_1_0_weight] + %p_features_8_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_8_conv_1_1_weight] + %p_features_8_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_8_conv_1_1_bias] + %p_features_8_conv_2_weight : [num_users=1] = placeholder[target=p_features_8_conv_2_weight] + %p_features_8_conv_3_weight : [num_users=1] = placeholder[target=p_features_8_conv_3_weight] + %p_features_8_conv_3_bias : [num_users=1] = placeholder[target=p_features_8_conv_3_bias] + %p_features_9_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_9_conv_0_0_weight] + %p_features_9_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_9_conv_0_1_weight] + %p_features_9_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_9_conv_0_1_bias] + %p_features_9_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_9_conv_1_0_weight] + %p_features_9_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_9_conv_1_1_weight] + %p_features_9_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_9_conv_1_1_bias] + %p_features_9_conv_2_weight : [num_users=1] = placeholder[target=p_features_9_conv_2_weight] + %p_features_9_conv_3_weight : [num_users=1] = placeholder[target=p_features_9_conv_3_weight] + %p_features_9_conv_3_bias : [num_users=1] = placeholder[target=p_features_9_conv_3_bias] + %p_features_10_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_10_conv_0_0_weight] + %p_features_10_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_10_conv_0_1_weight] + %p_features_10_conv_0_1_bias : [num_users=1] = 
placeholder[target=p_features_10_conv_0_1_bias] + %p_features_10_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_10_conv_1_0_weight] + %p_features_10_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_10_conv_1_1_weight] + %p_features_10_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_10_conv_1_1_bias] + %p_features_10_conv_2_weight : [num_users=1] = placeholder[target=p_features_10_conv_2_weight] + %p_features_10_conv_3_weight : [num_users=1] = placeholder[target=p_features_10_conv_3_weight] + %p_features_10_conv_3_bias : [num_users=1] = placeholder[target=p_features_10_conv_3_bias] + %p_features_11_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_11_conv_0_0_weight] + %p_features_11_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_11_conv_0_1_weight] + %p_features_11_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_11_conv_0_1_bias] + %p_features_11_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_11_conv_1_0_weight] + %p_features_11_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_11_conv_1_1_weight] + %p_features_11_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_11_conv_1_1_bias] + %p_features_11_conv_2_weight : [num_users=1] = placeholder[target=p_features_11_conv_2_weight] + %p_features_11_conv_3_weight : [num_users=1] = placeholder[target=p_features_11_conv_3_weight] + %p_features_11_conv_3_bias : [num_users=1] = placeholder[target=p_features_11_conv_3_bias] + %p_features_12_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_12_conv_0_0_weight] + %p_features_12_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_12_conv_0_1_weight] + %p_features_12_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_12_conv_0_1_bias] + %p_features_12_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_12_conv_1_0_weight] + %p_features_12_conv_1_1_weight : [num_users=1] = 
placeholder[target=p_features_12_conv_1_1_weight] + %p_features_12_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_12_conv_1_1_bias] + %p_features_12_conv_2_weight : [num_users=1] = placeholder[target=p_features_12_conv_2_weight] + %p_features_12_conv_3_weight : [num_users=1] = placeholder[target=p_features_12_conv_3_weight] + %p_features_12_conv_3_bias : [num_users=1] = placeholder[target=p_features_12_conv_3_bias] + %p_features_13_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_13_conv_0_0_weight] + %p_features_13_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_13_conv_0_1_weight] + %p_features_13_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_13_conv_0_1_bias] + %p_features_13_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_13_conv_1_0_weight] + %p_features_13_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_13_conv_1_1_weight] + %p_features_13_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_13_conv_1_1_bias] + %p_features_13_conv_2_weight : [num_users=1] = placeholder[target=p_features_13_conv_2_weight] + %p_features_13_conv_3_weight : [num_users=1] = placeholder[target=p_features_13_conv_3_weight] + %p_features_13_conv_3_bias : [num_users=1] = placeholder[target=p_features_13_conv_3_bias] + %p_features_14_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_14_conv_0_0_weight] + %p_features_14_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_14_conv_0_1_weight] + %p_features_14_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_14_conv_0_1_bias] + %p_features_14_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_14_conv_1_0_weight] + %p_features_14_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_14_conv_1_1_weight] + %p_features_14_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_14_conv_1_1_bias] + %p_features_14_conv_2_weight : [num_users=1] = 
placeholder[target=p_features_14_conv_2_weight] + %p_features_14_conv_3_weight : [num_users=1] = placeholder[target=p_features_14_conv_3_weight] + %p_features_14_conv_3_bias : [num_users=1] = placeholder[target=p_features_14_conv_3_bias] + %p_features_15_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_15_conv_0_0_weight] + %p_features_15_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_15_conv_0_1_weight] + %p_features_15_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_15_conv_0_1_bias] + %p_features_15_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_15_conv_1_0_weight] + %p_features_15_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_15_conv_1_1_weight] + %p_features_15_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_15_conv_1_1_bias] + %p_features_15_conv_2_weight : [num_users=1] = placeholder[target=p_features_15_conv_2_weight] + %p_features_15_conv_3_weight : [num_users=1] = placeholder[target=p_features_15_conv_3_weight] + %p_features_15_conv_3_bias : [num_users=1] = placeholder[target=p_features_15_conv_3_bias] + %p_features_16_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_16_conv_0_0_weight] + %p_features_16_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_16_conv_0_1_weight] + %p_features_16_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_16_conv_0_1_bias] + %p_features_16_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_16_conv_1_0_weight] + %p_features_16_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_16_conv_1_1_weight] + %p_features_16_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_16_conv_1_1_bias] + %p_features_16_conv_2_weight : [num_users=1] = placeholder[target=p_features_16_conv_2_weight] + %p_features_16_conv_3_weight : [num_users=1] = placeholder[target=p_features_16_conv_3_weight] + %p_features_16_conv_3_bias : [num_users=1] = 
placeholder[target=p_features_16_conv_3_bias] + %p_features_17_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_17_conv_0_0_weight] + %p_features_17_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_17_conv_0_1_weight] + %p_features_17_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_17_conv_0_1_bias] + %p_features_17_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_17_conv_1_0_weight] + %p_features_17_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_17_conv_1_1_weight] + %p_features_17_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_17_conv_1_1_bias] + %p_features_17_conv_2_weight : [num_users=1] = placeholder[target=p_features_17_conv_2_weight] + %p_features_17_conv_3_weight : [num_users=1] = placeholder[target=p_features_17_conv_3_weight] + %p_features_17_conv_3_bias : [num_users=1] = placeholder[target=p_features_17_conv_3_bias] + %p_features_18_0_weight : [num_users=1] = placeholder[target=p_features_18_0_weight] + %p_features_18_1_weight : [num_users=1] = placeholder[target=p_features_18_1_weight] + %p_features_18_1_bias : [num_users=1] = placeholder[target=p_features_18_1_bias] + %b_features_0_1_running_mean : [num_users=1] = placeholder[target=b_features_0_1_running_mean] + %b_features_0_1_running_var : [num_users=1] = placeholder[target=b_features_0_1_running_var] + %b_features_1_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_1_conv_0_1_running_mean] + %b_features_1_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_1_conv_0_1_running_var] + %b_features_1_conv_2_running_mean : [num_users=1] = placeholder[target=b_features_1_conv_2_running_mean] + %b_features_1_conv_2_running_var : [num_users=1] = placeholder[target=b_features_1_conv_2_running_var] + %b_features_2_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_2_conv_0_1_running_mean] + %b_features_2_conv_0_1_running_var : [num_users=1] = 
placeholder[target=b_features_2_conv_0_1_running_var] + %b_features_2_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_2_conv_1_1_running_mean] + %b_features_2_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_2_conv_1_1_running_var] + %b_features_2_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_2_conv_3_running_mean] + %b_features_2_conv_3_running_var : [num_users=1] = placeholder[target=b_features_2_conv_3_running_var] + %b_features_3_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_3_conv_0_1_running_mean] + %b_features_3_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_3_conv_0_1_running_var] + %b_features_3_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_3_conv_1_1_running_mean] + %b_features_3_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_3_conv_1_1_running_var] + %b_features_3_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_3_conv_3_running_mean] + %b_features_3_conv_3_running_var : [num_users=1] = placeholder[target=b_features_3_conv_3_running_var] + %b_features_4_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_4_conv_0_1_running_mean] + %b_features_4_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_4_conv_0_1_running_var] + %b_features_4_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_4_conv_1_1_running_mean] + %b_features_4_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_4_conv_1_1_running_var] + %b_features_4_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_4_conv_3_running_mean] + %b_features_4_conv_3_running_var : [num_users=1] = placeholder[target=b_features_4_conv_3_running_var] + %b_features_5_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_5_conv_0_1_running_mean] + %b_features_5_conv_0_1_running_var : [num_users=1] = 
placeholder[target=b_features_5_conv_0_1_running_var] + %b_features_5_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_5_conv_1_1_running_mean] + %b_features_5_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_5_conv_1_1_running_var] + %b_features_5_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_5_conv_3_running_mean] + %b_features_5_conv_3_running_var : [num_users=1] = placeholder[target=b_features_5_conv_3_running_var] + %b_features_6_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_6_conv_0_1_running_mean] + %b_features_6_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_6_conv_0_1_running_var] + %b_features_6_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_6_conv_1_1_running_mean] + %b_features_6_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_6_conv_1_1_running_var] + %b_features_6_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_6_conv_3_running_mean] + %b_features_6_conv_3_running_var : [num_users=1] = placeholder[target=b_features_6_conv_3_running_var] + %b_features_7_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_7_conv_0_1_running_mean] + %b_features_7_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_7_conv_0_1_running_var] + %b_features_7_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_7_conv_1_1_running_mean] + %b_features_7_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_7_conv_1_1_running_var] + %b_features_7_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_7_conv_3_running_mean] + %b_features_7_conv_3_running_var : [num_users=1] = placeholder[target=b_features_7_conv_3_running_var] + %b_features_8_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_8_conv_0_1_running_mean] + %b_features_8_conv_0_1_running_var : [num_users=1] = 
placeholder[target=b_features_8_conv_0_1_running_var] + %b_features_8_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_8_conv_1_1_running_mean] + %b_features_8_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_8_conv_1_1_running_var] + %b_features_8_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_8_conv_3_running_mean] + %b_features_8_conv_3_running_var : [num_users=1] = placeholder[target=b_features_8_conv_3_running_var] + %b_features_9_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_9_conv_0_1_running_mean] + %b_features_9_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_9_conv_0_1_running_var] + %b_features_9_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_9_conv_1_1_running_mean] + %b_features_9_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_9_conv_1_1_running_var] + %b_features_9_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_9_conv_3_running_mean] + %b_features_9_conv_3_running_var : [num_users=1] = placeholder[target=b_features_9_conv_3_running_var] + %b_features_10_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_10_conv_0_1_running_mean] + %b_features_10_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_10_conv_0_1_running_var] + %b_features_10_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_10_conv_1_1_running_mean] + %b_features_10_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_10_conv_1_1_running_var] + %b_features_10_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_10_conv_3_running_mean] + %b_features_10_conv_3_running_var : [num_users=1] = placeholder[target=b_features_10_conv_3_running_var] + %b_features_11_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_11_conv_0_1_running_mean] + %b_features_11_conv_0_1_running_var : [num_users=1] = 
placeholder[target=b_features_11_conv_0_1_running_var] + %b_features_11_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_11_conv_1_1_running_mean] + %b_features_11_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_11_conv_1_1_running_var] + %b_features_11_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_11_conv_3_running_mean] + %b_features_11_conv_3_running_var : [num_users=1] = placeholder[target=b_features_11_conv_3_running_var] + %b_features_12_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_12_conv_0_1_running_mean] + %b_features_12_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_12_conv_0_1_running_var] + %b_features_12_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_12_conv_1_1_running_mean] + %b_features_12_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_12_conv_1_1_running_var] + %b_features_12_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_12_conv_3_running_mean] + %b_features_12_conv_3_running_var : [num_users=1] = placeholder[target=b_features_12_conv_3_running_var] + %b_features_13_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_13_conv_0_1_running_mean] + %b_features_13_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_13_conv_0_1_running_var] + %b_features_13_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_13_conv_1_1_running_mean] + %b_features_13_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_13_conv_1_1_running_var] + %b_features_13_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_13_conv_3_running_mean] + %b_features_13_conv_3_running_var : [num_users=1] = placeholder[target=b_features_13_conv_3_running_var] + %b_features_14_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_14_conv_0_1_running_mean] + %b_features_14_conv_0_1_running_var : [num_users=1] = 
placeholder[target=b_features_14_conv_0_1_running_var] + %b_features_14_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_14_conv_1_1_running_mean] + %b_features_14_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_14_conv_1_1_running_var] + %b_features_14_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_14_conv_3_running_mean] + %b_features_14_conv_3_running_var : [num_users=1] = placeholder[target=b_features_14_conv_3_running_var] + %b_features_15_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_15_conv_0_1_running_mean] + %b_features_15_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_15_conv_0_1_running_var] + %b_features_15_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_15_conv_1_1_running_mean] + %b_features_15_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_15_conv_1_1_running_var] + %b_features_15_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_15_conv_3_running_mean] + %b_features_15_conv_3_running_var : [num_users=1] = placeholder[target=b_features_15_conv_3_running_var] + %b_features_16_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_16_conv_0_1_running_mean] + %b_features_16_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_16_conv_0_1_running_var] + %b_features_16_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_16_conv_1_1_running_mean] + %b_features_16_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_16_conv_1_1_running_var] + %b_features_16_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_16_conv_3_running_mean] + %b_features_16_conv_3_running_var : [num_users=1] = placeholder[target=b_features_16_conv_3_running_var] + %b_features_17_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_17_conv_0_1_running_mean] + %b_features_17_conv_0_1_running_var : [num_users=1] = 
placeholder[target=b_features_17_conv_0_1_running_var] + %b_features_17_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_17_conv_1_1_running_mean] + %b_features_17_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_17_conv_1_1_running_var] + %b_features_17_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_17_conv_3_running_mean] + %b_features_17_conv_3_running_var : [num_users=1] = placeholder[target=b_features_17_conv_3_running_var] + %b_features_18_1_running_mean : [num_users=1] = placeholder[target=b_features_18_1_running_mean] + %b_features_18_1_running_var : [num_users=1] = placeholder[target=b_features_18_1_running_var] + %x : [num_users=1] = placeholder[target=x] + %aten_convolution_default : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%x, %p_features_0_0_weight, None, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default, %p_features_0_1_weight, %p_features_0_1_bias, %b_features_0_1_running_mean, %b_features_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default, 0), kwargs = {}) + %aten_hardtanh_default : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_1 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default, %p_features_1_conv_0_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 32), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_1 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_1, %p_features_1_conv_0_1_weight, %p_features_1_conv_0_1_bias, %b_features_1_conv_0_1_running_mean, %b_features_1_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_1, 0), kwargs = {}) + %aten_hardtanh_default_1 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_1, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_2 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_1, %p_features_1_conv_1_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_2 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_2, %p_features_1_conv_2_weight, %p_features_1_conv_2_bias, %b_features_1_conv_2_running_mean, %b_features_1_conv_2_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_2 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_2, 0), kwargs = {}) + %aten_convolution_default_3 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_2, %p_features_2_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_3 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_3, %p_features_2_conv_0_1_weight, %p_features_2_conv_0_1_bias, %b_features_2_conv_0_1_running_mean, %b_features_2_conv_0_1_running_var, 0.1, 1e-05), 
kwargs = {}) + %getitem_3 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_3, 0), kwargs = {}) + %aten_hardtanh_default_2 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_3, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_4 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_2, %p_features_2_conv_1_0_weight, None, [2, 2], [1, 1], [1, 1], False, [0, 0], 96), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_4 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_4, %p_features_2_conv_1_1_weight, %p_features_2_conv_1_1_bias, %b_features_2_conv_1_1_running_mean, %b_features_2_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_4 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_4, 0), kwargs = {}) + %aten_hardtanh_default_3 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_4, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_5 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_3, %p_features_2_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_5 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_5, %p_features_2_conv_3_weight, %p_features_2_conv_3_bias, %b_features_2_conv_3_running_mean, %b_features_2_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_5 : [num_users=2] = call_function[target=operator.getitem](args = 
(%aten__native_batch_norm_legit_no_training_default_5, 0), kwargs = {}) + %aten_convolution_default_6 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_5, %p_features_3_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_6 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_6, %p_features_3_conv_0_1_weight, %p_features_3_conv_0_1_bias, %b_features_3_conv_0_1_running_mean, %b_features_3_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_6 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_6, 0), kwargs = {}) + %aten_hardtanh_default_4 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_6, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_7 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_4, %p_features_3_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 144), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_7 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_7, %p_features_3_conv_1_1_weight, %p_features_3_conv_1_1_bias, %b_features_3_conv_1_1_running_mean, %b_features_3_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_7 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_7, 0), kwargs = {}) + %aten_hardtanh_default_5 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_7, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_8 : 
[num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_5, %p_features_3_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_8 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_8, %p_features_3_conv_3_weight, %p_features_3_conv_3_bias, %b_features_3_conv_3_running_mean, %b_features_3_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_8 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_8, 0), kwargs = {}) + %aten_add_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%getitem_5, %getitem_8), kwargs = {}) + %aten_convolution_default_9 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor, %p_features_4_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_9 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_9, %p_features_4_conv_0_1_weight, %p_features_4_conv_0_1_bias, %b_features_4_conv_0_1_running_mean, %b_features_4_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_9 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_9, 0), kwargs = {}) + %aten_hardtanh_default_6 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_9, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_10 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = 
(%aten_hardtanh_default_6, %p_features_4_conv_1_0_weight, None, [2, 2], [1, 1], [1, 1], False, [0, 0], 144), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_10 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_10, %p_features_4_conv_1_1_weight, %p_features_4_conv_1_1_bias, %b_features_4_conv_1_1_running_mean, %b_features_4_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_10 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_10, 0), kwargs = {}) + %aten_hardtanh_default_7 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_10, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_11 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_7, %p_features_4_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_11 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_11, %p_features_4_conv_3_weight, %p_features_4_conv_3_bias, %b_features_4_conv_3_running_mean, %b_features_4_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_11 : [num_users=2] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_11, 0), kwargs = {}) + %aten_convolution_default_12 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_11, %p_features_5_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_12 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_12, %p_features_5_conv_0_1_weight, %p_features_5_conv_0_1_bias, %b_features_5_conv_0_1_running_mean, %b_features_5_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_12 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_12, 0), kwargs = {}) + %aten_hardtanh_default_8 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_12, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_13 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_8, %p_features_5_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 192), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_13 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_13, %p_features_5_conv_1_1_weight, %p_features_5_conv_1_1_bias, %b_features_5_conv_1_1_running_mean, %b_features_5_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_13 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_13, 0), kwargs = {}) + %aten_hardtanh_default_9 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_13, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_14 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_9, %p_features_5_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_14 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_14, %p_features_5_conv_3_weight, %p_features_5_conv_3_bias, %b_features_5_conv_3_running_mean, %b_features_5_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_14 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_14, 0), kwargs = {}) + %aten_add_tensor_1 : [num_users=2] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%getitem_11, %getitem_14), kwargs = {}) + %aten_convolution_default_15 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_1, %p_features_6_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_15 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_15, %p_features_6_conv_0_1_weight, %p_features_6_conv_0_1_bias, %b_features_6_conv_0_1_running_mean, %b_features_6_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_15 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_15, 0), kwargs = {}) + %aten_hardtanh_default_10 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_15, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_16 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_10, %p_features_6_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 192), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_16 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_16, %p_features_6_conv_1_1_weight, %p_features_6_conv_1_1_bias, %b_features_6_conv_1_1_running_mean, %b_features_6_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_16 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_16, 0), kwargs = {}) + %aten_hardtanh_default_11 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_16, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_17 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_11, %p_features_6_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_17 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_17, %p_features_6_conv_3_weight, %p_features_6_conv_3_bias, %b_features_6_conv_3_running_mean, %b_features_6_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_17 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_17, 0), kwargs = {}) + %aten_add_tensor_2 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_add_tensor_1, %getitem_17), kwargs = {}) + %aten_convolution_default_18 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_2, %p_features_7_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_18 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_18, %p_features_7_conv_0_1_weight, %p_features_7_conv_0_1_bias, %b_features_7_conv_0_1_running_mean, %b_features_7_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_18 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_18, 0), kwargs = {}) + %aten_hardtanh_default_12 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_18, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_19 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_12, %p_features_7_conv_1_0_weight, None, [2, 2], [1, 1], [1, 1], False, [0, 0], 192), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_19 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_19, %p_features_7_conv_1_1_weight, %p_features_7_conv_1_1_bias, %b_features_7_conv_1_1_running_mean, %b_features_7_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_19 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_19, 0), kwargs = {}) + %aten_hardtanh_default_13 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_19, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_20 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_13, %p_features_7_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_20 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_20, %p_features_7_conv_3_weight, %p_features_7_conv_3_bias, %b_features_7_conv_3_running_mean, %b_features_7_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_20 : [num_users=2] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_20, 0), kwargs = {}) + %aten_convolution_default_21 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_20, %p_features_8_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_21 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_21, %p_features_8_conv_0_1_weight, %p_features_8_conv_0_1_bias, %b_features_8_conv_0_1_running_mean, %b_features_8_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_21 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_21, 0), kwargs = {}) + %aten_hardtanh_default_14 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_21, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_14, %p_features_8_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 384), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_22, %p_features_8_conv_1_1_weight, %p_features_8_conv_1_1_bias, %b_features_8_conv_1_1_running_mean, 
%b_features_8_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_22 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_22, 0), kwargs = {}) + %aten_hardtanh_default_15 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_22, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_15, %p_features_8_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_23, %p_features_8_conv_3_weight, %p_features_8_conv_3_bias, %b_features_8_conv_3_running_mean, %b_features_8_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_23 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_23, 0), kwargs = {}) + %aten_add_tensor_3 : [num_users=2] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%getitem_20, %getitem_23), kwargs = {}) + %aten_convolution_default_24 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_3, %p_features_9_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_24 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_24, %p_features_9_conv_0_1_weight, %p_features_9_conv_0_1_bias, %b_features_9_conv_0_1_running_mean, %b_features_9_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_24 : [num_users=1] = 
call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_24, 0), kwargs = {}) + %aten_hardtanh_default_16 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_24, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_25 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_16, %p_features_9_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 384), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_25 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_25, %p_features_9_conv_1_1_weight, %p_features_9_conv_1_1_bias, %b_features_9_conv_1_1_running_mean, %b_features_9_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_25 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_25, 0), kwargs = {}) + %aten_hardtanh_default_17 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_25, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_26 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_17, %p_features_9_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_26 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_26, %p_features_9_conv_3_weight, %p_features_9_conv_3_bias, %b_features_9_conv_3_running_mean, %b_features_9_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_26 : [num_users=1] = call_function[target=operator.getitem](args = 
(%aten__native_batch_norm_legit_no_training_default_26, 0), kwargs = {}) + %aten_add_tensor_4 : [num_users=2] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_add_tensor_3, %getitem_26), kwargs = {}) + %aten_convolution_default_27 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_4, %p_features_10_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_27 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_27, %p_features_10_conv_0_1_weight, %p_features_10_conv_0_1_bias, %b_features_10_conv_0_1_running_mean, %b_features_10_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_27 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_27, 0), kwargs = {}) + %aten_hardtanh_default_18 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_27, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_28 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_18, %p_features_10_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 384), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_28 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_28, %p_features_10_conv_1_1_weight, %p_features_10_conv_1_1_bias, %b_features_10_conv_1_1_running_mean, %b_features_10_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_28 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_28, 0), kwargs = {}) + 
%aten_hardtanh_default_19 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_28, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_29 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_19, %p_features_10_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_29 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_29, %p_features_10_conv_3_weight, %p_features_10_conv_3_bias, %b_features_10_conv_3_running_mean, %b_features_10_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_29 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_29, 0), kwargs = {}) + %aten_add_tensor_5 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_add_tensor_4, %getitem_29), kwargs = {}) + %aten_convolution_default_30 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_5, %p_features_11_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_30 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_30, %p_features_11_conv_0_1_weight, %p_features_11_conv_0_1_bias, %b_features_11_conv_0_1_running_mean, %b_features_11_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_30 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_30, 0), kwargs = {}) + %aten_hardtanh_default_20 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_30, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_31 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_20, %p_features_11_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 384), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_31 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_31, %p_features_11_conv_1_1_weight, %p_features_11_conv_1_1_bias, %b_features_11_conv_1_1_running_mean, %b_features_11_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_31 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_31, 0), kwargs = {}) + %aten_hardtanh_default_21 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_31, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_32 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_21, %p_features_11_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_32 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_32, %p_features_11_conv_3_weight, %p_features_11_conv_3_bias, %b_features_11_conv_3_running_mean, %b_features_11_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_32 : [num_users=2] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_32, 0), kwargs = {}) + %aten_convolution_default_33 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_32, %p_features_12_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_33 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_33, %p_features_12_conv_0_1_weight, %p_features_12_conv_0_1_bias, %b_features_12_conv_0_1_running_mean, %b_features_12_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_33 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_33, 0), kwargs = {}) + %aten_hardtanh_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_33, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_34 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_22, %p_features_12_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 576), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_34 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_34, %p_features_12_conv_1_1_weight, %p_features_12_conv_1_1_bias, %b_features_12_conv_1_1_running_mean, %b_features_12_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_34 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_34, 0), kwargs = {}) + %aten_hardtanh_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_34, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_35 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_23, %p_features_12_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_35 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_35, %p_features_12_conv_3_weight, %p_features_12_conv_3_bias, %b_features_12_conv_3_running_mean, %b_features_12_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_35 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_35, 0), kwargs = {}) + %aten_add_tensor_6 : [num_users=2] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%getitem_32, %getitem_35), kwargs = {}) + %aten_convolution_default_36 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_6, %p_features_13_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_36 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_36, %p_features_13_conv_0_1_weight, %p_features_13_conv_0_1_bias, %b_features_13_conv_0_1_running_mean, %b_features_13_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_36 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_36, 0), kwargs = {}) + %aten_hardtanh_default_24 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_36, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_37 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = 
(%aten_hardtanh_default_24, %p_features_13_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 576), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_37 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_37, %p_features_13_conv_1_1_weight, %p_features_13_conv_1_1_bias, %b_features_13_conv_1_1_running_mean, %b_features_13_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_37 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_37, 0), kwargs = {}) + %aten_hardtanh_default_25 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_37, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_38 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_25, %p_features_13_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_38 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_38, %p_features_13_conv_3_weight, %p_features_13_conv_3_bias, %b_features_13_conv_3_running_mean, %b_features_13_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_38 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_38, 0), kwargs = {}) + %aten_add_tensor_7 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_add_tensor_6, %getitem_38), kwargs = {}) + %aten_convolution_default_39 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_7, %p_features_14_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], 
False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_39 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_39, %p_features_14_conv_0_1_weight, %p_features_14_conv_0_1_bias, %b_features_14_conv_0_1_running_mean, %b_features_14_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_39 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_39, 0), kwargs = {}) + %aten_hardtanh_default_26 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_39, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_40 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_26, %p_features_14_conv_1_0_weight, None, [2, 2], [1, 1], [1, 1], False, [0, 0], 576), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_40 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_40, %p_features_14_conv_1_1_weight, %p_features_14_conv_1_1_bias, %b_features_14_conv_1_1_running_mean, %b_features_14_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_40 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_40, 0), kwargs = {}) + %aten_hardtanh_default_27 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_40, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_41 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_27, %p_features_14_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + 
%aten__native_batch_norm_legit_no_training_default_41 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_41, %p_features_14_conv_3_weight, %p_features_14_conv_3_bias, %b_features_14_conv_3_running_mean, %b_features_14_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_41 : [num_users=2] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_41, 0), kwargs = {}) + %aten_convolution_default_42 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_41, %p_features_15_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_42 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_42, %p_features_15_conv_0_1_weight, %p_features_15_conv_0_1_bias, %b_features_15_conv_0_1_running_mean, %b_features_15_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_42 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_42, 0), kwargs = {}) + %aten_hardtanh_default_28 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_42, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_43 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_28, %p_features_15_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 960), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_43 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_43, %p_features_15_conv_1_1_weight, 
%p_features_15_conv_1_1_bias, %b_features_15_conv_1_1_running_mean, %b_features_15_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_43 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_43, 0), kwargs = {}) + %aten_hardtanh_default_29 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_43, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_44 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_29, %p_features_15_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_44 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_44, %p_features_15_conv_3_weight, %p_features_15_conv_3_bias, %b_features_15_conv_3_running_mean, %b_features_15_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_44 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_44, 0), kwargs = {}) + %aten_add_tensor_8 : [num_users=2] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%getitem_41, %getitem_44), kwargs = {}) + %aten_convolution_default_45 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_8, %p_features_16_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_45 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_45, %p_features_16_conv_0_1_weight, %p_features_16_conv_0_1_bias, %b_features_16_conv_0_1_running_mean, 
%b_features_16_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_45 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_45, 0), kwargs = {}) + %aten_hardtanh_default_30 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_45, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_46 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_30, %p_features_16_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 960), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_46 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_46, %p_features_16_conv_1_1_weight, %p_features_16_conv_1_1_bias, %b_features_16_conv_1_1_running_mean, %b_features_16_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_46 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_46, 0), kwargs = {}) + %aten_hardtanh_default_31 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_46, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_47 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_31, %p_features_16_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_47 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_47, %p_features_16_conv_3_weight, %p_features_16_conv_3_bias, %b_features_16_conv_3_running_mean, %b_features_16_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_47 : 
[num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_47, 0), kwargs = {}) + %aten_add_tensor_9 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_add_tensor_8, %getitem_47), kwargs = {}) + %aten_convolution_default_48 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_9, %p_features_17_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_48 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_48, %p_features_17_conv_0_1_weight, %p_features_17_conv_0_1_bias, %b_features_17_conv_0_1_running_mean, %b_features_17_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_48 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_48, 0), kwargs = {}) + %aten_hardtanh_default_32 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_48, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_49 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_32, %p_features_17_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 960), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_49 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_49, %p_features_17_conv_1_1_weight, %p_features_17_conv_1_1_bias, %b_features_17_conv_1_1_running_mean, %b_features_17_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_49 : [num_users=1] = call_function[target=operator.getitem](args = 
(%aten__native_batch_norm_legit_no_training_default_49, 0), kwargs = {}) + %aten_hardtanh_default_33 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_49, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_50 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_33, %p_features_17_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_50 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_50, %p_features_17_conv_3_weight, %p_features_17_conv_3_bias, %b_features_17_conv_3_running_mean, %b_features_17_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_50 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_50, 0), kwargs = {}) + %aten_convolution_default_51 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_50, %p_features_18_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_51 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_51, %p_features_18_1_weight, %p_features_18_1_bias, %b_features_18_1_running_mean, %b_features_18_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_51 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_51, 0), kwargs = {}) + %aten_hardtanh_default_34 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_51, 0.0, 6.0), kwargs = {}) + %aten_mean_dim : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten.mean.dim](args = (%aten_hardtanh_default_34, [-1, -2], True), kwargs = {}) + return (aten_mean_dim,) + %executorch_call_delegate : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %x), kwargs = {}) + %getitem : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 0), kwargs = {}) + %aten_view_copy_default : [num_users=1] = call_function[target=executorch.exir.memory.view](args = (%getitem, [1, 1280]), kwargs = {}) + %alloc : [num_users=1] = call_function[target=executorch.exir.memory.alloc](args = (((1, 1280), torch.float32),), kwargs = {}) + %dim_order_ops__clone_dim_order_default : [num_users=1] = call_function[target=torch.ops.dim_order_ops._clone_dim_order.out](args = (%aten_view_copy_default,), kwargs = {dim_order: [0, 1], out: %alloc}) + %lowered_module_1 : [num_users=1] = get_attr[target=lowered_module_1] + backend_id: XnnpackBackend + lowered graph(): + %p_classifier_1_weight : [num_users=1] = placeholder[target=p_classifier_1_weight] + %p_classifier_1_bias : [num_users=1] = placeholder[target=p_classifier_1_bias] + %dim_order_ops__clone_dim_order_default : [num_users=1] = placeholder[target=dim_order_ops__clone_dim_order_default] + %aten_linear_default : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.linear.default](args = (%dim_order_ops__clone_dim_order_default, %p_classifier_1_weight, %p_classifier_1_bias), kwargs = {}) + return (aten_linear_default,) + %executorch_call_delegate_1 : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_1, %dim_order_ops__clone_dim_order_default), kwargs = {}) + %getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate_1, 0), kwargs = {}) + return (getitem_1,) ``` +
diff --git a/docs/source/desktop-backends.md b/docs/source/desktop-backends.md new file mode 100644 index 00000000000..e4220edb47f --- /dev/null +++ b/docs/source/desktop-backends.md @@ -0,0 +1,27 @@ +(desktop-backends)= +# Backends + +Available hardware acceleration backends for desktop platforms. + +## Linux Backends + +- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration) +- {doc}`desktop-openvino` — OpenVINO (Intel hardware optimization) + +## macOS Backends + +- {doc}`desktop-coreml` — CoreML (recommended for Apple Silicon) +- {doc}`desktop-mps` — Metal Performance Shaders (Apple Silicon GPU) +- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration) + +## Windows Backends + +- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration) +- {doc}`desktop-openvino` — OpenVINO (Intel hardware optimization) + +```{toctree} +:hidden: +desktop-xnnpack +desktop-openvino +desktop-coreml +desktop-mps diff --git a/docs/source/desktop-coreml.md b/docs/source/desktop-coreml.md new file mode 100644 index 00000000000..48271326d87 --- /dev/null +++ b/docs/source/desktop-coreml.md @@ -0,0 +1 @@ +```{include} backends-coreml.md diff --git a/docs/source/desktop-mps.md b/docs/source/desktop-mps.md new file mode 100644 index 00000000000..d6f305d33aa --- /dev/null +++ b/docs/source/desktop-mps.md @@ -0,0 +1 @@ +```{include} backends-mps.md diff --git a/docs/source/desktop-openvino.md b/docs/source/desktop-openvino.md new file mode 100644 index 00000000000..a0fd5774c73 --- /dev/null +++ b/docs/source/desktop-openvino.md @@ -0,0 +1 @@ +```{include} build-run-openvino.md diff --git a/docs/source/desktop-section.md b/docs/source/desktop-section.md new file mode 100644 index 00000000000..7afccbe1d4f --- /dev/null +++ b/docs/source/desktop-section.md @@ -0,0 +1,19 @@ +(desktop-section)= +# Desktop & Laptop Platforms + +Deploy ExecuTorch on Linux, macOS, and Windows with optimized backends for each platform. 
+ +## Platform Overview & Runtime + +- {doc}`using-executorch-cpp` — C++ runtime integration guide +- {doc}`using-executorch-building-from-source` — Building ExecuTorch from source + +## Backends + +- {doc}`desktop-backends` — Available desktop backends and platform-specific optimization + +```{toctree} +:hidden: +using-executorch-cpp +using-executorch-building-from-source +desktop-backends diff --git a/docs/source/desktop-xnnpack.md b/docs/source/desktop-xnnpack.md new file mode 100644 index 00000000000..315dd747006 --- /dev/null +++ b/docs/source/desktop-xnnpack.md @@ -0,0 +1 @@ +```{include} backends-xnnpack.md diff --git a/docs/source/developer-tools.md b/docs/source/developer-tools.md new file mode 100644 index 00000000000..d3b90b7adc8 --- /dev/null +++ b/docs/source/developer-tools.md @@ -0,0 +1,16 @@ +# Tools + +```{toctree} +:maxdepth: 1 + +devtools-overview +bundled-io +etrecord +etdump +runtime-profiling +model-debugging +model-inspector +memory-planning-inspection +delegate-debugging +devtools-tutorial +``` diff --git a/docs/source/devtools-overview.md b/docs/source/devtools-overview.md index 449dd1485dc..8e13e67f1a1 100644 --- a/docs/source/devtools-overview.md +++ b/docs/source/devtools-overview.md @@ -41,6 +41,6 @@ More details are available in the [ETDump documentation](etdump.md) on how to ge ### Inspector APIs -The Inspector Python APIs are the main user enrty point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API. +The Inspector Python APIs are the main user entry point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API. 
More details are available in the [Inspector API documentation](model-inspector.rst) on how to use the Inspector APIs. diff --git a/docs/source/devtools-tutorial.md b/docs/source/devtools-tutorial.md index 7c6cedc311b..6d540dc7f35 100644 --- a/docs/source/devtools-tutorial.md +++ b/docs/source/devtools-tutorial.md @@ -1,3 +1,3 @@ ## Developer Tools Usage Tutorial -Please refer to the [Developer Tools tutorial](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the Developer Tools. +Please refer to the [Developer Tools tutorial](tutorials/devtools-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the Developer Tools. diff --git a/docs/source/edge-platforms-section.md b/docs/source/edge-platforms-section.md new file mode 100644 index 00000000000..99e44093544 --- /dev/null +++ b/docs/source/edge-platforms-section.md @@ -0,0 +1,73 @@ +(edge-platforms-section)= +# Edge + +Deploy ExecuTorch on mobile, desktop, and embedded platforms with optimized backends for each. + +ExecuTorch supports deployment across a wide variety of edge computing platforms, from high-end mobile devices to constrained embedded systems and microcontrollers. + +## Android + +Deploy ExecuTorch on Android devices with hardware acceleration support. + +**→ {doc}`android-section` — Complete Android deployment guide** + +Key features: +- Hardware acceleration support (CPU, GPU, NPU) +- Multiple backend options (XNNPACK, Vulkan, Qualcomm, MediaTek, ARM, Samsung) +- Comprehensive examples and demos + +## iOS + +Deploy ExecuTorch on iOS devices with Apple hardware acceleration. 
+ +**→ {doc}`ios-section` — Complete iOS deployment guide** + +Key features: +- Apple hardware optimization (CoreML, MPS, XNNPACK) +- Swift and Objective-C integration +- LLM and computer vision examples + +## Desktop & Laptop Platforms + +Deploy ExecuTorch on Linux, macOS, and Windows with optimized backends. + +**→ {doc}`desktop-section` — Complete desktop deployment guide** + +Key features: +- Cross-platform C++ runtime +- Platform-specific optimization (OpenVINO, CoreML, MPS) +- CPU and GPU acceleration options + +## Embedded Systems + +Deploy ExecuTorch on constrained embedded systems and microcontrollers. + +**→ {doc}`embedded-section` — Complete embedded deployment guide** + +Key features: + +- Resource-constrained deployment +- DSP and NPU acceleration (Cadence, ARM Ethos-U, NXP) +- Custom backend development support +- LLM and computer vision examples + +## Troubleshooting & Support + +- **{doc}`using-executorch-troubleshooting`** - Common issues and solutions across all platforms + +## Next Steps + +After choosing your platform: +- **{doc}`backends-section`** - Deep dive into backend selection and optimization +- **{doc}`llm/working-with-llms`** - Working with Large Language Models on edge devices + +```{toctree} +:hidden: +:maxdepth: 2 +:caption: Edge Platforms + +android-section +ios-section +desktop-section +embedded-section +using-executorch-troubleshooting diff --git a/docs/source/embedded-arm-ethos-u.md b/docs/source/embedded-arm-ethos-u.md new file mode 100644 index 00000000000..cdc544a6553 --- /dev/null +++ b/docs/source/embedded-arm-ethos-u.md @@ -0,0 +1 @@ +```{include} backends-arm-ethos-u.md diff --git a/docs/source/embedded-backends.md b/docs/source/embedded-backends.md new file mode 100644 index 00000000000..4ed7962ef42 --- /dev/null +++ b/docs/source/embedded-backends.md @@ -0,0 +1,20 @@ +(embedded-backends)= +# Backends + +Available hardware acceleration backends for embedded systems. 
+ +## DSP Acceleration + +- {doc}`embedded-cadence` — Cadence Xtensa DSP processors + +## NPU Acceleration + +- {doc}`embedded-arm-ethos-u` — ARM Ethos-U NPU acceleration +- {doc}`embedded-nxp` — NXP eIQ Neutron Backend + + +```{toctree} +:hidden: +embedded-cadence +embedded-arm-ethos-u +embedded-nxp diff --git a/docs/source/embedded-cadence.md b/docs/source/embedded-cadence.md new file mode 100644 index 00000000000..d2f7ea78259 --- /dev/null +++ b/docs/source/embedded-cadence.md @@ -0,0 +1 @@ +```{include} backends-cadence.md diff --git a/docs/source/embedded-nxp.md b/docs/source/embedded-nxp.md new file mode 100644 index 00000000000..35d8f0ab75d --- /dev/null +++ b/docs/source/embedded-nxp.md @@ -0,0 +1 @@ +```{include} backends-nxp.md diff --git a/docs/source/embedded-section.md b/docs/source/embedded-section.md new file mode 100644 index 00000000000..834001afbc3 --- /dev/null +++ b/docs/source/embedded-section.md @@ -0,0 +1,39 @@ +(embedded-section)= + +# Embedded Systems + +Deploy ExecuTorch on constrained embedded systems and microcontrollers. + +## API Reference & Development + +Start here for C++ development with ExecuTorch runtime APIs and essential tutorials. 
+ +- {doc}`executorch-runtime-api-reference` — **Start here**: Complete runtime API reference for embedded development +- {doc}`running-a-model-cpp-tutorial` — Step-by-step C++ API tutorial with practical examples +- {doc}`extension-module` — Custom module extensions for specialized functionality +- {doc}`extension-tensor` — Tensor operations and memory management extensions + +## Build & Integration Guide + +- {doc}`using-executorch-cpp` — Complete setup guide for C++ runtime integration +- {doc}`using-executorch-building-from-source` — Building from Source + +## Choose Backend for acceleration + +- {doc}`embedded-backends` — Available embedded backends and acceleration options + +## Tutorials + +- {doc}`tutorial-arm-ethos-u` — Export a simple PyTorch model for the ExecuTorch Ethos-U backend + + +```{toctree} +:hidden: +executorch-runtime-api-reference +running-a-model-cpp-tutorial +extension-module +extension-tensor +using-executorch-cpp +using-executorch-building-from-source +embedded-backends +tutorial-arm-ethos-u diff --git a/docs/source/etrecord.rst b/docs/source/etrecord.rst index 1ab84a6ee10..39bc45cab5a 100644 --- a/docs/source/etrecord.rst +++ b/docs/source/etrecord.rst @@ -23,13 +23,120 @@ It should be provided to the `Inspector API `__ to link ba Generating an ``ETRecord`` -------------------------- -The user should use the following API to generate an ``ETRecord`` file. They -will be expected to provide the Edge Dialect program (returned by the call to ``to_edge()``), -the ExecuTorch program (returned by the call to ``to_executorch()``), and optional models that -they are interested in working with via our tooling. +There are multiple ways to generate an ``ETRecord`` for debugging purposes: + +Method 1: Using the ``generate_etrecord`` Parameter (Recommended) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The recommended approach is to enable ``ETRecord`` generation by passing ``generate_etrecord=True`` +to your export API calls. 
This can be used with: + +* ``executorch.export()`` - High-level export API +* ``to_edge()`` - Edge dialect conversion +* ``to_edge_transform_and_lower()`` - Edge conversion with transformations and lowering + +After export completes, retrieve the ``ETRecord`` using the ``get_etrecord()`` method, and save it using the ``save()`` method: + +**Example with** ``executorch.export()``: + +.. code-block:: python + + import executorch + from executorch.export import ExportRecipe + + # Export with ETRecord generation enabled + session = executorch.export( + model=model, + example_inputs=[example_inputs], + export_recipe=recipe, + generate_etrecord=True # Enable ETRecord generation + ) + + # Get and save the ETRecord + etrecord = session.get_etrecord() + etrecord.save("model_debug.etrecord") + +**Example with** ``to_edge()``: + +.. code-block:: python + + from executorch.exir.program import to_edge + from torch.export import export + + # Export model first + exported_program = export(model, example_inputs) + + # Convert to edge with ETRecord generation + edge_manager = to_edge( + exported_program, + generate_etrecord=True # Enable ETRecord generation + ) + + # Apply transformations + edge_manager = edge_manager.to_backend() + et_manager = edge_manager.to_executorch() + + # Get and save ETRecord + etrecord = et_manager.get_etrecord() + etrecord.save("edge_debug.etrecord") + +**Example with** ``to_edge_transform_and_lower()``: + +.. 
code-block:: python + + from executorch.exir.program import to_edge_transform_and_lower + from torch.export import export + + # Export model first + exported_program = export(model, example_inputs) + + # Transform and lower with ETRecord generation + edge_manager = to_edge_transform_and_lower( + exported_program, + partitioner=[MyPartitioner()], + generate_etrecord=True # Enable ETRecord generation + ) + + et_manager = edge_manager.to_executorch() + + # Get and save ETRecord + etrecord = et_manager.get_etrecord() + etrecord.save("debug.etrecord") + +Method 2: Using the ``generate_etrecord()`` Function +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can also use the standalone ``generate_etrecord()`` function to generate an ``ETRecord``. +This method requires you to provide the Edge Dialect program (returned by ``to_edge()``), +the ExecuTorch program (returned by ``to_executorch()``), and optional models. .. warning:: - Users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process. + When using the standalone function, users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process. + +**Example:** + +.. 
code-block:: python + + import copy + from executorch.devtools import generate_etrecord + from torch.export import export + + # Export and convert to edge + aten_dialect = export(model, example_inputs, strict=True) + edge_program = to_edge(aten_dialect) + + # Create copy for ETRecord (needed because to_executorch modifies in-place) + edge_program_copy = copy.deepcopy(edge_program) + + # Convert to ExecutorchProgramManager + executorch_program = edge_program_copy.to_executorch() + + # Generate ETRecord separately + generate_etrecord( + "debug.etrecord", + edge_program, + executorch_program, + ) .. currentmodule:: executorch.devtools.etrecord._etrecord .. autofunction:: generate_etrecord diff --git a/docs/source/examples.md b/docs/source/examples.md new file mode 100644 index 00000000000..6a3a8ac29c9 --- /dev/null +++ b/docs/source/examples.md @@ -0,0 +1,9 @@ +# Examples + +```{toctree} +:maxdepth: 1 + +Building an ExecuTorch Android Demo App +Building an ExecuTorch iOS Demo App +tutorial-arm +``` diff --git a/docs/source/executorch_custom_versions.py b/docs/source/executorch_custom_versions.py deleted file mode 100644 index 590f21b10ec..00000000000 --- a/docs/source/executorch_custom_versions.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -Sphinx extension to replace ${executorch_version:TAG} with version numbers. - -It also defines a special variable ${executorch_version} that is set to the value -of `EXECUTORCH_VERSION` defined in this file. - -This custom extension pulls third-party version strings from files in the -.ci/docker/ci_commit_pins directory, and uses them to expand specific strings in -markdown files. - -For example, `${executorch_version:pytorch}` will be replaced with the -appropriate pytorch version string used by CI. 
-""" - -import os - -from docutils import nodes - -version_file_names = [ - "buck2.txt", - "pytorch.txt", -] - -EXECUTORCH_VERSION = "0.7.0" - -variables: dict[str, str] = {} - - -def populate_version_variable(): - variables["${executorch_version}"] = EXECUTORCH_VERSION - cwd = os.getcwd() - version_file_path = os.path.join(cwd, "..", ".ci", "docker", "ci_commit_pins") - - for file_name in version_file_names: - file_path = os.path.join(version_file_path, file_name) - with open(file_path, "r") as f: - var_name = "${executorch_version:" + file_name.split(".")[0] + "}" - variables[var_name] = f.read().strip() - - -populate_version_variable() - - -def replace_variables(app, doctree, docname): - # Replace in regular text: - for node in doctree.traverse(nodes.Text): - new_text = node.astext() - for var, value in variables.items(): - new_text = new_text.replace(var, value) - node.parent.replace(node, nodes.Text(new_text)) - # Replace in code blocks: - for node in doctree.traverse(nodes.literal_block): - new_text = node.astext() - for var, value in variables.items(): - new_text = new_text.replace(var, value) - - classes = node.get("classes", []) - # check if the output is generated by sphinx-gallery and if yes, keep the original - # CSS classes. Otherwise, the sphinx-gallery generated outputs are - # formatted as regular code blocks with gray background instead of pink. 
- is_sphinx_gallery = any("sphx-glr" in class_ for class_ in classes) - - language = node.get("language") - - if is_sphinx_gallery: - new_literal_block = nodes.literal_block(new_text, new_text, classes=classes) - else: - new_literal_block = nodes.literal_block( - new_text, - new_text, - classes=["highlight-none", "notranslate"], - language=language, - ) - - node.parent.replace(node, new_literal_block) - - -def setup(app): - app.connect("doctree-resolved", replace_variables) diff --git a/docs/source/export-overview.md b/docs/source/export-overview.md index d07701d06cd..c96716a0949 100644 --- a/docs/source/export-overview.md +++ b/docs/source/export-overview.md @@ -11,5 +11,5 @@ program, making it easier for you to understand and implement the process. To learn more about exporting your model: -* Complete the [Exporting to ExecuTorch tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial). +* Complete the [Exporting to ExecuTorch tutorial](tutorials/export-to-executorch-tutorial). * Read the [torch.export documentation](https://pytorch.org/docs/2.1/export.html). 
diff --git a/docs/source/extension-module.md b/docs/source/extension-module.md index 29aa6712d37..690256fecbb 100644 --- a/docs/source/extension-module.md +++ b/docs/source/extension-module.md @@ -6,7 +6,7 @@ In the [Detailed C++ Runtime APIs Tutorial](running-a-model-cpp-tutorial.md), we ## Example -Let's see how we can run the `SimpleConv` model generated from the [Exporting to ExecuTorch tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) using the `Module` and [`TensorPtr`](extension-tensor.md) APIs: +Let's see how we can run the `SimpleConv` model generated from the [Exporting to ExecuTorch tutorial](tutorials/export-to-executorch-tutorial) using the `Module` and [`TensorPtr`](extension-tensor.md) APIs: ```cpp #include diff --git a/docs/source/file-formats-advanced.md b/docs/source/file-formats-advanced.md new file mode 100644 index 00000000000..c16ebccfd65 --- /dev/null +++ b/docs/source/file-formats-advanced.md @@ -0,0 +1,17 @@ +(file-formats-advanced)= + +# File Formats + +ExecuTorch file format specifications and internal structure. + +## Program File Formats + +- {doc}`pte-file-format` — PTE (PyTorch ExecuTorch) file format specification +- {doc}`ptd-file-format` — PTD file format specification + +```{toctree} +:hidden: +:maxdepth: 1 + +pte-file-format +ptd-file-format diff --git a/docs/source/getting-started-architecture.md b/docs/source/getting-started-architecture.md index ef4a12d1a7f..617d521b802 100644 --- a/docs/source/getting-started-architecture.md +++ b/docs/source/getting-started-architecture.md @@ -4,7 +4,7 @@ This page describes the technical architecture of ExecuTorch and its individual **Context** -In order to target on-device AI with diverse hardware, critical power requirements, and realtime processing needs, a single monolithic solution is not practical. Instead, a modular, layered, and extendable architecture is desired. 
ExecuTorch defines a streamlined workflow to prepare (export, transformation, and compilation) and execute a PyTorch program, with opinionated out-of-the-box default components and well-defined entry points for customizations. This architecture greatly improves portability, allowing engineers to use a performant lightweight, cross-platform runtime that easily integrates into different devices and platforms. +In order to target on-device AI with diverse hardware, critical power requirements, and real-time processing needs, a single monolithic solution is not practical. Instead, a modular, layered, and extensible architecture is desired. ExecuTorch defines a streamlined workflow to prepare (export, transformation, and compilation) and execute a PyTorch program, with opinionated out-of-the-box default components and well-defined entry points for customizations. This architecture greatly improves portability, allowing engineers to use a performant lightweight, cross-platform runtime that easily integrates into different devices and platforms. ## Overview @@ -89,6 +89,6 @@ _Executor_ is the entry point to load the program and execute it. The execution ## Developer Tools -It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch Developer Tools](devtools-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. +It should be efficient for users to go from research to production using the flow above. Productivity is especially important for users to author, optimize, and deploy their models. We provide [ExecuTorch Developer Tools](devtools-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. 
During the program preparation and execution, users can use the ExecuTorch Developer Tools to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments. diff --git a/docs/source/getting-started.md b/docs/source/getting-started.md index d3d9662f5c3..51c59f5e021 100644 --- a/docs/source/getting-started.md +++ b/docs/source/getting-started.md @@ -68,7 +68,7 @@ with open("model.pte", "wb") as f: If the model requires varying input sizes, you will need to specify the varying dimensions and bounds as part of the `export` call. See [Model Export and Lowering](using-executorch-export.md) for more information. -The hardware backend to target is controlled by the partitioner parameter to to\_edge\_transform\_and\_lower. In this example, the XnnpackPartitioner is used to target mobile CPUs. See the [backend-specific documentation](backends-overview.md) for information on how to use each backend. +The hardware backend to target is controlled by the partitioner parameter to `to_edge_transform_and_lower`. In this example, the XnnpackPartitioner is used to target mobile CPUs. See the [backend-specific documentation](backends-overview.md) for information on how to use each backend. Quantization can also be done at this stage to reduce model size and runtime. Quantization is backend-specific. See the documentation for the target backend for a full description of supported quantization schemes. 
@@ -89,7 +89,7 @@ input_tensor: torch.Tensor = torch.randn(1, 3, 224, 224) program = runtime.load_program("model.pte") method = program.load_method("forward") output: List[torch.Tensor] = method.execute([input_tensor]) -print("Run succesfully via executorch") +print("Run successfully via executorch") from torchvision.models.mobilenetv2 import MobileNet_V2_Weights import torchvision.models as models @@ -226,5 +226,5 @@ ExecuTorch provides a high-degree of customizability to support diverse hardware - [Using ExecuTorch on Android](using-executorch-android.md) and [Using ExecuTorch on iOS](using-executorch-ios.md) for mobile runtime integration. - [Using ExecuTorch with C++](using-executorch-cpp.md) for embedded and mobile native development. - [Profiling and Debugging](using-executorch-troubleshooting.md) for developer tooling and debugging. -- [API Reference](export-to-executorch-api-reference.md) for a full description of available APIs. +- [API Reference](export-to-executorch-api-reference.rst) for a full description of available APIs. - [Examples](https://github.com/pytorch/executorch/tree/main/examples) for demo apps and example code. diff --git a/docs/source/index.md b/docs/source/index.md index 8afe4e85d78..b65139319a7 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,298 +1,195 @@ (home)= # Welcome to the ExecuTorch Documentation -**ExecuTorch** is PyTorch's solution to training and inference on the -Edge. +**ExecuTorch** is PyTorch's solution for efficient AI inference on edge devices — from mobile phones to embedded systems. ## Key Value Propositions -- **Portability:** Compatibility with a wide variety of computing - platforms, from high-end mobile phones to highly constrained - embedded systems and microcontrollers. -- **Productivity:** Enabling developers to use the same toolchains and - Developer Tools from PyTorch model authoring and conversion, to - debugging and deployment to a wide variety of platforms. 
-- **Performance:** Providing end users with a seamless and - high-performance experience due to a lightweight runtime and - utilizing full hardware capabilities such as CPUs, NPUs, and DSPs. - -ExecuTorch provides support for: - -* **Strong Model Support** LLMs (Large Language Models), - CV (Computer Vision), ASR (Automatic Speech Recognition), TTS (Text To Speech) -* **All Major Platforms** Android, Mac, Linux, Windows -* **Rich Acceleration Support** Apple, Arm, Cadence, MediaTek, NXP, OpenVino, Qualcomm, Vulkan, XNNPACK - -### Documentation Navigation -#### Introduction -- [Overview](intro-overview) -- [How it Works](intro-how-it-works) -- [Getting Started with Architecture](getting-started-architecture) -- [Concepts](concepts) -#### Usage -- [Getting Started](getting-started) -- [Using Executorch Export](using-executorch-export) -- [Using Executorch on Android](using-executorch-android) -- [Using Executorch on iOS](using-executorch-ios) -- [Using Executorch with C++](using-executorch-cpp) -- [Runtime Integration](using-executorch-runtime-integration) -- [Troubleshooting](using-executorch-troubleshooting) -- [Building from Source](using-executorch-building-from-source) -- [Quantization](quantization-overview) -- [FAQs](using-executorch-faqs) -#### Examples -- [Android Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app) -- [iOS Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/mv3/apple/ExecuTorchDemo) -- [Hugging Face Models](https://github.com/huggingface/optimum-executorch/blob/main/README.md) -#### Backends -- [Overview](backends-overview) -- [XNNPACK](backends-xnnpack) -- [Core ML](backends-coreml) -- [MPS](backends-mps) -- [Vulkan](backends-vulkan) -- [ARM Ethos-U](backends-arm-ethos-u) -- [ARM VGF](backends-arm-vgf) -- [Qualcomm](backends-qualcomm) -- [MediaTek](backends-mediatek) -- [Cadence](backends-cadence) -- [OpenVINO](build-run-openvino) -- 
[NXP](backend-nxp) -#### Developer Tools -- [Overview](devtools-overview) -- [Bundled IO](bundled-io) -- [ETRecord](etrecord) -- [ETDump](etdump) -- [Runtime Profiling](runtime-profiling) -- [Model Debugging](model-debugging) -- [Model Inspector](model-inspector) -- [Memory Planning Inspection](memory-planning-inspection) -- [Delegate Debugging](delegate-debugging) -- [Tutorial](devtools-tutorial) -#### Runtime -- [Overview](runtime-overview) -- [Extension Module](extension-module) -- [Extension Tensor](extension-tensor) -- [Detailed C++ Runtime APIs Tutorial](running-a-model-cpp-tutorial) -- [Backend Delegate Implementation and Linking](runtime-backend-delegate-implementation-and-linking) -- [Platform Abstraction Layer](runtime-platform-abstraction-layer) -#### Portable C++ Programming -- [PTE File Format](pte-file-format) -- [PTD File Format](ptd-file-format) -#### API Reference -- [Export to Executorch API Reference](export-to-executorch-api-reference) -- [Executorch Runtime API Reference](executorch-runtime-api-reference) -- [Runtime Python API Reference](runtime-python-api-reference) -- [API Life Cycle](api-life-cycle) -- [Javadoc](https://pytorch.org/executorch/main/javadoc/) -#### Kernel Library -- [Overview](kernel-library-overview) -- [Custom ATen Kernel](kernel-library-custom-aten-kernel) -- [Selective Build](kernel-library-selective-build) -#### Working with LLMs -- [Getting Started](llm/getting-started.md) -- [Exporting LLMs](llm/export-llm.md) -- [Exporting custom LLMs](llm/export-custom-llm.md) -- [Running with C++](llm/run-with-c-plus-plus.md) -- [Running on Android (XNNPack)](llm/llama-demo-android.md) -- [Running on Android (QNN)](llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md) -- [Running on iOS](llm/run-on-ios.md) -#### Backend Development -- [Delegates Integration](backend-delegates-integration) -- [XNNPACK Reference](backend-delegates-xnnpack-reference) -- [Dependencies](backend-delegates-dependencies) -- [Compiler Delegate and 
Partitioner](compiler-delegate-and-partitioner) -- [Debug Backend Delegate](debug-backend-delegate) -#### IR Specification -- [EXIR](ir-exir) -- [Ops Set Definition](ir-ops-set-definition) -#### Compiler Entry Points -- [Backend Dialect](compiler-backend-dialect) -- [Custom Compiler Passes](compiler-custom-compiler-passes) -- [Memory Planning](compiler-memory-planning) -#### Contributing -- [Contributing](contributing) +- **Portability:** Run on diverse platforms, from high-end mobile to constrained microcontrollers +- **Performance:** Lightweight runtime with full hardware acceleration (CPU, GPU, NPU, DSP) +- **Productivity:** Use familiar PyTorch tools from authoring to deployment -```{toctree} -:glob: -:maxdepth: 1 -:caption: Introduction -:hidden: +--- -intro-overview -intro-how-it-works -getting-started-architecture -concepts -``` +## 🎯 Wins & Success Stories -```{toctree} -:glob: -:maxdepth: 1 -:caption: Usage -:hidden: +::::{grid} 1 +:class-container: success-showcase +:::{grid-item-card} +:class-header: bg-primary text-white +:class-body: text-center +[View All Success Stories →](success-stories) +::: +:::: -getting-started -using-executorch-export -using-executorch-android -using-executorch-ios -using-executorch-cpp -using-executorch-runtime-integration -using-executorch-troubleshooting -using-executorch-building-from-source -using-executorch-faqs -``` +--- -```{toctree} -:glob: -:maxdepth: 1 -:caption: Examples -:hidden: +## Quick Navigation -Building an ExecuTorch Android Demo App -Building an ExecuTorch iOS Demo App -tutorial-arm.md -``` +::::{grid} 2 -```{toctree} -:glob: -:maxdepth: 1 -:caption: Backends -:hidden: +:::{grid-item-card} **Get Started** +:link: quick-start-section +:link-type: doc -backends-overview -backends-xnnpack -backends-coreml -backends-mps -backends-vulkan -backends-arm-ethos-u -backends-qualcomm -backends-mediatek -backends-cadence -OpenVINO Backend -backends-nxp -``` +New to ExecuTorch? 
Start here for installation and your first model deployment. +::: -```{toctree} -:glob: -:maxdepth: 1 -:caption: Developer Tools -:hidden: +:::{grid-item-card} **Deploy on Edge Platforms** +:link: edge-platforms-section +:link-type: doc -devtools-overview -bundled-io -etrecord -etdump -runtime-profiling -model-debugging -model-inspector -memory-planning-inspection -delegate-debugging -devtools-tutorial -``` +Deploy on Android, iOS, Laptops / Desktops and embedded platforms with optimized backends. +::: -```{toctree} -:glob: -:maxdepth: 1 -:caption: Runtime -:hidden: +:::{grid-item-card} **Work with LLMs** +:link: llm/working-with-llms +:link-type: doc -runtime-overview -extension-module -extension-tensor -running-a-model-cpp-tutorial -runtime-backend-delegate-implementation-and-linking -runtime-platform-abstraction-layer -portable-cpp-programming -pte-file-format -ptd-file-format -``` +Export, optimize, and deploy Large Language Models on edge devices. +::: -```{toctree} -:glob: -:maxdepth: 1 -:caption: API Reference -:hidden: +:::{grid-item-card} 🔧 **Developer Tools** +:link: tools-section +:link-type: doc -export-to-executorch-api-reference -executorch-runtime-api-reference -runtime-python-api-reference -api-life-cycle -Javadoc -``` +Profile, debug, and inspect your models with comprehensive tooling. 
+::: -```{toctree} -:glob: -:maxdepth: 1 -:caption: Quantization -:hidden: +:::: -quantization-overview -``` +--- -```{toctree} -:glob: -:maxdepth: 1 -:caption: Kernel Library -:hidden: +## Explore Documentation -kernel-library-overview -kernel-library-custom-aten-kernel -kernel-library-selective-build -``` +::::{grid} 1 +:::{grid-item-card} **Intro** +:link: intro-section +:link-type: doc -```{toctree} -:glob: -:maxdepth: 2 -:caption: Working with LLMs -:hidden: +**Overview, architecture, and core concepts** — Understand how ExecuTorch works and its benefits +::: +:::: -Getting Started -Exporting LLMs with export_llm -Exporting custom LLMs -Running with C++ -Running on Android -Running on Android -Running on iOS -``` +::::{grid} 1 +:::{grid-item-card} **Quick Start** +:link: quick-start-section +:link-type: doc -```{toctree} -:glob: -:maxdepth: 1 -:caption: Backend Development -:hidden: +**Get started with ExecuTorch** — Install, export your first model, and run inference +::: +:::: -backend-delegates-integration -backend-delegates-xnnpack-reference -backend-delegates-dependencies -compiler-delegate-and-partitioner -debug-backend-delegate -``` +::::{grid} 1 +:::{grid-item-card} **Edge** +:link: edge-platforms-section +:link-type: doc -```{toctree} -:glob: -:maxdepth: 1 -:caption: IR Specification -:hidden: +**Android, iOS, Desktop, Embedded** — Platform-specific deployment guides and examples +::: +:::: -ir-exir -ir-ops-set-definition -``` +::::{grid} 1 +:::{grid-item-card} **Backends** +:link: backends-section +:link-type: doc -```{toctree} -:glob: -:maxdepth: 1 -:caption: Compiler Entry Points -:hidden: +**CPU, GPU, NPU/Accelerator backends** — Hardware acceleration and backend selection +::: +:::: + +::::{grid} 1 +:::{grid-item-card} **LLMs** +:link: llm/working-with-llms +:link-type: doc + +**LLM export, optimization, and deployment** — Complete LLM workflow for edge devices +::: +:::: + +::::{grid} 1 +:::{grid-item-card} **Advanced** +:link: 
advanced-topics-section +:link-type: doc + +**Quantization, memory planning, custom passes** — Deep customization and optimization +::: +:::: + +::::{grid} 1 +:::{grid-item-card} **Tools** +:link: tools-section +:link-type: doc + +**Developer tools, profiling, debugging** — Comprehensive development and debugging suite +::: +:::: -compiler-backend-dialect -compiler-custom-compiler-passes -compiler-memory-planning -``` +::::{grid} 1 +:::{grid-item-card} **API** +:link: api-section +:link-type: doc + +**API Reference Usages & Examples** — Detailed Python, C++, and Java API references +::: +:::: + +::::{grid} 1 +:::{grid-item-card} **💬 Support** +:link: support-section +:link-type: doc + +**FAQ, troubleshooting, contributing** — Get help and contribute to the project +::: +:::: + +--- + +## What's Supported + +::::{grid} 3 + +:::{grid-item} +**Model Types** + +- Large Language Models (LLMs) +- Computer Vision (CV) +- Speech Recognition (ASR) +- Text-to-Speech (TTS) +- More ... +::: + +:::{grid-item} +**Platforms** + +- Android & iOS +- Linux, macOS, Windows +- Embedded & MCUs +- Go **→ {doc}`edge-platforms-section`** +::: + +:::{grid-item} +**Rich Acceleration** + +- CPU +- GPU +- NPU +- DSP +- Go **→ {doc}`backends-section`** +::: + +:::: ```{toctree} -:glob: -:maxdepth: 1 -:caption: Contributing :hidden: +:maxdepth: 1 -contributing -``` +intro-section +quick-start-section +edge-platforms-section +backends-section +llm/working-with-llms +advanced-topics-section +tools-section +api-section +support-section diff --git a/docs/source/intro-how-it-works.md b/docs/source/intro-how-it-works.md index 3e6d384a62f..3ced602fed4 100644 --- a/docs/source/intro-how-it-works.md +++ b/docs/source/intro-how-it-works.md @@ -6,7 +6,7 @@ At a high-level, there are three steps for running a PyTorch model with ExecuTor 1. 
**Export the model.** The first step is to capture the PyTorch program as a graph, which is a new representation of the model that can be expressed in terms of a series of operators such as addition, multiplication, or convolution. This process safely preserves the semantics of the original PyTorch program. This representation is the first step to enable running the model on edge use cases that have low memory and/or low compute. 1. **Compile the exported model to an ExecuTorch program.** Given an exported model from step 1, convert it to an executable format called an ExecuTorch program that the runtime can use for inference. This step provides entry points for various optimizations such as compressing the model (e.g., quantization) to reduce size and further compiling subgraphs down to on-device specialized hardware accelerators to improve latency. It also provides an entry point for memory planning, i.e. to efficiently plan the location of intermediate tensors to reduce the runtime memory footprint. -1. **Run the ExecuTorch program on a target device.** Given an input--such as an image represented as an input activation tensor--the ExecuTorch runtime loads the ExecuTorch program, executes the instructions represented by the program, and computes an output. This step is efficient because (1) the runtime is lightweight and (2) an efficient execution plan has already been calculated in steps 1 and 2, making it possible to do performant inference. Furthermore, portability of the core runtime enabled performant execution even on highly-constrained devices. +1. **Run the ExecuTorch program on a target device.** Given an input--such as an image represented as an input activation tensor--the ExecuTorch runtime loads the ExecuTorch program, executes the instructions represented by the program, and computes an output. 
This step is efficient because (1) the runtime is lightweight and (2) an efficient execution plan has already been calculated in steps 1 and 2, making it possible to do performant inference. Furthermore, portability of the core runtime enables performant execution even on highly-constrained devices. This figure illustrates the three-step process of exporting a PyTorch program, compiling it into an ExecuTorch program that targets a specific hardware device, and finally executing the program on the device using the ExecuTorch runtime. ![name](_static/img/how-executorch-works-high-level.png) diff --git a/docs/source/intro-overview.md b/docs/source/intro-overview.md index 96c7982b8fe..be2fd468716 100644 --- a/docs/source/intro-overview.md +++ b/docs/source/intro-overview.md @@ -20,7 +20,7 @@ Key value propositions of ExecuTorch are: ## Why ExecuTorch? Supporting on-device AI presents unique challenges with diverse hardware, -critical power requirements, low/no internet connectivity, and realtime +critical power requirements, low/no internet connectivity, and real-time processing needs. These constraints have historically prevented or slowed down the creation of scalable and performant on-device AI solutions. We designed ExecuTorch, backed by our industry partners like Meta, Arm, Apple, and Qualcomm, diff --git a/docs/source/intro-section.md b/docs/source/intro-section.md new file mode 100644 index 00000000000..2f6f3c57c88 --- /dev/null +++ b/docs/source/intro-section.md @@ -0,0 +1,27 @@ +(intro-section)= + +# Intro + +Overview, architecture, and core concepts of ExecuTorch. + +ExecuTorch is PyTorch's solution for efficient AI inference on edge devices, providing portability, productivity, and performance for edge computing platforms. + +## Getting Started with ExecuTorch + +New to ExecuTorch? 
Start with these foundational topics: + +- **{doc}`intro-overview`** - High-level overview of ExecuTorch capabilities +- **{doc}`intro-how-it-works`** - Technical overview of the ExecuTorch workflow +- **{doc}`getting-started-architecture`** - System architecture and components +- **{doc}`concepts`** - Core concepts and terminology + +```{toctree} +:hidden: +:maxdepth: 2 +:caption: Introduction Topics + +intro-overview +intro-how-it-works +getting-started-architecture +concepts +``` diff --git a/docs/source/ios-backends.md b/docs/source/ios-backends.md new file mode 100644 index 00000000000..cb186f53319 --- /dev/null +++ b/docs/source/ios-backends.md @@ -0,0 +1,19 @@ +(ios-backends)= +# Backends + +Available hardware acceleration backends for iOS deployment. + +## Apple Hardware Acceleration (Recommended) + +- {doc}`ios-coreml` — CoreML (NPU/GPU, recommended for iOS) +- {doc}`ios-mps` — Metal Performance Shaders (GPU) + +## CPU Acceleration + +- {doc}`ios-xnnpack` — XNNPACK (CPU acceleration) + +```{toctree} +:hidden: +ios-coreml +ios-mps +ios-xnnpack diff --git a/docs/source/ios-coreml.md b/docs/source/ios-coreml.md new file mode 100644 index 00000000000..48271326d87 --- /dev/null +++ b/docs/source/ios-coreml.md @@ -0,0 +1 @@ +```{include} backends-coreml.md diff --git a/docs/source/ios-examples.md b/docs/source/ios-examples.md new file mode 100644 index 00000000000..86acf3273a6 --- /dev/null +++ b/docs/source/ios-examples.md @@ -0,0 +1,4 @@ +# Examples & Demos + +- [iOS LLM Examples Repository](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/apple) +- [MobileViT Demo App](https://github.com/meta-pytorch/executorch-examples/tree/main/mv3/apple/ExecuTorchDemo) diff --git a/docs/source/ios-mps.md b/docs/source/ios-mps.md new file mode 100644 index 00000000000..d6f305d33aa --- /dev/null +++ b/docs/source/ios-mps.md @@ -0,0 +1 @@ +```{include} backends-mps.md diff --git a/docs/source/ios-section.md b/docs/source/ios-section.md new file mode 100644 
index 00000000000..33c9a61ce1d --- /dev/null +++ b/docs/source/ios-section.md @@ -0,0 +1,23 @@ +(ios-section)= +# iOS + +Deploy ExecuTorch on iOS devices with Apple hardware acceleration. + +## Quick Start & Integration + +- {doc}`using-executorch-ios` — Complete iOS integration guide + +## Backends + +- {doc}`ios-backends` — Available iOS backends and acceleration options + +## Examples & Demos + +- {doc}`ios-examples` — Explore iOS Examples & Demos + + +```{toctree} +:hidden: +using-executorch-ios +ios-backends +ios-examples diff --git a/docs/source/ios-xnnpack.md b/docs/source/ios-xnnpack.md new file mode 100644 index 00000000000..315dd747006 --- /dev/null +++ b/docs/source/ios-xnnpack.md @@ -0,0 +1 @@ +```{include} backends-xnnpack.md diff --git a/docs/source/ir-specification.md b/docs/source/ir-specification.md new file mode 100644 index 00000000000..c58098ffc67 --- /dev/null +++ b/docs/source/ir-specification.md @@ -0,0 +1,8 @@ +# IR Specification + +```{toctree} +:maxdepth: 1 + +ir-exir +ir-ops-set-definition +``` diff --git a/docs/source/kernel-library-advanced.md b/docs/source/kernel-library-advanced.md new file mode 100644 index 00000000000..5f0215b87c1 --- /dev/null +++ b/docs/source/kernel-library-advanced.md @@ -0,0 +1,23 @@ +(kernel-library-advanced)= + +# Kernel Library Deep Dive + +Advanced kernel implementation and customization for ExecuTorch. 
+ +## Kernel Library Overview + +- {doc}`kernel-library-overview` — Architecture and design of the kernel library + +- {doc}`kernel-library-custom-aten-kernel` — Kernel registration and customization + +## Build Optimization + +- {doc}`kernel-library-selective-build` — Selective build for reduced binary footprint + +```{toctree} +:hidden: +:maxdepth: 1 + +kernel-library-overview +kernel-library-custom-aten-kernel +kernel-library-selective-build diff --git a/docs/source/kernel-library-overview.md b/docs/source/kernel-library-overview.md index cfd46524097..a826b334ba4 100644 --- a/docs/source/kernel-library-overview.md +++ b/docs/source/kernel-library-overview.md @@ -1,7 +1,7 @@ -This page provides a description of the Portable Kernel Library and the Optimized Kernel Library, which are the default kernel libraries shipped with ExecuTorch. It is recommended reading for those who are interested in executing ExecuTorch programs with these kernel libraries, or for those who want to implement their own kernels and kernel libraries. - # Overview of ExecuTorch’s Kernel Libraries +This page provides a description of the Portable Kernel Library and the Optimized Kernel Library, which are the default kernel libraries shipped with ExecuTorch. It is recommended reading for those who are interested in executing ExecuTorch programs with these kernel libraries, or for those who want to implement their own kernels and kernel libraries. + An ExecuTorch program encodes instructions that describe the computation that should be performed by the program. Many of these instructions will correspond to calling a specific ATen operator, for example `aten.convolution`. However, one of the core design principles of ExecuTorch is that the signature of an operator should be separate from the implementation of the operator. 
This means that the ExecuTorch runtime does not ship with any standard implementation for ATen operators; users must make sure to link against kernel libraries that contain implementations of the operators required by their ExecuTorch program, and configure [operator registration](kernel-library-custom-aten-kernel.md) to map an operator signature to the desired implementation. This makes it easy to adjust the implementation of operators such as `aten.convolution` that will be called when executing an ExecuTorch program; it allows users to select the exact operator implementations that will meet the unique performance, memory usage, battery usage, etc. constraints of their use-case. **In essence, a kernel library is simply a collection of ATen operator implementations that follow a common theme or design principle**. Note that due to ExecuTorch’s selective build process (discussed in the following section), operator implementations are linked individually. This means that users can easily mix different kernel libraries in their build without sacrificing build size. diff --git a/docs/source/kernel-library-selective-build.md b/docs/source/kernel-library-selective-build.md index 7d6495656a2..666206acb94 100644 --- a/docs/source/kernel-library-selective-build.md +++ b/docs/source/kernel-library-selective-build.md @@ -65,7 +65,7 @@ gen_selected_ops( ) ``` -The macro makes a call to gen_oplist.py, which requires a [distinct selection](https://github.com/BujSet/executorch/blob/main/codegen/tools/gen_oplist.py#L222-L228) of API choice. `OPS_SCHEMA_YAML`, `ROOT_OPS`, `INCLUDE_ALL_OPS`, and `OPS_FROM_MODEL` are mutually exclusive options, and should not be used in conjunction. +The macro makes a call to gen_oplist.py, which requires a [distinct selection](https://github.com/pytorch/executorch/blob/main/codegen/tools/gen_oplist.py#L222-L228) of API choice. 
`OPS_SCHEMA_YAML`, `ROOT_OPS`, `INCLUDE_ALL_OPS`, and `OPS_FROM_MODEL` are mutually exclusive options, and should not be used in conjunction. ### Select all ops @@ -83,7 +83,7 @@ This API lets users pass in a list of operator names. Note that this API can be ### Select ops from model -This API lets users pass in a pte file of an exported model. When used, the pte file will be parsed to generate a yaml file that enumerates the operators and dtypes used in the model. +This API lets users pass in a pte file of an exported model. When used, the pte file will be parsed to generate a yaml file that enumerates the operators and dtypes used in the model. ### Dtype Selective Build @@ -91,7 +91,7 @@ Beyond pruning the binary to remove unused operators, the binary size can furthe ## Example Walkthrough -In [examples/selective_build/CMakeLists.txt](https://github.com/BujSet/executorch/blob/main/examples/selective_build/CMakeLists.txt#L48-L72), we have the following cmake config options: +In [examples/selective_build/CMakeLists.txt](https://github.com/pytorch/executorch/blob/main/examples/selective_build/advanced/CMakeLists.txt), we have the following cmake config options: 1. `EXECUTORCH_SELECT_OPS_YAML` 2. `EXECUTORCH_SELECT_OPS_LIST` @@ -99,10 +99,10 @@ In [examples/selective_build/CMakeLists.txt](https://github.com/BujSet/executorc 4. `EXECUTORCH_SELECT_OPS_FROM_MODEL` 5. `EXECUTORCH_DTYPE_SELECTIVE_BUILD` -These options allow a user to tailor the cmake build process to utilize the different APIs, and results in different invocations on the `gen_selected_ops` [function](https://github.com/BujSet/executorch/blob/main/examples/selective_build/CMakeLists.txt#L110-L123). 
The following table describes some examples of how the invocation changes when these configs are set: +These options allow a user to tailor the cmake build process to utilize the different APIs, and results in different invocations on the `gen_selected_ops` [function](https://github.com/pytorch/executorch/blob/main/examples/selective_build/advanced/CMakeLists.txt). The following table describes some examples of how the invocation changes when these configs are set: | Example cmake Call | Resultant `gen_selected_ops` Invocation | -| :----: | :---:| +| :----: | :---:| |
cmake -D… -DEXECUTORCH_SELECT_OPS_LIST="aten::add.out,aten::mm.out"
|
gen_selected_ops("" "${SELECT_OPS_LIST}" "" "" "")
| |
cmake -D… -DEXECUTORCH_SELECT_OPS_YAML=ON
|
set(_custom_ops_yaml ${EXECUTORCH_ROOT}/examples/portable/custom_ops/custom_ops.yaml)
gen_selected_ops("${_custom_ops_yaml}" "" "" "" "")
| |
cmake -D… -DEXECUTORCH_SELECT_OPS_FROM_MODEL="model.pte.out"
|
gen_selected_ops("" "" "" "${_model_path}" "")
| diff --git a/docs/source/kernel-library.md b/docs/source/kernel-library.md new file mode 100644 index 00000000000..a995a20973b --- /dev/null +++ b/docs/source/kernel-library.md @@ -0,0 +1,9 @@ +# Kernel Library + +```{toctree} +:maxdepth: 1 + +kernel-library-overview +kernel-library-custom-aten-kernel +kernel-library-selective-build +``` diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md index 4587589a51b..ae1b4f15c99 100644 --- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -1,6 +1,7 @@ -# Building and Running Llama 3 8B Instruct with Qualcomm AI Engine Direct Backend +# Run Llama 3 3B Instruct on Android (with Qualcomm AI Engine Direct Backend) -This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Engine Direct Backend and running the model on a Qualcomm device. +This tutorial demonstrates how to export and run the Llama 3 3B Instruct model on a Qualcomm device using the Qualcomm AI Engine Direct Backend via ExecuTorch. +We use a static Llama [implementation](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/oss_scripts/llama/model/static_llama.py) to optimize performance and memory usage during on-device inference. ## Prerequisites @@ -13,10 +14,8 @@ This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Eng ## Instructions -### Step 1: Prepare the checkpoint of the model and optimized matrix from [Spin Quant](https://github.com/facebookresearch/SpinQuant) - -1. For Llama 3 tokenizer and checkpoint, please refer to https://github.com/meta-llama/llama-models/blob/main/README.md for further instructions on how to download `tokenizer.model`, `consolidated.00.pth` and `params.json`. -2. To get the optimized matrix, please refer to [SpinQuant on GitHub](https://github.com/facebookresearch/SpinQuant). 
You can download the optimized rotation matrices in the Quantized Models section. Please choose **LLaMA-3-8B/8B_W4A16KV16_lr_1.5_seed_0**. +### Step 1: Prepare the checkpoint and tokenizer of the model. +1. For Llama 3 tokenizer and checkpoint, please refer to [instructions](https://www.llama.com/models/llama-3) for further instructions on how to download `tokenizer.model`, `consolidated.00.pth` and `params.json`. ### Step 2: Export to ExecuTorch with Qualcomm AI Engine Direct Backend Deploying large language models like Llama 3 on-device presents the following challenges: @@ -25,122 +24,79 @@ Deploying large language models like Llama 3 on-device presents the following ch 2. High model loading and inference time. 3. Difficulty in quantization. -To address these challenges, we have implemented the following solutions: -1. Using `quantization.pt2e_quantize = "qnn_16a4w'` to quantize activations and weights, thereby reducing the on-disk model size and alleviating memory pressure during inference. -2. Using `backed.qnn.num_sharding = 8` to shard the model into sub-parts. -3. Performing graph transformations to convert or decompose operations into more accelerator-friendly operations. -4. Using `backend.qnn.optimized_rotation_path = ""` to apply R1 and R2 of [Spin Quant](https://github.com/facebookresearch/SpinQuant) to improve accuracy. -5. Using `quantization.calibration_data = "<|start_header_id|>system<|end_header_id|..."` to ensure that during quantization, the calibration includes special tokens in the prompt template. For more details on the prompt template, refer to [the model card](https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/). +To address these, we apply the following optimizations: + +1. Quantization: Use `QuantDtype.use_16a4w_block` for post-training quantization to reduce model size and memory usage. + +2. Mixed Precision Quantization: compresses KV cache tensors to 8-bit and applies `QuantDtype.use_16a8w` to the LM head. + +3. 
Model Sharding: Set `num_sharding` = 4 to shard the model into sub-parts. This helps reduce memory pressure and improve performance during on-device inference. The number of shards might be different depending on the model size. + +4. Graph Transformations: Convert operations into accelerator-friendly formats for better runtime performance. + +You can find the full optimization configuration in this [file](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/oss_scripts/llama/__init__.py), as shown below: + +``` python +@register_llm_model("llama3_2-3b_instruct") +@dataclass(init=False, frozen=True) +class Llama3_2_3B_Instruct(LLMModelConfig): + repo_id = None + params_path = None + convert_weights = None + transform_weight = True + # The Llama3_2 enabled should be instruct, however, Llama's tokenizer does not provide utility to apply chat template. + instruct_model = False + + num_sharding = 4 + # quant config + ptq = QuantDtype.use_16a4w_block + group_size = 32 # Group size used in block quantization for weight quantization. Will only be used when ptq = 16a4w_block + masked_softmax = False + + # SeqMSE Quantization: optimizes the parameter encodings of each layer of a model individually to minimize the difference between the layer’s original and quantized outputs. (Implementation details: ./backends/qualcomm/_passes/seq_mse.py) In this configuration, we set `seq_mse_candidates` = 0, which means SeqMSE quantization is not applied. + seq_mse_candidates = 0 + r1 = False + r2 = False + r3 = False + custom_annotation = ( + annotate_kv_8bit, + annotate_output_16a8w, + ) +``` + To export with the Qualcomm AI Engine Direct Backend, ensure the following: -1. The host machine has more than 100GB of memory (RAM + swap space). +1. The host machine has more than 64GB of memory (RAM + swap space). 2. The entire process takes a few hours. 
```bash -# path/to/config.yaml -base: - model_class: llama3 - checkpoint: path/to/consolidated.00.pth - params: path/to/params.json - tokenizer_path: path/to/tokenizer.model - metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' -model: - use_kv_cache: True - enable_dynamic_shape: False -quantization: - pt2e_quantize: qnn_16a4w - # Please note that calibration_data must include the prompt template for special tokens. - calibration_data: "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" -backend: - qnn: - enabled: True - num_sharding: 8 - - -# export_llm -python -m extension.llm.export.export_llm \ - --config path/to/config.yaml +# export llama +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1 --compile_only ``` +Note: end-to-end [instructions](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/oss_scripts/llama/README.md) ### Step 3: Invoke the Runtime on an Android smartphone with Qualcomm SoCs -1. 
Build executorch with Qualcomm AI Engine Direct Backend for android - ```bash - cmake \ - -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake" \ - -DANDROID_ABI=arm64-v8a \ - -DCMAKE_INSTALL_PREFIX=cmake-android-out \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_QNN=ON \ - -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ - -Bcmake-android-out . - - cmake --build cmake-android-out -j16 --target install --config Release - ``` -2. Build llama runner for android -```bash - cmake \ - -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}"/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DCMAKE_INSTALL_PREFIX=cmake-android-out \ - -DCMAKE_BUILD_TYPE=Release -DPYTHON_EXECUTABLE=python \ - -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ - -Bcmake-android-out/examples/models/llama examples/models/llama - - cmake --build cmake-android-out/examples/models/llama -j16 --config Release -``` -3. Run on Android via adb shell -*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone - **3.1 Connect your android phone** -**3.2 We need to push required QNN libraries to the device.** -```bash -# make sure you have write-permission on below path. 
-DEVICE_DIR=/data/local/tmp/llama -adb shell mkdir -p ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR} -``` - -**3.3 Upload model, tokenizer and llama runner binary to phone** -```bash -adb push ${DEVICE_DIR} -adb push ${DEVICE_DIR} -adb push cmake-android-out/lib/libqnn_executorch_backend.so ${DEVICE_DIR} -adb push cmake-out-android/examples/models/llama/llama_main ${DEVICE_DIR} -``` +**3.2 Make sure the following artifact is present before running the model.** +-- artifact/ + └── llama_qnn.pte -**3.4 Run model** +**3.3 Run model** ```bash -adb shell "cd ${DEVICE_DIR} && ./llama_main --model_path --tokenizer_path --prompt \"<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n\" --seq_len 128" -``` -You should see the message: -``` -<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello! I'd be delighted to chat with you about Facebook. Facebook is a social media platform that was created in 2004 by Mark Zuckerberg and his colleagues while he was a student at Harvard University. 
It was initially called "Facemaker" but later changed to Facebook, which is a combination of the words "face" and "book". The platform was initially intended for people to share their thoughts and share information with their friends, but it quickly grew to become one of the +# Run llama +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1 --pre_gen_pte ${PATH_TO_ARTIFACT} ``` ## What is coming? - Performance improvements - Reduce the memory pressure during inference to support 12GB Qualcomm devices -- Support more LLMs (Qwen, Phi-4-mini, etc.) +- Broader LLM Support via [Optimum ExecuTorch](https://github.com/huggingface/optimum-executorch?tab=readme-ov-file#llms-large-language-models) + + - Already supported models (e.g.): Llama2, Llama3, Gemma, Qwen, Phi-4, SmolLM. For usage examples, please refer to [README](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/oss_scripts/llama/README.md) ## FAQ If you encounter any issues while reproducing the tutorial, please file a github -issue on ExecuTorch repo and tag use `#qcom_aisw` tag +[issue](https://github.com/pytorch/executorch/issues) on ExecuTorch repo and tag use `#qcom_aisw` tag \ No newline at end of file diff --git a/docs/source/llm/export-custom-llm.md b/docs/source/llm/export-custom-llm.md index 57537ba31d8..4797f773fa3 100644 --- a/docs/source/llm/export-custom-llm.md +++ b/docs/source/llm/export-custom-llm.md @@ -81,7 +81,7 @@ with open("nanogpt.pte", "wb") as file: To export, run the script with `python export_nanogpt.py` (or python3, as appropriate for your environment). It will generate a `nanogpt.pte` file in the current directory. 
-For more information, see [Exporting to ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) and +For more information, see [Exporting to ExecuTorch](../tutorials/export-to-executorch-tutorial) and [torch.export](https://pytorch.org/docs/stable/export.html). ## Backend delegation @@ -143,7 +143,7 @@ example_inputs = ( # long as they adhere to the rules specified in the dynamic shape configuration. # Here we set the range of 0th model input's 1st dimension as # [0, model.config.block_size]. -# See https://pytorch.org/executorch/main/concepts.html#dynamic-shapes +# See ../concepts.html#dynamic-shapes # for details about creating dynamic shapes. dynamic_shape = ( {1: torch.export.Dim("token_dim", max=model.config.block_size - 1)}, diff --git a/docs/source/llm/export-llm.md b/docs/source/llm/export-llm.md index 462d9a51849..082b8c2b18d 100644 --- a/docs/source/llm/export-llm.md +++ b/docs/source/llm/export-llm.md @@ -4,7 +4,7 @@ Instead of needing to manually write code to call torch.export(), use ExecuTorch ## Prerequisites -The LLM export functionality requires the `pytorch_tokenizers` package. If you encounter a `ModuleNotFoundError: No module named 'pytorch_tokenizers'` error, install it from the ExecutorTorch source code: +The LLM export functionality requires the `pytorch_tokenizers` package. If you encounter a `ModuleNotFoundError: No module named 'pytorch_tokenizers'` error, install it from the ExecuTorch source code: ```bash pip install -e ./extension/llm/tokenizers/ @@ -78,7 +78,7 @@ python -m extension.llm.export.export_llm \ - `use_shared_embedding` can help for models with tied input/output embedding layers, given that you quantize using TorchAO low bit ops (`quantization.qmode: torchao:8da(\\d+)w` or `quantization.qmode: torchao:fpa(\d+)w`), see more [here](https://github.com/pytorch/executorch/blob/main/extension/llm/export/config/llm_config.py#L307). 
- `use_attention_sink` to extend generation by removing from the beginning of the KV cache when the max context length is reached. - `quantize_kv_cache` quantizes the KV cache in int8. -- `local_global_attention` impements [Local-Global Attention](https://arxiv.org/abs/2411.09604), making specific attention layers use a much smaller localized sliding window KV cache. +- `local_global_attention` implements [Local-Global Attention](https://arxiv.org/abs/2411.09604), making specific attention layers use a much smaller localized sliding window KV cache. ## Quantization Quantization options are defined by [`QuantizationConfig`](https://github.com/pytorch/executorch/blob/main/extension/llm/export/config/llm_config.py#L283). ExecuTorch does quantization in two ways: @@ -92,7 +92,7 @@ The quantization modes are defined [here](https://github.com/pytorch/executorch/ Common ones to use are: - `8da4w`: short for int8 dynamic activation + int4 weight quantization. -- `int8`: int8 weight-only quanziation. +- `int8`: int8 weight-only quantization. Group size is specified with: - `group_size`: 8, 32, 64, etc. 
diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index 849418342b6..6b6f9d96df7 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -21,6 +21,6 @@ Deploying LLMs to ExecuTorch can be boiled down to a two-step process: (1) expor - [Exporting LLMs](export-llm.md) - [Exporting custom LLMs](export-custom-llm.md) - [Running with C++](run-with-c-plus-plus.md) -- [Running on Android (XNNPack)](llama-demo-android.md) +- [Running on Android (XNNPack)](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android) - [Running on Android (Qualcomm)](build-run-llama3-qualcomm-ai-engine-direct-backend.md) - [Running on iOS](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/apple) diff --git a/docs/source/llm/llama-demo-android.md b/docs/source/llm/llama-demo-android.md deleted file mode 100644 index 023f82baf33..00000000000 --- a/docs/source/llm/llama-demo-android.md +++ /dev/null @@ -1,2 +0,0 @@ -```{include} ../../../examples/demo-apps/android/LlamaDemo/README.md -``` diff --git a/docs/source/llm/run-with-c-plus-plus.md b/docs/source/llm/run-with-c-plus-plus.md index f987fcab2a5..217afad847b 100644 --- a/docs/source/llm/run-with-c-plus-plus.md +++ b/docs/source/llm/run-with-c-plus-plus.md @@ -10,7 +10,7 @@ Before you begin, make sure you have: - Please also see [Model Metadata](#model-metadata) section for important metadata to be serialized into `.pte`. 2. A tokenizer file compatible with your model - For HuggingFace tokenizers, this is a JSON file `tokenizer.json` - - For SentencePiece tokenizers, this is is a `tokenizer.model` file and normally live alongside the weights file + - For SentencePiece tokenizers, this is a `tokenizer.model` file and normally lives alongside the weights file 3. 
CMake and a C++ compiler installed - CMake version 3.29 or higher - g++ or clang compiler diff --git a/docs/source/llm/working-with-llms.md b/docs/source/llm/working-with-llms.md new file mode 100644 index 00000000000..4c238f7ae5c --- /dev/null +++ b/docs/source/llm/working-with-llms.md @@ -0,0 +1,18 @@ +(working-with-llms)= + +# LLMs + +Learn how to export LLM models and deploy them across different platforms and runtime environments. This section covers the complete workflow from model export to running inference on mobile devices and edge hardware. + + +```{toctree} +:maxdepth: 1 +:caption: Working with LLMs + +getting-started +export-llm +export-custom-llm +run-with-c-plus-plus +build-run-llama3-qualcomm-ai-engine-direct-backend +run-on-ios +``` diff --git a/docs/source/platforms-desktop.md b/docs/source/platforms-desktop.md new file mode 100644 index 00000000000..acbdb06a6b6 --- /dev/null +++ b/docs/source/platforms-desktop.md @@ -0,0 +1,23 @@ +# Desktop & Laptop + +ExecuTorch supports desktop and laptop deployment across Linux, macOS, and Windows. + +## Platform-Specific Guides +- [C++ Runtime Integration](using-executorch-cpp) - Complete setup guide +- [Building from Source](using-executorch-building-from-source) + +## Available Backends by Platform + +### Linux +- [XNNPACK (CPU)](backends-xnnpack) +- [OpenVINO (Intel)](build-run-openvino) +- [ARM Ethos-U (ARM64)](backends-arm-ethos-u) + +### macOS +- [CoreML (recommended)](backends-coreml) +- [MPS (Apple Silicon)](backends-mps) +- [XNNPACK (CPU)](backends-xnnpack) + +### Windows +- [XNNPACK (CPU)](backends-xnnpack) +- [OpenVINO (Intel)](build-run-openvino) diff --git a/docs/source/platforms-embedded.md b/docs/source/platforms-embedded.md new file mode 100644 index 00000000000..5ea248fc0d9 --- /dev/null +++ b/docs/source/platforms-embedded.md @@ -0,0 +1,19 @@ +# Embedded Platforms + +ExecuTorch supports embedded devices from microcontrollers to edge devices. 
+ +## Platform-Specific Guides +- [C++ Runtime Integration](using-executorch-cpp) - Complete setup guide +- [Building from Source](using-executorch-building-from-source) + +## Available Backends by Device Type + +### Microcontrollers +- [Cadence Xtensa Backend](backends-cadence) +- [ARM Ethos-U NPU Backend](backends-arm-ethos-u) +- [Custom Backend Development](backend-delegates-integration) + +### Edge Devices +- [ARM Ethos-U NPU Backend](backends-arm-ethos-u) +- [NXP eIQ Neutron Backend](backend-nxp) +- [Custom Hardware Integration](backend-delegates-integration) diff --git a/docs/source/ptd-file-format.md b/docs/source/ptd-file-format.md index 6381e8a071c..c7bad1f34c0 100644 --- a/docs/source/ptd-file-format.md +++ b/docs/source/ptd-file-format.md @@ -111,7 +111,7 @@ The flatbuffer-encoded metadata follows the headers and contains: ### Tensor Layout If a data segment contains a canonical tensor, it may have associated layout information: -- **Scalar type**: Data type (float32, int32, etc.) using ExecutorTorch scalar types. +- **Scalar type**: Data type (float32, int32, etc.) using ExecuTorch scalar types. - **Sizes**: Dimensions of the tensor. - **Dim order**: Memory layout order specifying how dimensions are arranged in memory. diff --git a/docs/source/quantization-optimization.md b/docs/source/quantization-optimization.md new file mode 100644 index 00000000000..d2005b3adac --- /dev/null +++ b/docs/source/quantization-optimization.md @@ -0,0 +1,20 @@ +(quantization-optimization)= + +# Quantization & Optimization + +Advanced techniques for model compression and performance optimization. 
+ +## Quantization Strategies + +- {doc}`quantization-overview` — Comprehensive quantization strategies and techniques + +## Performance Optimization + +- {doc}`runtime-profiling` — Performance profiling and optimization techniques + +```{toctree} +:hidden: +:maxdepth: 1 + +quantization-overview +runtime-profiling diff --git a/docs/source/quantization-overview.md b/docs/source/quantization-overview.md index fdceee80e8e..4ff8d34a4a8 100644 --- a/docs/source/quantization-overview.md +++ b/docs/source/quantization-overview.md @@ -14,7 +14,7 @@ Quantization in ExecuTorch is backend-specific. Each backend defines how models The PT2E quantization workflow has three main steps: 1. Configure a backend-specific quantizer. -2. Prepare, calibrate, convert, and evalute the quantized model in PyTorch +2. Prepare, calibrate, convert, and evaluate the quantized model in PyTorch 3. Lower the model to the target backend ## 1. Configure a Backend-Specific Quantizer diff --git a/docs/source/quantization.md b/docs/source/quantization.md new file mode 100644 index 00000000000..b5ee9f21897 --- /dev/null +++ b/docs/source/quantization.md @@ -0,0 +1,7 @@ +# Quantization + +```{toctree} +:maxdepth: 1 + +quantization-overview +``` diff --git a/docs/source/quick-start-section.md b/docs/source/quick-start-section.md new file mode 100644 index 00000000000..b35bed8d22c --- /dev/null +++ b/docs/source/quick-start-section.md @@ -0,0 +1,38 @@ +(quick-start-section)= +# Quick Start + +Get started with ExecuTorch in just a few steps. + +This section walks you through the essential steps to get ExecuTorch up and running, from initial setup to exporting your first model for edge deployment. + +## What You'll Learn + +Follow these guides in order to get started with ExecuTorch: + +- **{doc}`getting-started`** - Initial Setup: Set up your development environment and run your first ExecuTorch example. + +- **{doc}`using-executorch-export`** - Exporting your model: Export for Edge deployment. 
+ +- **{doc}`using-executorch-building-from-source`** - Building from Source: Build ExecuTorch from source for custom configurations and development. + +## Prerequisites + +- Python 3.10-3.12 +- PyTorch 2.9+ +- Basic familiarity with PyTorch model development + +## Next Steps + +After completing the quick start, explore: + +- **{doc}`edge-platforms-section`** - Deploy to specific platforms (Android, iOS, Desktop, Embedded) +- **{doc}`backends-section`** - Choose the right acceleration backend for your hardware + +```{toctree} +:hidden: +:maxdepth: 2 +:caption: Quick Start Guide + +getting-started +using-executorch-export +using-executorch-building-from-source diff --git a/docs/source/running-a-model-cpp-tutorial.md b/docs/source/running-a-model-cpp-tutorial.md index a12ef122bc8..5ae4235995d 100644 --- a/docs/source/running-a-model-cpp-tutorial.md +++ b/docs/source/running-a-model-cpp-tutorial.md @@ -6,13 +6,13 @@ In this tutorial, we will cover how to run an ExecuTorch model in C++ using the For a high level overview of the ExecuTorch Runtime please see [Runtime Overview](runtime-overview.md), and for more in-depth documentation on each API please see the [Runtime API Reference](executorch-runtime-api-reference.rst). -[Here](https://github.com/pytorch/executorch/blob/main/examples/portable/executor_runner/executor_runner.cpp) is a fully functional version C++ model runner, and the [Setting up ExecuTorch](getting-started-setup.md) doc shows how to build and run it. +[Here](https://github.com/pytorch/executorch/blob/main/examples/portable/executor_runner/executor_runner.cpp) is a fully functional version C++ model runner, and the [Setting up ExecuTorch](getting-started-setup.rst) doc shows how to build and run it. ## Prerequisites You will need an ExecuTorch model to follow along. We will be using -the model `SimpleConv` generated from the [Exporting to ExecuTorch tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial). 
+the model `SimpleConv` generated from the [Exporting to ExecuTorch tutorial](tutorials/export-to-executorch-tutorial) . ## Model Loading @@ -96,7 +96,7 @@ MemoryManager memory_manager(&method_allocator, &planned_memory); ## Loading a Method -In ExecuTorch we load and initialize from the `Program` at a method granularity. Many programs will only have one method 'forward'. `load_method` is where initialization is done, from setting up tensor metadata, to intializing delegates, etc. +In ExecuTorch we load and initialize from the `Program` at a method granularity. Many programs will only have one method 'forward'. `load_method` is where initialization is done, from setting up tensor metadata, to initializing delegates, etc. ``` cpp Result method = program->load_method(method_name); diff --git a/docs/source/runtime-integration-advanced.md b/docs/source/runtime-integration-advanced.md new file mode 100644 index 00000000000..a76265c4093 --- /dev/null +++ b/docs/source/runtime-integration-advanced.md @@ -0,0 +1,20 @@ +(runtime-integration-advanced)= + +# Runtime & Integration + +Advanced runtime integration topics + +## Platform Integration + +- {doc}`runtime-platform-abstraction-layer` — Platform abstraction layer for cross-platform deployment + +## Portable C++ Programming + +- {doc}`portable-cpp-programming` — Portable C++ programming for cross-platform deployment + +```{toctree} +:hidden: +:maxdepth: 1 + +runtime-platform-abstraction-layer +portable-cpp-programming diff --git a/docs/source/runtime-overview.md b/docs/source/runtime-overview.md index 96a618a2a41..1df3da40478 100644 --- a/docs/source/runtime-overview.md +++ b/docs/source/runtime-overview.md @@ -11,7 +11,7 @@ Works](intro-how-it-works.md). 
At the highest level, the ExecuTorch runtime is responsible for: * Loading binary `.pte` program files that were generated by the - [`to_executorch()`](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) step of the + [`to_executorch()`](tutorials/export-to-executorch-tutorial) step of the model-lowering process. * Executing the series of instructions that implement a lowered model. diff --git a/docs/source/runtime-profiling.md b/docs/source/runtime-profiling.md index 120d31954fd..56b62de599d 100644 --- a/docs/source/runtime-profiling.md +++ b/docs/source/runtime-profiling.md @@ -20,4 +20,4 @@ We provide access to all the profiling data via the Python [Inspector API](model - Through the Inspector API, users can do a wide range of analysis varying from printing out performance details to doing more finer granular calculation on module level. -Please refer to the [Developer Tools tutorial](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial) for a step-by-step walkthrough of the above process on a sample model. +Please refer to the [Developer Tools tutorial](tutorials/devtools-integration-tutorial) for a step-by-step walkthrough of the above process on a sample model. 
diff --git a/docs/source/runtime.md b/docs/source/runtime.md new file mode 100644 index 00000000000..1d96cc53188 --- /dev/null +++ b/docs/source/runtime.md @@ -0,0 +1,15 @@ +# Runtime + +```{toctree} +:maxdepth: 1 + +runtime-overview +extension-module +extension-tensor +running-a-model-cpp-tutorial +runtime-backend-delegate-implementation-and-linking +runtime-platform-abstraction-layer +portable-cpp-programming +pte-file-format +ptd-file-format +``` diff --git a/docs/source/success-stories.md b/docs/source/success-stories.md new file mode 100644 index 00000000000..cba874132c6 --- /dev/null +++ b/docs/source/success-stories.md @@ -0,0 +1,56 @@ +(success-stories)= + +# Success Stories + +Discover how organizations are leveraging ExecuTorch to deploy AI models at scale on edge devices. + +--- + +## 🎯 Featured Success Stories + +::::{grid} 1 +:gutter: 3 + +:::{grid-item-card} **🚀 Story 1: [Title Placeholder]** +:class-header: bg-primary text-white + +**Industry:** [Industry] +**Hardware:** [Hardware Platform] +**Impact:** [Key Metrics] + +[Placeholder Description] - Brief overview of the challenge, solution, and results achieved. + + +[Read Full Story →](#story-1-details) +::: + +:::{grid-item-card} **⚡ Story 2: [Title Placeholder]** +:class-header: bg-success text-white + +**Industry:** [Industry] +**Hardware:** [Hardware Platform] +**Impact:** [Key Metrics] + +[Placeholder Description] - Brief overview of the challenge, solution, and results achieved. + + + +[Read Full Story →](#story-2-details) +::: + +:::{grid-item-card} **🧠 Story 3: [Title Placeholder]** +:class-header: bg-info text-white + +**Industry:** [Industry] +**Hardware:** [Hardware Platform] +**Impact:** [Key Metrics] + +[Placeholder Description] - Brief overview of the challenge, solution, and results achieved. 
+ + +[Read Full Story →](#story-3-details) +::: + +:::: + +--- diff --git a/docs/source/support-section.md b/docs/source/support-section.md new file mode 100644 index 00000000000..64c47a3e55b --- /dev/null +++ b/docs/source/support-section.md @@ -0,0 +1,17 @@ +(support-section)= +# Support + +In this section, find answers to common questions, troubleshooting guides, and information on how to contribute to the ExecuTorch project. Get help with issues and learn how to participate in the community. + +- {doc}`using-executorch-faqs` — FAQ +- {doc}`using-executorch-troubleshooting` — Common Issues +- {doc}`contributing` — Contributing + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: Support + +using-executorch-faqs +using-executorch-troubleshooting +contributing diff --git a/docs/source/tools-section.md b/docs/source/tools-section.md new file mode 100644 index 00000000000..461a1f6849a --- /dev/null +++ b/docs/source/tools-section.md @@ -0,0 +1,30 @@ +(tools-sdk-section)= + +# Tools + +In this section, explore ExecuTorch's comprehensive developer tools for profiling, debugging, and model inspection. These tools help optimize performance and troubleshoot issues during development and deployment. 
+ +- {doc}`devtools-overview` — Developer Tools Overview +- {doc}`bundled-io` — Bundled I/O +- {doc}`etrecord` — ETRecord +- {doc}`etdump` — ETDump +- {doc}`runtime-profiling` — Profiling Suite +- {doc}`model-debugging` — Debugging Tools +- {doc}`model-inspector` — Model Inspector +- {doc}`memory-planning-inspection` — Memory Planning Inspection +- {doc}`devtools-tutorial` — Development Utilities + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: Tools + +devtools-overview +bundled-io +etrecord +etdump +runtime-profiling +model-debugging +model-inspector +memory-planning-inspection +devtools-tutorial diff --git a/docs/source/tutorial-arm-ethos-u.md b/docs/source/tutorial-arm-ethos-u.md new file mode 100644 index 00000000000..0c713e996f8 --- /dev/null +++ b/docs/source/tutorial-arm-ethos-u.md @@ -0,0 +1,214 @@ +# Arm Ethos-U NPU Backend Tutorial + + +::::{grid} 2 + +:::{grid-item-card} Tutorials we recommend you complete before this: +:class-card: card-prerequisites +* [Introduction to ExecuTorch](intro-how-it-works.md) +* [Getting Started](getting-started.md) +* [Building ExecuTorch with CMake](using-executorch-building-from-source.md) +::: + +:::{grid-item-card} What you will learn in this tutorial: +:class-card: card-prerequisites +In this tutorial you will learn how to export a simple PyTorch model for the ExecuTorch Ethos-U backend. +::: + +:::: + +```{tip} +If you are already familiar with this delegate, you may want to jump directly to the examples: +* [Examples in the ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm) +* [A commandline compiler for example models](https://github.com/pytorch/executorch/blob/main/examples/arm/aot_arm_compiler.py) +``` + +This tutorial serves as an introduction to using ExecuTorch to deploy PyTorch models on Arm® Ethos™-U targets. It is based on `ethos_u_minimal_example.ipynb`, provided in Arm’s examples folder. 
+ +## Prerequisites + +### Hardware + +To successfully complete this tutorial, you will need a Linux machine with aarch64 or x86_64 processor architecture, or a macOS™ machine with Apple® Silicon. + +To enable development without a specific development board, we will be using a [Fixed Virtual Platform (FVP)](https://www.arm.com/products/development-tools/simulation/fixed-virtual-platforms), simulating [Arm® Corstone™-300](https://developer.arm.com/Processors/Corstone-300)(cs300) and [Arm® Corstone™-320](https://developer.arm.com/Processors/Corstone-320)(cs320) systems. Think of it as virtual hardware. + +### Software + +First, you will need to install ExecuTorch. Please follow the recommended tutorials to set up a working ExecuTorch development environment. + +In addition to this, you need to install a number of SDK dependencies for generating Ethos-U command streams. Scripts to automate this are available in the main [ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm/). +To install Ethos-U dependencies, run +```bash +./examples/arm/setup.sh --i-agree-to-the-contained-eula +``` +This will install: +- [TOSA Serialization Library](https://www.mlplatform.org/tosa/software.html) for serializing the Exir IR graph into TOSA IR. +- [Ethos-U Vela graph compiler](https://pypi.org/project/ethos-u-vela/) for compiling TOSA flatbuffers into an Ethos-U command stream. +- [Arm GNU Toolchain](https://developer.arm.com/Tools%20and%20Software/GNU%20Toolchain) for cross compilation. +- [Corstone SSE-300 FVP](https://developer.arm.com/documentation/100966/1128/Arm--Corstone-SSE-300-FVP) for testing on Ethos-U55 reference design. +- [Corstone SSE-320 FVP](https://developer.arm.com/documentation/109760/0000/SSE-320-FVP) for testing on Ethos-U85 reference design. + +## Set Up the Developer Environment + +The setup.sh script generates a setup_path.sh script that you need to source whenever you restart your shell.
Run: + +```bash +source examples/arm/ethos-u-scratch/setup_path.sh +``` + +As a simple check that your environment is set up correctly, run `which FVP_Corstone_SSE-320` and make sure that the executable is located where you expect, in the `examples/arm` tree. + +## Build + +### Ahead-of-Time (AOT) components + +The ExecuTorch Ahead-of-Time (AOT) pipeline takes a PyTorch Model (a `torch.nn.Module`) and produces a `.pte` binary file, which is then consumed by the ExecuTorch Runtime. This [document](getting-started-architecture.md) goes in much more depth about the ExecuTorch software stack for both AoT as well as Runtime. + +The example below shows how to quantize a model consisting of a single addition, and export it through the AOT flow using the EthosU backend. For more details, see `examples/arm/ethos_u_minimal_example.ipynb`. + +```python +import torch + +class Add(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + return x + y + +example_inputs = (torch.ones(1,1,1,1),torch.ones(1,1,1,1)) + +model = Add() +model = model.eval() +exported_program = torch.export.export(model, example_inputs) +graph_module = exported_program.graph_module + + +from executorch.backends.arm.ethosu import EthosUCompileSpec +from executorch.backends.arm.quantizer import ( + EthosUQuantizer, + get_symmetric_quantization_config, +) +from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e + +# Create a compilation spec describing the target for configuring the quantizer +# Some args are used by the Arm Vela graph compiler later in the example.
Refer to Arm Vela documentation for an +# explanation of its flags: https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela/-/blob/main/OPTIONS.md +compile_spec = EthosUCompileSpec( + target="ethos-u55-128", + system_config="Ethos_U55_High_End_Embedded", + memory_mode="Shared_Sram", + extra_flags=["--output-format=raw", "--debug-force-regor"] + ) + +# Create and configure quantizer to use a symmetric quantization config globally on all nodes +quantizer = EthosUQuantizer(compile_spec) +operator_config = get_symmetric_quantization_config() +quantizer.set_global(operator_config) + +# Post training quantization +quantized_graph_module = prepare_pt2e(graph_module, quantizer) +quantized_graph_module(*example_inputs) # Calibrate the graph module with the example input +quantized_graph_module = convert_pt2e(quantized_graph_module) + + +# Create a new exported program using the quantized_graph_module +quantized_exported_program = torch.export.export(quantized_graph_module, example_inputs) +from executorch.backends.arm.ethosu import EthosUPartitioner +from executorch.exir import ( + EdgeCompileConfig, + ExecutorchBackendConfig, + to_edge_transform_and_lower, +) +from executorch.extension.export_util.utils import save_pte_program + +# Create partitioner from compile spec +partitioner = EthosUPartitioner(compile_spec) + +# Lower the exported program to the Ethos-U backend +edge_program_manager = to_edge_transform_and_lower( + quantized_exported_program, + partitioner=[partitioner], + compile_config=EdgeCompileConfig( + _check_ir_validity=False, + ), + ) + +# Convert edge program to executorch +executorch_program_manager = edge_program_manager.to_executorch( + config=ExecutorchBackendConfig(extract_delegate_segments=False) + ) + + +# Save pte file +save_pte_program(executorch_program_manager, "ethos_u_minimal_example.pte") +``` + + +```{tip} +For a quick start, you can use the script `examples/arm/aot_arm_compiler.py` to produce the pte. 
+To produce a pte file equivalent to the one above, run +`python -m examples.arm.aot_arm_compiler --model_name=add --delegate --quantize --output=ethos_u_minimal_example.pte` +``` + +### Runtime: + +After the AOT compilation flow is done, the runtime can be cross compiled and linked to the produced `.pte`-file using the Arm cross-compilation toolchain. This is done in two steps: + +First, build and install the ExecuTorch libraries and EthosUDelegate: +``` +# In ExecuTorch top-level, with sourced setup_path.sh +cmake -DCMAKE_BUILD_TYPE=Release --preset arm-baremetal -B cmake-out-arm . +cmake --build cmake-out-arm --target install -j$(nproc) +``` +Second, build and link the `arm_executor_runner` and generate kernel bindings for any non delegated ops. This is the actual program that will run on target. + +``` +# In ExecuTorch top-level, with sourced setup_path.sh +cmake -DCMAKE_TOOLCHAIN_FILE=`pwd`/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DET_PTE_FILE_PATH=ethos_u_minimal_example.pte \ + -DTARGET_CPU=cortex-m55 \ + -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \ + -DMEMORY_MODE=Shared_Sram \ + -DSYSTEM_CONFIG=Ethos_U55_High_End_Embedded \ + -Bethos_u_minimal_example \ + examples/arm/executor_runner +cmake --build ethos_u_minimal_example -j$(nproc) -- arm_executor_runner +``` + +```{tip} +For a quick start, you can use the script `backends/arm/scripts/build_executor_runner.sh` to build the runner. +To build a runner equivalent to the one above, run +`./backends/arm/scripts/build_executor_runner.sh --pte=ethos_u_minimal_example.pte` +``` + +The block diagram below shows, at the high level, how the various build artifacts are generated and are linked together to generate the final bare-metal executable. + +![](arm-delegate-runtime-build.svg) + + + +## Running on Corstone FVP Platforms + +Finally, use the `backends/arm/scripts/run_fvp.sh` utility script to run the .elf-file on simulated Arm hardware.
+``` +backends/arm/scripts/run_fvp.sh --elf=$(find ethos_u_minimal_example -name arm_executor_runner) --target=ethos-u55-128 +``` +The example application is by default built with an input of ones, so the expected result of the quantized addition should be close to 2. + + +## Takeaways + +In this tutorial you have learned how to use ExecuTorch to export a PyTorch model to an executable that can run on an embedded target, and then run that executable on simulated hardware. +To learn more, check out these learning paths: + +https://learn.arm.com/learning-paths/embedded-and-microcontrollers/rpi-llama3/ +https://learn.arm.com/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/ + +## FAQs + +If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). + + +``` +Arm is a registered trademark of Arm Limited (or its subsidiaries or affiliates). +``` diff --git a/docs/source/tutorial-arm-vgf.md b/docs/source/tutorial-arm-vgf.md new file mode 100644 index 00000000000..0e34e4be4b6 --- /dev/null +++ b/docs/source/tutorial-arm-vgf.md @@ -0,0 +1,221 @@ +# Arm VGF Backend Tutorial + + +::::{grid} 2 + +:::{grid-item-card} Tutorials we recommend you complete before this: +:class-card: card-prerequisites +* [Introduction to ExecuTorch](intro-how-it-works.md) +* [Getting Started](getting-started.md) +* [Building ExecuTorch with CMake](using-executorch-building-from-source.md) +::: + +:::{grid-item-card} What you will learn in this tutorial: +:class-card: card-prerequisites +In this tutorial you will learn how to export a simple PyTorch model for the ExecuTorch VGF backend. +::: + +:::: + +```{warning} +This delegate is under active development, to get best results please use a recent version. +The VGF backend support is in early development and you may encounter issues. 
+You may encounter some rough edges and features which may be documented or planned but not implemented, please refer to the in-tree documentation for the latest status of features. +``` + +```{tip} +If you are already familiar with this delegate, you may want to jump directly to the examples: +* [Examples in the ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm) +* [A commandline compiler for example models](https://github.com/pytorch/executorch/blob/main/examples/arm/aot_arm_compiler.py) +``` + +This tutorial serves as an introduction to using ExecuTorch to deploy PyTorch models on VGF targets. The tutorial is based on `vgf_minimal_example.ipynb`, provided in Arm®'s example folder. + +## Prerequisites + +### Hardware + +To successfully complete this tutorial, you will need a Linux machine with aarch64 or x86_64 processor architecture, or a macOS™ machine with Apple® Silicon. + +To enable development without a specific development board, we will be using the [ML SDK for Vulkan®](https://github.com/arm/ai-ml-sdk-for-vulkan/) to emulate the program consumer. + +### Software + +First, you will need to install ExecuTorch. Please follow the recommended tutorials if you haven't already, to set up a working ExecuTorch development environment. For the VGF backend it's recommended you [install from source](https://docs.pytorch.org/executorch/stable/using-executorch-building-from-source.html), or from a [nightly](https://download.pytorch.org/whl/nightly/executorch/). + +Additionally, you need to install a number of SDK dependencies for generating VGF files. For glslc, prefer installing it via your package manager. If this is not possible, and for other dependencies, there are scripts to automate installation available in the main [ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm/). glslc will then be installed via the Vulkan SDK.
+ +To install VGF dependencies, run +```bash +./examples/arm/setup.sh --i-agree-to-the-contained-eula --disable-ethos-u-deps --enable-mlsdk-deps +``` +This will install: +- [TOSA Serialization Library](https://www.mlplatform.org/tosa/software.html) for serializing the Exir IR graph into TOSA IR. +- [ML SDK Model Converter](https://github.com/arm/ai-ml-sdk-model-converter) for converting TOSA flatbuffers to VGF files. +- [Vulkan API (If needed)](https://www.vulkan.org) Should be set up locally for GPU execution support. +- [ML Emulation Layer for Vulkan](https://github.com/arm/ai-ml-emulation-layer-for-vulkan) for testing on Vulkan API. + + +## Set Up the Developer Environment + +The `setup.sh` script has generated a `setup_path.sh` script that you need to source whenever you restart your shell. Do this by running + +`source examples/arm/ethos-u-scratch/setup_path.sh` + +As a simple check that your environment is set up correctly, run + +```bash +which model-converter +``` +Make sure the executable is located where you expect, in the `examples/arm` tree. + +## Build + +### Ahead-of-Time (AOT) components + +The ExecuTorch Ahead-of-Time (AOT) pipeline takes a PyTorch Model (a `torch.nn.Module`) and produces a `.pte` binary file, which is then typically consumed by the ExecuTorch Runtime. This [document](getting-started-architecture.md) goes in much more depth about the ExecuTorch software stack for both AoT as well as Runtime. + +The example below shows how to quantize a model consisting of a single addition, and export it through the AOT flow using the VGF backend. For more details, see `examples/arm/vgf_minimal_example.ipynb`.
+ +```python +import torch + +class Add(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + return x + y + +example_inputs = (torch.ones(1,1,1,1),torch.ones(1,1,1,1)) + +model = Add() +model = model.eval() +exported_program = torch.export.export_for_training(model, example_inputs) +graph_module = exported_program.graph_module + + +from executorch.backends.arm.vgf import VgfCompileSpec +from executorch.backends.arm.quantizer import ( + VgfQuantizer, + get_symmetric_quantization_config, +) +from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e + +# Create a compilation spec describing the target for configuring the quantizer +compile_spec = VgfCompileSpec("TOSA-1.0+INT") + +# Create and configure quantizer to use a symmetric quantization config globally on all nodes +quantizer = VgfQuantizer(compile_spec) +operator_config = get_symmetric_quantization_config(is_per_channel=False) +quantizer.set_global(operator_config) + +# Post training quantization +quantized_graph_module = prepare_pt2e(graph_module, quantizer) +quantized_graph_module(*example_inputs) # Calibrate the graph module with the example input +quantized_graph_module = convert_pt2e(quantized_graph_module) + + +# Create a new exported program using the quantized_graph_module +quantized_exported_program = torch.export.export(quantized_graph_module, example_inputs) +import os +from executorch.backends.arm.vgf import VgfPartitioner +from executorch.exir import ( + EdgeCompileConfig, + ExecutorchBackendConfig, + to_edge_transform_and_lower, +) +from executorch.extension.export_util.utils import save_pte_program + +# Create partitioner from compile spec +partitioner = VgfPartitioner(compile_spec) + +# Lower the exported program to the VGF backend +edge_program_manager = to_edge_transform_and_lower( + quantized_exported_program, + partitioner=[partitioner], + compile_config=EdgeCompileConfig( + _check_ir_validity=False, + ), +) + +# Convert edge 
program to executorch +executorch_program_manager = edge_program_manager.to_executorch( + config=ExecutorchBackendConfig(extract_delegate_segments=False) +) + + +# Save pte file +cwd_dir = os.getcwd() +pte_base_name = "simple_example" +pte_name = pte_base_name + ".pte" +pte_path = os.path.join(cwd_dir, pte_name) +save_pte_program(executorch_program_manager, pte_name) +assert os.path.exists(pte_path), "Build failed; no .pte-file found" +``` + + +```{tip} +For a quick start, you can use the script `examples/arm/aot_arm_compiler.py` to produce the pte. +To produce a pte file equivalent to the one above, run +`python -m examples.arm.aot_arm_compiler --model_name=add --delegate --quantize --output=simple_example.pte --target=vgf` +``` + +### Runtime: + +## Build executor runtime + +After the AOT compilation flow is done, we can build the executor runner target. For this tutorial, the default runner can be used. Build it with the following configuration: + +```bash +# In ExecuTorch top-level, with sourced setup_path.sh +cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Debug \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_VULKAN=ON \ + -DEXECUTORCH_BUILD_VGF=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-out . + +cmake --build cmake-out --target executor_runner +``` + + +The block diagram below demonstrates, at the high level, how the various build artifacts are generated and are linked together to generate the final bare-metal executable.
+ +![](arm-delegate-runtime-build.svg) + + +## Deploying and running on device + +Since we are using the Vulkan emulation layer, we can run the executor runner with the VGF delegate on the host machine: + +```bash +./cmake-out/executor_runner -model_path simple_example.pte +``` + +The example application is by default built with an input of ones, so the expected result of the quantized addition should be close to 2. + +## Takeaways + +In this tutorial you have learned how to use ExecuTorch to export a PyTorch model to an executable that can run on an embedded target, and then run that executable on simulated hardware. + + +## FAQs + +*glslc is not found when configuring the executor runner*. + +The Vulkan sdk is likely not in your path, check whether setup_path.sh contains something like +`export PATH=$(pwd)/examples/arm/ethos-u-scratch/vulkan_sdk/1.4.321.1/x86_64/bin:$PATH`. +If not, add it and source the file. + +If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). + +``` +Arm is a registered trademark of Arm Limited (or its subsidiaries or affiliates). +``` \ No newline at end of file diff --git a/docs/source/tutorial-arm.md b/docs/source/tutorial-arm.md deleted file mode 100644 index 0692b631154..00000000000 --- a/docs/source/tutorial-arm.md +++ /dev/null @@ -1,467 +0,0 @@ -# Arm® Backend Tutorial - - -::::{grid} 2 - -:::{grid-item-card} Tutorials we recommend you complete before this: -:class-card: card-prerequisites -* [Introduction to ExecuTorch](intro-how-it-works.md) -* [Getting Started](getting-started.md) -* [Building ExecuTorch with CMake](using-executorch-building-from-source.md) -::: - -:::{grid-item-card} What you will learn in this tutorial: -:class-card: card-prerequisites -In this tutorial you will learn how to export a simple PyTorch model for ExecuTorch Arm backends. 
-::: - -:::: - -```{warning} -This delegate is under active development, to get best results please use a recent version. -The TOSA and Ethos(tm) backend support is reasonably mature and used in production by some users. -The VGF backend support is in early development and you may encounter issues. -You may encounter some rough edges and features which may be documented or planned but not implemented, please refer to the in-tree documentation for the latest status of features. -``` - -```{tip} -If you are already familiar with this delegate, you may want to jump directly to the examples: -* [Examples in the ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm) -* [Compilation for Ethos-U](https://github.com/pytorch/executorch/blob/main/examples/arm/ethos_u_minimal_example.ipynb) -* [A commandline compiler for example models](https://github.com/pytorch/executorch/blob/main/examples/arm/aot_arm_compiler.py) -``` - -## Prerequisites - -Let's make sure you have everything you need before you get started. - -### Hardware - -To successfully complete this tutorial, you will need a Linux or MacOS host machine with Arm aarch64 or x86_64 processor architecture. - -The target device will be an emulated platform to enable development without a specific development board. This tutorial has guidance for both Ethos-U targets and VGF via the ML SDK for Vulkan®. - -For Ethos-U and Cortex-M, We will be using a [Fixed Virtual Platform (FVP)](https://www.arm.com/products/development-tools/simulation/fixed-virtual-platforms), simulating [Corstone-300](https://developer.arm.com/Processors/Corstone-300)(cs300) and [Corstone-320](https://developer.arm.com/Processors/Corstone-320)(cs320)systems. Since we will be using the FVP (think of it as virtual hardware), we won't be requiring any real embedded hardware for this tutorial. - -For VGF we will be using the [ML SDK for Vulkan(R)](https://github.com/arm/ai-ml-sdk-for-vulkan/)) to emulate the program consumer. 
- -### Software - -First, you will need to install ExecuTorch. Please follow the recommended tutorials if you haven't already, to set up a working ExecuTorch development environment. For the VGF backend it's recommended you [install from source](https://docs.pytorch.org/executorch/stable/using-executorch-building-from-source.html), or from a [nightly](https://download.pytorch.org/whl/nightly/executorch/). - -In addition to this, you need to install a number of SDK dependencies for generating Ethos-U command streams or VGF files. There are scripts which automate this, which are found in the main [ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm/). - -## Set Up the Developer Environment - -In this section, we will do a one-time setup of the platform support files needed to run ExecuTorch programs in this tutorial. It is recommended to run the script in a conda or venv environment. - -With a checkout of the ExecuTorch repository, we will use the `examples/arm/setup.sh` script to pull each item in an automated fashion. - -For Ethos-U run: -```bash -./examples/arm/setup.sh --i-agree-to-the-contained-eula -``` - -For VGF run: -```bash -./examples/arm/setup.sh --i-agree-to-the-contained-eula --disable-ethos-u-deps --enable-mlsdk-deps -``` -It is possible to install both sets of dependencies if you omit the disable options. - - -### Notes: - -```{warning} -The `setup.sh` script has generated a `setup_path.sh` script that you need to source whenever you restart your shell. -``` - -i.e. run -`source executorch/examples/arm/ethos-u-scratch/setup_path.sh` - - -To confirm your environment is set up correctly and will enable you to generate .pte's for your target: - -For Ethos-U run: -```bash -# Check for Vela, which converts TOSA to Ethos-U command streams. -which vela -``` - -For VGF run: -```bash -# Check for model-converter, which converts TOSA to ML-SDK VGF format. 
-which model-converter -``` - -To ensure there's no environment pollution you should confirm these binaries reside within your executorch checkout, under the examples/arm tree. Other versions may present compatibility issues, so this should be corrected by modifying your environment variables such as ${PATH} appropriately. - - -## Convert the PyTorch Model to the `.pte` File - -`.pte` is a binary file produced by ExecuTorch Ahead-of-Time (AoT) pipeline by taking in a PyTorch Model (a torch.nn.Module), exporting it, running a variety of passes, and finally serializing it to a `.pte` file format. This binary file is typically consumed by the ExecuTorch Runtime. This [document](https://github.com/pytorch/executorch/blob/main/docs/source/getting-started-architecture.md) goes in much more depth about the ExecuTorch software stack for both AoT as well as Runtime. - -In this section, we will primarily focus on the AoT flow with the end goal of producing a `.pte` file. There are a set of export configurations to target different backends at runtime. For each, the AoT flow will produce a unique `.pte` file. We will explore a couple of different configurations producing different `.pte` files, particularly interesting for our Corstone-300 system and available processing elements. - -Before we get started, let's first talk about the PyTorch modules we will be using. - -### PyTorch Example Modules -We will use a couple of simple PyTorch Modules to explore the end-to-end flow. These modules will be used in various different ways throughout the tutorial, referring to them by their ``. - -#### SoftmaxModule -This is a very simple PyTorch module with just one [Softmax](https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html#torch.nn.Softmax) operator. 
- -```python -import torch - -class SoftmaxModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.softmax = torch.nn.Softmax() - - def forward(self, x): - z = self.softmax(x) - return z -``` - -Running it using the Python environment (on the same development Linux machine), you get the expected output. - -```python ->>> m = SoftmaxModule() ->>> m(torch.ones(2,2)) -tensor([[0.5000, 0.5000], - [0.5000, 0.5000]]) -``` - -#### AddModule -Let's write another simple PyTorch module with just one [Add](https://pytorch.org/docs/stable/generated/torch.add.html#torch.add) operator. - -```python -class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x + x -``` - -Running it in python shows that 1 + 1 produces 2 as exepected: - -```python ->>> m = AddModule() ->>> m(torch.ones(5, dtype=torch.int32)) # integer types for non-quantized Ethos-U delegation -tensor([2, 2, 2, 2, 2], dtype=torch.int32) -``` -Keep the inputs and outputs to these modules in mind. When you will lower and run this through alternate means as opposed to running on this Linux machine, you will use the same inputs, and expect the outputs to match with the one shown here. - -```{tip} -you need to be aware of data types for running networks on the Ethos-U as it is an integer only co-processor. For this example you use integer types explicitly, for typical use of such a flow networks are built and trained in floating point, and then are quantized from floating point to integer for efficient inference. -``` - -#### MobileNetV2 Module -[MobileNetV2](https://arxiv.org/abs/1801.04381) is a commonly used network for edge and mobile devices. -It's also available as a default model in [torchvision](https://github.com/pytorch/vision), so you can load it with the sample code below. 
-``` -from torchvision.models import mobilenet_v2 # @manual -from torchvision.models.mobilenetv2 import MobileNet_V2_Weights - -mv2 = mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT) -``` -For more details, refer to the code snippet [here](https://github.com/pytorch/executorch/blob/2354945d47f67f60d9a118ea1a08eef8ba2364b5/examples/models/mobilenet_v2/model.py#L18). - -### Non-delegated Workflow - -In the ExecuTorch AoT pipeline, one of the options is to select a backend. ExecuTorch offers a variety of different backends. Selecting backend is optional, it is typically done to target a particular mode of acceleration or hardware for a given model compute requirements. Without any backends, ExecuTorch runtime will fallback to using, available by default, a highly portable set of operators. - -It's expected that on platforms with dedicated acceleration like the Ethos-U55, that the non-delegated flow is used for two primary cases: -1. When the network is designed to be very small and best suited to run on the Cortex-M alone. -2. When the network has a mix of operations that can target the NPU and those that can't, e.g. the Ethos-U55 supports integer operations and so floating point softmax will fall back to execute on the CPU. - -In this flow, without any backend delegates, to illustrate the portability of the ExecuTorch runtime, as well as of the operator library you will skip specifying the backend during the `.pte` generation. - -Following script will serve as a helper utility to help generating the `.pte` file. This is available in the `examples/arm` directory. - -```bash -python3 -m examples.arm.aot_arm_compiler --model_name="softmax" -# This should produce ./softmax_arm_ethos-u55-128.pte -``` - -### Delegated Workflow - -Working with Arm, you introduced a new Arm backend delegate for ExecuTorch. This backend is under active development and has a limited set of features available as of writing this. 
- -By including a following step during the ExecuTorch AoT export pipeline to generate the `.pte` file, you can enable this backend delegate. - -```python -from executorch.backends.arm.arm_backend import generate_ethosu_compile_spec - -graph_module_edge.exported_program = to_backend( - model.exported_program, - ArmPartitioner(generate_ethosu_compile_spec("ethos-u55-128"))) -``` - -Similar to the non-delegate flow, the same script will server as a helper utility to help generate the `.pte` file. Notice the `--delegate` option to enable the `to_backend` call. - -For Ethos targets: -```bash -python3 -m examples.arm.aot_arm_compiler --model_name="add" --delegate -# This targets the default of ethos-u55-128, see --help for further targets -# should produce ./add_arm_delegate_ethos-u55-128.pte -``` - -For basic post-training quantization: -```bash -python3 -m examples.arm.aot_arm_compiler --model_name="mv2" --delegate --quantize -# This targets the default of ethos-u55-128, see --help for further targets -# should produce ./mv2_arm_delegate_ethos-u55-128.pte -``` - - -For VGF targets: -```bash -python3 -m examples.arm.aot_arm_compiler --model_name="add" --target=vgf --delegate -# should produce ./add_arm_delegate_vgf.pte -``` - -For basic post-training quantization: -```bash -python3 -m examples.arm.aot_arm_compiler --model_name="mv2" --target=vgf --delegate --quantize -# should produce ./mv2_arm_delegate_vgf.pte -``` - -To capture intermediates such as VGF for lower level integration, invoke with the "-i" option: -```bash -python3 -m examples.arm.aot_arm_compiler --model_name="mv2" --target=vgf --delegate --quantize -i ./mv2_output -# should produce ./mv2_arm_delegate_vgf.pte and intermediates in ./mv2_out/ -``` - -
- -At the end of this, you should have a number of different `.pte` files. - -- the SoftmaxModule, without any backend delegates. -- the AddModule, targeting the Arm Ethos-U backend. -- the Quantized MV2Model, targeting the Arm Ethos-U backend. -- the AddModule, targeting the VGF backend. -- the Quantized MV2Model, targeting the VGF backend. - -Now let's try to run these `.pte` files on a target. - -## Getting a Bare-Metal Executable - -In this section, you will go over steps that you need to go through to build the runtime application. This then run on the target device. In the executorch repository you have a functioning script which does the exact same steps. It is located at `executorch/examples/arm/run.sh`. You will use that to build necessary pieces and finally run the previously generated PTE file on an FVP. - -By default the `run.sh` will use `arm_test/` as an build and output folder and you will find the build artifacts under it. This can be controlled/overrided with the `--et_build_root` and the `--output` flags if needed. - -e.g. running `examples/arm/run.sh --model_name=add --target=ethos-u85-128` will produce a pte and elf file like this: - -```bash -arm_test/add/add_arm_delegate_ethos-u85-128.pte -arm_test/add/cmake-out/arm_executor_runner -``` -Also before you get started, make sure that you have completed ExecuTorch cmake build setup, and the instructions to setup the development environment described [earlier](#set-up-the-developer-environment). - -The block diagram below demonstrates, at the high level, how the various build artifacts are generated and are linked together to generate the final bare-metal executable. 
- -![](arm-delegate-runtime-build.svg) - -```{tip} -The `generate_pte_file` function in `run.sh` script produces the `.pte` files based on the models provided through `--model_name` input argument -``` - -### Generating ExecuTorch Libraries - -ExecuTorch's CMake build system produces a set of build pieces which are critical to building the ExecuTorch runtime with-in the bare-metal environment you have for Corstone FVPs from Ethos-U SDK. - -[This](using-executorch-building-from-source.md) document provides a detailed overview of each individual build piece. For running either variant of the `.pte` file, you will need a core set of libraries. Here is a list, - -- `libexecutorch.a` -- `libportable_kernels.a` -- `libportable_ops_lib.a` - -To run a `.pte` file with the Arm backend delegate call instructions, you will need the Arm backend delegate runtime library, that is, - -- `libexecutorch_delegate_ethos_u.a` - -These libraries are generated by the `backends/arm/scripts/build_executorch.sh` script called from the `run.sh` script. - -### Building the executor_runner Bare-Metal Application - -The SDK dir is the same one prepared [earlier](#setup-the-arm-ethos-u-software-development). And, you will be passing the `.pte` file (any one of them) generated above. - -Note, you have to generate a new `executor-runner` binary if you want to change the model or the `.pte` file. This constraint is from the constrained bare-metal runtime environment you have for Corstone-300/Corstone-320 platforms. The build also generates a kernel registration library for the relevant operators which could not be delegated to the EthosU, see the [Kernel Library Selective Build documentation](https://docs.pytorch.org/executorch/stable/kernel-library-selective-build.html). - -This step is executed by the build_executor_runner.sh script, which is invoked from the run.sh in the backends/arm/scripts folder. 
- -```{tip} -The `run.sh` script takes in `--target` option, which provides a way to provide a specific target, Corstone-300(ethos-u55-128) or Corstone-320(ethos-u85-128) -``` - -## Running on Corstone FVP Platforms - -Once the elf is prepared, regardless of the `.pte` file variant is used to generate the bare metal elf. `run.sh` will run the FVP for you via the `backends/arm/scripts/run_fvp.sh` script. - -#### Automatic FVP Selection - -- To run a specific test model with the compiler flag and target -```bash -./run.sh --model_name=mv2 --delegate --quantize --target=ethos-u85-128 -``` - -- To run a specific test model and target -```bash -./run.sh --model_name=mv2 --delegate --target=ethos-u85-128 -``` - -- To run all the test models iteratively in a loop , simply run -```bash -./run.sh -``` - -Note that you could use `build_executor_runner.sh` and `run_fvp.sh` scripts in tandem by passing the relevant --target argument (e.g., --target=ethos-u55-128), the correct FVP binary will be chosen automatically. For more details, see the [section on Runtime Integration](https://docs.pytorch.org/executorch/main/backends-arm-ethos-u.html#runtime-integration). 
- - -#### Manual FVP Binary Selection - -- If you build for the Ethos delegate U55/U65 target (e.g., using --target=ethos-u55-128 or --target=ethos-u65-256 with `build_executor_runner.sh` and `run_fvp.sh`), you should use the corresponding FVP binary: - - For U55: - ```bash - examples/arm/ethos-u-scratch/FVP-corstone300/models/Linux64_GCC-9.3/FVP_Corstone_SSE-300_Ethos-U55 - ``` - - For U65: - ```bash - examples/arm/ethos-u-scratch/FVP-corstone300/models/Linux64_GCC-9.3/FVP_Corstone_SSE-300_Ethos-U65 - ``` -- And say if you are not building for an Ethos target, use: - ```bash - examples/arm/ethos-u-scratch/FVP-corstone320/models/Linux64_GCC-9.3/FVP_Corstone_SSE-320 - ``` - -Following is an example usage: - -```bash -ethos_u_build_dir=examples/arm/executor_runner/ - -elf=$(find ${ethos_u_build_dir} -name "arm_executor_runner") - -FVP_Corstone_SSE-320 \ - -C mps4_board.subsystem.ethosu.num_macs=128 \ - -C mps4_board.visualisation.disable-visualisation=1 \ - -C vis_hdlcd.disable_visualisation=1 \ - -C mps4_board.telnetterminal0.start_telnet=0 \ - -C mps4_board.uart0.out_file='-' \ - -C mps4_board.uart0.shutdown_on_eot=1 \ - -a "${elf}" \ - --timelimit 120 || true # seconds- after which sim will kill itself -``` - -#### Verification of Successful FVP Execution -After running the FVP command, either automatically or manually, you should see output similar to the following on your shell if the execution is successful: - -```console -I [executorch:arm_executor_runner.cpp:364] Model in 0x70000000 $ -I [executorch:arm_executor_runner.cpp:366] Model PTE file loaded. Size: 4425968 bytes. -I [executorch:arm_executor_runner.cpp:376] Model buffer loaded, has 1 methods -I [executorch:arm_executor_runner.cpp:384] Running method forward -I [executorch:arm_executor_runner.cpp:395] Setup Method allocator pool. Size: 62914560 bytes. -I [executorch:arm_executor_runner.cpp:412] Setting up planned buffer 0, size 752640. 
-I [executorch:ArmBackendEthosU.cpp:79] ArmBackend::init 0x70000070 -I [executorch:arm_executor_runner.cpp:445] Method loaded. -I [executorch:arm_executor_runner.cpp:447] Preparing inputs... -I [executorch:arm_executor_runner.cpp:461] Input prepared. -I [executorch:arm_executor_runner.cpp:463] Starting the model execution... -I [executorch:ArmBackendEthosU.cpp:118] ArmBackend::execute 0x70000070 -I [executorch:ArmBackendEthosU.cpp:298] Tensor input/output 0 will be permuted -I [executorch:arm_perf_monitor.cpp:120] NPU Inferences : 1 -I [executorch:arm_perf_monitor.cpp:121] Profiler report, CPU cycles per operator: -I [executorch:arm_perf_monitor.cpp:125] ethos-u : cycle_cnt : 1498202 cycles -I [executorch:arm_perf_monitor.cpp:132] Operator(s) total: 1498202 CPU cycles -I [executorch:arm_perf_monitor.cpp:138] Inference runtime: 6925114 CPU cycles total -I [executorch:arm_perf_monitor.cpp:140] NOTE: CPU cycle values and ratio calculations require FPGA and identical CPU/NPU frequency -I [executorch:arm_perf_monitor.cpp:149] Inference CPU ratio: 99.99 % -I [executorch:arm_perf_monitor.cpp:153] Inference NPU ratio: 0.01 % -I [executorch:arm_perf_monitor.cpp:162] cpu_wait_for_npu_cntr : 729 CPU cycles -I [executorch:arm_perf_monitor.cpp:167] Ethos-U PMU report: -I [executorch:arm_perf_monitor.cpp:168] ethosu_pmu_cycle_cntr : 5920305 -I [executorch:arm_perf_monitor.cpp:171] ethosu_pmu_cntr0 : 359921 -I [executorch:arm_perf_monitor.cpp:171] ethosu_pmu_cntr1 : 0 -I [executorch:arm_perf_monitor.cpp:171] ethosu_pmu_cntr2 : 0 -I [executorch:arm_perf_monitor.cpp:171] ethosu_pmu_cntr3 : 503 -I [executorch:arm_perf_monitor.cpp:178] Ethos-U PMU Events:[ETHOSU_PMU_EXT0_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_EXT1_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_EXT0_WR_DATA_BEAT_WRITTEN, ETHOSU_PMU_NPU_IDLE] -I [executorch:arm_executor_runner.cpp:470] model_pte_loaded_size: 4425968 bytes. 
-I [executorch:arm_executor_runner.cpp:484] method_allocator_used: 1355722 / 62914560 free: 61558838 ( used: 2 % ) -I [executorch:arm_executor_runner.cpp:491] method_allocator_planned: 752640 bytes -I [executorch:arm_executor_runner.cpp:493] method_allocator_loaded: 966 bytes -I [executorch:arm_executor_runner.cpp:494] method_allocator_input: 602116 bytes -I [executorch:arm_executor_runner.cpp:495] method_allocator_executor: 0 bytes -I [executorch:arm_executor_runner.cpp:498] temp_allocator_used: 0 / 1048576 free: 1048576 ( used: 0 % ) -I [executorch:arm_executor_runner.cpp:152] Model executed successfully. -I [executorch:arm_executor_runner.cpp:156] 1 outputs: -Output[0][0]: -0.749744 -Output[0][1]: -0.019224 -Output[0][2]: 0.134570 -...(Skipped) -Output[0][996]: -0.230691 -Output[0][997]: -0.634399 -Output[0][998]: -0.115345 -Output[0][999]: 1.576386 -I [executorch:arm_executor_runner.cpp:177] Program complete, exiting. -I [executorch:arm_executor_runner.cpp:179] -``` - -```{note} -The `run.sh` script provides various options to select a particular FVP target, use desired models, select portable kernels and can be explored using the `--help` argument -``` - -## Running on the VGF backend with the standard executor_runner for Linux - -Follow typical [Building ExecuTorch with CMake](using-executorch-building-from-source.md) flow to build the linux target, ensuring that the VGF delegate is enabled. - -```bash --DEXECUTORCH_BUILD_VGF=ON -``` - -A full example buld line is: -``` -cmake bash \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_XNNPACK=OFF \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DEXECUTORCH_BUILD_VGF=ON \ - -DEXECUTORCH_ENABLE_LOGGING=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DPYTHON_EXECUTABLE=python \ - -Bcmake-out . 
-cmake --build cmake-out -j25 --target install --config Release -``` - -You can then invoke the executor runner on the host machine, which will use the VGF delegate, and requires the vulkan layer drivers we installed with setup.sh. - -```bash -./cmake-out/executor_runner -model_path add_arm_delegate_vgf.pte -``` - - -## Takeaways -In this tutorial you have learnt how to use the ExecuTorch software to both export a standard model from PyTorch and to run it on the compact and fully functioned ExecuTorch runtime, enabling a smooth path for offloading models from PyTorch to Arm based platforms. - -To recap, there are two major flows: - * A direct flow which offloads work onto the Cortex-M using libraries built into ExecuTorch. - * A delegated flow which partitions the graph into sections for Cortex-M and sections which can be offloaded and accelerated on the Ethos-U hardware. - -Both of these flows continue to evolve, enabling more use-cases and better performance. - -## FAQs - - -If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/docs/source/tutorial-template.md b/docs/source/tutorial-template.md index b25731afa17..73b787c9e2c 100644 --- a/docs/source/tutorial-template.md +++ b/docs/source/tutorial-template.md @@ -9,12 +9,12 @@ :::{grid-item-card} Tutorials we recommend you complete before this: :class-card: card-prerequisites * [Introduction to ExecuTorch](intro-how-it-works.md) -* [Setting up ExecuTorch](getting-started-setup.md) -* [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md) +* [Setting up ExecuTorch](getting-started-setup.rst) +* [Building ExecuTorch with CMake](using-executorch-building-from-source.md) ::: :::: -## Prerequsites (Hardware and Software) +## Prerequisites (Hardware and Software) Provide instructions on what kind of hardware and software are pre-requisite for the tutorial. 
diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index bccd4e4add3..3fb079f24d6 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -11,7 +11,7 @@ In this tutorial, you will learn how to export an XNNPACK lowered Model and run :::{grid-item-card} Before you begin it is recommended you go through the following: :class-card: card-prerequisites * [Setting up ExecuTorch](getting-started-setup.rst) -* [Model Lowering Tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) +* [Model Lowering Tutorial](tutorials/export-to-executorch-tutorial) * [ExecuTorch XNNPACK Delegate](backends-xnnpack.md) ::: :::: @@ -74,7 +74,7 @@ After lowering to the XNNPACK Program, we can then prepare it for executorch and ## Lowering a Quantized Model to XNNPACK -The XNNPACK delegate can also execute symmetrically quantized models. To understand the quantization flow and learn how to quantize models, refer to [Custom Quantization](quantization-custom-quantization.md) note. For the sake of this tutorial, we will leverage the `quantize()` python helper function conveniently added to the `executorch/executorch/examples` folder. +The XNNPACK delegate can also execute symmetrically quantized models. To understand the quantization flow and learn how to quantize models, refer to [Quantization Overview](quantization-overview.md). For the sake of this tutorial, we will leverage the `quantize()` python helper function conveniently added to the `executorch/executorch/examples` folder. ```python from torch.export import export diff --git a/docs/source/usage.md b/docs/source/usage.md new file mode 100644 index 00000000000..6ffc136093b --- /dev/null +++ b/docs/source/usage.md @@ -0,0 +1,19 @@ +# Usage + +This section describes how to use ExecuTorch. 
It covers everything from +getting started to platform-specific implementations, runtime integration, +troubleshooting, and frequently asked questions. + +```{toctree} +:maxdepth: 1 + +getting-started +using-executorch-export +using-executorch-android +using-executorch-ios +using-executorch-cpp +using-executorch-runtime-integration +using-executorch-troubleshooting +using-executorch-building-from-source +using-executorch-faqs +``` diff --git a/docs/source/using-executorch-android.md b/docs/source/using-executorch-android.md index 23513302063..ce9977218a1 100644 --- a/docs/source/using-executorch-android.md +++ b/docs/source/using-executorch-android.md @@ -72,7 +72,7 @@ curl -O https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-20250 curl -O https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-20250412/executorch.aar.sha256sums ``` -We aim to make every daily snapshot available and useable. However, for best stability, please use releases, not snapshots. +We aim to make every daily snapshot available and usable. However, for best stability, please use releases, not snapshots. ## Using AAR file @@ -83,12 +83,12 @@ To add the AAR file to your app: An AAR file itself does not contain dependency info, unlike the Maven one which bundled with pom.xml. The Java package requires `fbjni` and `soloader`, and currently requires users to explicitly declare the dependency. 
Therefore, two more `dependencies` in gradle rule is required: ``` implementation("com.facebook.soloader:soloader:0.10.5") -implementation("com.facebook.fbjni:fbjni:0.5.1") +implementation("com.facebook.fbjni:fbjni:0.7.0") ``` ### Example usage -In your app working directory, such as executorch/examples/demo-apps/android/LlamaDemo, +In your app working directory, such as executorch-examples/llm/android/LlamaDemo, ``` mkdir -p app/libs curl https://ossci-android.s3.amazonaws.com/executorch/release/${executorch_version}/executorch.aar -o app/libs/executorch.aar @@ -100,7 +100,7 @@ And include it in gradle: dependencies { implementation(files("libs/executorch.aar")) implementation("com.facebook.soloader:soloader:0.10.5") - implementation("com.facebook.fbjni:fbjni:0.5.1") + implementation("com.facebook.fbjni:fbjni:0.7.0") } ``` @@ -112,7 +112,7 @@ Now you can compile your app with the ExecuTorch Android library. You need Android [SDK](https://developer.android.com/studio) and [NDK](https://developer.android.com/ndk/downloads) to use it. -Current NDK version used in ExecuTorch CI: r27b. +Current NDK version used in ExecuTorch CI: r28c. You need to set `ANDROID_HOME` to Android SDK home and `ANDROID_NDK` to the correct NDK root (containing NOTICE file). @@ -202,7 +202,7 @@ adb push extension/module/test/resources/add.pte /data/local/tmp/ This example loads an ExecuTorch module, prepares input data, runs inference, and processes the output data. Please use [DeepLabV3AndroidDemo](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) -and [LlamaDemo](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/android/LlamaDemo) for the code examples +and [LlamaDemo](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android/LlamaDemo) for the code examples using ExecuTorch AAR package. 
## Java API reference diff --git a/docs/source/using-executorch-cpp.md b/docs/source/using-executorch-cpp.md index 3736226bc06..5505ade9573 100644 --- a/docs/source/using-executorch-cpp.md +++ b/docs/source/using-executorch-cpp.md @@ -69,7 +69,7 @@ The runner source code can be found in the ExecuTorch repo under [examples/porta ## Next Steps -- [Runtime API Reference](executorch-runtime-api-reference.md) for documentation on the available C++ runtime APIs. +- [Runtime API Reference](executorch-runtime-api-reference.rst) for documentation on the available C++ runtime APIs. - [Running an ExecuTorch Model Using the Module Extension in C++](extension-module.md) for information on the high-level Module API. - [Managing Tensor Memory in C++](extension-tensor.md) for information on high-level tensor APIs. - [Running an ExecuTorch Model in C++ Tutorial](running-a-model-cpp-tutorial.md) for information on the low-level runtime APIs. diff --git a/docs/source/using-executorch-export.md b/docs/source/using-executorch-export.md index 2a887bb346d..7abf5cbd30a 100644 --- a/docs/source/using-executorch-export.md +++ b/docs/source/using-executorch-export.md @@ -24,7 +24,7 @@ Quantization - the process of using reduced precision to reduce inference time a ExecuTorch backends provide hardware acceleration for a specific hardware target. In order to achieve maximum performance on target hardware, ExecuTorch optimizes the model for a specific backend during the export and lowering process. This means that the resulting .pte file is specialized for the specific hardware. In order to deploy to multiple backends, such as Core ML on iOS and Arm CPU on Android, it is common to generate a dedicated .pte file for each. -The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requires and level of model support. See the documentation for each hardware backend for more details. 
+The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requirements and level of model support. See the documentation for each hardware backend for more details. As part of the .pte file creation process, ExecuTorch identifies portions of the model (partitions) that are supported for the given backend. These sections are processed by the backend ahead of time to support efficient execution. Portions of the model that are not supported on the delegate, if any, are executed using the portable fallback implementation on CPU. This allows for partial model acceleration when not all model operators are supported on the backend, but may have negative performance implications. In addition, multiple partitioners can be specified in order of priority. This allows for operators not supported on GPU to run on CPU via XNNPACK, for example. @@ -32,7 +32,7 @@ As part of the .pte file creation process, ExecuTorch identifies portions of the Commonly used hardware backends are listed below. For mobile, consider using XNNPACK for Android and XNNPACK or Core ML for iOS. To create a .pte file for a specific backend, pass the appropriate partitioner class to `to_edge_transform_and_lower`. See the appropriate backend documentation and the [Export and Lowering](#export-and-lowering) section below for more information. 
-- [XNNPACK (Mobile CPU)](backends-xnnpack.md) +- [XNNPACK (CPU)](backends-xnnpack.md) - [Core ML (iOS)](backends-coreml.md) - [Metal Performance Shaders (iOS GPU)](backends-mps.md) - [Vulkan (Android GPU)](backends-vulkan.md) @@ -141,7 +141,6 @@ delegate_external_constants_pass_unlifted( exported_program = export(tagged_module, inputs, dynamic_shapes=dynamic_shapes) executorch_program = to_edge_transform_and_lower( exported_program, - transform_passes = [partial_function], partitioner = [XnnpackPartitioner()] ).to_executorch() ``` @@ -184,6 +183,7 @@ For more complex use cases, dynamic shape specification allows for mathematical Before integrating the runtime code, it is common to test the exported model from Python. This can be used to evaluate model accuracy and sanity check behavior before moving to the target device. Note that not all hardware backends are available from Python, as they may require specialized hardware to function. See the specific backend documentation for more information on hardware requirements and the availablilty of simulators. The XNNPACK delegate used in this example is always available on host machines. ```python +import torch from executorch.runtime import Runtime runtime = Runtime.get() @@ -194,9 +194,19 @@ method = program.load_method("forward") outputs = method.execute([input_tensor]) ``` -Pybindings currently does not support loading program and data. To run a model with PTE and PTD components, please use the [Extension Module](extension-module.md). There is also an E2E demo in [executorch-examples](https://github.com/meta-pytorch/executorch-examples/tree/main/program-data-separation). +To run a model with program and data separated, please use the [ExecuTorch Module pybindings](https://github.com/pytorch/executorch/blob/main/extension/pybindings/README.md). 
+```python +import torch +from executorch.extension.pybindings import portable_lib + +input_tensor = torch.randn(1, 3, 32, 32) +module = portable_lib._load_for_executorch("model.pte", "model.ptd") +outputs = module.forward([input_tensor]) +``` + +There is also an E2E demo in [executorch-examples](https://github.com/meta-pytorch/executorch-examples/tree/main/program-data-separation). -For more information, see [Runtime API Reference](executorch-runtime-api-reference.md). +For more information, see [Runtime API Reference](executorch-runtime-api-reference.rst). ## Advanced Topics @@ -270,7 +280,7 @@ decode_ep = torch.export.export(DecodeWrapper(model), ...) ## Next Steps -The PyTorch and ExecuTorch export and lowering APIs provide a high level of customizability to meet the needs of diverse hardware and models. See [torch.export](https://pytorch.org/docs/main/export.html) and [Export API Reference](export-to-executorch-api-reference.md) for more information. +The PyTorch and ExecuTorch export and lowering APIs provide a high level of customizability to meet the needs of diverse hardware and models. See [torch.export](https://pytorch.org/docs/main/export.html) and [Export API Reference](export-to-executorch-api-reference.rst) for more information. For advanced use cases, see the following: - [Quantization Overview](quantization-overview.md) for information on quantizing models to reduce inference time and memory footprint. diff --git a/docs/source/using-executorch-faqs.md b/docs/source/using-executorch-faqs.md index d1bd0390569..c147403c9e8 100644 --- a/docs/source/using-executorch-faqs.md +++ b/docs/source/using-executorch-faqs.md @@ -16,7 +16,7 @@ if you are using Ubuntu, or use an equivalent install command. ### ModuleNotFoundError: No module named 'pytorch_tokenizers' -The `pytorch_tokenizers` package is required for LLM export functionality. Install it from the ExecutorTorch source code: +The `pytorch_tokenizers` package is required for LLM export functionality. 
Install it from the ExecuTorch source code: ``` pip install -e ./extension/llm/tokenizers/ ``` @@ -48,7 +48,7 @@ Thread count can be set with the following function. Ensure this is done prior t ::executorch::extension::threadpool::get_threadpool()->_unsafe_reset_threadpool(num_threads); ``` -For a deeper investgiation into model performance, ExecuTorch supports operator-level performance profiling. See [Using the ExecuTorch Developer Tools to Profile a Model](devtools-integration-tutorial.md) for more information. +For a deeper investigation into model performance, ExecuTorch supports operator-level performance profiling. See [Using the ExecuTorch Developer Tools to Profile a Model](devtools-integration-tutorial.md) for more information. ### Missing Logs diff --git a/docs/source/using-executorch-runtime-integration.md b/docs/source/using-executorch-runtime-integration.md index 550cb3eb71a..36bc4f6b2fe 100644 --- a/docs/source/using-executorch-runtime-integration.md +++ b/docs/source/using-executorch-runtime-integration.md @@ -64,7 +64,7 @@ namespace { ``` ### Weak Symbol Override -ExecuTorch also provides a link-time method to override the PAL using weak symbols. This method is primarily maintained for backwards compatability. +ExecuTorch also provides a link-time method to override the PAL using weak symbols. This method is primarily maintained for backwards compatibility. To override one or more PAL methods, take the following steps: diff --git a/docs/source/using-executorch-troubleshooting.md b/docs/source/using-executorch-troubleshooting.md index 56c2e1a0653..75648dc5b46 100644 --- a/docs/source/using-executorch-troubleshooting.md +++ b/docs/source/using-executorch-troubleshooting.md @@ -1,11 +1,11 @@ # Profiling and Debugging -To faciliate model and runtime integration, ExecuTorch provides tools to profile model resource utilization, numerics, and more. 
This section describes the available troubleshooting tools and steps to resolve issues when integrating ExecuTorch. +To facilitate model and runtime integration, ExecuTorch provides tools to profile model resource utilization, numerics, and more. This section describes the available troubleshooting tools and steps to resolve issues when integrating ExecuTorch. ## General Troubleshooting Steps - To troubleshoot failure of runtime API calls, such as loading or running a model, ensure that ExecuTorch framework logging is enabled. See [Logging](using-executorch-runtime-integration.md#logging) for more information. -- As a prelimatinary step to troubleshoot slow run times, ensure that performance testing is being done in a release build, and that the model is delegated. See [Inference is Slow](using-executorch-faqs.md#inference-is-slow--performance-troubleshooting) for more information. +- As a preliminary step to troubleshoot slow run times, ensure that performance testing is being done in a release build, and that the model is delegated. See [Inference is Slow](using-executorch-faqs.md#inference-is-slow--performance-troubleshooting) for more information. - Check [Frequently Asked Questions](using-executorch-faqs.md) for common issues and questions encountered during install, model export, and runtime integration. ## Developer Tools @@ -16,5 +16,5 @@ The ExecuTorch developer tools, or devtools, are a collection of tooling for tro - [Frequently Asked Questions](using-executorch-faqs.md) for solutions to commonly encountered questions and issues. - [Introduction to the ExecuTorch Developer Tools](runtime-profiling.md) for a high-level introduction to available developer tooling. -- [Using the ExecuTorch Developer Tools to Profile a Model](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial) for information on runtime performance profiling. 
+- [Using the ExecuTorch Developer Tools to Profile a Model](tutorials/devtools-integration-tutorial) for information on runtime performance profiling. - [Inspector APIs](runtime-profiling.md) for reference material on trace inspector APIs. diff --git a/examples/apple/coreml/llama/export.py b/examples/apple/coreml/llama/export.py index 48edc3c0669..af2fa3c74ee 100644 --- a/examples/apple/coreml/llama/export.py +++ b/examples/apple/coreml/llama/export.py @@ -23,7 +23,6 @@ from executorch.exir.backend.utils import format_delegated_graph from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.exir.passes import MemoryPlanningPass -from executorch.exir.passes.quant_fusion_pass import QuantFusionPass from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.extension.export_util.utils import save_pte_program @@ -211,9 +210,7 @@ def main() -> None: executorch_program = edge_manager.to_executorch( ExecutorchBackendConfig( extract_delegate_segments=True, - passes=[ - QuantFusionPass(), - ], + do_quant_fusion_and_const_prop=True, memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(), ) diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 106ab35363c..34ed7e3f1bd 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -9,7 +9,6 @@ import argparse import copy -import json import logging import os @@ -19,25 +18,24 @@ import torch from examples.devtools.scripts.export_bundled_program import save_bundled_program from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec -from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner -from executorch.backends.arm.quantizer import ( - EthosUQuantizer, - get_symmetric_quantization_config, - TOSAQuantizer, - VgfQuantizer, -) +from executorch.backends.arm.ethosu import EthosUCompileSpec +from 
executorch.backends.arm.quantizer import get_symmetric_quantization_config from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec -from executorch.backends.arm.tosa.partitioner import TOSAPartitioner +from executorch.backends.arm.util._factory import create_partitioner, create_quantizer from executorch.backends.arm.util.arm_model_evaluator import ( - GenericModelEvaluator, - MobileNetV2Evaluator, + evaluate_model, + evaluator_calibration_data, ) -from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner +from executorch.backends.arm.vgf import VgfCompileSpec # To use Cortex-M backend +from executorch.backends.cortex_m.passes.quantized_linear_fusion_pass import ( + QuantizedLinearFusionPass, +) + from executorch.backends.cortex_m.passes.quantized_op_fusion_pass import ( QuantizedOpFusionPass, ) @@ -55,8 +53,11 @@ ExecutorchBackendConfig, to_edge_transform_and_lower, ) + from executorch.extension.export_util.utils import save_pte_program from tabulate import tabulate +from torch.export import ExportedProgram +from torch.fx import GraphModule from torch.utils.data import DataLoader # Quantize model if required using the standard export quantizaion flow. 
@@ -141,25 +142,19 @@ def get_model_and_inputs_from_name( def quantize( - model: torch.nn.Module, + model: GraphModule, model_name: str, compile_specs: EthosUCompileSpec | VgfCompileSpec | TosaCompileSpec, example_inputs: Tuple[torch.Tensor], evaluator_name: str | None, evaluator_config: Dict[str, Any] | None, -) -> torch.nn.Module: - """This is the official recommended flow for quantization in pytorch 2.0 export""" +) -> GraphModule: + """This is the official recommended flow for quantization in pytorch 2.0 + export""" logging.info("Quantizing Model...") logging.debug(f"Original model: {model}") - quantizer = None - if isinstance(compile_specs, EthosUCompileSpec): - quantizer = EthosUQuantizer(compile_specs) - elif isinstance(compile_specs, TosaCompileSpec): - quantizer = TOSAQuantizer(compile_specs) - elif isinstance(compile_specs, VgfCompileSpec): - quantizer = VgfQuantizer(compile_specs) - else: - raise RuntimeError("Unsupported compilespecs for quantization!") + + quantizer = create_quantizer(compile_specs) operator_config = get_symmetric_quantization_config() quantizer.set_global(operator_config) @@ -182,46 +177,6 @@ def quantize( return m -# Simple example models -class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x + x - - example_input = (torch.ones(5, dtype=torch.int32),) - can_delegate = True - - -class AddModule2(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return x + y - - example_input = ( - torch.ones(5, dtype=torch.int32), - torch.ones(5, dtype=torch.int32), - ) - can_delegate = True - - -class AddModule3(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return (x + y, x + x) - - example_input = ( - torch.ones(5, dtype=torch.int32), - torch.ones(5, dtype=torch.int32), - ) - can_delegate = True - - class QuantAddTest(torch.nn.Module): def __init__(self): super().__init__() @@ -270,48 +225,29 @@ def 
forward(self, w, x, y, z): can_delegate = True # when quantized -class SoftmaxModule(torch.nn.Module): +class QuantLinearTest(torch.nn.Module): def __init__(self): super().__init__() - self.softmax = torch.nn.Softmax(dim=0) + # Define a simple linear layer + self.linear = torch.nn.Linear(61, 37) def forward(self, x): - z = self.softmax(x) - return z + return self.linear(x) - example_input = (torch.ones(2, 2),) - can_delegate = True - - -class MultipleOutputsModule(torch.nn.Module): - def forward(self, x: torch.Tensor, y: torch.Tensor): - return (x * y, x.sum(dim=-1, keepdim=True)) - - example_input = (torch.randn(10, 4, 5), torch.randn(10, 4, 5)) + example_input = (torch.randn([8, 61], dtype=torch.float32),) can_delegate = True models = { - "add": AddModule, - "add2": AddModule2, - "add3": AddModule3, "qadd": QuantAddTest, "qadd2": QuantAddTest2, "qops": QuantOpTest, - "softmax": SoftmaxModule, - "MultipleOutputsModule": MultipleOutputsModule, + # TODO: Remove this from here, once we have dedicated MCU test pipeline ready. This is an interim solution. 
+ # See https://github.com/pytorch/executorch/discussions/13944 + "qlinear": QuantLinearTest, } calibration_data = { - "add": (torch.randn(1, 5),), - "add2": ( - torch.randn(1, 5), - torch.randn(1, 5), - ), - "add3": ( - torch.randn(32, 5), - torch.randn(32, 5), - ), "qadd": (torch.randn(32, 2, 1),), "qadd2": ( torch.randn(32, 2, 1), @@ -323,12 +259,6 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): torch.randn(32, 2, 1) * -0.000001, torch.randn(32, 2, 1) * 1000, ), - "softmax": (torch.randn(32, 2, 2),), -} - -evaluators = { - "generic": GenericModelEvaluator, - "mv2": MobileNetV2Evaluator, } targets = [ @@ -355,21 +285,9 @@ def get_calibration_data( ): # Firstly, if the model is being evaluated, take the evaluators calibration function if it has one if evaluator_name is not None: - evaluator = evaluators[evaluator_name] - - if hasattr(evaluator, "get_calibrator"): - assert evaluator_config is not None - - config_path = Path(evaluator_config) - with config_path.open() as f: - config = json.load(f) - - if evaluator_name == "mv2": - return evaluator.get_calibrator( - training_dataset_path=config["training_dataset_path"] - ) - else: - raise RuntimeError(f"Unknown evaluator: {evaluator_name}") + evaluator_data = evaluator_calibration_data(evaluator_name, evaluator_config) + if evaluator_data is not None: + return evaluator_data # If the model is in the calibration_data dictionary, get the data from there # This is used for the simple model examples provided @@ -397,11 +315,14 @@ def get_compile_spec( tosa_spec = TosaSpecification.create_from_string("TOSA-1.0+INT") compile_spec = TosaCompileSpec(tosa_spec) elif "ethos-u" in target: + extra_flags = ["--verbose-operators", "--verbose-cycle-estimate"] + if debug_mode is not None: + extra_flags.append("--enable-debug-db") compile_spec = EthosUCompileSpec( target, system_config=system_config, memory_mode=memory_mode, - extra_flags=["--verbose-operators", "--verbose-cycle-estimate"], + extra_flags=extra_flags, 
config_ini=config, ) elif "vgf" in target: @@ -423,52 +344,6 @@ def get_compile_spec( return compile_spec -def evaluate_model( - model_name: str, - intermediates: str, - model_fp32: torch.nn.Module, - model_int8: torch.nn.Module, - example_inputs: Tuple[torch.Tensor], - evaluator_name: str, - evaluator_config: str | None, -) -> None: - evaluator = evaluators[evaluator_name] - - # Get the path of the TOSA flatbuffer that is dumped - intermediates_path = Path(intermediates) - tosa_paths = list(intermediates_path.glob("*.tosa")) - - if evaluator.REQUIRES_CONFIG: - assert evaluator_config is not None - - config_path = Path(evaluator_config) - with config_path.open() as f: - config = json.load(f) - - if evaluator_name == "mv2": - init_evaluator = evaluator( - model_name, - model_fp32, - model_int8, - example_inputs, - str(tosa_paths[0]), - config["batch_size"], - config["validation_dataset_path"], - ) - else: - raise RuntimeError(f"Unknown evaluator {evaluator_name}") - else: - init_evaluator = evaluator( - model_name, model_fp32, model_int8, example_inputs, str(tosa_paths[0]) - ) - - quant_metrics = init_evaluator.evaluate() - output_json_path = intermediates_path / "quant_metrics.json" - - with output_json_path.open("w") as json_file: - json.dump(quant_metrics, json_file) - - def dump_delegation_info(edge, intermediate_files_folder: Optional[str] = None): graph_module = edge.exported_program().graph_module delegation_info = get_delegation_info(graph_module) @@ -535,7 +410,7 @@ def get_args(): required=False, nargs="?", const="generic", - choices=["generic", "mv2"], + choices=["generic", "mv2", "deit_tiny"], help="Flag for running evaluation of the model.", ) parser.add_argument( @@ -593,7 +468,7 @@ def get_args(): "--config", required=False, default="Arm/vela.ini", - help="Specify custom vela configuration file (vela.ini)", + help="Specify custom vela configuration file (vela.ini) for Ethos-U targets.", ) parser.add_argument( "--non_strict_export", @@ -605,13 +480,13 
@@ def get_args(): parser.add_argument( "--enable_qdq_fusion_pass", action="store_true", - help="Enable the QuantizedOpFusionPass fusion step", + help="Enable the Quantized qdq fusion Op passes", ) parser.add_argument( "--enable_debug_mode", required=False, choices=["json", "tosa"], - help="Flag to enable ATen-to-TOSA debug mode.", + help="Flag to enable ATen-to-TOSA debug mode and dumping of Vela's debug database.", ) args = parser.parse_args() @@ -718,7 +593,12 @@ def save_bpte_program(exec_prog, original_model: torch.nn.Module, output_name: s save_bundled_program(exec_prog, method_test_suites, output_name) -def quantize_model(args, model: torch.nn.Module, example_inputs, compile_spec): +def quantize_model( + args, + model: GraphModule, + example_inputs: Tuple[torch.Tensor], + compile_spec, +) -> Tuple[GraphModule, ExportedProgram]: model_int8 = quantize( model, args.model_name, @@ -736,7 +616,10 @@ def quantize_model(args, model: torch.nn.Module, example_inputs, compile_spec): def to_edge_TOSA_delegate( - exported_program, args, model: torch.nn.Module, example_inputs + exported_program: ExportedProgram, + args, + model: GraphModule, + example_inputs: Tuple[torch.Tensor], ): # As we can target multiple output encodings, one must # be specified. 
@@ -755,16 +638,8 @@ def to_edge_TOSA_delegate( model_int8, exported_program = quantize_model( args, model, example_inputs, compile_spec ) - model = model_int8 - - if isinstance(compile_spec, EthosUCompileSpec): - partitioner = EthosUPartitioner(compile_spec) - elif isinstance(compile_spec, TosaCompileSpec): - partitioner = TOSAPartitioner(compile_spec) - elif isinstance(compile_spec, VgfCompileSpec): - partitioner = VgfPartitioner(compile_spec) - else: - raise RuntimeError(f"Unhandled compile spec: {compile_spec}") + + partitioner = create_partitioner(compile_spec) edge = to_edge_transform_and_lower( exported_program, @@ -777,7 +652,12 @@ def to_edge_TOSA_delegate( return model_int8, edge -def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_inputs): +def to_edge_no_delegate( + exported_program: ExportedProgram, + args, + model: GraphModule, + example_inputs: Tuple[torch.Tensor], +): model_int8 = None if args.quantize: # As we can target multiple output encodings, one must @@ -806,22 +686,24 @@ def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_ return model_int8, edge -def transform_for_cortex_m_backend(edge, args): +def transform_for_cortex_m_backend(edge_program_manager, args): # Let's make sure we are using optimized Cortex M backend # NB: If we can't find and replace ops those are expected to be replaced, # bad things will happen at runtime, like "missing operator" errors! 
# Instantiate the mandatory ReplaceQuantNodesPass - passes = [ReplaceQuantNodesPass()] - - # Conditionally add the QuantizedOpFusionPass + passes = [ReplaceQuantNodesPass] if args.enable_qdq_fusion_pass: - passes.append(QuantizedOpFusionPass()) - - # Apply the passes - edge = edge.transform(passes) - - return edge + passes += [QuantizedLinearFusionPass, QuantizedOpFusionPass] + current_edge = edge_program_manager + for pass_cls in passes: + transform_pass = ( + pass_cls(current_edge.exported_program()) + if pass_cls.__name__ == "QuantizedLinearFusionPass" + else pass_cls() + ) + current_edge = current_edge.transform([transform_pass]) + return current_edge if __name__ == "__main__": # noqa: C901 diff --git a/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch b/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch new file mode 100644 index 00000000000..11590a8578f --- /dev/null +++ b/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch @@ -0,0 +1,25 @@ +From f6a7d867212336b3e344c21240a2a03671bffd65 Mon Sep 17 00:00:00 2001 +From: Per Held +Date: Wed, 17 Sep 2025 13:46:05 +0200 +Subject: Remove hello_world from applications + +--- + applications/CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt +index a017575..130f0f7 100644 +--- a/applications/CMakeLists.txt ++++ b/applications/CMakeLists.txt +@@ -21,7 +21,7 @@ add_subdirectory(driver_unit_tests) + + add_subdirectory(freertos) + +-add_subdirectory(hello_world) ++#add_subdirectory(hello_world) + + add_subdirectory(threadx_demo) + +-- +2.43.0 + diff --git a/examples/arm/ethos_u_minimal_example.ipynb b/examples/arm/ethos_u_minimal_example.ipynb index dc8ea7193aa..6637800e456 100644 --- a/examples/arm/ethos_u_minimal_example.ipynb +++ b/examples/arm/ethos_u_minimal_example.ipynb @@ -58,7 +58,7 @@ "model = Add()\n", 
"model = model.eval()\n", "exported_program = torch.export.export(model, example_inputs)\n", - "graph_module = exported_program.module()\n", + "graph_module = exported_program.graph_module\n", "\n", "_ = graph_module.print_readable()" ] @@ -160,7 +160,7 @@ " config=ExecutorchBackendConfig(extract_delegate_segments=False)\n", " )\n", "\n", - "_ = executorch_program_manager.exported_program().module().print_readable()\n", + "_ = executorch_program_manager.exported_program().graph_module.print_readable()\n", "\n", "# Save pte file\n", "save_pte_program(executorch_program_manager, \"ethos_u_minimal_example.pte\")" diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index 4e4a8eeb409..d5038a1a6b8 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -235,10 +235,10 @@ list( -Map=arm_executor_runner.map ) -# Prefer to generate kernel bindings from model file if possible, which is when -# 1. Not building for semihosting 2. Not building with bundleio If that is not -# the case, fallback to select_ops_list If the model file does not contain any -# aten ops, a workaround is currently needed to avoid crashing. +# Figure out which ops to include: For semihosting build, use +# (user-set)SELECT_OPS_MODEL variable. For normal build, use +# EXECUTORCH_SELECT_OPS_MODEL to include ops automatically. If the pte contains +# no undelegated ops, use neither. 
execute_process( COMMAND python "${ET_DIR_PATH}/codegen/tools/gen_oplist.py" @@ -264,11 +264,6 @@ elseif(${FOUND_OPS_IN_FILE}) message( "gen_oplist: EXECUTORCH_SELECT_OPS_MODEL=${ET_PTE_FILE_PATH} is used to auto generate ops from" ) -elseif(NOT ${FOUND_OPS_IN_FILE} AND ${ET_BUNDLE_IO}) - set(EXECUTORCH_SELECT_OPS_MODEL "") - message( - "gen_oplist: Building with ET_BUNDLE_IO and .bpte is not supported to auto generate ops from will use EXECUTORCH_SELECT_OPS_LIST=${EXECUTORCH_SELECT_OPS_LIST}" - ) else() set(EXECUTORCH_SELECT_OPS_LIST "") set(EXECUTORCH_SELECT_OPS_MODEL "") diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 8f5dec85ad4..91e34b09cbd 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -53,8 +53,8 @@ function help() { echo " --no_delegate Do not delegate the model (can't override builtin models)" echo " --no_quantize Do not quantize the model (can't override builtin models)" echo " --portable_kernels= TO BE DEPRECATED: Alias to select_ops_list." - echo " --select_ops_list= Comma separated list of portable (non delegated) kernels to include Default: ${select_ops_list}" - echo " NOTE: This is used when select_ops_model is not possible to use, e.g. for semihosting or bundleio." + echo " --select_ops_list= Comma separated list of portable (non delegated) kernels to include Default: ${select_ops_list}" + echo " NOTE: This is only used when building for semihosting." echo " See https://docs.pytorch.org/executorch/stable/kernel-library-selective-build.html for more information."
echo " --target= Target to build and run for Default: ${target}" echo " --output= Target build output folder Default: ${output_folder}" @@ -225,7 +225,6 @@ if [[ -z "$model_name" ]]; then test_model=( "softmax" # 0 "add" # 1 - "add3" # 2 "qadd" # 3 "qadd2" # 4 "qops" # 5 @@ -234,7 +233,6 @@ if [[ -z "$model_name" ]]; then model_compiler_flags=( "" # 0 softmax "--delegate" # 1 add - "--delegate" # 2 add3 "--delegate --quantize" # 3 qadd "--delegate --quantize" # 4 qadd2 "--delegate --quantize" # 5 qops diff --git a/examples/arm/run_mcu_models_fvp.sh b/examples/arm/run_mcu_models_fvp.sh index 68d5ec03003..3fa980c506b 100755 --- a/examples/arm/run_mcu_models_fvp.sh +++ b/examples/arm/run_mcu_models_fvp.sh @@ -24,9 +24,9 @@ VALID_TARGETS=( ) # Default models for MCU validation with portable kernels -DEFAULT_MODELS=(mv2 mv3 lstm) +DEFAULT_MODELS=(mv2 mv3 lstm qadd qlinear) # Available models (on FVP) -AVAILABLE_MODELS=(mv2 mv3 lstm) +AVAILABLE_MODELS=(mv2 mv3 lstm qadd qlinear) # Add the following models if you want to enable them later (atm they are not working on FVP) # edsr w2l ic3 ic4 resnet18 resnet50 @@ -257,6 +257,7 @@ for model in "${MODELS[@]}"; do -m "$model" \ --target="$ETHOS_TARGET" \ --quantize \ + --enable_qdq_fusion_pass \ --output="arm_test/$model"; then echo "❌ AOT compilation failed for $model" MODEL_SUCCESS=false diff --git a/examples/arm/vgf_minimal_example.ipynb b/examples/arm/vgf_minimal_example.ipynb index 36004f2c7cd..1f8e0a61601 100644 --- a/examples/arm/vgf_minimal_example.ipynb +++ b/examples/arm/vgf_minimal_example.ipynb @@ -56,8 +56,8 @@ "\n", "model = Add()\n", "model = model.eval()\n", - "exported_program = torch.export.export_for_training(model, example_inputs)\n", - "graph_module = exported_program.module()\n", + "exported_program = torch.export.export(model, example_inputs)\n", + "graph_module = exported_program.graph_module\n", "\n", "_ = graph_module.print_readable()" ] @@ -197,7 +197,7 @@ " 
config=ExecutorchBackendConfig(extract_delegate_segments=False)\n", ")\n", "\n", - "executorch_program_manager.exported_program().module().print_readable()\n", + "executorch_program_manager.exported_program().graph_module.print_readable()\n", "\n", "# Save pte file\n", "cwd_dir = os.getcwd()\n", @@ -240,6 +240,7 @@ " -DCMAKE_BUILD_TYPE=Debug \\\n", " -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \\\n", " -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \\\n", + " -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \\\n", " -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \\\n", " -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \\\n", " -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \\\n", diff --git a/examples/cadence/models/babyllama.py b/examples/cadence/models/babyllama.py index 1b576a1a3eb..f393cd30037 100644 --- a/examples/cadence/models/babyllama.py +++ b/examples/cadence/models/babyllama.py @@ -14,8 +14,10 @@ from executorch.backends.cadence.aot.export_example import export_and_run_model -from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer - +from executorch.examples.models.llama.llama_transformer import ( + construct_transformer, + ModelArgs, +) FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -32,7 +34,7 @@ def main() -> None: ) seq = 64 b = 1 - model = Transformer(args) + model = construct_transformer(args) example_inputs = (torch.randint(0, 10, [b, seq], dtype=torch.int64),) export_and_run_model(model, example_inputs) diff --git a/examples/cuda/scripts/__init__.py b/examples/cuda/scripts/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/cuda/scripts/export.py b/examples/cuda/scripts/export.py new file mode 100644 index 00000000000..c103d7ee50a --- /dev/null +++ b/examples/cuda/scripts/export.py @@ -0,0 +1,116 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Example script for exporting simple models to flatbuffer with CUDA delegate. + +import argparse +import pathlib + +import torch + +from executorch.backends.cuda.cuda_backend import CudaBackend + +from executorch.backends.cuda.cuda_partitioner import CudaPartitioner + +from executorch.examples.models import MODEL_NAME_TO_MODEL +from executorch.examples.models.model_factory import EagerModelFactory + +from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower + +from executorch.extension.export_util.utils import save_pte_program +from torch._inductor.decomposition import conv1d_to_conv2d +from torch.nn.attention import SDPBackend + +# Script to export a model with CUDA delegation. + +_EDGE_COMPILE_CONFIG = EdgeCompileConfig( + _check_ir_validity=False, + _skip_dim_order=True, # TODO(T182928844): enable dim_order in backend +) + + +def is_fbcode(): + return not hasattr(torch.version, "git_version") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + + parser.add_argument( + "-m", + "--model_name", + required=True, + help=f"Provide model name. 
Valid ones: {list(MODEL_NAME_TO_MODEL.keys())}", + ) + parser.add_argument( + "--output_dir", + type=pathlib.Path, + default=pathlib.Path("./"), + help="Output directory for the exported model", + ) + parser.add_argument("--generate_etrecord", action=argparse.BooleanOptionalAction) + parser.add_argument("--save_processed_bytes", action=argparse.BooleanOptionalAction) + + args = parser.parse_args() + return args + + +def save_processed_bytes(processed_bytes, base_name: str): + filename = f"{base_name}.bin" + print(f"Saving processed bytes to {filename}") + with open(filename, "wb") as file: + file.write(processed_bytes) + return + + +def main(): + args = parse_args() + + if args.model_name not in MODEL_NAME_TO_MODEL: + raise RuntimeError( + f"Model {args.model_name} is not a valid name. " + f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}." + ) + + ( + model, + example_args, + example_kwargs, + dynamic_shapes, + ) = EagerModelFactory.create_model(*MODEL_NAME_TO_MODEL[args.model_name]) + model = model.eval() + exported_programs = torch.export.export( + model, + args=example_args, + kwargs=example_kwargs, + dynamic_shapes=dynamic_shapes, + ) + print(exported_programs) + + partitioner = CudaPartitioner( + [CudaBackend.generate_method_name_compile_spec(args.model_name)] + ) + # Add decompositions for triton to generate kernels.
+ exported_programs = exported_programs.run_decompositions( + { + torch.ops.aten.conv1d.default: conv1d_to_conv2d, + } + ) + with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]): + et_prog = to_edge_transform_and_lower( + exported_programs, + partitioner=[partitioner], + compile_config=_EDGE_COMPILE_CONFIG, + generate_etrecord=args.generate_etrecord, + ) + exec_program = et_prog.to_executorch() + save_pte_program(exec_program, args.model_name, args.output_dir) + if args.generate_etrecord: + exec_program.get_etrecord().save(f"{args.model_name}_etrecord.bin") + + +if __name__ == "__main__": + main() diff --git a/examples/demo-apps/android/LlamaDemo/.gitignore b/examples/demo-apps/android/LlamaDemo/.gitignore deleted file mode 100644 index 41853c0472c..00000000000 --- a/examples/demo-apps/android/LlamaDemo/.gitignore +++ /dev/null @@ -1,12 +0,0 @@ -*.iml -.gradle -/local.properties -.idea -.DS_Store -/build -/captures -.externalNativeBuild -.cxx -local.properties -*.so -*.aar diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md deleted file mode 100644 index 9a6b3b020e7..00000000000 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ /dev/null @@ -1,174 +0,0 @@ -# ExecuTorch Llama Android Demo App - -**[UPDATE - 2025-05-15]** We have added support for running Qwen3 0.6B and 4B model. Please see [this tutorial](https://github.com/pytorch/executorch/tree/main/examples/models/qwen3#summary) for export. Loading and running Qwen3 with this app is the same as Llama, as in this doc. - -We’re excited to share that the newly revamped Android demo app is live and includes many new updates to provide a more intuitive and smoother user experience with a chat use case! The primary goal of this app is to showcase how easily ExecuTorch can be integrated into an Android demo app and how to exercise the many features ExecuTorch and Llama models have to offer. 
- -This app serves as a valuable resource to inspire your creativity and provide foundational code that you can customize and adapt for your particular use case. - -Please dive in and start exploring our demo app today! We look forward to any feedback and are excited to see your innovative ideas. - - -## Key Concepts -From this demo app, you will learn many key concepts such as: -* How to prepare Llama models, build the ExecuTorch library, and model inferencing across delegates -* Expose the ExecuTorch library via JNI layer -* Familiarity with current ExecuTorch app-facing capabilities - -The goal is for you to see the type of support ExecuTorch provides and feel comfortable with leveraging it for your use cases. - -## Supporting Models -As a whole, the models that this app supports are (varies by delegate): -* Llama 3.2 Quantized 1B/3B -* Llama 3.2 1B/3B in BF16 -* Llama Guard 3 1B -* Llama 3.1 8B -* Llama 3 8B -* Llama 2 7B -* LLaVA-1.5 vision model (only XNNPACK) -* Qwen 3 0.6B, 1.7B, and 4B - - -## Building the APK -First it’s important to note that currently ExecuTorch provides support across 3 delegates. 
Once you identify the delegate of your choice, select the README link to get a complete end-to-end instructions for environment set-up to exporting the models to build ExecuTorch libraries and apps to run on device: - -| Delegate | Resource | -| ------------- | ------------- | -| XNNPACK (CPU-based library) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md) | -| QNN (Qualcomm AI Accelerators) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md) | -| MediaTek (MediaTek AI Accelerators) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md) | - - -## How to Use the App - -This section will provide the main steps to use the app, along with a code snippet of the ExecuTorch API. - -For loading the app, development, and running on device we recommend Android Studio: -1. Open Android Studio and select "Open an existing Android Studio project" to open examples/demo-apps/android/LlamaDemo. -2. Run the app (^R). This builds and launches the app on the phone. - -### Opening the App - -Below are the UI features for the app. - -Select the settings widget to get started with picking a model, its parameters and any prompts. -

- -

- - - -### Select Models and Parameters - -Once you've selected the model, tokenizer, and model type you are ready to click on "Load Model" to have the app load the model and go back to the main Chat activity. -

- -

- - - -Optional Parameters: -* Temperature: Defaulted to 0, you can adjust the temperature for the model as well. The model will reload upon any adjustments. -* System Prompt: Without any formatting, you can enter in a system prompt. For example, "you are a travel assistant" or "give me a response in a few sentences". -* User Prompt: More for the advanced user, if you would like to manually input a prompt then you can do so by modifying the `{{user prompt}}`. You can also modify the special tokens as well. Once changed then go back to the main Chat activity to send. - -#### ExecuTorch App API - -```java -// Upon returning to the Main Chat Activity -mModule = new LlmModule( - ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()), - modelPath, - tokenizerPath, - temperature); -int loadResult = mModule.load(); -``` - -* `modelCategory`: Indicate whether it’s a text-only or vision model -* `modePath`: path to the .pte file -* `tokenizerPath`: path to the tokenizer file -* `temperature`: model parameter to adjust the randomness of the model’s output - - -### User Prompt -Once model is successfully loaded then enter any prompt and click the send (i.e. generate) button to send it to the model. -

- -

- -You can provide it more follow-up questions as well. -

- -

- -#### ExecuTorch App API - -```java -mModule.generate(prompt,sequence_length, MainActivity.this); -``` -* `prompt`: User formatted prompt -* `sequence_length`: Number of tokens to generate in response to a prompt -* `MainActivity.this`: Indicate that the callback functions (OnResult(), OnStats()) are present in this class. - -[*LLaVA-1.5: Only for XNNPACK delegate*] - -For LLaVA-1.5 implementation, select the exported LLaVA .pte and tokenizer file in the Settings menu and load the model. After this you can send an image from your gallery or take a live picture along with a text prompt to the model. - -

- -

- - -### Output Generated -To show completion of the follow-up question, here is the complete detailed response from the model. -

- -

- -#### ExecuTorch App API - -Ensure you have the following functions in your callback class that you provided in the `mModule.generate()`. For this example, it is `MainActivity.this`. -```java - @Override - public void onResult(String result) { - //...result contains token from response - //.. onResult will continue to be invoked until response is complete - } - - @Override - public void onStats(String stats) { - //... will be a json. See extension/llm/stats.h for the field definitions - } - -``` - -## Instrumentation Test -You can run the instrumentation test for sanity check. The test loads a model pte file and tokenizer.bin file -under `/data/local/tmp/llama`. - -### Model preparation -Go to ExecuTorch root, -```sh -curl -C - -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" --output stories110M.pt -curl -C - -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" --output tokenizer.model -# Create params.json file -touch params.json -echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json -python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json model.dtype_override="fp16" export.output_name=stories110m_h.pte model.use_kv_cache=True -python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin -``` -### Push model -```sh -adb mkdir -p /data/local/tmp/llama -adb push stories110m_h.pte /data/local/tmp/llama -adb push tokenizer.bin /data/local/tmp/llama -``` - -### Run test -Go to `examples/demo-apps/android/LlamaDemo`, -```sh -./gradlew connectedAndroidTest -``` - -## Reporting Issues -If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new), or join our discord [here](https://lnkd.in/gWCM4ViK). 
diff --git a/examples/demo-apps/android/LlamaDemo/SDK-quick-setup-guide.md b/examples/demo-apps/android/LlamaDemo/SDK-quick-setup-guide.md deleted file mode 100644 index 9ae79e96763..00000000000 --- a/examples/demo-apps/android/LlamaDemo/SDK-quick-setup-guide.md +++ /dev/null @@ -1,94 +0,0 @@ -# Guide to set up Java/SDK/NDK for Android - -Follow this doc if you haven't set up Java/SDK/NDK for Android development -already. -This doc provides a CLI tutorial to set them up. Otherwise, you can do the same -thing with Android Studio GUI. - -## Set up Java 17 -1. Download the archive from Oracle website. -Make sure you have read and agree with the terms and conditions from the website before downloading. -```bash -export DEV_HOME= -cd $DEV_HOME -``` -Linux: -```bash -curl https://download.oracle.com/java/17/archive/jdk-17.0.10_linux-x64_bin.tar.gz -o jdk-17.0.10.tar.gz -``` -macOS: -```bash -curl https://download.oracle.com/java/17/archive/jdk-17.0.10_macos-aarch64_bin.tar.gz -o jdk-17.0.10.tar.gz -``` -2. Unzip the archive. The directory named `jdk-17.0.10` is the Java root directory. -```bash -tar xf jdk-17.0.10.tar.gz -``` -3. Set `JAVA_HOME` and update `PATH`. - -Linux: -```bash -export JAVA_HOME="$DEV_HOME"/jdk-17.0.10 -export PATH="$JAVA_HOME/bin:$PATH" -``` -macOS: -```bash -export JAVA_HOME="$DEV_HOME"/jdk-17.0.10.jdk/Contents/Home -export PATH="$JAVA_HOME/bin:$PATH" -``` - -Note: Oracle has tutorials for installing Java on -[Linux](https://docs.oracle.com/en/java/javase/17/install/installation-jdk-linux-platforms.html#GUID-4A6BD592-1840-4BB4-A758-4CD49E9EE88B) -and [macOS](https://docs.oracle.com/en/java/javase/17/install/installation-jdk-macos.html#GUID-E8A251B6-D9A9-4276-ABC8-CC0DAD62EA33). -Some Linux distributions has JDK package in package manager. For example, Debian users can install -openjdk-17-jdk package. 
- -## Set up Android SDK/NDK -Android has a command line tool [sdkmanager](https://developer.android.com/tools/sdkmanager) which -helps users managing SDK and other tools related to Android development. - -1. Go to https://developer.android.com/studio and download the archive from "Command line tools -only" section. Make sure you have read and agree with the terms and conditions from the website. - -Linux: -```bash -curl https://dl.google.com/android/repository/commandlinetools-linux-11076708_latest.zip -o commandlinetools.zip -``` -macOS: -```bash -curl https://dl.google.com/android/repository/commandlinetools-mac-11076708_latest.zip -o commandlinetools.zip -``` -2. Unzip. -```bash -unzip commandlinetools.zip -``` -3. Specify a root for Android SDK. For example, we can put it under `$DEV_HOME/sdk`. - -``` -mkdir -p $DEV_HOME/sdk -export ANDROID_HOME="$(realpath $DEV_HOME/sdk)" -# Install SDK 34 -./cmdline-tools/bin/sdkmanager --sdk_root="${ANDROID_HOME}" --install "platforms;android-34" -# Install NDK -./cmdline-tools/bin/sdkmanager --sdk_root="${ANDROID_HOME}" --install "ndk;26.3.11579264" -# The NDK root is then under `ndk/`. -export ANDROID_NDK="$ANDROID_HOME/ndk/26.3.11579264" -``` - -### (Optional) Android Studio Setup -If you want to use Android Studio and never set up Java/SDK/NDK before, or if -you use the newly installed ones, follow these steps to set Android Studio to use -them. - -Copy these output paths to be used by Android Studio -```bash -echo $ANDROID_HOME -echo $ANDROID_NDK -echo $JAVA_HOME -``` - -Open a project in Android Studio. In Project Structure (File -> Project -Structure, or `⌘;`) -> SDK Location, -* Set Android SDK Location to the path of $ANDROID_HOME -* Set Android NDK Location to the path of $ANDROID_NDK -* Set JDK location (Click Gradle Settings link) -> Gradle JDK -> Add JDK... 
to the path of $JAVA_HOME diff --git a/examples/demo-apps/android/LlamaDemo/app/.gitignore b/examples/demo-apps/android/LlamaDemo/app/.gitignore deleted file mode 100644 index 796b96d1c40..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/build diff --git a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts deleted file mode 100644 index 19cfda847db..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -plugins { - id("com.android.application") - id("org.jetbrains.kotlin.android") -} - -val qnnVersion: String? = project.findProperty("qnnVersion") as? String - -android { - namespace = "com.example.executorchllamademo" - compileSdk = 34 - - defaultConfig { - applicationId = "com.example.executorchllamademo" - minSdk = 28 - targetSdk = 33 - versionCode = 1 - versionName = "1.0" - - testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" - vectorDrawables { useSupportLibrary = true } - externalNativeBuild { cmake { cppFlags += "" } } - } - - buildTypes { - release { - isMinifyEnabled = false - proguardFiles(getDefaultProguardFile("proguard-android-optimize.txt"), "proguard-rules.pro") - } - } - compileOptions { - sourceCompatibility = JavaVersion.VERSION_1_8 - targetCompatibility = JavaVersion.VERSION_1_8 - } - kotlinOptions { jvmTarget = "1.8" } - buildFeatures { compose = true } - composeOptions { kotlinCompilerExtensionVersion = "1.4.3" } - packaging { resources { excludes += "/META-INF/{AL2.0,LGPL2.1}" } } -} - -dependencies { - implementation("androidx.core:core-ktx:1.9.0") - implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.1") - 
implementation("androidx.activity:activity-compose:1.7.0") - implementation(platform("androidx.compose:compose-bom:2023.03.00")) - implementation("androidx.compose.ui:ui") - implementation("androidx.compose.ui:ui-graphics") - implementation("androidx.compose.ui:ui-tooling-preview") - implementation("androidx.compose.material3:material3") - implementation("androidx.appcompat:appcompat:1.6.1") - implementation("androidx.camera:camera-core:1.3.0-rc02") - implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") - implementation("com.facebook.fbjni:fbjni:0.5.1") - implementation("com.google.code.gson:gson:2.8.6") - implementation(files("libs/executorch.aar")) - implementation("com.google.android.material:material:1.12.0") - implementation("androidx.activity:activity:1.9.0") - implementation("org.json:json:20250107") - if (!qnnVersion.isNullOrEmpty()) { - implementation("com.qualcomm.qti:qnn-runtime:$qnnVersion") - } - testImplementation("junit:junit:4.13.2") - androidTestImplementation("androidx.test.ext:junit:1.1.5") - androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") - androidTestImplementation(platform("androidx.compose:compose-bom:2023.03.00")) - androidTestImplementation("androidx.compose.ui:ui-test-junit4") - debugImplementation("androidx.compose.ui:ui-tooling") - debugImplementation("androidx.compose.ui:ui-test-manifest") -} - -tasks.register("setup") { - doFirst { - exec { - commandLine("sh", "examples/demo-apps/android/LlamaDemo/setup.sh") - workingDir("../../../../../") - } - } -} - -tasks.register("setupQnn") { - doFirst { - exec { - commandLine("sh", "examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh") - workingDir("../../../../../") - } - } -} - -tasks.register("download_prebuilt_lib") { - doFirst { - exec { - commandLine("sh", "examples/demo-apps/android/LlamaDemo/download_prebuilt_lib.sh") - workingDir("../../../../../") - } - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/proguard-rules.pro 
b/examples/demo-apps/android/LlamaDemo/app/proguard-rules.pro deleted file mode 100644 index 481bb434814..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/proguard-rules.pro +++ /dev/null @@ -1,21 +0,0 @@ -# Add project specific ProGuard rules here. -# You can control the set of applied configuration files using the -# proguardFiles setting in build.gradle. -# -# For more details, see -# http://developer.android.com/guide/developing/tools/proguard.html - -# If your project uses WebView with JS, uncomment the following -# and specify the fully qualified class name to the JavaScript interface -# class: -#-keepclassmembers class fqcn.of.javascript.interface.for.webview { -# public *; -#} - -# Uncomment this to preserve the line number information for -# debugging stack traces. -#-keepattributes SourceFile,LineNumberTable - -# If you keep the line number information, uncomment this to -# hide the original source file name. -#-renamesourcefileattribute SourceFile \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java b/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java deleted file mode 100644 index 32ec24a0df9..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; - -import android.os.Bundle; -import androidx.test.ext.junit.runners.AndroidJUnit4; -import androidx.test.platform.app.InstrumentationRegistry; -import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import org.json.JSONException; -import org.json.JSONObject; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.pytorch.executorch.extension.llm.LlmCallback; -import org.pytorch.executorch.extension.llm.LlmModule; - -@RunWith(AndroidJUnit4.class) -public class PerfTest implements LlmCallback { - - private static final String RESOURCE_PATH = "/data/local/tmp/llama/"; - private static final String TOKENIZER_BIN = "tokenizer.bin"; - - private final List results = new ArrayList<>(); - private final List tokensPerSecond = new ArrayList<>(); - - @Test - public void testTokensPerSecond() { - String tokenizerPath = RESOURCE_PATH + TOKENIZER_BIN; - // Find out the model name - File directory = new File(RESOURCE_PATH); - Arrays.stream(directory.listFiles()) - .filter(file -> file.getName().endsWith(".pte")) - .forEach( - model -> { - LlmModule mModule = new LlmModule(model.getPath(), tokenizerPath, 0.8f); - // Print the model name because there might be more than one of them - report("ModelName", model.getName()); - - int loadResult = mModule.load(); - // Check that the model can be load successfully - assertEquals(0, loadResult); - - // Run a testing prompt - mModule.generate("How do you do! 
I'm testing llama2 on mobile device", PerfTest.this); - assertFalse(tokensPerSecond.isEmpty()); - - final Float tps = tokensPerSecond.get(tokensPerSecond.size() - 1); - report("TPS", tps); - }); - } - - @Override - public void onResult(String result) { - results.add(result); - } - - @Override - public void onStats(String result) { - try { - JSONObject jsonObject = new JSONObject(result); - int numGeneratedTokens = jsonObject.getInt("generated_tokens"); - int inferenceEndMs = jsonObject.getInt("inference_end_ms"); - int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms"); - float tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000; - tokensPerSecond.add(tps); - } catch (JSONException e) { - } - } - - private void report(final String metric, final Float value) { - Bundle bundle = new Bundle(); - bundle.putFloat(metric, value); - InstrumentationRegistry.getInstrumentation().sendStatus(0, bundle); - } - - private void report(final String key, final String value) { - Bundle bundle = new Bundle(); - bundle.putString(key, value); - InstrumentationRegistry.getInstrumentation().sendStatus(0, bundle); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml deleted file mode 100644 index 7096a7d4e76..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml +++ /dev/null @@ -1,85 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK deleted file mode 100644 index a64e11d1306..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK +++ /dev/null @@ -1,67 +0,0 @@ -load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target") -load("@fbsource//tools/build_defs/android:fb_android_binary.bzl", 
"fb_android_binary") -load("@fbsource//tools/build_defs/android:fb_android_library.bzl", "fb_android_library") -load("@fbsource//tools/build_defs/android:fb_android_resource.bzl", "fb_android_resource") - -oncall("executorch") - -non_fbcode_target(_kind = fb_android_resource, - name = "app_res", - package = "com.example.executorchllamademo", - res = "res", -) - -non_fbcode_target(_kind = fb_android_library, - name = "app_lib", - srcs = [ - "java/com/example/executorchllamademo/AppLog.java", - "java/com/example/executorchllamademo/BackendType.java", - "java/com/example/executorchllamademo/DemoSharedPreferences.java", - "java/com/example/executorchllamademo/ETImage.java", - "java/com/example/executorchllamademo/ETLogging.java", - "java/com/example/executorchllamademo/LlmBenchmarkRunner.java", - "java/com/example/executorchllamademo/LogsActivity.java", - "java/com/example/executorchllamademo/LogsAdapter.java", - "java/com/example/executorchllamademo/MainActivity.java", - "java/com/example/executorchllamademo/Message.java", - "java/com/example/executorchllamademo/MessageAdapter.java", - "java/com/example/executorchllamademo/MessageType.java", - "java/com/example/executorchllamademo/ModelRunner.java", - "java/com/example/executorchllamademo/ModelRunnerCallback.java", - "java/com/example/executorchllamademo/ModelType.java", - "java/com/example/executorchllamademo/ModelUtils.java", - "java/com/example/executorchllamademo/PromptFormat.java", - "java/com/example/executorchllamademo/SettingsActivity.java", - "java/com/example/executorchllamademo/SettingsFields.java", - ], - autoglob = False, - language = "JAVA", - deps = [ - ":app_res", - "//third-party/java/androidx/constraintlayout/constraintlayout:constraintlayout", - "//third-party/java/com/google/code/gson/gson:gson", - "//xplat/executorch/extension/android:executorch_llama", - ], -) - -non_fbcode_target(_kind = fb_android_binary, - name = "ExecuTorchLlamaDemo", - keystore = "//fbandroid/keystores:debug", - manifest = 
"AndroidManifest.xml", - manifest_entries = { - "min_sdk_version": 21, - "target_sdk_version": 34, - "version_code": "1", - "version_name": "1.0", - }, - package_type = "release", - skip_proguard = True, - deps = [ - ":app_lib", - ":app_res", - "//third-party/java/androidx/appcompat/appcompat:appcompat", - "//third-party/java/com/google/code/gson/gson:gson", - "//xplat/executorch/extension/android:executorch_llama", - "//xplat/executorch/extension/android/jni:executorch_llama_jni", - ], -) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java deleted file mode 100644 index 36d07419381..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.Locale; - -public class AppLog { - private final Long timestamp; - private final String message; - - public AppLog(String message) { - this.timestamp = getCurrentTimeStamp(); - this.message = message; - } - - public Long getTimestamp() { - return timestamp; - } - - public String getMessage() { - return message; - } - - public String getFormattedLog() { - return "[" + getFormattedTimeStamp() + "] " + message; - } - - private Long getCurrentTimeStamp() { - return System.currentTimeMillis(); - } - - private String getFormattedTimeStamp() { - return formatDate(timestamp); - } - - private String formatDate(long milliseconds) { - SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.getDefault()); - Date date = new Date(milliseconds); - return formatter.format(date); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java deleted file mode 100644 index 7c84799795f..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java +++ /dev/null @@ -1,7 +0,0 @@ -package com.example.executorchllamademo; - -public enum BackendType { - XNNPACK, - QUALCOMM, - MEDIATEK -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java deleted file mode 100644 index 99a94c00ebb..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -import android.content.Context; -import android.content.SharedPreferences; -import com.google.gson.Gson; -import com.google.gson.reflect.TypeToken; -import java.lang.reflect.Type; -import java.util.ArrayList; - -public class DemoSharedPreferences { - Context context; - SharedPreferences sharedPreferences; - - public DemoSharedPreferences(Context context) { - this.context = context; - this.sharedPreferences = getSharedPrefs(); - } - - private SharedPreferences getSharedPrefs() { - return context.getSharedPreferences( - context.getString(R.string.demo_pref_file_key), Context.MODE_PRIVATE); - } - - public String getSavedMessages() { - return sharedPreferences.getString(context.getString(R.string.saved_messages_json_key), ""); - } - - public void addMessages(MessageAdapter messageAdapter) { - SharedPreferences.Editor editor = sharedPreferences.edit(); - Gson gson = new Gson(); - String msgJSON = gson.toJson(messageAdapter.getSavedMessages()); - editor.putString(context.getString(R.string.saved_messages_json_key), msgJSON); - editor.apply(); - } - - public void removeExistingMessages() { - SharedPreferences.Editor editor = sharedPreferences.edit(); - editor.remove(context.getString(R.string.saved_messages_json_key)); - editor.apply(); - } - - public void addSettings(SettingsFields settingsFields) { - SharedPreferences.Editor editor = sharedPreferences.edit(); - Gson gson = new Gson(); - String settingsJSON = gson.toJson(settingsFields); - editor.putString(context.getString(R.string.settings_json_key), settingsJSON); - editor.apply(); - } - - public String getSettings() { - return sharedPreferences.getString(context.getString(R.string.settings_json_key), ""); - } - - public void saveLogs() { - SharedPreferences.Editor editor = sharedPreferences.edit(); - Gson gson = new Gson(); - 
String msgJSON = gson.toJson(ETLogging.getInstance().getLogs()); - editor.putString(context.getString(R.string.logs_json_key), msgJSON); - editor.apply(); - } - - public void removeExistingLogs() { - SharedPreferences.Editor editor = sharedPreferences.edit(); - editor.remove(context.getString(R.string.logs_json_key)); - editor.apply(); - } - - public ArrayList getSavedLogs() { - String logsJSONString = - sharedPreferences.getString(context.getString(R.string.logs_json_key), null); - if (logsJSONString == null || logsJSONString.isEmpty()) { - return new ArrayList<>(); - } - Gson gson = new Gson(); - Type type = new TypeToken>() {}.getType(); - ArrayList appLogs = gson.fromJson(logsJSONString, type); - if (appLogs == null) { - return new ArrayList<>(); - } - return appLogs; - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java deleted file mode 100644 index e68c8472626..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import android.content.ContentResolver; -import android.graphics.Bitmap; -import android.graphics.BitmapFactory; -import android.graphics.Color; -import android.net.Uri; -import androidx.annotation.Nullable; -import java.io.FileNotFoundException; -import java.io.InputStream; - -public class ETImage { - private int width; - private int height; - private final byte[] bytes; - private final Uri uri; - private final ContentResolver contentResolver; - - ETImage(ContentResolver contentResolver, Uri uri) { - this.contentResolver = contentResolver; - this.uri = uri; - bytes = getBytesFromImageURI(uri); - } - - public int getWidth() { - return width; - } - - public int getHeight() { - return height; - } - - public Uri getUri() { - return uri; - } - - public byte[] getBytes() { - return bytes; - } - - public int[] getInts() { - // We need to convert the byte array to an int array because - // the runner expects an int array as input. - int[] intArray = new int[bytes.length]; - for (int i = 0; i < bytes.length; i++) { - intArray[i] = (bytes[i++] & 0xFF); - } - return intArray; - } - - private byte[] getBytesFromImageURI(Uri uri) { - try { - int RESIZED_IMAGE_WIDTH = 336; - Bitmap bitmap = resizeImage(uri, RESIZED_IMAGE_WIDTH); - - if (bitmap == null) { - ETLogging.getInstance().log("Unable to get bytes from Image URI. 
Bitmap is null"); - return new byte[0]; - } - - width = bitmap.getWidth(); - height = bitmap.getHeight(); - - byte[] rgbValues = new byte[width * height * 3]; - - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - // Get the color of the current pixel - int color = bitmap.getPixel(x, y); - - // Extract the RGB values from the color - int red = Color.red(color); - int green = Color.green(color); - int blue = Color.blue(color); - - // Store the RGB values in the byte array - rgbValues[y * width + x] = (byte) red; - rgbValues[(y * width + x) + height * width] = (byte) green; - rgbValues[(y * width + x) + 2 * height * width] = (byte) blue; - } - } - return rgbValues; - } catch (FileNotFoundException e) { - throw new RuntimeException(e); - } - } - - @Nullable - private Bitmap resizeImage(Uri uri, int maxLength) throws FileNotFoundException { - InputStream inputStream = contentResolver.openInputStream(uri); - if (inputStream == null) { - ETLogging.getInstance().log("Unable to resize image, input streams is null"); - return null; - } - Bitmap bitmap = BitmapFactory.decodeStream(inputStream); - if (bitmap == null) { - ETLogging.getInstance().log("Unable to resize image, bitmap during decode stream is null"); - return null; - } - - float aspectRatio; - int finalWidth, finalHeight; - - if (bitmap.getWidth() > bitmap.getHeight()) { - // width > height --> width = maxLength, height scale with aspect ratio - aspectRatio = bitmap.getWidth() / (float) bitmap.getHeight(); - finalWidth = maxLength; - finalHeight = Math.round(maxLength / aspectRatio); - } else { - // height >= width --> height = maxLength, width scale with aspect ratio - aspectRatio = bitmap.getHeight() / (float) bitmap.getWidth(); - finalHeight = maxLength; - finalWidth = Math.round(maxLength / aspectRatio); - } - - return Bitmap.createScaledBitmap(bitmap, finalWidth, finalHeight, false); - } -} diff --git 
a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java deleted file mode 100644 index e595348945f..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -import android.app.Application; -import android.util.Log; -import java.util.ArrayList; - -public class ETLogging extends Application { - private static ETLogging singleton; - - private ArrayList logs; - private DemoSharedPreferences mDemoSharedPreferences; - - @Override - public void onCreate() { - super.onCreate(); - singleton = this; - mDemoSharedPreferences = new DemoSharedPreferences(this.getApplicationContext()); - logs = mDemoSharedPreferences.getSavedLogs(); - if (logs == null) { // We don't have existing sharedPreference stored - logs = new ArrayList<>(); - } - } - - public static ETLogging getInstance() { - return singleton; - } - - public void log(String message) { - AppLog appLog = new AppLog(message); - logs.add(appLog); - Log.d("ETLogging", appLog.getMessage()); - } - - public ArrayList getLogs() { - return logs; - } - - public void clearLogs() { - logs.clear(); - mDemoSharedPreferences.removeExistingLogs(); - } - - public void saveLogs() { - mDemoSharedPreferences.saveLogs(); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java deleted file mode 100644 index 8c2d60252a0..00000000000 --- 
a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -import android.app.Activity; -import android.app.ActivityManager; -import android.content.Intent; -import android.os.Build; -import android.os.Bundle; -import android.util.Log; -import android.widget.TextView; -import androidx.annotation.NonNull; -import com.google.gson.Gson; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class LlmBenchmarkRunner extends Activity implements ModelRunnerCallback { - ModelRunner mModelRunner; - - String mPrompt; - TextView mTextView; - StatsDump mStatsDump; - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - setContentView(R.layout.activity_benchmarking); - mTextView = findViewById(R.id.log_view); - - Intent intent = getIntent(); - - File modelDir = new File(intent.getStringExtra("model_dir")); - File model = - Arrays.stream(modelDir.listFiles()) - .filter(file -> file.getName().endsWith(".pte")) - .findFirst() - .get(); - String tokenizerPath = intent.getStringExtra("tokenizer_path"); - - float temperature = intent.getFloatExtra("temperature", 0.8f); - mPrompt = intent.getStringExtra("prompt"); - if (mPrompt == null) { - mPrompt = "The ultimate answer"; - } - - mStatsDump = new StatsDump(); - mStatsDump.modelName = model.getName().replace(".pte", ""); - mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this); - mStatsDump.loadStart = System.nanoTime(); - } - 
- @Override - public void onModelLoaded(int status) { - mStatsDump.loadEnd = System.nanoTime(); - mStatsDump.loadStatus = status; - if (status != 0) { - Log.e("LlmBenchmarkRunner", "Loaded failed: " + status); - onGenerationStopped(); - return; - } - mStatsDump.generateStart = System.nanoTime(); - mModelRunner.generate(mPrompt); - } - - @Override - public void onTokenGenerated(String token) { - runOnUiThread( - () -> { - mTextView.append(token); - }); - } - - @Override - public void onStats(String stats) { - mStatsDump.tokens = stats; - } - - @Override - public void onGenerationStopped() { - mStatsDump.generateEnd = System.nanoTime(); - runOnUiThread( - () -> { - mTextView.append(mStatsDump.toString()); - }); - - final BenchmarkMetric.BenchmarkModel benchmarkModel = - BenchmarkMetric.extractBackendAndQuantization(mStatsDump.modelName); - final List results = new ArrayList<>(); - // The list of metrics we have atm includes: - // Load status - results.add(new BenchmarkMetric(benchmarkModel, "load_status", mStatsDump.loadStatus, 0)); - // Model load time - results.add( - new BenchmarkMetric( - benchmarkModel, - "model_load_time(ms)", - (mStatsDump.loadEnd - mStatsDump.loadStart) * 1e-6, - 0.0f)); - // LLM generate time - results.add( - new BenchmarkMetric( - benchmarkModel, - "generate_time(ms)", - (mStatsDump.generateEnd - mStatsDump.generateStart) * 1e-6, - 0.0f)); - // Token per second - results.add( - new BenchmarkMetric(benchmarkModel, "token_per_sec", extractTPS(mStatsDump.tokens), 0.0f)); - - try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { - Gson gson = new Gson(); - writer.write(gson.toJson(results)); - } catch (IOException e) { - e.printStackTrace(); - } - } - - private double extractTPS(final String tokens) { - final Matcher m = Pattern.compile("\\d+\\.?\\d*").matcher(tokens); - if (m.find()) { - return Double.parseDouble(m.group()); - } else { - return 0.0f; - } - } -} - -class BenchmarkMetric { - public static class 
BenchmarkModel { - // The model name, i.e. stories110M - String name; - String backend; - String quantization; - - public BenchmarkModel(final String name, final String backend, final String quantization) { - this.name = name; - this.backend = backend; - this.quantization = quantization; - } - } - - BenchmarkModel benchmarkModel; - - // The metric name, i.e. TPS - String metric; - - // The actual value and the option target value - double actualValue; - double targetValue; - - public static class DeviceInfo { - // Let's see which information we want to include here - final String device = Build.BRAND; - // The phone model and Android release version - final String arch = Build.MODEL; - final String os = "Android " + Build.VERSION.RELEASE; - final long totalMem = new ActivityManager.MemoryInfo().totalMem; - final long availMem = new ActivityManager.MemoryInfo().availMem; - } - - DeviceInfo deviceInfo = new DeviceInfo(); - - public BenchmarkMetric( - final BenchmarkModel benchmarkModel, - final String metric, - final double actualValue, - final double targetValue) { - this.benchmarkModel = benchmarkModel; - this.metric = metric; - this.actualValue = actualValue; - this.targetValue = targetValue; - } - - // TODO (huydhn): Figure out a way to extract the backend and quantization information from - // the .pte model itself instead of parsing its name - public static BenchmarkMetric.BenchmarkModel extractBackendAndQuantization(final String model) { - final Matcher m = - Pattern.compile("(?\\w+)_(?[\\w\\+]+)_(?\\w+)").matcher(model); - if (m.matches()) { - return new BenchmarkMetric.BenchmarkModel( - m.group("name"), m.group("backend"), m.group("quantization")); - } else { - return new BenchmarkMetric.BenchmarkModel(model, "", ""); - } - } -} - -class StatsDump { - int loadStatus; - long loadStart; - long loadEnd; - long generateStart; - long generateEnd; - String tokens; - String modelName; - - @NonNull - @Override - public String toString() { - return "loadStart: " - + 
loadStart - + "\nloadEnd: " - + loadEnd - + "\ngenerateStart: " - + generateStart - + "\ngenerateEnd: " - + generateEnd - + "\n" - + tokens; - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java deleted file mode 100644 index 7777b275e6e..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -import android.app.AlertDialog; -import android.content.DialogInterface; -import android.os.Build; -import android.os.Bundle; -import android.widget.ImageButton; -import android.widget.ListView; -import androidx.appcompat.app.AppCompatActivity; -import androidx.core.content.ContextCompat; -import androidx.core.graphics.Insets; -import androidx.core.view.ViewCompat; -import androidx.core.view.WindowInsetsCompat; - -public class LogsActivity extends AppCompatActivity { - - private LogsAdapter mLogsAdapter; - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - setContentView(R.layout.activity_logs); - if (Build.VERSION.SDK_INT >= 21) { - getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); - getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); - } - ViewCompat.setOnApplyWindowInsetsListener( - requireViewById(R.id.main), - (v, insets) -> { - Insets systemBars = insets.getInsets(WindowInsetsCompat.Type.systemBars()); - v.setPadding(systemBars.left, systemBars.top, systemBars.right, systemBars.bottom); - return insets; - }); - - 
setupLogs(); - setupClearLogsButton(); - } - - @Override - public void onResume() { - super.onResume(); - mLogsAdapter.clear(); - mLogsAdapter.addAll(ETLogging.getInstance().getLogs()); - mLogsAdapter.notifyDataSetChanged(); - } - - private void setupLogs() { - ListView mLogsListView = requireViewById(R.id.logsListView); - mLogsAdapter = new LogsAdapter(this, R.layout.logs_message); - - mLogsListView.setAdapter(mLogsAdapter); - mLogsAdapter.addAll(ETLogging.getInstance().getLogs()); - mLogsAdapter.notifyDataSetChanged(); - } - - private void setupClearLogsButton() { - ImageButton clearLogsButton = requireViewById(R.id.clearLogsButton); - clearLogsButton.setOnClickListener( - view -> { - new AlertDialog.Builder(this) - .setTitle("Delete Logs History") - .setMessage("Do you really want to delete logs history?") - .setIcon(android.R.drawable.ic_dialog_alert) - .setPositiveButton( - android.R.string.yes, - new DialogInterface.OnClickListener() { - public void onClick(DialogInterface dialog, int whichButton) { - // Clear the messageAdapter and sharedPreference - ETLogging.getInstance().clearLogs(); - mLogsAdapter.clear(); - mLogsAdapter.notifyDataSetChanged(); - } - }) - .setNegativeButton(android.R.string.no, null) - .show(); - }); - } - - @Override - protected void onDestroy() { - super.onDestroy(); - ETLogging.getInstance().saveLogs(); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java deleted file mode 100644 index 76c6a1aa1b4..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -import android.view.LayoutInflater; -import android.view.View; -import android.view.ViewGroup; -import android.widget.ArrayAdapter; -import android.widget.TextView; -import androidx.annotation.NonNull; -import java.util.Objects; - -public class LogsAdapter extends ArrayAdapter { - public LogsAdapter(android.content.Context context, int resource) { - super(context, resource); - } - - static class ViewHolder { - private TextView logTextView; - } - - @NonNull - @Override - public View getView(int position, View convertView, @NonNull ViewGroup parent) { - ViewHolder mViewHolder = null; - - String logMessage = Objects.requireNonNull(getItem(position)).getFormattedLog(); - - if (convertView == null || convertView.getTag() == null) { - mViewHolder = new ViewHolder(); - convertView = LayoutInflater.from(getContext()).inflate(R.layout.logs_message, parent, false); - mViewHolder.logTextView = convertView.requireViewById(R.id.logsTextView); - } else { - mViewHolder = (ViewHolder) convertView.getTag(); - } - mViewHolder.logTextView.setText(logMessage); - return convertView; - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java deleted file mode 100644 index f995c5bc65a..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ /dev/null @@ -1,847 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import android.Manifest; -import android.app.ActivityManager; -import android.app.AlertDialog; -import android.content.ContentResolver; -import android.content.ContentValues; -import android.content.Intent; -import android.content.pm.PackageManager; -import android.net.Uri; -import android.os.Build; -import android.os.Bundle; -import android.os.Handler; -import android.os.Looper; -import android.os.Process; -import android.provider.MediaStore; -import android.system.ErrnoException; -import android.system.Os; -import android.util.Log; -import android.view.View; -import android.view.inputmethod.InputMethodManager; -import android.widget.EditText; -import android.widget.ImageButton; -import android.widget.ImageView; -import android.widget.LinearLayout; -import android.widget.ListView; -import android.widget.TextView; -import android.widget.Toast; -import androidx.activity.result.ActivityResultLauncher; -import androidx.activity.result.PickVisualMediaRequest; -import androidx.activity.result.contract.ActivityResultContracts; -import androidx.annotation.NonNull; -import androidx.appcompat.app.AppCompatActivity; -import androidx.constraintlayout.widget.ConstraintLayout; -import androidx.core.app.ActivityCompat; -import androidx.core.content.ContextCompat; -import androidx.core.content.res.ResourcesCompat; -import com.google.gson.Gson; -import com.google.gson.reflect.TypeToken; -import java.lang.reflect.Type; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.Executor; -import java.util.concurrent.Executors; -import org.json.JSONException; -import org.json.JSONObject; -import org.pytorch.executorch.extension.llm.LlmCallback; -import org.pytorch.executorch.extension.llm.LlmModule; - -public class MainActivity extends AppCompatActivity implements Runnable, LlmCallback { - private EditText mEditTextMessage; - private ImageButton mThinkModeButton; - private ImageButton mSendButton; - private 
ImageButton mGalleryButton; - private ImageButton mCameraButton; - private ListView mMessagesView; - private MessageAdapter mMessageAdapter; - private LlmModule mModule = null; - private Message mResultMessage = null; - private ImageButton mSettingsButton; - private TextView mMemoryView; - private ActivityResultLauncher mPickGallery; - private ActivityResultLauncher mCameraRoll; - private List mSelectedImageUri; - private ConstraintLayout mMediaPreviewConstraintLayout; - private LinearLayout mAddMediaLayout; - private static final int MAX_NUM_OF_IMAGES = 5; - private static final int REQUEST_IMAGE_CAPTURE = 1; - private Uri cameraImageUri; - private DemoSharedPreferences mDemoSharedPreferences; - private SettingsFields mCurrentSettingsFields; - private Handler mMemoryUpdateHandler; - private Runnable memoryUpdater; - private boolean mThinkMode = false; - private int promptID = 0; - private static final int CONVERSATION_HISTORY_MESSAGE_LOOKBACK = 2; - private Executor executor; - - @Override - public void onResult(String result) { - if (result.equals(PromptFormat.getStopToken(mCurrentSettingsFields.getModelType()))) { - return; - } - result = PromptFormat.replaceSpecialToken(mCurrentSettingsFields.getModelType(), result); - if (result.equals("\n\n") || result.equals("\n")) { - if (!mResultMessage.getText().isEmpty()) { - mResultMessage.appendText(result); - run(); - } - } else { - mResultMessage.appendText(result); - run(); - } - } - - @Override - public void onStats(String stats) { - runOnUiThread( - () -> { - if (mResultMessage != null) { - float tps = 0; - try { - JSONObject jsonObject = new JSONObject(stats); - int numGeneratedTokens = jsonObject.getInt("generated_tokens"); - int inferenceEndMs = jsonObject.getInt("inference_end_ms"); - int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms"); - tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000; - } catch (JSONException e) { - Log.e("LLM", "Error parsing JSON: " + e.getMessage()); 
- } - mResultMessage.setTokensPerSecond(tps); - mMessageAdapter.notifyDataSetChanged(); - } - }); - } - - private void setLocalModel(String modelPath, String tokenizerPath, float temperature) { - Message modelLoadingMessage = new Message("Loading model...", false, MessageType.SYSTEM, 0); - ETLogging.getInstance().log("Loading model " + modelPath + " with tokenizer " + tokenizerPath); - runOnUiThread( - () -> { - mSendButton.setEnabled(false); - mMessageAdapter.add(modelLoadingMessage); - mMessageAdapter.notifyDataSetChanged(); - }); - if (mModule != null) { - ETLogging.getInstance().log("Start deallocating existing module instance"); - mModule.resetNative(); - mModule = null; - ETLogging.getInstance().log("Completed deallocating existing module instance"); - } - long runStartTime = System.currentTimeMillis(); - mModule = - new LlmModule( - ModelUtils.getModelCategory( - mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType()), - modelPath, - tokenizerPath, - temperature); - int loadResult = mModule.load(); - long loadDuration = System.currentTimeMillis() - runStartTime; - String modelLoadError = ""; - String modelInfo = ""; - if (loadResult != 0) { - // TODO: Map the error code to a reason to let the user know why model loading failed - modelInfo = "*Model could not load (Error Code: " + loadResult + ")*" + "\n"; - loadDuration = 0; - AlertDialog.Builder builder = new AlertDialog.Builder(this); - builder.setTitle("Load failed: " + loadResult); - runOnUiThread( - () -> { - AlertDialog alert = builder.create(); - alert.show(); - }); - } else { - String[] segments = modelPath.split("/"); - String pteName = segments[segments.length - 1]; - segments = tokenizerPath.split("/"); - String tokenizerName = segments[segments.length - 1]; - modelInfo = - "Successfully loaded model. " - + pteName - + " and tokenizer " - + tokenizerName - + " in " - + (float) loadDuration / 1000 - + " sec." 
- + " You can send text or image for inference"; - - if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { - ETLogging.getInstance().log("Llava start prefill prompt"); - mModule.resetContext(); - mModule.prefillPrompt(PromptFormat.getLlavaPresetPrompt()); - ETLogging.getInstance().log("Llava completes prefill prompt"); - } - } - - Message modelLoadedMessage = new Message(modelInfo, false, MessageType.SYSTEM, 0); - - String modelLoggingInfo = - modelLoadError - + "Model path: " - + modelPath - + "\nTokenizer path: " - + tokenizerPath - + "\nBackend: " - + mCurrentSettingsFields.getBackendType().toString() - + "\nModelType: " - + ModelUtils.getModelCategory( - mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType()) - + "\nTemperature: " - + temperature - + "\nModel loaded time: " - + loadDuration - + " ms"; - ETLogging.getInstance().log("Load complete. " + modelLoggingInfo); - - runOnUiThread( - () -> { - mSendButton.setEnabled(true); - mMessageAdapter.remove(modelLoadingMessage); - mMessageAdapter.add(modelLoadedMessage); - mMessageAdapter.notifyDataSetChanged(); - }); - } - - private void loadLocalModelAndParameters( - String modelFilePath, String tokenizerFilePath, float temperature) { - Runnable runnable = - new Runnable() { - @Override - public void run() { - setLocalModel(modelFilePath, tokenizerFilePath, temperature); - } - }; - new Thread(runnable).start(); - } - - private void populateExistingMessages(String existingMsgJSON) { - Gson gson = new Gson(); - Type type = new TypeToken>() {}.getType(); - ArrayList savedMessages = gson.fromJson(existingMsgJSON, type); - for (Message msg : savedMessages) { - mMessageAdapter.add(msg); - } - mMessageAdapter.notifyDataSetChanged(); - } - - private int setPromptID() { - - return mMessageAdapter.getMaxPromptID() + 1; - } - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - setContentView(R.layout.activity_main); - - if 
(Build.VERSION.SDK_INT >= 21) { - getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); - getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); - } - - try { - Os.setenv("ADSP_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); - Os.setenv("LD_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); - } catch (ErrnoException e) { - finish(); - } - - mThinkModeButton = requireViewById(R.id.thinkModeButton); - mEditTextMessage = requireViewById(R.id.editTextMessage); - mSendButton = requireViewById(R.id.sendButton); - mSendButton.setEnabled(false); - mMessagesView = requireViewById(R.id.messages_view); - mMessageAdapter = new MessageAdapter(this, R.layout.sent_message, new ArrayList()); - mMessagesView.setAdapter(mMessageAdapter); - mDemoSharedPreferences = new DemoSharedPreferences(this.getApplicationContext()); - String existingMsgJSON = mDemoSharedPreferences.getSavedMessages(); - if (!existingMsgJSON.isEmpty()) { - populateExistingMessages(existingMsgJSON); - promptID = setPromptID(); - } - mSettingsButton = requireViewById(R.id.settings); - mSettingsButton.setOnClickListener( - view -> { - Intent myIntent = new Intent(MainActivity.this, SettingsActivity.class); - MainActivity.this.startActivity(myIntent); - }); - - mThinkModeButton.setOnClickListener( - view -> { - if (mThinkMode) { - mThinkMode = false; - mThinkModeButton.setImageDrawable( - ResourcesCompat.getDrawable( - getResources(), R.drawable.baseline_lightbulb_24, null)); - } else { - mThinkMode = true; - mThinkModeButton.setImageDrawable( - ResourcesCompat.getDrawable(getResources(), R.drawable.blue_lightbulb_24, null)); - } - runOnUiThread( - () -> { - String thinkingModeText = mThinkMode ? 
"on" : "off"; - mMessageAdapter.add( - new Message( - "Thinking mode is " + thinkingModeText, false, MessageType.SYSTEM, 0)); - mMessageAdapter.notifyDataSetChanged(); - }); - }); - - mCurrentSettingsFields = new SettingsFields(); - mMemoryUpdateHandler = new Handler(Looper.getMainLooper()); - onModelRunStopped(); - setupMediaButton(); - setupGalleryPicker(); - setupCameraRoll(); - startMemoryUpdate(); - setupShowLogsButton(); - executor = Executors.newSingleThreadExecutor(); - } - - @Override - protected void onPause() { - super.onPause(); - mDemoSharedPreferences.addMessages(mMessageAdapter); - } - - @Override - protected void onResume() { - super.onResume(); - // Check for if settings parameters have changed - Gson gson = new Gson(); - String settingsFieldsJSON = mDemoSharedPreferences.getSettings(); - if (!settingsFieldsJSON.isEmpty()) { - SettingsFields updatedSettingsFields = - gson.fromJson(settingsFieldsJSON, SettingsFields.class); - if (updatedSettingsFields == null) { - // Added this check, because gson.fromJson can return null - askUserToSelectModel(); - return; - } - boolean isUpdated = !mCurrentSettingsFields.equals(updatedSettingsFields); - boolean isLoadModel = updatedSettingsFields.getIsLoadModel(); - setBackendMode(updatedSettingsFields.getBackendType()); - if (isUpdated) { - if (isLoadModel) { - // If users change the model file, but not pressing loadModelButton, we won't load the new - // model - checkForUpdateAndReloadModel(updatedSettingsFields); - } else { - askUserToSelectModel(); - } - - checkForClearChatHistory(updatedSettingsFields); - // Update current to point to the latest - mCurrentSettingsFields = new SettingsFields(updatedSettingsFields); - } - } else { - askUserToSelectModel(); - } - } - - private void setBackendMode(BackendType backendType) { - if (backendType.equals(BackendType.XNNPACK) || backendType.equals(BackendType.QUALCOMM)) { - setXNNPACKMode(); - } else if (backendType.equals(BackendType.MEDIATEK)) { - setMediaTekMode(); - 
} - } - - private void setXNNPACKMode() { - requireViewById(R.id.addMediaButton).setVisibility(View.VISIBLE); - } - - private void setMediaTekMode() { - requireViewById(R.id.addMediaButton).setVisibility(View.GONE); - } - - private void checkForClearChatHistory(SettingsFields updatedSettingsFields) { - if (updatedSettingsFields.getIsClearChatHistory()) { - mMessageAdapter.clear(); - mMessageAdapter.notifyDataSetChanged(); - mDemoSharedPreferences.removeExistingMessages(); - // changing to false since chat history has been cleared. - updatedSettingsFields.saveIsClearChatHistory(false); - mDemoSharedPreferences.addSettings(updatedSettingsFields); - } - } - - private void checkForUpdateAndReloadModel(SettingsFields updatedSettingsFields) { - // TODO need to add 'load model' in settings and queue loading based on that - String modelPath = updatedSettingsFields.getModelFilePath(); - String tokenizerPath = updatedSettingsFields.getTokenizerFilePath(); - double temperature = updatedSettingsFields.getTemperature(); - if (!modelPath.isEmpty() && !tokenizerPath.isEmpty()) { - if (updatedSettingsFields.getIsLoadModel() - || !modelPath.equals(mCurrentSettingsFields.getModelFilePath()) - || !tokenizerPath.equals(mCurrentSettingsFields.getTokenizerFilePath()) - || temperature != mCurrentSettingsFields.getTemperature()) { - loadLocalModelAndParameters( - updatedSettingsFields.getModelFilePath(), - updatedSettingsFields.getTokenizerFilePath(), - (float) updatedSettingsFields.getTemperature()); - updatedSettingsFields.saveLoadModelAction(false); - mDemoSharedPreferences.addSettings(updatedSettingsFields); - } - } else { - askUserToSelectModel(); - } - } - - private void askUserToSelectModel() { - String askLoadModel = - "To get started, select your desired model and tokenizer " + "from the top right corner"; - Message askLoadModelMessage = new Message(askLoadModel, false, MessageType.SYSTEM, 0); - ETLogging.getInstance().log(askLoadModel); - runOnUiThread( - () -> { - 
mMessageAdapter.add(askLoadModelMessage); - mMessageAdapter.notifyDataSetChanged(); - }); - } - - private void setupShowLogsButton() { - ImageButton showLogsButton = requireViewById(R.id.showLogsButton); - showLogsButton.setOnClickListener( - view -> { - Intent myIntent = new Intent(MainActivity.this, LogsActivity.class); - MainActivity.this.startActivity(myIntent); - }); - } - - private void setupMediaButton() { - mAddMediaLayout = requireViewById(R.id.addMediaLayout); - mAddMediaLayout.setVisibility(View.GONE); // We hide this initially - - ImageButton addMediaButton = requireViewById(R.id.addMediaButton); - addMediaButton.setOnClickListener( - view -> { - mAddMediaLayout.setVisibility(View.VISIBLE); - }); - - mGalleryButton = requireViewById(R.id.galleryButton); - mGalleryButton.setOnClickListener( - view -> { - // Launch the photo picker and let the user choose only images. - mPickGallery.launch( - new PickVisualMediaRequest.Builder() - .setMediaType(ActivityResultContracts.PickVisualMedia.ImageOnly.INSTANCE) - .build()); - }); - mCameraButton = requireViewById(R.id.cameraButton); - mCameraButton.setOnClickListener( - view -> { - Log.d("CameraRoll", "Check permission"); - if (ContextCompat.checkSelfPermission(MainActivity.this, Manifest.permission.CAMERA) - != PackageManager.PERMISSION_GRANTED) { - ActivityCompat.requestPermissions( - MainActivity.this, - new String[] {Manifest.permission.CAMERA}, - REQUEST_IMAGE_CAPTURE); - } else { - launchCamera(); - } - }); - } - - private void setupCameraRoll() { - // Registers a camera roll activity launcher. 
- mCameraRoll = - registerForActivityResult( - new ActivityResultContracts.TakePicture(), - result -> { - if (result && cameraImageUri != null) { - Log.d("CameraRoll", "Photo saved to uri: " + cameraImageUri); - mAddMediaLayout.setVisibility(View.GONE); - List uris = new ArrayList<>(); - uris.add(cameraImageUri); - showMediaPreview(uris); - } else { - // Delete the temp image file based on the url since the photo is not successfully - // taken - if (cameraImageUri != null) { - ContentResolver contentResolver = MainActivity.this.getContentResolver(); - contentResolver.delete(cameraImageUri, null, null); - Log.d("CameraRoll", "No photo taken. Delete temp uri"); - } - } - }); - mMediaPreviewConstraintLayout = requireViewById(R.id.mediaPreviewConstraintLayout); - ImageButton mediaPreviewCloseButton = requireViewById(R.id.mediaPreviewCloseButton); - mediaPreviewCloseButton.setOnClickListener( - view -> { - mMediaPreviewConstraintLayout.setVisibility(View.GONE); - mSelectedImageUri = null; - }); - - ImageButton addMoreImageButton = requireViewById(R.id.addMoreImageButton); - addMoreImageButton.setOnClickListener( - view -> { - Log.d("addMore", "clicked"); - mMediaPreviewConstraintLayout.setVisibility(View.GONE); - // Direct user to select type of input - mCameraButton.callOnClick(); - }); - } - - private String updateMemoryUsage() { - ActivityManager.MemoryInfo memoryInfo = new ActivityManager.MemoryInfo(); - ActivityManager activityManager = (ActivityManager) getSystemService(ACTIVITY_SERVICE); - if (activityManager == null) { - return "---"; - } - activityManager.getMemoryInfo(memoryInfo); - long totalMem = memoryInfo.totalMem / (1024 * 1024); - long availableMem = memoryInfo.availMem / (1024 * 1024); - long usedMem = totalMem - availableMem; - return usedMem + "MB"; - } - - private void startMemoryUpdate() { - mMemoryView = requireViewById(R.id.ram_usage_live); - memoryUpdater = - new Runnable() { - @Override - public void run() { - 
mMemoryView.setText(updateMemoryUsage()); - mMemoryUpdateHandler.postDelayed(this, 1000); - } - }; - mMemoryUpdateHandler.post(memoryUpdater); - } - - @Override - public void onRequestPermissionsResult( - int requestCode, @NonNull String[] permissions, @NonNull int[] grantResults) { - super.onRequestPermissionsResult(requestCode, permissions, grantResults); - if (requestCode == REQUEST_IMAGE_CAPTURE && grantResults.length != 0) { - if (grantResults[0] == PackageManager.PERMISSION_GRANTED) { - launchCamera(); - } else if (grantResults[0] == PackageManager.PERMISSION_DENIED) { - Log.d("CameraRoll", "Permission denied"); - } - } - } - - private void launchCamera() { - ContentValues values = new ContentValues(); - values.put(MediaStore.Images.Media.TITLE, "New Picture"); - values.put(MediaStore.Images.Media.DESCRIPTION, "From Camera"); - values.put(MediaStore.Images.Media.RELATIVE_PATH, "DCIM/Camera/"); - cameraImageUri = - MainActivity.this - .getContentResolver() - .insert(MediaStore.Images.Media.EXTERNAL_CONTENT_URI, values); - mCameraRoll.launch(cameraImageUri); - } - - private void setupGalleryPicker() { - // Registers a photo picker activity launcher in single-select mode. 
- mPickGallery = - registerForActivityResult( - new ActivityResultContracts.PickMultipleVisualMedia(MAX_NUM_OF_IMAGES), - uris -> { - if (!uris.isEmpty()) { - Log.d("PhotoPicker", "Selected URIs: " + uris); - mAddMediaLayout.setVisibility(View.GONE); - for (Uri uri : uris) { - MainActivity.this - .getContentResolver() - .takePersistableUriPermission(uri, Intent.FLAG_GRANT_READ_URI_PERMISSION); - } - showMediaPreview(uris); - } else { - Log.d("PhotoPicker", "No media selected"); - } - }); - - mMediaPreviewConstraintLayout = requireViewById(R.id.mediaPreviewConstraintLayout); - ImageButton mediaPreviewCloseButton = requireViewById(R.id.mediaPreviewCloseButton); - mediaPreviewCloseButton.setOnClickListener( - view -> { - mMediaPreviewConstraintLayout.setVisibility(View.GONE); - mSelectedImageUri = null; - }); - - ImageButton addMoreImageButton = requireViewById(R.id.addMoreImageButton); - addMoreImageButton.setOnClickListener( - view -> { - Log.d("addMore", "clicked"); - mMediaPreviewConstraintLayout.setVisibility(View.GONE); - mGalleryButton.callOnClick(); - }); - } - - private List getProcessedImagesForModel(List uris) { - List imageList = new ArrayList<>(); - if (uris != null) { - uris.forEach( - (uri) -> { - imageList.add(new ETImage(this.getContentResolver(), uri)); - }); - } - return imageList; - } - - private void showMediaPreview(List uris) { - if (mSelectedImageUri == null) { - mSelectedImageUri = uris; - } else { - mSelectedImageUri.addAll(uris); - } - - if (mSelectedImageUri.size() > MAX_NUM_OF_IMAGES) { - mSelectedImageUri = mSelectedImageUri.subList(0, MAX_NUM_OF_IMAGES); - Toast.makeText( - this, "Only max " + MAX_NUM_OF_IMAGES + " images are allowed", Toast.LENGTH_SHORT) - .show(); - } - Log.d("mSelectedImageUri", mSelectedImageUri.size() + " " + mSelectedImageUri); - - mMediaPreviewConstraintLayout.setVisibility(View.VISIBLE); - - List imageViews = new ArrayList(); - - // Pre-populate all the image views that are available from the layout (currently 
max 5) - imageViews.add(requireViewById(R.id.mediaPreviewImageView1)); - imageViews.add(requireViewById(R.id.mediaPreviewImageView2)); - imageViews.add(requireViewById(R.id.mediaPreviewImageView3)); - imageViews.add(requireViewById(R.id.mediaPreviewImageView4)); - imageViews.add(requireViewById(R.id.mediaPreviewImageView5)); - - // Hide all the image views (reset state) - for (int i = 0; i < imageViews.size(); i++) { - imageViews.get(i).setVisibility(View.GONE); - } - - // Only show/render those that have proper Image URIs - for (int i = 0; i < mSelectedImageUri.size(); i++) { - imageViews.get(i).setVisibility(View.VISIBLE); - imageViews.get(i).setImageURI(mSelectedImageUri.get(i)); - } - - // For LLava, we want to call prefill_image as soon as an image is selected - // Llava only support 1 image for now - if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { - List processedImageList = getProcessedImagesForModel(mSelectedImageUri); - if (!processedImageList.isEmpty()) { - mMessageAdapter.add( - new Message("Llava - Starting image Prefill.", false, MessageType.SYSTEM, 0)); - mMessageAdapter.notifyDataSetChanged(); - Runnable runnable = - () -> { - Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); - ETLogging.getInstance().log("Starting runnable prefill image"); - ETImage img = processedImageList.get(0); - ETLogging.getInstance().log("Llava start prefill image"); - mModule.prefillImages( - img.getInts(), - img.getWidth(), - img.getHeight(), - ModelUtils.VISION_MODEL_IMAGE_CHANNELS); - }; - executor.execute(runnable); - } - } - } - - private void addSelectedImagesToChatThread(List selectedImageUri) { - if (selectedImageUri == null) { - return; - } - mMediaPreviewConstraintLayout.setVisibility(View.GONE); - for (int i = 0; i < selectedImageUri.size(); i++) { - Uri imageURI = selectedImageUri.get(i); - Log.d("image uri ", "test " + imageURI.getPath()); - mMessageAdapter.add(new Message(imageURI.toString(), true, MessageType.IMAGE, 0)); - 
} - mMessageAdapter.notifyDataSetChanged(); - } - - private String getConversationHistory() { - String conversationHistory = ""; - - ArrayList conversations = - mMessageAdapter.getRecentSavedTextMessages(CONVERSATION_HISTORY_MESSAGE_LOOKBACK); - if (conversations.isEmpty()) { - return conversationHistory; - } - - int prevPromptID = conversations.get(0).getPromptID(); - String conversationFormat = - PromptFormat.getConversationFormat(mCurrentSettingsFields.getModelType()); - String format = conversationFormat; - for (int i = 0; i < conversations.size(); i++) { - Message conversation = conversations.get(i); - int currentPromptID = conversation.getPromptID(); - if (currentPromptID != prevPromptID) { - conversationHistory = conversationHistory + format; - format = conversationFormat; - prevPromptID = currentPromptID; - } - if (conversation.getIsSent()) { - format = - format - .replace(PromptFormat.USER_PLACEHOLDER, conversation.getText()) - .replace(PromptFormat.THINKING_MODE_PLACEHOLDER, ""); - } else { - format = format.replace(PromptFormat.ASSISTANT_PLACEHOLDER, conversation.getText()); - } - } - conversationHistory = conversationHistory + format; - - return conversationHistory; - } - - private String getTotalFormattedPrompt(String conversationHistory, String rawPrompt) { - if (conversationHistory.isEmpty()) { - return mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt, mThinkMode); - } - - return mCurrentSettingsFields.getFormattedSystemPrompt() - + conversationHistory - + mCurrentSettingsFields.getFormattedUserPrompt(rawPrompt, mThinkMode); - } - - private void onModelRunStarted() { - mSendButton.setClickable(false); - mSendButton.setImageResource(R.drawable.baseline_stop_24); - mSendButton.setOnClickListener( - view -> { - mModule.stop(); - }); - } - - private void onModelRunStopped() { - mSendButton.setClickable(true); - mSendButton.setImageResource(R.drawable.baseline_send_24); - mSendButton.setOnClickListener( - view -> { - try { - 
InputMethodManager imm = (InputMethodManager) getSystemService(INPUT_METHOD_SERVICE); - imm.hideSoftInputFromWindow(getCurrentFocus().getWindowToken(), 0); - } catch (Exception e) { - ETLogging.getInstance().log("Keyboard dismissal error: " + e.getMessage()); - } - addSelectedImagesToChatThread(mSelectedImageUri); - String finalPrompt; - String rawPrompt = mEditTextMessage.getText().toString(); - if (ModelUtils.getModelCategory( - mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType()) - == ModelUtils.VISION_MODEL) { - finalPrompt = - mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt, mThinkMode); - } else { - finalPrompt = getTotalFormattedPrompt(getConversationHistory(), rawPrompt); - } - // We store raw prompt into message adapter, because we don't want to show the extra - // tokens from system prompt - mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, promptID)); - mMessageAdapter.notifyDataSetChanged(); - mEditTextMessage.setText(""); - mResultMessage = new Message("", false, MessageType.TEXT, promptID); - mMessageAdapter.add(mResultMessage); - // Scroll to bottom of the list - mMessagesView.smoothScrollToPosition(mMessageAdapter.getCount() - 1); - // After images are added to prompt and chat thread, we clear the imageURI list - // Note: This has to be done after imageURIs are no longer needed by LlmModule - mSelectedImageUri = null; - promptID++; - Runnable runnable = - new Runnable() { - @Override - public void run() { - Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); - ETLogging.getInstance().log("starting runnable generate()"); - runOnUiThread( - new Runnable() { - @Override - public void run() { - onModelRunStarted(); - } - }); - long generateStartTime = System.currentTimeMillis(); - if (ModelUtils.getModelCategory( - mCurrentSettingsFields.getModelType(), - mCurrentSettingsFields.getBackendType()) - == ModelUtils.VISION_MODEL) { - mModule.generate( - finalPrompt, 
ModelUtils.VISION_MODEL_SEQ_LEN, MainActivity.this, false); - } else if (mCurrentSettingsFields.getModelType() == ModelType.LLAMA_GUARD_3) { - String llamaGuardPromptForClassification = - PromptFormat.getFormattedLlamaGuardPrompt(rawPrompt); - ETLogging.getInstance() - .log("Running inference.. prompt=" + llamaGuardPromptForClassification); - mModule.generate( - llamaGuardPromptForClassification, - llamaGuardPromptForClassification.length() + 64, - MainActivity.this, - false); - } else { - ETLogging.getInstance().log("Running inference.. prompt=" + finalPrompt); - mModule.generate( - finalPrompt, - (int) (finalPrompt.length() * 0.75) + 64, - MainActivity.this, - false); - } - - long generateDuration = System.currentTimeMillis() - generateStartTime; - mResultMessage.setTotalGenerationTime(generateDuration); - runOnUiThread( - new Runnable() { - @Override - public void run() { - onModelRunStopped(); - } - }); - ETLogging.getInstance().log("Inference completed"); - } - }; - executor.execute(runnable); - }); - mMessageAdapter.notifyDataSetChanged(); - } - - @Override - public void run() { - runOnUiThread( - new Runnable() { - @Override - public void run() { - mMessageAdapter.notifyDataSetChanged(); - } - }); - } - - @Override - public void onBackPressed() { - super.onBackPressed(); - if (mAddMediaLayout != null && mAddMediaLayout.getVisibility() == View.VISIBLE) { - mAddMediaLayout.setVisibility(View.GONE); - } else { - // Default behavior of back button - finish(); - } - } - - @Override - protected void onDestroy() { - super.onDestroy(); - mMemoryUpdateHandler.removeCallbacks(memoryUpdater); - // This is to cover the case where the app is shutdown when user is on MainActivity but - // never clicked on the logsActivity - ETLogging.getInstance().saveLogs(); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java 
b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java deleted file mode 100644 index b2e5380e2a5..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.Locale; - -public class Message { - private String text; - private final boolean isSent; - private float tokensPerSecond; - private long totalGenerationTime; - private final long timestamp; - private final MessageType messageType; - private String imagePath; - private final int promptID; - - private static final String TIMESTAMP_FORMAT = "hh:mm a"; // example: 2:23 PM - - public Message(String text, boolean isSent, MessageType messageType, int promptID) { - this.isSent = isSent; - this.messageType = messageType; - this.promptID = promptID; - - if (messageType == MessageType.IMAGE) { - this.imagePath = text; - } else { - this.text = text; - } - - if (messageType != MessageType.SYSTEM) { - this.timestamp = System.currentTimeMillis(); - } else { - this.timestamp = (long) 0; - } - } - - public int getPromptID() { - return promptID; - } - - public MessageType getMessageType() { - return messageType; - } - - public String getImagePath() { - return imagePath; - } - - public String getText() { - return text; - } - - public void appendText(String text) { - this.text += text; - } - - public boolean getIsSent() { - return isSent; - } - - public void setTokensPerSecond(float tokensPerSecond) { - this.tokensPerSecond = tokensPerSecond; - } - - public void setTotalGenerationTime(long totalGenerationTime) { - this.totalGenerationTime = 
totalGenerationTime; - } - - public float getTokensPerSecond() { - return tokensPerSecond; - } - - public long getTotalGenerationTime() { - return totalGenerationTime; - } - - public long getTimestamp() { - return timestamp; - } - - public String getFormattedTimestamp() { - SimpleDateFormat formatter = new SimpleDateFormat(TIMESTAMP_FORMAT, Locale.getDefault()); - Date date = new Date(timestamp); - return formatter.format(date); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java deleted file mode 100644 index 31aaa9a1d5f..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import android.net.Uri; -import android.view.LayoutInflater; -import android.view.View; -import android.view.ViewGroup; -import android.widget.ArrayAdapter; -import android.widget.ImageView; -import android.widget.TextView; -import java.util.ArrayList; -import java.util.Collections; - -public class MessageAdapter extends ArrayAdapter { - - private final ArrayList savedMessages; - - public MessageAdapter( - android.content.Context context, int resource, ArrayList savedMessages) { - super(context, resource); - this.savedMessages = savedMessages; - } - - @Override - public View getView(int position, View convertView, ViewGroup parent) { - Message currentMessage = getItem(position); - int layoutIdForListItem; - - if (currentMessage.getMessageType() == MessageType.SYSTEM) { - layoutIdForListItem = R.layout.system_message; - } else { - layoutIdForListItem = - currentMessage.getIsSent() ? R.layout.sent_message : R.layout.received_message; - } - View listItemView = - LayoutInflater.from(getContext()).inflate(layoutIdForListItem, parent, false); - if (currentMessage.getMessageType() == MessageType.IMAGE) { - ImageView messageImageView = listItemView.requireViewById(R.id.message_image); - messageImageView.setImageURI(Uri.parse(currentMessage.getImagePath())); - TextView messageTextView = listItemView.requireViewById(R.id.message_text); - messageTextView.setVisibility(View.GONE); - } else { - TextView messageTextView = listItemView.requireViewById(R.id.message_text); - messageTextView.setText(currentMessage.getText()); - } - - String metrics = ""; - TextView tokensView; - if (currentMessage.getTokensPerSecond() > 0) { - metrics = String.format("%.2f", currentMessage.getTokensPerSecond()) + "t/s "; - } - - if (currentMessage.getTotalGenerationTime() > 0) { - metrics = metrics + (float) currentMessage.getTotalGenerationTime() / 1000 + "s "; - } - - if (currentMessage.getTokensPerSecond() > 0 || 
currentMessage.getTotalGenerationTime() > 0) { - tokensView = listItemView.requireViewById(R.id.generation_metrics); - tokensView.setText(metrics); - TextView separatorView = listItemView.requireViewById(R.id.bar); - separatorView.setVisibility(View.VISIBLE); - } - - if (currentMessage.getTimestamp() > 0) { - TextView timestampView = listItemView.requireViewById(R.id.timestamp); - timestampView.setText(currentMessage.getFormattedTimestamp()); - } - - return listItemView; - } - - @Override - public void add(Message msg) { - super.add(msg); - savedMessages.add(msg); - } - - @Override - public void clear() { - super.clear(); - savedMessages.clear(); - } - - public ArrayList getSavedMessages() { - return savedMessages; - } - - public ArrayList getRecentSavedTextMessages(int numOfLatestPromptMessages) { - ArrayList recentMessages = new ArrayList(); - int lastIndex = savedMessages.size() - 1; - // In most cases lastIndex >=0 . - // A situation where the user clears chat history and enters prompt. Causes lastIndex=-1 . - if (lastIndex >= 0) { - Message messageToAdd = savedMessages.get(lastIndex); - int oldPromptID = messageToAdd.getPromptID(); - - for (int i = 0; i < savedMessages.size(); i++) { - messageToAdd = savedMessages.get(lastIndex - i); - if (messageToAdd.getMessageType() != MessageType.SYSTEM) { - if (messageToAdd.getPromptID() != oldPromptID) { - numOfLatestPromptMessages--; - oldPromptID = messageToAdd.getPromptID(); - } - if (numOfLatestPromptMessages > 0) { - if (messageToAdd.getMessageType() == MessageType.TEXT) { - recentMessages.add(messageToAdd); - } - } else { - break; - } - } - } - // To place the order in [input1, output1, input2, output2...] 
- Collections.reverse(recentMessages); - } - - return recentMessages; - } - - public int getMaxPromptID() { - int maxPromptID = -1; - for (Message msg : savedMessages) { - - maxPromptID = Math.max(msg.getPromptID(), maxPromptID); - } - return maxPromptID; - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java deleted file mode 100644 index 6042acb5726..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -public enum MessageType { - TEXT, - IMAGE, - SYSTEM -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java deleted file mode 100644 index a1bc205c4ac..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import android.os.Handler; -import android.os.HandlerThread; -import android.os.Looper; -import android.os.Message; -import androidx.annotation.NonNull; -import org.json.JSONException; -import org.json.JSONObject; -import org.pytorch.executorch.extension.llm.LlmCallback; -import org.pytorch.executorch.extension.llm.LlmModule; - -/** A helper class to handle all model running logic within this class. */ -public class ModelRunner implements LlmCallback { - LlmModule mModule = null; - - String mModelFilePath = ""; - String mTokenizerFilePath = ""; - - ModelRunnerCallback mCallback = null; - - HandlerThread mHandlerThread = null; - Handler mHandler = null; - - /** - * ] Helper class to separate between UI logic and model runner logic. Automatically handle - * generate() request on worker thread. - * - * @param modelFilePath - * @param tokenizerFilePath - * @param callback - */ - ModelRunner( - String modelFilePath, - String tokenizerFilePath, - float temperature, - ModelRunnerCallback callback) { - mModelFilePath = modelFilePath; - mTokenizerFilePath = tokenizerFilePath; - mCallback = callback; - - mModule = new LlmModule(mModelFilePath, mTokenizerFilePath, 0.8f); - mHandlerThread = new HandlerThread("ModelRunner"); - mHandlerThread.start(); - mHandler = new ModelRunnerHandler(mHandlerThread.getLooper(), this); - - mHandler.sendEmptyMessage(ModelRunnerHandler.MESSAGE_LOAD_MODEL); - } - - int generate(String prompt) { - Message msg = Message.obtain(mHandler, ModelRunnerHandler.MESSAGE_GENERATE, prompt); - msg.sendToTarget(); - return 0; - } - - void stop() { - mModule.stop(); - } - - @Override - public void onResult(String result) { - mCallback.onTokenGenerated(result); - } - - @Override - public void onStats(String stats) { - float tps = 0; - try { - JSONObject jsonObject = new JSONObject(stats); - int numGeneratedTokens = jsonObject.getInt("generated_tokens"); - int inferenceEndMs = 
jsonObject.getInt("inference_end_ms"); - int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms"); - tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000; - } catch (JSONException e) { - } - mCallback.onStats("tokens/second: " + tps); - } -} - -class ModelRunnerHandler extends Handler { - public static int MESSAGE_LOAD_MODEL = 1; - public static int MESSAGE_GENERATE = 2; - - private final ModelRunner mModelRunner; - - public ModelRunnerHandler(Looper looper, ModelRunner modelRunner) { - super(looper); - mModelRunner = modelRunner; - } - - @Override - public void handleMessage(@NonNull android.os.Message msg) { - if (msg.what == MESSAGE_LOAD_MODEL) { - int status = mModelRunner.mModule.load(); - mModelRunner.mCallback.onModelLoaded(status); - } else if (msg.what == MESSAGE_GENERATE) { - mModelRunner.mModule.generate((String) msg.obj, mModelRunner); - mModelRunner.mCallback.onGenerationStopped(); - } - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunnerCallback.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunnerCallback.java deleted file mode 100644 index 5e8b6f00e3d..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunnerCallback.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -/** - * A helper interface within the app for MainActivity and Benchmarking to handle callback from - * ModelRunner. 
- */ -public interface ModelRunnerCallback { - - void onModelLoaded(int status); - - void onTokenGenerated(String token); - - void onStats(String stats); - - void onGenerationStopped(); -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java deleted file mode 100644 index 9f8132504ea..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -public enum ModelType { - LLAMA_3, - LLAMA_3_1, - LLAMA_3_2, - LLAVA_1_5, - LLAMA_GUARD_3, - QWEN_3, -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java deleted file mode 100644 index cf7ab1756ce..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -public class ModelUtils { - // XNNPACK or QNN - static final int TEXT_MODEL = 1; - - // XNNPACK - static final int VISION_MODEL = 2; - static final int VISION_MODEL_IMAGE_CHANNELS = 3; - static final int VISION_MODEL_SEQ_LEN = 768; - static final int TEXT_MODEL_SEQ_LEN = 256; - - // MediaTek - static final int MEDIATEK_TEXT_MODEL = 3; - - // QNN static llama - static final int QNN_TEXT_MODEL = 4; - - public static int getModelCategory(ModelType modelType, BackendType backendType) { - if (backendType.equals(BackendType.XNNPACK)) { - switch (modelType) { - case LLAVA_1_5: - return VISION_MODEL; - case LLAMA_3: - case LLAMA_3_1: - case LLAMA_3_2: - case QWEN_3: - default: - return TEXT_MODEL; - } - } else if (backendType.equals(BackendType.MEDIATEK)) { - return MEDIATEK_TEXT_MODEL; - } else if (backendType.equals(BackendType.QUALCOMM)) { - return QNN_TEXT_MODEL; - } - - return TEXT_MODEL; // default - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java deleted file mode 100644 index 524ad7cbf6d..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -public class PromptFormat { - - public static final String SYSTEM_PLACEHOLDER = "{{ system_prompt }}"; - public static final String USER_PLACEHOLDER = "{{ user_prompt }}"; - public static final String ASSISTANT_PLACEHOLDER = "{{ assistant_response }}"; - public static final String THINKING_MODE_PLACEHOLDER = "{{ thinking_mode }}"; - public static final String DEFAULT_SYSTEM_PROMPT = "Answer the questions in a few sentences"; - - public static String getSystemPromptTemplate(ModelType modelType) { - switch (modelType) { - case LLAMA_3: - case LLAMA_3_1: - case LLAMA_3_2: - return "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n" - + SYSTEM_PLACEHOLDER - + "<|eot_id|>"; - case LLAVA_1_5: - return "USER: "; - case QWEN_3: - return "<|im_start|>system\n" + "You are a helpful assistant.\n" + "<|im_end|>\n"; - default: - return SYSTEM_PLACEHOLDER; - } - } - - public static String getUserPromptTemplate(ModelType modelType, boolean thinkingMode) { - switch (modelType) { - case LLAMA_3: - case LLAMA_3_1: - case LLAMA_3_2: - case LLAMA_GUARD_3: - return "<|start_header_id|>user<|end_header_id|>\n" - + USER_PLACEHOLDER - + "<|eot_id|>" - + "<|start_header_id|>assistant<|end_header_id|>"; - - case QWEN_3: - return "<|im_start|>user\n" - + USER_PLACEHOLDER - + "\n<|im_end|>\n" - + "<|im_start|>assistant\n" - + THINKING_MODE_PLACEHOLDER; - case LLAVA_1_5: - default: - return USER_PLACEHOLDER; - } - } - - public static String getConversationFormat(ModelType modelType) { - switch (modelType) { - case LLAMA_3: - case LLAMA_3_1: - case LLAMA_3_2: - return getUserPromptTemplate(modelType, false) - + "\n" - + ASSISTANT_PLACEHOLDER - + "<|eot_id|>"; - case LLAVA_1_5: - return USER_PLACEHOLDER + " ASSISTANT:"; - case QWEN_3: - return getUserPromptTemplate(modelType, false) + "<|im_end|>\n"; - default: - return USER_PLACEHOLDER; - } - } - - public static String getStopToken(ModelType modelType) { - switch (modelType) { - 
case LLAMA_3: - case LLAMA_3_1: - case LLAMA_3_2: - case LLAMA_GUARD_3: - return "<|eot_id|>"; - case LLAVA_1_5: - return ""; - case QWEN_3: - return "<|endoftext|>"; - default: - return ""; - } - } - - public static String getThinkingModeToken(ModelType modelType, boolean thinkingMode) { - switch (modelType) { - case QWEN_3: - return thinkingMode ? "" : "\n\n\n\n\n"; - default: - return ""; - } - } - - public static String replaceSpecialToken(ModelType modelType, String token) { - switch (modelType) { - case QWEN_3: - switch (token) { - case "<|im_end|>": - return ""; - case "": - return "Thinking...\n"; - case "": - return "\nDone thinking"; - } - default: - return token; - } - } - - public static String getLlavaPresetPrompt() { - return "A chat between a curious human and an artificial intelligence assistant. The assistant" - + " gives helpful, detailed, and polite answers to the human's questions. USER: "; - } - - public static String getFormattedLlamaGuardPrompt(String userPrompt) { - return getUserPromptTemplate(ModelType.LLAMA_GUARD_3, false) - .replace( - USER_PLACEHOLDER, getLlamaGuardPresetPrompt().replace(USER_PLACEHOLDER, userPrompt)); - } - - public static String getLlamaGuardPresetPrompt() { - return "Task: Check if there is unsafe content in 'User' messages in conversations" - + " according our safety policy with the below categories.\n" - + "\n" - + "\n" - + "S1: Violent Crimes. \n" - + "S2: Non-Violent Crimes. \n" - + "S3: Sex Crimes. \n" - + "S4: Child Exploitation. \n" - + "S5: Defamation. \n" - + "S6: Specialized Advice. \n" - + "S7: Privacy. \n" - + "S8: Intellectual Property. \n" - + "S9: Indiscriminate Weapons. \n" - + "S10: Hate. \n" - + "S11: Self-Harm. \n" - + "S12: Sexual Content. \n" - + "S13: Elections. 
\n" - + "S14: Code Interpreter Abuse.\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "User: " - + USER_PLACEHOLDER - + "\n" - + "\n" - + "Provide your safety assessment for ONLY THE LAST User message in the above" - + " conversation:\n" - + " - First line must read 'safe' or 'unsafe'.\n" - + " - If unsafe, a second line must include a comma-separated list of violated" - + " categories."; - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java deleted file mode 100644 index 0e388a5b0a4..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import android.app.AlertDialog; -import android.content.DialogInterface; -import android.os.Build; -import android.os.Bundle; -import android.text.Editable; -import android.text.TextWatcher; -import android.view.View; -import android.widget.Button; -import android.widget.EditText; -import android.widget.ImageButton; -import android.widget.TextView; -import androidx.appcompat.app.AppCompatActivity; -import androidx.core.content.ContextCompat; -import androidx.core.graphics.Insets; -import androidx.core.view.ViewCompat; -import androidx.core.view.WindowInsetsCompat; -import com.google.gson.Gson; -import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -public class SettingsActivity extends AppCompatActivity { - - private String mModelFilePath = ""; - private String mTokenizerFilePath = ""; - private TextView mBackendTextView; - private TextView mModelTextView; - private TextView mTokenizerTextView; - private TextView mModelTypeTextView; - private EditText mSystemPromptEditText; - private EditText mUserPromptEditText; - private Button mLoadModelButton; - private double mSetTemperature; - private String mSystemPrompt; - private String mUserPrompt; - private BackendType mBackendType; - private ModelType mModelType; - public SettingsFields mSettingsFields; - - private DemoSharedPreferences mDemoSharedPreferences; - public static double TEMPERATURE_MIN_VALUE = 0.0; - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - setContentView(R.layout.activity_settings); - if (Build.VERSION.SDK_INT >= 21) { - getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); - getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); - } - ViewCompat.setOnApplyWindowInsetsListener( - requireViewById(R.id.main), - (v, insets) -> { - Insets systemBars = 
insets.getInsets(WindowInsetsCompat.Type.systemBars()); - v.setPadding(systemBars.left, systemBars.top, systemBars.right, systemBars.bottom); - return insets; - }); - mDemoSharedPreferences = new DemoSharedPreferences(getBaseContext()); - mSettingsFields = new SettingsFields(); - setupSettings(); - } - - private void setupSettings() { - mBackendTextView = requireViewById(R.id.backendTextView); - mModelTextView = requireViewById(R.id.modelTextView); - mTokenizerTextView = requireViewById(R.id.tokenizerTextView); - mModelTypeTextView = requireViewById(R.id.modelTypeTextView); - ImageButton backendImageButton = requireViewById(R.id.backendImageButton); - ImageButton modelImageButton = requireViewById(R.id.modelImageButton); - ImageButton tokenizerImageButton = requireViewById(R.id.tokenizerImageButton); - ImageButton modelTypeImageButton = requireViewById(R.id.modelTypeImageButton); - mSystemPromptEditText = requireViewById(R.id.systemPromptText); - mUserPromptEditText = requireViewById(R.id.userPromptText); - loadSettings(); - - // TODO: The two setOnClickListeners will be removed after file path issue is resolved - backendImageButton.setOnClickListener( - view -> { - setupBackendSelectorDialog(); - }); - modelImageButton.setOnClickListener( - view -> { - setupModelSelectorDialog(); - }); - tokenizerImageButton.setOnClickListener( - view -> { - setupTokenizerSelectorDialog(); - }); - modelTypeImageButton.setOnClickListener( - view -> { - setupModelTypeSelectorDialog(); - }); - mModelFilePath = mSettingsFields.getModelFilePath(); - if (!mModelFilePath.isEmpty()) { - mModelTextView.setText(getFilenameFromPath(mModelFilePath)); - } - mTokenizerFilePath = mSettingsFields.getTokenizerFilePath(); - if (!mTokenizerFilePath.isEmpty()) { - mTokenizerTextView.setText(getFilenameFromPath(mTokenizerFilePath)); - } - mModelType = mSettingsFields.getModelType(); - ETLogging.getInstance().log("mModelType from settings " + mModelType); - if (mModelType != null) { - 
mModelTypeTextView.setText(mModelType.toString()); - } - mBackendType = mSettingsFields.getBackendType(); - ETLogging.getInstance().log("mBackendType from settings " + mBackendType); - if (mBackendType != null) { - mBackendTextView.setText(mBackendType.toString()); - setBackendSettingMode(); - } - - setupParameterSettings(); - setupPromptSettings(); - setupClearChatHistoryButton(); - setupLoadModelButton(); - } - - private void setupLoadModelButton() { - mLoadModelButton = requireViewById(R.id.loadModelButton); - mLoadModelButton.setEnabled(true); - mLoadModelButton.setOnClickListener( - view -> { - new AlertDialog.Builder(this) - .setTitle("Load Model") - .setMessage("Do you really want to load the new model?") - .setIcon(android.R.drawable.ic_dialog_alert) - .setPositiveButton( - android.R.string.yes, - new DialogInterface.OnClickListener() { - public void onClick(DialogInterface dialog, int whichButton) { - mSettingsFields.saveLoadModelAction(true); - mLoadModelButton.setEnabled(false); - onBackPressed(); - } - }) - .setNegativeButton(android.R.string.no, null) - .show(); - }); - } - - private void setupClearChatHistoryButton() { - Button clearChatButton = requireViewById(R.id.clearChatButton); - clearChatButton.setOnClickListener( - view -> { - new AlertDialog.Builder(this) - .setTitle("Delete Chat History") - .setMessage("Do you really want to delete chat history?") - .setIcon(android.R.drawable.ic_dialog_alert) - .setPositiveButton( - android.R.string.yes, - new DialogInterface.OnClickListener() { - public void onClick(DialogInterface dialog, int whichButton) { - mSettingsFields.saveIsClearChatHistory(true); - } - }) - .setNegativeButton(android.R.string.no, null) - .show(); - }); - } - - private void setupParameterSettings() { - setupTemperatureSettings(); - } - - private void setupTemperatureSettings() { - mSetTemperature = mSettingsFields.getTemperature(); - EditText temperatureEditText = requireViewById(R.id.temperatureEditText); - 
temperatureEditText.setText(String.valueOf(mSetTemperature)); - temperatureEditText.addTextChangedListener( - new TextWatcher() { - @Override - public void beforeTextChanged(CharSequence s, int start, int count, int after) {} - - @Override - public void onTextChanged(CharSequence s, int start, int before, int count) {} - - @Override - public void afterTextChanged(Editable s) { - mSetTemperature = Double.parseDouble(s.toString()); - // This is needed because temperature is changed together with model loading - // Once temperature is no longer in LlmModule constructor, we can remove this - mSettingsFields.saveLoadModelAction(true); - saveSettings(); - } - }); - } - - private void setupPromptSettings() { - setupSystemPromptSettings(); - setupUserPromptSettings(); - } - - private void setupSystemPromptSettings() { - mSystemPrompt = mSettingsFields.getSystemPrompt(); - mSystemPromptEditText.setText(mSystemPrompt); - mSystemPromptEditText.addTextChangedListener( - new TextWatcher() { - @Override - public void beforeTextChanged(CharSequence s, int start, int count, int after) {} - - @Override - public void onTextChanged(CharSequence s, int start, int before, int count) {} - - @Override - public void afterTextChanged(Editable s) { - mSystemPrompt = s.toString(); - } - }); - - ImageButton resetSystemPrompt = requireViewById(R.id.resetSystemPrompt); - resetSystemPrompt.setOnClickListener( - view -> { - new AlertDialog.Builder(this) - .setTitle("Reset System Prompt") - .setMessage("Do you really want to reset system prompt?") - .setIcon(android.R.drawable.ic_dialog_alert) - .setPositiveButton( - android.R.string.yes, - new DialogInterface.OnClickListener() { - public void onClick(DialogInterface dialog, int whichButton) { - // Clear the messageAdapter and sharedPreference - mSystemPromptEditText.setText(PromptFormat.DEFAULT_SYSTEM_PROMPT); - } - }) - .setNegativeButton(android.R.string.no, null) - .show(); - }); - } - - private void setupUserPromptSettings() { - mUserPrompt = 
mSettingsFields.getUserPrompt(); - mUserPromptEditText.setText(mUserPrompt); - mUserPromptEditText.addTextChangedListener( - new TextWatcher() { - @Override - public void beforeTextChanged(CharSequence s, int start, int count, int after) {} - - @Override - public void onTextChanged(CharSequence s, int start, int before, int count) {} - - @Override - public void afterTextChanged(Editable s) { - if (isValidUserPrompt(s.toString())) { - mUserPrompt = s.toString(); - } else { - showInvalidPromptDialog(); - } - } - }); - - ImageButton resetUserPrompt = requireViewById(R.id.resetUserPrompt); - resetUserPrompt.setOnClickListener( - view -> { - new AlertDialog.Builder(this) - .setTitle("Reset Prompt Template") - .setMessage("Do you really want to reset the prompt template?") - .setIcon(android.R.drawable.ic_dialog_alert) - .setPositiveButton( - android.R.string.yes, - new DialogInterface.OnClickListener() { - public void onClick(DialogInterface dialog, int whichButton) { - // Clear the messageAdapter and sharedPreference - mUserPromptEditText.setText( - PromptFormat.getUserPromptTemplate(mModelType, false)); - } - }) - .setNegativeButton(android.R.string.no, null) - .show(); - }); - } - - private boolean isValidUserPrompt(String userPrompt) { - return userPrompt.contains(PromptFormat.USER_PLACEHOLDER); - } - - private void showInvalidPromptDialog() { - new AlertDialog.Builder(this) - .setTitle("Invalid Prompt Format") - .setMessage( - "Prompt format must contain " - + PromptFormat.USER_PLACEHOLDER - + ". 
Do you want to reset prompt format?") - .setIcon(android.R.drawable.ic_dialog_alert) - .setPositiveButton( - android.R.string.yes, - (dialog, whichButton) -> { - mUserPromptEditText.setText(PromptFormat.getUserPromptTemplate(mModelType, false)); - }) - .setNegativeButton(android.R.string.no, null) - .show(); - } - - private void setupBackendSelectorDialog() { - // Convert enum to list - List backendTypesList = new ArrayList<>(); - for (BackendType backendType : BackendType.values()) { - backendTypesList.add(backendType.toString()); - } - // Alert dialog builder takes in arr of string instead of list - String[] backendTypes = backendTypesList.toArray(new String[0]); - AlertDialog.Builder backendTypeBuilder = new AlertDialog.Builder(this); - backendTypeBuilder.setTitle("Select backend type"); - backendTypeBuilder.setSingleChoiceItems( - backendTypes, - -1, - (dialog, item) -> { - mBackendTextView.setText(backendTypes[item]); - mBackendType = BackendType.valueOf(backendTypes[item]); - setBackendSettingMode(); - dialog.dismiss(); - }); - - backendTypeBuilder.create().show(); - } - - private void setupModelSelectorDialog() { - String[] pteFiles = listLocalFile("/data/local/tmp/llama/", new String[] {".pte"}); - AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); - modelPathBuilder.setTitle("Select model path"); - - modelPathBuilder.setSingleChoiceItems( - pteFiles, - -1, - (dialog, item) -> { - mModelFilePath = pteFiles[item]; - mModelTextView.setText(getFilenameFromPath(mModelFilePath)); - mLoadModelButton.setEnabled(true); - dialog.dismiss(); - }); - - modelPathBuilder.create().show(); - } - - private static boolean fileHasExtension(String file, String[] suffix) { - return Arrays.stream(suffix).anyMatch(entry -> file.endsWith(entry)); - } - - private static String[] listLocalFile(String path, String[] suffix) { - File directory = new File(path); - if (directory.exists() && directory.isDirectory()) { - File[] files = directory.listFiles((dir, name) -> 
(fileHasExtension(name, suffix))); - String[] result = new String[files.length]; - for (int i = 0; i < files.length; i++) { - if (files[i].isFile() && fileHasExtension(files[i].getName(), suffix)) { - result[i] = files[i].getAbsolutePath(); - } - } - return result; - } - return new String[] {}; - } - - private void setupModelTypeSelectorDialog() { - // Convert enum to list - List modelTypesList = new ArrayList<>(); - for (ModelType modelType : ModelType.values()) { - modelTypesList.add(modelType.toString()); - } - // Alert dialog builder takes in arr of string instead of list - String[] modelTypes = modelTypesList.toArray(new String[0]); - AlertDialog.Builder modelTypeBuilder = new AlertDialog.Builder(this); - modelTypeBuilder.setTitle("Select model type"); - modelTypeBuilder.setSingleChoiceItems( - modelTypes, - -1, - (dialog, item) -> { - mModelTypeTextView.setText(modelTypes[item]); - mModelType = ModelType.valueOf(modelTypes[item]); - mUserPromptEditText.setText(PromptFormat.getUserPromptTemplate(mModelType, false)); - dialog.dismiss(); - }); - - modelTypeBuilder.create().show(); - } - - private void setupTokenizerSelectorDialog() { - String[] tokenizerFiles = - listLocalFile("/data/local/tmp/llama/", new String[] {".bin", ".json", ".model"}); - AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); - tokenizerPathBuilder.setTitle("Select tokenizer path"); - tokenizerPathBuilder.setSingleChoiceItems( - tokenizerFiles, - -1, - (dialog, item) -> { - mTokenizerFilePath = tokenizerFiles[item]; - mTokenizerTextView.setText(getFilenameFromPath(mTokenizerFilePath)); - mLoadModelButton.setEnabled(true); - dialog.dismiss(); - }); - - tokenizerPathBuilder.create().show(); - } - - private String getFilenameFromPath(String uriFilePath) { - String[] segments = uriFilePath.split("/"); - if (segments.length > 0) { - return segments[segments.length - 1]; // get last element (aka filename) - } - return ""; - } - - private void setBackendSettingMode() { - if 
(mBackendType.equals(BackendType.XNNPACK) || mBackendType.equals(BackendType.QUALCOMM)) { - setXNNPACKSettingMode(); - } else if (mBackendType.equals(BackendType.MEDIATEK)) { - setMediaTekSettingMode(); - } - } - - private void setXNNPACKSettingMode() { - requireViewById(R.id.modelLayout).setVisibility(View.VISIBLE); - requireViewById(R.id.tokenizerLayout).setVisibility(View.VISIBLE); - requireViewById(R.id.parametersView).setVisibility(View.VISIBLE); - requireViewById(R.id.temperatureLayout).setVisibility(View.VISIBLE); - mModelFilePath = ""; - mTokenizerFilePath = ""; - } - - private void setMediaTekSettingMode() { - requireViewById(R.id.modelLayout).setVisibility(View.GONE); - requireViewById(R.id.tokenizerLayout).setVisibility(View.GONE); - requireViewById(R.id.parametersView).setVisibility(View.GONE); - requireViewById(R.id.temperatureLayout).setVisibility(View.GONE); - mModelFilePath = "/in/mtk/llama/runner"; - mTokenizerFilePath = "/in/mtk/llama/runner"; - } - - private void loadSettings() { - Gson gson = new Gson(); - String settingsFieldsJSON = mDemoSharedPreferences.getSettings(); - if (!settingsFieldsJSON.isEmpty()) { - mSettingsFields = gson.fromJson(settingsFieldsJSON, SettingsFields.class); - } - } - - private void saveSettings() { - mSettingsFields.saveModelPath(mModelFilePath); - mSettingsFields.saveTokenizerPath(mTokenizerFilePath); - mSettingsFields.saveParameters(mSetTemperature); - mSettingsFields.savePrompts(mSystemPrompt, mUserPrompt); - mSettingsFields.saveModelType(mModelType); - mSettingsFields.saveBackendType(mBackendType); - mDemoSharedPreferences.addSettings(mSettingsFields); - } - - @Override - public void onBackPressed() { - super.onBackPressed(); - saveSettings(); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java deleted file mode 100644 index 
94036f43947..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -public class SettingsFields { - - public String getModelFilePath() { - return modelFilePath; - } - - public String getTokenizerFilePath() { - return tokenizerFilePath; - } - - public double getTemperature() { - return temperature; - } - - public String getSystemPrompt() { - return systemPrompt; - } - - public ModelType getModelType() { - return modelType; - } - - public BackendType getBackendType() { - return backendType; - } - - public String getUserPrompt() { - return userPrompt; - } - - public String getFormattedSystemAndUserPrompt(String prompt, boolean thinkingMode) { - return getFormattedSystemPrompt() + getFormattedUserPrompt(prompt, thinkingMode); - } - - public String getFormattedSystemPrompt() { - return PromptFormat.getSystemPromptTemplate(modelType) - .replace(PromptFormat.SYSTEM_PLACEHOLDER, systemPrompt); - } - - public String getFormattedUserPrompt(String prompt, boolean thinkingMode) { - return userPrompt - .replace(PromptFormat.USER_PLACEHOLDER, prompt) - .replace( - PromptFormat.THINKING_MODE_PLACEHOLDER, - PromptFormat.getThinkingModeToken(modelType, thinkingMode)); - } - - public boolean getIsClearChatHistory() { - return isClearChatHistory; - } - - public boolean getIsLoadModel() { - return isLoadModel; - } - - private String modelFilePath; - private String tokenizerFilePath; - private double temperature; - private String systemPrompt; - private String userPrompt; - private boolean isClearChatHistory; - private boolean isLoadModel; - private ModelType modelType; - private BackendType backendType; - - public 
SettingsFields() { - ModelType DEFAULT_MODEL = ModelType.LLAMA_3; - BackendType DEFAULT_BACKEND = BackendType.XNNPACK; - - modelFilePath = ""; - tokenizerFilePath = ""; - temperature = SettingsActivity.TEMPERATURE_MIN_VALUE; - systemPrompt = ""; - userPrompt = PromptFormat.getUserPromptTemplate(DEFAULT_MODEL, false); - isClearChatHistory = false; - isLoadModel = false; - modelType = DEFAULT_MODEL; - backendType = DEFAULT_BACKEND; - } - - public SettingsFields(SettingsFields settingsFields) { - this.modelFilePath = settingsFields.modelFilePath; - this.tokenizerFilePath = settingsFields.tokenizerFilePath; - this.temperature = settingsFields.temperature; - this.systemPrompt = settingsFields.getSystemPrompt(); - this.userPrompt = settingsFields.getUserPrompt(); - this.isClearChatHistory = settingsFields.getIsClearChatHistory(); - this.isLoadModel = settingsFields.getIsLoadModel(); - this.modelType = settingsFields.modelType; - this.backendType = settingsFields.backendType; - } - - public void saveModelPath(String modelFilePath) { - this.modelFilePath = modelFilePath; - } - - public void saveTokenizerPath(String tokenizerFilePath) { - this.tokenizerFilePath = tokenizerFilePath; - } - - public void saveModelType(ModelType modelType) { - this.modelType = modelType; - } - - public void saveBackendType(BackendType backendType) { - this.backendType = backendType; - } - - public void saveParameters(Double temperature) { - this.temperature = temperature; - } - - public void savePrompts(String systemPrompt, String userPrompt) { - this.systemPrompt = systemPrompt; - this.userPrompt = userPrompt; - } - - public void saveIsClearChatHistory(boolean needToClear) { - this.isClearChatHistory = needToClear; - } - - public void saveLoadModelAction(boolean shouldLoadModel) { - this.isLoadModel = shouldLoadModel; - } - - public boolean equals(SettingsFields anotherSettingsFields) { - if (this == anotherSettingsFields) return true; - return 
modelFilePath.equals(anotherSettingsFields.modelFilePath) - && tokenizerFilePath.equals(anotherSettingsFields.tokenizerFilePath) - && temperature == anotherSettingsFields.temperature - && systemPrompt.equals(anotherSettingsFields.systemPrompt) - && userPrompt.equals(anotherSettingsFields.userPrompt) - && isClearChatHistory == anotherSettingsFields.isClearChatHistory - && isLoadModel == anotherSettingsFields.isLoadModel - && modelType == anotherSettingsFields.modelType - && backendType == anotherSettingsFields.backendType; - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml deleted file mode 100644 index 0868ffffa6f..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml deleted file mode 100644 index 2ae27b8409e..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml deleted file mode 100644 index 7077fedd483..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml deleted file mode 100644 index a6837b9c69f..00000000000 --- 
a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml deleted file mode 100644 index fb902d4331b..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml deleted file mode 100644 index 4680bc6629e..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_lightbulb_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_lightbulb_24.xml deleted file mode 100644 index aa045396d28..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_lightbulb_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml deleted file mode 100644 index 860470ab109..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml deleted file mode 100644 index 2de1f642089..00000000000 --- 
a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml deleted file mode 100644 index c51d84b9f4f..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml deleted file mode 100644 index 832e2585954..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/blue_lightbulb_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/blue_lightbulb_24.xml deleted file mode 100644 index 585cd3b1892..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/blue_lightbulb_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml deleted file mode 100644 index ceb3ac56c9e..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml deleted file mode 100644 index eb8b9d1f1a9..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - diff --git 
a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml deleted file mode 100644 index 87c82d2a38d..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml deleted file mode 100644 index 0a7a71f0700..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/ic_launcher_background.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/ic_launcher_background.xml deleted file mode 100644 index 07d5da9cbf1..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/ic_launcher_background.xml +++ /dev/null @@ -1,170 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/ic_launcher_foreground.xml deleted file mode 100644 index 7706ab9e6d4..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/ic_launcher_foreground.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml deleted file mode 100644 index 35c778a437d..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No 
newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png deleted file mode 100644 index 60e3e5174e9..00000000000 Binary files a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png and /dev/null differ diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml deleted file mode 100644 index bb45d63d85b..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml deleted file mode 100644 index c7b4b2e4a1d..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml deleted file mode 100644 index a8bb4b2f646..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml deleted file mode 100644 index 5f81396e382..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml 
b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml deleted file mode 100644 index c2288b5bfce..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml deleted file mode 100644 index e8d13ca4e12..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml deleted file mode 100644 index afbe22da808..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_benchmarking.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_benchmarking.xml deleted file mode 100644 index 6e48b5de8be..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_benchmarking.xml +++ /dev/null @@ -1,16 +0,0 @@ - - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml deleted file mode 100644 index b327a544f25..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml +++ /dev/null @@ -1,55 +0,0 @@ - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml deleted file mode 100644 index 52bf533521a..00000000000 --- 
a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml +++ /dev/null @@ -1,241 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_settings.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_settings.xml deleted file mode 100644 index 0ec551ae364..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_settings.xml +++ /dev/null @@ -1,338 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -